# sc.parallelize() creates an RDD from an existing Python collection (here, coll)
rdd_from_coll = sc.parallelize(coll)
#Data loading
data = sc.textFile("~/ml-100k/udata.csv")
#loaded data will be a Spark RDD; run the command below to find out the type of the data object.
type(data)
<class 'pyspark.rdd.RDD'>
# the total number of records loaded is given by:
data.count()
100001
# to check the first 5 rows of the data RDD, we use the take() action method:
data.take(5)
['UserID\tItemId \tRating\tTimestamp', '196\t242\t3\t881250949',
'186\t302\t3\t891717742', '22\t377\t1\t878887116', '244\t51\t2\t880606923']
#filter function: drop the header row before further processing
header = data.first()
data = data.filter(lambda l: l != header)
#Now let us check the count of the data RDD; it has reduced from 100001 to 100000.
data.count()
100000
data.first()
'196\t242\t3\t881250949'
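#The ratings RDD used in the next step does not appear in the listing; a minimal sketch of
#how it is typically built from data, assuming the tab-separated layout shown above and the
#Rating class from pyspark.mllib.recommendation:
from pyspark.mllib.recommendation import Rating
# split each tab-separated line and wrap it as Rating(user, product, rating);
# the timestamp column is dropped
ratings = data.map(lambda l: l.split('\t')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))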
# check the first 5 records of the ratings PipelinedRDD object by running the code below:
ratings.take(5)
#Data exploration
#unique users: show the first 5 distinct user IDs
df.select('user').distinct().show(5)
+----+
|user|
+----+
| 26|
| 29|
| 474|
| 191|
| 65|
+----+
only showing top 5 rows
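#The plotting lines below assume a bar chart has already been drawn; a minimal sketch of the
#missing setup, assuming the counts are derived from the ratings RDD (counts, index and
#bar_width are illustrative names):
import numpy as np
import matplotlib.pyplot as plt
# count how often each rating value (1.0 .. 5.0) occurs, sorted by rating value
rating_counts = ratings.map(lambda r: (r[2], 1)).reduceByKey(lambda a, b: a + b).sortByKey().collect()
counts = [c for _, c in rating_counts]
index = np.arange(len(counts))
bar_width = 0.35
plt.bar(index, counts, bar_width, label='count')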
plt.xlabel('ratings')
plt.ylabel('Counts')
plt.title('Distribution of ratings')
plt.xticks(index + bar_width, ('1.0', '2.0', '3.0', '4.0', '5.0'))
plt.legend()
plt.tight_layout()
plt.show()
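#Neither the train/test split of the ratings RDD nor the trained model appears in the
#listing; a minimal sketch, assuming the RDD-based ALS from pyspark.mllib.recommendation
#(rank=10 and iterations=10 are assumed values):
from pyspark.mllib.recommendation import ALS
# hold out 20% of the ratings for testing
(training, test) = ratings.randomSplit([0.8, 0.2])
# train an explicit-feedback ALS model on the training split
model = ALS.train(training, rank=10, iterations=10)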
Make predictions:
#the code below takes each row of the test data, extracts the userID and itemID, and puts them in the testdata PipelinedRDD object.
testdata = test.map(lambda p: (p[0], p[1]))
type(testdata)
<class 'pyspark.rdd.PipelinedRDD'>
#the code below displays the formatted data required for making predictions:
testdata.take(5)
[(119, 392), (38, 95), (63, 277), (160, 234), (225, 193)]
Predictions:
#This method is used when we want to make a prediction for a single combination of user and item
pred_ind = model.predict(119, 392)
#we can observe below that the predicted value for user 119 and movie 392 is
#4.3926091845289275; compare it with the original value for the same combination in the test data above
pred_ind
4.3926091845289275
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
UCBF:
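#The recommedItemsToUsers RDD counted below does not appear in the listing; it is presumably
#produced with the model's recommendProductsForUsers() method; a minimal sketch, assuming 10
#recommendations per user:
recommedItemsToUsers = model.recommendProductsForUsers(10)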
#below code shows that recommendations are generated for all 943 users.
recommedItemsToUsers.count()
943
Model evaluation:
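#The ratesAndPreds RDD evaluated below does not appear in the listing; a minimal sketch,
#joining the held-out test ratings with the predictions on the (user, item) key:
ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)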
# calculate the mean squared error (MSE) between the actual and predicted ratings
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
from math import sqrt
rmse = sqrt(MSE)
rmse
1.092055175606539
type(ratings)
<class 'pyspark.rdd.PipelinedRDD'>
#the sqlContext object is created automatically when the Spark session is started with pyspark
sqlContext
<pyspark.sql.context.SQLContext object at 0x7f24c94f7d68>
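#The DataFrame df used for exploration and modelling is presumably created from the ratings
#RDD through the sqlContext; a minimal sketch, assuming the column names user, product and rating:
df = sqlContext.createDataFrame(ratings, ['user', 'product', 'rating'])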
type(df)
<class 'pyspark.sql.dataframe.DataFrame'>
+----+-------+------+
|user|product|rating|
+----+-------+------+
# create random training and test sets using the randomSplit() method.
(training, test) = df.randomSplit([0.8, 0.2])
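#The als estimator referenced below (its uid is printed) is not constructed in the listing;
#a minimal sketch using pyspark.ml.recommendation.ALS, assuming the column names reported by
#explainParams():
from pyspark.ml.recommendation import ALS
als = ALS(userCol="UserID", itemCol="ItemId", ratingCol="Rating")
als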
ALS_45108d6e011beae88f4c
# let us see how the default parameters are set for the ALS model
als.explainParams()
"alpha: alpha for implicit preference (default: 1.0)\ncheckpointInterval: set
checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache
will get checkpointed every 10 iterations. (default: 10)\nfinalStorageLevel:
StorageLevel for ALS model factors. (default: MEMORY_AND_DISK)\nimplicitPrefs:
whether to use implicit preference (default: False)\nintermediateStorageLevel:
StorageLevel for intermediate datasets. Cannot be 'NONE'. (default:
MEMORY_AND_DISK)\nitemCol: column name for item ids. Ids must be within the integer
value range. (default: item, current: ItemId )\nmaxIter: max number of iterations
(>= 0). (default: 10)\nnonnegative: whether to use nonnegative constraint for least
squares (default: False)\nnumItemBlocks: number of item blocks (default:
10)\nnumUserBlocks: number of user blocks (default: 10)\npredictionCol: prediction
column name. (default: prediction)\nrank: rank of the factorization (default:
10)\nratingCol: column name for ratings (default: rating, current:
Rating)\nregParam: regularization parameter (>= 0). (default: 0.1)\nseed: random
seed. (default: -1517157561977538513)\nuserCol: column name for user ids. Ids must
be within the integer value range. (default: user, current: UserID)"
#create a pipeline object, setting the created als model as a stage in the pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[als])
type(pipeline)
<class 'pyspark.ml.pipeline.Pipeline'>
#load the cross-validation and param grid builder modules to create a range of parameters.
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
paramMapExplicit = ParamGridBuilder() \
.addGrid(als.rank, [8, 12]) \
.addGrid(als.maxIter, [10, 15]) \
.addGrid(als.regParam, [1.0, 10.0]) \
.build()
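#The cvExplicit object fitted below does not appear in the listing; a minimal sketch that
#wraps the pipeline and parameter grid in a CrossValidator with an RMSE evaluator (the same
#evaluator is constructed again a few lines further down; numFolds=3 is an assumed value):
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
cvExplicit = CrossValidator(estimator=pipeline, estimatorParamMaps=paramMapExplicit,
                            evaluator=evaluator, numFolds=3)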
cvModel = cvExplicit.fit(training)
preds = cvModel.bestModel.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(preds)
print("Root-mean-square error = " + str(rmse))
rmse
0.924617823674082