数据中有3个字段分别使用逗号(,)隔开,如下:
用户,商品,评分
user1,101,5.0
user1,102,3.0
user1,103,2.5
user2,101,2.0
user2,102,2.5
user2,103,5.0
user2,104,2.0
user3,101,2.0
user3,104,4.0
user3,105,4.5
user3,107,5.0
user4,101,5.0
user4,103,3.0
user4,104,4.5
user4,106,4.0
user5,101,4.0
user5,102,3.0
user5,103,2.0
user5,104,4.0
user5,105,3.5
user5,106,4.0
使用源数据(01_user_goods_score.data), 通过MRJob计算出唯一的商品。
MRJob代码(pandas_01_goods.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from mrjob.job import MRJob


class Pandas01GoodsIndex(MRJob):
    """Emit every distinct goods id found in ``user,goods,score`` input lines."""

    def mapper(self, _, line):
        """Yield ``(goods, None)`` for one comma-separated input line.

        The input key is ignored; only the goods field of the line is used.
        """
        # A blank line cannot be unpacked into three fields, so skip it.
        if not line.strip():
            return
        # Parse the line: user, goods, score
        _user, goods, _score = line.split(',')
        yield goods, None

    def reducer(self, key, values):
        """Yield each goods id once; the value 1 is only a placeholder."""
        yield key, 1


def main():
    """Run the MRJob."""
    Pandas01GoodsIndex.run()


if __name__ == '__main__':
    main()
执行
python pandas_01_goods.py 01_user_goods_score.data > pandas_01_goods.data
cat pandas_01_goods.data
"101" 1
"102" 1
"103" 1
"104" 1
"105" 1
"106" 1
"107" 1
注意: 上面输出结果中每行末尾的 1 只是占位值, 没有实际用途。
计算唯一用户的方法和计算唯一商品的方法类似, 同样是使用源数据(01_user_goods_score.data), 通过MRJob分析得出。
MRJob代码(pandas_01_user.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from mrjob.job import MRJob


class Pandas01GoodsIndex(MRJob):
    """Emit every distinct user found in ``user,goods,score`` input lines.

    NOTE: the class name still says "Goods" although this job indexes users;
    it is left unchanged so existing references keep working.
    """

    def mapper(self, _, line):
        """Yield ``(user, None)`` for one comma-separated input line.

        The input key is ignored; only the user field of the line is used.
        """
        # A blank line cannot be unpacked into three fields, so skip it.
        if not line.strip():
            return
        # Parse the line: user, goods, score
        user, _goods, _score = line.split(',')
        yield user, None

    def reducer(self, key, values):
        """Yield each user once; the value 1 is only a placeholder."""
        yield key, 1


def main():
    """Run the MRJob."""
    Pandas01GoodsIndex.run()


if __name__ == '__main__':
    main()
执行
python pandas_01_user.py 01_user_goods_score.data > pandas_01_user.data
cat pandas_01_user.data
"user1" 1
"user2" 1
"user3" 1
"user4" 1
"user5" 1
思路:
# Maps each goods id to the integer index that stands in for it in the matrices.
goods_dict = {
    '101': 0,
    '102': 1,
    '103': 2,
    '104': 3,
    '105': 4,
    '106': 5,
    '107': 6,
}
# Maps each user to the integer index that stands in for it in the matrices.
user_dict = {
    'user1': 0,
    'user2': 1,
    'user3': 2,
    'user4': 3,
    'user5': 4,
}
这里使用了 Python 的科学计算库 scipy(配合 numpy 和 pandas)
在生成矩阵的时候, 使用 goods_dict 中的值(索引)来代替商品, 到了最后再替换回来
在生成矩阵的时候, 使用 user_dict 中的值(索引)来代替用户, 到了最后再替换回来
计算物品推荐代码(pandas_02_final.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from scipy import sparse
import numpy as np
import pandas as pd


class Pandas02Final(object):
    """Compute per-user recommendation scores for every goods item.

    The result is the matrix product of a goods-by-goods co-occurrence
    matrix and a goods-by-user score matrix, finally labelled with the
    original goods ids and user names in a pandas DataFrame.
    """

    def __init__(self):
        self.goods_dict = {}  # goods id -> matrix index
        self.user_dict = {}  # user -> matrix index
        self.goods_bought_count_matrix = None
        self.user_goods_score_matrix = None
        self.every_user_goods_score_matrix = None
        self.every_user_goods_score_df = None

    @staticmethod
    def _load_index(file_name, index_dict):
        """Fill ``index_dict`` with ``key -> index`` from a ``"key" value`` file.

        The key is the text between the first pair of double quotes on each
        line. Blank lines are skipped, and each new key gets the next free
        index so the indices stay dense.
        """
        with open(file_name) as f:
            for line in f:
                if not line.strip():
                    continue
                key = line.split('"')[1]
                # len() is evaluated before insertion: next unused index.
                index_dict.setdefault(key, len(index_dict))

    def get_goods_dict(self, file_name):
        """Build ``self.goods_dict`` by reading the goods index file."""
        self._load_index(file_name, self.goods_dict)

    def get_user_dict(self, file_name):
        """Build ``self.user_dict`` by reading the user index file."""
        self._load_index(file_name, self.user_dict)

    def get_goods_bought_count_matrix(self, file_name):
        """Build the goods-by-goods co-occurrence matrix.

        Each non-blank line is split on double quotes and the fourth piece
        is read as ``goods:score,goods:score,...``. Every ordered pair of
        goods on the same line (including a goods item with itself)
        contributes 1 to the matrix cell for that pair.
        """
        row_indices = []  # rows are goods
        col_indices = []  # columns are goods
        with open(file_name) as f:
            for line in f:
                if not line.strip():
                    continue
                goods_scores = line.split('"')[3]
                # Translate the line's goods ids to matrix indices once.
                goods_ids = [self.goods_dict[goods_score.split(':')[0]]
                             for goods_score in goods_scores.split(',')]
                for goods_row in goods_ids:
                    for goods_col in goods_ids:
                        row_indices.append(goods_row)
                        col_indices.append(goods_col)

        # Every co-occurrence counts as 1; coo_matrix sums duplicate cells
        # when it is converted to a dense matrix.
        values = np.ones(len(row_indices), dtype=int)
        row_indices = np.array(row_indices, dtype=int)
        col_indices = np.array(col_indices, dtype=int)
        self.goods_bought_count_matrix = sparse.coo_matrix(
            (values, (row_indices, col_indices)),
            shape=(len(self.goods_dict), len(self.goods_dict))).todense()

    def get_user_goods_score_matrix(self, file_name):
        """Build the goods-by-user score matrix.

        Each non-blank line is ``user,goods,score``. Cells for goods/user
        pairs that do not appear in the file stay 0.
        """
        row_indices = []  # rows are goods
        col_indices = []  # columns are users
        values = []  # the user's score for the goods item
        with open(file_name) as f:
            for line in f:
                if not line.strip():
                    continue
                user, goods, score = line.split(',')
                row_indices.append(self.goods_dict[goods])
                col_indices.append(self.user_dict[user])
                values.append(float(score))

        row_indices = np.array(row_indices, dtype=int)
        col_indices = np.array(col_indices, dtype=int)
        values = np.array(values, dtype=float)
        self.user_goods_score_matrix = sparse.coo_matrix(
            (values, (row_indices, col_indices)),
            shape=(len(self.goods_dict), len(self.user_dict))).todense()

    def get_every_user_goods_score_matrix(self):
        """Compute <co-occurrence matrix> x <goods-by-user score matrix>."""
        self.every_user_goods_score_matrix = np.dot(
            self.goods_bought_count_matrix, self.user_goods_score_matrix)

    def get_every_user_goods_score_df(self):
        """Label the score matrix with user names and goods ids.

        Labels are ordered by their matrix index (the dict values), not
        alphabetically, so every row/column gets the name it was built for.
        """
        self.every_user_goods_score_df = pd.DataFrame(
            self.every_user_goods_score_matrix,
            columns=sorted(self.user_dict, key=self.user_dict.get),
            index=sorted(self.goods_dict, key=self.goods_dict.get))


def _print_step(title, value):
    """Print a banner, the step title, the banner again, then the value."""
    banner = '=' * 46
    print(banner)
    print(title)
    print(banner)
    print(value)


def main():
    """Run all six steps on the fixed data files and print each result."""
    pandas_02_final = Pandas02Final()

    # 1. Build the goods index dictionary
    pandas_02_final.get_goods_dict('pandas_01_goods.data')
    _print_step('1. 获得商品数据字典', pandas_02_final.goods_dict)

    # 2. Build the user index dictionary
    pandas_02_final.get_user_dict('pandas_01_user.data')
    _print_step('2. 获得用户数据字典', pandas_02_final.user_dict)

    # 3. Build the goods co-occurrence matrix
    pandas_02_final.get_goods_bought_count_matrix(
        '02_user_goods_score_record.data')
    _print_step('3. 获得商品购买次数矩阵',
                pandas_02_final.goods_bought_count_matrix)

    # 4. Build the goods-by-user score matrix (bought goods only)
    pandas_02_final.get_user_goods_score_matrix('01_user_goods_score.data')
    _print_step('4. 获得用户商品评分矩矩阵(仅仅包含用户买过的商品)',
                pandas_02_final.user_goods_score_matrix)

    # 5. Multiply the two matrices
    pandas_02_final.get_every_user_goods_score_matrix()
    _print_step('5. 两个矩阵相乘获得最终的 <每个用户每个商品评分矩阵>',
                pandas_02_final.every_user_goods_score_matrix)

    # 6. Attach goods/user labels with pandas
    pandas_02_final.get_every_user_goods_score_df()
    _print_step('6. <每个用户每个商品评分矩阵> 和 Pandas, goods_dict, user_dict 结合获得最终的数据',
                pandas_02_final.every_user_goods_score_df)


if __name__ == '__main__':
    main()
执行与结果
python pandas_02_final.py
==============================================
1. 获得商品数据字典
==============================================
{'102': 1, '103': 2, '101': 0, '106': 5, '107': 6, '104': 3, '105': 4}
==============================================
2. 获得用户数据字典
==============================================
{'user4': 3, 'user5': 4, 'user2': 1, 'user3': 2, 'user1': 0}
==============================================
3. 获得商品购买次数矩阵
==============================================
[[5 3 4 4 2 2 1]
 [3 3 3 2 1 1 0]
 [4 3 4 3 1 2 0]
 [4 2 3 4 2 2 1]
 [2 1 1 2 2 1 1]
 [2 1 2 2 1 2 0]
 [1 0 0 1 1 0 1]]
==============================================
4. 获得用户商品评分矩矩阵(仅仅包含用户买过的商品)
==============================================
[[ 5.   2.   2.   5.   4. ]
 [ 3.   2.5  0.   0.   3. ]
 [ 2.5  5.   0.   3.   2. ]
 [ 0.   2.   4.   4.5  4. ]
 [ 0.   0.   4.5  0.   3.5]
 [ 0.   0.   0.   4.   4. ]
 [ 0.   0.   5.   0.   0. ]]
==============================================
5. 两个矩阵相乘获得最终的 <每个用户每个商品评分矩阵>
==============================================
[[ 44.   45.5  40.   63.   68. ]
 [ 31.5  32.5  18.5  37.   42.5]
 [ 39.   41.5  24.5  53.5  56.5]
 [ 33.5  36.   38.   55.   59. ]
 [ 15.5  15.5  26.   26.   32. ]
 [ 18.   20.5  16.5  33.   34.5]
 [  5.    4.   15.5   9.5  11.5]]
==============================================
6. <每个用户每个商品评分矩阵> 和 Pandas, goods_dict, user_dict 结合获得最终的数据
==============================================
     user1  user2  user3  user4  user5
101   44.0   45.5   40.0   63.0   68.0
102   31.5   32.5   18.5   37.0   42.5
103   39.0   41.5   24.5   53.5   56.5
104   33.5   36.0   38.0   55.0   59.0
105   15.5   15.5   26.0   26.0   32.0
106   18.0   20.5   16.5   33.0   34.5
107    5.0    4.0   15.5    9.5   11.5
从最后的结果可以很容易地看出每个用户对各个商品的得分是多少, 从而可以得到需要推荐的商品(需剔除用户已经购买评分过的商品)。
文章转载来自:trustauth.cn