当前位置: 首页 > news >正文

数据分析作业四-基于用户及物品数据进行内容推荐

## 导入支持库
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances
import ipywidgets as widgets
from IPython.display import display, clear_output
from contextlib import contextmanager
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import os, sys
import re
import seaborn as sns
## Load the datasets and check the shapes of the books, users and ratings tables.
# sep=None asks pandas to sniff the delimiter, which only the python engine
# supports; naming the engine explicitly avoids the silent fallback.
books = pd.read_csv('F:\\data\\bleeding_data\\BX-Books.csv', sep=None, engine='python', encoding="latin-1")
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']

users = pd.read_csv('F:\\data\\bleeding_data\\BX-Users.csv', sep=None, engine='python', encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']

ratings = pd.read_csv('F:\\data\\bleeding_data\\BX-Book-Ratings.csv', sep=None, engine='python', encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']

print(books.shape)
print(users.shape)
print(ratings.shape)
(271360, 8)
(278858, 3)
(1149780, 3)
## 一、图书数据集
books.head()
ISBNbookTitlebookAuthoryearOfPublicationpublisherimageUrlSimageUrlMimageUrlL
00195153448Classical MythologyMark P. O. Morford2002Oxford University Presshttp://images.amazon.com/images/P/0195153448.0...http://images.amazon.com/images/P/0195153448.0...http://images.amazon.com/images/P/0195153448.0...
10002005018Clara CallanRichard Bruce Wright2001HarperFlamingo Canadahttp://images.amazon.com/images/P/0002005018.0...http://images.amazon.com/images/P/0002005018.0...http://images.amazon.com/images/P/0002005018.0...
20060973129Decision in NormandyCarlo D'Este1991HarperPerennialhttp://images.amazon.com/images/P/0060973129.0...http://images.amazon.com/images/P/0060973129.0...http://images.amazon.com/images/P/0060973129.0...
30374157065Flu: The Story of the Great Influenza Pandemic...Gina Bari Kolata1999Farrar Straus Girouxhttp://images.amazon.com/images/P/0374157065.0...http://images.amazon.com/images/P/0374157065.0...http://images.amazon.com/images/P/0374157065.0...
40393045218The Mummies of UrumchiE. J. W. Barber1999W. W. Norton & Companyhttp://images.amazon.com/images/P/0393045218.0...http://images.amazon.com/images/P/0393045218.0...http://images.amazon.com/images/P/0393045218.0...
## The image URL columns are not needed for the analysis, so drop them.
books.drop(columns=['imageUrlS', 'imageUrlM', 'imageUrlL'], inplace=True)
books.head()
ISBNbookTitlebookAuthoryearOfPublicationpublisher
00195153448Classical MythologyMark P. O. Morford2002Oxford University Press
10002005018Clara CallanRichard Bruce Wright2001HarperFlamingo Canada
20060973129Decision in NormandyCarlo D'Este1991HarperPerennial
30374157065Flu: The Story of the Great Influenza Pandemic...Gina Bari Kolata1999Farrar Straus Giroux
40393045218The Mummies of UrumchiE. J. W. Barber1999W. W. Norton & Company
## books.dtypes
books.dtypes
ISBN                 object
bookTitle            object
bookAuthor           object
yearOfPublication    object
publisher            object
dtype: object
## 现在检查属性的唯一值
books.bookTitle.unique()
array(['Classical Mythology', 'Clara Callan', 'Decision in Normandy', ...,'Lily Dale : The True Story of the Town that Talks to the Dead',"Republic (World's Classics)","A Guided Tour of Rene Descartes' Meditations on First Philosophy with Complete Translations of the Meditations by Ronald Rubin"],dtype=object)
books.yearOfPublication.unique()
array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988','2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995','1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987','1990', '1981', '1989', '1984', '0', '1968', '1961', '1958','1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970','1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959','1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950','1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936','1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923','2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949','1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934','1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026','1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806','2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909','2008', '1378', '1919', '1922', '1897', '2024', '1376', '2037'],dtype=object)
books.loc[books.yearOfPublication == 'DK Publishing Inc',:]
books.yearOfPublication.unique()
array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988','2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995','1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987','1990', '1981', '1989', '1984', '0', '1968', '1961', '1958','1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970','1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959','1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950','1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936','1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923','2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949','1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934','1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026','1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806','2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909','2008', '1378', '1919', '1922', '1897', '2024', '1376', '2037'],dtype=object)
print(books.loc[books.yearOfPublication == 'DK Publishing Inc',:])
              ISBN                                          bookTitle  \
209538  078946697X  DK Readers: Creating the X-Men, How It All Beg...   
221678  0789466953  DK Readers: Creating the X-Men, How Comic Book...   bookAuthor  yearOfPublication  \
209538       2000  DK Publishing Inc   
221678       2000  DK Publishing Inc   publisher  
209538  http://images.amazon.com/images/P/078946697X.0...  
221678  http://images.amazon.com/images/P/0789466953.0...  
books.loc[books.yearOfPublication == 'DK Publishing Inc',:]
ISBNbookTitlebookAuthoryearOfPublicationpublisher
209538078946697XDK Readers: Creating the X-Men, How It All Beg...2000DK Publishing Inchttp://images.amazon.com/images/P/078946697X.0...
2216780789466953DK Readers: Creating the X-Men, How Comic Book...2000DK Publishing Inchttp://images.amazon.com/images/P/0789466953.0...
## 从上面可以看出,bookAuthor错误地装载了bookTitle,因此需要进行修正。
# The rows for these two ISBNs were loaded with their fields shifted
# (the publisher name landed in yearOfPublication), so every affected
# field is overwritten explicitly, one column at a time.
corrections = {
    '0789466953': {
        'yearOfPublication': 2000,
        'bookAuthor': "James Buckley",
        'publisher': "DK Publishing Inc",
        'bookTitle': "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)",
    },
    '078946697X': {
        'yearOfPublication': 2000,
        'bookAuthor': "Michael Teitelbaum",
        'publisher': "DK Publishing Inc",
        'bookTitle': "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)",
    },
}
for isbn, fields in corrections.items():
    row_mask = books.ISBN == isbn
    for column, value in fields.items():
        books.loc[row_mask, column] = value
# Show the two corrected rows.
books.loc[books.ISBN.isin(list(corrections)), :]
ISBNbookTitlebookAuthoryearOfPublicationpublisher
209538078946697XDK Readers: Creating the X-Men, How It All Beg...Michael Teitelbaum2000DK Publishing Inc
2216780789466953DK Readers: Creating the X-Men, How Comic Book...James Buckley2000DK Publishing Inc
## Convert the publication year to a numeric type; values that cannot be
## parsed as numbers become NaN (errors='coerce').
books['yearOfPublication'] = pd.to_numeric(books['yearOfPublication'], errors='coerce')
sorted(books['yearOfPublication'].unique())
[0.0,1376.0,1378.0,1806.0,1897.0,1900.0,1901.0,1902.0,1904.0,1906.0,1908.0,1909.0,1910.0,1911.0,1914.0,1917.0,1919.0,1920.0,1921.0,1922.0,1923.0,1924.0,1925.0,1926.0,1927.0,1928.0,1929.0,1930.0,1931.0,1932.0,1933.0,1934.0,1935.0,1936.0,1937.0,1938.0,1939.0,1940.0,1941.0,1942.0,1943.0,1944.0,1945.0,1946.0,1947.0,1948.0,1949.0,1950.0,1951.0,1952.0,1953.0,1954.0,1955.0,1956.0,1957.0,1958.0,1959.0,1960.0,1961.0,1962.0,1963.0,1964.0,1965.0,1966.0,1967.0,1968.0,1969.0,1970.0,1971.0,1972.0,1973.0,1974.0,1975.0,1976.0,1977.0,1978.0,1979.0,1980.0,1981.0,1982.0,1983.0,1984.0,1985.0,1986.0,1987.0,1988.0,1989.0,1990.0,1991.0,1992.0,1993.0,1994.0,1995.0,1996.0,1997.0,1998.0,1999.0,2000.0,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2008.0,2010.0,2011.0,2012.0,2020.0,2021.0,2024.0,2026.0,2030.0,2037.0,2038.0,2050.0,nan]
## 现在可以看出yearOfPublication已转换为数值类型(因含NaN而为float),取值范围为0-2050。
## 由于该数据集建于2004年,我假设2006年之后的所有年份都无效(留出两年的余量,以防数据集之后有过更新)。
## 对于所有无效条目(包括0),先将其转换为NaN,然后用其余年份的平均值替换。
# Treat years after 2006 and the placeholder year 0 as missing.
# np.nan is used because the upper-case alias np.NAN was removed in NumPy 2.0.
books.loc[(books.yearOfPublication > 2006) | (books.yearOfPublication == 0), 'yearOfPublication'] = np.nan
# Replace the missing years with the rounded mean of the remaining years.
# The result is assigned back instead of calling fillna(inplace=True) on the
# attribute-accessed column: that form may modify a temporary object and
# leave `books` unchanged when pandas Copy-on-Write is active.
books.yearOfPublication = books.yearOfPublication.fillna(round(books.yearOfPublication.mean()))
# Sanity check: number of years still missing.
books.yearOfPublication.isnull().sum()
0
books.yearOfPublication = books.yearOfPublication.astype(np.int32)
## publisher
books.loc[books.publisher.isnull(),:]
ISBNbookTitlebookAuthoryearOfPublicationpublisher
128890193169656XTyrant MoonElaine Corvidae2002NaN
1290371931696993Finders KeepersLinnea Sinclair2001NaN
## 按书名(如 Finders Keepers)查找同名书籍的行,看看能否得到推断出版商的线索
## 结果:同名的各行出版商和作者都各不相同
books.loc[(books.bookTitle == 'Tyrant Moon'),:]
ISBNbookTitlebookAuthoryearOfPublicationpublisher
128890193169656XTyrant MoonElaine Corvidae2002NaN
books.loc[(books.bookTitle == 'Finders Keepers'),:]
ISBNbookTitlebookAuthoryearOfPublicationpublisher
10799082177364XFinders KeepersFern Michaels2002Zebra Books
420190070465037Finders KeepersBarbara Nickolae1989McGraw-Hill Companies
582640688118461Finders KeepersEmily Rodda1993Harpercollins Juvenile Books
666781575663236Finders KeepersFern Michaels1998Kensington Publishing Corporation
1290371931696993Finders KeepersLinnea Sinclair2001NaN
1343090156309505Finders KeepersWill1989Voyager Books
1734730973146907Finders KeepersSean M. Costello2002Red Tower Publications
1958850061083909Finders KeepersSharon Sala2003HarperTorch
2118740373261160Finders KeepersElizabeth Travis1993Worldwide Library
## 上面各行的出版商都不相同,这里没有线索
## 接着按图书作者检查,看能否找到规律
books.loc[(books.bookAuthor == 'Elaine Corvidae'),:]
ISBNbookTitlebookAuthoryearOfPublicationpublisher
1267621931696934Winter's OrphansElaine Corvidae2001Novelbooks
128890193169656XTyrant MoonElaine Corvidae2002NaN
1290010759901880WolfkinElaine Corvidae2001Hard Shell Word Factory
## 由图书作者检查以找到模式
books.loc[(books.bookAuthor == 'Linnea Sinclair'),:]
ISBNbookTitlebookAuthoryearOfPublicationpublisher
1290371931696993Finders KeepersLinnea Sinclair2001NaN
## Nothing in the related rows lets us infer the missing publishers,
## so both are labelled 'other'.
books.loc[books.ISBN.isin(['193169656X', '1931696993']), 'publisher'] = 'other'
## 二、用户数据集
print (users.shape)
users.head()
(278858, 3)
userIDLocationAge
01nyc, new york, usaNaN
12stockton, california, usa18.0
23moscow, yukon territory, russiaNaN
34porto, v.n.gaia, portugal17.0
45farnborough, hants, united kingdomNaN
users.dtypes
userID        int64
Location     object
Age         float64
dtype: object
users.userID.values
array([     1,      2,      3, ..., 278856, 278857, 278858], dtype=int64)
## Age 
sorted(users.Age.unique())
[nan,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0,39.0,40.0,41.0,42.0,43.0,44.0,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,62.0,63.0,64.0,65.0,66.0,67.0,68.0,69.0,70.0,71.0,72.0,73.0,74.0,75.0,76.0,77.0,78.0,79.0,80.0,81.0,82.0,83.0,84.0,85.0,86.0,87.0,88.0,89.0,90.0,91.0,92.0,93.0,94.0,95.0,96.0,97.0,98.0,99.0,100.0,101.0,102.0,103.0,104.0,105.0,106.0,107.0,108.0,109.0,110.0,111.0,113.0,114.0,115.0,116.0,118.0,119.0,123.0,124.0,127.0,128.0,132.0,133.0,136.0,137.0,138.0,140.0,141.0,143.0,146.0,147.0,148.0,151.0,152.0,156.0,157.0,159.0,162.0,168.0,172.0,175.0,183.0,186.0,189.0,199.0,200.0,201.0,204.0,207.0,208.0,209.0,210.0,212.0,219.0,220.0,223.0,226.0,228.0,229.0,230.0,231.0,237.0,239.0,244.0]
## The Age column contains invalid entries: NaN, 0 and implausibly high values.
## Ages outside the inclusive range 5..90 are treated as missing.
valid_age = users.Age.between(5, 90)
users.Age = users.Age.where(valid_age)
## Fill the missing ages with the mean of the valid ones, then store as int32.
users.Age = users.Age.fillna(users.Age.mean()).astype(np.int32)
sorted(users.Age.unique())
[5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90]
## 三、评级数据集
ratings.shape
(1149780, 3)
## 如果每个用户对每个条目进行评级,那么评级数据集将有nusers * nbooks条目,这表明数据集非常稀疏。
n_users = users.shape[0]
n_books = books.shape[0]
print (n_users * n_books)
75670906880
ratings.head(5)
userIDISBNbookRating
0276725034545104X0
127672601550612245
227672704465208020
3276729052165615X3
427672905217950286
ratings.bookRating.unique()
array([ 0,  5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)
ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]
print (ratings.shape)
print (ratings_new.shape)
(1149780, 3)
(1031136, 3)
## 没有新增用户,因此后续使用上面得到的数据集ratings_new (1031136, 3)
print ("number of users: " + str(n_users))
print ("number of books: " + str(n_books))
number of users: 278858
number of books: 271360
sparsity=1.0-len(ratings_new)/float(n_users*n_books)
print ('图书交叉数据集的稀疏级别是 ' +  str(sparsity*100) + ' %')
图书交叉数据集的稀疏级别是 99.99863734155898 %
ratings.bookRating.unique()
array([ 0,  5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)
ratings_explicit = ratings_new[ratings_new.bookRating != 0]
ratings_implicit = ratings_new[ratings_new.bookRating == 0]
print (ratings_new.shape)
print( ratings_explicit.shape)
print (ratings_implicit.shape)
(1031136, 3)
(383842, 3)
(647294, 3)
## 统计
sns.countplot(data=ratings_explicit , x='bookRating')
plt.show()

在这里插入图片描述

## 基于简单流行度的推荐系统
# Popularity score of a book = sum of its explicit ratings.
ratings_count = ratings_explicit.groupby('ISBN')[['bookRating']].sum()
# Keep the ten books with the highest summed rating.
top10 = ratings_count.sort_values(by='bookRating', ascending=False).iloc[:10]
print("推荐下列书籍")
# Attach title/author/publisher details by matching the ISBN index to books.ISBN.
pd.merge(top10, books, left_index=True, right_on='ISBN')
推荐下列书籍
bookRatingISBNbookTitlebookAuthoryearOfPublicationpublisher
40857870316666343The Lovely Bones: A NovelAlice Sebold2002Little, Brown
74841080385504209The Da Vinci CodeDan Brown2003Doubleday
52231340312195516The Red Tent (Bestselling Backlist)Anita Diamant1998Picador USA
21432798059035342XHarry Potter and the Sorcerer's Stone (Harry P...J. K. Rowling1999Arthur A. Levine Books
35625950142001740The Secret Life of BeesSue Monk Kidd2003Penguin Books
2625510971880107Wild AnimusRich Shapero2004Too Far
110525240060928336Divine Secrets of the Ya-Ya Sisterhood: A NovelRebecca Wells1997Perennial
70624020446672211Where the Heart Is (Oprah's Book Club (Paperba...Billie Letts1998Warner Books
23122190452282152Girl with a Pearl EarringTracy Chevalier2001Plume Books
11821790671027360Angels &amp; DemonsDan Brown2001Pocket Star
users_exp_ratings = users[users.userID.isin(ratings_explicit.userID)]
users_imp_ratings = users[users.userID.isin(ratings_implicit.userID)]
print (users.shape)
print (users_exp_ratings.shape)
print (users_imp_ratings.shape)
(278858, 3)
(68091, 3)
(52451, 3)
## 基于协同过滤的推荐系统
# Keep only users who have given at least 100 explicit ratings.
counts1 = ratings_explicit['userID'].value_counts()
active_user_ids = counts1[counts1 >= 100].index
ratings_explicit = ratings_explicit.loc[ratings_explicit['userID'].isin(active_user_ids)]

# Keep only rating values that occur at least 100 times in the remaining data.
counts = ratings_explicit['bookRating'].value_counts()
frequent_rating_values = counts[counts >= 100].index
ratings_explicit = ratings_explicit.loc[ratings_explicit['bookRating'].isin(frequent_rating_values)]

# User x book matrix: one row per userID, one column per ISBN,
# cells hold bookRating (NaN where the user has not rated the book).
ratings_matrix = ratings_explicit.pivot(index='userID', columns='ISBN', values='bookRating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix.head()
(449, 66574)
ISBN00009131540001046438000104687X00010472130001047973000104799X0001048082000105373600010537440001055607...B000092Q0AB00009EF82B00009NDANB0000DYXIDB0000T6KHIB0000VZEJQB0000X8HIEB00013AX9EB0001I1KOGB000234N3A
userID
2033NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2110NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2276NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4017NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4385NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN

5 rows × 66574 columns

n_users = ratings_matrix.shape[0] #只考虑那些给出明确评级的用户
n_books = ratings_matrix.shape[1]
print (n_users, n_books)
449 66574
ratings_matrix.fillna(0, inplace = True)
ratings_matrix = ratings_matrix.astype(np.int32)
ratings_matrix.head(5)
ISBN00009131540001046438000104687X00010472130001047973000104799X0001048082000105373600010537440001055607...B000092Q0AB00009EF82B00009NDANB0000DYXIDB0000T6KHIB0000VZEJQB0000X8HIEB00013AX9EB0001I1KOGB000234N3A
userID
20330000000000...0000000000
21100000000000...0000000000
22760000000000...0000000000
40170000000000...0000000000
43850000000000...0000000000

5 rows × 66574 columns

# Sparsity of the explicit-ratings matrix.
# The denominator must use the matrix's own dimensions: n_users and n_books
# were re-derived from ratings_matrix, which is built from the filtered
# ratings_explicit. users_exp_ratings still counts every user with an explicit
# rating before that filtering, so using it here mixed two different
# populations and overstated the sparsity.
sparsity = 1.0 - len(ratings_explicit) / float(n_users * n_books)
print ('图书交叉数据集的稀疏级别是 ' +  str(sparsity*100) + ' %')
图书交叉数据集的稀疏级别是 99.99772184106935 %
## 基于用户的协同过滤
# Module-level defaults used as the default arguments of the
# collaborative-filtering functions: neighbourhood size and distance metric.
# (No `global` statement is needed at module scope.)
metric = 'cosine'
k = 10
def findksimilarusers(user_id, ratings, metric = metric, k=k):
    """Find the users whose rating rows are closest to that of ``user_id``.

    A brute-force ``NearestNeighbors`` model is fitted on the rows of
    ``ratings`` and queried with the row belonging to ``user_id``.

    Args:
        user_id: Label of the target user in ``ratings.index``.
        ratings: DataFrame with one row per user and one column per item.
        metric: Distance metric passed to ``NearestNeighbors``.
        k: Number of neighbours wanted. ``k + 1`` neighbours are requested
            because the query row is itself part of the fitted data and is
            normally returned as its own nearest neighbour.

    Returns:
        tuple: ``(similarities, indices)`` where ``similarities`` is the 1-D
        array ``1 - distance`` for each returned neighbour and ``indices`` is
        the 2-D array of row positions returned by ``kneighbors`` for the
        single query row.

    Raises:
        KeyError: If ``user_id`` is not in ``ratings.index``.
    """
    model_knn = NearestNeighbors(metric=metric, algorithm='brute')
    model_knn.fit(ratings)

    loc = ratings.index.get_loc(user_id)
    query = ratings.iloc[loc, :].values.reshape(1, -1)

    # kneighbors rejects n_neighbors larger than the number of fitted rows,
    # so cap the request for small rating matrices.
    n_neighbors = min(k + 1, ratings.shape[0])
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)

    similarities = 1 - distances.flatten()
    return similarities, indices
def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):
    """Predict the rating of ``item_id`` by ``user_id`` with user-based CF.

    The prediction is the target user's mean rating plus the
    similarity-weighted average of the neighbours' mean-centred ratings for
    the item, rounded to an integer and clamped to the range 1..10.

    Args:
        user_id: Label of the target user in ``ratings.index``.
        item_id: Label of the target item in ``ratings.columns``.
        ratings: DataFrame with one row per user and one column per item.
        metric: Distance metric forwarded to ``findksimilarusers``.
        k: Number of neighbours forwarded to ``findksimilarusers``.

    Returns:
        int: Predicted rating in the range 1..10. The prediction is also
        printed.

    Raises:
        KeyError: If ``user_id`` or ``item_id`` is not present in ``ratings``.
    """
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilarusers(user_id, ratings, metric, k)
    neighbour_locs = indices.flatten()

    mean_rating = ratings.iloc[user_loc, :].mean()

    wtd_sum = 0.0
    sum_wt = 0.0
    for neighbour_loc, similarity in zip(neighbour_locs, similarities):
        # The target user is normally returned as its own nearest neighbour;
        # it must not contribute to its own prediction.
        if neighbour_loc == user_loc:
            continue
        neighbour_ratings = ratings.iloc[neighbour_loc, :]
        ratings_diff = neighbour_ratings.iloc[item_loc] - np.mean(neighbour_ratings)
        wtd_sum += ratings_diff * similarity
        # Sum the weights of the neighbours actually used instead of assuming
        # the target user was returned with similarity exactly 1.
        sum_wt += similarity

    # With no usable neighbour weight, fall back to the user's mean rating
    # instead of dividing by zero.
    adjustment = wtd_sum / sum_wt if sum_wt != 0 else 0.0
    prediction = int(round(mean_rating + adjustment))

    # On very sparse data the weighted estimate can leave the rating scale.
    # Clamp AFTER computing the prediction (the clamp used to run on the
    # initial placeholder value and therefore had no effect).
    prediction = min(max(prediction, 1), 10)

    print ('用户预测等级 {0} -> item {1}: {2}'.format(user_id,item_id,prediction))
    return prediction
## 测试
predict_userbased(11676,'0001056107',ratings_matrix)
用户预测等级 11676 -> item 0001056107: 22
## 基于项目的协同过滤
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    """Find the items whose rating columns are closest to that of ``item_id``.

    ``ratings`` is transposed so that each item becomes a row, a brute-force
    ``NearestNeighbors`` model is fitted on those rows, and the row of
    ``item_id`` is used as the query.

    Args:
        item_id: Label of the target item in ``ratings.columns``.
        ratings: DataFrame with one row per user and one column per item.
        metric: Distance metric passed to ``NearestNeighbors``.
        k: Number of neighbours wanted. ``k + 1`` neighbours are requested
            because the query item is itself part of the fitted data and is
            normally returned as its own nearest neighbour.

    Returns:
        tuple: ``(similarities, indices)`` where ``similarities`` is the 1-D
        array ``1 - distance`` for each returned neighbour and ``indices`` is
        the 2-D array of item positions returned by ``kneighbors`` for the
        single query row.

    Raises:
        KeyError: If ``item_id`` is not in ``ratings.columns``.
    """
    item_ratings = ratings.T
    loc = item_ratings.index.get_loc(item_id)

    model_knn = NearestNeighbors(metric=metric, algorithm='brute')
    model_knn.fit(item_ratings)

    query = item_ratings.iloc[loc, :].values.reshape(1, -1)

    # kneighbors rejects n_neighbors larger than the number of fitted rows,
    # so cap the request for small rating matrices.
    n_neighbors = min(k + 1, item_ratings.shape[0])
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)

    similarities = 1 - distances.flatten()
    return similarities, indices
def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    """Predict the rating of ``item_id`` by ``user_id`` with item-based CF.

    The prediction is the similarity-weighted average of the ratings the
    user gave to the items most similar to ``item_id``, rounded to an
    integer and clamped to the range 1..10.

    Args:
        user_id: Label of the target user in ``ratings.index``.
        item_id: Label of the target item in ``ratings.columns``.
        ratings: DataFrame with one row per user and one column per item.
        metric: Distance metric forwarded to ``findksimilaritems``.
        k: Number of neighbours forwarded to ``findksimilaritems``.

    Returns:
        int: Predicted rating in the range 1..10. The prediction is also
        printed.

    Raises:
        KeyError: If ``user_id`` or ``item_id`` is not present in ``ratings``.
    """
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    # Forward metric and k: they used to be accepted but silently ignored,
    # so the module defaults were always used for the neighbour search.
    similarities, indices = findksimilaritems(item_id, ratings, metric, k)
    neighbour_locs = indices.flatten()

    wtd_sum = 0.0
    sum_wt = 0.0
    for neighbour_loc, similarity in zip(neighbour_locs, similarities):
        # The target item is normally returned as its own nearest neighbour;
        # it must not contribute to its own prediction.
        if neighbour_loc == item_loc:
            continue
        wtd_sum += ratings.iloc[user_loc, neighbour_loc] * similarity
        # Sum the weights of the neighbours actually used instead of assuming
        # the target item was returned with similarity exactly 1.
        sum_wt += similarity

    # With no usable neighbour weight the weighted average is undefined;
    # use 0 so the clamp below yields the minimum rating instead of an error.
    weighted_average = wtd_sum / sum_wt if sum_wt != 0 else 0.0
    prediction = int(round(weighted_average))

    # On very sparse data the weighted estimate can leave the rating scale
    # (for example with a correlation metric), so clamp it to 1..10.
    prediction = min(max(prediction, 1), 10)

    print ('用户预测等级 {0} -> item {1}: {2}'.format(user_id,item_id,prediction)    )
    return prediction
## 测试
prediction = predict_itembased(11676,'0001056107',ratings_matrix)
用户预测等级 11676 -> item 0001056107: 1

http://www.lryc.cn/news/144094.html

相关文章:

  • 在腾讯云服务器OpenCLoudOS系统中安装svn(有图详解)
  • C语言日常刷题5
  • 【LeetCode-中等题】73. 矩阵置零
  • 本地部署 FastGPT
  • 软件工程(十八) 行为型设计模式(四)
  • Socket通信与WebSocket协议
  • 新KG视点 | Jeff Pan、陈矫彦等——大语言模型与知识图谱的机遇与挑战
  • 详解过滤器Filter和拦截器Interceptor的区别和联系
  • List常用的操作
  • Android studio APK切换多个摄像头(Camera2)
  • ChatGPT 对教育的影响,AI 如何颠覆传统教育
  • Spring(九)声明式事务
  • java中用HSSFWorkbook生成xls格式的excel(亲测)
  • 做平面设计一般电脑可以吗 优漫动游
  • 设计模式备忘录+命令模式实现Word撤销恢复操作
  • Linux centos7 bash编程小训练
  • 创作2周年纪念日-特别篇
  • 【UE5】用法简介-使用MAWI高精度树林资产的地形材质与添加风雪效果
  • 兼容AD210 车规级高精度隔离放大器:ISO EM210
  • R语言常用数组函数
  • 前端开发之Element Plus的分页组件el-pagination显示英文转变为中文
  • 基于Java+SpringBoot+Vue前后端分离社区医院管理系统设计和实现
  • 浅谈单例模式在游戏开发中的应用
  • Stable Diffusion WebUI 整合包
  • 什么是 RESTful API
  • 如何搭建关键字驱动自动化测试框架?
  • WPF实战项目十二(API篇):配置AutoMapper
  • Linux 内核模块加载过程之重定位
  • Flink流批一体计算(19):PyFlink DataStream API之State
  • adb shell获取安卓设备电量ROM内存帧率等信息