[도서] 추천 시스템 구축 초간단한 방법

PassionPython 2019. 8. 19. 16:47

import pandas as pd
# Load the book catalogue and the user ratings from their CSV files.
# read_csv can emit a DtypeWarning when a column holds mixed types; passing
# `dtype=` or `low_memory=False` (as the warning itself suggests) avoids it.
books_path = 'data/books.csv'
ratings_path = 'data/ratings.csv'
books = pd.read_csv(books_path)
ratings = pd.read_csv(ratings_path)

C:\Users\one\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3057: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

# Preview the first five rows of the book catalogue.
books.head(n=5)

	ISBN	bookTitle	bookAuthor	yearOfPublication	publisher	imageUrlS	imageUrlM	imageUrlL
0	0195153448	Classical Mythology	Mark P. O. Morford	2002	Oxford University Press	http://images.amazon.com/images/P/0195153448.0...	http://images.amazon.com/images/P/0195153448.0...	http://images.amazon.com/images/P/0195153448.0...
1	0002005018	Clara Callan	Richard Bruce Wright	2001	HarperFlamingo Canada	http://images.amazon.com/images/P/0002005018.0...	http://images.amazon.com/images/P/0002005018.0...	http://images.amazon.com/images/P/0002005018.0...
2	0060973129	Decision in Normandy	Carlo D'Este	1991	HarperPerennial	http://images.amazon.com/images/P/0060973129.0...	http://images.amazon.com/images/P/0060973129.0...	http://images.amazon.com/images/P/0060973129.0...
3	0374157065	Flu: The Story of the Great Influenza Pandemic...	Gina Bari Kolata	1999	Farrar Straus Giroux	http://images.amazon.com/images/P/0374157065.0...	http://images.amazon.com/images/P/0374157065.0...	http://images.amazon.com/images/P/0374157065.0...
4	0393045218	The Mummies of Urumchi	E. J. W. Barber	1999	W. W. Norton & Company	http://images.amazon.com/images/P/0393045218.0...	http://images.amazon.com/images/P/0393045218.0...	http://images.amazon.com/images/P/0393045218.0...

# Report the (rows, columns) size of the ratings table, then preview its
# first five rows.
n_rows, n_cols = ratings.shape
print((n_rows, n_cols))
ratings.head(n=5)

(1149780, 3)

	userID	ISBN	bookRating
0	276725	034545104X	0
1	276726	0155061224	5
2	276727	0446520802	0
3	276729	052165615X	3
4	276729	0521795028	6

# Keep only the columns that the later cells use.
book_columns = ['ISBN', 'bookTitle']
rating_columns = ['ISBN', 'userID', 'bookRating']
books = books.loc[:, book_columns]
ratings = ratings.loc[:, rating_columns]

# Print every book title that contains the search keyword.
target = "Sorcerer's"
# Vectorised literal substring match: regex=False keeps it a plain,
# case-sensitive substring test like `in`, and na=False treats missing
# titles as non-matching instead of raising TypeError on a NaN value.
title_matches = books["bookTitle"].str.contains(target, regex=False, na=False)
for title in books.loc[title_matches, "bookTitle"]:
    print(title)

The Sorcerer's Companion: A Guide to the Magical World of Harry Potter
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
Harry Potter and the Sorcerer's Stone (Book 1)
A Sorcerer's Treason: A Novel of Isavalta
Disney's Sorcerer's Apprentice/Fantasia (Little Golden Book)
Harry Potter and the Sorcerer's Stone Movie Poster Book
The Sorcerer's Apprentice : Storybook and Magic Tricks
Harry Potter and the Sorcerer's Stone (Book 1 Audio CD)
The Sorcerer's Academy
Sorcerer's Apprentice (Well Loved Tales)
The Sorcerer's Heir
Sorcerer's Son
Harry Potter and the Sorcerer's Stone (Book 1, Audio)
Harry Potter and the Sorcerer's Stone (Book 1, Large Print)
The Spell of the Sorcerer's Skull (Johnny Dixon Mystery)
Walt Disney's the Sorcerer's Apprentice (Disney's Wonderful World of Reading, No. 12)
Dark Sister : Sorcerer's Love Story, A (Medicine Woman Series , No 10)
The Sorcerer's Skull
The Sorcerer's Gun (Changeling Saga, No 2)
The Sorcerer's Apprentice
The Sorcerer's Lady
Harry Potter and the  Sorcerer's Stone
Walt Disney's the Sorcerer's Apprentice (Disney's Wonderful World of Reading,)
Harry Potter and the Sorcerer's Stone (Urdu Edition)
A Sorcerer's Apprentice
Merlin: The Sorcerer's Guide to Survival in College (Freshman Orientation)
Sorcerer's Legacy
The Sorcerer's Apprentice (Contemporary American Fiction)
Harry Potter and the Sorcerer's Stone (Harry Potter (Hardcover))
Sorcerer's Stone (Tsr-Books Novel)
The Sorcerer's Gun (Chageling Saga, 2)
The Sorcerer's Lady (Timeswept : Love Spell Time-Travel Romance)
Harry Potter and the Sorcerer's Stone
Harry Potter and the Sorcerer's Stone: A Deluxe Pop-up Book
The Sorcerer's Apprentices (Night Lights)
The Sorcerer's Sacred Isle (The Danan's, Book 1)
The Spell of the Sorcerer's Skull

import numpy as np

target = "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))"  # exact title to find similar books for
cols = ['userID', 'bookTitle', 'bookRating']  # used below as pivot index / pivot columns / pivot values
# `movies` is only a second name for the `books` frame.
movies = books
# ratings = ratings[:100000]  # (disabled) would keep only the first 100,000 ratings
# NOTE: on the full data the pivot_table call below raises ValueError (int32 overflow): the users x titles grid is too large.
combined = pd.merge(movies, ratings)  # no `on=` given: pandas joins on the column names both frames share
pivoted = combined.pivot_table(index = cols[0], columns = cols[1], values = cols[2])
# Correlate every title column with the target title's column and drop the
# titles whose correlation comes out as NaN.
matrix_similar = pivoted.corrwith(pivoted[target]).dropna()

# Per-title rating statistics: number of ratings, mean and standard deviation.
counted = combined[cols].groupby(cols[1]).agg({cols[2] : [np.size, np.mean, np.std]})
# Boolean mask selecting the titles that have at least 250 ratings.
popular = counted[cols[2]]['size'] >= 250

# Attach the correlation (column 'corr') to the statistics of the popular titles.
combined_result = counted[popular].join(pd.DataFrame(matrix_similar, columns = ['corr']))
# Show the 10 popular titles with the highest correlation to the target.
combined_result.sort_values(by = ['corr'], ascending = False).head(10)

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-6-915ccfd8f63a> in <module>
      8 
      9 combined = pd.merge(movies, ratings)
---> 10 pivoted = combined.pivot_table(index = cols[0], columns = cols[1], values = cols[2])
     11 matrix_similar = pivoted.corrwith(pivoted[target]).dropna()
     12 


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in pivot_table(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)
   5757                            aggfunc=aggfunc, fill_value=fill_value,
   5758                            margins=margins, dropna=dropna,
-> 5759                            margins_name=margins_name)
   5760 
   5761     def stack(self, level=-1, dropna=True):


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)
    107             else:
    108                 to_unstack.append(name)
--> 109         table = agged.unstack(to_unstack)
    110 
    111     if not dropna:


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in unstack(self, level, fill_value)
   5990         """
   5991         from pandas.core.reshape.reshape import unstack
-> 5992         return unstack(self, level, fill_value)
   5993 
   5994     _shared_docs['melt'] = ("""


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
    386     if isinstance(obj, DataFrame):
    387         if isinstance(obj.index, MultiIndex):
--> 388             return _unstack_frame(obj, level, fill_value=fill_value)
    389         else:
    390             return obj.T.stack(dropna=False)


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in _unstack_frame(obj, level, fill_value)
    409                                value_columns=obj.columns,
    410                                fill_value=fill_value,
--> 411                                constructor=obj._constructor)
    412         return unstacker.get_result()
    413 


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in __init__(self, values, index, level, value_columns, fill_value, constructor)
    122 
    123         if num_rows > 0 and num_columns > 0 and num_cells <= 0:
--> 124             raise ValueError('Unstacked DataFrame is too big, '
    125                              'causing int32 overflow')
    126 


ValueError: Unstacked DataFrame is too big, causing int32 overflow

이 오류는 사용자 × 도서 제목으로 펼친 피벗 테이블의 셀 수가 int32 범위를 넘을 만큼 커서 발생합니다. 간단하게 해결하려면 데이터를 정제해서 쓸데없이 용량만 차지하는 부분을 지우면 됩니다.