[중급] 가볍게 이것저것

[도서] 추천 시스템 구축 초간단한 방법

PassionPython 2019. 8. 19. 16:47
import pandas as pd
# Load the book catalogue and the user ratings from CSV files under data/.
# NOTE(review): the DtypeWarning printed after this cell names column index 3;
# going by the books.head() header that would be yearOfPublication - confirm, then
# pass dtype= for that column or low_memory=False (as the warning suggests).
books = pd.read_csv('data/books.csv')
ratings = pd.read_csv('data/ratings.csv')
C:\Users\one\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3057: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
# Preview the first rows of the books table to see which columns it has.
books.head()
  ISBN bookTitle bookAuthor yearOfPublication publisher imageUrlS imageUrlM imageUrlL
0 0195153448 Classical Mythology Mark P. O. Morford 2002 Oxford University Press http://images.amazon.com/images/P/0195153448.0... http://images.amazon.com/images/P/0195153448.0... http://images.amazon.com/images/P/0195153448.0...
1 0002005018 Clara Callan Richard Bruce Wright 2001 HarperFlamingo Canada http://images.amazon.com/images/P/0002005018.0... http://images.amazon.com/images/P/0002005018.0... http://images.amazon.com/images/P/0002005018.0...
2 0060973129 Decision in Normandy Carlo D'Este 1991 HarperPerennial http://images.amazon.com/images/P/0060973129.0... http://images.amazon.com/images/P/0060973129.0... http://images.amazon.com/images/P/0060973129.0...
3 0374157065 Flu: The Story of the Great Influenza Pandemic... Gina Bari Kolata 1999 Farrar Straus Giroux http://images.amazon.com/images/P/0374157065.0... http://images.amazon.com/images/P/0374157065.0... http://images.amazon.com/images/P/0374157065.0...
4 0393045218 The Mummies of Urumchi E. J. W. Barber 1999 W. W. Norton & Company http://images.amazon.com/images/P/0393045218.0... http://images.amazon.com/images/P/0393045218.0... http://images.amazon.com/images/P/0393045218.0...
# Show the (rows, columns) size of the ratings table, then preview its first rows.
print(ratings.shape)
ratings.head()
(1149780, 3)
  userID ISBN bookRating
0 276725 034545104X 0
1 276726 0155061224 5
2 276727 0446520802 0
3 276729 052165615X 3
4 276729 0521795028 6
# Keep only the columns the rest of the notebook uses from each table.
books = books.loc[:, ['ISBN', 'bookTitle']]
ratings = ratings.loc[:, ['ISBN', 'userID', 'bookRating']]

# Print every title containing the keyword (case-sensitive substring match),
# so the exact title string of the book of interest can be picked from the list.
target = "Sorcerer's"
matching_titles = (title for title in books["bookTitle"] if target in title)
for title in matching_titles:
    print(title)
The Sorcerer's Companion: A Guide to the Magical World of Harry Potter
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
Harry Potter and the Sorcerer's Stone (Book 1)
A Sorcerer's Treason: A Novel of Isavalta
Disney's Sorcerer's Apprentice/Fantasia (Little Golden Book)
Harry Potter and the Sorcerer's Stone Movie Poster Book
The Sorcerer's Apprentice : Storybook and Magic Tricks
Harry Potter and the Sorcerer's Stone (Book 1 Audio CD)
The Sorcerer's Academy
Sorcerer's Apprentice (Well Loved Tales)
The Sorcerer's Heir
Sorcerer's Son
Harry Potter and the Sorcerer's Stone (Book 1, Audio)
Harry Potter and the Sorcerer's Stone (Book 1, Large Print)
The Spell of the Sorcerer's Skull (Johnny Dixon Mystery)
Walt Disney's the Sorcerer's Apprentice (Disney's Wonderful World of Reading, No. 12)
Dark Sister : Sorcerer's Love Story, A (Medicine Woman Series , No 10)
The Sorcerer's Skull
The Sorcerer's Gun (Changeling Saga, No 2)
The Sorcerer's Apprentice
The Sorcerer's Lady
Harry Potter and the  Sorcerer's Stone
Walt Disney's the Sorcerer's Apprentice (Disney's Wonderful World of Reading,)
Harry Potter and the Sorcerer's Stone (Urdu Edition)
A Sorcerer's Apprentice
Merlin: The Sorcerer's Guide to Survival in College (Freshman Orientation)
Sorcerer's Legacy
The Sorcerer's Apprentice (Contemporary American Fiction)
Harry Potter and the Sorcerer's Stone (Harry Potter (Hardcover))
Sorcerer's Stone (Tsr-Books Novel)
The Sorcerer's Gun (Chageling Saga, 2)
The Sorcerer's Lady (Timeswept : Love Spell Time-Travel Romance)
Harry Potter and the Sorcerer's Stone
Harry Potter and the Sorcerer's Stone: A Deluxe Pop-up Book
The Sorcerer's Apprentices (Night Lights)
The Sorcerer's Sacred Isle (The Danan's, Book 1)
The Spell of the Sorcerer's Skull
import numpy as np
# Book to find look-alikes for; it is used below as a column label of the pivot table.
target = "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))"
cols = ['userID', 'bookTitle', 'bookRating']  # pivot rows, pivot columns, cell values
# `movies` is just another name for the books frame.
movies = books
# Optional shortcut to shrink the input (left disabled): ratings = ratings[:100000]
# Join books and ratings on their shared ISBN column, then pivot into a userID x bookTitle table.
combined = pd.merge(movies, ratings)
pivoted = combined.pivot_table(index = cols[0], columns = cols[1], values = cols[2])  # NOTE: raises the ValueError shown below on the full data
matrix_similar = pivoted.corrwith(pivoted[target]).dropna()  # correlation of each title column with the target's column
# Per-title rating statistics (size, mean, std); titles with at least 250 ratings count as popular.
counted = combined[cols].groupby(cols[1]).agg({cols[2] : [np.size, np.mean, np.std]})
popular = counted[cols[2]]['size'] >= 250
# Join the correlations (wrapped in a one-column frame labelled 'corr') onto the popular titles, then show the 10 highest.
combined_result = counted[popular].join(pd.DataFrame(matrix_similar, columns = ['corr']))
combined_result.sort_values(by = ['corr'], ascending = False).head(10)
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-6-915ccfd8f63a> in <module>
      8 
      9 combined = pd.merge(movies, ratings)
---> 10 pivoted = combined.pivot_table(index = cols[0], columns = cols[1], values = cols[2])
     11 matrix_similar = pivoted.corrwith(pivoted[target]).dropna()
     12 


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in pivot_table(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)
   5757                            aggfunc=aggfunc, fill_value=fill_value,
   5758                            margins=margins, dropna=dropna,
-> 5759                            margins_name=margins_name)
   5760 
   5761     def stack(self, level=-1, dropna=True):


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)
    107             else:
    108                 to_unstack.append(name)
--> 109         table = agged.unstack(to_unstack)
    110 
    111     if not dropna:


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in unstack(self, level, fill_value)
   5990         """
   5991         from pandas.core.reshape.reshape import unstack
-> 5992         return unstack(self, level, fill_value)
   5993 
   5994     _shared_docs['melt'] = ("""


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
    386     if isinstance(obj, DataFrame):
    387         if isinstance(obj.index, MultiIndex):
--> 388             return _unstack_frame(obj, level, fill_value=fill_value)
    389         else:
    390             return obj.T.stack(dropna=False)


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in _unstack_frame(obj, level, fill_value)
    409                                value_columns=obj.columns,
    410                                fill_value=fill_value,
--> 411                                constructor=obj._constructor)
    412         return unstacker.get_result()
    413 


~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in __init__(self, values, index, level, value_columns, fill_value, constructor)
    122 
    123         if num_rows > 0 and num_columns > 0 and num_cells <= 0:
--> 124             raise ValueError('Unstacked DataFrame is too big, '
    125                              'causing int32 overflow')
    126 


ValueError: Unstacked DataFrame is too big, causing int32 overflow
위 오류는 피벗 테이블이 (userID 개수 × bookTitle 개수)만큼의 칸을 한꺼번에 만들려다가, 그 칸 수가 int32 범위를 넘어서 발생한 것입니다. 간단하게 해결하는 방법은, 데이터를 정제해서 쓸데없이 용량만 차지하는 부분을 지우는 것입니다.