본문 바로가기

[중급] 가볍게 이것저것

고객 장바구니 분석 level_2

고객 장바구니 분석 level_2
In [ ]:
 
In [ ]:
 

아이 용품과 가장 연관이 높은 물건은 무엇일까?

In [41]:
# https://www.kaggle.com/c/instacart-market-basket-analysis/data
In [2]:
from glob import glob
import pandas as pd
In [3]:
glob("*")
Out[3]:
['aisles.csv',
 'departments.csv',
 'orders.csv',
 'order_products__prior.csv',
 'order_products__train.csv',
 'products.csv',
 'sample_submission.csv',
 '고객 장바구니 분석 level_1.ipynb',
 '고객 장바구니 분석 level_2.ipynb']
In [4]:
pd.read_csv('products.csv').head()
Out[4]:
product_id product_name aisle_id department_id
0 1 Chocolate Sandwich Cookies 61 19
1 2 All-Seasons Salt 104 13
2 3 Robust Golden Unsweetened Oolong Tea 94 7
3 4 Smart Ones Classic Favorites Mini Rigatoni Wit... 38 1
4 5 Green Chile Anytime Sauce 5 13
In [5]:
pd.read_csv('order_products__train.csv').head()
Out[5]:
order_id product_id add_to_cart_order reordered
0 1 49302 1 1
1 1 11109 2 1
2 1 10246 3 0
3 1 49683 4 0
4 1 43633 5 1
In [6]:
pd.read_csv('order_products__prior.csv').head()
Out[6]:
order_id product_id add_to_cart_order reordered
0 2 33120 1 1
1 2 28985 2 1
2 2 9327 3 0
3 2 45918 4 1
4 2 30035 5 0
In [7]:
pd.read_csv('orders.csv').head()
Out[7]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
0 2539329 1 prior 1 2 8 NaN
1 2398795 1 prior 2 3 7 15.0
2 473747 1 prior 3 3 12 21.0
3 2254736 1 prior 4 4 7 29.0
4 431534 1 prior 5 4 15 28.0
In [80]:
pd.read_csv('departments.csv').head(100)
Out[80]:
department_id department
0 1 frozen
1 2 other
2 3 bakery
3 4 produce
4 5 alcohol
5 6 international
6 7 beverages
7 8 pets
8 9 dry goods pasta
9 10 bulk
10 11 personal care
11 12 meat seafood
12 13 pantry
13 14 breakfast
14 15 canned goods
15 16 dairy eggs
16 17 household
17 18 babies
18 19 snacks
19 20 deli
20 21 missing
In [9]:
pd.read_csv('aisles.csv').head()
Out[9]:
aisle_id aisle
0 1 prepared soups salads
1 2 specialty cheeses
2 3 energy granola bars
3 4 instant foods
4 5 marinades meat preparation
In [89]:
aisle = pd.read_csv('aisles.csv')
pdt = pd.read_csv('products.csv')[['product_id', 'aisle_id']]
pd_ai = pd.merge(aisle, pdt)
pd_ai.head()
Out[89]:
aisle_id aisle product_id
0 1 prepared soups salads 209
1 1 prepared soups salads 554
2 1 prepared soups salads 886
3 1 prepared soups salads 1600
4 1 prepared soups salads 2539
In [90]:
d_1 = pd.read_csv('order_products__train.csv')[['order_id', 'product_id', 'reordered']]
d_2 = pd.read_csv('order_products__prior.csv')[['order_id', 'product_id', 'reordered']]
order_product = pd.concat([d_1, d_2])
order_product.head()
Out[90]:
order_id product_id reordered
0 1 49302 1
1 1 11109 1
2 1 10246 0
3 1 49683 0
4 1 43633 1
In [91]:
combined = pd.merge(order_product, pd_ai)[['order_id', 'aisle', 'reordered']]
combined.head()
Out[91]:
order_id aisle reordered
0 1 yogurt 1
1 816049 yogurt 1
2 1242203 yogurt 1
3 1383349 yogurt 1
4 1787378 yogurt 0
In [92]:
user_order = pd.read_csv('orders.csv')[['user_id', 'order_id']]
user_order.head()
Out[92]:
user_id order_id
0 1 2539329
1 1 2398795
2 1 473747
3 1 2254736
4 1 431534
In [94]:
final = pd.merge(user_order, combined)[['user_id', 'aisle', 'reordered']]
print(final.shape)
final.head()
(33819106, 3)
Out[94]:
user_id aisle reordered
0 1 soft drinks 0
1 1 popcorn jerky 0
2 1 soy lactosefree 0
3 1 paper goods 0
4 1 popcorn jerky 0
In [95]:
import numpy as np
%matplotlib inline
user_count = final.groupby('user_id').agg({'reordered' : [np.size, np.mean, np.std]})[:]
user_count.hist(('reordered', 'size'), bins=500)
print(user_count.shape)
(206209, 3)
In [102]:
user_count.loc[user_count[('reordered', 'size')] > 50, 'heavy'] = 1
grouped_idx = user_count[user_count['heavy'] == 1]
grouped_idx.hist(('reordered', 'size'), bins=500)
print(grouped_idx.shape)
(145776, 4)
In [103]:
final_filtered = final.loc[final['user_id'].isin(grouped_idx.index)]
final_filtered.head()
Out[103]:
user_id aisle reordered
0 1 soft drinks 0
1 1 popcorn jerky 0
2 1 soy lactosefree 0
3 1 paper goods 0
4 1 popcorn jerky 0
In [104]:
print(final.shape)
print(final_filtered.shape)
(33819106, 3)
(32041336, 3)
In [105]:
pvt = final_filtered.pivot_table(index = 'user_id', columns = 'aisle', values = 'reordered')
pvt.head()
Out[105]:
aisle air fresheners candles asian foods baby accessories baby bath body care baby food formula bakery desserts baking ingredients baking supplies decor beauty beers coolers ... spreads tea tofu meat alternatives tortillas flat bread trail mix snack mix trash bags liners vitamins supplements water seltzer sparkling water white wines yogurt
user_id
1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 0.0 NaN NaN NaN NaN NaN NaN NaN NaN 0.500000
2 NaN 0.333333 NaN NaN NaN NaN 0.000000 NaN NaN NaN ... 0.0 0.0 0.0 NaN NaN NaN NaN 0.5 NaN 0.666667
3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 0.5 0.0 NaN NaN NaN NaN NaN 0.5 NaN NaN
7 NaN NaN NaN NaN NaN NaN 0.333333 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.800000
8 NaN 0.000000 NaN NaN NaN NaN 0.000000 NaN NaN NaN ... NaN NaN 0.0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 134 columns

In [106]:
corr = pvt.corr()
corr.head()
Out[106]:
aisle air fresheners candles asian foods baby accessories baby bath body care baby food formula bakery desserts baking ingredients baking supplies decor beauty beers coolers ... spreads tea tofu meat alternatives tortillas flat bread trail mix snack mix trash bags liners vitamins supplements water seltzer sparkling water white wines yogurt
aisle
air fresheners candles 1.000000 0.091770 0.065998 -0.002700 0.077041 0.068956 0.095971 0.015324 0.113685 0.041045 ... 0.086948 0.127405 0.049427 0.098696 0.054409 0.135516 0.102627 0.084800 0.132801 0.098967
asian foods 0.091770 1.000000 0.141231 0.122654 0.138802 0.118809 0.151334 0.117783 0.053513 0.111905 ... 0.146299 0.123600 0.196432 0.153653 0.103492 0.136582 0.120203 0.118680 0.071340 0.156958
baby accessories 0.065998 0.141231 1.000000 0.055613 0.215751 0.117233 0.110193 0.079798 0.089913 0.058241 ... 0.130325 0.116562 0.031429 0.157724 0.058691 0.178630 0.125893 0.171265 0.425205 0.192914
baby bath body care -0.002700 0.122654 0.055613 1.000000 0.139511 0.009053 0.113821 0.066624 0.149574 0.108886 ... 0.136989 0.090880 0.091536 0.111135 0.066639 0.147592 0.074446 0.104273 0.091513 0.156196
baby food formula 0.077041 0.138802 0.215751 0.139511 1.000000 0.126697 0.162811 0.072900 0.105111 0.219911 ... 0.215741 0.129856 0.179175 0.171607 0.157447 0.181142 0.121044 0.192916 0.120679 0.290786

5 rows × 134 columns

In [107]:
corr.sort_values(by=('baby accessories'), ascending=False)['baby accessories']
Out[107]:
aisle
baby accessories        1.000000
white wines             0.425205
spirits                 0.326418
diapers wipes           0.241350
fresh fruits            0.226517
                          ...   
packaged produce        0.013749
energy sports drinks    0.008611
skin care              -0.013996
facial care            -0.052337
kosher foods           -0.057063
Name: baby accessories, Length: 134, dtype: float64
In [ ]:
 
In [ ]: