본문 바로가기

[중급] 가볍게 이것저것

고객 장바구니 분석 level_1

고객 장바구니 분석 level_1
In [ ]:
 
In [ ]:
 

유아용품과 가장 연관이 있는 제품군 알아보기

In [41]:
# https://www.kaggle.com/c/instacart-market-basket-analysis/data
In [1]:
from glob import glob
import pandas as pd
In [2]:
glob("*")
Out[2]:
['aisles.csv',
 'departments.csv',
 'orders.csv',
 'order_products__prior.csv',
 'order_products__train.csv',
 'products.csv',
 'sample_submission.csv',
 'Untitled.ipynb']
In [9]:
pd.read_csv('products.csv').head()
Out[9]:
product_id product_name aisle_id department_id
0 1 Chocolate Sandwich Cookies 61 19
1 2 All-Seasons Salt 104 13
2 3 Robust Golden Unsweetened Oolong Tea 94 7
3 4 Smart Ones Classic Favorites Mini Rigatoni Wit... 38 1
4 5 Green Chile Anytime Sauce 5 13
In [8]:
pd.read_csv('order_products__train.csv').head()
Out[8]:
order_id product_id add_to_cart_order reordered
0 1 49302 1 1
1 1 11109 2 1
2 1 10246 3 0
3 1 49683 4 0
4 1 43633 5 1
In [7]:
pd.read_csv('order_products__prior.csv').head()
Out[7]:
order_id product_id add_to_cart_order reordered
0 2 33120 1 1
1 2 28985 2 1
2 2 9327 3 0
3 2 45918 4 1
4 2 30035 5 0
In [5]:
pd.read_csv('orders.csv').head()
Out[5]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
0 2539329 1 prior 1 2 8 NaN
1 2398795 1 prior 2 3 7 15.0
2 473747 1 prior 3 3 12 21.0
3 2254736 1 prior 4 4 7 29.0
4 431534 1 prior 5 4 15 28.0
In [4]:
pd.read_csv('departments.csv').head()
Out[4]:
department_id department
0 1 frozen
1 2 other
2 3 bakery
3 4 produce
4 5 alcohol
In [3]:
pd.read_csv('aisles.csv').head()
Out[3]:
aisle_id aisle
0 1 prepared soups salads
1 2 specialty cheeses
2 3 energy granola bars
3 4 instant foods
4 5 marinades meat preparation
In [19]:
dept = pd.read_csv('departments.csv')[['department_id', 'department']]
pdt = pd.read_csv('products.csv')[['product_id', 'department_id']]
dept_pdt = pd.merge(dept, pdt)[['product_id', 'department']]
dept_pdt.head()
Out[19]:
product_id department
0 4 frozen
1 8 frozen
2 12 frozen
3 18 frozen
4 30 frozen
In [20]:
d_1 = pd.read_csv('order_products__train.csv')[['order_id', 'product_id', 'reordered']]
d_2 = pd.read_csv('order_products__prior.csv')[['order_id', 'product_id', 'reordered']]
order_product = pd.concat([d_1, d_2])
order_product.head()
Out[20]:
order_id product_id reordered
0 1 49302 1
1 1 11109 1
2 1 10246 0
3 1 49683 0
4 1 43633 1
In [21]:
combined = pd.merge(order_product, dept_pdt)[['order_id', 'department', 'reordered']]
combined.head()
Out[21]:
order_id department reordered
0 1 dairy eggs 1
1 816049 dairy eggs 1
2 1242203 dairy eggs 1
3 1383349 dairy eggs 1
4 1787378 dairy eggs 0
In [11]:
user_order = pd.read_csv('orders.csv')[['user_id', 'order_id']]
user_order.head()
Out[11]:
user_id order_id
0 1 2539329
1 1 2398795
2 1 473747
3 1 2254736
4 1 431534
In [24]:
final = pd.merge(user_order, combined)[['user_id', 'department', 'reordered']]
final.head()
Out[24]:
user_id department reordered
0 1 beverages 0
1 1 snacks 0
2 1 dairy eggs 0
3 1 household 0
4 1 snacks 0
In [28]:
pvt = final.pivot_table(index = 'user_id', columns = 'department', values = 'reordered')
pvt.head()
Out[28]:
department alcohol babies bakery beverages breakfast bulk canned goods dairy eggs deli dry goods pasta ... household international meat seafood missing other pantry personal care pets produce snacks
user_id
1 NaN NaN NaN 0.866667 0.75 NaN NaN 0.647059 NaN NaN ... 0.666667 NaN NaN NaN NaN 0.000000 NaN NaN 0.200000 0.840000
2 NaN NaN 0.0 0.333333 0.00 NaN 0.0 0.612245 0.416667 NaN ... NaN 0.333333 0.0 NaN NaN 0.363636 0.0 NaN 0.581395 0.489362
3 NaN NaN NaN 0.333333 NaN NaN NaN 0.714286 0.500000 0.75 ... 0.000000 NaN NaN NaN NaN 0.500000 NaN NaN 0.684211 0.444444
4 0.0 NaN 0.0 0.000000 NaN NaN 0.0 NaN 0.000000 NaN ... 0.000000 NaN NaN NaN NaN NaN NaN NaN 0.000000 0.000000
5 NaN NaN NaN NaN NaN NaN 0.0 0.444444 0.000000 0.00 ... 0.000000 0.500000 NaN NaN NaN 0.333333 NaN NaN 0.478261 0.000000

5 rows × 21 columns

In [40]:
corr = pvt.corr()
corr.head()
Out[40]:
department alcohol babies bakery beverages breakfast bulk canned goods dairy eggs deli dry goods pasta ... household international meat seafood missing other pantry personal care pets produce snacks
department
alcohol 1.000000 0.160500 0.140734 0.219895 0.093450 0.213245 0.138580 0.164133 0.137878 0.133933 ... 0.196268 0.113650 0.138170 0.026186 0.177361 0.148339 0.151248 0.148252 0.132936 0.180079
babies 0.160500 1.000000 0.264185 0.271661 0.253578 0.119915 0.282913 0.368916 0.258728 0.302911 ... 0.258133 0.178657 0.237875 0.129178 0.115108 0.279944 0.214191 0.171488 0.358751 0.334850
bakery 0.140734 0.264185 1.000000 0.311523 0.314608 0.173006 0.318422 0.456866 0.353979 0.342322 ... 0.251465 0.235531 0.315626 0.118171 0.129102 0.355871 0.216225 0.196993 0.435772 0.377820
beverages 0.219895 0.271661 0.311523 1.000000 0.294760 0.150307 0.293046 0.406673 0.301139 0.307865 ... 0.301129 0.223017 0.279380 0.153245 0.157346 0.326473 0.257679 0.253459 0.393546 0.396824
breakfast 0.093450 0.253578 0.314608 0.294760 1.000000 0.182291 0.292565 0.422232 0.292080 0.318724 ... 0.250348 0.205902 0.257178 0.139459 0.127807 0.330778 0.224265 0.185809 0.385949 0.386953

5 rows × 21 columns

In [39]:
corr.sort_values(by='babies', ascending=False)['babies']
Out[39]:
department
babies             1.000000
dairy eggs         0.368916
produce            0.358751
snacks             0.334850
frozen             0.311711
dry goods pasta    0.302911
canned goods       0.282913
pantry             0.279944
beverages          0.271661
bakery             0.264185
deli               0.258728
household          0.258133
breakfast          0.253578
meat seafood       0.237875
personal care      0.214191
international      0.178657
pets               0.171488
alcohol            0.160500
missing            0.129178
bulk               0.119915
other              0.115108
Name: babies, dtype: float64
In [ ]: