본문 바로가기

[중급] 가볍게 이것저것

데이터로 알아보는 코로나를 대표하는 키워드!

# 아래 데이터셋은 Kaggle에서 제공합니다.

patient.csv
0.34MB

from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time

# Raise pandas' display limits so wide / long frames are shown in full.
for option, value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option, value)

# List the CSV files available in the working directory.
glob('*.csv')
['case.csv', 'patient.csv', 'route.csv', 'time.csv', 'trend.csv']
# Load the per-patient table and preview its first rows.
PATIENT_CSV = 'patient.csv'
patient = pd.read_csv(PATIENT_CSV)
patient.head()
  patient_id sex birth_year country region disease group infection_reason infection_order infected_by contact_number confirmed_date released_date deceased_date state
0 1 female 1984.0 China filtered at airport NaN NaN visit to Wuhan 1.0 NaN 45.0 2020-01-20 2020-02-06 NaN released
1 2 male 1964.0 Korea filtered at airport NaN NaN visit to Wuhan 1.0 NaN 75.0 2020-01-24 2020-02-05 NaN released
2 3 male 1966.0 Korea capital area NaN NaN visit to Wuhan 1.0 NaN 16.0 2020-01-26 2020-02-12 NaN released
3 4 male 1964.0 Korea capital area NaN NaN visit to Wuhan 1.0 NaN 95.0 2020-01-27 2020-02-09 NaN released
4 5 male 1987.0 Korea capital area NaN NaN visit to Wuhan 1.0 NaN 31.0 2020-01-30 2020-03-02 NaN released
# Column roles: the label, the categorical (string) features and the numeric features.
label = 'state'
str_cols = ['country', 'region', 'group', 'infection_reason']
num_cols = ['sex', 'disease', 'infection_order', 'infected_by', 'contact_number', 'age']

# Derived / re-encoded columns, written back onto the patient frame:
# age is 2020 minus the birth year, sex becomes 0 (female) / 1 (male).
patient['age'] = patient['birth_year'].rsub(2020)
sex_codes = {'female': 0, 'male': 1}
patient['sex'] = patient['sex'].map(sex_codes)

# Numeric features get missing values replaced by 0; string features are one-hot encoded.
num_dset = patient[num_cols].fillna(0)
str_dset = pd.get_dummies(patient[str_cols])

# Final modelling frame: numeric columns, then dummy columns, then the label last.
all_df = pd.concat([num_dset, str_dset], axis='columns')
all_df[label] = patient[label]
all_df.head()
  sex disease infection_order infected_by contact_number age country_China country_Korea country_Mongolia region_Busan region_Chungcheongbuk-do region_Chungcheongnam-do region_Daegu region_Daejeon region_Gangwon-do region_Gwangju region_Gyeongsangbuk-do region_Jeju-do region_Jeollabuk-do region_Jeollanam-do region_Ulsan region_capital area region_filtered at airport group_Cheongdo Daenam Hospital group_Eunpyeong St. Mary's Hospital group_Pilgrimage group_Shincheonji Church infection_reason_contact with patient infection_reason_contact with patient in Daegu infection_reason_contact with patient in Japan infection_reason_contact with patient in Singapore infection_reason_pilgrimage to Israel infection_reason_residence in Wuhan infection_reason_visit to China infection_reason_visit to Daegu infection_reason_visit to Italy infection_reason_visit to Japan infection_reason_visit to Thailand infection_reason_visit to Vietnam infection_reason_visit to Wuhan infection_reason_visit to ooo state
0 0.0 0.0 1.0 0.0 45.0 36.0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 released
1 1.0 0.0 1.0 0.0 75.0 56.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 released
2 1.0 0.0 1.0 0.0 16.0 54.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 released
3 1.0 0.0 1.0 0.0 95.0 56.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 released
4 1.0 0.0 1.0 0.0 31.0 33.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 released

rel_des_iso relation (released / deceased / isolated 상태와 변수 간의 관계)

1. PCA approach

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Project the min-max scaled features (plus the numeric label) onto two
# principal components and plot the result coloured by patient state.
rdi_df = all_df.copy()
scaler = MinMaxScaler()

# Features are every column except the 'state' label.
# fillna returns a new frame, so its result must be kept; calling it as a
# bare statement (as before) leaves the data unchanged.
rdi_X = rdi_df.drop(columns='state').fillna(0)
rdi_y = rdi_df['state']
scaled_X = pd.DataFrame(scaler.fit_transform(rdi_X), columns=rdi_X.columns)
pca = PCA(n_components=2)

# Encode the label numerically; states outside this mapping become NaN.
y_numeric = rdi_y.map({"isolated": 0, "released": 1, 'deceased': 2})

# NOTE: the numeric label is appended to the matrix PCA is fitted on, so
# 'state' receives its own loading in pca.components_.
PCA_data = scaled_X.copy()
PCA_data['state'] = list(y_numeric)
principalComponents = pca.fit_transform(PCA_data)

principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

# Re-attach the original string label so it can be used as the hue.
labeled_Df = pd.concat([principalDf, rdi_y], axis=1)

ax = sns.scatterplot(x='principal component 1',
                     y='principal component 2',
                     hue='state',
                     data=labeled_Df,
                     palette='Spectral')

주성분분석 시행

# Pair every column of PCA_data with its loading on each principal component
# (truncated to three decimals) and sort descending by the signed loading,
# so strongly negative loadings end up at the bottom of each list.
PC1_feature_importance = sorted(
    ((int(weight * 1000) / 1000, column)
     for weight, column in zip(pca.components_[0], PCA_data.columns)),
    reverse=True,
)

PC2_feature_importance = sorted(
    ((int(weight * 1000) / 1000, column)
     for weight, column in zip(pca.components_[1], PCA_data.columns)),
    reverse=True,
)

# Union of the three top-ranked column names from each component.
all_set = set()
for ranking in (PC1_feature_importance, PC2_feature_importance):
    all_set.update(column for _, column in ranking[:3])
print(all_set)
{'age', 'state', 'region_capital area', 'sex', 'disease', 'region_Gyeongsangbuk-do'}

PCA기반 해당 데이터 셋 대표변수 리스트.

'''
age
    나이
state
    격리 / 치료 / 사망
region_capital area
    수도권, 도심지 여부
sex
    성별
disease
    기저질환 보유여부
region_Gyeongsangbuk-do
    경북지역 여부    
'''

2. PLS approach

from sklearn.cross_decomposition import PLSRegression

# Fit a two-component PLS regression of the numeric state on the scaled features.
pls = PLSRegression(n_components=2)

y_numeric = rdi_y.map({"isolated": 0, "released": 1, 'deceased': 2})
PLS_data = scaled_X.copy()
pls.fit(PLS_data, y_numeric)

# Per-sample scores on the X side and on the Y side, one column per component.
x_scores = pd.DataFrame(pls.x_scores_, columns=['x_scores_PC1', 'x_scores_PC2'])
y_scores = pd.DataFrame(pls.y_scores_, columns=['y_scores_PC1', 'y_scores_PC2'])
# pandas is imported as `pd`; the previous `pp.concat` raised NameError.
# y_numeric keeps the Series name 'state', which the plots use as hue.
xy_scores_l = pd.concat([x_scores, y_scores, y_numeric], axis=1)

# Per-feature loadings and weights, one row per column of PLS_data.
x_loadings = pd.DataFrame(pls.x_loadings_, columns=['x_loadings_PC1', 'x_loadings_PC2'])
x_weights = pd.DataFrame(pls.x_weights_, columns=['x_weights_PC1', 'x_weights_PC2'])

x_loading_weight = pd.concat([x_loadings, x_weights], axis=1)
# Three side-by-side scatter plots of the PLS scores, coloured by state.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

score_pairs = (
    ('x_scores_PC1', 'x_scores_PC2'),
    ('x_scores_PC1', 'y_scores_PC1'),
    ('y_scores_PC1', 'y_scores_PC2'),
)
for panel, (x_col, y_col) in zip(axes, score_pairs):
    sns.scatterplot(x=x_col, y=y_col,
                    data=xy_scores_l, hue='state', palette='Spectral',
                    ax=panel)

plt.show()

# Three side-by-side scatter plots of the per-feature PLS loadings and weights.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

loading_pairs = (
    ('x_loadings_PC1', 'x_loadings_PC2'),
    ('x_weights_PC1', 'x_weights_PC2'),
    ('x_loadings_PC1', 'x_weights_PC1'),
)
# No `palette` here: these plots have no `hue` variable, so seaborn would
# ignore the palette and emit a warning about it.
for panel, (x_col, y_col) in zip(axes, loading_pairs):
    sns.scatterplot(x=x_col, y=y_col, data=x_loading_weight, ax=panel)
plt.show()

# Stack the PC1 weights followed by the PC2 weights into one long Series
# (two entries per feature).
importance_sumup = pd.concat([x_loading_weight['x_weights_PC1'],
                              x_loading_weight['x_weights_PC2']])

# The weights have one row per column of PLS_data (the frame PLS was fitted
# on), so the labels must come from PLS_data and be repeated once per
# component. Zipping with PCA_data.columns, which also contains 'state',
# labelled the first PC2 weight as 'state' and dropped the remaining PC2 weights.
feature_names = list(PLS_data.columns) * 2

# Sort (weight truncated to three decimals, feature name) pairs descending
# by signed weight; a feature can appear twice, once per component.
feature_importance = sorted(
    zip((int(weight * 1000) / 1000 for weight in importance_sumup),
        feature_names),
    reverse=True,
)
feature_importance[:6]
[(0.635, 'disease'),
 (0.346, 'region_Daegu'),
 (0.346, 'age'),
 (0.279, 'group_Cheongdo Daenam Hospital'),
 (0.27, 'sex'),
 (0.168, 'infection_reason_visit to Wuhan')]

PLS기반 해당 데이터 셋 대표변수 리스트.

'''
disease
    기저질환 보유여부
region_Daegu
    대구지역 여부    
age
    나이
group_Cheongdo Daenam Hospital
    청도 대남병원 그룹 여부
sex
    성별
infection_reason_visit to Wuhan
    발병이유가 우한인 경우(우한 방문여부)
'''

변수간 상관계수 파악

# Keep the third of the scaled features with the largest standard deviation,
# append the numeric state, and compute the pairwise correlation matrix.
pearson_df = scaled_X.copy()
desc = pearson_df.describe()

# Read the 'std' row by label instead of by position in describe()'s output.
std_s = [desc.loc['std', col] for col in desc.columns]
sorted_list = sorted(zip(std_s, desc.columns), reverse=True)
good_col_set = sorted_list[:len(sorted_list) // 3]
good_cols = [name for _, name in good_col_set]

# Take an explicit copy so adding the 'state' column writes to an independent
# frame rather than a slice of pearson_df (avoids SettingWithCopyWarning).
sel_pearson = pearson_df[good_cols].copy()
sel_pearson['state'] = list(y_numeric)
sel_pearson = sel_pearson.corr()
C:\Users\EDCORE\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Heatmap of the correlation matrix with rows ordered by their correlation
# with 'state' (ascending).
fig = plt.figure(figsize=(10, 5))
corr_by_state = sel_pearson.sort_values(by='state')
sns.heatmap(corr_by_state)
plt.show()

상관계수 기반으로는
disease -> 기저질환 여부
age -> 나이가 많은지 적은지 여부
region_Daegu -> 대구 지역 인지의 여부
정도가 격리 / 격리해제 / 사망 여부를
설명하는 데 가장 크게 기여한 변수입니다.