from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
# Raise the pandas display limits (row count, column count, line width).
for option_name, limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, limit)

# List the CSV files found in the current working directory.
glob('*.csv')
['case.csv', 'patient.csv', 'route.csv', 'time.csv', 'trend.csv']
# Load the patient table from patient.csv and preview its first rows.
patient = pd.read_csv(filepath_or_buffer='patient.csv')
patient.head()
patient_id
sex
birth_year
country
region
disease
group
infection_reason
infection_order
infected_by
contact_number
confirmed_date
released_date
deceased_date
state
0
1
female
1984.0
China
filtered at airport
NaN
NaN
visit to Wuhan
1.0
NaN
45.0
2020-01-20
2020-02-06
NaN
released
1
2
male
1964.0
Korea
filtered at airport
NaN
NaN
visit to Wuhan
1.0
NaN
75.0
2020-01-24
2020-02-05
NaN
released
2
3
male
1966.0
Korea
capital area
NaN
NaN
visit to Wuhan
1.0
NaN
16.0
2020-01-26
2020-02-12
NaN
released
3
4
male
1964.0
Korea
capital area
NaN
NaN
visit to Wuhan
1.0
NaN
95.0
2020-01-27
2020-02-09
NaN
released
4
5
male
1987.0
Korea
capital area
NaN
NaN
visit to Wuhan
1.0
NaN
31.0
2020-01-30
2020-03-02
NaN
released
# Derive age as (2020 - birth_year) and encode sex as female=0 / male=1.
sex_codes = {'female': 0, 'male': 1}
patient['age'] = patient['birth_year'].rsub(2020)
patient['sex'] = patient['sex'].map(sex_codes)

# Column roles: target column, text columns to one-hot encode, numeric columns.
label = 'state'
str_cols = ['country', 'region', 'group', 'infection_reason']
num_cols = ['sex', 'disease', 'infection_order', 'infected_by', 'contact_number', 'age']

# Missing numeric values become 0; text columns are expanded into dummy columns.
# NOTE(review): filling 'sex' with 0 makes a missing value indistinguishable
# from the 'female' code — confirm this is intended.
num_dset = patient[num_cols].fillna(0)
str_dset = pd.get_dummies(patient[str_cols])

# Place numeric and dummy columns side by side, then append the target last.
all_df = pd.concat(objs=[num_dset, str_dset], axis='columns')
all_df[label] = patient[label]
all_df.head()
sex
disease
infection_order
infected_by
contact_number
age
country_China
country_Korea
country_Mongolia
region_Busan
region_Chungcheongbuk-do
region_Chungcheongnam-do
region_Daegu
region_Daejeon
region_Gangwon-do
region_Gwangju
region_Gyeongsangbuk-do
region_Jeju-do
region_Jeollabuk-do
region_Jeollanam-do
region_Ulsan
region_capital area
region_filtered at airport
group_Cheongdo Daenam Hospital
group_Eunpyeong St. Mary's Hospital
group_Pilgrimage
group_Shincheonji Church
infection_reason_contact with patient
infection_reason_contact with patient in Daegu
infection_reason_contact with patient in Japan
infection_reason_contact with patient in Singapore
infection_reason_pilgrimage to Israel
infection_reason_residence in Wuhan
infection_reason_visit to China
infection_reason_visit to Daegu
infection_reason_visit to Italy
infection_reason_visit to Japan
infection_reason_visit to Thailand
infection_reason_visit to Vietnam
infection_reason_visit to Wuhan
infection_reason_visit to ooo
state
0
0.0
0.0
1.0
0.0
45.0
36.0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
released
1
1.0
0.0
1.0
0.0
75.0
56.0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
released
2
1.0
0.0
1.0
0.0
16.0
54.0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
released
3
1.0
0.0
1.0
0.0
95.0
56.0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
released
4
1.0
0.0
1.0
0.0
31.0
33.0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
released
Relationship between patient features and state (released / deceased / isolated)
1. PCA approach
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# Work on a copy of the encoded frame.
rdi_df = all_df.copy()

# Split into target and features. DataFrame.fillna returns a new object, so
# its result must be kept; the earlier bare `rdi_df.fillna(0)` call discarded
# it and therefore had no effect.
rdi_y = rdi_df['state']
rdi_X = rdi_df.drop(columns='state').fillna(0)

# Min-max scale every feature column, keeping the column names.
scaler = MinMaxScaler()
scaled_X = pd.DataFrame(scaler.fit_transform(rdi_X), columns=rdi_X.columns)

# Numeric encoding of the target.
y_numeric = rdi_y.map({"isolated": 0, "released": 1, 'deceased': 2})

# NOTE(review): the encoded target is added to the PCA input (unscaled), so it
# contributes to the components — confirm this is intended.
PCA_data = scaled_X.copy()
PCA_data['state'] = list(y_numeric)

# Project onto the first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(PCA_data)
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

# Attach the original string labels so the scatter plot can colour by state.
labeled_Df = pd.concat([principalDf, rdi_y], axis=1)
ax = sns.scatterplot(
    x='principal component 1',
    y='principal component 2',
    hue='state',
    data=labeled_Df,
    palette='Spectral',
)
주성분분석 시행
# Pair each loading of the first component (truncated to three decimals) with
# its column name and order the pairs from largest to smallest signed value.
PC1_feature_importance = sorted(
    ((int(loading * 1000) / 1000, column)
     for loading, column in zip(pca.components_[0], PCA_data.columns)),
    reverse=True,
)

# Same ranking for the second component.
PC2_feature_importance = sorted(
    ((int(loading * 1000) / 1000, column)
     for loading, column in zip(pca.components_[1], PCA_data.columns)),
    reverse=True,
)

# Collect the column names of the top three entries of each ranking.
all_set = set()
for (_, pc1_name), (_, pc2_name) in zip(PC1_feature_importance[:3],
                                        PC2_feature_importance[:3]):
    all_set.add(pc1_name)
    all_set.add(pc2_name)
print(all_set)
{'age' , 'state' , 'region_capital area' , 'sex' , 'disease' , 'region_Gyeongsangbuk-do' }
PCA기반 해당 데이터 셋 대표변수 리스트.
'''
age
나이
state
격리 / 치료 / 사망
region_capital area
수도권, 도심지 여부
sex
성별
disease
기저질환 보유여부
region_Gyeongsangbuk-do
경북지역 여부
'''
2. PLS approach
from sklearn.cross_decomposition import PLSRegression
# Fit a two-component PLS regression of the encoded state on the scaled features.
pls = PLSRegression(n_components=2)
y_numeric = rdi_y.map({"isolated": 0, "released": 1, 'deceased': 2})
PLS_data = scaled_X.copy()
pls.fit(PLS_data, y_numeric)

# Per-sample scores of X and y, plus the encoded target for colouring plots.
x_scores = pd.DataFrame(pls.x_scores_, columns=['x_scores_PC1', 'x_scores_PC2'])
y_scores = pd.DataFrame(pls.y_scores_, columns=['y_scores_PC1', 'y_scores_PC2'])
# `pp` is not defined anywhere in view; the pandas alias in this file is `pd`.
xy_scores_l = pd.concat([x_scores, y_scores, y_numeric], axis=1)

# Per-feature loadings and weights of the fitted model, side by side.
x_loadings = pd.DataFrame(pls.x_loadings_, columns=['x_loadings_PC1', 'x_loadings_PC2'])
x_weights = pd.DataFrame(pls.x_weights_, columns=['x_weights_PC1', 'x_weights_PC2'])
x_loading_weight = pd.concat([x_loadings, x_weights], axis=1)
# Score plots: one panel per (x, y) column pair, coloured by encoded state.
score_pairs = (
    ('x_scores_PC1', 'x_scores_PC2'),
    ('x_scores_PC1', 'y_scores_PC1'),
    ('y_scores_PC1', 'y_scores_PC2'),
)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for panel, (x_col, y_col) in zip(axes, score_pairs):
    sns.scatterplot(x=x_col, y=y_col,
                    data=xy_scores_l, hue='state', palette='Spectral',
                    ax=panel)
plt.show()

# Loading / weight plots: one panel per (x, y) column pair.
# NOTE(review): no hue is passed here, so the palette argument presumably has
# no visible effect — confirm before relying on it.
loading_pairs = (
    ('x_loadings_PC1', 'x_loadings_PC2'),
    ('x_weights_PC1', 'x_weights_PC2'),
    ('x_loadings_PC1', 'x_weights_PC1'),
)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for panel, (x_col, y_col) in zip(axes, loading_pairs):
    sns.scatterplot(x=x_col, y=y_col,
                    data=x_loading_weight, palette='Spectral',
                    ax=panel)
plt.show()
# Component-1 weights followed by component-2 weights, kept as originally defined.
importance_sumup = pd.concat([x_loading_weight['x_weights_PC1'],
                              x_loading_weight['x_weights_PC2']])

# Rank features by their component-1 weight (truncated to three decimals),
# from largest to smallest signed value.
# The weights have one row per PLS_data column. Pairing them with
# PCA_data.columns, which carries an extra trailing 'state' column, labelled
# the first component-2 weight as 'state'; PLS_data.columns keeps every weight
# next to the feature it belongs to.
feature_importance = sorted(
    zip(
        (int(weight * 1000) / 1000
         for weight in x_loading_weight['x_weights_PC1']),
        PLS_data.columns,
    ),
    reverse=True,
)
feature_importance[:6]
[(0.635 , 'disease'),
(0.346 , 'region_Daegu'),
(0.346 , 'age'),
(0.279 , 'group_Cheongdo Daenam Hospital'),
(0.27 , 'sex'),
(0.168 , 'infection_reason_visit to Wuhan')]
PLS기반 해당 데이터 셋 대표변수 리스트.
'''
disease
기저질환 보유여부
region_Daegu
대구지역 여부
age
나이
group_Cheongdo Daenam Hospital
청도 대남병원 그룹 여부
sex
성별
infection_reason_visit to Wuhan
발병이유가 우한인 경우(우한 방문여부)
'''
변수간 상관계수 파악
# Start from the scaled feature matrix.
pearson_df = scaled_X.copy()
desc = pearson_df.describe()

# Standard deviation of every column, read by row label instead of by position.
std_s = desc.loc['std'].tolist()

# Keep the third of the columns with the largest standard deviation.
sorted_list = sorted(zip(std_s, desc.columns), reverse=True)
good_col_set = sorted_list[:len(sorted_list) // 3]
good_cols = [ll[1] for ll in good_col_set]

# Take an explicit copy so that adding the target column writes to an
# independent frame rather than to a slice of pearson_df (this is what raised
# SettingWithCopyWarning).
sel_pearson = pearson_df[good_cols].copy()
sel_pearson['state'] = list(y_numeric)

# Pairwise correlation matrix of the selected columns and the encoded state.
sel_pearson = sel_pearson.corr()
C:\Users\EDCORE\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html
# Order the correlation rows by their correlation with 'state' (ascending)
# and draw the matrix as a heatmap.
ordered_corr = sel_pearson.sort_values(by='state', ascending=True)
fig = plt.figure(figsize=(10, 5))
sns.heatmap(ordered_corr)
plt.show()
상관계수 기반으로는
disease -> 기저질환 여부
age -> 나이가 많은지 적은지 여부
region_Daegu -> 대구 지역 인지의 여부
정도가 격리 / 치료 / 사망 여부를
설명하는데 가장 크게 기여한 변수입니다.