# 아래 데이터 셋은 kaggle에서 제공합니다.
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
# Set pandas display limits (rows, columns, line width) for printed frames.
for _opt_name, _opt_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_opt_name, _opt_value)

# List the CSV files in the current working directory. The result is not
# bound to a name, so it is only visible when run interactively.
glob('*.csv')
['case.csv', 'patient.csv', 'route.csv', 'time.csv', 'trend.csv']
# Load the patient table from the working directory and preview its first
# five rows.
patient = pd.read_csv(filepath_or_buffer='patient.csv')
patient.head(n=5)
patient_id
sex
birth_year
country
region
disease
group
infection_reason
infection_order
infected_by
contact_number
confirmed_date
released_date
deceased_date
state
0
1
female
1984.0
China
filtered at airport
NaN
NaN
visit to Wuhan
1.0
NaN
45.0
2020-01-20
2020-02-06
NaN
released
1
2
male
1964.0
Korea
filtered at airport
NaN
NaN
visit to Wuhan
1.0
NaN
75.0
2020-01-24
2020-02-05
NaN
released
2
3
male
1966.0
Korea
capital area
NaN
NaN
visit to Wuhan
1.0
NaN
16.0
2020-01-26
2020-02-12
NaN
released
3
4
male
1964.0
Korea
capital area
NaN
NaN
visit to Wuhan
1.0
NaN
95.0
2020-01-27
2020-02-09
NaN
released
4
5
male
1987.0
Korea
capital area
NaN
NaN
visit to Wuhan
1.0
NaN
31.0
2020-01-30
2020-03-02
NaN
released
# Column groups used to assemble the modelling frame.
label = 'state'
str_cols = ['country', 'region', 'group', 'infection_reason']
num_cols = ['sex', 'disease', 'infection_order', 'infected_by', 'contact_number', 'age']

# Encode sex as 0 (female) / 1 (male); any other value becomes NaN.
patient['sex'] = patient['sex'].map({'female': 0, 'male': 1})
# Age is derived relative to the year 2020.
patient['age'] = 2020 - patient['birth_year']

# Numeric features, with missing values replaced by 0.
num_dset = patient[num_cols].fillna(0)
# Categorical features expanded into one indicator column per category.
str_dset = pd.get_dummies(patient[str_cols])

# Features side by side, with the label appended as the last column.
all_df = pd.concat([num_dset, str_dset], axis='columns').assign(
    **{label: patient[label]}
)
all_df.head()
sex
disease
infection_order
infected_by
contact_number
age
country_China
country_Korea
country_Mongolia
region_Busan
region_Chungcheongbuk-do
region_Chungcheongnam-do
region_Daegu
region_Daejeon
region_Gangwon-do
region_Gwangju
region_Gyeongsangbuk-do
region_Jeju-do
region_Jeollabuk-do
region_Jeollanam-do
region_Ulsan
region_capital area
region_filtered at airport
group_Cheongdo Daenam Hospital
group_Eunpyeong St. Mary's Hospital
group_Pilgrimage
group_Shincheonji Church
infection_reason_contact with patient
infection_reason_contact with patient in Daegu
infection_reason_contact with patient in Japan
infection_reason_contact with patient in Singapore
infection_reason_pilgrimage to Israel
infection_reason_residence in Wuhan
infection_reason_visit to China
infection_reason_visit to Daegu
infection_reason_visit to Italy
infection_reason_visit to Japan
infection_reason_visit to Thailand
infection_reason_visit to Vietnam
infection_reason_visit to Wuhan
infection_reason_visit to ooo
state
0
0.0
0.0
1.0
0.0
45.0
36.0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
released
1
1.0
0.0
1.0
0.0
75.0
56.0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
released
2
1.0
0.0
1.0
0.0
16.0
54.0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
released
3
1.0
0.0
1.0
0.0
95.0
56.0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
released
4
1.0
0.0
1.0
0.0
31.0
33.0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
released
Relation between the released / deceased / isolated states (rel_des_iso)
1. PCA approach
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# fillna returns a new DataFrame, so the result must be bound to a name;
# calling it as a bare statement leaves the frame unchanged. The new frame
# also keeps all_df itself untouched.
rdi_df = all_df.fillna(0)

# Features are every column except the last one; the label is 'state'.
rdi_X = rdi_df[rdi_df.columns[:-1]]
rdi_y = rdi_df['state']

# Rescale the features with MinMaxScaler, keeping the column names.
scaler = MinMaxScaler()
scaled_X = pd.DataFrame(scaler.fit_transform(rdi_X), columns=rdi_X.columns)

# Integer encoding of the label; values outside the mapping become NaN.
y_numeric = rdi_y.map({"isolated": 0, "released": 1, 'deceased': 2})

# NOTE(review): the encoded label is fed to PCA together with the features,
# so 'state' takes part in the components and can show up among the
# loadings. Confirm this is intended.
PCA_data = scaled_X.copy()
PCA_data['state'] = list(y_numeric)

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(PCA_data)
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

# Attach the original string label so the scatter can be coloured by state.
labeled_Df = pd.concat([principalDf, rdi_y], axis=1)
ax = sns.scatterplot(
    x='principal component 1',
    y='principal component 2',
    hue='state',
    data=labeled_Df,
    palette='Spectral',
)
주성분분석 시행
def _truncate_3dp(value):
    """Cut *value* to three decimal places by truncation toward zero."""
    return int(value * 1000) / 1000


# (loading, column name) pairs for each principal component, largest signed
# loading first.
PC1_feature_importance = sorted(
    zip((_truncate_3dp(w) for w in pca.components_[0]), PCA_data.columns),
    reverse=True,
)
PC2_feature_importance = sorted(
    zip((_truncate_3dp(w) for w in pca.components_[1]), PCA_data.columns),
    reverse=True,
)

# Collect the column names of the three leading entries of each ranking.
all_set = set()
for pc1_entry, pc2_entry in zip(PC1_feature_importance[:3],
                                PC2_feature_importance[:3]):
    all_set.add(pc1_entry[1])
    all_set.add(pc2_entry[1])
print(all_set)
{'age', 'state', 'region_capital area', 'sex', 'disease', 'region_Gyeongsangbuk-do'}
PCA기반 해당 데이터 셋 대표변수 리스트.
'''
age
나이
state
격리 / 치료 / 사망
region_capital area
수도권, 도심지 여부
sex
성별
disease
기저질환 보유여부
region_Gyeongsangbuk-do
경북지역 여부
'''
2. PLS approach
from sklearn.cross_decomposition import PLSRegression
# Fit a two-component PLS regression of the encoded state on the scaled
# features.
pls = PLSRegression(n_components=2)
y_numeric = rdi_y.map({"isolated": 0, "released": 1, 'deceased': 2})
PLS_data = scaled_X.copy()
pls.fit(PLS_data, y_numeric)

# Scores from the fitted model, placed next to the encoded label (the label
# Series is named 'state', which becomes its column name).
x_scores = pd.DataFrame(pls.x_scores_, columns=['x_scores_PC1', 'x_scores_PC2'])
y_scores = pd.DataFrame(pls.y_scores_, columns=['y_scores_PC1', 'y_scores_PC2'])
# pandas is imported as `pd`; `pp` is not a defined name.
xy_scores_l = pd.concat([x_scores, y_scores, y_numeric], axis=1)

# Loadings and weights from the fitted model, side by side.
x_loadings = pd.DataFrame(pls.x_loadings_, columns=['x_loadings_PC1', 'x_loadings_PC2'])
x_weights = pd.DataFrame(pls.x_weights_, columns=['x_weights_PC1', 'x_weights_PC2'])
x_loading_weight = pd.concat([x_loadings, x_weights], axis=1)
# Score scatter plots, coloured by the encoded state.
score_panels = (
    ('x_scores_PC1', 'x_scores_PC2'),
    ('x_scores_PC1', 'y_scores_PC1'),
    ('y_scores_PC1', 'y_scores_PC2'),
)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for panel_ax, (x_col, y_col) in zip(axes, score_panels):
    sns.scatterplot(x=x_col, y=y_col,
                    data=xy_scores_l, hue='state', palette='Spectral',
                    ax=panel_ax)
plt.show()

# Loading / weight scatter plots. These have no hue variable, so no palette
# is passed: without hue the palette argument has no effect on the plot.
loading_panels = (
    ('x_loadings_PC1', 'x_loadings_PC2'),
    ('x_weights_PC1', 'x_weights_PC2'),
    ('x_loadings_PC1', 'x_weights_PC1'),
)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for panel_ax, (x_col, y_col) in zip(axes, loading_panels):
    sns.scatterplot(x=x_col, y=y_col,
                    data=x_loading_weight,
                    ax=panel_ax)
plt.show()
# First-component weights followed by second-component weights, stacked end
# to end (the two columns are concatenated, not summed).
importance_sumup = pd.concat([x_loading_weight['x_weights_PC1'],
                              x_loading_weight['x_weights_PC2']])

# Pair each weight with the PLS feature it belongs to. PLS_data supplies the
# names rather than PCA_data, because PCA_data carries an extra 'state' label
# column that the PLS model was not fitted on. zip() stops at the shorter
# input, so only the leading first-component weights are ranked.
feature_importance = sorted(
    zip((int(w * 1000) / 1000 for w in importance_sumup), PLS_data.columns),
    reverse=True,
)
feature_importance[:6]
[(0.635, 'disease'),
(0.346, 'region_Daegu'),
(0.346, 'age'),
(0.279, 'group_Cheongdo Daenam Hospital'),
(0.27, 'sex'),
(0.168, 'infection_reason_visit to Wuhan')]
PLS기반 해당 데이터 셋 대표변수 리스트.
'''
disease
기저질환 보유여부
region_Daegu
대구지역 여부
age
나이
group_Cheongdo Daenam Hospital
청도 대남병원 그룹 여부
sex
성별
infection_reason_visit to Wuhan
발병이유가 우한인 경우(우한 방문여부)
'''
변수간 상관계수 파악
pearson_df = scaled_X.copy()
desc = pearson_df.describe()

# Per-column standard deviation, selected by its row label in describe()'s
# output rather than by integer position.
std_s = list(desc.loc['std'])

# Keep the third of the columns with the largest standard deviation.
sorted_list = sorted(zip(std_s, desc.columns), reverse=True)
good_col_set = sorted_list[:len(sorted_list) // 3]
good_cols = [name for _, name in good_col_set]

# Take an explicit copy: a column is added just below, and assigning into a
# bare column selection triggers pandas' SettingWithCopyWarning.
sel_pearson = pearson_df[good_cols].copy()
sel_pearson['state'] = list(y_numeric)

# Pairwise correlation matrix of the selected features and the encoded state.
sel_pearson = sel_pearson.corr()
C:\Users\EDCORE\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Heatmap of the correlation matrix, rows ordered by their correlation with
# 'state' (ascending).
fig = plt.figure(figsize=(10, 5))
state_sorted_corr = sel_pearson.sort_values(by='state', ascending=True)
sns.heatmap(data=state_sorted_corr)
plt.show()
상관계수 기반으로는
disease -> 기저질환 여부
age -> 나이가 많은지 적은지 여부
region_Daegu -> 대구 지역 인지의 여부
정도가 격리 / 치료 / 사망 여부를
설명하는데 가장 크게 기여한 변수입니다.