# 아래 데이터셋은 Kaggle에서 제공합니다.

patient.csv

0.34MB

from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time

# Raise pandas' display limits (row count, column count, line width).
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# List the CSV files in the current working directory.
glob('*.csv')

['case.csv', 'patient.csv', 'route.csv', 'time.csv', 'trend.csv']

# Load the patient table from the working directory and preview its first rows.
PATIENT_CSV = 'patient.csv'
patient = pd.read_csv(PATIENT_CSV)

patient.head()

	patient_id	sex	birth_year	country	region	disease	group	infection_reason	infection_order	infected_by	contact_number	confirmed_date	released_date	deceased_date	state
0	1	female	1984.0	China	filtered at airport	NaN	NaN	visit to Wuhan	1.0	NaN	45.0	2020-01-20	2020-02-06	NaN	released
1	2	male	1964.0	Korea	filtered at airport	NaN	NaN	visit to Wuhan	1.0	NaN	75.0	2020-01-24	2020-02-05	NaN	released
2	3	male	1966.0	Korea	capital area	NaN	NaN	visit to Wuhan	1.0	NaN	16.0	2020-01-26	2020-02-12	NaN	released
3	4	male	1964.0	Korea	capital area	NaN	NaN	visit to Wuhan	1.0	NaN	95.0	2020-01-27	2020-02-09	NaN	released
4	5	male	1987.0	Korea	capital area	NaN	NaN	visit to Wuhan	1.0	NaN	31.0	2020-01-30	2020-03-02	NaN	released

# Encode sex as 0 (female) / 1 (male) and derive age relative to the year 2020.
patient['sex'] = patient['sex'].map({'female': 0, 'male': 1})
patient['age'] = 2020 - patient['birth_year']

# Column roles: the label, the categorical inputs and the numeric inputs.
label = 'state'
str_cols = ['country', 'region', 'group', 'infection_reason']
num_cols = ['sex', 'disease', 'infection_order', 'infected_by', 'contact_number', 'age']

# Categorical columns are one-hot encoded; missing numeric values become 0.
str_dset = pd.get_dummies(patient[str_cols])
num_dset = patient[num_cols].fillna(0)

# Numeric columns first, then the dummies, with the label as the last column.
all_df = pd.concat([num_dset, str_dset, patient[label]], axis=1)
all_df.head()

	sex	infection_order	contact_number	age	country_China	country_Korea	region_capital area	region_filtered at airport	infection_reason_visit to Wuhan	state
0	0.0	1.0	45.0	36.0	1	0	0	1	1	released
1	1.0	1.0	75.0	56.0	0	1	0	1	1	released
2	1.0	1.0	16.0	54.0	0	1	1	0	1	released
3	1.0	1.0	95.0	56.0	0	1	1	0	1	released
4	1.0	1.0	31.0	33.0	0	1	1	0	1	released

released / deceased / isolated 상태 간 관계 (rel_des_iso relation)

1. PCA approach

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Work on a copy so the experiments below never mutate all_df.
rdi_df = all_df.copy()
scaler = MinMaxScaler()

# Split inputs from the label; 'state' is the last column of all_df.
# fillna() returns a new frame, so its result must be kept -- the original
# bare `rdi_df.fillna(0)` call discarded it and had no effect.
rdi_X = rdi_df[list(rdi_df.columns)[:-1]].fillna(0)
rdi_y = rdi_df['state']

# Rescale every input column with MinMaxScaler, keeping the column names.
scaled_X = pd.DataFrame(scaler.fit_transform(rdi_X), columns=rdi_X.columns)

pca = PCA(n_components=2)

# The numerically encoded state is appended as one more PCA input column,
# so its loading can be read next to the feature loadings afterwards.
y_numeric = rdi_y.map({"isolated": 0, "released" : 1, 'deceased' : 2})
PCA_data = scaled_X.copy()
PCA_data['state'] = list(y_numeric)
principalComponents = pca.fit_transform(PCA_data)

principalDf = pd.DataFrame(principalComponents
             , columns = ['principal component 1', 'principal component 2'])

# Re-attach the original string labels so the scatter plot can colour by state.
labeled_Df = pd.concat([principalDf, rdi_y], axis=1)

ax = sns.scatterplot(x = 'principal component 1',
                     y = 'principal component 2',
                     hue = 'state',
                     data = labeled_Df,
                     palette ='Spectral')

주성분분석 시행

def _truncate3(value):
    """Cut a number down to three decimal places (truncating toward zero)."""
    return int(value * 1000) / 1000


# (loading, column) pairs per component, sorted from the largest signed
# loading downwards.
# NOTE(review): the ranking uses the signed loading, so strongly negative
# loadings end up last -- confirm whether absolute values were intended.
PC1_feature_importance = sorted(
    ((_truncate3(loading), column)
     for loading, column in zip(pca.components_[0], PCA_data.columns)),
    reverse=True,
)

PC2_feature_importance = sorted(
    ((_truncate3(loading), column)
     for loading, column in zip(pca.components_[1], PCA_data.columns)),
    reverse=True,
)

# Collect the column names of the top three entries of each component.
all_set = set()
for pc1_entry, pc2_entry in zip(PC1_feature_importance[:3],
                                PC2_feature_importance[:3]):
    all_set.add(pc1_entry[1])
    all_set.add(pc2_entry[1])
print(all_set)

{'age', 'state', 'region_capital area', 'sex', 'disease', 'region_Gyeongsangbuk-do'}

PCA기반 해당 데이터 셋 대표변수 리스트.

'''
age
    나이
state
    격리 / 치료 / 사망
region_capital area
    수도권, 도심지 여부
sex
    성별
disease
    기저질환 보유여부
region_Gyeongsangbuk-do
    경북지역 여부    
'''

2. PLS approach

from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=2)

# Fit a two-component PLS regression of the numerically encoded state on the
# scaled inputs (unlike the PCA step, 'state' is the target here, not an input).
y_numeric = rdi_y.map({"isolated": 0, "released" : 1, 'deceased' : 2})
PLS_data = scaled_X.copy()
pls.fit(PLS_data, y_numeric)

# Per-sample scores, joined with the encoded state (Series name 'state') so the
# plots can colour by it. pandas is imported as `pd`; the original called an
# undefined `pp` alias here, which raises NameError.
x_scores = pd.DataFrame(pls.x_scores_, columns=['x_scores_PC1', 'x_scores_PC2'])
y_scores = pd.DataFrame(pls.y_scores_, columns=['y_scores_PC1', 'y_scores_PC2'])
xy_scores_l = pd.concat([x_scores, y_scores, y_numeric], axis=1)

# Per-feature loadings and weights, one row per column of PLS_data.
x_loadings = pd.DataFrame(pls.x_loadings_, columns=['x_loadings_PC1', 'x_loadings_PC2'])
x_weights = pd.DataFrame(pls.x_weights_, columns=['x_weights_PC1', 'x_weights_PC2'])

x_loading_weight = pd.concat([x_loadings, x_weights], axis=1)

# Three side-by-side views of the PLS scores, coloured by state:
# X scores (1 vs 2), X score 1 vs Y score 1, and Y scores (1 vs 2).
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
score_axes = (
    ('x_scores_PC1', 'x_scores_PC2'),
    ('x_scores_PC1', 'y_scores_PC1'),
    ('y_scores_PC1', 'y_scores_PC2'),
)
for panel, (x_col, y_col) in zip(axes, score_axes):
    sns.scatterplot(x=x_col, y=y_col, data=xy_scores_l,
                    hue='state', palette='Spectral', ax=panel)

plt.show()

# Three side-by-side views of the per-feature PLS loadings and weights:
# loadings (1 vs 2), weights (1 vs 2), and loading 1 vs weight 1.
# No `hue` is used here, so `palette` is omitted: seaborn ignores a palette
# without a hue variable (and newer versions emit a warning about it).
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
sns.scatterplot(x = 'x_loadings_PC1', y = 'x_loadings_PC2',
                data = x_loading_weight,
                ax=axes[0])

sns.scatterplot(x = 'x_weights_PC1', y = 'x_weights_PC2',
                data = x_loading_weight,
                ax=axes[1])

sns.scatterplot(x = 'x_loadings_PC1', y = 'x_weights_PC1',
                data = x_loading_weight,
                ax=axes[2])
plt.show()

# Component-1 weights followed by component-2 weights, stacked end to end.
importance_sumup = pd.concat([x_loading_weight['x_weights_PC1'],
                              x_loading_weight['x_weights_PC2']])

# Pair each weight (truncated to three decimals) with its feature name and sort
# from the largest signed weight downwards.
# The names must come from PLS_data, the frame the model was fitted on.
# PCA_data carries an extra 'state' column that was never a PLS input, which
# made the original emit one spurious pair labelling a component-2 weight
# as 'state'.
# NOTE(review): zip() stops after the last feature name, so only the
# component-1 half of importance_sumup is actually ranked -- confirm whether
# the component-2 weights were meant to contribute as well.
feature_importance = sorted(zip(map(
                                    lambda x : int(x * 1000) / 1000,
                                    importance_sumup),
                                PLS_data.columns),
                            reverse=True)

feature_importance[:6]

[(0.635, 'disease'),
 (0.346, 'region_Daegu'),
 (0.346, 'age'),
 (0.279, 'group_Cheongdo Daenam Hospital'),
 (0.27, 'sex'),
 (0.168, 'infection_reason_visit to Wuhan')]

PLS기반 해당 데이터 셋 대표변수 리스트.

'''
disease
    기저질환 보유여부
region_Daegu
    대구지역 여부    
age
    나이
group_Cheongdo Daenam Hospital
    청도 대남병원 그룹 여부
sex
    성별
infection_reason_visit to Wuhan
    발병이유가 우한인 경우(우한 방문여부)
'''

변수간 상관계수 파악

# Keep the third of the scaled features with the largest standard deviation,
# then compute their pairwise correlations together with the encoded state.
pearson_df = scaled_X.copy()
desc = pearson_df.describe()

# Read the 'std' row by label. The original used desc[col][2], which depends
# on positional fallback indexing of a label-indexed Series.
std_s = list(desc.loc['std'])
sorted_list = sorted(zip(std_s, desc.columns), reverse=True)
good_col_set = sorted_list[:len(sorted_list) // 3]
good_cols = [ll[1] for ll in good_col_set]

# Take an explicit copy before adding a column: assigning into the bare
# selection is what raises pandas' SettingWithCopyWarning.
sel_pearson = pearson_df[good_cols].copy()
sel_pearson['state'] = list(y_numeric)
sel_pearson = sel_pearson.corr()

C:\Users\EDCORE\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# Heatmap of the correlation matrix, rows ordered by their correlation with
# 'state' (ascending).
fig = plt.figure(figsize=(10, 5))
ordered_corr = sel_pearson.sort_values(by='state', ascending=True)
sns.heatmap(ordered_corr)
plt.show()

상관계수 기반으로는

disease -> 기저질환 여부

age -> 나이가 많은지 적은지 여부

region_Daegu -> 대구 지역인지 여부

정도가 격리 / 치료 / 사망 여부를

설명하는 데 가장 크게 기여한 변수입니다.

'[중급] 가볍게 이것저것' 카테고리의 다른 글

컴퓨터를 잘 사용하는 EZ한 방법[CPU, 메모리 점유율 100% 달성] (0)	2022.06.27
비트코인 알고리즘 직접 구현해보기 (0)	2020.04.02
월마트 맥주와 기저귀 썰에 대한 부분. (0)	2020.03.25
[R]소득수준 / 소비수준 / 나이 / 성별을 기반으로 고객군 군집화 분석 예제 (0)	2019.11.11
고객 장바구니 분석 level_2 (0)	2019.11.10

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

PassionPython

데이터로 알아보는 코로나를 대표하는 키워드!

rel_des_iso relation

1. PCA approach

주성분분석 시행

PCA기반 해당 데이터 셋 대표변수 리스트.

2. PLS approach

PLS기반 해당 데이터 셋 대표변수 리스트.

변수간 상관계수 파악

상관계수 기반으로는

disease -> 기저질환 여부

age -> 나이가 많은지 적은지 여부

region_Daegu -> 대구 지역 인지의 여부

정도가 격리 / 치료 / 사망 여부를

설명하는데 가장 크게 기여한 변수입니다.

'[중급] 가볍게 이것저것' 카테고리의 다른 글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

데이터로 알아보는 코로나를 대표하는 키워드!

rel_des_iso relation

1. PCA approach

주성분분석 시행

PCA기반 해당 데이터 셋 대표변수 리스트.

2. PLS approach

PLS기반 해당 데이터 셋 대표변수 리스트.

변수간 상관계수 파악

상관계수 기반으로는

disease -> 기저질환 여부

age -> 나이가 많은지 적은지 여부

region_Daegu -> 대구 지역 인지의 여부

정도가 격리 / 치료 / 사망 여부를

설명하는데 가장 크게 기여한 변수입니다.

'[중급] 가볍게 이것저것' 카테고리의 다른 글

'[중급] 가볍게 이것저것' Related Articles

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역