Understanding linear regression and gradient descent with a consumer price index dataset
import pandas as pd
import numpy as np
Loading the data
train = pd.read_csv('ca2.csv', encoding='euc-kr')  # the CSV file is EUC-KR encoded
train.head()
| | 소비자물가지수 | 경제활동인구 | KOSPI_평균 | 주택전세가격지수 | 대출평균 |
|---|---|---|---|---|---|
| 0 | 89.97 | 24082.0 | 1682.16 | 69.50 | 5.94 |
| 1 | 90.16 | 24035.0 | 1598.96 | 69.93 | 5.84 |
| 2 | 90.34 | 24382.0 | 1665.50 | 70.41 | 5.69 |
| 3 | 90.70 | 24858.0 | 1730.29 | 70.85 | 5.49 |
| 4 | 90.79 | 25099.0 | 1648.30 | 71.14 | 5.40 |
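Before picking features, a quick sanity check that the file loaded cleanly does no harm; a minimal sketch, assuming the train DataFrame above:

# Confirm the size of the dataset and that no column has missing values
print(train.shape)
print(train.isnull().sum())
print(train.dtypes)  # every column should be numeric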
features = ['경제활동인구', 'KOSPI_평균', '주택전세가격지수', '대출평균']
X = train[features]
Y = train['소비자물가지수']
Using scikit-learn's linear regression model
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(X, Y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
print(regr.coef_, regr.intercept_)
[-1.28958085e-04 1.47624463e-03 4.03464481e-01 8.54998582e-02] 62.29928990923021
regr.score(X, Y)
0.9797840250318295
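regr.score returns the coefficient of determination R², so the four features explain roughly 98% of the variance in the CPI on this training data. As a cross-check, a minimal sketch that recomputes R² from the fitted predictions, assuming the regr object fitted above:

# R^2 = 1 - SS_res / SS_tot, recomputed from the model's own predictions
pred = regr.predict(X)
ss_res = np.sum((Y - pred) ** 2)      # residual sum of squares
ss_tot = np.sum((Y - Y.mean()) ** 2)  # total sum of squares
print(1 - ss_res / ss_tot)            # should match regr.score(X, Y)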
response = train['소비자물가지수']
x1 = train['경제활동인구']
x2 = train['KOSPI_평균']
x3 = train['주택전세가격지수']
x4 = train['대출평균']
Checking the linear regression model's fit
# Rebuild each fitted value by hand from the learned coefficients and intercept
for n, value in enumerate(response):
    result = regr.coef_[0] * x1[n] + regr.coef_[1] * x2[n] + regr.coef_[2] * x3[n] + regr.coef_[3] * x4[n] + regr.intercept_
    print(value, result)
89.97 90.22565154335844
90.16 90.27382876056146
90.34 90.50814759508005
90.7 90.70283383642649
90.79 90.66002735254133
90.7 90.82297220729316
90.88 90.95391181518451
91.34 91.1659921230127
92.07 91.41586636129334
92.07 91.74828024981448
91.61 92.12427800324514
91.98 92.50599857734318
93.07 92.96692755336818
93.71 93.28494791026951
94.07 93.7687178161982
94.16 94.31769105888966
94.35 94.50797561512923
94.53 94.63299615495845
94.98 95.01899315161883
95.62 94.98184414214391
95.53 95.3384342758176
95.35 95.64647517287915
95.44 95.885428050088
95.8 95.96472398841922
96.18 96.1370542260972
96.55 96.3900024589688
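The loop above rebuilds each fitted value term by term from the coefficients; the same numbers come out of a single vectorized call, a sketch using the fitted regr object:

# Equivalent vectorized form of the manual loop above
fitted = regr.predict(X)
for actual, pred in zip(response, fitted):
    print(actual, pred)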
Preprocessing the data for gradient descent
x1 = x1 - np.min(x1)
x2 = x2 - np.min(x2)
x3 = x3 - np.min(x3)
x4 = x4 - np.min(x4)
x1 = x1 / np.max(x1)
x2 = x2 / np.max(x2)
x3 = x3 / np.max(x3)
x4 = x4 / np.max(x4)
Each of the four features is now scaled to the range 0 to 1 (min-max normalization); the check below confirms this, while the response keeps its original scale.
target = [x1, x2, x3, x4, response]
for item in target:
    print(np.min(item), np.max(item))
0.0 1.0
0.0 1.0
0.0 1.0
0.0 1.0
89.97 96.55
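The same rescaling is available as a scikit-learn transformer; a minimal sketch, applied to the same four feature columns:

from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler computes (x - min) / (max - min) per column, matching the manual code above
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(train[features])
print(X_scaled.min(axis=0), X_scaled.max(axis=0))  # every column now spans [0, 1]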
Running gradient descent
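The updates in the code below are plain gradient descent on half the mean squared error. With prediction $\hat{y}_i = w_1 x_{1,i} + w_2 x_{2,i} + w_3 x_{3,i} + w_4 x_{4,i} + b$ and cost

$$J = \frac{1}{2n} \sum_{i=1}^{n} (\hat{y}_i - y_i)^2,$$

the gradients are

$$\frac{\partial J}{\partial w_j} = \frac{1}{n} \sum_{i=1}^{n} (\hat{y}_i - y_i)\, x_{j,i}, \qquad \frac{\partial J}{\partial b} = \frac{1}{n} \sum_{i=1}^{n} (\hat{y}_i - y_i),$$

which are exactly the np.mean((predict - response) * xj) and np.mean(predict - response) terms in the update rules.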
from random import randint
# Random initial weights in (0, 1]; the bias intentionally starts far from the answer
w1, w2, w3, w4 = (randint(1, 100) / 100), (randint(1, 100) / 100), (randint(1, 100) / 100), (randint(1, 100) / 100)
b = 100
LR = 0.5    # learning rate, halved every 100 epochs
epoch = 0
prev_0 = 0  # error from the previous epoch
prev_1 = 0  # error from the current epoch
error = 0
while True:
    prev_0 = error
    epoch = epoch + 1
    predict = (w1 * x1) + (w2 * x2) + (w3 * x3) + (w4 * x4) + b
    error = np.mean(predict - response)
    # Gradient step: each weight moves along the mean of (error * its own feature)
    w1 = w1 - LR * np.mean((predict - response) * x1)
    w2 = w2 - LR * np.mean((predict - response) * x2)
    w3 = w3 - LR * np.mean((predict - response) * x3)
    w4 = w4 - LR * np.mean((predict - response) * x4)
    b = b - LR * np.mean(predict - response)
    prev_1 = error
    if epoch % 100 == 0:
        LR = LR * 0.5
        print(epoch, error)
    # Stop when the mean error is essentially zero or has stopped changing between epochs
    if np.abs(error) < 10 ** (-20) or (prev_1 - prev_0) ** 2 < 10 ** (-20):
        print(w1, w2, w3, w4, b)
        break
100 0.003070218358805422
200 -2.563210229092979e-05
300 -0.00015078427737664086
400 -0.0001579533846739222
500 -0.000155979069148224
600 -0.0001541265900613098
700 -0.00015303133738615754
800 -0.00015244649873015812
900 -0.00015214535848429023
1000 -0.00015199267799586707
1100 -0.00015191581882043273
1200 -0.00015187726051653255
1300 -0.00015185794936365063
-0.1934569723304091 0.827575461298807 5.791374332907708 0.06331772154680886 90.04836228527031
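The same procedure can be written more compactly with the weights in a single NumPy array; a sketch under the same setup (normalized features, learning rate halved every 100 epochs, a fixed 1300 epochs to mirror the run above), with illustrative variable names:

# Matrix form of the same update: w_j -= LR * mean(err * x_j), b -= LR * mean(err)
Xn = np.column_stack([x1, x2, x3, x4])  # normalized features, shape (n_samples, 4)
y = response.values
w = np.random.rand(4)                   # random initial weights in [0, 1)
b, LR = 100.0, 0.5
for epoch in range(1, 1301):
    err = Xn @ w + b - y                # per-sample prediction error
    w -= LR * (Xn.T @ err) / len(y)     # equals np.mean(err * x_j) for each feature
    b -= LR * err.mean()
    if epoch % 100 == 0:
        LR *= 0.5
print(w, b)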
Checking the gradient descent model's predictions
# Truncate each prediction to two decimal places for easier comparison with the actual CPI
for i, value in enumerate(response):
    result = w1 * x1[i] + w2 * x2[i] + w3 * x3[i] + w4 * x4[i] + b
    print(value, int(result * 100) / 100)
89.97 90.23
90.16 90.27
90.34 90.5
90.7 90.7
90.79 90.65
90.7 90.82
90.88 90.95
91.34 91.16
92.07 91.41
92.07 91.74
91.61 92.12
91.98 92.5
93.07 92.96
93.71 93.28
94.07 93.77
94.16 94.32
94.35 94.51
94.53 94.63
94.98 95.02
95.62 94.98
95.53 95.33
95.35 95.64
95.44 95.88
95.8 95.95
96.18 96.13
96.55 96.38
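As a final comparison, the mean squared error of both fits can be computed side by side; a sketch assuming regr, w1..w4, and b from the cells above are still in scope:

# Training MSE of the sklearn fit vs. the hand-rolled gradient descent fit
sk_pred = regr.predict(X)                            # uses the original-scale features
gd_pred = w1 * x1 + w2 * x2 + w3 * x3 + w4 * x4 + b  # uses the normalized features
print(np.mean((response - sk_pred) ** 2))
print(np.mean((response - gd_pred) ** 2))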