import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"


Ekc = pd.read_csv('csv/loedata/Ekc.csv')
plt.subplot(1,2,1)
plt.scatter(Ekc.gdppcppp, Ekc.co2pc, facecolors='w', edgecolors='k')
#plt.plot(Ekc.gdppcppp, Ekc.co2pc, 'o', mfc='none')
plt.title("In original scale")
plt.subplot(1,2,2)
plt.scatter(Ekc.gdppcppp, Ekc.co2pc, facecolors='w', edgecolors='k')
plt.xscale('log')
plt.yscale('log')
plt.title('In log scale')
plt.show()


Pubserv = pd.read_csv('csv/loedata/Pubserv.csv')
Pubserv1 = Pubserv[Pubserv['servpc'] < 28].copy()
plt.scatter(Pubserv1.servpc, Pubserv1.finind, color='w', edgecolors='k')
#plt.plot(Pubserv1.servpc, Pubserv1.finind, 'o', mfc='w')
plt.show()


Klips = pd.read_csv('csv/loedata/Klips.csv')
Klips2 = Klips[(Klips['regular']==1) & (Klips['married']==1)].copy()
print(Klips2.shape)
plt.scatter(Klips2.educ, Klips2.labinc, color='w', edgecolors='k')
#plt.plot(Klips2.educ, Klips2.labinc, 'o', mfc='w')
plt.yscale('log')
plt.show()

(501, 8)


Klips2['lnlabinc'] = np.log(Klips2['labinc'])
ave = Klips2[['educ','lnlabinc']].groupby(['educ']).mean(['lnlabinc']).reset_index()
print(ave)

    educ  lnlabinc
0      9  3.145510
1     10  3.239290
2     11  2.837503
3     12  3.262296
4     13  3.223670
5     14  3.342744
6     15  3.223641
7     16  3.476449
8     17  3.312591
9     18  3.693358
10    19  3.696848


plt.plot(Klips2.educ, Klips2.lnlabinc, 'o', mfc='w')
plt.plot(ave['educ'], ave['lnlabinc'], '^', color='black', ms=10)
plt.show()


## https://stackoverflow.com/questions/972/adding-a-method-to-an-existing-object-instance
# We need this to fixate the limits of plots
def fix_lims(self):
    self.xlim(self.gca().get_xlim())
    self.ylim(self.gca().get_ylim())

# Now patch plt
plt.fix_lims = fix_lims.__get__(plt)

# 위 method를 만들고 싶지 않으면 plt.fix_lims()를 모두 다음으로 교체
# plt.xlim(plt.gca().get_xlim())
# plt.ylim(plt.gca().get_ylim())


from statsmodels.api import OLS, QuantReg
fm = 'finind~servpc'
ols = OLS.from_formula(fm, data = Pubserv1).fit()
lad = QuantReg.from_formula(fm, data = Pubserv1).fit(q=0.5)


plt.plot(Pubserv1.servpc, Pubserv1.finind, 'o', mfc='w')
plt.fix_lims()
plt.axline((0, ols.params['Intercept']), slope=ols.params['servpc'])
plt.axline((0, lad.params['Intercept']), slope=lad.params['servpc'], linestyle=':')
plt.show()


ols = OLS.from_formula('np.log(finind)~np.log(servpc)', Pubserv1).fit()
print(ols.params)

Intercept         4.767454
np.log(servpc)   -0.821201
dtype: float64


x = np.log(Pubserv1.servpc)
y = np.log(Pubserv1.finind)
xbar = np.mean(x)
xd = x-xbar
b1 = np.sum(xd*y)/np.sum(xd**2)
b0 = np.mean(y)-b1*xbar
print((b0,b1))

(4.767453582972257, -0.8212005150740734)


plt.scatter(x,y)
plt.fix_lims()
plt.axline((0,b0), slope=b1, color='k', lw=0.5)
plt.show()


ols = OLS.from_formula('np.log(labinc)~educ', data = Klips2).fit()
print(ols.params)

Intercept    2.527273
educ         0.059577
dtype: float64


plt.scatter(Klips2.educ, np.log(Klips2.labinc))
plt.fix_lims()
b0,b1 = ols.params
plt.axline((0,b0), slope = b1)
plt.show()


Death = pd.read_csv('csv/loedata/Death.csv')
Death10 = Death[Death.year==2010].copy()
ols = OLS.from_formula('smoke~aged', data=Death10).fit()
ols.params

Intercept    29.077420
aged         -0.228558
dtype: float64


Death10.plot.scatter(x='aged', y='smoke')
plt.fix_lims()
b0,b1 = ols.params
plt.axline((0,b0), slope=b1)
plt.show()
plt.close()


Consumption = pd.read_csv('csv/Ecdat/Consumption.csv')
ols = OLS.from_formula('ce~yd', data = Consumption).fit()
print(ols.params)

Intercept    6000.256052
yd              0.861788
dtype: float64


Consumption.plot.scatter(x='yd', y='ce')
plt.fix_lims()
b0,b1 = ols.params
plt.axline((0,b0), slope = b1, color='k', lw=0.5)
plt.show()


Galtonpar = pd.read_csv('csv/loedata/Galtonpar.csv')
ols = OLS.from_formula('avgsonht~midparht', data = Galtonpar).fit()
print(ols.params)

Intercept    21.752799
midparht      0.687150
dtype: float64


Galtonpar.plot.scatter(x = 'midparht', y = 'avgsonht')
plt.fix_lims()
b0,b1 = ols.params
plt.axline((0,0), slope = 1, linestyle=':')
plt.axline((0,b0), slope = b1)
plt.show()


hubble = pd.read_csv('csv/gamair/hubble.csv')
print(hubble.head(5))

     Galaxy     y      x
0   NGC0300   133   2.00
1   NGC0925   664   9.16
2  NGC1326A  1794  16.14
3   NGC1365  1594  17.95
4   NGC1425  1473  21.88


ols = OLS.from_formula('y~x-1', data = hubble).fit()
print(ols.params)

x    76.581172
dtype: float64


hubble.plot.scatter(x='x', y='y')
plt.axline((0,0), slope=ols.params['x'])
plt.show()


Cigar = pd.read_csv('csv/Ecdat/Cigar.csv')
Cigar90 = Cigar[Cigar.year==90].copy()
ols = OLS.from_formula('np.log(sales)~np.log(price)', data=Cigar90).fit()
print(ols.params)

Intercept        9.081182
np.log(price)   -0.899229
dtype: float64


ols = OLS.from_formula('np.log(finind)~np.log(servpc)', data = Pubserv1).fit()


b0,b1 = ols.params
plt.scatter(np.log(Pubserv1.servpc), np.log(Pubserv1.finind), zorder=1)
plt.fix_lims()
plt.axline((0,b0), slope=b1, linewidth=1, zorder=2)
plt.scatter(np.log(Pubserv1.servpc), ols.fittedvalues, marker = '^', zorder=3)
plt.show()


print(f"Sum of residuals = {np.sum(ols.resid)}")
print(f"Sum of x*uhat    = {np.sum(ols.resid*np.log(Pubserv1.servpc))}")
print(f"Sum of yhat*uhat = {np.sum(ols.fittedvalues*ols.resid)}")

Sum of residuals = 9.547918011776346e-14
Sum of x*uhat    = 2.5029978090174154e-13
Sum of yhat*uhat = 2.485095462745335e-13


print(ols.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:         np.log(finind)   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.548
Method:                 Least Squares   F-statistic:                     101.5
Date:                Sun, 28 May 2023   Prob (F-statistic):           5.27e-16
Time:                        14:37:29   Log-Likelihood:                -11.195
No. Observations:                  84   AIC:                             26.39
Df Residuals:                      82   BIC:                             31.25
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          4.7675      0.202     23.559      0.000       4.365       5.170
np.log(servpc)    -0.8212      0.081    -10.076      0.000      -0.983      -0.659
==============================================================================
Omnibus:                        2.262   Durbin-Watson:                   1.282
Prob(Omnibus):                  0.323   Jarque-Bera (JB):                2.128
Skew:                          -0.384   Prob(JB):                        0.345
Kurtosis:                       2.866   Cond. No.                         19.1
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


print((ols.ess/ols.centered_tss, ols.rsquared))

(0.5532173833344799, 0.55321738333448)


print((ols.centered_tss, ols.ess + ols.ssr))

(14.37053822736247, 14.37053822736247)


print(OLS.from_formula('co2pc~gdppcppp', data = Ekc).fit().params)

Intercept   -0.034195
gdppcppp     0.000339
dtype: float64


print(OLS.from_formula('co2pc~I(gdppcppp/1000)', data = Ekc).fit().params)

Intercept            -0.034195
I(gdppcppp / 1000)    0.339048
dtype: float64


print(OLS.from_formula('np.log(co2pc)~np.log(gdppcppp)', data = Ekc).fit().params)

Intercept          -10.283620
np.log(gdppcppp)     1.224789
dtype: float64


print(OLS.from_formula('np.log(co2pc)~np.log(gdppcppp/1000)', data = Ekc).fit().params)

Intercept                 -1.823079
np.log(gdppcppp / 1000)    1.224789
dtype: float64

3장 단순회귀 모형의 추정¶

소득과 환경오염¶

공무원수 비율과 재정자립도¶

교육수준과 임금(KLIPS 데이터)¶

재정자립도 회귀: OLS와 LAD¶

공무원수와 재정자립도¶

수동 계산으로 확인¶

데이터와 회귀 직선 그림으로 표시¶

교육수준과 연간 근로소득¶

고령인구비율과 흡연율¶

소비함수¶

유전학과 Galton의 "회귀"¶

우주의 나이¶

담배 가격과 판매량¶

재정자립도 회귀에서 맞춘값 그림¶

잔차합, 설명변수와 잔차의 곱의 합, 맞춘값과 잔차의 곱의 합¶

OLS 회귀결과 요약(R제곱 포함)¶

SSE/SST = R-squared¶

SST = SSE + SSR¶

예제 3.1 단위 변환의 유용성¶

로그와 측정단위 변환¶