In [1]:
import pandas as pd
from numpy import log
from statsmodels.formula.api import ols as OLS
Housing = pd.read_csv('csv/Ecdat/Housing.csv')
#https://stackoverflow.com/questions/30553838/getting-statsmodels-to-use-heteroskedasticity-corrected-standard-errors-in-coeff
model = OLS('log(price)~log(lotsize)+bedrooms+bathrms', data=Housing)
ols = model.fit(cov_type='HC0')  # heteroskedasticity-robust (HC0) standard errors
In [2]:
print(ols.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             log(price)   R-squared:                       0.504
Model:                            OLS   Adj. R-squared:                  0.501
Method:                 Least Squares   F-statistic:                     181.4
Date:                Sun, 31 Dec 2023   Prob (F-statistic):           2.00e-81
Time:                        11:21:28   Log-Likelihood:                -43.015
No. Observations:                 546   AIC:                             94.03
Df Residuals:                     542   BIC:                             111.2
Df Model:                           3
Covariance Type:                  HC0
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        6.6222      0.248     26.708      0.000       6.136       7.108
log(lotsize)     0.4568      0.030     15.172      0.000       0.398       0.516
bedrooms         0.0892      0.018      4.999      0.000       0.054       0.124
bathrms          0.2368      0.026      8.971      0.000       0.185       0.288
==============================================================================
Omnibus:                        4.038   Durbin-Watson:                   1.324
Prob(Omnibus):                  0.133   Jarque-Bera (JB):                3.887
Skew:                          -0.203   Prob(JB):                        0.143
Kurtosis:                       3.074   Cond. No.                         197.
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC0)
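The HC0 standard errors printed above are the "sandwich" estimates $\sqrt{\mathrm{diag}\{(X'X)^{-1}X'\,\mathrm{diag}(\hat u_i^2)\,X\,(X'X)^{-1}\}}$. A minimal sketch of this computation, assuming the design matrix is taken from model.exog:

import numpy as np
X = model.exog                    # design matrix built from the formula
uhat = ols.resid.values           # OLS residuals (the same for any cov_type)
XtX_inv = np.linalg.inv(X.T @ X)
V_hc0 = XtX_inv @ (X.T * uhat**2) @ X @ XtX_inv
print(np.sqrt(np.diag(V_hc0)))    # should match the HC0 'std err' column above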
Example 13.4 Comparison of conventional standard errors with various robust standard errors
In [3]:
ols = model.fit()  # refit with the conventional (nonrobust) covariance
pd.DataFrame({'ord': ols.bse, 'hc0': ols.HC0_se, 'hc1': ols.HC1_se, 'hc2': ols.HC2_se, 'hc3': ols.HC3_se})
Out[3]:
|              |      ord |      hc0 |      hc1 |      hc2 |      hc3 |
|--------------|----------|----------|----------|----------|----------|
| Intercept    | 0.241234 | 0.247946 | 0.248859 | 0.249356 | 0.250780 |
| log(lotsize) | 0.028983 | 0.030110 | 0.030221 | 0.030286 | 0.030464 |
| bedrooms     | 0.016513 | 0.017850 | 0.017916 | 0.017985 | 0.018122 |
| bathrms      | 0.024479 | 0.026392 | 0.026490 | 0.026611 | 0.026834 |
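The hc1 column is only a degrees-of-freedom rescaling of hc0: the HC1 variance multiplies the HC0 variance by $n/(n-k)$, so the standard errors differ by the factor $\sqrt{n/(n-k)}$. A quick check (here $n=546$ and $k=4$):

import numpy as np
n, k = ols.nobs, ols.df_model + 1           # 546 observations, 4 estimated coefficients
print(ols.HC0_se * np.sqrt(n / (n - k)))    # should reproduce the hc1 column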
Example 13.5 Death rates by region
In [4]:
import pandas as pd
from statsmodels.formula.api import ols as OLS
Death = pd.read_csv('csv/loedata/Death.csv')
model = OLS('deathrate~drink+smoke+aged+vehipc+C(year)', data=Death)  # C(year): year entered as a categorical variable (year dummies)
In [5]:
print(model.fit().summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     487.3
Date:                Sun, 31 Dec 2023   Prob (F-statistic):          3.42e-135
Time:                        11:21:28   Log-Likelihood:                -236.68
No. Observations:                 258   AIC:                             487.4
Df Residuals:                     251   BIC:                             512.2
Df Model:                           6
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.2241      0.769     -0.291      0.771      -1.739       1.291
C(year)[T.2009]    -0.3788      0.098     -3.867      0.000      -0.572      -0.186
C(year)[T.2010]    -0.3510      0.102     -3.457      0.001      -0.551      -0.151
drink               0.0064      0.011      0.594      0.553      -0.015       0.028
smoke               0.0333      0.018      1.873      0.062      -0.002       0.068
aged                0.4027      0.010     38.401      0.000       0.382       0.423
vehipc              1.4079      1.163      1.211      0.227      -0.882       3.698
==============================================================================
Omnibus:                        2.638   Durbin-Watson:                   1.529
Prob(Omnibus):                  0.267   Jarque-Bera (JB):                2.504
Skew:                           0.117   Prob(JB):                        0.286
Kurtosis:                       3.422   Cond. No.                     1.91e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.91e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Note [2] above concerns the condition number. It equals the square root of the ratio of the largest to the smallest eigenvalue of $X'X$. It sometimes serves as an indicator of near multicollinearity, but there is no need to worry much about it here.
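This definition is easy to verify numerically; the following sketch uses the design matrix stored in model.exog and should give a value close to the 1.91e+03 reported in the summary.

import numpy as np
X = model.exog                         # design matrix of the death-rate model
eig = np.linalg.eigvalsh(X.T @ X)      # eigenvalues of X'X
print(np.sqrt(eig.max() / eig.min()))  # condition number, roughly 1.91e+03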
In [6]:
print(model.fit(cov_type='HC3').summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     650.1
Date:                Sun, 31 Dec 2023   Prob (F-statistic):          8.45e-150
Time:                        11:21:28   Log-Likelihood:                -236.68
No. Observations:                 258   AIC:                             487.4
Df Residuals:                     251   BIC:                             512.2
Df Model:                           6
Covariance Type:                  HC3
===================================================================================
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.2241      0.785     -0.285      0.775      -1.763       1.315
C(year)[T.2009]    -0.3788      0.095     -3.996      0.000      -0.565      -0.193
C(year)[T.2010]    -0.3510      0.104     -3.371      0.001      -0.555      -0.147
drink               0.0064      0.011      0.565      0.572      -0.016       0.029
smoke               0.0333      0.019      1.770      0.077      -0.004       0.070
aged                0.4027      0.010     39.371      0.000       0.383       0.423
vehipc              1.4079      1.295      1.087      0.277      -1.130       3.946
==============================================================================
Omnibus:                        2.638   Durbin-Watson:                   1.529
Prob(Omnibus):                  0.267   Jarque-Bera (JB):                2.504
Skew:                           0.117   Prob(JB):                        0.286
Kurtosis:                       3.422   Cond. No.                     1.91e+03
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC3)
[2] The condition number is large, 1.91e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Example 13.6 F tests in the death-rate analysis
In [7]:
# Continue
hypo = 'drink=0, smoke=0'
rego = model.fit()
print(rego.f_test(hypo))
regh = model.fit(cov_type='HC3')
print(regh.f_test(hypo))
<F test: F=3.2488401914796645, p=0.04045761180359891, df_denom=251, df_num=2>
<F test: F=2.9862362091563925, p=0.052273714967270596, df_denom=251, df_num=2>
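With a robust covariance matrix this joint test is a Wald test; f_test simply divides the Wald statistic by the number of restrictions and refers it to an F distribution. If the chi-square form is wanted instead, it should be available through the generic wald_test interface (a sketch):

print(regh.wald_test(hypo, use_f=False))   # chi-square version with 2 restrictions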
Example 13.8 WLS estimation of the death-rate model
In [8]:
import pandas as pd
import statsmodels.api as sm
Death = pd.read_csv('csv/loedata/Death.csv')
fm = 'deathrate~drink+smoke+aged+vehipc+C(year)'
wls = sm.WLS.from_formula(fm, data=Death, weights=Death.regpop).fit()
print(wls.summary())
                            WLS Regression Results                            
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.941
Model:                            WLS   Adj. R-squared:                  0.939
Method:                 Least Squares   F-statistic:                     663.9
Date:                Sun, 31 Dec 2023   Prob (F-statistic):          7.08e-151
Time:                        11:21:28   Log-Likelihood:                -246.89
No. Observations:                 258   AIC:                             507.8
Df Residuals:                     251   BIC:                             532.6
Df Model:                           6
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.5815      0.764     -0.761      0.448      -2.087       0.924
C(year)[T.2009]    -0.2963      0.096     -3.093      0.002      -0.485      -0.108
C(year)[T.2010]    -0.2977      0.099     -3.008      0.003      -0.493      -0.103
drink               0.0166      0.011      1.575      0.117      -0.004       0.037
smoke               0.0327      0.018      1.830      0.068      -0.002       0.068
aged                0.4105      0.010     42.003      0.000       0.391       0.430
vehipc              0.5491      1.185      0.464      0.643      -1.784       2.882
==============================================================================
Omnibus:                       13.113   Durbin-Watson:                   1.452
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               30.102
Skew:                          -0.119   Prob(JB):                     2.91e-07
Kurtosis:                       4.656   Cond. No.                     2.02e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.02e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
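What WLS does here can be seen directly: with weights $w_i$ it minimizes $\sum_i w_i \hat u_i^2$, which is the same as running OLS after multiplying every observation (regressand and regressors alike) by $\sqrt{w_i}$. A minimal check, reusing the arrays stored in the WLS model object:

import numpy as np
wls_model = sm.WLS.from_formula(fm, data=Death, weights=Death.regpop)
w = np.sqrt(np.asarray(Death.regpop))
check = sm.OLS(w * wls_model.endog, wls_model.exog * w[:, None]).fit()
print(check.params)    # should equal the WLS coefficients reported above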
Example 13.9 An example of FGLS
In [9]:
from statsmodels.formula.api import ols as OLS, wls as WLS
from numpy import log, exp
# Continue
fm = 'deathrate~drink+smoke+aged+vehipc+C(year)'
# Step 1: OLS and its residuals
ols = OLS(fm, data=Death).fit()
Death['u'] = ols.resid
# Step 2: auxiliary regression of log(u^2) on the same regressors
fm_aux = fm.replace('deathrate', 'log(u**2)')
aux = OLS(fm_aux, data=Death).fit()
# Step 3: estimated variance function h = exp(fitted values)
h = exp(aux.fittedvalues)
# Step 4: WLS with weights 1/h
fgls = WLS(fm, data=Death, weights=1/h).fit()
print(fgls.summary())
                            WLS Regression Results                            
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.936
Model:                            WLS   Adj. R-squared:                  0.934
Method:                 Least Squares   F-statistic:                     608.2
Date:                Sun, 31 Dec 2023   Prob (F-statistic):          2.13e-146
Time:                        11:21:28   Log-Likelihood:                -232.03
No. Observations:                 258   AIC:                             478.1
Df Residuals:                     251   BIC:                             502.9
Df Model:                           6
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.2457      0.773     -0.318      0.751      -1.769       1.278
C(year)[T.2009]    -0.3521      0.090     -3.914      0.000      -0.529      -0.175
C(year)[T.2010]    -0.2974      0.097     -3.056      0.002      -0.489      -0.106
drink               0.0087      0.011      0.817      0.414      -0.012       0.030
smoke               0.0363      0.017      2.187      0.030       0.004       0.069
aged                0.4050      0.010     40.215      0.000       0.385       0.425
vehipc              0.7946      1.113      0.714      0.476      -1.397       2.987
==============================================================================
Omnibus:                        0.763   Durbin-Watson:                   1.505
Prob(Omnibus):                  0.683   Jarque-Bera (JB):                0.554
Skew:                           0.101   Prob(JB):                        0.758
Kurtosis:                       3.103   Cond. No.                     1.99e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.99e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
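Because h is only an estimate of the variance function, the FGLS weights need not remove all of the heteroskedasticity. If one wants to be careful, the WLS step can be combined with robust standard errors exactly as before; a minimal sketch:

fgls_r = WLS(fm, data=Death, weights=1/h).fit(cov_type='HC3')   # FGLS with HC3-robust SEs
print(fgls_r.bse)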
Testing for the presence of heteroskedasticity
In [10]:
import pandas as pd
import numpy as np
# Generate data
np.random.seed(101)
n = 50
x1 = np.random.normal(size=n)
x2 = np.random.normal(size=n)
u = x1 * np.random.normal(size=n)  # error scales with x1, so Var(u|x1) is not constant
y = 1 + x1 - x2 + u
DF = pd.DataFrame({'y': y, 'x1': x1, 'x2': x2})
In [11]:
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan as bptest
uhat = sm.OLS.from_formula('y~x1+x2', data=DF).fit().resid
bp = bptest(uhat, sm.add_constant(pd.DataFrame({"x1":x1, "x1sq":x1**2})))
print(bp) # (lm, lm_pvalue, fvalue, f_pvalue)
(25.43047687088538, 3.00498363642047e-06, 24.32347601234627, 5.604385363360125e-08)
The first number is the LM test statistic and the second is its p-value; the third number is the F test statistic and the fourth is the corresponding p-value.
In [12]:
DF['u2'] = uhat**2
aux = sm.OLS.from_formula('u2~x1+I(x1**2)', data=DF).fit()
print(aux.nobs*aux.rsquared)
25.43047687088538
This LM test statistic equals the value obtained above.
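The F version reported by het_breuschpagan is, in turn, just the overall F statistic of this auxiliary regression, so it can be read off the same fit; the values should reproduce the third and fourth numbers above.

print(aux.fvalue, aux.f_pvalue)   # about 24.32 and 5.60e-08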