import pandas as pd
from numpy import log # log instead of np.log
from statsmodels.api import OLS
Regko = pd.read_csv('csv/loedata/Regko.csv')
# Log-log regression of the divorce rate on regional characteristics,
# restricted to the type == 2 subsample.
fm = 'log(divorce)~log(regpop)+log(drink)+log(hdrink) + log(smoke)+log(grdp/regpop) + log(aged)'
type2 = Regko[Regko['type'] == 2]
res = OLS.from_formula(fm, data=type2).fit()
print(res.summary())
OLS Regression Results ============================================================================== Dep. Variable: log(divorce) R-squared: 0.485 Model: OLS Adj. R-squared: 0.444 Method: Least Squares F-statistic: 11.76 Date: Fri, 16 Dec 2022 Prob (F-statistic): 3.00e-09 Time: 10:50:59 Log-Likelihood: 75.101 No. Observations: 82 AIC: -136.2 Df Residuals: 75 BIC: -119.4 Df Model: 6 Covariance Type: nonrobust ====================================================================================== coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------------- Intercept 0.1727 0.990 0.175 0.862 -1.799 2.144 log(regpop) -0.0126 0.026 -0.487 0.628 -0.064 0.039 log(drink) -0.0731 0.195 -0.374 0.709 -0.463 0.316 log(hdrink) 0.2494 0.075 3.321 0.001 0.100 0.399 log(smoke) 0.1834 0.160 1.148 0.255 -0.135 0.502 log(grdp / regpop) 0.0953 0.040 2.404 0.019 0.016 0.174 log(aged) -0.1969 0.062 -3.150 0.002 -0.321 -0.072 ============================================================================== Omnibus: 0.191 Durbin-Watson: 1.805 Prob(Omnibus): 0.909 Jarque-Bera (JB): 0.335 Skew: -0.097 Prob(JB): 0.846 Kurtosis: 2.754 Cond. No. 1.18e+03 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.18e+03. This might indicate that there are strong multicollinearity or other numerical problems.
주석 [2]는 조건수(condition number)가 1.18e+03으로 커서 강한 다중공선성이나 기타 수치적 문제가 있을 수 있다는 경고이다.
import pandas as pd
import numpy as np
from statsmodels.api import OLS
# Load the twoyear data and take a first look.
twoyear = pd.read_csv('csv/wooldridge/twoyear.csv')
n_obs = len(twoyear)
print(n_obs)
twoyear.head()
6763
female | phsrank | BA | AA | black | hispanic | id | exper | jc | univ | ... | medcity | submed | lgcity | sublg | vlgcity | subvlg | ne | nc | south | totcoll | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 65 | 0 | 0 | 0 | 0 | 19 | 161 | 0.000000 | 0.000000 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0.000000 |
1 | 1 | 97 | 0 | 0 | 0 | 0 | 93 | 119 | 0.000000 | 7.033333 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 7.033333 |
2 | 1 | 44 | 0 | 0 | 0 | 0 | 96 | 81 | 0.000000 | 0.000000 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0.000000 |
3 | 1 | 34 | 0 | 0 | 0 | 1 | 119 | 39 | 0.266667 | 0.000000 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.266667 |
4 | 1 | 80 | 0 | 0 | 0 | 0 | 132 | 141 | 0.000000 | 0.000000 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.000000 |
5 rows × 23 columns
# Baseline wage regression with junior-college and university years
# entered as separate regressors.
model0 = OLS.from_formula('lwage~jc+univ+exper', data=twoyear)
reg0 = model0.fit()
print(reg0.summary())
OLS Regression Results ============================================================================== Dep. Variable: lwage R-squared: 0.222 Model: OLS Adj. R-squared: 0.222 Method: Least Squares F-statistic: 644.5 Date: Fri, 16 Dec 2022 Prob (F-statistic): 0.00 Time: 10:50:59 Log-Likelihood: -3888.7 No. Observations: 6763 AIC: 7785. Df Residuals: 6759 BIC: 7813. Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.4723 0.021 69.910 0.000 1.431 1.514 jc 0.0667 0.007 9.767 0.000 0.053 0.080 univ 0.0769 0.002 33.298 0.000 0.072 0.081 exper 0.0049 0.000 31.397 0.000 0.005 0.005 ============================================================================== Omnibus: 81.514 Durbin-Watson: 1.968 Prob(Omnibus): 0.000 Jarque-Bera (JB): 142.465 Skew: -0.036 Prob(JB): 1.16e-31 Kurtosis: 3.707 Cond. No. 512. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Reparameterize with totcoll = jc + univ; the coefficient on jc then
# equals b_jc - b_univ, so its t statistic tests H0: b_jc = b_univ.
twoyear['totcoll'] = twoyear['jc'] + twoyear['univ']
fit_tot = OLS.from_formula('lwage~jc+totcoll+exper', data=twoyear).fit()
print(fit_tot.summary())
OLS Regression Results ============================================================================== Dep. Variable: lwage R-squared: 0.222 Model: OLS Adj. R-squared: 0.222 Method: Least Squares F-statistic: 644.5 Date: Fri, 16 Dec 2022 Prob (F-statistic): 0.00 Time: 10:50:59 Log-Likelihood: -3888.7 No. Observations: 6763 AIC: 7785. Df Residuals: 6759 BIC: 7813. Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.4723 0.021 69.910 0.000 1.431 1.514 jc -0.0102 0.007 -1.468 0.142 -0.024 0.003 totcoll 0.0769 0.002 33.298 0.000 0.072 0.081 exper 0.0049 0.000 31.397 0.000 0.005 0.005 ============================================================================== Omnibus: 81.514 Durbin-Watson: 1.968 Prob(Omnibus): 0.000 Jarque-Bera (JB): 142.465 Skew: -0.036 Prob(JB): 1.16e-31 Kurtosis: 3.707 Cond. No. 511. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
다음은 오차분산 추정치($s^2$) 부분을 제외한 분산 및 공분산 행렬이다.
reg0.normalized_cov_params
Intercept | jc | univ | exper | |
---|---|---|---|---|
Intercept | 0.002397 | -9.412177e-05 | -8.504379e-05 | -1.678074e-05 |
jc | -0.000094 | 2.520413e-04 | 1.042017e-05 | -9.287133e-08 |
univ | -0.000085 | 1.042017e-05 | 2.880909e-05 | 2.125993e-07 |
exper | -0.000017 | -9.287133e-08 | 2.125993e-07 | 1.340290e-07 |
위에 $s^2$을 곱하면 분산 및 공분산들을 얻는다. $s^2$은 다음과 같다.
s2 = reg0.scale # bad naming; this is s^2, not s.
print(s2)
0.18501901424040346
ixx = reg0.normalized_cov_params  # inv(X'X)
# Var(b_jc - b_univ) = s^2 * (a_jc,jc + a_univ,univ - 2*a_jc,univ), a = inv(X'X)
quad = ixx.loc['jc', 'jc'] + ixx.loc['univ', 'univ'] - 2*ixx.loc['jc', 'univ']
se = np.sqrt(s2*quad)
print(se)
0.006935906572923082
bhat = reg0.params  # estimated coefficients
# t statistic for H0: b_jc = b_univ
tstat = (bhat['jc'] - bhat['univ'])/se
print(tstat)
-1.4676566667595632
from scipy.stats import t
# Two-sided p-value. Use the survival function t.sf(x, df) = 1 - cdf(x, df):
# it is computed directly in the upper tail, avoiding the cancellation error
# of the 2*(1 - t.cdf(...)) form (which underflows to 0 for large statistics).
pval = 2*t.sf(abs(tstat), reg0.df_resid)
print(pval)
0.14224403710494182
import pandas as pd
import numpy as np
from statsmodels.api import OLS
twoyear = pd.read_csv('csv/wooldridge/twoyear.csv')
reg1 = OLS.from_formula('lwage~jc+univ+exper', data=twoyear).fit()     # unrestricted
reg0 = OLS.from_formula('lwage~I(jc+univ)+exper', data=twoyear).fit()  # restricted (jc=univ imposed)
# F statistic: (SSR_restricted - SSR_unrestricted)/q over s^2 = SSR_u/df_u, with q = 1
ssr_drop = reg0.ssr - reg1.ssr
Fstat = (ssr_drop/1)/reg1.scale
print(Fstat)
2.1540160914845914
from scipy.stats import f
# Upper-tail p-value of the F statistic. f.sf is 1 - cdf evaluated directly
# in the tail, which is numerically safer than the 1 - f.cdf(...) subtraction.
pval = f.sf(Fstat, 1, reg1.df_resid)
print(pval)
0.14224403710559586
# The same test, done more simply with statsmodels' built-in f_test
result = reg1.f_test('jc=univ')
print(result)
<F test: F=2.154016091483791, p=0.14224403710566683, df_denom=6.76e+03, df_num=1>
# The same test yet another way: reparameterize with totcoll = jc + univ,
# then test the jc coefficient (= b_jc - b_univ) against zero.
twoyear['totcoll'] = twoyear['jc'] + twoyear['univ']
ols1 = OLS.from_formula('lwage~jc+totcoll+exper', data=twoyear).fit()
print(ols1.f_test('jc=0'))
<F test: F=2.154016091483635, p=0.14224403710566683, df_denom=6.76e+03, df_num=1>
import pandas as pd
import numpy as np
from statsmodels.api import OLS
Doctor = pd.read_csv('csv/Ecdat/Doctor.csv')
# Regress doctor visits on number of children, access, and health status.
model = OLS.from_formula('doctor~children+access+health', data=Doctor)
ols1 = model.fit()
print(ols1.summary())
OLS Regression Results ============================================================================== Dep. Variable: doctor R-squared: 0.092 Model: OLS Adj. R-squared: 0.086 Method: Least Squares F-statistic: 16.15 Date: Fri, 16 Dec 2022 Prob (F-statistic): 5.12e-10 Time: 10:51:00 Log-Likelihood: -1250.3 No. Observations: 485 AIC: 2509. Df Residuals: 481 BIC: 2525. Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.6075 0.418 3.843 0.000 0.786 2.429 children -0.2512 0.110 -2.278 0.023 -0.468 -0.035 access 1.4995 0.783 1.915 0.056 -0.039 3.039 health 0.6506 0.102 6.399 0.000 0.451 0.850 ============================================================================== Omnibus: 660.517 Durbin-Watson: 1.879 Prob(Omnibus): 0.000 Jarque-Bera (JB): 131192.869 Skew: 6.773 Prob(JB): 0.00 Kurtosis: 82.426 Cond. No. 16.2 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(ols1.f_test('access=0'))
<F test: F=3.665355066562526, p=0.05614775007156983, df_denom=481, df_num=1>
# Continue with the 'twoyear' data
# Joint F test of the two restrictions jc = univ and univ = exper
unres = OLS.from_formula('lwage~jc+univ+exper', data=twoyear)
ols1 = unres.fit()  # unrestricted regression
print(ols1.f_test('jc=univ, univ=exper'))
<F test: F=503.97206186906055, p=9.710503129040587e-205, df_denom=6.76e+03, df_num=2>
# Continue with the 'twoyear' data
# LM test of the two restrictions jc = univ = exper
ols0 = OLS.from_formula('lwage~I(jc+univ+exper)', data=twoyear).fit()  # restricted regression
twoyear['resid'] = ols0.resid
# Auxiliary step: regress the restricted residuals on the full regressor set
aux = OLS.from_formula('resid~jc+univ+exper', data=twoyear).fit()  # unrestricted regression
lmstat = aux.rsquared*ols0.nobs  # LM = n * R^2 of the auxiliary regression
print(lmstat)
877.6587033769049
from scipy.stats import chi2
# 5% critical value of the chi-squared distribution with 2 degrees of
# freedom (2 = the number of restrictions being tested).
print(chi2.ppf(.95,2))
5.991464547107979
print(1-chi2.cdf(lmstat,2))
0.0