In [1]:
import pandas as pd
from numpy import log
from statsmodels.formula.api import ols as OLS
Housing = pd.read_csv('csv/Ecdat/Housing.csv')
#https://stackoverflow.com/questions/30553838/getting-statsmodels-to-use-heteroskedasticity-corrected-standard-errors-in-coeff
model = OLS('log(price)~log(lotsize)+bedrooms+bathrms', data=Housing)
ols = model.fit(cov_type='HC0')  # heteroskedasticity-robust (HC0) standard errors
In [2]:
print(ols.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             log(price)   R-squared:                       0.504
Model:                            OLS   Adj. R-squared:                  0.501
Method:                 Least Squares   F-statistic:                     181.4
Date:                Sun, 31 Dec 2023   Prob (F-statistic):           2.00e-81
Time:                        11:21:28   Log-Likelihood:                -43.015
No. Observations:                 546   AIC:                             94.03
Df Residuals:                     542   BIC:                             111.2
Df Model:                           3
Covariance Type:                  HC0
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        6.6222      0.248     26.708      0.000       6.136       7.108
log(lotsize)     0.4568      0.030     15.172      0.000       0.398       0.516
bedrooms         0.0892      0.018      4.999      0.000       0.054       0.124
bathrms          0.2368      0.026      8.971      0.000       0.185       0.288
==============================================================================
Omnibus:                        4.038   Durbin-Watson:                   1.324
Prob(Omnibus):                  0.133   Jarque-Bera (JB):                3.887
Skew:                          -0.203   Prob(JB):                        0.143
Kurtosis:                       3.074   Cond. No.                         197.
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC0)
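The HC0 standard errors printed above are the "sandwich" estimates $\sqrt{\mathrm{diag}\{(X'X)^{-1}X'\,\mathrm{diag}(\hat u_i^2)\,X\,(X'X)^{-1}\}}$. A minimal sketch of this computation, assuming the design matrix is taken from model.exog:

import numpy as np
X = model.exog                    # design matrix built from the formula
uhat = ols.resid.values           # OLS residuals (the same for any cov_type)
XtX_inv = np.linalg.inv(X.T @ X)
V_hc0 = XtX_inv @ (X.T * uhat**2) @ X @ XtX_inv
print(np.sqrt(np.diag(V_hc0)))    # should match the HC0 'std err' column above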
Example 13.4 Comparison of conventional standard errors with various robust standard errors
In [3]:
ols = model.fit()  # refit with the conventional (nonrobust) covariance
pd.DataFrame({'ord': ols.bse, 'hc0': ols.HC0_se, 'hc1': ols.HC1_se, 'hc2': ols.HC2_se, 'hc3': ols.HC3_se})
Out[3]:
|              |      ord |      hc0 |      hc1 |      hc2 |      hc3 |
|--------------|----------|----------|----------|----------|----------|
| Intercept    | 0.241234 | 0.247946 | 0.248859 | 0.249356 | 0.250780 |
| log(lotsize) | 0.028983 | 0.030110 | 0.030221 | 0.030286 | 0.030464 |
| bedrooms     | 0.016513 | 0.017850 | 0.017916 | 0.017985 | 0.018122 |
| bathrms      | 0.024479 | 0.026392 | 0.026490 | 0.026611 | 0.026834 |
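The hc1 column is only a degrees-of-freedom rescaling of hc0: the HC1 variance multiplies the HC0 variance by $n/(n-k)$, so the standard errors differ by the factor $\sqrt{n/(n-k)}$. A quick check (here $n=546$ and $k=4$):

import numpy as np
n, k = ols.nobs, ols.df_model + 1           # 546 observations, 4 estimated coefficients
print(ols.HC0_se * np.sqrt(n / (n - k)))    # should reproduce the hc1 column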
Example 13.5 Death rates by region
In [4]:
import pandas as pd
from statsmodels.formula.api import ols as OLS
Death = pd.read_csv('csv/loedata/Death.csv')
model = OLS('deathrate~drink+smoke+aged+vehipc+C(year)', data=Death)  # C(year): year entered as a categorical variable (year dummies)
In [5]:
print(model.fit().summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     487.3
Date:                Sun, 31 Dec 2023   Prob (F-statistic):          3.42e-135
Time:                        11:21:28   Log-Likelihood:                -236.68
No. Observations:                 258   AIC:                             487.4
Df Residuals:                     251   BIC:                             512.2
Df Model:                           6
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.2241      0.769     -0.291      0.771      -1.739       1.291
C(year)[T.2009]    -0.3788      0.098     -3.867      0.000      -0.572      -0.186
C(year)[T.2010]    -0.3510      0.102     -3.457      0.001      -0.551      -0.151
drink               0.0064      0.011      0.594      0.553      -0.015       0.028
smoke               0.0333      0.018      1.873      0.062      -0.002       0.068
aged                0.4027      0.010     38.401      0.000       0.382       0.423
vehipc              1.4079      1.163      1.211      0.227      -0.882       3.698
==============================================================================
Omnibus:                        2.638   Durbin-Watson:                   1.529
Prob(Omnibus):                  0.267   Jarque-Bera (JB):                2.504
Skew:                           0.117   Prob(JB):                        0.286
Kurtosis:                       3.422   Cond. No.                     1.91e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.91e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Note [2] above concerns the condition number. It equals the square root of the ratio of the largest to the smallest eigenvalue of $X'X$. It sometimes serves as an indicator of near multicollinearity, but there is no need to worry much about it here.
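This definition is easy to verify numerically; the following sketch uses the design matrix stored in model.exog and should give a value close to the 1.91e+03 reported in the summary.

import numpy as np
X = model.exog                         # design matrix of the death-rate model
eig = np.linalg.eigvalsh(X.T @ X)      # eigenvalues of X'X
print(np.sqrt(eig.max() / eig.min()))  # condition number, roughly 1.91e+03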
In [6]:
print(model.fit(cov_type='HC3').summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     650.1
Date:                Sun, 31 Dec 2023   Prob (F-statistic):          8.45e-150
Time:                        11:21:28   Log-Likelihood:                -236.68
No. Observations:                 258   AIC:                             487.4
Df Residuals:                     251   BIC:                             512.2
Df Model:                           6
Covariance Type:                  HC3
===================================================================================
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.2241      0.785     -0.285      0.775      -1.763       1.315
C(year)[T.2009]    -0.3788      0.095     -3.996      0.000      -0.565      -0.193
C(year)[T.2010]    -0.3510      0.104     -3.371      0.001      -0.555      -0.147
drink               0.0064      0.011      0.565      0.572      -0.016       0.029
smoke               0.0333      0.019      1.770      0.077      -0.004       0.070
aged                0.4027      0.010     39.371      0.000       0.383       0.423
vehipc              1.4079      1.295      1.087      0.277      -1.130       3.946
==============================================================================
Omnibus:                        2.638   Durbin-Watson:                   1.529
Prob(Omnibus):                  0.267   Jarque-Bera (JB):                2.504
Skew:                           0.117   Prob(JB):                        0.286
Kurtosis:                       3.422   Cond. No.                     1.91e+03
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC3)
[2] The condition number is large, 1.91e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Example 13.6 F tests in the death-rate analysis
In [7]:
# Continue
hypo = 'drink=0, smoke=0'
rego = model.fit()
print(rego.f_test(hypo))
regh = model.fit(cov_type='HC3')
print(regh.f_test(hypo))
<F test: F=3.2488401914796645, p=0.04045761180359891, df_denom=251, df_num=2>
<F test: F=2.9862362091563925, p=0.052273714967270596, df_denom=251, df_num=2>
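With a robust covariance matrix this joint test is a Wald test; f_test simply divides the Wald statistic by the number of restrictions and refers it to an F distribution. If the chi-square form is wanted instead, it should be available through the generic wald_test interface (a sketch):

print(regh.wald_test(hypo, use_f=False))   # chi-square version with 2 restrictions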
Example 13.8 WLS estimation of the death-rate model
In [8]:
import pandas as pd
import statsmodels.api as sm
Death = pd.read_csv('csv/loedata/Death.csv')
fm = 'deathrate~drink+smoke+aged+vehipc+C(year)'
wls = sm.WLS.from_formula(fm, data=Death, weights=Death.regpop).fit()
print(wls.summary())
                            WLS Regression Results                            
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.941
Model:                            WLS   Adj. R-squared:                  0.939
Method:                 Least Squares   F-statistic:                     663.9
Date:                Sun, 31 Dec 2023   Prob (F-statistic):          7.08e-151
Time:                        11:21:28   Log-Likelihood:                -246.89
No. Observations:                 258   AIC:                             507.8
Df Residuals:                     251   BIC:                             532.6
Df Model:                           6
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.5815      0.764     -0.761      0.448      -2.087       0.924
C(year)[T.2009]    -0.2963      0.096     -3.093      0.002      -0.485      -0.108
C(year)[T.2010]    -0.2977      0.099     -3.008      0.003      -0.493      -0.103
drink               0.0166      0.011      1.575      0.117      -0.004       0.037
smoke               0.0327      0.018      1.830      0.068      -0.002       0.068
aged                0.4105      0.010     42.003      0.000       0.391       0.430
vehipc              0.5491      1.185      0.464      0.643      -1.784       2.882
==============================================================================
Omnibus:                       13.113   Durbin-Watson:                   1.452
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               30.102
Skew:                          -0.119   Prob(JB):                     2.91e-07
Kurtosis:                       4.656   Cond. No.                     2.02e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.02e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
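What WLS does here can be seen directly: with weights $w_i$ it minimizes $\sum_i w_i \hat u_i^2$, which is the same as running OLS after multiplying every observation (regressand and regressors alike) by $\sqrt{w_i}$. A minimal check, reusing the arrays stored in the WLS model object:

import numpy as np
wls_model = sm.WLS.from_formula(fm, data=Death, weights=Death.regpop)
w = np.sqrt(np.asarray(Death.regpop))
check = sm.OLS(w * wls_model.endog, wls_model.exog * w[:, None]).fit()
print(check.params)    # should equal the WLS coefficients reported above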
Example 13.9 An example of FGLS
In [9]:
from statsmodels.formula.api import ols as OLS, wls as WLS
from numpy import log, exp
# Continue
fm = 'deathrate~drink+smoke+aged+vehipc+C(year)'
# Step 1: OLS and its residuals
ols = OLS(fm, data=Death).fit()
Death['u'] = ols.resid
# Step 2: auxiliary regression of log(u^2) on the same regressors
fm_aux = fm.replace('deathrate', 'log(u**2)')
aux = OLS(fm_aux, data=Death).fit()
# Step 3: estimated variance function h = exp(fitted values)
h = exp(aux.fittedvalues)
# Step 4: WLS with weights 1/h
fgls = WLS(fm, data=Death, weights=1/h).fit()
print(fgls.summary())
                            WLS Regression Results                            
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.936
Model:                            WLS   Adj. R-squared:                  0.934
Method:                 Least Squares   F-statistic:                     608.2
Date:                Sun, 31 Dec 2023   Prob (F-statistic):          2.13e-146
Time:                        11:21:28   Log-Likelihood:                -232.03
No. Observations:                 258   AIC:                             478.1
Df Residuals:                     251   BIC:                             502.9
Df Model:                           6
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.2457      0.773     -0.318      0.751      -1.769       1.278
C(year)[T.2009]    -0.3521      0.090     -3.914      0.000      -0.529      -0.175
C(year)[T.2010]    -0.2974      0.097     -3.056      0.002      -0.489      -0.106
drink               0.0087      0.011      0.817      0.414      -0.012       0.030
smoke               0.0363      0.017      2.187      0.030       0.004       0.069
aged                0.4050      0.010     40.215      0.000       0.385       0.425
vehipc              0.7946      1.113      0.714      0.476      -1.397       2.987
==============================================================================
Omnibus:                        0.763   Durbin-Watson:                   1.505
Prob(Omnibus):                  0.683   Jarque-Bera (JB):                0.554
Skew:                           0.101   Prob(JB):                        0.758
Kurtosis:                       3.103   Cond. No.                     1.99e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.99e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
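Because h is only an estimate of the variance function, the FGLS weights need not remove all of the heteroskedasticity. If one wants to be careful, the WLS step can be combined with robust standard errors exactly as before; a minimal sketch:

fgls_r = WLS(fm, data=Death, weights=1/h).fit(cov_type='HC3')   # FGLS with HC3-robust SEs
print(fgls_r.bse)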
Testing for the presence of heteroskedasticity
In [10]:
import pandas as pd
import numpy as np
# Generate data
np.random.seed(101)
n = 50
x1 = np.random.normal(size=n)
x2 = np.random.normal(size=n)
u = x1 * np.random.normal(size=n)  # error scales with x1, so Var(u|x1) is not constant
y = 1 + x1 - x2 + u
DF = pd.DataFrame({'y': y, 'x1': x1, 'x2': x2})
In [11]:
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan as bptest
uhat = sm.OLS.from_formula('y~x1+x2', data=DF).fit().resid
bp = bptest(uhat, sm.add_constant(pd.DataFrame({"x1":x1, "x1sq":x1**2})))
print(bp) # (lm, lm_pvalue, fvalue, f_pvalue)
(25.43047687088538, 3.00498363642047e-06, 24.32347601234627, 5.604385363360125e-08)
The first number is the LM test statistic and the second is its p-value; the third number is the F test statistic and the fourth is the corresponding p-value.
In [12]:
DF['u2'] = uhat**2
aux = sm.OLS.from_formula('u2~x1+I(x1**2)', data=DF).fit()
print(aux.nobs*aux.rsquared)
25.43047687088538
This LM test statistic equals the value obtained above.
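The F version reported by het_breuschpagan is, in turn, just the overall F statistic of this auxiliary regression, so it can be read off the same fit; the values should reproduce the third and fourth numbers above.

print(aux.fvalue, aux.f_pvalue)   # about 24.32 and 5.60e-08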