import pandas as pd
from statsmodels.formula.api import ols as OLS
Death = pd.read_csv('csv/loedata/Death.csv')
model = OLS('deathrate~drink+smoke+aged+vehipc+C(year)', data=Death) # C: categorical
print(model.fit().summary()) # ordinary (nonrobust) covariance
                            OLS Regression Results
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     487.3
Date:                Fri, 16 Dec 2022   Prob (F-statistic):          3.42e-135
Time:                        15:39:09   Log-Likelihood:                -236.68
No. Observations:                 258   AIC:                             487.4
Df Residuals:                     251   BIC:                             512.2
Df Model:                           6
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.2241      0.769     -0.291      0.771      -1.739       1.291
C(year)[T.2009]    -0.3788      0.098     -3.867      0.000      -0.572      -0.186
C(year)[T.2010]    -0.3510      0.102     -3.457      0.001      -0.551      -0.151
drink               0.0064      0.011      0.594      0.553      -0.015       0.028
smoke               0.0333      0.018      1.873      0.062      -0.002       0.068
aged                0.4027      0.010     38.401      0.000       0.382       0.423
vehipc              1.4079      1.163      1.211      0.227      -0.882       3.698
==============================================================================
Omnibus:                        2.638   Durbin-Watson:                   1.529
Prob(Omnibus):                  0.267   Jarque-Bera (JB):                2.504
Skew:                           0.117   Prob(JB):                        0.286
Kurtosis:                       3.422   Cond. No.                     1.91e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.91e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
print(model.fit(cov_type='HC3').summary()) # HC3
                            OLS Regression Results
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     650.1
Date:                Fri, 16 Dec 2022   Prob (F-statistic):          8.45e-150
Time:                        15:39:09   Log-Likelihood:                -236.68
No. Observations:                 258   AIC:                             487.4
Df Residuals:                     251   BIC:                             512.2
Df Model:                           6
Covariance Type:                  HC3
===================================================================================
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.2241      0.785     -0.285      0.775      -1.763       1.315
C(year)[T.2009]    -0.3788      0.095     -3.996      0.000      -0.565      -0.193
C(year)[T.2010]    -0.3510      0.104     -3.371      0.001      -0.555      -0.147
drink               0.0064      0.011      0.565      0.572      -0.016       0.029
smoke               0.0333      0.019      1.770      0.077      -0.004       0.070
aged                0.4027      0.010     39.371      0.000       0.383       0.423
vehipc              1.4079      1.295      1.087      0.277      -1.130       3.946
==============================================================================
Omnibus:                        2.638   Durbin-Watson:                   1.529
Prob(Omnibus):                  0.267   Jarque-Bera (JB):                2.504
Skew:                           0.117   Prob(JB):                        0.286
Kurtosis:                       3.422   Cond. No.                     1.91e+03
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC3)
[2] The condition number is large, 1.91e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
print(model.fit(cov_type='cluster', cov_kwds={'groups':Death.region}).summary()) # not for CC0 but for CC1
                            OLS Regression Results
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     493.2
Date:                Fri, 16 Dec 2022   Prob (F-statistic):           8.24e-64
Time:                        15:39:09   Log-Likelihood:                -236.68
No. Observations:                 258   AIC:                             487.4
Df Residuals:                     251   BIC:                             512.2
Df Model:                           6
Covariance Type:              cluster
===================================================================================
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.2241      1.019     -0.220      0.826      -2.222       1.773
C(year)[T.2009]    -0.3788      0.077     -4.948      0.000      -0.529      -0.229
C(year)[T.2010]    -0.3510      0.099     -3.547      0.000      -0.545      -0.157
drink               0.0064      0.014      0.453      0.650      -0.021       0.034
smoke               0.0333      0.019      1.716      0.086      -0.005       0.071
aged                0.4027      0.014     29.775      0.000       0.376       0.429
vehipc              1.4079      1.725      0.816      0.414      -1.972       4.788
==============================================================================
Omnibus:                        2.638   Durbin-Watson:                   1.529
Prob(Omnibus):                  0.267   Jarque-Bera (JB):                2.504
Skew:                           0.117   Prob(JB):                        0.286
Kurtosis:                       3.422   Cond. No.                     1.91e+03
==============================================================================

Notes:
[1] Standard Errors are robust to cluster correlation (cluster)
[2] The condition number is large, 1.91e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
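The intuition behind clustering can be seen on a synthetic panel: when the errors (and the regressor) share a common within-group component, observations in the same cluster are correlated and the nonrobust standard errors are too small. A minimal sketch, with all names and the data-generating process chosen for illustration:

```python
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols as OLS

# Synthetic panel: 30 clusters, 10 observations each, with a
# cluster-level error component shared within each group.
rng = np.random.default_rng(0)
G, T = 30, 10
g = np.repeat(np.arange(G), T)                        # group labels
u = rng.normal(size=G)[g]                             # within-cluster common error
x = rng.normal(size=G)[g] + 0.3*rng.normal(size=G*T)  # x also clustered
y = 1 + x + u + rng.normal(size=G*T)
DF = pd.DataFrame({'y': y, 'x': x, 'g': g})

m = OLS('y~x', data=DF)
se_ols = m.fit().bse['x']                             # nonrobust SE
se_cl = m.fit(cov_type='cluster', cov_kwds={'groups': DF.g}).bse['x']
print(f"nonrobust SE: {se_ols:.3f}, cluster SE: {se_cl:.3f}")
```

With this design the cluster-robust standard error comes out substantially larger than the nonrobust one, which is exactly the pattern in the Death regression above (e.g. for vehipc, 1.163 → 1.725).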
The Durbin-Watson test statistic is reported in the OLS results table.
import pandas as pd
import numpy as np
np.random.seed(101)
n = 100
x = 2*(np.arange(n) % 2)-1
y = 1+x+np.random.normal(size=n)
DF = pd.DataFrame({'x':x, 'y':y})
print(DF.head())
   x         y
0 -1  2.706850
1  1  2.628133
2 -1  0.907969
3  1  2.503826
4 -1  0.651118
from statsmodels.formula.api import ols as OLS
ols = OLS('y~x', data=DF).fit()
print(ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.586
Model:                            OLS   Adj. R-squared:                  0.582
Method:                 Least Squares   F-statistic:                     138.9
Date:                Fri, 16 Dec 2022   Prob (F-statistic):           1.72e-20
Time:                        15:39:09   Log-Likelihood:                -143.19
No. Observations:                 100   AIC:                             290.4
Df Residuals:                      98   BIC:                             295.6
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.1664      0.102     11.397      0.000       0.963       1.369
x              1.2060      0.102     11.785      0.000       1.003       1.409
==============================================================================
Omnibus:                        0.580   Durbin-Watson:                   2.008
Prob(Omnibus):                  0.748   Jarque-Bera (JB):                0.565
Skew:                           0.176   Prob(JB):                        0.754
Kurtosis:                       2.890   Cond. No.                         1.00
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The Durbin-Watson test statistic is 2.008. It is not obvious how to compute a p-value for it; presumably one has to look it up in a DW table, the old-fashioned way.
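The statistic itself is easy to compute by hand: DW = Σ(e_t − e_{t−1})² / Σe_t², and for large n it is approximately 2(1 − ρ̂₁), where ρ̂₁ is the first-order residual autocorrelation (so values near 2 suggest no autocorrelation). A sketch that reuses the simulated data above and checks against `statsmodels.stats.stattools.durbin_watson`:

```python
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols as OLS
from statsmodels.stats.stattools import durbin_watson

# Same simulated data as above
np.random.seed(101)
n = 100
x = 2*(np.arange(n) % 2) - 1
y = 1 + x + np.random.normal(size=n)
ols = OLS('y~x', data=pd.DataFrame({'x': x, 'y': y})).fit()

e = ols.resid.to_numpy()
dw = np.sum(np.diff(e)**2) / np.sum(e**2)   # DW = sum (e_t - e_{t-1})^2 / sum e_t^2
print(round(dw, 3))                          # 2.008, matching the table
print(np.isclose(dw, durbin_watson(ols.resid)))   # True
```

statsmodels provides the statistic but indeed no p-value; the Breusch-Godfrey test below is the usual way to get one for residual autocorrelation.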
# Continue
from statsmodels.stats.diagnostic import acorr_breusch_godfrey
print(acorr_breusch_godfrey(ols, nlags=1)) # (lm, lmpval, fval, fpval)
(0.2876025719260267, 0.591760970426137, 0.27977914679012034, 0.598054322774964)
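Both p-values are large, so the test does not reject the null of no serial correlation, as expected for i.i.d. errors. As a sanity check, here is the same test on data whose errors actually follow an AR(1) process (a synthetic example; the ρ = 0.7 design is chosen for illustration), where it should reject:

```python
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols as OLS
from statsmodels.stats.diagnostic import acorr_breusch_godfrey

np.random.seed(101)
n = 100
x = 2*(np.arange(n) % 2) - 1
eps = np.random.normal(size=n)
e = np.zeros(n)
for t in range(1, n):
    e[t] = 0.7*e[t-1] + eps[t]               # AR(1) errors, rho = 0.7
y = 1 + x + e
fit = OLS('y~x', data=pd.DataFrame({'x': x, 'y': y})).fit()

lm, lmpval, fval, fpval = acorr_breusch_godfrey(fit, nlags=1)
print(lmpval < 0.05)                          # True: autocorrelation detected
```

With serially correlated errors the LM p-value is essentially zero, so the Breusch-Godfrey test rejects decisively.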