import pandas as pd
from statsmodels.api import OLS

Death = pd.read_csv('csv/loedata/Death.csv')
ols = OLS.from_formula('deathrate~drink', data=Death[Death.year==2010]).fit()
print(ols.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.318
Model:                            OLS   Adj. R-squared:                  0.310
Method:                 Least Squares   F-statistic:                     39.14
Date:                Sun, 28 May 2023   Prob (F-statistic):           1.59e-08
Time:                        15:08:46   Log-Likelihood:                -173.59
No. Observations:                  86   AIC:                             351.2
Df Residuals:                      84   BIC:                             356.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     21.1006      1.722     12.252      0.000      17.676      24.525
drink         -0.2268      0.036     -6.256      0.000      -0.299      -0.155
==============================================================================
Omnibus:                        0.580   Durbin-Watson:                   1.435
Prob(Omnibus):                  0.748   Jarque-Bera (JB):                0.574
Skew:                          -0.188   Prob(JB):                        0.750
Kurtosis:                       2.865   Cond. No.                         412.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
plt.scatter(Death.drink, Death.deathrate, facecolors='w', edgecolors='k')
plt.show()


print(OLS.from_formula('deathrate~drink+aged', data=Death[Death.year==2010]).fit().summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:              deathrate   R-squared:                       0.919
Model:                            OLS   Adj. R-squared:                  0.917
Method:                 Least Squares   F-statistic:                     470.0
Date:                Sun, 28 May 2023   Prob (F-statistic):           5.38e-46
Time:                        15:08:46   Log-Likelihood:                -82.034
No. Observations:                  86   AIC:                             170.1
Df Residuals:                      83   BIC:                             177.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.6139      1.060     -0.579      0.564      -2.722       1.495
drink          0.0344      0.016      2.099      0.039       0.002       0.067
aged           0.4058      0.016     24.797      0.000       0.373       0.438
==============================================================================
Omnibus:                        0.091   Durbin-Watson:                   2.150
Prob(Omnibus):                  0.955   Jarque-Bera (JB):                0.086
Skew:                          -0.064   Prob(JB):                        0.958
Kurtosis:                       2.912   Cond. No.                         810.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


import pandas as pd
import numpy as np
from statsmodels.api import OLS

Wages1 = pd.read_csv('csv/Ecdat/Wages1.csv')
## https://stackoverflow.com/questions/20840803/how-to-convert-false-to-0-and-true-to-1
Wages1['male'] = [int(x) for x in Wages1.sex=='male'] # :-(
Wages1['female'] = [int(x) for x in Wages1.sex=='female'] # :-(
print(Wages1.head())

   exper     sex  school      wage  male  female
0      9  female      13  6.315296     0       1
1     12  female      12  5.479770     0       1
2     11  female      11  3.642170     0       1
3      9  female      14  4.593337     0       1
4      8  female      14  2.418157     0       1


print(np.sum(Wages1.male), np.sum(Wages1.female))

1725 1569


ols = OLS.from_formula('wage~female', data=Wages1[Wages1.female==1]).fit()
print(ols.params)
print("Type of female dummy: ", type(Wages1.female[0]))

Intercept    8.384840e+12
female      -8.384840e+12
dtype: float64
Type of female dummy:  <class 'numpy.int64'>


print(ols.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   wage   R-squared:                      -0.001
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                   -0.8714
Date:                Sun, 28 May 2023   Prob (F-statistic):               1.00
Time:                        15:08:46   Log-Likelihood:                -3883.9
No. Observations:                1569   AIC:                             7772.
Df Residuals:                    1567   BIC:                             7782.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   8.385e+12   2.06e+13      0.406      0.685   -3.21e+13    4.89e+13
female     -8.385e+12   2.06e+13     -0.406      0.685   -4.89e+13    3.21e+13
==============================================================================
Omnibus:                      762.482   Durbin-Watson:                   1.872
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             8859.515
Skew:                           1.975   Prob(JB):                         0.00
Kurtosis:                      13.951   Cond. No.                     5.68e+14
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 9.71e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.


print(OLS.from_formula('wage~female', data=Wages1).fit().summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.032
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                     107.9
Date:                Sun, 28 May 2023   Prob (F-statistic):           6.71e-25
Time:                        15:08:46   Log-Likelihood:                -8522.2
No. Observations:                3294   AIC:                         1.705e+04
Df Residuals:                    3292   BIC:                         1.706e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      6.3130      0.077     81.495      0.000       6.161       6.465
female        -1.1661      0.112    -10.389      0.000      -1.386      -0.946
==============================================================================
Omnibus:                     1538.963   Durbin-Watson:                   1.866
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            15569.712
Skew:                           1.965   Prob(JB):                         0.00
Kurtosis:                      12.899   Cond. No.                         2.57
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


print(OLS.from_formula('wage~male+female', data=Wages1).fit().summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.032
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                     53.94
Date:                Sun, 28 May 2023   Prob (F-statistic):           8.91e-24
Time:                        15:08:46   Log-Likelihood:                -8522.2
No. Observations:                3294   AIC:                         1.705e+04
Df Residuals:                    3291   BIC:                         1.707e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2.441e+11   2.35e+12      0.104      0.917   -4.37e+12    4.85e+12
male       -2.441e+11   2.35e+12     -0.104      0.917   -4.85e+12    4.37e+12
female     -2.441e+11   2.35e+12     -0.104      0.917   -4.85e+12    4.37e+12
==============================================================================
Omnibus:                     1538.628   Durbin-Watson:                   1.866
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            15563.735
Skew:                           1.965   Prob(JB):                         0.00
Kurtosis:                      12.897   Cond. No.                     8.90e+13
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.24e-25. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.


print(OLS.from_formula('wage~male+female-1', data=Wages1).fit().summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.032
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                     107.9
Date:                Sun, 28 May 2023   Prob (F-statistic):           6.71e-25
Time:                        15:08:46   Log-Likelihood:                -8522.2
No. Observations:                3294   AIC:                         1.705e+04
Df Residuals:                    3292   BIC:                         1.706e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
male           6.3130      0.077     81.495      0.000       6.161       6.465
female         5.1469      0.081     63.366      0.000       4.988       5.306
==============================================================================
Omnibus:                     1538.963   Durbin-Watson:                   1.866
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            15569.712
Skew:                           1.965   Prob(JB):                         0.00
Kurtosis:                      12.899   Cond. No.                         1.05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


import pandas as pd
import numpy as np
from statsmodels.api import OLS

Wages1 = pd.read_csv('csv/Ecdat/Wages1.csv')
print(OLS.from_formula('np.log(wage)~sex+school+exper', data=Wages1).fit().summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(wage)   R-squared:                       0.137
Model:                            OLS   Adj. R-squared:                  0.137
Method:                 Least Squares   F-statistic:                     174.7
Date:                Sun, 28 May 2023   Prob (F-statistic):          4.04e-105
Time:                        15:08:46   Log-Likelihood:                -2869.8
No. Observations:                3294   AIC:                             5748.
Df Residuals:                    3290   BIC:                             5772.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -0.2593      0.088     -2.936      0.003      -0.433      -0.086
sex[T.male]     0.2426      0.020     11.860      0.000       0.202       0.283
school          0.1234      0.006     19.802      0.000       0.111       0.136
exper           0.0354      0.005      7.845      0.000       0.027       0.044
==============================================================================
Omnibus:                      881.201   Durbin-Watson:                   1.875
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3779.073
Skew:                          -1.239   Prob(JB):                         0.00
Kurtosis:                       7.625   Cond. No.                         126.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


np.random.seed(1)
Wages1['rnd'] = np.random.normal(size=len(Wages1))
## https://www.statsmodels.org/0.6.1/examples/notebooks/generated/ols.html
print(OLS.from_formula('np.log(wage)~sex+school+exper', data=Wages1).fit().rsquared)

0.13743765266234398


print(OLS.from_formula('np.log(wage)~sex+school+exper+rnd', data=Wages1).fit().rsquared)

0.1374480389013667

8장 다중회귀 모형과 그 추정¶

예제 8.1 음주율과 사망률¶

예제 8.2 음주율과 사망률(계속)¶

예제 8.3 여성 더미, 남성 더미, 공선성¶

맞춘값, 잔차, 제곱합, R제곱¶