import pandas as pd
from numpy import log
from statsmodels.formula.api import ols as OLS
Wages1 = pd.read_csv('csv/Ecdat/Wages1.csv')
Wages1['female'] = [int(x=='female') for x in Wages1.sex]  # dummy: 1 if female, 0 otherwise
ols = OLS('log(wage)~school+exper+female', data=Wages1).fit()
print(ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:              log(wage)   R-squared:                       0.137
Model:                            OLS   Adj. R-squared:                  0.137
Method:                 Least Squares   F-statistic:                     174.7
Date:                Sun, 28 May 2023   Prob (F-statistic):          4.04e-105
Time:                        15:18:21   Log-Likelihood:                -2869.8
No. Observations:                3294   AIC:                             5748.
Df Residuals:                    3290   BIC:                             5772.
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0168      0.088     -0.192      0.848      -0.188       0.155
school         0.1234      0.006     19.802      0.000       0.111       0.136
exper          0.0354      0.005      7.845      0.000       0.027       0.044
female        -0.2426      0.020    -11.860      0.000      -0.283      -0.202
==============================================================================
Omnibus:                      881.201   Durbin-Watson:                   1.875
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3779.073
Skew:                          -1.239   Prob(JB):                         0.00
Kurtosis:                       7.625   Cond. No.                         124.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
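# In a log wage equation the exact percentage effect of a dummy variable is
# 100*(exp(b)-1) rather than 100*b; a quick check on the female coefficient:
from numpy import exp
print(100*(exp(ols.params.female) - 1))  # roughly -21.5 percent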
# Continue
ols = OLS('log(wage)~female*school+exper', data=Wages1).fit()
print(ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:              log(wage)   R-squared:                       0.137
Model:                            OLS   Adj. R-squared:                  0.136
Method:                 Least Squares   F-statistic:                     131.0
Date:                Sun, 28 May 2023   Prob (F-statistic):          5.79e-104
Time:                        15:18:21   Log-Likelihood:                -2869.8
No. Observations:                3294   AIC:                             5750.
Df Residuals:                    3289   BIC:                             5780.
Df Model:                           4
Covariance Type:            nonrobust
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -0.0122      0.109     -0.112      0.911      -0.226       0.201
female           -0.2529      0.148     -1.714      0.087      -0.542       0.036
school            0.1230      0.008     15.003      0.000       0.107       0.139
female:school     0.0009      0.012      0.071      0.944      -0.024       0.025
exper             0.0354      0.005      7.793      0.000       0.026       0.044
=================================================================================
Omnibus:                      881.154   Durbin-Watson:                   1.875
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3778.551
Skew:                          -1.239   Prob(JB):                         0.00
Kurtosis:                       7.625   Cond. No.                         257.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Continue
eqa = OLS('log(wage)~school', data=Wages1).fit()
eqb = OLS('log(wage)~school+female', data=Wages1).fit()
eqc = OLS('log(wage)~school*female', data=Wages1).fit()
eqd = OLS('log(wage)~school', data=Wages1[Wages1.female==0]).fit()
eqe = OLS('log(wage)~school', data=Wages1[Wages1.female==1]).fit()
#https://economics.stackexchange.com/questions/11774/outputting-regressions-as-table-in-python-similar-to-outreg-in-stata
from statsmodels.iolib.summary2 import summary_col
outreg = summary_col([eqa,eqb,eqc,eqd,eqe], stars=True)
print(outreg)  # summary_col adds R-squared and R-squared Adj. rows by default and sorts rows alphabetically; see the regressor_order sketch below
====================================================================================
               log(wage) I log(wage) II log(wage) III log(wage) IIII log(wage) IIIII
------------------------------------------------------------------------------------
Intercept      0.3639***   0.3793***    0.4326***     0.4326***      0.0419
               (0.0738)    (0.0721)     (0.0937)      (0.0908)       (0.1181)
R-squared      0.0784      0.1213       0.1215        0.1021         0.0873
R-squared Adj. 0.0781      0.1208       0.1207        0.1016         0.0867
female                     -0.2601***   -0.3907***
                           (0.0205)     (0.1478)
school         0.1052***   0.1145***    0.1099***     0.1099***      0.1210***
               (0.0063)    (0.0062)     (0.0081)      (0.0078)       (0.0099)
school:female                           0.0112
                                        (0.0125)
====================================================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
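# Sketch, assuming a reasonably recent statsmodels: summary_col accepts
# regressor_order to pin the row order (remaining rows, including the
# R-squared lines, are appended after the named ones) and model_names to
# replace the auto-generated I/II/... column headers.
outreg = summary_col([eqa,eqb,eqc,eqd,eqe], stars=True,
                     model_names=['(a)','(b)','(c)','(d) men','(e) women'],
                     regressor_order=['Intercept','school','female','school:female'])
print(outreg)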
import pandas as pd
from statsmodels.formula.api import ols as OLS
Fastfood = pd.read_csv('csv/loedata/Fastfood.csv')
ols = OLS('fte~after', data=Fastfood[Fastfood.nj==1]).fit()
print(ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.6536
Date:                Sun, 28 May 2023   Prob (F-statistic):              0.419
Time:                        15:18:21   Log-Likelihood:                -2327.4
No. Observations:                 640   AIC:                             4659.
Df Residuals:                     638   BIC:                             4668.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     20.4394      0.513     39.805      0.000      19.431      21.448
after          0.5880      0.727      0.808      0.419      -0.840       2.016
==============================================================================
Omnibus:                      182.038   Durbin-Watson:                   1.874
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              738.315
Skew:                           1.256   Prob(JB):                    4.75e-161
Kurtosis:                       7.624   Cond. No.                         2.61
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(OLS('fte~after', data=Fastfood[Fastfood.nj==0]).fit().summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     1.727
Date:                Sun, 28 May 2023   Prob (F-statistic):              0.191
Time:                        15:18:21   Log-Likelihood:                -575.53
No. Observations:                 154   AIC:                             1155.
Df Residuals:                     152   BIC:                             1161.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     23.3312      1.165     20.024      0.000      21.029      25.633
after         -2.1656      1.648     -1.314      0.191      -5.421       1.090
==============================================================================
Omnibus:                       45.554   Durbin-Watson:                   1.721
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               98.276
Skew:                           1.292   Prob(JB):                     4.57e-22
Kurtosis:                       5.939   Cond. No.                         2.62
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(OLS('fte~nj', data=Fastfood[Fastfood.after==0]).fit().summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.014
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     5.525
Date:                Sun, 28 May 2023   Prob (F-statistic):             0.0192
Time:                        15:18:21   Log-Likelihood:                -1467.8
No. Observations:                 398   AIC:                             2940.
Df Residuals:                     396   BIC:                             2948.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     23.3312      1.105     21.118      0.000      21.159      25.503
nj            -2.8918      1.230     -2.351      0.019      -5.310      -0.473
==============================================================================
Omnibus:                      164.507   Durbin-Watson:                   1.901
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              853.185
Skew:                           1.712   Prob(JB):                    5.41e-186
Kurtosis:                       9.303   Cond. No.                         4.34
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(OLS('fte~nj', data=Fastfood[Fastfood.after==1]).fit().summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                   0.01428
Date:                Sun, 28 May 2023   Prob (F-statistic):              0.905
Time:                        15:18:21   Log-Likelihood:                -1435.6
No. Observations:                 396   AIC:                             2875.
Df Residuals:                     394   BIC:                             2883.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     21.1656      1.038     20.397      0.000      19.125      23.206
nj            -0.1382      1.156     -0.119      0.905      -2.411       2.135
==============================================================================
Omnibus:                       44.985   Durbin-Watson:                   1.769
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               71.204
Skew:                           0.726   Prob(JB):                     3.45e-16
Kurtosis:                       4.486   Cond. No.                         4.33
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(OLS('fte~nj*after', data=Fastfood).fit().summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     1.964
Date:                Sun, 28 May 2023   Prob (F-statistic):              0.118
Time:                        15:18:21   Log-Likelihood:                -2904.2
No. Observations:                 794   AIC:                             5816.
Df Residuals:                     790   BIC:                             5835.
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     23.3312      1.072     21.767      0.000      21.227      25.435
nj            -2.8918      1.194     -2.423      0.016      -5.235      -0.549
after         -2.1656      1.516     -1.429      0.154      -5.141       0.810
nj:after       2.7536      1.688      1.631      0.103      -0.561       6.068
==============================================================================
Omnibus:                      218.742   Durbin-Watson:                   1.840
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              804.488
Skew:                           1.268   Prob(JB):                    2.03e-175
Kurtosis:                       7.229   Cond. No.                         11.3
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
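# Cross-check: with a saturated nj*after model, the nj:after coefficient
# equals the difference-in-differences of the four group means of fte.
m = Fastfood.groupby(['nj','after'])['fte'].mean()
print((m.loc[(1,1)] - m.loc[(1,0)]) - (m.loc[(0,1)] - m.loc[(0,0)]))  # ~2.7536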
#https://stackoverflow.com/questions/13413590/how-to-drop-rows-of-pandas-dataframe-whose-value-in-a-certain-column-is-nan
Fastfood = pd.read_csv('csv/loedata/Fastfood.csv')
print('Original:', Fastfood.shape)
Fastfood = Fastfood.dropna(subset=['id','fte','nj','after'])  # keep only rows complete in the model and cluster variables
print('Cleaned :', Fastfood.shape)
mod = OLS('fte~nj*after', data=Fastfood)
Original: (820, 35)
Cleaned : (794, 35)
print(mod.fit(cov_type='cluster', cov_kwds={'groups': Fastfood['id']}).summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     1.802
Date:                Sun, 28 May 2023   Prob (F-statistic):              0.146
Time:                        15:18:21   Log-Likelihood:                -2904.2
No. Observations:                 794   AIC:                             5816.
Df Residuals:                     790   BIC:                             5835.
Df Model:                           3
Covariance Type:              cluster
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     23.3312      1.347     17.327      0.000      20.692      25.970
nj            -2.8918      1.440     -2.009      0.045      -5.713      -0.070
after         -2.1656      1.218     -1.778      0.075      -4.553       0.222
nj:after       2.7536      1.307      2.107      0.035       0.193       5.315
==============================================================================
Omnibus:                      218.742   Durbin-Watson:                   1.840
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              804.488
Skew:                           1.268   Prob(JB):                    2.03e-175
Kurtosis:                       7.229   Cond. No.                         11.3
==============================================================================

Notes:
[1] Standard Errors are robust to cluster correlation (cluster)
import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
np.random.seed(101)
n = 100
x = np.random.normal(loc=3.2, scale=1, size=n)
u = np.random.normal(scale=.75, size=n)
y = 3+7*x-x**2+u
plt.scatter(x,y)
plt.show()
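# Sketch: recover the quadratic from the simulated data with a patsy
# formula; the estimates should be close to the true values (3, 7, -1).
import pandas as pd
from statsmodels.formula.api import ols as OLS
sim = pd.DataFrame({'x': x, 'y': y})
print(OLS('y~x+I(x**2)', data=sim).fit().params)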
import pandas as pd
from numpy import log
from statsmodels.formula.api import ols as OLS
smoke = pd.read_csv('csv/wooldridge/smoke.csv')
ols = OLS('cigs~log(income)+log(cigpric)+educ+age+agesq', data=smoke).fit()
print(ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                   cigs   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     7.565
Date:                Sun, 28 May 2023   Prob (F-statistic):           5.92e-07
Time:                        15:18:22   Log-Likelihood:                -3239.5
No. Observations:                 807   AIC:                             6491.
Df Residuals:                     801   BIC:                             6519.
Df Model:                           5
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        5.3688     23.897      0.225      0.822     -41.540      52.277
log(income)      0.7583      0.729      1.041      0.298      -0.672       2.189
log(cigpric)    -2.8532      5.733     -0.498      0.619     -14.107       8.401
educ            -0.5141      0.168     -3.068      0.002      -0.843      -0.185
age              0.7806      0.161      4.860      0.000       0.465       1.096
agesq           -0.0091      0.002     -5.207      0.000      -0.013      -0.006
==============================================================================
Omnibus:                      230.357   Durbin-Watson:                   1.998
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              513.713
Skew:                           1.562   Prob(JB):                    2.81e-112
Kurtosis:                       5.350   Cond. No.                     1.32e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.32e+05. This might indicate that there are strong multicollinearity or other numerical problems.
print(-ols.params.age/(2*ols.params.agesq))
42.86552149669145
#https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html
print(smoke.age.describe())  # the implied turning point (~42.9) lies near the center of the observed age range
count    807.000000
mean      41.237918
std       17.027285
min       17.000000
25%       28.000000
50%       38.000000
75%       54.000000
max       88.000000
Name: age, dtype: float64
#https://stackoverflow.com/questions/33768122/python-pandas-dataframe-how-to-multiply-entire-column-with-a-scalar
#https://stackoverflow.com/questions/53162/how-can-i-do-a-line-break-line-continuation-in-python
# Partial prediction excluding the age terms, so cigs - pred0 isolates the age profile
smoke['pred0'] = ols.params.Intercept \
+ log(smoke.income)*ols.params['log(income)'] \
+ log(smoke.cigpric)*ols.params['log(cigpric)'] \
+ smoke.educ*ols.params.educ
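# Equivalent cross-check: predict with the age terms zeroed out instead of
# summing coefficient products by hand.
pred0_alt = ols.predict(smoke.assign(age=0, agesq=0))
print((smoke.pred0 - pred0_alt).abs().max())  # should be ~0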
import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
plt.scatter(smoke.age, smoke.cigs-smoke.pred0, color='w', edgecolors='k')
plt.xlim(plt.gca().get_xlim()) # fix x limits
plt.ylim(plt.gca().get_ylim()) # fix y limits
x = np.arange(100)
plt.plot(x, x*ols.params.age+(x**2)*ols.params.agesq)
plt.show()
import pandas as pd
from numpy import log
from statsmodels.formula.api import ols as OLS
Housing = pd.read_csv('csv/Ecdat/Housing.csv')
H3 = Housing[Housing.bedrooms==3].copy()
ols2 = OLS('log(price)~log(lotsize)+I(log(lotsize)**2)', data=H3).fit()
print(ols2.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:             log(price)   R-squared:                       0.348
Model:                            OLS   Adj. R-squared:                  0.343
Method:                 Least Squares   F-statistic:                     79.39
Date:                Sun, 28 May 2023   Prob (F-statistic):           2.30e-28
Time:                        15:18:22   Log-Likelihood:                -47.453
No. Observations:                 301   AIC:                             100.9
Df Residuals:                     298   BIC:                             112.0
Df Model:                           2
Covariance Type:            nonrobust
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               -4.8947      5.384     -0.909      0.364     -15.491       5.702
log(lotsize)             3.2905      1.273      2.585      0.010       0.785       5.796
I(log(lotsize) ** 2)    -0.1651      0.075     -2.197      0.029      -0.313      -0.017
==============================================================================
Omnibus:                        3.730   Durbin-Watson:                   1.409
Prob(Omnibus):                  0.155   Jarque-Bera (JB):                3.705
Skew:                          -0.271   Prob(JB):                        0.157
Kurtosis:                       2.954   Cond. No.                     2.46e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.46e+04. This might indicate that there are strong multicollinearity or other numerical problems.
-ols2.params['log(lotsize)']/(2*ols2.params['I(log(lotsize) ** 2)'])
9.96511008201641
log(H3.lotsize).describe()  # the turning point (9.97) lies above the sample max (9.66), so the fit is increasing throughout the data
count    301.000000
mean       8.477732
std        0.412099
min        7.408531
25%        8.188689
50%        8.484670
75%        8.779557
max        9.655026
Name: lotsize, dtype: float64
ols1 = OLS('log(price)~log(lotsize)', data=H3).fit()
print(ols1.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:             log(price)   R-squared:                       0.337
Model:                            OLS   Adj. R-squared:                  0.335
Method:                 Least Squares   F-statistic:                     152.0
Date:                Sun, 28 May 2023   Prob (F-statistic):           1.62e-28
Time:                        15:18:22   Log-Likelihood:                -49.872
No. Observations:                 301   AIC:                             103.7
Df Residuals:                     299   BIC:                             111.2
Df Model:                           1
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        6.9115      0.341     20.285      0.000       6.241       7.582
log(lotsize)     0.4949      0.040     12.329      0.000       0.416       0.574
==============================================================================
Omnibus:                        4.137   Durbin-Watson:                   1.400
Prob(Omnibus):                  0.126   Jarque-Bera (JB):                4.069
Skew:                          -0.285   Prob(JB):                        0.131
Kurtosis:                       2.990   Cond. No.                         178.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
from numpy import log
plt.scatter(log(H3.lotsize), log(H3.price), color='w', edgecolors='k')
plt.xlim(plt.gca().get_xlim()) # fix x limits
plt.ylim(plt.gca().get_ylim()) # fix y limits
x = log(H3.lotsize.sort_values())
b2 = ols2.params
b1 = ols1.params
plt.plot(x, b2.iloc[0]+b2.iloc[1]*x+b2.iloc[2]*x**2)  # .iloc: positional [] on params is deprecated in newer pandas
plt.plot(x, b1.iloc[0]+b1.iloc[1]*x, '--')
plt.show()
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols as OLS
Hies = pd.read_csv('csv/loedata/Hies.csv')
len(Hies)
1368
Hies.cons.describe()
count    1.368000e+03
mean     2.422611e+06
std      1.199659e+06
min      2.546630e+05
25%      1.686188e+06
50%      2.239735e+06
75%      2.865792e+06
max      1.532575e+07
Name: cons, dtype: float64
Hies.inc.describe()
count    1.368000e+03
mean     4.221342e+06
std      2.160279e+06
min      0.000000e+00
25%      2.855448e+06
50%      3.914338e+06
75%      5.193994e+06
max      1.899592e+07
Name: inc, dtype: float64
np.sum(Hies.inc==0)
8
Hies[Hies.inc>0].inc.describe()
count    1.360000e+03
mean     4.246173e+06
std      2.142139e+06
min      6.997000e+03
25%      2.879314e+06
50%      3.928140e+06
75%      5.212878e+06
max      1.899592e+07
Name: inc, dtype: float64
Hies['inca'] = [x if x>0 else 1 for x in Hies.inc]  # replace zero incomes with 1 so np.log is defined
Hies['incmil'] = Hies.inc/1e6
fm = 'np.log(cons)~np.log(famsize)+emp+age+I(age**2)+ownhouse+female+educ'
fm1 = fm+'+np.log(inc)'
fm2 = fm+'+np.log(inca)'
fm3 = fm+'+np.log(inc+1)'
fm4 = fm+'+np.log(inca)+I(inc==0)'
fm5 = fm+'+incmil+I(incmil**2)'
ols1 = OLS(fm1, data=Hies[Hies.inc>0]).fit()
ols2 = OLS(fm2, data=Hies).fit()
ols3 = OLS(fm3, data=Hies).fit()
ols4 = OLS(fm4, data=Hies).fit()
ols5 = OLS(fm5, data=Hies).fit()
from statsmodels.iolib.summary2 import summary_col
outreg = summary_col([ols1,ols2,ols3,ols4,ols5], stars=True)
print(outreg)  # summary_col again inserts R-squared rows and sorts all rows alphabetically; regressor_order (shown earlier) pins a custom order
========================================================================================================
                    np.log(cons) I np.log(cons) II np.log(cons) III np.log(cons) IIII np.log(cons) IIIII
--------------------------------------------------------------------------------------------------------
I(age ** 2)         0.0030**       0.0017          0.0017           0.0031**          0.0030**
                    (0.0014)       (0.0015)        (0.0015)         (0.0014)          (0.0014)
I(inc == 0)[T.True]                                                 5.9065***
                                                                    (0.3123)
I(incmil ** 2)                                                                        -0.0062***
                                                                                      (0.0009)
Intercept           11.7484***     14.8247***      14.8247***       11.8755***        17.1542***
                    (1.6693)       (1.8555)        (1.8555)         (1.6587)          (1.6592)
R-squared           0.4370         0.2900          0.2900           0.4380            0.4358
R-squared Adj.      0.4336         0.2858          0.2858           0.4343            0.4320
age                 -0.2065**      -0.1128         -0.1128          -0.2129**         -0.2033**
                    (0.0960)       (0.1071)        (0.1071)         (0.0955)          (0.0955)
educ                0.0046         0.0231***       0.0231***        0.0049            0.0037
                    (0.0046)       (0.0050)        (0.0050)         (0.0046)          (0.0046)
emp                 -0.0647        0.1140**        0.1140**         -0.0572           -0.0088
                    (0.0411)       (0.0445)        (0.0445)         (0.0406)          (0.0396)
female              -0.0330        -0.0638**       -0.0638**        -0.0303           -0.0188
                    (0.0266)       (0.0296)        (0.0296)         (0.0264)          (0.0265)
incmil                                                                                0.1692***
                                                                                      (0.0128)
np.log(famsize)     0.2948***      0.3976***       0.3976***        0.2971***         0.3045***
                    (0.0220)       (0.0238)        (0.0238)         (0.0219)          (0.0219)
np.log(inc + 1)                                    0.0533***
                                                   (0.0089)
np.log(inc)         0.4010***
                    (0.0200)
np.log(inca)                       0.0533***                        0.3996***
                                   (0.0089)                         (0.0199)
ownhouse            -0.0648***     -0.0259         -0.0259          -0.0644***        -0.0666***
                    (0.0192)       (0.0214)        (0.0214)         (0.0192)          (0.0192)
========================================================================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols as OLS
Housing = pd.read_csv('csv/Ecdat/Housing.csv')
ols = OLS('np.log(price)~np.log(lotsize/5000)+I(bedrooms-3)', data=Housing).fit()
theta = ols.params.Intercept  # predicted log(price) at lotsize=5000, bedrooms=3, thanks to the reparameterized formula
se = np.sqrt(ols.scale + ols.bse.Intercept**2)  # prediction SE: error variance plus estimation variance
[theta, se]
[11.089394186363187, 0.2845444495161841]
from scipy.stats import t
c = t.ppf(.975, ols.df_resid)*se
[theta-c, theta+c]
[10.530451460392335, 11.648336912334038]
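# Sketch: get_prediction should reproduce this interval directly; the
# obs_ci_* columns give the 95% prediction interval for a new observation
# at lotsize=5000, bedrooms=3.
new = pd.DataFrame({'lotsize': [5000], 'bedrooms': [3]})
print(ols.get_prediction(new).summary_frame(alpha=0.05))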
# Continue
alpha1 = np.exp(ols.scale/2)  # retransformation factor assuming normal errors
print(alpha1)
print(np.exp(theta)*alpha1)
1.0412349999323829
68172.85164989016
alpha2 = np.exp(ols.resid).mean()  # Duan's smearing estimate (no normality assumption)
print(alpha2)
print(np.exp(theta)*alpha2)
1.0403223271241266
68113.09615956743
import numpy as np
np.random.seed(2)
n = 100
x1 = np.random.normal(size=n)
x2 = np.exp(np.random.normal(size=n))
y = 1+x1-np.log(x2)+np.random.normal(size=n)
#https://stackoverflow.com/questions/44118416/pandas-using-variables-to-create-dataframe-with-one-row-and-column-names-from-v
DF = pd.DataFrame({"y": np.array(y), "x1": np.array(x1), "x2": np.array(x2)})
DF.head()
# Note: with the same seed, numpy's RNG yields different draws than R, so this data set differs from the R version
          y        x1        x2
0 -1.729282 -0.416758  3.194790
1  0.131133 -0.056267  1.471199
2 -0.151210 -2.136196  0.322023
3  3.708615  1.640271  1.542019
4  0.380249 -1.793436  0.737797
ols = OLS('y~x1+x2', data=DF).fit()
#https://stackoverflow.com/questions/73501475/how-to-perform-linearity-tests-on-ols-regression-statmodels
from statsmodels.stats.diagnostic import linear_reset
# linear_reset(ols, power = [2,3], test_type = "fitted", use_f = True) # Doesn't work. Version mismatch.
DF['yhat'] = ols.fittedvalues
aux = OLS('y~x1+x2+I(yhat**2)+I(yhat**3)', data=DF).fit()  # RESET by hand: augment with powers of the fitted values
print(aux.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.524
Method:                 Least Squares   F-statistic:                     28.20
Date:                Sun, 28 May 2023   Prob (F-statistic):           1.92e-15
Time:                        15:18:22   Log-Likelihood:                -153.88
No. Observations:                 100   AIC:                             317.8
Df Residuals:                      95   BIC:                             330.8
Df Model:                           4
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.7255      0.197      3.690      0.000       0.335       1.116
x1               0.6947      0.164      4.248      0.000       0.370       1.019
x2              -0.1860      0.038     -4.857      0.000      -0.262      -0.110
I(yhat ** 2)     0.2578      0.065      3.949      0.000       0.128       0.387
I(yhat ** 3)    -0.0108      0.022     -0.498      0.619      -0.054       0.032
==============================================================================
Omnibus:                        0.958   Durbin-Watson:                   1.988
Prob(Omnibus):                  0.619   Jarque-Bera (JB):                0.955
Skew:                           0.227   Prob(JB):                        0.620
Kurtosis:                       2.851   Cond. No.                         28.2
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(aux.f_test([[0,0,0,1,0], [0,0,0,0,1]]))  # joint test that both added powers of yhat are zero
<F test: F=9.286693843009289, p=0.00020712555845992816, df_denom=95, df_num=2>
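# Sketch: on a newer statsmodels where linear_reset runs, the built-in
# RESET test should reproduce the manual F statistic above (F ~ 9.29):
# print(linear_reset(ols, power=3, test_type='fitted', use_f=True))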
ols2 = OLS('y~x1+np.log(x2)', data=DF).fit()
# linear_reset(ols2, power=[2,3], test_type = 'fitted', use_f = True) # Doesn't work. Version mismatch.