import pandas as pd
from numpy import log
from statsmodels.formula.api import ols as OLS
Wages1 = pd.read_csv('csv/Ecdat/Wages1.csv')
Wages1['female'] = [int(x=='female') for x in Wages1.sex]  # dummy: 1 if female, 0 otherwise
ols = OLS('log(wage)~school+exper+female', data=Wages1).fit()
print(ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:              log(wage)   R-squared:                       0.137
Model:                            OLS   Adj. R-squared:                  0.137
Method:                 Least Squares   F-statistic:                     174.7
Date:                Sun, 28 May 2023   Prob (F-statistic):          4.04e-105
Time:                        15:18:21   Log-Likelihood:                -2869.8
No. Observations:                3294   AIC:                             5748.
Df Residuals:                    3290   BIC:                             5772.
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0168      0.088     -0.192      0.848      -0.188       0.155
school         0.1234      0.006     19.802      0.000       0.111       0.136
exper          0.0354      0.005      7.845      0.000       0.027       0.044
female        -0.2426      0.020    -11.860      0.000      -0.283      -0.202
==============================================================================
Omnibus:                      881.201   Durbin-Watson:                   1.875
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3779.073
Skew:                          -1.239   Prob(JB):                         0.00
Kurtosis:                       7.625   Cond. No.                         124.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
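# In a log wage equation the exact percentage effect of a dummy variable is
# 100*(exp(b)-1) rather than 100*b; a quick check on the female coefficient:
from numpy import exp
print(100*(exp(ols.params.female) - 1))  # roughly -21.5 percent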
# Continue
ols = OLS('log(wage)~female*school+exper', data=Wages1).fit()
print(ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:              log(wage)   R-squared:                       0.137
Model:                            OLS   Adj. R-squared:                  0.136
Method:                 Least Squares   F-statistic:                     131.0
Date:                Sun, 28 May 2023   Prob (F-statistic):          5.79e-104
Time:                        15:18:21   Log-Likelihood:                -2869.8
No. Observations:                3294   AIC:                             5750.
Df Residuals:                    3289   BIC:                             5780.
Df Model:                           4
Covariance Type:            nonrobust
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -0.0122      0.109     -0.112      0.911      -0.226       0.201
female           -0.2529      0.148     -1.714      0.087      -0.542       0.036
school            0.1230      0.008     15.003      0.000       0.107       0.139
female:school     0.0009      0.012      0.071      0.944      -0.024       0.025
exper             0.0354      0.005      7.793      0.000       0.026       0.044
=================================================================================
Omnibus:                      881.154   Durbin-Watson:                   1.875
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3778.551
Skew:                          -1.239   Prob(JB):                         0.00
Kurtosis:                       7.625   Cond. No.                         257.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Continue
eqa = OLS('log(wage)~school', data=Wages1).fit()
eqb = OLS('log(wage)~school+female', data=Wages1).fit()
eqc = OLS('log(wage)~school*female', data=Wages1).fit()
eqd = OLS('log(wage)~school', data=Wages1[Wages1.female==0]).fit()
eqe = OLS('log(wage)~school', data=Wages1[Wages1.female==1]).fit()
#https://economics.stackexchange.com/questions/11774/outputting-regressions-as-table-in-python-similar-to-outreg-in-stata
from statsmodels.iolib.summary2 import summary_col
outreg = summary_col([eqa,eqb,eqc,eqd,eqe], stars=True)
print(outreg)  # summary_col adds R-squared and R-squared Adj. rows by default and sorts rows alphabetically; see the regressor_order sketch below
====================================================================================
               log(wage) I log(wage) II log(wage) III log(wage) IIII log(wage) IIIII
------------------------------------------------------------------------------------
Intercept      0.3639***   0.3793***    0.4326***     0.4326***      0.0419
               (0.0738)    (0.0721)     (0.0937)      (0.0908)       (0.1181)
R-squared      0.0784      0.1213       0.1215        0.1021         0.0873
R-squared Adj. 0.0781      0.1208       0.1207        0.1016         0.0867
female                     -0.2601***   -0.3907***
                           (0.0205)     (0.1478)
school         0.1052***   0.1145***    0.1099***     0.1099***      0.1210***
               (0.0063)    (0.0062)     (0.0081)      (0.0078)       (0.0099)
school:female                           0.0112
                                        (0.0125)
====================================================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
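# Sketch, assuming a reasonably recent statsmodels: summary_col accepts
# regressor_order to pin the row order (remaining rows, including the
# R-squared lines, are appended after the named ones) and model_names to
# replace the auto-generated I/II/... column headers.
outreg = summary_col([eqa,eqb,eqc,eqd,eqe], stars=True,
                     model_names=['(a)','(b)','(c)','(d) men','(e) women'],
                     regressor_order=['Intercept','school','female','school:female'])
print(outreg)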
import pandas as pd
from statsmodels.formula.api import ols as OLS
Fastfood = pd.read_csv('csv/loedata/Fastfood.csv')
ols = OLS('fte~after', data=Fastfood[Fastfood.nj==1]).fit()
print(ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.6536
Date:                Sun, 28 May 2023   Prob (F-statistic):              0.419
Time:                        15:18:21   Log-Likelihood:                -2327.4
No. Observations:                 640   AIC:                             4659.
Df Residuals:                     638   BIC:                             4668.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     20.4394      0.513     39.805      0.000      19.431      21.448
after          0.5880      0.727      0.808      0.419      -0.840       2.016
==============================================================================
Omnibus:                      182.038   Durbin-Watson:                   1.874
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              738.315
Skew:                           1.256   Prob(JB):                    4.75e-161
Kurtosis:                       7.624   Cond. No.                         2.61
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(OLS('fte~after', data=Fastfood[Fastfood.nj==0]).fit().summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     1.727
Date:                Sun, 28 May 2023   Prob (F-statistic):              0.191
Time:                        15:18:21   Log-Likelihood:                -575.53
No. Observations:                 154   AIC:                             1155.
Df Residuals:                     152   BIC:                             1161.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     23.3312      1.165     20.024      0.000      21.029      25.633
after         -2.1656      1.648     -1.314      0.191      -5.421       1.090
==============================================================================
Omnibus:                       45.554   Durbin-Watson:                   1.721
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               98.276
Skew:                           1.292   Prob(JB):                     4.57e-22
Kurtosis:                       5.939   Cond. No.                         2.62
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(OLS('fte~nj', data=Fastfood[Fastfood.after==0]).fit().summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.014
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     5.525
Date:                Sun, 28 May 2023   Prob (F-statistic):             0.0192
Time:                        15:18:21   Log-Likelihood:                -1467.8
No. Observations:                 398   AIC:                             2940.
Df Residuals:                     396   BIC:                             2948.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     23.3312      1.105     21.118      0.000      21.159      25.503
nj            -2.8918      1.230     -2.351      0.019      -5.310      -0.473
==============================================================================
Omnibus:                      164.507   Durbin-Watson:                   1.901
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              853.185
Skew:                           1.712   Prob(JB):                    5.41e-186
Kurtosis:                       9.303   Cond. No.                         4.34
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(OLS('fte~nj', data=Fastfood[Fastfood.after==1]).fit().summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                   0.01428
Date:                Sun, 28 May 2023   Prob (F-statistic):              0.905
Time:                        15:18:21   Log-Likelihood:                -1435.6
No. Observations:                 396   AIC:                             2875.
Df Residuals:                     394   BIC:                             2883.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     21.1656      1.038     20.397      0.000      19.125      23.206
nj            -0.1382      1.156     -0.119      0.905      -2.411       2.135
==============================================================================
Omnibus:                       44.985   Durbin-Watson:                   1.769
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               71.204
Skew:                           0.726   Prob(JB):                     3.45e-16
Kurtosis:                       4.486   Cond. No.                         4.33
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(OLS('fte~nj*after', data=Fastfood).fit().summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     1.964
Date:                Sun, 28 May 2023   Prob (F-statistic):              0.118
Time:                        15:18:21   Log-Likelihood:                -2904.2
No. Observations:                 794   AIC:                             5816.
Df Residuals:                     790   BIC:                             5835.
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     23.3312      1.072     21.767      0.000      21.227      25.435
nj            -2.8918      1.194     -2.423      0.016      -5.235      -0.549
after         -2.1656      1.516     -1.429      0.154      -5.141       0.810
nj:after       2.7536      1.688      1.631      0.103      -0.561       6.068
==============================================================================
Omnibus:                      218.742   Durbin-Watson:                   1.840
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              804.488
Skew:                           1.268   Prob(JB):                    2.03e-175
Kurtosis:                       7.229   Cond. No.                         11.3
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
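# Cross-check: with a saturated nj*after model, the nj:after coefficient
# equals the difference-in-differences of the four group means of fte.
m = Fastfood.groupby(['nj','after'])['fte'].mean()
print((m.loc[(1,1)] - m.loc[(1,0)]) - (m.loc[(0,1)] - m.loc[(0,0)]))  # ~2.7536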
#https://stackoverflow.com/questions/13413590/how-to-drop-rows-of-pandas-dataframe-whose-value-in-a-certain-column-is-nan
Fastfood = pd.read_csv('csv/loedata/Fastfood.csv')
print('Original:', Fastfood.shape)
Fastfood = Fastfood.dropna(subset=['id','fte','nj','after'])  # keep only rows complete in the model and cluster variables
print('Cleaned :', Fastfood.shape)
mod = OLS('fte~nj*after', data=Fastfood)
Original: (820, 35)
Cleaned : (794, 35)
print(mod.fit(cov_type='cluster', cov_kwds={'groups': Fastfood['id']}).summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    fte   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     1.802
Date:                Sun, 28 May 2023   Prob (F-statistic):              0.146
Time:                        15:18:21   Log-Likelihood:                -2904.2
No. Observations:                 794   AIC:                             5816.
Df Residuals:                     790   BIC:                             5835.
Df Model:                           3
Covariance Type:              cluster
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     23.3312      1.347     17.327      0.000      20.692      25.970
nj            -2.8918      1.440     -2.009      0.045      -5.713      -0.070
after         -2.1656      1.218     -1.778      0.075      -4.553       0.222
nj:after       2.7536      1.307      2.107      0.035       0.193       5.315
==============================================================================
Omnibus:                      218.742   Durbin-Watson:                   1.840
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              804.488
Skew:                           1.268   Prob(JB):                    2.03e-175
Kurtosis:                       7.229   Cond. No.                         11.3
==============================================================================

Notes:
[1] Standard Errors are robust to cluster correlation (cluster)
import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
np.random.seed(101)
n = 100
x = np.random.normal(loc=3.2, scale=1, size=n)
u = np.random.normal(scale=.75, size=n)
y = 3+7*x-x**2+u
plt.scatter(x,y)
plt.show()
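# Sketch: recover the quadratic from the simulated data with a patsy
# formula; the estimates should be close to the true values (3, 7, -1).
import pandas as pd
from statsmodels.formula.api import ols as OLS
sim = pd.DataFrame({'x': x, 'y': y})
print(OLS('y~x+I(x**2)', data=sim).fit().params)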
import pandas as pd
from numpy import log
from statsmodels.formula.api import ols as OLS
smoke = pd.read_csv('csv/wooldridge/smoke.csv')
ols = OLS('cigs~log(income)+log(cigpric)+educ+age+agesq', data=smoke).fit()
print(ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                   cigs   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     7.565
Date:                Sun, 28 May 2023   Prob (F-statistic):           5.92e-07
Time:                        15:18:22   Log-Likelihood:                -3239.5
No. Observations:                 807   AIC:                             6491.
Df Residuals:                     801   BIC:                             6519.
Df Model:                           5
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        5.3688     23.897      0.225      0.822     -41.540      52.277
log(income)      0.7583      0.729      1.041      0.298      -0.672       2.189
log(cigpric)    -2.8532      5.733     -0.498      0.619     -14.107       8.401
educ            -0.5141      0.168     -3.068      0.002      -0.843      -0.185
age              0.7806      0.161      4.860      0.000       0.465       1.096
agesq           -0.0091      0.002     -5.207      0.000      -0.013      -0.006
==============================================================================
Omnibus:                      230.357   Durbin-Watson:                   1.998
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              513.713
Skew:                           1.562   Prob(JB):                    2.81e-112
Kurtosis:                       5.350   Cond. No.                     1.32e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.32e+05. This might indicate that there are strong multicollinearity or other numerical problems.
print(-ols.params.age/(2*ols.params.agesq))
42.86552149669145
#https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html
print(smoke.age.describe())  # the implied turning point (~42.9) lies near the center of the observed age range
count    807.000000
mean      41.237918
std       17.027285
min       17.000000
25%       28.000000
50%       38.000000
75%       54.000000
max       88.000000
Name: age, dtype: float64
#https://stackoverflow.com/questions/33768122/python-pandas-dataframe-how-to-multiply-entire-column-with-a-scalar
#https://stackoverflow.com/questions/53162/how-can-i-do-a-line-break-line-continuation-in-python
# Partial prediction excluding the age terms, so cigs - pred0 isolates the age profile
smoke['pred0'] = ols.params.Intercept \
+ log(smoke.income)*ols.params['log(income)'] \
+ log(smoke.cigpric)*ols.params['log(cigpric)'] \
+ smoke.educ*ols.params.educ
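# Equivalent cross-check: predict with the age terms zeroed out instead of
# summing coefficient products by hand.
pred0_alt = ols.predict(smoke.assign(age=0, agesq=0))
print((smoke.pred0 - pred0_alt).abs().max())  # should be ~0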
import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
plt.scatter(smoke.age, smoke.cigs-smoke.pred0, color='w', edgecolors='k')
plt.xlim(plt.gca().get_xlim()) # fix x limits
plt.ylim(plt.gca().get_ylim()) # fix y limits
x = np.arange(100)
plt.plot(x, x*ols.params.age+(x**2)*ols.params.agesq)
plt.show()
import pandas as pd
from numpy import log
from statsmodels.formula.api import ols as OLS
Housing = pd.read_csv('csv/Ecdat/Housing.csv')
H3 = Housing[Housing.bedrooms==3].copy()
ols2 = OLS('log(price)~log(lotsize)+I(log(lotsize)**2)', data=H3).fit()
print(ols2.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:             log(price)   R-squared:                       0.348
Model:                            OLS   Adj. R-squared:                  0.343
Method:                 Least Squares   F-statistic:                     79.39
Date:                Sun, 28 May 2023   Prob (F-statistic):           2.30e-28
Time:                        15:18:22   Log-Likelihood:                -47.453
No. Observations:                 301   AIC:                             100.9
Df Residuals:                     298   BIC:                             112.0
Df Model:                           2
Covariance Type:            nonrobust
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               -4.8947      5.384     -0.909      0.364     -15.491       5.702
log(lotsize)             3.2905      1.273      2.585      0.010       0.785       5.796
I(log(lotsize) ** 2)    -0.1651      0.075     -2.197      0.029      -0.313      -0.017
==============================================================================
Omnibus:                        3.730   Durbin-Watson:                   1.409
Prob(Omnibus):                  0.155   Jarque-Bera (JB):                3.705
Skew:                          -0.271   Prob(JB):                        0.157
Kurtosis:                       2.954   Cond. No.                     2.46e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.46e+04. This might indicate that there are strong multicollinearity or other numerical problems.
-ols2.params['log(lotsize)']/(2*ols2.params['I(log(lotsize) ** 2)'])
9.96511008201641
log(H3.lotsize).describe()  # the turning point (9.97) lies above the sample max (9.66), so the fit is increasing throughout the data
count    301.000000
mean       8.477732
std        0.412099
min        7.408531
25%        8.188689
50%        8.484670
75%        8.779557
max        9.655026
Name: lotsize, dtype: float64
ols1 = OLS('log(price)~log(lotsize)', data=H3).fit()
print(ols1.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:             log(price)   R-squared:                       0.337
Model:                            OLS   Adj. R-squared:                  0.335
Method:                 Least Squares   F-statistic:                     152.0
Date:                Sun, 28 May 2023   Prob (F-statistic):           1.62e-28
Time:                        15:18:22   Log-Likelihood:                -49.872
No. Observations:                 301   AIC:                             103.7
Df Residuals:                     299   BIC:                             111.2
Df Model:                           1
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        6.9115      0.341     20.285      0.000       6.241       7.582
log(lotsize)     0.4949      0.040     12.329      0.000       0.416       0.574
==============================================================================
Omnibus:                        4.137   Durbin-Watson:                   1.400
Prob(Omnibus):                  0.126   Jarque-Bera (JB):                4.069
Skew:                          -0.285   Prob(JB):                        0.131
Kurtosis:                       2.990   Cond. No.                         178.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
from numpy import log
plt.scatter(log(H3.lotsize), log(H3.price), color='w', edgecolors='k')
plt.xlim(plt.gca().get_xlim()) # fix x limits
plt.ylim(plt.gca().get_ylim()) # fix y limits
x = log(H3.lotsize.sort_values())
b2 = ols2.params
b1 = ols1.params
plt.plot(x, b2.iloc[0]+b2.iloc[1]*x+b2.iloc[2]*x**2)  # .iloc: positional [] on params is deprecated in newer pandas
plt.plot(x, b1.iloc[0]+b1.iloc[1]*x, '--')
plt.show()
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols as OLS
Hies = pd.read_csv('csv/loedata/Hies.csv')
len(Hies)
1368
Hies.cons.describe()
count    1.368000e+03
mean     2.422611e+06
std      1.199659e+06
min      2.546630e+05
25%      1.686188e+06
50%      2.239735e+06
75%      2.865792e+06
max      1.532575e+07
Name: cons, dtype: float64
Hies.inc.describe()
count    1.368000e+03
mean     4.221342e+06
std      2.160279e+06
min      0.000000e+00
25%      2.855448e+06
50%      3.914338e+06
75%      5.193994e+06
max      1.899592e+07
Name: inc, dtype: float64
np.sum(Hies.inc==0)
8
Hies[Hies.inc>0].inc.describe()
count    1.360000e+03
mean     4.246173e+06
std      2.142139e+06
min      6.997000e+03
25%      2.879314e+06
50%      3.928140e+06
75%      5.212878e+06
max      1.899592e+07
Name: inc, dtype: float64
Hies['inca'] = [x if x>0 else 1 for x in Hies.inc]  # replace zero incomes with 1 so np.log is defined
Hies['incmil'] = Hies.inc/1e6
fm = 'np.log(cons)~np.log(famsize)+emp+age+I(age**2)+ownhouse+female+educ'
fm1 = fm+'+np.log(inc)'
fm2 = fm+'+np.log(inca)'
fm3 = fm+'+np.log(inc+1)'
fm4 = fm+'+np.log(inca)+I(inc==0)'
fm5 = fm+'+incmil+I(incmil**2)'
ols1 = OLS(fm1, data=Hies[Hies.inc>0]).fit()
ols2 = OLS(fm2, data=Hies).fit()
ols3 = OLS(fm3, data=Hies).fit()
ols4 = OLS(fm4, data=Hies).fit()
ols5 = OLS(fm5, data=Hies).fit()
from statsmodels.iolib.summary2 import summary_col
outreg = summary_col([ols1,ols2,ols3,ols4,ols5], stars=True)
print(outreg)  # summary_col again inserts R-squared rows and sorts all rows alphabetically; regressor_order (shown earlier) pins a custom order
========================================================================================================
                    np.log(cons) I np.log(cons) II np.log(cons) III np.log(cons) IIII np.log(cons) IIIII
--------------------------------------------------------------------------------------------------------
I(age ** 2)         0.0030**       0.0017          0.0017           0.0031**          0.0030**
                    (0.0014)       (0.0015)        (0.0015)         (0.0014)          (0.0014)
I(inc == 0)[T.True]                                                 5.9065***
                                                                    (0.3123)
I(incmil ** 2)                                                                        -0.0062***
                                                                                      (0.0009)
Intercept           11.7484***     14.8247***      14.8247***       11.8755***        17.1542***
                    (1.6693)       (1.8555)        (1.8555)         (1.6587)          (1.6592)
R-squared           0.4370         0.2900          0.2900           0.4380            0.4358
R-squared Adj.      0.4336         0.2858          0.2858           0.4343            0.4320
age                 -0.2065**      -0.1128         -0.1128          -0.2129**         -0.2033**
                    (0.0960)       (0.1071)        (0.1071)         (0.0955)          (0.0955)
educ                0.0046         0.0231***       0.0231***        0.0049            0.0037
                    (0.0046)       (0.0050)        (0.0050)         (0.0046)          (0.0046)
emp                 -0.0647        0.1140**        0.1140**         -0.0572           -0.0088
                    (0.0411)       (0.0445)        (0.0445)         (0.0406)          (0.0396)
female              -0.0330        -0.0638**       -0.0638**        -0.0303           -0.0188
                    (0.0266)       (0.0296)        (0.0296)         (0.0264)          (0.0265)
incmil                                                                                0.1692***
                                                                                      (0.0128)
np.log(famsize)     0.2948***      0.3976***       0.3976***        0.2971***         0.3045***
                    (0.0220)       (0.0238)        (0.0238)         (0.0219)          (0.0219)
np.log(inc + 1)                                    0.0533***
                                                   (0.0089)
np.log(inc)         0.4010***
                    (0.0200)
np.log(inca)                       0.0533***                        0.3996***
                                   (0.0089)                         (0.0199)
ownhouse            -0.0648***     -0.0259         -0.0259          -0.0644***        -0.0666***
                    (0.0192)       (0.0214)        (0.0214)         (0.0192)          (0.0192)
========================================================================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols as OLS
Housing = pd.read_csv('csv/Ecdat/Housing.csv')
ols = OLS('np.log(price)~np.log(lotsize/5000)+I(bedrooms-3)', data=Housing).fit()
theta = ols.params.Intercept  # predicted log(price) at lotsize=5000, bedrooms=3, thanks to the reparameterized formula
se = np.sqrt(ols.scale + ols.bse.Intercept**2)  # prediction SE: error variance plus estimation variance
[theta, se]
[11.089394186363187, 0.2845444495161841]
from scipy.stats import t
c = t.ppf(.975, ols.df_resid)*se
[theta-c, theta+c]
[10.530451460392335, 11.648336912334038]
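# Sketch: get_prediction should reproduce this interval directly; the
# obs_ci_* columns give the 95% prediction interval for a new observation
# at lotsize=5000, bedrooms=3.
new = pd.DataFrame({'lotsize': [5000], 'bedrooms': [3]})
print(ols.get_prediction(new).summary_frame(alpha=0.05))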
# Continue
alpha1 = np.exp(ols.scale/2)  # retransformation factor assuming normal errors
print(alpha1)
print(np.exp(theta)*alpha1)
1.0412349999323829
68172.85164989016
alpha2 = np.exp(ols.resid).mean()  # Duan's smearing estimate (no normality assumption)
print(alpha2)
print(np.exp(theta)*alpha2)
1.0403223271241266
68113.09615956743
import numpy as np
np.random.seed(2)
n = 100
x1 = np.random.normal(size=n)
x2 = np.exp(np.random.normal(size=n))
y = 1+x1-np.log(x2)+np.random.normal(size=n)
#https://stackoverflow.com/questions/44118416/pandas-using-variables-to-create-dataframe-with-one-row-and-column-names-from-v
DF = pd.DataFrame({"y": np.array(y), "x1": np.array(x1), "x2": np.array(x2)})
DF.head()
# Note: with the same seed, numpy's RNG yields different draws than R, so this data set differs from the R version
          y        x1        x2
0 -1.729282 -0.416758  3.194790
1  0.131133 -0.056267  1.471199
2 -0.151210 -2.136196  0.322023
3  3.708615  1.640271  1.542019
4  0.380249 -1.793436  0.737797
ols = OLS('y~x1+x2', data=DF).fit()
#https://stackoverflow.com/questions/73501475/how-to-perform-linearity-tests-on-ols-regression-statmodels
from statsmodels.stats.diagnostic import linear_reset
# linear_reset(ols, power = [2,3], test_type = "fitted", use_f = True) # Doesn't work. Version mismatch.
DF['yhat'] = ols.fittedvalues
aux = OLS('y~x1+x2+I(yhat**2)+I(yhat**3)', data=DF).fit()  # RESET by hand: augment with powers of the fitted values
print(aux.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.524
Method:                 Least Squares   F-statistic:                     28.20
Date:                Sun, 28 May 2023   Prob (F-statistic):           1.92e-15
Time:                        15:18:22   Log-Likelihood:                -153.88
No. Observations:                 100   AIC:                             317.8
Df Residuals:                      95   BIC:                             330.8
Df Model:                           4
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.7255      0.197      3.690      0.000       0.335       1.116
x1               0.6947      0.164      4.248      0.000       0.370       1.019
x2              -0.1860      0.038     -4.857      0.000      -0.262      -0.110
I(yhat ** 2)     0.2578      0.065      3.949      0.000       0.128       0.387
I(yhat ** 3)    -0.0108      0.022     -0.498      0.619      -0.054       0.032
==============================================================================
Omnibus:                        0.958   Durbin-Watson:                   1.988
Prob(Omnibus):                  0.619   Jarque-Bera (JB):                0.955
Skew:                           0.227   Prob(JB):                        0.620
Kurtosis:                       2.851   Cond. No.                         28.2
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(aux.f_test([[0,0,0,1,0], [0,0,0,0,1]]))  # joint test that both added powers of yhat are zero
<F test: F=9.286693843009289, p=0.00020712555845992816, df_denom=95, df_num=2>
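# Sketch: on a newer statsmodels where linear_reset runs, the built-in
# RESET test should reproduce the manual F statistic above (F ~ 9.29):
# print(linear_reset(ols, power=3, test_type='fitted', use_f=True))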
ols2 = OLS('y~x1+np.log(x2)', data=DF).fit()
# linear_reset(ols2, power=[2,3], test_type = 'fitted', use_f = True) # Doesn't work. Version mismatch.