import pandas as pd
from statsmodels.api import OLS
Death = pd.read_csv('csv/loedata/Death.csv')
ols = OLS.from_formula('deathrate~drink', data=Death[Death.year==2010]).fit()
print(ols.summary())
OLS Regression Results ============================================================================== Dep. Variable: deathrate R-squared: 0.318 Model: OLS Adj. R-squared: 0.310 Method: Least Squares F-statistic: 39.14 Date: Sun, 28 May 2023 Prob (F-statistic): 1.59e-08 Time: 15:08:46 Log-Likelihood: -173.59 No. Observations: 86 AIC: 351.2 Df Residuals: 84 BIC: 356.1 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 21.1006 1.722 12.252 0.000 17.676 24.525 drink -0.2268 0.036 -6.256 0.000 -0.299 -0.155 ============================================================================== Omnibus: 0.580 Durbin-Watson: 1.435 Prob(Omnibus): 0.748 Jarque-Bera (JB): 0.574 Skew: -0.188 Prob(JB): 0.750 Kurtosis: 2.865 Cond. No. 412. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
plt.scatter(Death.drink, Death.deathrate, facecolors='w', edgecolors='k')
plt.show()
print(OLS.from_formula('deathrate~drink+aged', data=Death[Death.year==2010]).fit().summary())
OLS Regression Results ============================================================================== Dep. Variable: deathrate R-squared: 0.919 Model: OLS Adj. R-squared: 0.917 Method: Least Squares F-statistic: 470.0 Date: Sun, 28 May 2023 Prob (F-statistic): 5.38e-46 Time: 15:08:46 Log-Likelihood: -82.034 No. Observations: 86 AIC: 170.1 Df Residuals: 83 BIC: 177.4 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -0.6139 1.060 -0.579 0.564 -2.722 1.495 drink 0.0344 0.016 2.099 0.039 0.002 0.067 aged 0.4058 0.016 24.797 0.000 0.373 0.438 ============================================================================== Omnibus: 0.091 Durbin-Watson: 2.150 Prob(Omnibus): 0.955 Jarque-Bera (JB): 0.086 Skew: -0.064 Prob(JB): 0.958 Kurtosis: 2.912 Cond. No. 810. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import pandas as pd
import numpy as np
from statsmodels.api import OLS
Wages1 = pd.read_csv('csv/Ecdat/Wages1.csv')
## https://stackoverflow.com/questions/20840803/how-to-convert-false-to-0-and-true-to-1
Wages1['male'] = [int(x) for x in Wages1.sex=='male'] # :-(
Wages1['female'] = [int(x) for x in Wages1.sex=='female'] # :-(
print(Wages1.head())
exper sex school wage male female 0 9 female 13 6.315296 0 1 1 12 female 12 5.479770 0 1 2 11 female 11 3.642170 0 1 3 9 female 14 4.593337 0 1 4 8 female 14 2.418157 0 1
print(np.sum(Wages1.male), np.sum(Wages1.female))
1725 1569
대상을 여성으로 한정하면 female
더미변수는 상수항과 완전한 공선성을 갖는다. 그럼에도 파이썬은 (실수 연산의 불가피한 부정확성으로 인하여) 여전히 값을 계산한다. Stack overflow의 이 글에 이유가 있다. 이렇게 처리하는 것은 예측 시에는 큰 문제가 없을 수도 있으나 인과관계 분석에서는 용납되지 않는다. R은 공선성을 체크하여 공선성이 있다고 판단되면 해당 변수에 NA
를 리포트한다.
ols = OLS.from_formula('wage~female', data=Wages1[Wages1.female==1]).fit()
print(ols.params)
print("Type of female dummy: ", type(Wages1.female[0]))
Intercept 8.384840e+12 female -8.384840e+12 dtype: float64 Type of female dummy: <class 'numpy.int64'>
위 결과를 보고 절편이 5,794,297,000,000이고 female
더미변수의 계수가 -5,794,297,000,000이라고 생각해서는 안 된다. 파이썬의 statsmodels
모듈이 그런 식으로 코딩되었나보다 생각하면 된다. Summary는 더 많은 정보를 담고 있다.
print(ols.summary())
OLS Regression Results ============================================================================== Dep. Variable: wage R-squared: -0.001 Model: OLS Adj. R-squared: -0.001 Method: Least Squares F-statistic: -0.8714 Date: Sun, 28 May 2023 Prob (F-statistic): 1.00 Time: 15:08:46 Log-Likelihood: -3883.9 No. Observations: 1569 AIC: 7772. Df Residuals: 1567 BIC: 7782. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 8.385e+12 2.06e+13 0.406 0.685 -3.21e+13 4.89e+13 female -8.385e+12 2.06e+13 -0.406 0.685 -4.89e+13 3.21e+13 ============================================================================== Omnibus: 762.482 Durbin-Watson: 1.872 Prob(Omnibus): 0.000 Jarque-Bera (JB): 8859.515 Skew: 1.975 Prob(JB): 0.00 Kurtosis: 13.951 Cond. No. 5.68e+14 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 9.71e-27. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
위 Notes [2]에 강한 다중상관 또는 특이성이 존재한다는 말이 있다.
print(OLS.from_formula('wage~female', data=Wages1).fit().summary())
OLS Regression Results ============================================================================== Dep. Variable: wage R-squared: 0.032 Model: OLS Adj. R-squared: 0.031 Method: Least Squares F-statistic: 107.9 Date: Sun, 28 May 2023 Prob (F-statistic): 6.71e-25 Time: 15:08:46 Log-Likelihood: -8522.2 No. Observations: 3294 AIC: 1.705e+04 Df Residuals: 3292 BIC: 1.706e+04 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 6.3130 0.077 81.495 0.000 6.161 6.465 female -1.1661 0.112 -10.389 0.000 -1.386 -0.946 ============================================================================== Omnibus: 1538.963 Durbin-Watson: 1.866 Prob(Omnibus): 0.000 Jarque-Bera (JB): 15569.712 Skew: 1.965 Prob(JB): 0.00 Kurtosis: 12.899 Cond. No. 2.57 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(OLS.from_formula('wage~male+female', data=Wages1).fit().summary())
OLS Regression Results ============================================================================== Dep. Variable: wage R-squared: 0.032 Model: OLS Adj. R-squared: 0.031 Method: Least Squares F-statistic: 53.94 Date: Sun, 28 May 2023 Prob (F-statistic): 8.91e-24 Time: 15:08:46 Log-Likelihood: -8522.2 No. Observations: 3294 AIC: 1.705e+04 Df Residuals: 3291 BIC: 1.707e+04 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 2.441e+11 2.35e+12 0.104 0.917 -4.37e+12 4.85e+12 male -2.441e+11 2.35e+12 -0.104 0.917 -4.85e+12 4.37e+12 female -2.441e+11 2.35e+12 -0.104 0.917 -4.85e+12 4.37e+12 ============================================================================== Omnibus: 1538.628 Durbin-Watson: 1.866 Prob(Omnibus): 0.000 Jarque-Bera (JB): 15563.735 Skew: 1.965 Prob(JB): 0.00 Kurtosis: 12.897 Cond. No. 8.90e+13 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 6.24e-25. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
여기도 마찬가지이다. 파이썬 statsmodels
모듈은 다중공선성 문제에 대한 판단을 사람에게 맡긴다. 인과관계 분석에서는 매우 조심해야 한다.
print(OLS.from_formula('wage~male+female-1', data=Wages1).fit().summary())
OLS Regression Results ============================================================================== Dep. Variable: wage R-squared: 0.032 Model: OLS Adj. R-squared: 0.031 Method: Least Squares F-statistic: 107.9 Date: Sun, 28 May 2023 Prob (F-statistic): 6.71e-25 Time: 15:08:46 Log-Likelihood: -8522.2 No. Observations: 3294 AIC: 1.705e+04 Df Residuals: 3292 BIC: 1.706e+04 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ male 6.3130 0.077 81.495 0.000 6.161 6.465 female 5.1469 0.081 63.366 0.000 4.988 5.306 ============================================================================== Omnibus: 1538.963 Durbin-Watson: 1.866 Prob(Omnibus): 0.000 Jarque-Bera (JB): 15569.712 Skew: 1.965 Prob(JB): 0.00 Kurtosis: 12.899 Cond. No. 1.05 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import pandas as pd
import numpy as np
from statsmodels.api import OLS
Wages1 = pd.read_csv('csv/Ecdat/Wages1.csv')
print(OLS.from_formula('np.log(wage)~sex+school+exper', data=Wages1).fit().summary())
OLS Regression Results ============================================================================== Dep. Variable: np.log(wage) R-squared: 0.137 Model: OLS Adj. R-squared: 0.137 Method: Least Squares F-statistic: 174.7 Date: Sun, 28 May 2023 Prob (F-statistic): 4.04e-105 Time: 15:08:46 Log-Likelihood: -2869.8 No. Observations: 3294 AIC: 5748. Df Residuals: 3290 BIC: 5772. Df Model: 3 Covariance Type: nonrobust =============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------- Intercept -0.2593 0.088 -2.936 0.003 -0.433 -0.086 sex[T.male] 0.2426 0.020 11.860 0.000 0.202 0.283 school 0.1234 0.006 19.802 0.000 0.111 0.136 exper 0.0354 0.005 7.845 0.000 0.027 0.044 ============================================================================== Omnibus: 881.201 Durbin-Watson: 1.875 Prob(Omnibus): 0.000 Jarque-Bera (JB): 3779.073 Skew: -1.239 Prob(JB): 0.00 Kurtosis: 7.625 Cond. No. 126. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
np.random.seed(1)
Wages1['rnd'] = np.random.normal(size=len(Wages1))
## https://www.statsmodels.org/0.6.1/examples/notebooks/generated/ols.html
print(OLS.from_formula('np.log(wage)~sex+school+exper', data=Wages1).fit().rsquared)
0.13743765266234398
print(OLS.from_formula('np.log(wage)~sex+school+exper+rnd', data=Wages1).fit().rsquared)
0.1374480389013667