import numpy as np
import statsmodels.api as sm
설명변수(educ)를 생성한다.
iterate = 1000  # number of Monte Carlo repetitions
n1, n2, n3 = 13, 25, 12  # group sizes for 12/14/16 years of education
n = n1 + n2 + n3
# Regressor: years of education, constant within each of the three groups.
educ = [yrs for yrs, cnt in [(12.0, n1), (14.0, n2), (16.0, n3)]
        for _ in range(cnt)]
print(educ)
[12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0]
오차 표준편차를 n×1 벡터로 만든다.
# Error standard deviations: deliberately different across the three
# education groups, so the errors are heteroskedastic.
stdevs = [sd for sd, cnt in [(0.8, n1), (1.0, n2), (1.4, n3)]
          for _ in range(cnt)]
print(stdevs)
[0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4]
모의실험을 실행한다.
## https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html
# Fix the seed so the simulation is reproducible.
np.random.seed(11)
bhats = []  # OLS slope estimate collected from each repetition
for rep in range(iterate):
    # Draw standard-normal shocks, then scale by each group's sd.
    shocks = np.random.normal(size=n)
    errors = [sd * z for sd, z in zip(stdevs, shocks)]
    # Generate log wages from the true model: lnwage = 8.3 + 0.08*educ + u.
    lnwage = [8.3 + 0.08 * xi + ei for xi, ei in zip(educ, errors)]
    # Regress lnwage on a constant and educ; keep the slope coefficient.
    fit = sm.OLS(lnwage, sm.add_constant(educ)).fit()
    bhats.append(fit.params[1])  # 0 = intercept, 1 = slope
1,000회 반복으로부터 구한 1,000개 OLS 기울기 추정값들의 평균을 구한다.
# Average of the 1,000 slope estimates: OLS remains unbiased even under
# heteroskedastic errors, so this should be close to the true slope.
print(np.mean(bhats)) # Should be close to the truth 0.08
0.07929930419095235
import pandas as pd
# Log-log regression: elasticity of the crime rate w.r.t. police per capita.
Crime = pd.read_csv('csv/Ecdat/Crime.csv')
formula = 'np.log(crmrte)~np.log(polpc)'
ols = sm.OLS.from_formula(formula, data=Crime).fit()
ols.params
Intercept       -2.624693
np.log(polpc)    0.151685
dtype: float64
# Compare the OLS slope with a crude two-point slope estimator.
maxiter = 5000  # number of Monte Carlo repetitions
n = 40          # sample size per repetition
np.random.seed(10101)
b1hat = [None] * maxiter    # OLS slope estimates
b1tilde = [None] * maxiter  # two-point slope estimates
x = np.random.normal(size=n)  # regressor held fixed across repetitions
dx01 = x[0] - x[1]  # loop-invariant denominator of the two-point estimator
# 'rep' rather than 'iter': the original shadowed the builtin iter().
for rep in range(maxiter):
    u = np.random.normal(size=n)
    # True model: y = 1 - x + u (intercept 1, slope -1).
    y = [1 - xi + ui for xi, ui in zip(x, u)]
    ols = sm.OLS(y, sm.add_constant(x)).fit()
    b1hat[rep] = ols.params[1]
    # Estimator using only the first two observations: unbiased but wildly
    # imprecise compared to OLS, as the variance comparison below shows.
    b1tilde[rep] = (y[0] - y[1]) / dx01
print([np.mean(b1hat), np.mean(b1tilde)])
print([np.var(b1hat), np.var(b1tilde)])
[-0.9950713068722231, -1.3369248972867611]
[0.02650718669047491, 235.6908069747431]
## https://stackoverflow.com/questions/809362/how-to-calculate-cumulative-normal-distribution
from scipy.stats import norm
# P(-0.5 < Z < 1.5) for a standard normal Z, via the CDF.
norm.cdf(1.5)-norm.cdf(-0.5)
0.624655260005155
# Standard-normal quantile function (inverse CDF): the 0.8 quantile.
norm.ppf(0.8) # quantile
0.8416212335729143