import numpy as np
import statsmodels.api as sm
설명변수(educ)를 생성한다.
iterate = 1000  # number of Monte Carlo repetitions
n1, n2, n3 = 13, 25, 12  # group sizes for 12/14/16 years of education
n = n1 + n2 + n3
# Regressor: years of education, constant within each of the three groups.
educ = [yrs for yrs, cnt in [(12.0, n1), (14.0, n2), (16.0, n3)]
        for _ in range(cnt)]
print(educ)
[12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0]
오차 표준편차를 n×1 벡터로 만든다.
# Error standard deviations: deliberately different across the three
# education groups, so the errors are heteroskedastic.
stdevs = [sd for sd, cnt in [(0.8, n1), (1.0, n2), (1.4, n3)]
          for _ in range(cnt)]
print(stdevs)
[0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4]
모의실험을 실행한다.
## https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html
# Fix the seed so the simulation is reproducible.
np.random.seed(11)
bhats = []  # OLS slope estimate collected from each repetition
for rep in range(iterate):
    # Draw standard-normal shocks, then scale by each group's sd.
    shocks = np.random.normal(size=n)
    errors = [sd * z for sd, z in zip(stdevs, shocks)]
    # Generate log wages from the true model: lnwage = 8.3 + 0.08*educ + u.
    lnwage = [8.3 + 0.08 * xi + ei for xi, ei in zip(educ, errors)]
    # Regress lnwage on a constant and educ; keep the slope coefficient.
    fit = sm.OLS(lnwage, sm.add_constant(educ)).fit()
    bhats.append(fit.params[1])  # 0 = intercept, 1 = slope
1,000회 반복으로부터 구한 1,000개 OLS 기울기 추정값들의 평균을 구한다.
# Average of the 1,000 slope estimates: OLS remains unbiased even under
# heteroskedastic errors, so this should be close to the true slope.
print(np.mean(bhats)) # Should be close to the truth 0.08
0.07929930419095235
import pandas as pd
# Log-log regression: elasticity of the crime rate w.r.t. police per capita.
Crime = pd.read_csv('csv/Ecdat/Crime.csv')
formula = 'np.log(crmrte)~np.log(polpc)'
ols = sm.OLS.from_formula(formula, data=Crime).fit()
ols.params
Intercept       -2.624693
np.log(polpc)    0.151685
dtype: float64
# Compare the OLS slope with a crude two-point slope estimator.
maxiter = 5000  # number of Monte Carlo repetitions
n = 40          # sample size per repetition
np.random.seed(10101)
b1hat = [None] * maxiter    # OLS slope estimates
b1tilde = [None] * maxiter  # two-point slope estimates
x = np.random.normal(size=n)  # regressor held fixed across repetitions
dx01 = x[0] - x[1]  # loop-invariant denominator of the two-point estimator
# 'rep' rather than 'iter': the original shadowed the builtin iter().
for rep in range(maxiter):
    u = np.random.normal(size=n)
    # True model: y = 1 - x + u (intercept 1, slope -1).
    y = [1 - xi + ui for xi, ui in zip(x, u)]
    ols = sm.OLS(y, sm.add_constant(x)).fit()
    b1hat[rep] = ols.params[1]
    # Estimator using only the first two observations: unbiased but wildly
    # imprecise compared to OLS, as the variance comparison below shows.
    b1tilde[rep] = (y[0] - y[1]) / dx01
print([np.mean(b1hat), np.mean(b1tilde)])
print([np.var(b1hat), np.var(b1tilde)])
[-0.9950713068722231, -1.3369248972867611]
[0.02650718669047491, 235.6908069747431]
## https://stackoverflow.com/questions/809362/how-to-calculate-cumulative-normal-distribution
from scipy.stats import norm
# P(-0.5 < Z < 1.5) for a standard normal Z, via the CDF.
norm.cdf(1.5)-norm.cdf(-0.5)
0.624655260005155
# Standard-normal quantile function (inverse CDF): the 0.8 quantile.
norm.ppf(0.8) # quantile
0.8416212335729143