36. Higher-Order Regression and Categorical Variables

Contents

Higher-Order Regression

Categorical Variables


Higher-Order Regression

# True model: Y = 5 + 2X + 3X^2
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

nsample = 50  # number of samples
# linspace returns evenly spaced samples over [0, 10]
x = np.linspace(0, 10, nsample)
# column_stack stacks the 1-D arrays x and x**2 as columns of a 2-D array
X = np.column_stack((x, x**2))
# prepend a column of ones for the intercept
X = sm.add_constant(X)
X
array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 2.04081633e-01, 4.16493128e-02],
       [1.00000000e+00, 4.08163265e-01, 1.66597251e-01],
       [1.00000000e+00, 6.12244898e-01, 3.74843815e-01],
       [1.00000000e+00, 8.16326531e-01, 6.66389005e-01],
       [1.00000000e+00, 1.02040816e+00, 1.04123282e+00],
       [1.00000000e+00, 1.22448980e+00, 1.49937526e+00],
       [1.00000000e+00, 1.42857143e+00, 2.04081633e+00],
       [1.00000000e+00, 1.63265306e+00, 2.66555602e+00],
       [1.00000000e+00, 1.83673469e+00, 3.37359434e+00],
       [1.00000000e+00, 2.04081633e+00, 4.16493128e+00],
       [1.00000000e+00, 2.24489796e+00, 5.03956685e+00],
       [1.00000000e+00, 2.44897959e+00, 5.99750104e+00],
       [1.00000000e+00, 2.65306122e+00, 7.03873386e+00],
       [1.00000000e+00, 2.85714286e+00, 8.16326531e+00],
       [1.00000000e+00, 3.06122449e+00, 9.37109538e+00],
       [1.00000000e+00, 3.26530612e+00, 1.06622241e+01],
       [1.00000000e+00, 3.46938776e+00, 1.20366514e+01],
       [1.00000000e+00, 3.67346939e+00, 1.34943773e+01],
       [1.00000000e+00, 3.87755102e+00, 1.50354019e+01],
       [1.00000000e+00, 4.08163265e+00, 1.66597251e+01],
       [1.00000000e+00, 4.28571429e+00, 1.83673469e+01],
       [1.00000000e+00, 4.48979592e+00, 2.01582674e+01],
       [1.00000000e+00, 4.69387755e+00, 2.20324865e+01],
       [1.00000000e+00, 4.89795918e+00, 2.39900042e+01],
       [1.00000000e+00, 5.10204082e+00, 2.60308205e+01],
       [1.00000000e+00, 5.30612245e+00, 2.81549354e+01],
       [1.00000000e+00, 5.51020408e+00, 3.03623490e+01],
       [1.00000000e+00, 5.71428571e+00, 3.26530612e+01],
       [1.00000000e+00, 5.91836735e+00, 3.50270721e+01],
       [1.00000000e+00, 6.12244898e+00, 3.74843815e+01],
       [1.00000000e+00, 6.32653061e+00, 4.00249896e+01],
       [1.00000000e+00, 6.53061224e+00, 4.26488963e+01],
       [1.00000000e+00, 6.73469388e+00, 4.53561016e+01],
       [1.00000000e+00, 6.93877551e+00, 4.81466056e+01],
       [1.00000000e+00, 7.14285714e+00, 5.10204082e+01],
       [1.00000000e+00, 7.34693878e+00, 5.39775094e+01],
       [1.00000000e+00, 7.55102041e+00, 5.70179092e+01],
       [1.00000000e+00, 7.75510204e+00, 6.01416077e+01],
       [1.00000000e+00, 7.95918367e+00, 6.33486047e+01],
       [1.00000000e+00, 8.16326531e+00, 6.66389005e+01],
       [1.00000000e+00, 8.36734694e+00, 7.00124948e+01],
       [1.00000000e+00, 8.57142857e+00, 7.34693878e+01],
       [1.00000000e+00, 8.77551020e+00, 7.70095793e+01],
       [1.00000000e+00, 8.97959184e+00, 8.06330696e+01],
       [1.00000000e+00, 9.18367347e+00, 8.43398584e+01],
       [1.00000000e+00, 9.38775510e+00, 8.81299459e+01],
       [1.00000000e+00, 9.59183673e+00, 9.20033319e+01],
       [1.00000000e+00, 9.79591837e+00, 9.59600167e+01],
       [1.00000000e+00, 1.00000000e+01, 1.00000000e+02]])
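As a side note, the same three-column design matrix (1, x, x^2) can be built in a single call. A minimal sketch, assuming the same x as above:

import numpy as np

x = np.linspace(0, 10, 50)
# np.vander with increasing=True puts the constant column first: 1, x, x**2
X_poly = np.vander(x, N=3, increasing=True)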
beta = np.array([5, 2, 3])  # true coefficients
e = np.random.normal(size=nsample)  # Gaussian noise
y = np.dot(X, beta) + e

# Ordinary least squares: fit the model and estimate the regression coefficients
model = sm.OLS(y,X)
result = model.fit()
result.params
array([4.91388904, 1.89789012, 3.01647615])
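The estimates are close to the true coefficients beta = (5, 2, 3); the remaining gap comes from the random noise e, so the numbers will differ slightly from run to run (no random seed is set).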
result.summary()
OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 2.449e+05
Date:                Wed, 19 Sep 2018   Prob (F-statistic):           3.80e-95
Time:                        16:48:00   Log-Likelihood:                -68.540
No. Observations:                  50   AIC:                             143.1
Df Residuals:                      47   BIC:                             148.8
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.9139      0.401     12.257      0.000       4.107       5.720
x1             1.8979      0.185     10.236      0.000       1.525       2.271
x2             3.0165      0.018    168.240      0.000       2.980       3.053
==============================================================================
Omnibus:                        0.197   Durbin-Watson:                   2.100
Prob(Omnibus):                  0.906   Jarque-Bera (JB):                0.003
Skew:                           0.019   Prob(JB):                        0.998
Kurtosis:                       3.011   Cond. No.                         142.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# fitted values from the regression
y_fitted = result.fittedvalues
# plot the data and the fitted curve
fig,ax = plt.subplots(figsize=(8,6))
ax.plot(x,y,'o',label='data')
ax.plot(x,y_fitted,'r--',label='OLS')
ax.legend(loc='best')
plt.show()
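Once fitted, the model can also be evaluated at new points with result.predict. A minimal sketch, assuming the same column layout (constant, x, x**2) used for fitting; x_new is an illustrative name:

# predict at a few new x values (columns must match the fitted design matrix)
x_new = np.linspace(0, 12, 5)
X_new = sm.add_constant(np.column_stack((x_new, x_new**2)))
y_new = result.predict(X_new)
print(y_new)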

Categorical Variables

# Suppose a categorical variable has 3 levels (a, b, c), e.g. an exam grade.
# We one-hot encode it into dummy variables: a -> (1,0,0), b -> (0,1,0),
# c -> (0,0,1), with one coefficient per level, so the categorical contribution
# to Y is b0*d0 + b1*d1 + b2*d2, where exactly one dummy is 1 per observation.
# In general: one dummy column per level, with a 1 in the position of the
# observed level and 0 everywhere else.
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
nsamples = 50
groups = np.zeros(nsamples, int)
groups[20:40] = 1  # second group
groups[40:] = 2    # third group
print('groups:', groups)
# build the dummy (one-hot) matrix; drop=True removes the original column
dummy = sm.categorical(groups, drop=True)
print('dummy', dummy)
groups: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 2 2 2 2 2 2 2 2 2]
dummy [[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]
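Note that sm.categorical has been deprecated in recent statsmodels releases; a common alternative that produces the same one-hot matrix is pandas.get_dummies. A minimal sketch, assuming pandas is available:

import pandas as pd

# one column per level (0, 1, 2), with a 1 marking the observed level
dummy_alt = pd.get_dummies(groups).to_numpy(dtype=float)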
# True model: Y = 5 + 2X + 3Z1 + 6Z2 + 9Z3 + e
x = np.linspace(0, 10, nsamples)
# stack x with the three dummy columns
X = np.column_stack((x, dummy))
print(X)
X = sm.add_constant(X)
beta = [5, 2, 3, 6, 9]
e = np.random.normal(size=nsamples)
y = np.dot(X, beta) + e
[[ 0.          1.          0.          0.        ]
 [ 0.20408163  1.          0.          0.        ]
 [ 0.40816327  1.          0.          0.        ]
 [ 0.6122449   1.          0.          0.        ]
 [ 0.81632653  1.          0.          0.        ]
 [ 1.02040816  1.          0.          0.        ]
 [ 1.2244898   1.          0.          0.        ]
 [ 1.42857143  1.          0.          0.        ]
 [ 1.63265306  1.          0.          0.        ]
 [ 1.83673469  1.          0.          0.        ]
 [ 2.04081633  1.          0.          0.        ]
 [ 2.24489796  1.          0.          0.        ]
 [ 2.44897959  1.          0.          0.        ]
 [ 2.65306122  1.          0.          0.        ]
 [ 2.85714286  1.          0.          0.        ]
 [ 3.06122449  1.          0.          0.        ]
 [ 3.26530612  1.          0.          0.        ]
 [ 3.46938776  1.          0.          0.        ]
 [ 3.67346939  1.          0.          0.        ]
 [ 3.87755102  1.          0.          0.        ]
 [ 4.08163265  0.          1.          0.        ]
 [ 4.28571429  0.          1.          0.        ]
 [ 4.48979592  0.          1.          0.        ]
 [ 4.69387755  0.          1.          0.        ]
 [ 4.89795918  0.          1.          0.        ]
 [ 5.10204082  0.          1.          0.        ]
 [ 5.30612245  0.          1.          0.        ]
 [ 5.51020408  0.          1.          0.        ]
 [ 5.71428571  0.          1.          0.        ]
 [ 5.91836735  0.          1.          0.        ]
 [ 6.12244898  0.          1.          0.        ]
 [ 6.32653061  0.          1.          0.        ]
 [ 6.53061224  0.          1.          0.        ]
 [ 6.73469388  0.          1.          0.        ]
 [ 6.93877551  0.          1.          0.        ]
 [ 7.14285714  0.          1.          0.        ]
 [ 7.34693878  0.          1.          0.        ]
 [ 7.55102041  0.          1.          0.        ]
 [ 7.75510204  0.          1.          0.        ]
 [ 7.95918367  0.          1.          0.        ]
 [ 8.16326531  0.          0.          1.        ]
 [ 8.36734694  0.          0.          1.        ]
 [ 8.57142857  0.          0.          1.        ]
 [ 8.7755102   0.          0.          1.        ]
 [ 8.97959184  0.          0.          1.        ]
 [ 9.18367347  0.          0.          1.        ]
 [ 9.3877551   0.          0.          1.        ]
 [ 9.59183673  0.          0.          1.        ]
 [ 9.79591837  0.          0.          1.        ]
 [10.          0.          0.          1.        ]]
# regression on the constant, x and the dummy columns
res = sm.OLS(y,X).fit()
res.summary()
OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.981
Model:                            OLS   Adj. R-squared:                  0.980
Method:                 Least Squares   F-statistic:                     789.5
Date:                Thu, 20 Sep 2018   Prob (F-statistic):           1.50e-39
Time:                        10:23:05   Log-Likelihood:                -76.762
No. Observations:                  50   AIC:                             161.5
Df Residuals:                      46   BIC:                             169.2
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.6313      0.664     11.501      0.000       6.296       8.967
x1             2.1754      0.153     14.247      0.000       1.868       2.483
x2             0.2556      0.421      0.607      0.547      -0.591       1.103
x3             2.2886      0.352      6.508      0.000       1.581       2.997
x4             5.0870      0.792      6.421      0.000       3.492       6.682
==============================================================================
Omnibus:                        3.522   Durbin-Watson:                   2.244
Prob(Omnibus):                  0.172   Jarque-Bera (JB):                2.958
Skew:                          -0.596   Prob(JB):                        0.228
Kurtosis:                       3.026   Cond. No.                     1.32e+17
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
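The huge condition number and warning [2] are expected here: the three dummy columns always sum to 1, which exactly duplicates the constant column (the "dummy variable trap"), so the design matrix is singular and the individual coefficients are not uniquely identified. A minimal sketch of the usual fix, dropping the first group's dummy so that const becomes the intercept of the baseline group (X_reduced and res_reduced are illustrative names):

# keep the constant, x, and only two of the three dummy columns
X_reduced = sm.add_constant(np.column_stack((x, dummy[:, 1:])))
res_reduced = sm.OLS(y, X_reduced).fit()
# the two dummy coefficients are now offsets of groups 1 and 2 relative to group 0
print(res_reduced.params)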

# fitted values from the regression
y_fitted = res.fittedvalues
# plot the data and the fitted values
fig,ax = plt.subplots(figsize=(8,6))
ax.plot(x,y,'o',label='data')
ax.plot(x,y_fitted,'r--',label='OLS')
ax.legend(loc='best')
plt.show()
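Because the model is linear in x with group-specific intercepts, the fitted values form a straight line that jumps at the group boundaries (observations 20 and 40).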

Reposted from blog.csdn.net/xzy53719/article/details/82782516