03.Regression

01.regression

  1 # -*- coding: utf-8 -*-
  2 """
  3 scipy 패키지 선형 회귀분석
  4 """
  5 from scipy import stats #선형 회귀분석 모듈
  6 import pandas as pd
  7 
  8 score_df=pd.read_csv("../data/score_iq.csv")
  9 print(score_df.info()) #150x6
 10 """
 11 RangeIndex: 150 entries, 0 to 149
 12 Data columns (total 6 columns):
 13 sid        150 non-null int64
 14 score      150 non-null int64
 15 iq         150 non-null int64
 16 academy    150 non-null int64
 17 game       150 non-null int64
 18 tv         150 non-null int64
 19 dtypes: int64(6)
 20 """
 21 
 22 print(score_df.head())
 23 """
 24      sid  score   iq  academy  game  tv
 25 0  10001     90  140        2     1   0
 26 1  10002     75  125        1     3   3
 27 2  10003     77  120        1     0   4
 28 3  10004     83  135        2     3   2
 29 4  10005     65  105        0     4   4
 30 """
 31 
 32 #1)단순 선현회귀분석
 33 #독립변수 (x:1) -> 종속변수(y:1)
 34 #변수 모델링
 35 x=score_df.iq #score_df['iq']
 36 y=score_df.score # #score_df['score']
 37 
 38 #단순 선형  회귀모형
 39 model=stats.linregress(x,y)
 40 
 41 #모델 결과
 42 print('model=',model)
 43 """
 44 model= LinregressResult(
 45 slope=0.6514309527270075, ->기울기
 46 intercept=-2.8564471221974657, ->절편
 47 rvalue=0.8822203446134699, ->설명력 1=100% 1에 가까우면 좋다
 48 pvalue=2.8476895206683644e-50, ->모델 유의성(0.05보다 크면 의미 없다)
 49 stderr=0.028577934409305443)->표준오차
 50 """
 51 
 52 #회귀방정식 =1차 함수
 53 #Y =aX+b (a:기울기 ,b:절편)
 54 #score:90   iq:140
 55 Y=model.slope*140-model.intercept
 56 print("점수 예측치=",Y) #점수 예측치= 88.34388625958358
 57 err=90-Y
 58 print("모델 오차=",err)#모델 오차= 1.6561137404164157
 59 print('x 기울기=',model.slope)#x 기울기= 0.6514309527270075
 60 print('x 절편=',model.intercept)#x 절편= -2.8564471221974657
 61 print('x 설명력=',model.rvalue)#x 설명력= 0.8822203446134699
 62 print('x 유의성=',model.pvalue)#x 유의성= 2.8476895206683644e-50
 63 print('x 표준오차=',model.stderr)#x 표준오차= 0.028577934409305443
 64 
 65 
 66 #2)다중 선형 회귀모형
 67 # -독립 변수 (X) 2개이상
 68 import statsmodels.formula.api as sm
 69 corr=score_df.corr()
 70 print("상관 계수 행렬")
 71 print(corr)
 72 """
 73               sid     score        iq   academy      game        tv
 74 sid      1.000000 -0.014399 -0.007048 -0.004398  0.018806  0.024565
 75 score   -0.014399  1.000000  0.882220  0.896265 -0.298193 -0.819752
 76 iq      -0.007048  0.882220  1.000000  0.671783 -0.031516 -0.585033
 77 academy -0.004398  0.896265  0.671783  1.000000 -0.351315 -0.948551
 78 game     0.018806 -0.298193 -0.031516 -0.351315  1.000000  0.239217
 79 tv       0.024565 -0.819752 -0.585033 -0.948551  0.239217  1.000000
 80 """
 81 
 82 #변수 모델 :X(iq,academy )->y(score)
 83 model = sm.ols(formula="score ~ iq + academy",
 84                data=score_df).fit()
 85 print("model",model) #object info
 86 #model <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000000000CEAC588>
 87 
 88 #모델의 파라메터: 기울기 절편
 89 print(model.params)
 90 """
 91 Intercept    25.229141-> 절편
 92 iq            0.376966 ->X1 기울기
 93 academy       2.992800 ->X2 기울기
 94 dtype: float64
 95 """
 96 
 97 #다중 선형 회귀 방정식
 98 print(score_df.head())
 99 """
100      sid  score   iq  academy  game  tv
101 0  10001     90  140        2     1   0
102 1  10002     75  125        1     3   3
103 2  10003     77  120        1     0   4
104 3  10004     83  135        2     3   2
105 4  10005     65  105        0     4   4
106 """
107 Y=0.376966*140+2.992800*2+25.229141
108 print("예측치=",Y)#예측치= 83.989981 
109 
110 #모델 결과
111 print(model.summary()) 
112 """
113                             OLS Regression Results                            
114 ==============================================================================
115 Dep. Variable:                  score   R-squared:                       0.946
116 Model:                            OLS   Adj. R-squared:                  0.946
117 Method:                 Least Squares   F-statistic:                     1295.
118 Date:                Sat, 16 Feb 2019   Prob (F-statistic):           4.50e-94 
119 Time:                        11:23:48   Log-Likelihood:                -275.05
120 No. Observations:                 150   AIC:                             556.1
121 Df Residuals:                     147   BIC:                             565.1
122 Df Model:                           2                                         
123 Covariance Type:            nonrobust                                         
124 ==============================================================================
125                  coef    std err          t      P>|t|      [0.025      0.975]
126 ------------------------------------------------------------------------------
127 Intercept     25.2291      2.187     11.537      0.000      20.907      29.551
128 iq             0.3770      0.019     19.786      0.000       0.339       0.415
129 academy        2.9928      0.140     21.444      0.000       2.717       3.269
130 ==============================================================================
131 Omnibus:                       36.342   Durbin-Watson:                   1.913
132 Prob(Omnibus):                  0.000   Jarque-Bera (JB):               54.697
133 Skew:                           1.286   Prob(JB):                     1.33e-12
134 Kurtosis:                       4.461   Cond. No.                     2.18e+03
135 ==============================================================================
136 
137 Warnings:
138 [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
139 [2] The condition number is large, 2.18e+03. This might indicate that there are
140 strong multicollinearity or other numerical problems.
141 """
142 """
143 1.Prob (F-statistic):  4.50e-94:유의성 (0.05미만이여야 한다)
144 2.Adj. R-squared:      0.946:설명력 (1에 가까와야 좋다)
145 3.P>|t|                :X 유의성 검정: 0.05미만예야 좋타 
146 """
147  

02.dot_regression

 1 # -*- coding: utf-8 -*-
 2 """
 3 회귀모형 예측에 행렬곱(dot) 적용예
 4 """
 5 import pandas as pd
 6 import numpy as np
 7 
 8 #1.data set 가져오기
 9 score_df=pd.read_csv("../data/score_iq.csv")
10 print(score_df.head())# 6칼럼 
11 """
12      sid  score   iq  academy  game  tv
13 0  10001     90  140        2     1   0
14 1  10002     75  125        1     3   3
15 2  10003     77  120        1     0   4
16 3  10004     83  135        2     3   2
17 4  10005     65  105        0     4   4
18 """
19 
20 #2.subset 생성
21 score_arr=score_df[['score','iq','academy']]#3칼럼
22 print(score_arr.shape)#(150, 3)
23 print(score_arr.info())
24 """
25 <class 'pandas.core.frame.DataFrame'>
26 RangeIndex: 150 entries, 0 to 149
27 Data columns (total 3 columns):
28 score      150 non-null int64
29 iq         150 non-null int64
30 academy    150 non-null int64
31 dtypes: int64(3)
32 memory usage: 3.6 KB
33 None
34 """
35 #3.X,y변수 선택
36 score_X=score_arr.ix[:,1:] #2개 (150x2) 2차원
37 score_y=score_arr.ix[:,0]#1개(150) 1차원
38 print(score_X.shape) #(150, 2)
39 print(score_y.shape) #(150,)
40 
41 #4.기울기 ,절편
42 """
43 Intercept     25.229141-> 절편
44 iq            0.376966 -> X1 기울기
45 academy       2.992800 -> X2 기울기
46 dtype: float64
47 """
48 #기울기 변수
49 slop=np.array([[0.376966],[2.992800]]) #2차원
50 Intercept=25.229141 #상수 0차원
51 
52 #Y=(a1*x1+a2*x2)+b
53 #(a1*x1+a2*x2)->행렵곱
54 
55 #5.행렬곱(dot) 적용
56 print(score_X.shape) #(150, 2)
57 print(slop.shape) #(2, 1)
58 #(150, 2) * (2, 1) =(150,1)
59 matmul = np.dot(score_X,slop)
60 Y = matmul + Intercept
61 print(Y)
62 """
63 [[83.989981]
64  [75.342691]
65  ...
66  [73.457861]]
67 """
68 
69 #6. model 평가 (정답 vs 예측치)
70 #Y = 예측치
71 #score_y #정답
72 print(Y.shape) #(150, 1)  2차원 ->1차원
73 print(score_y.shape) #(150,) 1차원
74 
75 #2차원 ->1차원
76 Y_fitted=Y.reshape(150) # (150,)
77 df=pd.DataFrame({"Y_fitted":Y_fitted,'score':score_y})
78 print(df)  # (150, 2)
79 
80 #상관 분석
81 print(df.head())
82 """
83     Y_fitted  score
84 0  83.989981     90
85 1  75.342691     75
86 2  73.457861     77
87 3  82.105151     83
88 4  64.810571     65
89 """
90 cor=df.Y_fitted.corr(df.score)
91 print('corr=',cor) #corr= 0.9727792069594755

03.sklearn_Dataset

 1 # -*- coding: utf-8 -*-
 2 """
 3 sklearn 제공 datasets
 4 """
 5 from sklearn import datasets
 6 import numpy as np
 7 
 8 #1.선형회귀분석  적합 데이터셋
 9 #1) iris (붖꽃)
10 iris=datasets.load_iris()
11 print(iris)
12 
13 iris_x=iris.data #x
14 iris_y=iris.target  #y
15 
16 print(type(iris_x)) #<class 'numpy.ndarray'>
17 print(np.shape(iris_x)) #(150, 4)
18 print(np.shape(iris_y)) #(150,)
19 
20 print(iris_x)
21 """
22 [[5.1 3.5 1.4 0.2]
23  [4.9 3.  1.4 0.2]
24  [4.7 3.2 1.3 0.2]
25  [4.6 3.1 1.5 0.2]]
26 """
27 
28 print(iris_y)
29 """
30 [0 0 ... 0 0]
31 """
32 
33 #y범주
34 print(list(iris.target_names)) #['setosa'=0, 'versicolor'=1, 'virginica'=2]
35 
36 #2)당뇨병 데이터셋
37 diabetes=datasets.load_diabetes()
38 diabetes_x=diabetes.data # x
39 diabetes_y=diabetes.target # y
40 print(diabetes_x.shape) #(442, 10)
41 print(diabetes_y.shape) #(442,)
42 print(diabetes_y)
43 
44 #3)보스톤 데이터셋
45 boston=datasets.load_boston()
46 boston_x=boston.data
47 boston_y=boston.target
48 print(boston_x.shape)#(506, 13)
49 print(boston_y.shape)#(506,)
50 print(boston.feature_names)
51 #['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO' 'B' 'LSTAT']
52 
53 #2. 분류분석에 적합한 데이터셋
54 #4) wine 데이터셋   다항분류 (softmax 함수) 
55 #'class_0:0.98,+class_1:0.01,+class_2:0.01=1
56 wine= datasets.load_wine()
57 wine_x=wine.data #(442, 10)
58 wine_y=wine.target #(442,)
59 print(wine.target_names) #['class_0' 'class_1' 'class_2']
60 print(wine_x.shape)#(178, 13)
61 print(wine_y) 
62 """
63 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
64  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
65  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
66  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
67  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
68 """
69 
70 #5)  이진분류 (sigmoid 함수)
71 # YES 0.5>  ,NO 0.5 <
72 breast=datasets.load_breast_cancer()
73 print(breast.data.shape) #(569, 30)
74 print(breast.target.shape)#(569,)
75 print(breast.target_names) #['malignant' 'benign']
76 print(breast)

04.sklearn_Regression

  1 # -*- coding: utf-8 -*-
  2 """
  3 sklearn 관련 Regressin모델
  4  - y변수가 연속인 경우
  5 """
  6 
  7 import pandas as pd
  8 from sklearn import datasets
  9 from sklearn.linear_model import LinearRegression #model
 10 from sklearn.model_selection import train_test_split #train set VS test set
 11 from sklearn.metrics import mean_squared_error #MES (평균제곱 오차)
 12 
 13 # 1. dataset 가져오기 
 14 iris=pd.read_csv("../data/iris.csv")
 15 print(iris.info())
 16 """
 17 RangeIndex: 150 entries, 0 to 149
 18 Data columns (total 5 columns):
 19 Sepal.Length    150 non-null float64
 20 Sepal.Width     150 non-null float64
 21 Petal.Length    150 non-null float64
 22 Petal.Width     150 non-null float64
 23 Species         150 non-null object
 24 dtypes: float64(4), object(1)
 25 """
 26 print(iris.head())
 27 """
 28    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 29 0           5.1          3.5           1.4          0.2  setosa
 30 1           4.9          3.0           1.4          0.2  setosa
 31 2           4.7          3.2           1.3          0.2  setosa
 32 3           4.6          3.1           1.5          0.2  setosa
 33 4           5.0          3.6           1.4          0.2  setosa
 34 """
 35 
 36 #2. 변수(x,y) 선택
 37 cols=list(iris.columns)
 38 print(cols) 
 39 #['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']
 40 
 41 x_cols = cols[1:4] #'Sepal.Width', 'Petal.Length', 'Petal.Width'
 42 y_cols = cols[0] #'Sepal.Length'
 43 
 44 #subset
 45 data_df=iris[cols[:4]] #1~4칼럼
 46 print(data_df.shape)#(150, 4)
 47 
 48 #3 train set(70%)/test set(30%)   #자동 랜덤  ,random_state=123똑같은 랜덤
 49 iris_train,iris_test=train_test_split(
 50         data_df,test_size=0.3,random_state=123) 
 51 
 52 print(iris_train.shape)#(105, 4) model 생성
 53 print(iris_test.shape) #(45, 4)  model 검정
 54 
 55 #4.model 생성
 56 #help(LinearRegression) 
 57 #class-> object
 58 lr_model=LinearRegression()#default model객체
 59 #fit(train_x,train_y) :학습->model
 60 lr_model.fit(iris_train[x_cols],iris_train[y_cols]) #train set
 61 
 62 #획귀 계수(기울기),절편
 63 print("기울기=",lr_model.coef_)#기울기= [ 0.63924286  0.75744562 -0.68796484]
 64 print("절편=",lr_model.intercept_)#절편= 1.8609363992411732
 65 print("예측치",lr_model.predict) # 105
 66 
 67 #5. 모델 평가 :test 예측치 =회귀방정식
 68 
 69 #1)train set
 70 model_socre1=lr_model.score(iris_train[x_cols],
 71                             iris_train[y_cols])
 72 #2)test set
 73 model_socre2=lr_model.score(iris_test[x_cols],
 74                             iris_test[y_cols])
 75 
 76 #1.socre
 77 print('train_model score=',model_socre1)#train_model score= 0.8581515699458577
 78 print('test_model score=',model_socre2)#test_model score= 0.854680765745176
 79 
 80 
 81 #model 예측치 vs 정답
 82 pred=lr_model.predict(iris_test[x_cols])# 예측치 predict(x)
 83 Y=iris_test[y_cols]#정답
 84 
 85 #2.평균제곱오차 (MSE)
 86 MSE=mean_squared_error(Y,pred) #(정답,예측치)
 87 print('MSE=',MSE)#MSE= 0.11633863200224713
 88 
 89 
 90 ######################
 91 ### load_iris()
 92 ######################
 93 
 94 from sklearn.datasets import load_iris
 95 
 96 #1. data loading
 97 iris=load_iris()
 98 
 99 # 2. 변수 선택 
100 X=iris.data # x 
101 y=iris.target #y(0~2)
102 
103 print(X.shape)#(150, 4)
104 print(y.shape)#(150,)
105 
106 # 3. train /test split(7:3)
107 x_train,x_test,y_train,y_test=train_test_split(
108         X,y, test_size=0.3,random_state=123)
109 
110 print(x_train.shape)#(105, 4) - 1~4번째 
111 print(x_test.shape)#(45, 4)
112 print(y_train.shape)#(105,) - 5번째 
113 print(y_test.shape)#(45,)
114 
115 #4.model 생성:tran set
116 lr_model2=LinearRegression()
117 lr_model2.fit(x_train,y_train)  # train -> model
118 
119 print(lr_model2.coef_) #기울기  [-0.12591445 -0.0481559   0.24484363  0.57025678]
120 print(lr_model2.intercept_) #절편 0.2537496076784179
121 
122 #5. model평가 :test set
123 #1) score
124 model_score=lr_model2.score(x_test,y_test)
125 print(model_score) #0.9427868501294299
126 
127 #2) Mes(예측치 vs 정답)
128 pred=lr_model2.predict(x_test)
129 Y=y_test
130 MSE=mean_squared_error(pred,Y)
131 print('MSE=',MSE)#MSE= 0.04447086315865546
132 
133 #E=pred-Y
134 #sqared=E^2
135 import numpy as np
136 mes=np.mean((pred-Y)**2)
137 print('MSE=',MSE) #MSE= 0.04447086315865546
138 
139 #3시각화 평가
140 import matplotlib.pyplot as plt
141 fig=plt.figure(figsize=(20,5))
142 chart=fig.add_subplot(1,1,1)
143 chart.plot(pred,color='r',label="pred")
144 chart.plot(Y,color='b',label="y")
145 plt.legend(loc='best')
146 plt.show()

05.LogisticRegression

  1 # -*- coding: utf-8 -*-
  2 """
  3 sklearn logistic Regreesion
  4  - y변수가 범주인 경우
  5 """
  6 
  7 from sklearn.datasets import load_iris #다항분류
  8 from sklearn.datasets import load_breast_cancer #이항분류
  9 from sklearn.linear_model import LogisticRegression
 10 
 11 import matplotlib.pyplot as plt
 12 import pandas as np
 13 
 14 #####################################
 15 ## 1. load_breast_cancer : 이항분류 
 16 #####################################
 17 
 18 #1.loading data
 19 breast=load_breast_cancer()
 20 
 21 # 2. 변수 선택 
 22 X=breast.data
 23 y=breast.target
 24 print(X.shape,y.shape)#(569, 30) (569,)
 25 
 26 # 3.model 생성
 27 #help(LogisticRegression)
 28 #1.random_state : 난수 seed값
 29 #2.solver :최적화 알고리즘
 30 #   {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'} default: 'liblinear'
 31 #   작은 데이터셋:'liblinear'
 32 #   큰 데이터셋:'sag', 'saga'
 33 #   멀티 클래스 문제:'newton-cg','lbfgs'
 34 #   다항붕류 'multinomal'
 35 
 36 #적용 예)
 37 #1.일반 데이터셋 ,이항분류 :default
 38 #2일반 데이터셋 ,다항분류 :solver='lbfgs',multi_class="multinomial"
 39 #3.빅 데이터셋 ,이항분류 :solver='sag'
 40 
 41 #object
 42 lr_model=LogisticRegression(random_state=0)
 43 lr_model.fit(X,y) #model 생성 
 44 
 45 #예측치 predict
 46 pred=lr_model.predict(X)
 47 print('prdict=',pred[:5])#prdict= [0 0 0 1 0]
 48 print('y정답=',y[:5])#y정답= [0 0 0 0 0]
 49 
 50 # model 평가 : score = 분류정확도(accuracy)
 51 score=lr_model.score(X,y)
 52 print(score) #0.9595782073813708
 53 
 54 #:교차 분할표(confusing matrix)
 55 tab=pd.crosstab(y,pred) #crosstab(row:정답,col:예측치)
 56 print(tab)
 57 """
 58 col_0    0    1
 59 row_0          
 60 0      198   14
 61 1        9  348
 62 """
 63 acc=(198+348)/len(y)
 64 print('accuracy=',acc)#accuracy= 0.9595782073813708
 65 
 66 
 67 #################################
 68 ## 2. load_irsi : 다항분류 
 69 #################################
 70 #1.data loading
 71 X,y=load_iris(return_X_y=True)
 72 
 73 #2.model 생성
 74 lr_model2=LogisticRegression(random_state=123,
 75                              solver='lbfgs',
 76                              multi_class="multinomial")
 77 lr_model2.fit(X,y)
 78 print(lr_model2) #model 정보
 79 """
 80 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
 81           intercept_scaling=1, max_iter=100, multi_class='multinomial',
 82           n_jobs=1, penalty='l2', random_state=123, solver='lbfgs',
 83           tol=0.0001, verbose=0, warm_start=False)
 84 """
 85 # 예측치
 86 pred=lr_model2.predict(X) #예측치
 87 Y=y #정답
 88 
 89 score=lr_model2.score(X,y)
 90 print('accuracy=',score)#accuracy= 0.9733333333333334
 91 tab=pd.crosstab(Y,pred)
 92 print(tab)
 93 """
 94 col_0   0   1   2
 95 row_0            
 96 0      50   0   0
 97 1       0  47   3
 98 2       0   1  49
 99 """
100 print(type(tab))#<class 'pandas.core.frame.DataFrame'>
101 
102 
103 acc=(tab.ix[0,0]+tab.ix[1,1]+tab.ix[2,2])/len(y)
104 print('accuracy=',acc) #accuracy= 0.9733333333333334

猜你喜欢

转载自www.cnblogs.com/kingboy100/p/10390027.html