01.regression
# -*- coding: utf-8 -*-
"""
Linear regression analysis with the scipy package.
"""
from scipy import stats  # linear regression module
import pandas as pd

# Load the score/IQ data set (the path is relative to the working directory).
score_df = pd.read_csv("../data/score_iq.csv")
print(score_df.info())  # 150 rows x 6 columns
# Recorded output:
#   RangeIndex: 150 entries, 0 to 149
#   Data columns (total 6 columns):
#   sid        150 non-null int64
#   score      150 non-null int64
#   iq         150 non-null int64
#   academy    150 non-null int64
#   game       150 non-null int64
#   tv         150 non-null int64
#   dtypes: int64(6)

print(score_df.head())
# Recorded output:
#        sid  score   iq  academy  game  tv
#   0  10001     90  140        2     1   0
#   1  10002     75  125        1     3   3
#   2  10003     77  120        1     0   4
#   3  10004     83  135        2     3   2
#   4  10005     65  105        0     4   4
31
# 1) Simple linear regression
# One independent variable (x) -> one dependent variable (y).

# Variable selection
x = score_df.iq  # same as score_df['iq']
y = score_df.score  # same as score_df['score']

# Fit the simple linear regression model.
model = stats.linregress(x, y)

# Model result
print('model=', model)
# Recorded output:
#   model= LinregressResult(
#       slope=0.6514309527270075,       -> slope
#       intercept=-2.8564471221974657,  -> intercept
#       rvalue=0.8822203446134699,      -> correlation coefficient r
#                                          (closer to 1 is better)
#       pvalue=2.8476895206683644e-50,  -> model significance
#                                          (not meaningful above 0.05)
#       stderr=0.028577934409305443)    -> standard error

# Regression equation (first-degree function): Y = a*X + b
# (a: slope, b: intercept).
# First observation of the data set: score=90, iq=140.
# The intercept must be ADDED; subtracting it yields 94.05..., which does not
# match the recorded prediction below.
Y = model.slope * 140 + model.intercept
print("점수 예측치=", Y)  # recorded value: 88.34388625958358
err = 90 - Y  # actual score minus predicted score
print("모델 오차=", err)  # recorded value: 1.6561137404164157
print('x 기울기=', model.slope)  # recorded value: 0.6514309527270075
print('x 절편=', model.intercept)  # recorded value: -2.8564471221974657
print('x 설명력=', model.rvalue)  # recorded value: 0.8822203446134699
print('x 유의성=', model.pvalue)  # recorded value: 2.8476895206683644e-50
print('x 표준오차=', model.stderr)  # recorded value: 0.028577934409305443
64
65
# 2) Multiple linear regression
#    - two or more independent variables (X)
import statsmodels.formula.api as sm

# Pairwise correlation matrix of all columns.
corr = score_df.corr()
print("상관 계수 행렬")
print(corr)
# Recorded output:
#                 sid     score        iq   academy      game        tv
#   sid      1.000000 -0.014399 -0.007048 -0.004398  0.018806  0.024565
#   score   -0.014399  1.000000  0.882220  0.896265 -0.298193 -0.819752
#   iq      -0.007048  0.882220  1.000000  0.671783 -0.031516 -0.585033
#   academy -0.004398  0.896265  0.671783  1.000000 -0.351315 -0.948551
#   game     0.018806 -0.298193 -0.031516 -0.351315  1.000000  0.239217
#   tv       0.024565 -0.819752 -0.585033 -0.948551  0.239217  1.000000
81
# Model variables: X (iq, academy) -> y (score)
model = sm.ols(formula="score ~ iq + academy", data=score_df).fit()
print("model", model)  # object info
# Recorded output:
#   model <statsmodels.regression.linear_model.RegressionResultsWrapper
#   object at 0x000000000CEAC588>

# Model parameters: slopes and intercept
print(model.params)
# Recorded output:
#   Intercept    25.229141  -> intercept
#   iq            0.376966  -> slope of X1
#   academy       2.992800  -> slope of X2
#   dtype: float64

# Multiple linear regression equation
print(score_df.head())
# Recorded output:
#        sid  score   iq  academy  game  tv
#   0  10001     90  140        2     1   0
#   1  10002     75  125        1     3   3
#   2  10003     77  120        1     0   4
#   3  10004     83  135        2     3   2
#   4  10005     65  105        0     4   4

# Prediction for the first row (iq=140, academy=2), using the rounded
# parameters recorded above.
Y = 0.376966 * 140 + 2.992800 * 2 + 25.229141
print("예측치=", Y)  # recorded value: 83.989981

# Model result
print(model.summary())
# Recorded output:
#                             OLS Regression Results
#   ===========================================================================
#   Dep. Variable:          score          R-squared:             0.946
#   Model:                  OLS            Adj. R-squared:        0.946
#   Method:                 Least Squares  F-statistic:           1295.
#   Date:                   Sat, 16 Feb 2019  Prob (F-statistic): 4.50e-94
#   Time:                   11:23:48       Log-Likelihood:        -275.05
#   No. Observations:       150            AIC:                   556.1
#   Df Residuals:           147            BIC:                   565.1
#   Df Model:               2
#   Covariance Type:        nonrobust
#   ===========================================================================
#                  coef    std err        t      P>|t|     [0.025     0.975]
#   ---------------------------------------------------------------------------
#   Intercept   25.2291      2.187   11.537      0.000     20.907     29.551
#   iq           0.3770      0.019   19.786      0.000      0.339      0.415
#   academy      2.9928      0.140   21.444      0.000      2.717      3.269
#   ===========================================================================
#   Omnibus:          36.342   Durbin-Watson:       1.913
#   Prob(Omnibus):     0.000   Jarque-Bera (JB):    54.697
#   Skew:              1.286   Prob(JB):            1.33e-12
#   Kurtosis:          4.461   Cond. No.            2.18e+03
#   ===========================================================================
#
#   Warnings:
#   [1] Standard Errors assume that the covariance matrix of the errors is
#       correctly specified.
#   [2] The condition number is large, 2.18e+03. This might indicate that
#       there are strong multicollinearity or other numerical problems.

# How to read the summary:
#   1. Prob (F-statistic): 4.50e-94 -> model significance (should be < 0.05)
#   2. Adj. R-squared: 0.946        -> explanatory power (closer to 1 is better)
#   3. P>|t|                        -> significance test of each X
#                                      (below 0.05 is good)
147
02.dot_regression
# -*- coding: utf-8 -*-
"""
Example of applying a matrix product (dot) to regression-model prediction.
"""
import pandas as pd
import numpy as np

# 1. Load the data set
score_df = pd.read_csv("../data/score_iq.csv")
print(score_df.head())  # 6 columns
# Recorded output:
#        sid  score   iq  academy  game  tv
#   0  10001     90  140        2     1   0
#   1  10002     75  125        1     3   3
#   2  10003     77  120        1     0   4
#   3  10004     83  135        2     3   2
#   4  10005     65  105        0     4   4

# 2. Build the subset: target first, then the two predictors
score_arr = score_df[['score', 'iq', 'academy']]  # 3 columns
print(score_arr.shape)  # (150, 3)
print(score_arr.info())
# Recorded output:
#   <class 'pandas.core.frame.DataFrame'>
#   RangeIndex: 150 entries, 0 to 149
#   Data columns (total 3 columns):
#   score      150 non-null int64
#   iq         150 non-null int64
#   academy    150 non-null int64
#   dtypes: int64(3)
#   memory usage: 3.6 KB
#   None
# 3. Select the X and y variables
# `.ix` was removed in pandas 1.0; the slices are purely positional, so
# `.iloc` is the direct replacement.
score_X = score_arr.iloc[:, 1:]  # predictor columns (iq, academy): 2-D
score_y = score_arr.iloc[:, 0]  # target column (score): 1-D
print(score_X.shape)  # recorded output: (150, 2)
print(score_y.shape)  # recorded output: (150,)
# 4. Slopes and intercept
# Recorded model parameters:
#   Intercept    25.229141  -> intercept
#   iq            0.376966  -> slope of X1
#   academy       2.992800  -> slope of X2
#   dtype: float64

# Slope variable
slop = np.array([[0.376966], [2.992800]])  # 2-D, one slope per row
Intercept = 25.229141  # constant (0-D)

# Y = (a1*x1 + a2*x2) + b
# (a1*x1 + a2*x2) -> matrix product

# 5. Apply the matrix product (dot)
print(score_X.shape)  # (150, 2)
print(slop.shape)  # (2, 1)
# (150, 2) . (2, 1) = (150, 1)
matmul = np.dot(score_X, slop)
Y = matmul + Intercept
print(Y)
# Recorded output:
#   [[83.989981]
#    [75.342691]
#    ...
#    [73.457861]]
68
# 6. Model evaluation (actual vs. predicted)
#   Y       : predicted values
#   score_y : actual values
print(Y.shape)  # recorded output: (150, 1) -> 2-D
print(score_y.shape)  # recorded output: (150,) -> 1-D

# Flatten the column vector to 1-D so that it lines up with score_y.
# reshape(-1) infers the length instead of hard-coding the row count (150),
# so the step keeps working when the data set size changes.
Y_fitted = Y.reshape(-1)
df = pd.DataFrame({"Y_fitted": Y_fitted, 'score': score_y})
print(df)  # recorded shape: (150, 2)

# Correlation analysis between predictions and actual scores
print(df.head())
# Recorded output:
#       Y_fitted  score
#   0  83.989981     90
#   1  75.342691     75
#   2  73.457861     77
#   3  82.105151     83
#   4  64.810571     65
cor = df.Y_fitted.corr(df.score)
print('corr=', cor)  # recorded value: 0.9727792069594755
03.sklearn_Dataset
# -*- coding: utf-8 -*-
"""
Data sets provided by sklearn.
"""
from sklearn import datasets
import numpy as np

# 1. Data sets suited to linear regression
# 1) iris
iris = datasets.load_iris()
print(iris)

iris_x = iris.data  # x
iris_y = iris.target  # y

print(type(iris_x))  # <class 'numpy.ndarray'>
print(np.shape(iris_x))  # (150, 4)
print(np.shape(iris_y))  # (150,)

print(iris_x)
# Recorded output (excerpt):
#   [[5.1 3.5 1.4 0.2]
#    [4.9 3.  1.4 0.2]
#    [4.7 3.2 1.3 0.2]
#    [4.6 3.1 1.5 0.2]]

print(iris_y)
# Recorded output (excerpt):
#   [0 0 ... 0 0]

# Categories of y: 'setosa'=0, 'versicolor'=1, 'virginica'=2
print(list(iris.target_names))
35
# 2) Diabetes data set
diabetes = datasets.load_diabetes()
diabetes_x = diabetes.data  # x
diabetes_y = diabetes.target  # y
print(diabetes_x.shape)  # (442, 10)
print(diabetes_y.shape)  # (442,)
print(diabetes_y)
43
# 3) Boston housing data set
# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, where
# calling it raises ImportError. Guard the call so that the data sets loaded
# after this point are still reached on newer versions.
try:
    boston = datasets.load_boston()
except (ImportError, AttributeError):
    # Keep the names defined so later code can test for availability.
    boston = boston_x = boston_y = None
    print("load_boston is not available in this scikit-learn version")
else:
    boston_x = boston.data
    boston_y = boston.target
    print(boston_x.shape)  # recorded output: (506, 13)
    print(boston_y.shape)  # recorded output: (506,)
    print(boston.feature_names)
    # Recorded output:
    #   ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX'
    #    'PTRATIO' 'B' 'LSTAT']
52
# 2. Data sets suited to classification
# 4) wine data set: multi-class classification (softmax function)
#    e.g. class_0: 0.98 + class_1: 0.01 + class_2: 0.01 = 1
wine = datasets.load_wine()
wine_x = wine.data  # x; shape printed below as (178, 13)
wine_y = wine.target  # y; one label per row of wine_x
print(wine.target_names)  # ['class_0' 'class_1' 'class_2']
print(wine_x.shape)  # (178, 13)
print(wine_y)
# Recorded output:
#   [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
#    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
#    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#    2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
69
# 5) Binary classification (sigmoid function)
#    Decision threshold at 0.5 (original note: "YES 0.5> , NO 0.5 <").
breast = datasets.load_breast_cancer()
print(breast.data.shape)  # (569, 30)
print(breast.target.shape)  # (569,)
print(breast.target_names)  # ['malignant' 'benign']
print(breast)
04.sklearn_Regression
# -*- coding: utf-8 -*-
"""
Regression models from sklearn.
 - for a continuous y variable
"""

import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression  # model
from sklearn.model_selection import train_test_split  # train set vs. test set
from sklearn.metrics import mean_squared_error  # MSE (mean squared error)

# 1. Load the data set
iris = pd.read_csv("../data/iris.csv")
print(iris.info())
# Recorded output:
#   RangeIndex: 150 entries, 0 to 149
#   Data columns (total 5 columns):
#   Sepal.Length    150 non-null float64
#   Sepal.Width     150 non-null float64
#   Petal.Length    150 non-null float64
#   Petal.Width     150 non-null float64
#   Species         150 non-null object
#   dtypes: float64(4), object(1)
print(iris.head())
# Recorded output:
#      Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
#   0           5.1          3.5           1.4          0.2  setosa
#   1           4.9          3.0           1.4          0.2  setosa
#   2           4.7          3.2           1.3          0.2  setosa
#   3           4.6          3.1           1.5          0.2  setosa
#   4           5.0          3.6           1.4          0.2  setosa

# 2. Select the variables (x, y)
cols = list(iris.columns)
print(cols)
# Recorded output:
#   ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']

x_cols = cols[1:4]  # 'Sepal.Width', 'Petal.Length', 'Petal.Width'
y_cols = cols[0]  # 'Sepal.Length'

# Subset: keep only the four numeric columns (drops 'Species').
data_df = iris[cols[:4]]
print(data_df.shape)  # (150, 4)
47
# 3. Train set (70%) / test set (30%)
#    The split is random; random_state=123 makes it reproducible.
iris_train, iris_test = train_test_split(
    data_df,
    test_size=0.3,
    random_state=123,
)

print(iris_train.shape)  # (105, 4) -> used to build the model
print(iris_test.shape)  # (45, 4) -> used to validate the model
# 4. Build the model
# help(LinearRegression)
# class -> object
lr_model = LinearRegression()  # model object with default settings
# fit(train_x, train_y): learn the model from the training set
lr_model.fit(iris_train[x_cols], iris_train[y_cols])

# Regression coefficients (slopes) and intercept
print("기울기=", lr_model.coef_)
# recorded value: [ 0.63924286  0.75744562 -0.68796484]
print("절편=", lr_model.intercept_)  # recorded value: 1.8609363992411732
# Call predict() on the training predictors; printing `lr_model.predict`
# without calling it only shows the bound-method object, not the 105 fitted
# values.
print("예측치", lr_model.predict(iris_train[x_cols]))
66
# 5. Model evaluation: test predictions come from the regression equation

# 1) Score on the train set
model_socre1 = lr_model.score(iris_train[x_cols], iris_train[y_cols])
# 2) Score on the test set
model_socre2 = lr_model.score(iris_test[x_cols], iris_test[y_cols])

# 1. score
print('train_model score=', model_socre1)  # recorded: 0.8581515699458577
print('test_model score=', model_socre2)  # recorded: 0.854680765745176


# Model predictions vs. actual values
pred = lr_model.predict(iris_test[x_cols])  # predictions: predict(x)
Y = iris_test[y_cols]  # actual values

# 2. Mean squared error (MSE)
MSE = mean_squared_error(Y, pred)  # (actual, predicted)
print('MSE=', MSE)  # recorded: 0.11633863200224713
89
######################
### load_iris()
######################

from sklearn.datasets import load_iris

# 1. Data loading
iris = load_iris()

# 2. Variable selection
X = iris.data  # x
y = iris.target  # y (0~2)

print(X.shape)  # (150, 4)
print(y.shape)  # (150,)

# 3. Train / test split (7:3)
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

print(x_train.shape)  # (105, 4) - columns 1 to 4
print(x_test.shape)  # (45, 4)
print(y_train.shape)  # (105,) - column 5
print(y_test.shape)  # (45,)
114
# 4. Build the model from the train set
lr_model2 = LinearRegression()
lr_model2.fit(x_train, y_train)  # train -> model

# Slopes; recorded: [-0.12591445 -0.0481559   0.24484363  0.57025678]
print(lr_model2.coef_)
# Intercept; recorded: 0.2537496076784179
print(lr_model2.intercept_)
121
# 5. Model evaluation on the test set
# 1) score
model_score = lr_model2.score(x_test, y_test)
print(model_score)  # recorded value: 0.9427868501294299

# 2) MSE (predicted vs. actual)
pred = lr_model2.predict(x_test)
Y = y_test
MSE = mean_squared_error(Y, pred)  # (actual, predicted)
print('MSE=', MSE)  # recorded value: 0.04447086315865546

# Same metric computed by hand:
#   E = pred - Y
#   squared = E ** 2
import numpy as np
mes = np.mean((pred - Y) ** 2)
# Print the hand-computed value; printing `MSE` again would never show `mes`.
print('MSE=', mes)  # recorded value: 0.04447086315865546
138
# 3) Visual evaluation: predictions (red) against actual values (blue)
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(20, 5))
chart = fig.add_subplot(1, 1, 1)
chart.plot(pred, color='r', label="pred")
chart.plot(Y, color='b', label="y")
plt.legend(loc='best')
plt.show()
05.LogisticRegression
# -*- coding: utf-8 -*-
"""
Logistic regression with sklearn.
 - for a categorical y variable
"""

from sklearn.datasets import load_iris  # multi-class classification
from sklearn.datasets import load_breast_cancer  # binary classification
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
# pandas must be bound to `pd`: the code below calls pd.crosstab, and the
# alias `np` left that name undefined.
import pandas as pd

#####################################
## 1. load_breast_cancer : binary classification
#####################################

# 1. Loading data
breast = load_breast_cancer()

# 2. Variable selection
X = breast.data
y = breast.target
print(X.shape, y.shape)  # recorded output: (569, 30) (569,)

# 3. Build the model
# help(LogisticRegression)
# 1. random_state : random seed value
# 2. solver : optimisation algorithm
#    {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'} default: 'liblinear'
#    NOTE(review): newer scikit-learn releases default to 'lbfgs' - confirm
#    for the installed version.
#    small data sets      : 'liblinear'
#    large data sets      : 'sag', 'saga'
#    multi-class problems : 'newton-cg', 'lbfgs'
#    multinomial classification : 'multinomial'

# Usage examples:
# 1. ordinary data set, binary classification : default
# 2. ordinary data set, multi-class : solver='lbfgs', multi_class="multinomial"
# 3. big data set, binary classification : solver='sag'

# object
lr_model = LogisticRegression(random_state=0)
lr_model.fit(X, y)  # build the model

# Predictions
pred = lr_model.predict(X)
print('prdict=', pred[:5])  # recorded output: [0 0 0 1 0]
print('y정답=', y[:5])  # recorded output: [0 0 0 0 0]

# Model evaluation: score = classification accuracy
score = lr_model.score(X, y)
print(score)  # recorded value: 0.9595782073813708

# Cross table (confusion matrix): rows = actual, columns = predicted
tab = pd.crosstab(y, pred)
print(tab)
# Recorded output:
#   col_0    0    1
#   row_0
#   0      198   14
#   1        9  348

# Accuracy = correctly classified (diagonal of the table) / all samples.
# Read the counts from the table rather than hard-coding 198 + 348, so the
# value stays in step with `score` whenever the fitted model differs.
correct = sum(tab.loc[label, label] for label in tab.index
              if label in tab.columns)
acc = correct / len(y)
print('accuracy=', acc)  # recorded value: 0.9595782073813708
65
66
#################################
## 2. load_iris : multi-class classification
#################################
# pd.crosstab is used below; import pandas under the `pd` alias here so this
# section does not depend on the alias chosen at the top of the script.
import pandas as pd

# 1. Data loading
X, y = load_iris(return_X_y=True)

# 2. Build the model
# NOTE(review): `multi_class` is reported as deprecated in recent
# scikit-learn releases - confirm against the installed version.
lr_model2 = LogisticRegression(random_state=123,
                               solver='lbfgs',
                               multi_class="multinomial")
lr_model2.fit(X, y)
print(lr_model2)  # model information
# Recorded output:
#   LogisticRegression(C=1.0, class_weight=None, dual=False,
#       fit_intercept=True, intercept_scaling=1, max_iter=100,
#       multi_class='multinomial', n_jobs=1, penalty='l2', random_state=123,
#       solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

# Predictions
pred = lr_model2.predict(X)  # predicted labels
Y = y  # actual labels

score = lr_model2.score(X, y)
print('accuracy=', score)  # recorded value: 0.9733333333333334
tab = pd.crosstab(Y, pred)  # rows = actual, columns = predicted
print(tab)
# Recorded output:
#   col_0   0   1   2
#   row_0
#   0      50   0   0
#   1       0  47   3
#   2       0   1  49
print(type(tab))  # <class 'pandas.core.frame.DataFrame'>


# Accuracy = diagonal of the confusion table / number of samples.
# `.ix` was removed in pandas 1.0; the table is indexed by class label, so
# label-based `.loc` replaces it.
correct = sum(tab.loc[label, label] for label in tab.index
              if label in tab.columns)
acc = correct / len(y)
print('accuracy=', acc)  # recorded value: 0.9733333333333334