前面所说的都是只有二位的数据,现在我们把问题普遍化,将高维数据微量低维数据映射的问题。
假设样本X的数量有m个,特征数目有k个,也就是m*k大小的样本,希望降维之后变成m*n,也就是说,降维到n维的空间内。简单地说,我们需要找前n个主成分。
用矩阵表示如下:
下面用代码封装一个降维工具:
class PCA:
def __init__(self, n_components):
'''initiate PCA'''
assert n_cimponents >= 1, "n_components must be valid"
self.n_components = n_components
self.components_ = None
def fit(self, X, eta=0.01, n_iters=1e4):
'''get first n pca of data X'''
assert self.n_components <= X.shape[1],\
'''n_components must not be greater than the feature number of X'''
def demean(X):
return X-np.mean(X,axis=0)
def f(w,X):
'''cost function'''
return np.sum(X.dot(w)**2)/len(X)
def df(w,X):
'''derived function'''
return X.T.dot(X.dot(w))*2./len(X)
def direction(w):
'''turn w to unit vector'''
return w/np.linalg.norm(w)
def first_component(X, initial_w, eta=0.01, n_iters=1e4,epsilon=1e-8):
w = direction(initial_w)
cur_iter=0
while cur_iter<n_iters:
gradient=df(w,X)
last_w = w
w = w+eta*gradient
w = direction(w)
if (abs(f(w,X)-f(last_w,X))<epsilon):
break
cur_iter+=1
return w
X_pca = demean(X)
self.components_ = np.empty(shape=(self.n_components,X.shape[1]))#initiate an empty w matix (n,m)
for i in range(self.n_components):
initial_w = np.random.random(X_pca.shape[1])
w = first_component(X_pca,initial_w, eta, n_iters)
self.components_[i,:] = w #put w in w matrix
X_pca = X_pca - X_pca.dot(w).reshape(-1,1)*w
return self
def transform(self, X):
assert X.shape[1]==self.components_.shape[1]
return X.dot(self.components_.T)
def inverse_transform(self,X):
assert X.shape[1]==self.components_.shape[0]
returnX.dot(self.components_)
def __repr__(self):
return "PCA(n_components=%d)"%self.n_components