# 目标是爬取 eBay 上的二手乐高数据,并使用岭回归交叉验证的方式给出回归方程
from bs4 import BeautifulSoup
import numpy as np
import random
def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """Parse one saved eBay results page and collect the sold LEGO listings.

    Listings are looked up as ``<table>`` elements whose ``r`` attribute is
    "1", "2", ... and scanning stops at the first index with no match.
    For every listing that is marked as sold and whose price is above half
    of ``origPrc``, one feature row and one target value are appended.

    Args:
        retX: list that receives ``[yr, numPce, newFlag, origPrc]`` rows
            (mutated in place).
        retY: list that receives the matching selling prices (mutated in
            place).
        inFile: path of the saved HTML page, read as UTF-8.
        yr: year value recorded for every listing on this page.
        numPce: piece count recorded for every listing on this page.
        origPrc: original retail price of the set; also used as the
            threshold for discarding incomplete sets.

    Returns:
        None. Results are delivered through ``retX`` and ``retY``.
    """
    # Open and read the HTML file.
    with open(inFile, encoding='utf-8') as f:
        html = f.read()
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever
    # one is installed; consider pinning one so results are reproducible.
    soup = BeautifulSoup(html)
    i = 1
    # Walk the page one listing table at a time.
    currentRow = soup.find_all('table', r="%d" % i)
    while currentRow:
        listing = currentRow[0]
        lwrTitle = listing.find_all('a')[1].text.lower()
        # A title mentioning "new" or "nisb" marks the set as brand new.
        newFlag = 1.0 if ('new' in lwrTitle or 'nisb' in lwrTitle) else 0.0
        cells = listing.find_all('td')
        # Only sold items are collected; a <span> in the 4th cell marks a sale.
        soldUnicde = cells[3].find_all('span')
        if not soldUnicde:
            print("商品 #%d 没有出售" % i)
        else:
            # The 5th cell carries the price text.
            soldPrice = cells[4]
            priceStr = soldPrice.text.replace('$', '').replace(',', '')
            # More than one child node means extra text (shipping) is present.
            if len(soldPrice) > 1:
                priceStr = priceStr.replace('Free shipping', '')
            sellingPrice = float(priceStr)
            # Drop prices that most likely belong to incomplete sets.
            if sellingPrice > origPrc * 0.5:
                print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        i += 1
        currentRow = soup.find_all('table', r="%d" % i)
#分别抓取各网页数据
def setDataCollect(retX, retY):
    """Scrape every saved LEGO set page into ``retX`` / ``retY``.

    Each entry below is (HTML file, year, piece count, original price);
    the pages are processed in the listed order by ``scrapePage``.

    Args:
        retX: list that accumulates feature rows (mutated in place).
        retY: list that accumulates selling prices (mutated in place).
    """
    pages = (
        ('lego8288.html', 2006, 800, 49.99),
        ('lego10030.html', 2002, 3096, 269.99),
        ('lego10179.html', 2007, 5195, 499.99),
        ('lego10181.html', 2007, 3428, 199.99),
        ('lego10189.html', 2008, 5922, 299.99),
        ('lego10196.html', 2009, 3263, 249.99),
    )
    for page, year, pieces, price in pages:
        scrapePage(retX, retY, page, year, pieces, price)
#标准化
def regularize(xMat, yMat):
    """Standardize the features column-wise and centre the targets.

    Every column of ``xMat`` has its mean subtracted and is then divided by
    its variance (not its standard deviation), matching the scaling used by
    ``ridgeTest`` and ``crossValidation`` in this file. ``yMat`` has its
    column mean subtracted.

    Args:
        xMat: 2-D feature data (array, matrix or nested list).
        yMat: target data (array, matrix or nested list).

    Returns:
        Tuple ``(inxMat, inyMat)`` with the standardized features and the
        centred targets. The inputs are not modified.
    """
    # Work on float copies so the caller's data is left untouched;
    # subok=True keeps np.matrix inputs as matrices.
    inxMat = np.array(xMat, dtype=float, copy=True, subok=True)
    inyMat = np.array(yMat, dtype=float, copy=True, subok=True)
    inyMat = inyMat - np.mean(inyMat, 0)
    inMeans = np.mean(inxMat, 0)
    inVar = np.var(inxMat, 0)
    # Diagnostic output kept from the original implementation.
    print(inMeans)
    inxMat = (inxMat - inMeans) / inVar
    return inxMat, inyMat
#计算平方误差
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between targets and predictions.

    Args:
        yArr: observed values (array-like).
        yHatArr: predicted values (array-like), broadcastable against
            ``yArr``.

    Returns:
        Sum of the squared elementwise differences.
    """
    # np.asarray lets plain lists through and guarantees an elementwise
    # square (on np.matrix, ``** 2`` would mean matrix power instead).
    residual = np.asarray(yArr) - np.asarray(yHatArr)
    return (residual ** 2).sum()
#计算回归系数W
def standRegres(xArr, yArr):
    """Compute ordinary least-squares weights via the normal equation.

    Solves ``ws = (X^T X)^-1 X^T y``.

    Args:
        xArr: 2-D feature data, one sample per row.
        yArr: 1-D sequence of target values.

    Returns:
        Column matrix of regression weights, or ``None`` (after printing a
        message) when ``X^T X`` has a determinant of exactly zero.
    """
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    # A zero determinant means xTx cannot be inverted.
    if np.linalg.det(xTx) == 0.0:
        print("无法求逆")
        return None
    return xTx.I * (xMat.T * yMat)
#交叉验证岭回归
def crossValidation(xArr, yArr, numVal=10):
    """Pick a ridge penalty by repeated random 90/10 splits and print the fit.

    For each of ``numVal`` rounds the data is shuffled, the first 90% is
    used to fit 30 ridge models through ``ridgeTest``, and the remaining
    samples are scored with ``rssError``. The penalty index with the lowest
    mean error is selected, its weights are rescaled back to the original
    feature units, and the resulting regression equation is printed.

    Args:
        xArr: list of feature rows ``[year, pieces, is_new, original_price]``.
        yArr: list of selling prices, aligned with ``xArr``.
        numVal: number of shuffle-and-split rounds.

    Returns:
        None. The regression equation is written to stdout.
    """
    # Number of samples and an index table to shuffle.
    m = len(yArr)
    indexList = list(range(m))
    # One row of errors per round, one column per penalty value.
    errorMat = np.zeros((numVal, 30))
    for i in range(numVal):
        trainX, trainY = [], []
        testX, testY = [], []
        # Shuffle, then split into training and test sets.
        random.shuffle(indexList)
        for j, idx in enumerate(indexList):
            if j < m * 0.9:
                trainX.append(xArr[idx])
                trainY.append(yArr[idx])
            else:
                testX.append(xArr[idx])
                testY.append(yArr[idx])
        wMat = ridgeTest(trainX, trainY)
        # Scale the test set with the training statistics. This does not
        # depend on the penalty, so it is done once per round.
        # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
        matTrainX = np.asmatrix(trainX)
        meanTrain = np.mean(matTrainX, 0)
        varTrain = np.var(matTrainX, 0)
        matTestX = (np.asmatrix(testX) - meanTrain) / varTrain
        trainYMean = np.mean(trainY)
        testYArr = np.array(testY)
        # Score each of the 30 penalty values on the held-out samples.
        for k in range(30):
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + trainYMean
            errorMat[i, k] = rssError(yEst.T.A, testYArr)
    # Select the weights whose penalty gave the smallest mean error.
    # NOTE: wMat here is the fit from the final round only.
    meanErrors = np.mean(errorMat, 0)
    minMean = float(min(meanErrors))
    bestWeights = wMat[np.nonzero(meanErrors == minMean)]
    xMat = np.asmatrix(xArr)
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    # Undo the standardization so the weights apply to raw feature values.
    unReg = bestWeights / varX
    intercept = -1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yArr)
    print('%f%+f*年份%+f*部件数量%+f*是否全新%+f*原价' % (intercept, unReg[0, 0], unReg[0, 1], unReg[0, 2], unReg[0, 3]))
#岭回归测试
def ridgeTest(xArr, yArr):
    """Fit ridge regression for 30 penalty values on standardized data.

    The targets are centred and every feature column is centred and
    divided by its variance before fitting. The penalty for row ``i`` of
    the result is ``exp(i - 10)``.

    Args:
        xArr: 2-D feature data, one sample per row.
        yArr: 1-D sequence of target values.

    Returns:
        Array of shape ``(30, n_features)``; row ``i`` holds the weights
        obtained with penalty ``exp(i - 10)``.
    """
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    # Centre the targets.
    yMat = yMat - np.mean(yMat, axis=0)
    # Centre the features and scale them by their variance.
    xMeans = np.mean(xMat, axis=0)
    xVar = np.var(xMat, axis=0)
    xMat = (xMat - xMeans) / xVar
    numTestPts = 30
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    for i in range(numTestPts):
        # NOTE(review): ridgeRegres is not defined in the visible source;
        # presumably it returns the weights as a column - confirm it exists.
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
if __name__ == "__main__":
    # Collect the scraped listings, then run the cross-validated ridge fit.
    lgX, lgY = [], []
    setDataCollect(lgX, lgY)
    crossValidation(lgX, lgY)
Part2-Chapter8-预测乐高玩具套装价格
猜你喜欢
转载自blog.csdn.net/JachinMa/article/details/89198011
今日推荐
周排行