# Part2-Chapter8-预测乐高玩具套装价格

# 目标是爬取 eBay 上的二手乐高数据，并使用岭回归交叉验证的方式给出回归方程

from bs4 import BeautifulSoup
import numpy as np
import random

def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """Parse one saved listing page and append the sold items to retX / retY.

    The page is expected to contain one ``<table r="N">`` element per listing,
    numbered consecutively from 1; parsing stops at the first missing number.

    Args:
        retX: list that receives one ``[yr, numPce, newFlag, origPrc]`` row
            per accepted listing (mutated in place).
        retY: list that receives the matching selling price (mutated in place).
        inFile: path of the saved HTML page, read as UTF-8.
        yr: year value stored with every row from this page.
        numPce: piece count stored with every row from this page.
        origPrc: original price; listings that sold for no more than half of
            it are treated as incomplete sets and skipped.
    """
    with open(inFile, encoding='utf-8') as f:
        html = f.read()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    i = 1
    currentRow = soup.find_all('table', r="%d" % i)
    while len(currentRow) != 0:
        title = currentRow[0].find_all('a')[1].text
        lwrTitle = title.lower()
        # A listing counts as new when its title says "new" or "nisb".
        if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
            newFlag = 1.0
        else:
            newFlag = 0.0

        # Only sold listings are collected; a sold listing has a <span> in
        # its fourth cell.
        soldUnicde = currentRow[0].find_all('td')[3].find_all('span')
        if len(soldUnicde) == 0:
            print("商品 #%d 没有出售" % i)
        else:
            # The fifth cell holds the price text.
            soldPrice = currentRow[0].find_all('td')[4]
            priceStr = soldPrice.text
            priceStr = priceStr.replace('$', '')
            priceStr = priceStr.replace(',', '')
            # len() of a Tag counts its children; extra children mean the
            # cell also carries the shipping note.
            if len(soldPrice) > 1:
                priceStr = priceStr.replace('Free shipping', '')
            sellingPrice = float(priceStr)
            # Drop prices that look like incomplete sets.
            if sellingPrice > origPrc * 0.5:
                print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)

        i += 1
        currentRow = soup.find_all('table', r="%d" % i)
     
 #分别抓取各网页数据
def setDataCollect(retX, retY):
    """Scrape every saved LEGO listing page into retX / retY.

    Args:
        retX: list that receives the feature rows (mutated in place).
        retY: list that receives the selling prices (mutated in place).
    """
    # (saved page, year, piece count, original price) for each set.
    legoSets = (
        ('lego8288.html', 2006, 800, 49.99),
        ('lego10030.html', 2002, 3096, 269.99),
        ('lego10179.html', 2007, 5195, 499.99),
        ('lego10181.html', 2007, 3428, 199.99),
        ('lego10189.html', 2008, 5922, 299.99),
        ('lego10196.html', 2009, 3263, 249.99),
    )
    for inFile, yr, numPce, origPrc in legoSets:
        scrapePage(retX, retY, inFile, yr, numPce, origPrc)
	
#标准化
def regularize(xMat, yMat):
    """Return column-standardized features and mean-centred targets.

    Each feature column has its mean subtracted and is then divided by its
    variance (not its standard deviation), which is the same scaling that
    ridgeTest and crossValidation apply. The inputs are not modified.

    Args:
        xMat: 2-D array or matrix of features, one sample per row.
        yMat: array or matrix of targets.

    Returns:
        Tuple ``(inxMat, inyMat)`` with the scaled features and the centred
        targets.
    """
    inMeans = np.mean(xMat, 0)
    inVar = np.var(xMat, 0)
    # The arithmetic below allocates new arrays, so no explicit copy is needed.
    inxMat = (xMat - inMeans) / inVar
    inyMat = yMat - np.mean(yMat, 0)
    return inxMat, inyMat

# 计算平方误差 (residual sum of squares)
def rssError(yArr, yHatArr):
    """Return the sum of squared differences between yArr and yHatArr."""
    residual = yArr - yHatArr
    squared = residual ** 2
    return squared.sum()


#计算回归系数W
def standRegres(xArr, yArr):
    """Solve ordinary least squares via the normal equations.

    Args:
        xArr: 2-D sequence of features, one sample per row.
        yArr: 1-D sequence of targets, one per sample.

    Returns:
        Column matrix of regression weights, or None (after printing a
        message) when ``X^T X`` is rank deficient and cannot be inverted.
    """
    # np.asmatrix is the spelling that exists in both NumPy 1.x and 2.x.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    # A rank test is used instead of comparing the determinant with exactly
    # 0.0, which rounding error makes unreliable for singular matrices.
    if np.linalg.matrix_rank(xTx) < xTx.shape[0]:
        print("无法求逆")
        return
    # Solving the system avoids forming the explicit inverse.
    ws = np.asmatrix(np.linalg.solve(xTx, xMat.T * yMat))
    return ws

#交叉验证岭回归
def crossValidation(xArr, yArr, numVal=10):
    """Pick a ridge penalty by repeated random hold-out and print the equation.

    Each round shuffles the samples, trains ridgeTest on roughly the first
    90% and scores every penalty on the remainder with rssError. The penalty
    with the lowest mean error is then refitted on the full data set, the
    weights are mapped back to the original feature scale, and the resulting
    equation is printed.

    Args:
        xArr: 2-D sequence of features, one sample per row. The printed
            equation indexes exactly four feature columns.
        yArr: 1-D sequence of targets, one per sample.
        numVal: number of shuffle/split rounds.

    Raises:
        ValueError: if numVal is less than 1, or if there are too few samples
            for the 90/10 split to leave any test data.
    """
    # Must equal the number of penalties that ridgeTest evaluates.
    numLambdas = 30
    if numVal < 1:
        raise ValueError("numVal must be at least 1")

    m = len(yArr)
    # Same boundary as testing ``j < m * 0.9`` for each position j.
    numTrain = sum(1 for j in range(m) if j < m * 0.9)
    if numTrain == m:
        raise ValueError("not enough samples to hold out a test set")

    indexList = list(range(m))
    errorMat = np.zeros((numVal, numLambdas))
    for i in range(numVal):
        random.shuffle(indexList)
        trainIdx = indexList[:numTrain]
        testIdx = indexList[numTrain:]
        trainX = [xArr[idx] for idx in trainIdx]
        trainY = [yArr[idx] for idx in trainIdx]
        testX = [xArr[idx] for idx in testIdx]
        testY = [yArr[idx] for idx in testIdx]

        wMat = ridgeTest(trainX, trainY)

        # Scale the test rows with the training statistics; this does not
        # depend on the penalty, so it is done once per round.
        matTrainX = np.asmatrix(trainX)
        meanTrain = np.mean(matTrainX, 0)
        varTrain = np.var(matTrainX, 0)
        matTestX = (np.asmatrix(testX) - meanTrain) / varTrain
        trainYMean = np.mean(trainY)
        testYArr = np.array(testY)

        for k in range(numLambdas):
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + trainYMean
            errorMat[i, k] = rssError(yEst.T.A, testYArr)

    # Penalty with the lowest error averaged over all rounds.
    meanErrors = np.mean(errorMat, 0)
    bestIdx = int(np.argmin(meanErrors))

    # Refit on every sample so the weights match the full-data mean and
    # variance used below, instead of reusing the last round's fit.
    bestWeights = np.asmatrix(ridgeTest(xArr, yArr)[bestIdx, :])

    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr)
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    # Undo the standardization so the weights apply to raw feature values.
    unReg = bestWeights / varX
    print('%f%+f*年份%+f*部件数量%+f*是否全新%+f*原价'%((-1 * np.sum(np.multiply(meanX,unReg))+np.mean(yMat)),unReg[0,0],unReg[0,1],unReg[0,2],unReg[0,3]))

#岭回归测试
def ridgeTest(xArr, yArr):
    """Fit ridge regression over 30 exponentially spaced penalties.

    Features are centred and divided by their column variance, and targets
    are centred, before fitting. Penalty number ``i`` is ``exp(i - 10)``.

    Args:
        xArr: 2-D sequence of features, one sample per row.
        yArr: 1-D sequence of targets, one per sample.

    Returns:
        ndarray of shape ``(30, n_features)``; row ``i`` holds the weights,
        in standardized feature space, for penalty ``exp(i - 10)``.
    """
    # np.asmatrix is the spelling that exists in both NumPy 1.x and 2.x.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    yMat = yMat - np.mean(yMat, axis=0)
    xMeans = np.mean(xMat, axis=0)
    xVar = np.var(xMat, axis=0)
    xMat = (xMat - xMeans) / xVar

    numTestPts = 30
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        wMat[i, :] = np.asarray(ws).ravel()
    return wMat


def ridgeRegres(xMat, yMat, lam=0.2):
    """Return the ridge weights ``(X^T X + lam * I)^-1 X^T y``.

    Args:
        xMat: 2-D feature matrix, one sample per row.
        yMat: column matrix of targets.
        lam: ridge penalty added to the diagonal of ``X^T X``.

    Returns:
        Column matrix of weights.

    Raises:
        numpy.linalg.LinAlgError: if the penalized matrix is singular.
    """
    xMat = np.asmatrix(xMat)
    yMat = np.asmatrix(yMat)
    denom = xMat.T * xMat + np.eye(np.shape(xMat)[1]) * lam
    # Solving the system avoids forming the explicit inverse.
    return np.asmatrix(np.linalg.solve(denom, xMat.T * yMat))

if __name__ == "__main__":
    # Collect the scraped feature rows and sale prices, then run the
    # ridge-regression cross-validation on them.
    features, prices = [], []
    setDataCollect(features, prices)
    crossValidation(features, prices)
# Part2-Chapter8-预测乐高玩具套装价格

# 猜你喜欢