- Predict the relevance of search results on homedepot.com
Github: https://github.com/yjfiejd/Product_search_relevance_NLP-/blob/master/Product_search_relevance(jupyter%20notebook).ipynb
思路分析:
#目的:给出输入关键字与搜索结果,评价搜索准确度 #处理思路 #1,导入包、数据 -> 合并数据格式concat,merge, #2,文本预处理 -> 【简单方法】:看输入词是在搜索结果中出现几次,需要先统一数据集格式 -> str_stemmer and str_commond_words 处理数据 #3,自制文本特征 -> 关键词长度/搜索词语与title和describtion中重复词语数 -> 去掉之前的英文,保留自制特征 #4,重塑训练/测试集 -> 拆分出X_train, X_test, y_train, 去除label #5,建立模型:Ridge回归模型RandomForestRegressor 找出最佳参数max_depth=7 通过多种参数导入 -> 画图 #6,上传结果:生成csv文件
具体代码
# -*- coding:utf8 -*- # @TIME : 2018/4/26 下午6:15 # @Author : Allen # @File : product_search_relevance.py #1) 导入需要用的库 import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier, BaggingRegressor from nltk.stem.snowball import SnowballStemmer import os os.chdir('/Users/a1/Desktop/算法实战/Word2vec/Product_search_relevance/data')
#读取数据 df_train = pd.read_csv('train.csv', encoding = "ISO-8859-1") df_test = pd.read_csv('test.csv', encoding = "ISO-8859-1") df_desc = pd.read_csv('product_descriptions.csv')
df_train.head() #输出如下
id product_uid product_title search_term relevance 0 2 100001 Simpson Strong-Tie 12-Gauge Angle angle bracket 3.00 1 3 100001 Simpson Strong-Tie 12-Gauge Angle l bracket 2.50 2 9 100002 BEHR Premium Textured DeckOver 1-gal. #SC-141 ... deck over 3.00 3 16 100005 Delta Vero 1-Handle Shower Only Faucet Trim Ki... rain shower head 2.33 4 17 100005 Delta Vero 1-Handle Shower Only Faucet Trim Ki... shower only faucet 2.67
df_desc.head() #输出如下
product_uid product_description 0 100001 Not only do angles make joints stronger, they ... 1 100002 BEHR Premium Textured DECKOVER is an innovativ... 2 100003 Classic architecture meets contemporary design... 3 100004 The Grape Solar 265-Watt Polycrystalline PV So... 4 100005 Update your bathroom with the Delta Vero Singl...
#合并测试集与训练集,便于统一文本预处理 #PANDAS 数据合并与重塑(concat篇):https://blog.csdn.net/stevenkwong/article/details/52528616 #PANDAS 数据合并与重塑(join/merge篇):https://blog.csdn.net/stevenkwong/article/details/52540605 df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True) df_all.head() #print(df_all.shape) #输出如下
id product_title product_uid relevance search_term 0 2 Simpson Strong-Tie 12-Gauge Angle 100001 3.00 angle bracket 1 3 Simpson Strong-Tie 12-Gauge Angle 100001 2.50 l bracket 2 9 BEHR Premium Textured DeckOver 1-gal. #SC-141 ... 100002 3.00 deck over 3 16 Delta Vero 1-Handle Shower Only Faucet Trim Ki... 100005 2.33 rain shower head 4 17 Delta Vero 1-Handle Shower Only Faucet Trim Ki... 100005 2.67 shower only faucet
#把描述信息加入表,how='left'表示左边全部保留,on表示以什么为基准对齐 df_all = pd.merge(df_all, df_desc, how='left', on='product_uid') df_all.head() #print("******************************") 输出如下
id product_title product_uid relevance search_term product_description 0 2 Simpson Strong-Tie 12-Gauge Angle 100001 3.00 angle bracket Not only do angles make joints stronger, they ... 1 3 Simpson Strong-Tie 12-Gauge Angle 100001 2.50 l bracket Not only do angles make joints stronger, they ... 2 9 BEHR Premium Textured DeckOver 1-gal. #SC-141 ... 100002 3.00 deck over BEHR Premium Textured DECKOVER is an innovativ... 3 16 Delta Vero 1-Handle Shower Only Faucet Trim Ki... 100005 2.33 rain shower head Update your bathroom with the Delta Vero Singl... 4 17 Delta Vero 1-Handle Shower Only Faucet Trim Ki... 100005 2.67 shower only faucet Update your bathroom with the Delta Vero Singl... In [10]:
#2) 文本预处理,把表格的语句处理为计算机能懂的格式,这里使用NLTK stemmer = SnowballStemmer('english') #这里简单处理,只提取了词干 def str_stemmer(s): return " ".join([stemmer.stem(word) for word in s.lower().split()]) #计算"关键词次数" def str_common_word(str1, str2): return sum(int(str2.find(word)>=0) for word in str1.split())
#把每一个column都跑一遍,用str_stemmer清洁所有的文本内容 df_all['search_term'] = df_all['search_term'].map(lambda x: str_stemmer(x)) df_all['product_title'] = df_all['product_title'].map(lambda x: str_stemmer(x)) df_all['product_description'] = df_all['product_description'].map(lambda x: str_stemmer(x)) #3) 自制文本特征 #关键字的长度 df_all['len_of_query'] = df_all['search_term'].map(lambda x: len(x.split())).astype(np.int64) #搜索词与标题中有多少关键字重合 df_all['commons_in_title'] = df_all.apply(lambda x:str_common_word(x['search_term'], x['product_title']), axis=1) #搜索词与描述中有多少关键字重合 #df_all['commons_in_desc'] = df_all.apply(lambda x: str_common_word(x['search_term'], x['product_description'], axis = 1) df_all['commons_in_desc'] = df_all.apply(lambda x: str_common_word(x['search_term'], x['product_description']), axis=1) #搞完之后,我们把不能被『机器学习模型』处理的column给drop掉,这一步太偷懒了 df_all = df_all.drop(['search_term', 'product_title', 'product_description'], axis=1) df_all.head() #输出如下:
id product_uid relevance len_of_query commons_in_title commons_in_desc 0 2 100001 3.00 2 1 1 1 3 100001 2.50 2 1 1 2 9 100002 3.00 2 1 1 3 16 100005 2.33 3 1 1 4 17 100005 2.67 3 3 2
#4) 重塑训练/测试集合, 刚才把train与test合并了,现在分开 df_train = df_all.loc[df_train.index] df_test = df_all.loc[df_test.index] #记录id,后续上传能用 test_ids = df_test['id'] #分离出y_train y_train = df_train['relevance'].values #删除原集合中的label X_train = df_train.drop(['id', 'relevance'], axis=1).values X_test = df_test.drop(['id', 'relevance'], axis=1).values print("###################") #5)建立模型:Ridge模型 from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import cross_val_score %matplotlib inline params = [1, 3, 4, 6, 7, 8,] test_scores = [] for param in params: clf = RandomForestRegressor(n_estimators=30, max_depth=param) test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error')) test_scores.append(np.mean(test_score)) import matplotlib.pyplot as plt plt.plot(params, test_scores) plt.title("Param vs CV Error");
#6) 上传结果 rf = RandomForestRegressor(n_estimators=30, max_depth=7) rf.fit(X_train, y_train)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
y_pred = rf.predict(X_test) len(y_pred) pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('submission123.csv',index=False)# 再来一波进阶篇一会