以下是主要代码:使用 Word2Vec 对词汇进行向量化处理。这里只给出主要部分,其余读入和预处理的代码,可以在我博客中 NLP 处理的基础版和进阶版里很容易查到(点击打开链接)。
# Train the NLP model.
# Start with the simplest option: Word2Vec.
from gensim.models.word2vec import Word2Vec

# `corpus` is the tokenized training text prepared earlier in the script.
# The keyword names are `window` and `min_count` (singular).
# NOTE(review): `size` is the gensim < 4.0 name for the embedding dimension;
# gensim >= 4.0 renamed it to `vector_size` -- confirm the installed version.
model = Word2Vec(corpus, size=128, window=5, min_count=5, workers=4)

# Represent text with the trained embeddings.
# First grab the full vocabulary.
# NOTE(review): newer gensim releases expose the vocabulary on `model.wv`
# rather than on the model itself -- confirm against the installed version.
vocab = model.vocab
# Get the vector for any text.
def get_vector(word_list):
    """Return the mean embedding of the in-vocabulary words of one text.

    Args:
        word_list: Iterable of tokens making up a single text.

    Returns:
        A NumPy array of shape (128,): the average of ``model[word]`` over
        every token found in ``vocab``. If no token is in the vocabulary
        (including an empty ``word_list``), the all-zero vector is returned
        instead of dividing by zero.
    """
    res = np.zeros([128])
    count = 0
    for word in word_list:
        # Out-of-vocabulary tokens have no embedding; skip them.
        if word in vocab:
            res += model[word]
            count += 1
    if count == 0:
        # Avoid 0/0, which would yield a NaN vector and break the
        # downstream estimator.
        return res
    return res / count
# Keep references to the raw token lists, then rebind X_train / X_test to
# the per-text averaged embedding vectors.
wordlist_train, wordlist_test = X_train, X_test
X_train = list(map(get_vector, wordlist_train))
X_test = list(map(get_vector, wordlist_test))
# Build the ML model.
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

# Candidate values for the SVR `gamma` hyper-parameter.
params = [0.1, 0.5, 1, 3, 5, 7, 10, 12, 16, 20, 25, 30, 35, 40]
test_scores = []
for param in params:
    clf = SVR(gamma=param)
    # 3-fold cross-validation scored with ROC AUC; record the mean over folds.
    test_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
    test_scores.append(np.mean(test_score))

import matplotlib.pyplot as plt

# Plot the mean cross-validated score against each gamma candidate.
plt.plot(params, test_scores)
plt.title('param vs sv AUC Score')