# The dataset comes from a Tianchi competition.
# The code below is provisional data processing; it is updated and refined daily.
import numpy as np
import pandas as pd
# Load the training, test and validation sets. Every file is read as
# tab-separated text without a header row, so column names are supplied here.
train_file = 'H:/TianChiOppoRound1/oppo_round1_train_20180929.txt'
train_df = pd.read_csv(
    train_file,
    sep='\t',
    header=None,
    names=['prefix', 'query_prediction', 'title', 'tag', 'label'],
    low_memory=False,
)

# The test file is read with the same columns minus 'label'.
test_file = 'H:/TianChiOppoRound1/oppo_round1_test_A_20180929.txt'
test_df = pd.read_csv(
    test_file,
    sep='\t',
    header=None,
    names=['prefix', 'query_prediction', 'title', 'tag'],
    low_memory=False,
)

# The validation file uses the same five columns as the training file.
vali_file = 'H:/TianChiOppoRound1/oppo_round1_vali_20180929.txt'
vali_df = pd.read_csv(
    vali_file,
    sep='\t',
    header=None,
    names=['prefix', 'query_prediction', 'title', 'tag', 'label'],
    low_memory=False,
)
# Expand a raw query_prediction string into a list of its entries.
def split_query_prediction(text):
    """Split a brace-wrapped, comma-separated string into stripped pieces.

    Every '{' and '}' character is removed from ``text``, the remainder is
    split on ',' and each piece is stripped of surrounding whitespace.

    The split is purely textual: an entry that itself contains a comma is
    broken into several pieces.

    Args:
        text: The raw string, or a missing value (anything ``pd.isna``
            reports as missing, e.g. ``None`` or NaN).

    Returns:
        A list of stripped string pieces. The list is empty when ``text`` is
        missing or when nothing but whitespace remains after the braces are
        removed (e.g. ``"{}"``).
    """
    if pd.isna(text):
        return []
    body = text.replace('{', '').replace('}', '')
    # ''.split(',') returns [''], which would make an empty prediction look
    # like a single entry; report it as having no entries instead.
    if not body.strip():
        return []
    return [piece.strip() for piece in body.split(',')]
# Turn each raw query_prediction string into a list of entries, then record
# how many entries each row has.
train_df['pred_list'] = train_df['query_prediction'].apply(split_query_prediction)
train_df['pred_len'] = train_df['pred_list'].apply(len)

# Move the raw column out of the frame but keep it available separately.
train_query_prediction = train_df['query_prediction']
del train_df['query_prediction']

# Optional export / inspection helpers, left disabled:
# train_df.to_csv('H:/TianChiOppoRound1/train.csv')
# print(np.shape(train_df))
# print(train_df.columns.values)
# Work in progress; updates are ongoing.