风险大脑-支付风险识别天池大赛(一)数据预处理

        

        报了个名,直播一下比赛吧,可能因为没时间会随时断更,大家有好的思路欢迎交流。



        大赛提供的所有数据信息是包含在引号里面的(“xxxx”),这样在后续输入模型使用数据前需要做字符串索引。但由于数据量也比较大(将近100万条数据),字符串索引算子会异常慢(被坑回来补上这篇博客),所以这里直接将引号去除,便于后续直接以Double数值类型导入到Inceptor(hive)数仓中,提供给模型训练。

        PS:之前试过用Sublime Text直接替换,按道理它是文本处理最牛的编辑器了,可还是卡爆了,不得已才用Java处理。数据量100万行,大小1.2GB,用时5、6分钟,处理后文件大小约为600MB,还能接受。

        处理代码:

package test;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

/**
 * Removes every double-quote character from a text/CSV file, line by line,
 * and writes the result to a second file.
 *
 * <p>Usage: {@code java ProcessCommer [inputPath [outputPath]]}. When an
 * argument is omitted the corresponding built-in default path is used, so
 * running the program without arguments behaves as before.
 */
public class ProcessCommer {

	/** Input file used when no input path is passed on the command line. */
	private static final String DEFAULT_READ_FILE_PATH = "E:\\ant\\atec_anti_fraud_train.csv";

	/** Output file used when no output path is passed on the command line. */
	private static final String DEFAULT_WRITE_FILE_PATH = "E:\\ant\\atec_anti_fraud_train_convert.csv";

	/**
	 * Copies the input file to the output file with all {@code "} characters removed.
	 *
	 * @param args optional; {@code args[0]} overrides the input path and
	 *             {@code args[1]} overrides the output path
	 * @throws IOException if either file cannot be opened, read or written
	 */
	public static void main(String[] args) throws IOException {

		// Files to read from and write to (command-line values win over the defaults).
		String testReadFilePath = (args != null && args.length > 0) ? args[0] : DEFAULT_READ_FILE_PATH;
		String testWriteFilePath = (args != null && args.length > 1) ? args[1] : DEFAULT_WRITE_FILE_PATH;
		File testReadFile = new File(testReadFilePath);
		File testWriteFile = new File(testWriteFilePath);

		// try-with-resources closes both streams exactly once, even when opening
		// the second one or any read/write fails; errors propagate to the caller
		// instead of being printed and ignored, so a truncated output is never
		// mistaken for success.
		// The charset is explicit so the result does not depend on the platform default.
		try (BufferedReader br = new BufferedReader(new InputStreamReader(
				new FileInputStream(testReadFile), java.nio.charset.StandardCharsets.UTF_8));
				BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
						new FileOutputStream(testWriteFile), java.nio.charset.StandardCharsets.UTF_8))) {

			String strLine;

			// Read line by line, strip the quotes, and write each line back out.
			// Lines are not echoed to the console: for inputs with millions of
			// rows the console output dominates the running time.
			while ((strLine = br.readLine()) != null) {
				bw.write(strLine.replace("\"", ""));
				// Always terminate with "\n", regardless of platform.
				bw.write('\n');
			}
		}
	}
}

        

        另外,此次数据有297个feature(f1~f297),在Inceptor(hive)中建表的时候,难道要手动敲将近300个字段?不存在的(好懒哭):

package test;

/**
 * Prints one column definition per feature ({@code f1 DOUBLE,} ...
 * {@code f297 DOUBLE}) so the list can be pasted into a CREATE TABLE statement.
 */
public class GenFeature {

	/** Prefix shared by every generated column name. */
	private static final String COLUMN_PREFIX = "f";

	/** Index of the last generated column; columns run from f1 to this value. */
	private static final int LAST_FEATURE_INDEX = 297;

	/** Column type appended to every definition; change it to generate another type. */
	private static final String COLUMN_TYPE = "DOUBLE";

	/**
	 * Writes the column definitions to standard output, one per line.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		for (int i = 1; i <= LAST_FEATURE_INDEX; i++) {
			// Every definition except the last is followed by a comma, so the
			// output is a valid column list without manual clean-up.
			String separator = (i < LAST_FEATURE_INDEX) ? "," : "";
			System.out.println(COLUMN_PREFIX + i + " " + COLUMN_TYPE + separator);
		}
	}

}

        以下以DOUBLE为例生成,请自行更改为偏爱的类型,直接复制拿去用吧:

f1 DOUBLE,
f2 DOUBLE,
f3 DOUBLE,
f4 DOUBLE,
f5 DOUBLE,
f6 DOUBLE,
f7 DOUBLE,
f8 DOUBLE,
f9 DOUBLE,
f10 DOUBLE,
f11 DOUBLE,
f12 DOUBLE,
f13 DOUBLE,
f14 DOUBLE,
f15 DOUBLE,
f16 DOUBLE,
f17 DOUBLE,
f18 DOUBLE,
f19 DOUBLE,
f20 DOUBLE,
f21 DOUBLE,
f22 DOUBLE,
f23 DOUBLE,
f24 DOUBLE,
f25 DOUBLE,
f26 DOUBLE,
f27 DOUBLE,
f28 DOUBLE,
f29 DOUBLE,
f30 DOUBLE,
f31 DOUBLE,
f32 DOUBLE,
f33 DOUBLE,
f34 DOUBLE,
f35 DOUBLE,
f36 DOUBLE,
f37 DOUBLE,
f38 DOUBLE,
f39 DOUBLE,
f40 DOUBLE,
f41 DOUBLE,
f42 DOUBLE,
f43 DOUBLE,
f44 DOUBLE,
f45 DOUBLE,
f46 DOUBLE,
f47 DOUBLE,
f48 DOUBLE,
f49 DOUBLE,
f50 DOUBLE,
f51 DOUBLE,
f52 DOUBLE,
f53 DOUBLE,
f54 DOUBLE,
f55 DOUBLE,
f56 DOUBLE,
f57 DOUBLE,
f58 DOUBLE,
f59 DOUBLE,
f60 DOUBLE,
f61 DOUBLE,
f62 DOUBLE,
f63 DOUBLE,
f64 DOUBLE,
f65 DOUBLE,
f66 DOUBLE,
f67 DOUBLE,
f68 DOUBLE,
f69 DOUBLE,
f70 DOUBLE,
f71 DOUBLE,
f72 DOUBLE,
f73 DOUBLE,
f74 DOUBLE,
f75 DOUBLE,
f76 DOUBLE,
f77 DOUBLE,
f78 DOUBLE,
f79 DOUBLE,
f80 DOUBLE,
f81 DOUBLE,
f82 DOUBLE,
f83 DOUBLE,
f84 DOUBLE,
f85 DOUBLE,
f86 DOUBLE,
f87 DOUBLE,
f88 DOUBLE,
f89 DOUBLE,
f90 DOUBLE,
f91 DOUBLE,
f92 DOUBLE,
f93 DOUBLE,
f94 DOUBLE,
f95 DOUBLE,
f96 DOUBLE,
f97 DOUBLE,
f98 DOUBLE,
f99 DOUBLE,
f100 DOUBLE,
f101 DOUBLE,
f102 DOUBLE,
f103 DOUBLE,
f104 DOUBLE,
f105 DOUBLE,
f106 DOUBLE,
f107 DOUBLE,
f108 DOUBLE,
f109 DOUBLE,
f110 DOUBLE,
f111 DOUBLE,
f112 DOUBLE,
f113 DOUBLE,
f114 DOUBLE,
f115 DOUBLE,
f116 DOUBLE,
f117 DOUBLE,
f118 DOUBLE,
f119 DOUBLE,
f120 DOUBLE,
f121 DOUBLE,
f122 DOUBLE,
f123 DOUBLE,
f124 DOUBLE,
f125 DOUBLE,
f126 DOUBLE,
f127 DOUBLE,
f128 DOUBLE,
f129 DOUBLE,
f130 DOUBLE,
f131 DOUBLE,
f132 DOUBLE,
f133 DOUBLE,
f134 DOUBLE,
f135 DOUBLE,
f136 DOUBLE,
f137 DOUBLE,
f138 DOUBLE,
f139 DOUBLE,
f140 DOUBLE,
f141 DOUBLE,
f142 DOUBLE,
f143 DOUBLE,
f144 DOUBLE,
f145 DOUBLE,
f146 DOUBLE,
f147 DOUBLE,
f148 DOUBLE,
f149 DOUBLE,
f150 DOUBLE,
f151 DOUBLE,
f152 DOUBLE,
f153 DOUBLE,
f154 DOUBLE,
f155 DOUBLE,
f156 DOUBLE,
f157 DOUBLE,
f158 DOUBLE,
f159 DOUBLE,
f160 DOUBLE,
f161 DOUBLE,
f162 DOUBLE,
f163 DOUBLE,
f164 DOUBLE,
f165 DOUBLE,
f166 DOUBLE,
f167 DOUBLE,
f168 DOUBLE,
f169 DOUBLE,
f170 DOUBLE,
f171 DOUBLE,
f172 DOUBLE,
f173 DOUBLE,
f174 DOUBLE,
f175 DOUBLE,
f176 DOUBLE,
f177 DOUBLE,
f178 DOUBLE,
f179 DOUBLE,
f180 DOUBLE,
f181 DOUBLE,
f182 DOUBLE,
f183 DOUBLE,
f184 DOUBLE,
f185 DOUBLE,
f186 DOUBLE,
f187 DOUBLE,
f188 DOUBLE,
f189 DOUBLE,
f190 DOUBLE,
f191 DOUBLE,
f192 DOUBLE,
f193 DOUBLE,
f194 DOUBLE,
f195 DOUBLE,
f196 DOUBLE,
f197 DOUBLE,
f198 DOUBLE,
f199 DOUBLE,
f200 DOUBLE,
f201 DOUBLE,
f202 DOUBLE,
f203 DOUBLE,
f204 DOUBLE,
f205 DOUBLE,
f206 DOUBLE,
f207 DOUBLE,
f208 DOUBLE,
f209 DOUBLE,
f210 DOUBLE,
f211 DOUBLE,
f212 DOUBLE,
f213 DOUBLE,
f214 DOUBLE,
f215 DOUBLE,
f216 DOUBLE,
f217 DOUBLE,
f218 DOUBLE,
f219 DOUBLE,
f220 DOUBLE,
f221 DOUBLE,
f222 DOUBLE,
f223 DOUBLE,
f224 DOUBLE,
f225 DOUBLE,
f226 DOUBLE,
f227 DOUBLE,
f228 DOUBLE,
f229 DOUBLE,
f230 DOUBLE,
f231 DOUBLE,
f232 DOUBLE,
f233 DOUBLE,
f234 DOUBLE,
f235 DOUBLE,
f236 DOUBLE,
f237 DOUBLE,
f238 DOUBLE,
f239 DOUBLE,
f240 DOUBLE,
f241 DOUBLE,
f242 DOUBLE,
f243 DOUBLE,
f244 DOUBLE,
f245 DOUBLE,
f246 DOUBLE,
f247 DOUBLE,
f248 DOUBLE,
f249 DOUBLE,
f250 DOUBLE,
f251 DOUBLE,
f252 DOUBLE,
f253 DOUBLE,
f254 DOUBLE,
f255 DOUBLE,
f256 DOUBLE,
f257 DOUBLE,
f258 DOUBLE,
f259 DOUBLE,
f260 DOUBLE,
f261 DOUBLE,
f262 DOUBLE,
f263 DOUBLE,
f264 DOUBLE,
f265 DOUBLE,
f266 DOUBLE,
f267 DOUBLE,
f268 DOUBLE,
f269 DOUBLE,
f270 DOUBLE,
f271 DOUBLE,
f272 DOUBLE,
f273 DOUBLE,
f274 DOUBLE,
f275 DOUBLE,
f276 DOUBLE,
f277 DOUBLE,
f278 DOUBLE,
f279 DOUBLE,
f280 DOUBLE,
f281 DOUBLE,
f282 DOUBLE,
f283 DOUBLE,
f284 DOUBLE,
f285 DOUBLE,
f286 DOUBLE,
f287 DOUBLE,
f288 DOUBLE,
f289 DOUBLE,
f290 DOUBLE,
f291 DOUBLE,
f292 DOUBLE,
f293 DOUBLE,
f294 DOUBLE,
f295 DOUBLE,
f296 DOUBLE,
f297 DOUBLE



猜你喜欢

转载自blog.csdn.net/whdxjbw/article/details/80843936