Counting the lines of a local file that contain a given letter with Spark

The Java program below uses Spark to count how many lines of a local text file contain the letters "a" and "b". It reads README.md from a file directory on the classpath, resolved relative to the class loader's root, and requires the spark-sql dependency, which provides SparkSession:

import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;

/**
 * Counts the number of lines in a file that contain a given letter, using Spark.
 */
public class SparkWordCount {

    public static void main(String[] args) {

        // Resolve the input file relative to the classpath root
        String path = SparkWordCount.class.getClassLoader().getResource("").getPath();
        String logFile = path + "/file/README.md"; // Should be some file on your system

        String appName = "SparkWordCount";
        String master = "local";

        // Configure Spark to run locally
        SparkConf conf = new SparkConf().setAppName(appName).setMaster(master);
        SparkSession spark = SparkSession.builder().config(conf).getOrCreate();

        // Read the file as a Dataset with one record per line; cache it,
        // since it is scanned twice below
        Dataset<String> logData = spark.read().textFile(logFile).cache();

        // Count the lines containing "a" and the lines containing "b"
        long numAs = logData.filter(s -> s.contains("a")).count();
        long numBs = logData.filter(s -> s.contains("b")).count();

        System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);

        spark.stop();
    }

}
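
Since the search letters are hard-coded above, a natural generalization is to take the file path and search string from the command line. The sketch below is not from the original post; the class name SparkLineCount and the argument layout are assumptions. It uses the explicit FilterFunction cast from Spark's official Java quick start, which avoids lambda-overload ambiguity on Scala 2.12 builds of Spark:

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;

/**
 * Sketch: counts the lines containing an arbitrary substring given on the
 * command line, e.g. arguments "/tmp/README.md spark".
 */
public class SparkLineCount {

    public static void main(String[] args) {
        String logFile = args[0]; // path to a local text file (assumed argument layout)
        String needle = args[1];  // substring to search for

        SparkSession spark = SparkSession.builder()
                .appName("SparkLineCount")
                .master("local")
                .getOrCreate();

        // One record per line of the input file
        Dataset<String> logData = spark.read().textFile(logFile);

        // The explicit FilterFunction cast keeps the overloaded filter call unambiguous
        long count = logData.filter((FilterFunction<String>) s -> s.contains(needle)).count();

        System.out.println("Lines containing \"" + needle + "\": " + count);

        spark.stop();
    }
}

Because the master is still set to "local", this variant can be run directly from the IDE by supplying the two program arguments.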
