Counting the lines of a local file that contain a given letter with Spark:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;

/**
 * Use Spark to count the records in a file that contain a given string.
 * @author admin
 */
public class SparkWordCount {
    public static void main(String[] args) {
        // Resolve the file relative to the classpath root; getPath() already ends with "/"
        String path = SparkWordCount.class.getClassLoader().getResource("").getPath();
        String logFile = path + "file/README.md"; // should be some file on your system
        String appName = "SparkWordCount";
        String master = "local";
        // Configuration
        SparkConf conf = new SparkConf().setAppName(appName).setMaster(master);
        SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
        // Each line of the file becomes one record of the Dataset
        Dataset<String> logData = spark.read().textFile(logFile).cache();
        // Count the lines containing "a" and "b". The explicit FilterFunction cast
        // disambiguates Dataset.filter(...) for Java lambdas, which would otherwise
        // also match the Scala Function1 overload and fail to compile.
        long numAs = logData.filter((FilterFunction<String>) s -> s.contains("a")).count();
        long numBs = logData.filter((FilterFunction<String>) s -> s.contains("b")).count();
        System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
        spark.stop();
    }
}
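
For comparison, here is a minimal sketch of the same count written against the classic RDD API (JavaSparkContext / JavaRDD) instead of a Dataset. The class name SparkWordCountRdd and the relative file path are placeholders for illustration, and a local master is assumed:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkWordCountRdd {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SparkWordCountRdd").setMaster("local");
        // JavaSparkContext implements Closeable, so try-with-resources stops it for us
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Each line of the file becomes one element of the RDD
            JavaRDD<String> lines = sc.textFile("file/README.md").cache();
            long numAs = lines.filter(s -> s.contains("a")).count();
            long numBs = lines.filter(s -> s.contains("b")).count();
            System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
        }
    }
}

The RDD version needs no cast because JavaRDD.filter has a single Function overload, while the Dataset version gains Catalyst query optimization in exchange for the extra FilterFunction cast.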