导入maven依赖
需要注意的是,如果使用scala写程序,导入的依赖跟java是不一样的
Maven Dependencies

You can add the following dependencies to your pom.xml to include Apache Flink in your project. These dependencies include a local execution environment and thus support local testing.

Scala API: To use the Scala API, replace the flink-java artifact id with flink-scala_2.11 and flink-streaming-java_2.11 with flink-streaming-scala_2.11.

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.11</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>1.8.2</version>
</dependency>
批处理wordcount示例
import org.apache.flink.api.common.functions.FlatMapFunction; import org.apache.flink.api.java.DataSet; import org.apache.flink.api.java.ExecutionEnvironment; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.util.Collector; public class WordCount { // 批量处理示例代码 public static void main(String[] args) throws Exception { String inputPath = "E:\\flink\\words.txt"; String outputPath = "E:\\flink\\result"; //获取运行环境 ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); //读取文件 DataSet<String> text = env.readTextFile(inputPath); DataSet<Tuple2<String, Integer>> counts = // split up the lines in pairs (2-tuples) containing: (word,1) text.flatMap(new Tokenizer()) // group by the tuple field "0" and sum up tuple field "1" .groupBy(0) //以tuple的第一个字段分组 .sum(1);//以tuple的第二个字段计算总和 //setParallelism来设置并行度,类似spark。如果不设置并行度,将以多线程的形式输出,生成多个文件 counts.writeAsCsv(outputPath, "\n", " ").setParallelism(1); env.execute("Batch WordCount Example"); } // 自定义函数,也可以不在这里自定义,直接卸载上面flatMap()中也可以 public static class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> { @Override public void flatMap(String value, Collector<Tuple2<String, Integer>> out) { // normalize and split the line String[] tokens = value.toLowerCase().split(","); for (String token : tokens) { if (token.length() > 0) { //包装成tuple2 out.collect(new Tuple2<String, Integer>(token, 1)); } } } } }
流式处理wordcount示例
import org.apache.flink.api.common.functions.FlatMapFunction; import org.apache.flink.api.java.utils.ParameterTool; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.windowing.time.Time; import org.apache.flink.util.Collector; /** * 滑动窗口计算 * 通过socket模拟产生单词数据 * flink对数据进行统计计算 */ public class SocketWindowWordCount { public static void main(String[] args) throws Exception { //获取socket的端口号 int port; try { ParameterTool parameterTool = ParameterTool.fromArgs(args); port = parameterTool.getInt("port"); }catch (Exception e){ System.out.println("No port set. use default port 9000"); port = 9999; } //获取运行环境 StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); String hostname = "master01.hadoop.mobile.cn"; String delimiter = "\n"; DataStreamSource<String> text = env.socketTextStream(hostname, port, delimiter); //跟spark一样,使用flatmap算子来操作 //输入数据为string类型,输出为自定义的WordWithCount类型对象 DataStream<WordWithCount> windowCounts = text.flatMap(new FlatMapFunction<String, WordWithCount>() { public void flatMap(String value, Collector<WordWithCount> out) throws Exception { String[] splits = value.split(" "); for (String word : splits) { out.collect(new WordWithCount(word, 1L)); } } }).keyBy("word") .timeWindow(Time.seconds(10), Time.seconds(5))//指定时间窗口大小为10秒,指定时间间隔为5秒 //每隔1秒统计前2秒的数据 .sum("count"); //把数据打印到控制台并且设置并行度 windowCounts.print().setParallelism(1); System.out.println(System.currentTimeMillis()); env.execute("Socket window count"); } public static class WordWithCount{ public String word; public long count; public WordWithCount(){} public WordWithCount(String word,long count){ this.word = word; this.count = count; } @Override public String toString() { return "WordWithCount{" + "word='" + word + '\'' + ", count=" + count + '}'; } } }
关于 keyBy 算子：
/**
 * Partitions the operator state of a {@link DataStream} using field expressions.
 * A field expression is either the name of a public field or a getter method with parentheses
 * of the {@link DataStream}'s underlying type. A dot can be used to drill
 * down into objects, as in {@code "field1.getInnerField2()" }.
 *
 * <p>Tutorial note: {@code keyBy} is used for grouping. It takes a varargs parameter, so the
 * key may consist of one or several fields. A key can be given directly by field name, but
 * the field must then be public; otherwise the program fails with:
 *
 * <pre>{@code
 * Exception in thread "main" org.apache.flink.api.common.InvalidProgramException: This type (GenericType<SocketWindowWordCount.WordWithCount>) cannot be used as key.
 *     at org.apache.flink.api.common.operators.Keys$ExpressionKeys.<init>(Keys.java:330)
 *     at org.apache.flink.streaming.api.datastream.DataStream.keyBy(DataStream.java:337)
 *     at SocketWindowWordCount.main(SocketWindowWordCount.java:41)
 * }</pre>
 *
 * <p>Alternatively, the key can be obtained through a getter method.
 *
 * @param fields
 *     One or more field expressions on which the state of the {@link DataStream} operators
 *     will be partitioned.
 * @return The {@link DataStream} with partitioned state (i.e. KeyedStream)
 */
public KeyedStream<T, Tuple> keyBy(String... fields) {
    // Resolve the field expressions against this stream's type, then delegate.
    Keys.ExpressionKeys<T> expressionKeys = new Keys.ExpressionKeys<>(fields, getType());
    return keyBy(expressionKeys);
}