第一个flink application

导入maven依赖

需要注意的是，如果使用scala写程序，导入的依赖跟java是不一样的

Maven Dependencies
You can add the following dependencies to your pom.xml to include Apache Flink in your project. These dependencies include a local execution environment and thus support local testing.

Scala API: To use the Scala API, replace the flink-java artifact id with flink-scala_2.11 and flink-streaming-java_2.11 with flink-streaming-scala_2.11.
<dependency>
  <groupId>org.apache.flink</groupId>
  <artifactId>flink-java</artifactId>
  <version>1.8.2</version>
</dependency>
<dependency>
  <groupId>org.apache.flink</groupId>
  <artifactId>flink-streaming-java_2.11</artifactId>
  <version>1.8.2</version>
</dependency>
<dependency>
  <groupId>org.apache.flink</groupId>
  <artifactId>flink-clients_2.11</artifactId>
  <version>1.8.2</version>
</dependency>

批处理wordcount示例

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

/**
 * Batch word-count example: reads a text file, counts how often each
 * comma-separated token occurs, and writes the totals to a single output file.
 */
public class WordCount {

    /**
     * Builds and runs the batch job.
     *
     * @param args command-line arguments (not used; the paths are hard-coded below)
     * @throws Exception if the job cannot be built or executed
     */
    public static void main(String[] args) throws Exception {
        final String sourceFile = "E:\\flink\\words.txt";
        final String targetFile = "E:\\flink\\result";

        // Obtain the batch execution environment.
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // Read the input file, one element per line.
        final DataSet<String> lines = env.readTextFile(sourceFile);

        // Turn every line into (word, 1) pairs.
        final DataSet<Tuple2<String, Integer>> pairs = lines.flatMap(new Tokenizer());

        // Group by tuple field 0 (the word) and sum tuple field 1 (the count).
        final DataSet<Tuple2<String, Integer>> counts = pairs.groupBy(0).sum(1);

        // Parallelism 1 on the sink yields one output file; without it the sink
        // runs with multiple parallel writers and produces several files.
        counts.writeAsCsv(targetFile, "\n", " ").setParallelism(1);

        env.execute("Batch WordCount Example");
    }

    /**
     * Splits a line into lower-cased, comma-separated tokens and emits a
     * (token, 1) pair for every non-empty token. The same logic could be passed
     * inline to {@code flatMap()} instead of being declared as a named class.
     */
    public static class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {

        /**
         * @param value one input line
         * @param out   collector receiving a (token, 1) pair per non-empty token
         */
        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            // Normalize case, then split on commas.
            final String[] words = value.toLowerCase().split(",");

            for (int i = 0; i < words.length; i++) {
                final String word = words[i];
                if (word.length() == 0) {
                    // Skip empty tokens, e.g. those produced by consecutive commas.
                    continue;
                }
                // Wrap the word and its initial count in a Tuple2.
                final Tuple2<String, Integer> pair = new Tuple2<>(word, 1);
                out.collect(pair);
            }
        }
    }
}

流式处理wordcount示例

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

/**
 *  滑动窗口计算
 * 通过socket模拟产生单词数据
 * flink对数据进行统计计算
 */
public class SocketWindowWordCount {

    public static void main(String[] args) throws Exception {
        //获取socket的端口号
        int port;
        try {
            ParameterTool parameterTool = ParameterTool.fromArgs(args);
            port = parameterTool.getInt("port");
        }catch (Exception e){
            System.out.println("No port set. use default port 9000");
            port = 9999;
        }

        //获取运行环境
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        String hostname = "master01.hadoop.mobile.cn";
        String delimiter = "\n";
        DataStreamSource<String> text  = env.socketTextStream(hostname, port, delimiter);
        //跟spark一样，使用flatmap算子来操作
        //输入数据为string类型，输出为自定义的WordWithCount类型对象
        DataStream<WordWithCount> windowCounts = text.flatMap(new FlatMapFunction<String, WordWithCount>() {
            public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
                String[] splits = value.split(" ");
                for (String word : splits) {
                    out.collect(new WordWithCount(word, 1L));
                }
            }
        }).keyBy("word")
                .timeWindow(Time.seconds(10), Time.seconds(5))//指定时间窗口大小为10秒，指定时间间隔为5秒
                //每隔1秒统计前2秒的数据
                .sum("count");

        //把数据打印到控制台并且设置并行度
        windowCounts.print().setParallelism(1);
        System.out.println(System.currentTimeMillis());
        env.execute("Socket window count");
    }

    public static class WordWithCount{
        public String word;
        public long count;
        public  WordWithCount(){}
        public WordWithCount(String word,long count){
            this.word = word;
            this.count = count;
        }
        @Override
        public String toString() {
            return "WordWithCount{" +
                    "word='" + word + '\'' +
                    ", count=" + count +
                    '}';
        }
    }

}

关于keyby算子：

    /**
     * Partitions the operator state of a {@link DataStream} using field expressions.
     * A field expression is either the name of a public field or a getter method with parentheses
     * of the {@link DataStream}'s underlying type. A dot can be used to drill
     * down into objects, as in {@code "field1.getInnerField2()" }.
     *
     * <p>Tutorial note: keyBy is used for grouping. It takes a varargs parameter, so the key
     * may consist of one or more fields. A key field can be referenced directly by its name,
     * but the field must then be public; otherwise the program fails with:
     * <pre>
     * Exception in thread "main" org.apache.flink.api.common.InvalidProgramException: This type (GenericType&lt;SocketWindowWordCount.WordWithCount&gt;) cannot be used as key.
     *    at org.apache.flink.api.common.operators.Keys$ExpressionKeys.&lt;init&gt;(Keys.java:330)
     *    at org.apache.flink.streaming.api.datastream.DataStream.keyBy(DataStream.java:337)
     *    at SocketWindowWordCount.main(SocketWindowWordCount.java:41)
     * </pre>
     * Alternatively, the key can be referenced through a getter method.
     *
     * @param fields
     *            One or more field expressions on which the state of the {@link DataStream} operators will be
     *            partitioned.
     * @return The {@link DataStream} with partitioned state (i.e. KeyedStream)
     */
    public KeyedStream<T, Tuple> keyBy(String... fields) {
        // Resolve the field expressions against this stream's type, then delegate.
        Keys.ExpressionKeys<T> expressionKeys = new Keys.ExpressionKeys<>(fields, getType());
        return keyBy(expressionKeys);
    }

第一个flink application

导入maven依赖

批处理wordcount示例

流式处理wordcount示例

猜你喜欢