Search in sources :

Example 1 with SourceFunction

use of org.apache.flink.streaming.api.functions.source.SourceFunction in project flink by apache.

the class KafkaProducerTestBase method runCustomPartitioningTest.

/**
	 * 
	 * <pre>
	 *             +------> (sink) --+--> [KAFKA-1] --> (source) -> (map) --+
	 *            /                  |                                       \
	 *           /                   |                                        \
	 * (source) ----------> (sink) --+--> [KAFKA-2] --> (source) -> (map) -----+-> (sink)
	 *           \                   |                                        /
	 *            \                  |                                       /
	 *             +------> (sink) --+--> [KAFKA-3] --> (source) -> (map) --+
	 * </pre>
	 * 
	 * The mapper validates that the values come consistently from the correct Kafka partition.
	 * 
	 * The final sink validates that there are no duplicates and that all partitions are present.
	 */
public void runCustomPartitioningTest() {
    try {
        LOG.info("Starting KafkaProducerITCase.testCustomPartitioning()");
        final String topic = "customPartitioningTestTopic";
        final int parallelism = 3;
        createTestTopic(topic, parallelism, 1);
        TypeInformation<Tuple2<Long, String>> longStringInfo = TypeInfoParser.parse("Tuple2<Long, String>");
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
        env.setRestartStrategy(RestartStrategies.noRestart());
        env.getConfig().disableSysoutLogging();
        TypeInformationSerializationSchema<Tuple2<Long, String>> serSchema = new TypeInformationSerializationSchema<>(longStringInfo, env.getConfig());
        TypeInformationSerializationSchema<Tuple2<Long, String>> deserSchema = new TypeInformationSerializationSchema<>(longStringInfo, env.getConfig());
        // ------ producing topology ---------
        // source has DOP 1 to make sure it generates no duplicates
        DataStream<Tuple2<Long, String>> stream = env.addSource(new SourceFunction<Tuple2<Long, String>>() {

            private boolean running = true;

            @Override
            public void run(SourceContext<Tuple2<Long, String>> ctx) throws Exception {
                long cnt = 0;
                while (running) {
                    ctx.collect(new Tuple2<Long, String>(cnt, "kafka-" + cnt));
                    cnt++;
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        }).setParallelism(1);
        Properties props = new Properties();
        props.putAll(FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings));
        props.putAll(secureProps);
        // sink partitions into 
        kafkaServer.produceIntoKafka(stream, topic, new KeyedSerializationSchemaWrapper<>(serSchema), props, new CustomPartitioner(parallelism)).setParallelism(parallelism);
        // ------ consuming topology ---------
        Properties consumerProps = new Properties();
        consumerProps.putAll(standardProps);
        consumerProps.putAll(secureProps);
        FlinkKafkaConsumerBase<Tuple2<Long, String>> source = kafkaServer.getConsumer(topic, deserSchema, consumerProps);
        env.addSource(source).setParallelism(parallelism).map(new RichMapFunction<Tuple2<Long, String>, Integer>() {

            private int ourPartition = -1;

            @Override
            public Integer map(Tuple2<Long, String> value) {
                int partition = value.f0.intValue() % parallelism;
                if (ourPartition != -1) {
                    assertEquals("inconsistent partitioning", ourPartition, partition);
                } else {
                    ourPartition = partition;
                }
                return partition;
            }
        }).setParallelism(parallelism).addSink(new SinkFunction<Integer>() {

            private int[] valuesPerPartition = new int[parallelism];

            @Override
            public void invoke(Integer value) throws Exception {
                valuesPerPartition[value]++;
                boolean missing = false;
                for (int i : valuesPerPartition) {
                    if (i < 100) {
                        missing = true;
                        break;
                    }
                }
                if (!missing) {
                    throw new SuccessException();
                }
            }
        }).setParallelism(1);
        tryExecute(env, "custom partitioning test");
        deleteTestTopic(topic);
        LOG.info("Finished KafkaProducerITCase.testCustomPartitioning()");
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : SourceFunction(org.apache.flink.streaming.api.functions.source.SourceFunction) KeyedSerializationSchemaWrapper(org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper) Properties(java.util.Properties) SuccessException(org.apache.flink.test.util.SuccessException) TypeInformationSerializationSchema(org.apache.flink.streaming.util.serialization.TypeInformationSerializationSchema) SinkFunction(org.apache.flink.streaming.api.functions.sink.SinkFunction) Tuple2(org.apache.flink.api.java.tuple.Tuple2) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) SuccessException(org.apache.flink.test.util.SuccessException) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)

Example 2 with SourceFunction

use of org.apache.flink.streaming.api.functions.source.SourceFunction in project flink by apache.

the class SourceFunctionUtil method runSourceFunction.

public static <T extends Serializable> List<T> runSourceFunction(SourceFunction<T> sourceFunction) throws Exception {
    final List<T> outputs = new ArrayList<T>();
    if (sourceFunction instanceof RichFunction) {
        AbstractStreamOperator<?> operator = mock(AbstractStreamOperator.class);
        when(operator.getExecutionConfig()).thenReturn(new ExecutionConfig());
        RuntimeContext runtimeContext = new StreamingRuntimeContext(operator, new MockEnvironment("MockTask", 3 * 1024 * 1024, new MockInputSplitProvider(), 1024), new HashMap<String, Accumulator<?, ?>>());
        ((RichFunction) sourceFunction).setRuntimeContext(runtimeContext);
        ((RichFunction) sourceFunction).open(new Configuration());
    }
    try {
        SourceFunction.SourceContext<T> ctx = new CollectingSourceContext<T>(new Object(), outputs);
        sourceFunction.run(ctx);
    } catch (Exception e) {
        throw new RuntimeException("Cannot invoke source.", e);
    }
    return outputs;
}
Also used : Accumulator(org.apache.flink.api.common.accumulators.Accumulator) SourceFunction(org.apache.flink.streaming.api.functions.source.SourceFunction) StreamingRuntimeContext(org.apache.flink.streaming.api.operators.StreamingRuntimeContext) Configuration(org.apache.flink.configuration.Configuration) RichFunction(org.apache.flink.api.common.functions.RichFunction) ArrayList(java.util.ArrayList) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) MockEnvironment(org.apache.flink.runtime.operators.testutils.MockEnvironment) RuntimeContext(org.apache.flink.api.common.functions.RuntimeContext) StreamingRuntimeContext(org.apache.flink.streaming.api.operators.StreamingRuntimeContext) MockInputSplitProvider(org.apache.flink.runtime.operators.testutils.MockInputSplitProvider)

Example 3 with SourceFunction

use of org.apache.flink.streaming.api.functions.source.SourceFunction in project flink by apache.

the class WindowFoldITCase method testFoldProcessWindow.

@Test
public void testFoldProcessWindow() throws Exception {
    testResults = new ArrayList<>();
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    env.setParallelism(1);
    DataStream<Tuple2<String, Integer>> source1 = env.addSource(new SourceFunction<Tuple2<String, Integer>>() {

        private static final long serialVersionUID = 1L;

        @Override
        public void run(SourceContext<Tuple2<String, Integer>> ctx) throws Exception {
            ctx.collect(Tuple2.of("a", 0));
            ctx.collect(Tuple2.of("a", 1));
            ctx.collect(Tuple2.of("a", 2));
            ctx.collect(Tuple2.of("b", 3));
            ctx.collect(Tuple2.of("b", 4));
            ctx.collect(Tuple2.of("b", 5));
            ctx.collect(Tuple2.of("a", 6));
            ctx.collect(Tuple2.of("a", 7));
            ctx.collect(Tuple2.of("a", 8));
        // source is finite, so it will have an implicit MAX watermark when it finishes
        }

        @Override
        public void cancel() {
        }
    }).assignTimestampsAndWatermarks(new Tuple2TimestampExtractor());
    source1.keyBy(0).window(TumblingEventTimeWindows.of(Time.of(3, TimeUnit.MILLISECONDS))).fold(Tuple2.of(0, "R:"), new FoldFunction<Tuple2<String, Integer>, Tuple2<Integer, String>>() {

        @Override
        public Tuple2<Integer, String> fold(Tuple2<Integer, String> accumulator, Tuple2<String, Integer> value) throws Exception {
            accumulator.f1 += value.f0;
            accumulator.f0 += value.f1;
            return accumulator;
        }
    }, new ProcessWindowFunction<Tuple2<Integer, String>, Tuple3<String, Integer, Integer>, Tuple, TimeWindow>() {

        @Override
        public void process(Tuple tuple, Context context, Iterable<Tuple2<Integer, String>> elements, Collector<Tuple3<String, Integer, Integer>> out) throws Exception {
            int i = 0;
            for (Tuple2<Integer, String> in : elements) {
                out.collect(new Tuple3<>(in.f1, in.f0, i++));
            }
        }
    }).addSink(new SinkFunction<Tuple3<String, Integer, Integer>>() {

        @Override
        public void invoke(Tuple3<String, Integer, Integer> value) throws Exception {
            testResults.add(value.toString());
        }
    });
    env.execute("Fold Process Window Test");
    List<String> expectedResult = Arrays.asList("(R:aaa,3,0)", "(R:aaa,21,0)", "(R:bbb,12,0)");
    Collections.sort(expectedResult);
    Collections.sort(testResults);
    Assert.assertEquals(expectedResult, testResults);
}
Also used : SourceFunction(org.apache.flink.streaming.api.functions.source.SourceFunction) FoldFunction(org.apache.flink.api.common.functions.FoldFunction) ProcessWindowFunction(org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Collector(org.apache.flink.util.Collector) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Tuple(org.apache.flink.api.java.tuple.Tuple) Test(org.junit.Test)

Example 4 with SourceFunction

use of org.apache.flink.streaming.api.functions.source.SourceFunction in project flink by apache.

the class WindowFoldITCase method testFoldProcessAllWindow.

@Test
public void testFoldProcessAllWindow() throws Exception {
    testResults = new ArrayList<>();
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    env.setParallelism(1);
    DataStream<Tuple2<String, Integer>> source1 = env.addSource(new SourceFunction<Tuple2<String, Integer>>() {

        private static final long serialVersionUID = 1L;

        @Override
        public void run(SourceContext<Tuple2<String, Integer>> ctx) throws Exception {
            ctx.collect(Tuple2.of("a", 0));
            ctx.collect(Tuple2.of("a", 1));
            ctx.collect(Tuple2.of("a", 2));
            ctx.collect(Tuple2.of("b", 3));
            ctx.collect(Tuple2.of("b", 4));
            ctx.collect(Tuple2.of("b", 5));
            ctx.collect(Tuple2.of("a", 6));
            ctx.collect(Tuple2.of("a", 7));
            ctx.collect(Tuple2.of("a", 8));
        // source is finite, so it will have an implicit MAX watermark when it finishes
        }

        @Override
        public void cancel() {
        }
    }).assignTimestampsAndWatermarks(new Tuple2TimestampExtractor());
    source1.windowAll(TumblingEventTimeWindows.of(Time.of(3, TimeUnit.MILLISECONDS))).fold(Tuple2.of(0, "R:"), new FoldFunction<Tuple2<String, Integer>, Tuple2<Integer, String>>() {

        @Override
        public Tuple2<Integer, String> fold(Tuple2<Integer, String> accumulator, Tuple2<String, Integer> value) throws Exception {
            accumulator.f1 += value.f0;
            accumulator.f0 += value.f1;
            return accumulator;
        }
    }, new ProcessAllWindowFunction<Tuple2<Integer, String>, Tuple3<String, Integer, Integer>, TimeWindow>() {

        @Override
        public void process(Context context, Iterable<Tuple2<Integer, String>> elements, Collector<Tuple3<String, Integer, Integer>> out) throws Exception {
            int i = 0;
            for (Tuple2<Integer, String> in : elements) {
                out.collect(new Tuple3<>(in.f1, in.f0, i++));
            }
        }
    }).addSink(new SinkFunction<Tuple3<String, Integer, Integer>>() {

        @Override
        public void invoke(Tuple3<String, Integer, Integer> value) throws Exception {
            testResults.add(value.toString());
        }
    });
    env.execute("Fold Process Window Test");
    List<String> expectedResult = Arrays.asList("(R:aaa,3,0)", "(R:aaa,21,0)", "(R:bbb,12,0)");
    Collections.sort(expectedResult);
    Collections.sort(testResults);
    Assert.assertEquals(expectedResult, testResults);
}
Also used : SourceFunction(org.apache.flink.streaming.api.functions.source.SourceFunction) FoldFunction(org.apache.flink.api.common.functions.FoldFunction) ProcessAllWindowFunction(org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Collector(org.apache.flink.util.Collector) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Test(org.junit.Test)

Example 5 with SourceFunction

use of org.apache.flink.streaming.api.functions.source.SourceFunction in project flink by apache.

the class StreamExecutionEnvironment method addSource.

/**
	 * Ads a data source with a custom type information thus opening a
	 * {@link DataStream}. Only in very special cases does the user need to
	 * support type information. Otherwise use
	 * {@link #addSource(org.apache.flink.streaming.api.functions.source.SourceFunction)}
	 *
	 * @param function
	 * 		the user defined function
	 * @param sourceName
	 * 		Name of the data source
	 * @param <OUT>
	 * 		type of the returned stream
	 * @param typeInfo
	 * 		the user defined type information for the stream
	 * @return the data stream constructed
	 */
@SuppressWarnings("unchecked")
public <OUT> DataStreamSource<OUT> addSource(SourceFunction<OUT> function, String sourceName, TypeInformation<OUT> typeInfo) {
    if (typeInfo == null) {
        if (function instanceof ResultTypeQueryable) {
            typeInfo = ((ResultTypeQueryable<OUT>) function).getProducedType();
        } else {
            try {
                typeInfo = TypeExtractor.createTypeInfo(SourceFunction.class, function.getClass(), 0, null, null);
            } catch (final InvalidTypesException e) {
                typeInfo = (TypeInformation<OUT>) new MissingTypeInfo(sourceName, e);
            }
        }
    }
    boolean isParallel = function instanceof ParallelSourceFunction;
    clean(function);
    StreamSource<OUT, ?> sourceOperator;
    if (function instanceof StoppableFunction) {
        sourceOperator = new StoppableStreamSource<>(cast2StoppableSourceFunction(function));
    } else {
        sourceOperator = new StreamSource<>(function);
    }
    return new DataStreamSource<>(this, typeInfo, sourceOperator, isParallel, sourceName);
}
Also used : ParallelSourceFunction(org.apache.flink.streaming.api.functions.source.ParallelSourceFunction) SourceFunction(org.apache.flink.streaming.api.functions.source.SourceFunction) InputFormatSourceFunction(org.apache.flink.streaming.api.functions.source.InputFormatSourceFunction) MissingTypeInfo(org.apache.flink.api.java.typeutils.MissingTypeInfo) ResultTypeQueryable(org.apache.flink.api.java.typeutils.ResultTypeQueryable) DataStreamSource(org.apache.flink.streaming.api.datastream.DataStreamSource) StoppableFunction(org.apache.flink.api.common.functions.StoppableFunction) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) ParallelSourceFunction(org.apache.flink.streaming.api.functions.source.ParallelSourceFunction) InvalidTypesException(org.apache.flink.api.common.functions.InvalidTypesException)

Aggregations

SourceFunction (org.apache.flink.streaming.api.functions.source.SourceFunction)21 StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)19 Test (org.junit.Test)15 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)7 ArrayList (java.util.ArrayList)5 FoldFunction (org.apache.flink.api.common.functions.FoldFunction)4 StreamGraph (org.apache.flink.streaming.api.graph.StreamGraph)4 Tuple3 (org.apache.flink.api.java.tuple.Tuple3)3 Collector (org.apache.flink.util.Collector)3 HashMap (java.util.HashMap)2 Map (java.util.Map)2 Properties (java.util.Properties)2 CountDownLatch (java.util.concurrent.CountDownLatch)2 JoinFunction (org.apache.flink.api.common.functions.JoinFunction)2 RichMapFunction (org.apache.flink.api.common.functions.RichMapFunction)2 SinkFunction (org.apache.flink.streaming.api.functions.sink.SinkFunction)2 SourceTransformation (org.apache.flink.streaming.api.transformations.SourceTransformation)2 StreamTransformation (org.apache.flink.streaming.api.transformations.StreamTransformation)2 TimeWindow (org.apache.flink.streaming.api.windowing.windows.TimeWindow)2 AccumulatingProcessingTimeWindowOperator (org.apache.flink.streaming.runtime.operators.windowing.AccumulatingProcessingTimeWindowOperator)2