
Example 1 with FlatMapFunction

Use of org.apache.flink.api.common.functions.FlatMapFunction in project flink by apache.

From class KafkaConsumerTestBase, method runProduceConsumeMultipleTopics.

/**
 * Test producing into and consuming from multiple topics.
 *
 * @throws Exception
 */
public void runProduceConsumeMultipleTopics() throws Exception {
    final int NUM_TOPICS = 5;
    final int NUM_ELEMENTS = 20;
    StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env.getConfig().disableSysoutLogging();
    // create topics with content
    final List<String> topics = new ArrayList<>();
    for (int i = 0; i < NUM_TOPICS; i++) {
        final String topic = "topic-" + i;
        topics.add(topic);
        // create topic
        createTestTopic(topic, i + 1 /* partitions */, 1);
    }
    // run first job, producing into all topics
    DataStream<Tuple3<Integer, Integer, String>> stream = env.addSource(new RichParallelSourceFunction<Tuple3<Integer, Integer, String>>() {

        @Override
        public void run(SourceContext<Tuple3<Integer, Integer, String>> ctx) throws Exception {
            int partition = getRuntimeContext().getIndexOfThisSubtask();
            for (int topicId = 0; topicId < NUM_TOPICS; topicId++) {
                for (int i = 0; i < NUM_ELEMENTS; i++) {
                    ctx.collect(new Tuple3<>(partition, i, "topic-" + topicId));
                }
            }
        }

        @Override
        public void cancel() {
        }
    });
    Tuple2WithTopicSchema schema = new Tuple2WithTopicSchema(env.getConfig());
    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    kafkaServer.produceIntoKafka(stream, "dummy", schema, props, null);
    env.execute("Write to topics");
    // run second job consuming from multiple topics
    env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env.getConfig().disableSysoutLogging();
    stream = env.addSource(kafkaServer.getConsumer(topics, schema, props));
    stream.flatMap(new FlatMapFunction<Tuple3<Integer, Integer, String>, Integer>() {

        Map<String, Integer> countPerTopic = new HashMap<>(NUM_TOPICS);

        @Override
        public void flatMap(Tuple3<Integer, Integer, String> value, Collector<Integer> out) throws Exception {
            Integer count = countPerTopic.get(value.f2);
            if (count == null) {
                count = 1;
            } else {
                count++;
            }
            countPerTopic.put(value.f2, count);
            // check map: declare success only once every one of the NUM_TOPICS topics
            // has been seen exactly NUM_ELEMENTS times
            if (countPerTopic.size() < NUM_TOPICS) {
                // not all topics seen yet
                return;
            }
            for (Map.Entry<String, Integer> el : countPerTopic.entrySet()) {
                if (el.getValue() < NUM_ELEMENTS) {
                    // not enough from this topic yet
                    return;
                }
                if (el.getValue() > NUM_ELEMENTS) {
                    throw new RuntimeException("There is a failure in the test. I've read " + el.getValue() + " from topic " + el.getKey());
                }
            }
            // we've seen NUM_ELEMENTS messages from every topic
            throw new SuccessException();
        }
    }).setParallelism(1);
    tryExecute(env, "Count elements from the topics");
    // delete all topics again
    for (int i = 0; i < NUM_TOPICS; i++) {
        final String topic = "topic-" + i;
        deleteTestTopic(topic);
    }
}
Also used : ArrayList(java.util.ArrayList) Properties(java.util.Properties) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) RetryOnException(org.apache.flink.testutils.junit.RetryOnException) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) SuccessException(org.apache.flink.test.util.SuccessException) NoResourceAvailableException(org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) TimeoutException(org.apache.kafka.common.errors.TimeoutException) JobCancellationException(org.apache.flink.runtime.client.JobCancellationException) IOException(java.io.IOException) Tuple3(org.apache.flink.api.java.tuple.Tuple3) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) RichFlatMapFunction(org.apache.flink.api.common.functions.RichFlatMapFunction) Collector(org.apache.flink.util.Collector) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Map(java.util.Map) HashMap(java.util.HashMap)
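
For reference, the FlatMapFunction contract this test relies on can be shown without the Kafka test harness: flatMap receives one input record and may emit zero, one, or many records through the Collector. Below is a minimal, self-contained sketch; the class name and input values are illustrative and not part of the test above.

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class FlatMapSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStream<String> lines = env.fromElements("to be", "or not to be");
        // one input line fans out into zero or more words
        DataStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {

            @Override
            public void flatMap(String line, Collector<String> out) {
                for (String word : line.split(" ")) {
                    if (!word.isEmpty()) {
                        out.collect(word);
                    }
                }
            }
        });
        words.print();
        env.execute("FlatMapFunction sketch");
    }
}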

Example 2 with FlatMapFunction

Use of org.apache.flink.api.common.functions.FlatMapFunction in project flink by apache.

From class KafkaConsumerTestBase, method runProduceConsumeMultipleTopics.

/**
 * Test producing into and consuming from multiple topics.
 *
 * @throws Exception
 */
public void runProduceConsumeMultipleTopics(boolean useLegacySchema) throws Exception {
    final String topicNamePrefix = "runProduceConsumeMultipleTopics-" + (useLegacySchema ? "legacy" : "");
    final int numTopics = 5;
    final int numElements = 20;
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // create topics with content
    final List<String> topics = new ArrayList<>();
    for (int i = 0; i < numTopics; i++) {
        final String topic = topicNamePrefix + i;
        topics.add(topic);
        // create topic
        createTestTopic(topic, i + 1 /* partitions */, 1);
    }
    // before FLINK-6078 the RemoteExecutionEnvironment set the parallelism to 1 as well
    env.setParallelism(1);
    // run first job, producing into all topics
    DataStream<Tuple3<Integer, Integer, String>> stream = env.addSource(new RichParallelSourceFunction<Tuple3<Integer, Integer, String>>() {

        @Override
        public void run(SourceContext<Tuple3<Integer, Integer, String>> ctx) throws Exception {
            int partition = getRuntimeContext().getIndexOfThisSubtask();
            for (int topicId = 0; topicId < numTopics; topicId++) {
                for (int i = 0; i < numElements; i++) {
                    ctx.collect(new Tuple3<>(partition, i, topicNamePrefix + topicId));
                }
            }
        }

        @Override
        public void cancel() {
        }
    });
    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    if (useLegacySchema) {
        Tuple2WithTopicSchema schema = new Tuple2WithTopicSchema(env.getConfig());
        kafkaServer.produceIntoKafka(stream, "dummy", schema, props, null);
    } else {
        TestDeserializer schema = new TestDeserializer(env.getConfig());
        kafkaServer.produceIntoKafka(stream, "dummy", schema, props);
    }
    env.execute("Write to topics");
    // run second job consuming from multiple topics
    env = StreamExecutionEnvironment.getExecutionEnvironment();
    if (useLegacySchema) {
        Tuple2WithTopicSchema schema = new Tuple2WithTopicSchema(env.getConfig());
        stream = getStream(env, topics, schema, props);
    } else {
        TestDeserializer schema = new TestDeserializer(env.getConfig());
        stream = getStream(env, topics, schema, props);
    }
    stream.flatMap(new FlatMapFunction<Tuple3<Integer, Integer, String>, Integer>() {

        Map<String, Integer> countPerTopic = new HashMap<>(numTopics);

        @Override
        public void flatMap(Tuple3<Integer, Integer, String> value, Collector<Integer> out) throws Exception {
            Integer count = countPerTopic.get(value.f2);
            if (count == null) {
                count = 1;
            } else {
                count++;
            }
            countPerTopic.put(value.f2, count);
            // check map: declare success only once every one of the numTopics topics
            // has been seen exactly numElements times
            if (countPerTopic.size() < numTopics) {
                // not all topics seen yet
                return;
            }
            for (Map.Entry<String, Integer> el : countPerTopic.entrySet()) {
                if (el.getValue() < numElements) {
                    // not enough from this topic yet
                    return;
                }
                if (el.getValue() > numElements) {
                    throw new RuntimeException("There is a failure in the test. I've read " + el.getValue() + " from topic " + el.getKey());
                }
            }
            // we've seen numElements messages from every topic
            throw new SuccessException();
        }
    }).setParallelism(1);
    tryExecute(env, "Count elements from the topics");
    // delete all topics again
    for (int i = 0; i < numTopics; i++) {
        final String topic = topicNamePrefix + i;
        deleteTestTopic(topic);
    }
}
Also used : ArrayList(java.util.ArrayList) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) Properties(java.util.Properties) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) RetryOnException(org.apache.flink.testutils.junit.RetryOnException) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) IOException(java.io.IOException) NotLeaderForPartitionException(org.apache.kafka.common.errors.NotLeaderForPartitionException) SuccessException(org.apache.flink.test.util.SuccessException) TimeoutException(org.apache.kafka.common.errors.TimeoutException) JobCancellationException(org.apache.flink.runtime.client.JobCancellationException) Tuple3(org.apache.flink.api.java.tuple.Tuple3) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) RichFlatMapFunction(org.apache.flink.api.common.functions.RichFlatMapFunction) Collector(org.apache.flink.util.Collector) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Map(java.util.Map) HashMap(java.util.HashMap)
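
The anonymous classes above can also be written as lambdas. Because a lambda's generic signature is erased at compile time, the result type then has to be declared explicitly with returns(...). A minimal sketch of the lambda form, with illustrative names and input values:

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class LambdaFlatMapSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStream<String> lines = env.fromElements("a b", "c d e");
        // the Collector's element type is erased from the lambda, so it is supplied via returns(...)
        DataStream<String> words = lines
                .flatMap((String line, Collector<String> out) -> {
                    for (String word : line.split(" ")) {
                        out.collect(word);
                    }
                })
                .returns(Types.STRING);
        words.print();
        env.execute("Lambda FlatMapFunction sketch");
    }
}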

Example 3 with FlatMapFunction

Use of org.apache.flink.api.common.functions.FlatMapFunction in project flink by apache.

From class DataSetAllroundTestProgram, method main.

@SuppressWarnings("Convert2Lambda")
public static void main(String[] args) throws Exception {
    // get parameters
    ParameterTool params = ParameterTool.fromArgs(args);
    int loadFactor = Integer.parseInt(params.getRequired("loadFactor"));
    String outputPath = params.getRequired("outputPath");
    boolean infinite = params.getBoolean("infinite", false);
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    int numKeys = loadFactor * 128 * 1024;
    DataSet<Tuple2<String, Integer>> x1Keys;
    DataSet<Tuple2<String, Integer>> x2Keys = env.createInput(Generator.generate(numKeys * 32, 2)).setParallelism(4);
    DataSet<Tuple2<String, Integer>> x8Keys = env.createInput(Generator.generate(numKeys, 8)).setParallelism(4);
    if (infinite) {
        x1Keys = env.createInput(Generator.generateInfinitely(numKeys)).setParallelism(4);
    } else {
        x1Keys = env.createInput(Generator.generate(numKeys, 1)).setParallelism(4);
    }
    DataSet<Tuple2<String, Integer>> joined = x2Keys
            .map(x -> Tuple4.of("0-0", 0L, 1, x.f0))
            .returns(Types.TUPLE(Types.STRING, Types.LONG, Types.INT, Types.STRING))
            .join(x8Keys)
            .where(3)
            .equalTo(0)
            .with((l, r) -> Tuple2.of(l.f3, 1))
            .returns(Types.TUPLE(Types.STRING, Types.INT))
            .groupBy(new KeySelector<Tuple2<String, Integer>, String>() {

        @Override
        public String getKey(Tuple2<String, Integer> value) {
            return value.f0;
        }
    }).reduce((value1, value2) -> Tuple2.of(value1.f0, value1.f1 + value2.f1));
    // co-group two datasets on their primary keys.
    // we filter both inputs such that only 6.25% of the keys overlap.
    // result: (key, cnt), #keys records with unique keys, cnt = (6.25%: 2, 93.75%: 1)
    DataSet<Tuple2<String, Integer>> coGrouped = x1Keys
            .filter(x -> x.f1 > 59)
            .coGroup(x1Keys.filter(x -> x.f1 < 68))
            .where("f0")
            .equalTo("f0")
            .with((CoGroupFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, Tuple2<String, Integer>>) (l, r, out) -> {
        int cnt = 0;
        String key = "";
        for (Tuple2<String, Integer> t : l) {
            cnt++;
            key = t.f0;
        }
        for (Tuple2<String, Integer> t : r) {
            cnt++;
            key = t.f0;
        }
        out.collect(Tuple2.of(key, cnt));
    }).returns(Types.TUPLE(Types.STRING, Types.INT));
    // join datasets on keys (1-1 join) and replicate by 16 (previously computed count)
    // result: (key, cnt), 16 * #keys records, all keys preserved, cnt = (6.25%: 2, 93.75%: 1)
    DataSet<Tuple2<String, Integer>> joined2 = joined
            .join(coGrouped, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
            .where(0)
            .equalTo("f0")
            .flatMap((FlatMapFunction<Tuple2<Tuple2<String, Integer>, Tuple2<String, Integer>>, Tuple2<String, Integer>>) (p, out) -> {
        for (int i = 0; i < p.f0.f1; i++) {
            out.collect(Tuple2.of(p.f0.f0, p.f1.f1));
        }
    }).returns(Types.TUPLE(Types.STRING, Types.INT));
    // iteration. double the count field until all counts are at 32 or more
    // result: (key, cnt), 16 * #keys records, all keys preserved, cnt = (6.25%: 64, 93.75%: 32)
    IterativeDataSet<Tuple2<String, Integer>> initial = joined2.iterate(16);
    DataSet<Tuple2<String, Integer>> iteration = initial.map(x -> Tuple2.of(x.f0, x.f1 * 2)).returns(Types.TUPLE(Types.STRING, Types.INT));
    DataSet<Boolean> termination = iteration.flatMap((FlatMapFunction<Tuple2<String, Integer>, Boolean>) (x, out) -> {
        if (x.f1 < 32) {
            out.collect(false);
        }
    }).returns(Types.BOOLEAN);
    DataSet<Tuple2<Integer, Integer>> result = initial
            .closeWith(iteration, termination)
            .groupBy(1)
            .reduceGroup((GroupReduceFunction<Tuple2<String, Integer>, Tuple2<Integer, Integer>>) (g, out) -> {
        int key = 0;
        int cnt = 0;
        for (Tuple2<String, Integer> r : g) {
            key = r.f1;
            cnt++;
        }
        out.collect(Tuple2.of(key, cnt));
    }).returns(Types.TUPLE(Types.INT, Types.INT)).map(x -> Tuple2.of(x.f0, x.f1 / (loadFactor * 128))).returns(Types.TUPLE(Types.INT, Types.INT));
    // sort and emit result
    result.sortPartition(0, Order.ASCENDING).setParallelism(1).writeAsText(outputPath, FileSystem.WriteMode.OVERWRITE).setParallelism(1);
    env.execute();
}
Also used : ParameterTool(org.apache.flink.api.java.utils.ParameterTool) Types(org.apache.flink.api.common.typeinfo.Types) KeySelector(org.apache.flink.api.java.functions.KeySelector) JoinOperatorBase(org.apache.flink.api.common.operators.base.JoinOperatorBase) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple4(org.apache.flink.api.java.tuple.Tuple4) GroupReduceFunction(org.apache.flink.api.common.functions.GroupReduceFunction) IterativeDataSet(org.apache.flink.api.java.operators.IterativeDataSet) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) CoGroupFunction(org.apache.flink.api.common.functions.CoGroupFunction) DataSet(org.apache.flink.api.java.DataSet) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) FileSystem(org.apache.flink.core.fs.FileSystem) Order(org.apache.flink.api.common.operators.Order)
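
In this program the FlatMapFunction that emits Booleans acts as the termination criterion of the bulk iteration: the loop keeps running as long as that DataSet is non-empty. A stripped-down sketch of the same mechanism, with illustrative numbers and names:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.IterativeDataSet;

public class IterationTerminationSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // start values: 1, 2, 3; run at most 100 supersteps
        IterativeDataSet<Long> loop = env.fromElements(1L, 2L, 3L).iterate(100);
        // step function: double every value
        DataSet<Long> doubled = loop.map(x -> x * 2).returns(Types.LONG);
        // termination criterion: elements still below 1000; the iteration stops once this DataSet is empty
        DataSet<Long> stillSmall = doubled
                .flatMap((FlatMapFunction<Long, Long>) (x, out) -> {
                    if (x < 1000) {
                        out.collect(x);
                    }
                })
                .returns(Types.LONG);
        DataSet<Long> result = loop.closeWith(doubled, stillSmall);
        result.print();
    }
}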

Example 4 with FlatMapFunction

Use of org.apache.flink.api.common.functions.FlatMapFunction in project flink by apache.

From class CommonExecLookupJoin, method createAsyncLookupJoin.

@SuppressWarnings("unchecked")
private StreamOperatorFactory<RowData> createAsyncLookupJoin(RelOptTable temporalTable, ExecNodeConfig config, Map<Integer, LookupJoinUtil.LookupKey> allLookupKeys, AsyncTableFunction<Object> asyncLookupFunction, RelBuilder relBuilder, RowType inputRowType, RowType tableSourceRowType, RowType resultRowType, boolean isLeftOuterJoin) {
    int asyncBufferCapacity = config.get(ExecutionConfigOptions.TABLE_EXEC_ASYNC_LOOKUP_BUFFER_CAPACITY);
    long asyncTimeout = config.get(ExecutionConfigOptions.TABLE_EXEC_ASYNC_LOOKUP_TIMEOUT).toMillis();
    DataTypeFactory dataTypeFactory = ShortcutUtils.unwrapContext(relBuilder).getCatalogManager().getDataTypeFactory();
    LookupJoinCodeGenerator.GeneratedTableFunctionWithDataType<AsyncFunction<RowData, Object>> generatedFuncWithType = LookupJoinCodeGenerator.generateAsyncLookupFunction(config.getTableConfig(), dataTypeFactory, inputRowType, tableSourceRowType, resultRowType, allLookupKeys, LookupJoinUtil.getOrderedLookupKeys(allLookupKeys.keySet()), asyncLookupFunction, StringUtils.join(temporalTable.getQualifiedName(), "."));
    RowType rightRowType = Optional.ofNullable(temporalTableOutputType).map(FlinkTypeFactory::toLogicalRowType).orElse(tableSourceRowType);
    // a projection or filter after table source scan
    GeneratedResultFuture<TableFunctionResultFuture<RowData>> generatedResultFuture = LookupJoinCodeGenerator.generateTableAsyncCollector(config.getTableConfig(), "TableFunctionResultFuture", inputRowType, rightRowType, JavaScalaConversionUtil.toScala(Optional.ofNullable(joinCondition)));
    DataStructureConverter<?, ?> fetcherConverter = DataStructureConverters.getConverter(generatedFuncWithType.dataType());
    AsyncFunction<RowData, RowData> asyncFunc;
    if (existCalcOnTemporalTable) {
        // a projection or filter after table source scan
        GeneratedFunction<FlatMapFunction<RowData, RowData>> generatedCalc = LookupJoinCodeGenerator.generateCalcMapFunction(config.getTableConfig(), JavaScalaConversionUtil.toScala(projectionOnTemporalTable), filterOnTemporalTable, temporalTableOutputType, tableSourceRowType);
        asyncFunc = new AsyncLookupJoinWithCalcRunner(generatedFuncWithType.tableFunc(), (DataStructureConverter<RowData, Object>) fetcherConverter, generatedCalc, generatedResultFuture, InternalSerializers.create(rightRowType), isLeftOuterJoin, asyncBufferCapacity);
    } else {
        // right type is the same as table source row type, because no calc after temporal table
        asyncFunc = new AsyncLookupJoinRunner(generatedFuncWithType.tableFunc(), (DataStructureConverter<RowData, Object>) fetcherConverter, generatedResultFuture, InternalSerializers.create(rightRowType), isLeftOuterJoin, asyncBufferCapacity);
    }
    // force ORDERED output mode for now; this could be relaxed to UNORDERED when the downstream does not need ordering
    return new AsyncWaitOperatorFactory<>(asyncFunc, asyncTimeout, asyncBufferCapacity, AsyncDataStream.OutputMode.ORDERED);
}
Also used : AsyncLookupJoinRunner(org.apache.flink.table.runtime.operators.join.lookup.AsyncLookupJoinRunner) AsyncWaitOperatorFactory(org.apache.flink.streaming.api.operators.async.AsyncWaitOperatorFactory) DataStructureConverter(org.apache.flink.table.data.conversion.DataStructureConverter) RowType(org.apache.flink.table.types.logical.RowType) DataTypeFactory(org.apache.flink.table.catalog.DataTypeFactory) AsyncLookupJoinWithCalcRunner(org.apache.flink.table.runtime.operators.join.lookup.AsyncLookupJoinWithCalcRunner) AsyncFunction(org.apache.flink.streaming.api.functions.async.AsyncFunction) LookupJoinCodeGenerator(org.apache.flink.table.planner.codegen.LookupJoinCodeGenerator) RowData(org.apache.flink.table.data.RowData) TableFunctionResultFuture(org.apache.flink.table.runtime.collector.TableFunctionResultFuture) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction)
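
Here the FlatMapFunction is not hand-written: generateCalcMapFunction code-generates a function that applies the projection and/or filter ("calc") to the rows coming back from the temporal table before they reach the join runner. Conceptually it plays the same role as a hand-written FlatMapFunction<RowData, RowData>; the sketch below shows that shape with a made-up field layout and predicate, purely for illustration:

import java.util.ArrayList;
import java.util.List;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.util.Collector;

public class CalcFunctionSketch {

    public static void main(String[] args) throws Exception {
        // hypothetical calc: keep rows whose field 0 (INT) is positive and project to (field 0, field 2)
        FlatMapFunction<RowData, RowData> calc = (row, out) -> {
            if (row.getInt(0) > 0) {
                out.collect(GenericRowData.of(row.getInt(0), row.getString(2)));
            }
        };

        RowData input = GenericRowData.of(7, StringData.fromString("dropped"), StringData.fromString("kept"));
        List<RowData> collected = new ArrayList<>();
        calc.flatMap(input, new Collector<RowData>() {

            @Override
            public void collect(RowData record) {
                collected.add(record);
            }

            @Override
            public void close() {
            }
        });
        System.out.println(collected);
    }
}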

Example 5 with FlatMapFunction

Use of org.apache.flink.api.common.functions.FlatMapFunction in project flink by apache.

From class SolutionSetDuplicatesITCase, method testProgram.

@Test
public void testProgram() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Tuple2<Long, Long>> data = env.generateSequence(0, 10).flatMap(new FlatMapFunction<Long, Tuple2<Long, Long>>() {

            @Override
            public void flatMap(Long value, Collector<Tuple2<Long, Long>> out) {
                out.collect(new Tuple2<Long, Long>(value, value));
                out.collect(new Tuple2<Long, Long>(value, value));
                out.collect(new Tuple2<Long, Long>(value, value));
            }
        }).rebalance();
        DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iter = data.iterateDelta(data, 10, 0);
        List<Integer> result = iter.closeWith(iter.getWorkset(), iter.getWorkset()).map(new MapFunction<Tuple2<Long, Long>, Integer>() {

            @Override
            public Integer map(Tuple2<Long, Long> value) {
                return value.f0.intValue();
            }
        }).collect();
        assertEquals(11, result.size());
        Collections.sort(result);
        assertEquals(Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), result);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Tuple2(org.apache.flink.api.java.tuple.Tuple2) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) Collector(org.apache.flink.util.Collector) MapFunction(org.apache.flink.api.common.functions.MapFunction) Test(org.junit.Test)
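
This test exercises a delta iteration, where closeWith takes two arguments: the first is merged into the solution set by key, the second becomes the workset of the next superstep, and the loop stops once the workset is empty or the iteration limit is reached. The test above deliberately passes the same workset for both; a more typical shape, sketched with illustrative values, looks like this:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DeltaIteration;
import org.apache.flink.api.java.tuple.Tuple2;

public class DeltaIterationSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // (key, value) pairs; field 0 is the solution-set key
        DataSet<Tuple2<Long, Long>> initial = env.fromElements(Tuple2.of(1L, 0L), Tuple2.of(2L, 0L), Tuple2.of(3L, 0L));
        // at most 10 supersteps, keyed on field 0
        DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration = initial.iterateDelta(initial, 10, 0);
        // step function: bump the value of every workset element by one
        DataSet<Tuple2<Long, Long>> delta = iteration.getWorkset()
                .flatMap((FlatMapFunction<Tuple2<Long, Long>, Tuple2<Long, Long>>) (t, out) -> out.collect(Tuple2.of(t.f0, t.f1 + 1)))
                .returns(Types.TUPLE(Types.LONG, Types.LONG));
        // only elements whose value is still below 3 stay in the workset
        DataSet<Tuple2<Long, Long>> nextWorkset = delta.filter(t -> t.f1 < 3);
        // first argument updates the solution set by key, second feeds the next superstep
        DataSet<Tuple2<Long, Long>> result = iteration.closeWith(delta, nextWorkset);
        result.print();
    }
}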

Aggregations

FlatMapFunction (org.apache.flink.api.common.functions.FlatMapFunction): 15
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 9
Collector (org.apache.flink.util.Collector): 9
Test (org.junit.Test): 7
IOException (java.io.IOException): 4
RichFlatMapFunction (org.apache.flink.api.common.functions.RichFlatMapFunction): 4
ArrayList (java.util.ArrayList): 3
HashMap (java.util.HashMap): 3
Map (java.util.Map): 3
DataStream (org.apache.flink.streaming.api.datastream.DataStream): 3
Collections (java.util.Collections): 2
List (java.util.List): 2
Properties (java.util.Properties): 2
MapFunction (org.apache.flink.api.common.functions.MapFunction): 2
TypeHint (org.apache.flink.api.common.typeinfo.TypeHint): 2
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 2
KeySelector (org.apache.flink.api.java.functions.KeySelector): 2
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 2
Tuple3 (org.apache.flink.api.java.tuple.Tuple3): 2
ParameterTool (org.apache.flink.api.java.utils.ParameterTool): 2