use of org.apache.flink.api.common.functions.FlatMapFunction in project flink by apache.
the class KafkaConsumerTestBase method runProduceConsumeMultipleTopics.
/**
 * Tests producing into and consuming from multiple topics.
 *
 * <p>A first job emits {@code NUM_ELEMENTS} records per topic (per source subtask) for
 * {@code NUM_TOPICS} topics and writes them to Kafka. A second job consumes all topics and counts
 * the records seen per topic. The second job signals success (via {@code SuccessException}) only
 * once every topic has been observed with exactly {@code NUM_ELEMENTS} records, and fails if any
 * topic delivers more than that.
 *
 * @throws java.lang.Exception if topic creation/deletion or one of the two jobs fails
 */
public void runProduceConsumeMultipleTopics() throws java.lang.Exception {
    final int NUM_TOPICS = 5;
    final int NUM_ELEMENTS = 20;
    StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env.getConfig().disableSysoutLogging();

    // create topics with content
    final List<String> topics = new ArrayList<>();
    for (int i = 0; i < NUM_TOPICS; i++) {
        final String topic = "topic-" + i;
        topics.add(topic);
        // NOTE(review): arguments are presumably (name, partitions, replication factor),
        // i.e. topic i gets i + 1 partitions -- confirm against createTestTopic.
        createTestTopic(topic, i + 1, 1);
    }

    // run first job, producing into all topics
    DataStream<Tuple3<Integer, Integer, String>> stream = env.addSource(new RichParallelSourceFunction<Tuple3<Integer, Integer, String>>() {
        @Override
        public void run(SourceContext<Tuple3<Integer, Integer, String>> ctx) throws Exception {
            int partition = getRuntimeContext().getIndexOfThisSubtask();
            for (int topicId = 0; topicId < NUM_TOPICS; topicId++) {
                for (int i = 0; i < NUM_ELEMENTS; i++) {
                    // f2 carries the name of the topic the record belongs to
                    ctx.collect(new Tuple3<>(partition, i, "topic-" + topicId));
                }
            }
        }

        @Override
        public void cancel() {
            // nothing to do: run() is a finite loop
        }
    });
    Tuple2WithTopicSchema schema = new Tuple2WithTopicSchema(env.getConfig());
    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    kafkaServer.produceIntoKafka(stream, "dummy", schema, props, null);
    env.execute("Write to topics");

    // run second job consuming from multiple topics
    env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env.getConfig().disableSysoutLogging();
    stream = env.addSource(kafkaServer.getConsumer(topics, schema, props));
    stream.flatMap(new FlatMapFunction<Tuple3<Integer, Integer, String>, Integer>() {
        private final Map<String, Integer> countPerTopic = new HashMap<>(NUM_TOPICS);

        @Override
        public void flatMap(Tuple3<Integer, Integer, String> value, Collector<Integer> out) throws Exception {
            Integer count = countPerTopic.get(value.f2);
            if (count == null) {
                count = 1;
            } else {
                count++;
            }
            countPerTopic.put(value.f2, count);

            // check map: success requires every topic to be present with exactly NUM_ELEMENTS.
            // The expected count assumes the producing source ran with a single subtask.
            boolean allTopicsComplete = countPerTopic.size() == NUM_TOPICS;
            for (Map.Entry<String, Integer> el : countPerTopic.entrySet()) {
                if (el.getValue() > NUM_ELEMENTS) {
                    throw new RuntimeException("There is a failure in the test. I've read " + el.getValue() + " from topic " + el.getKey());
                }
                if (el.getValue() < NUM_ELEMENTS) {
                    // not enough yet; keep scanning so over-reads on other topics are still detected
                    allTopicsComplete = false;
                }
            }
            if (allTopicsComplete) {
                // we've seen all messages from all topics
                throw new SuccessException();
            }
        }
    }).setParallelism(1);
    tryExecute(env, "Count elements from the topics");

    // delete all topics again
    for (int i = 0; i < NUM_TOPICS; i++) {
        final String topic = "topic-" + i;
        deleteTestTopic(topic);
    }
}
use of org.apache.flink.api.common.functions.FlatMapFunction in project flink by apache.
the class KafkaConsumerTestBase method runProduceConsumeMultipleTopics.
/**
 * Tests producing into and consuming from multiple topics.
 *
 * <p>A first job (parallelism 1) emits {@code numElements} records for each of {@code numTopics}
 * topics and writes them to Kafka. A second job consumes all topics and counts the records seen
 * per topic. The second job signals success (via {@code SuccessException}) only once every topic
 * has been observed with exactly {@code numElements} records, and fails if any topic delivers
 * more than that.
 *
 * @param useLegacySchema if {@code true}, {@code Tuple2WithTopicSchema} is used for writing and
 *     reading; otherwise {@code TestDeserializer} is used. Also selects the topic name prefix so
 *     both variants operate on distinct topics.
 * @throws Exception if topic creation/deletion or one of the two jobs fails
 */
public void runProduceConsumeMultipleTopics(boolean useLegacySchema) throws Exception {
    final String topicNamePrefix = "runProduceConsumeMultipleTopics-" + (useLegacySchema ? "legacy" : "");
    final int numTopics = 5;
    final int numElements = 20;
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    // create topics with content
    final List<String> topics = new ArrayList<>();
    for (int i = 0; i < numTopics; i++) {
        final String topic = topicNamePrefix + i;
        topics.add(topic);
        // NOTE(review): arguments are presumably (name, partitions, replication factor),
        // i.e. topic i gets i + 1 partitions -- confirm against createTestTopic.
        createTestTopic(topic, i + 1, 1);
    }

    // before FLINK-6078 the RemoteExecutionEnvironment set the parallelism to 1 as well
    env.setParallelism(1);

    // run first job, producing into all topics
    DataStream<Tuple3<Integer, Integer, String>> stream = env.addSource(new RichParallelSourceFunction<Tuple3<Integer, Integer, String>>() {
        @Override
        public void run(SourceContext<Tuple3<Integer, Integer, String>> ctx) throws Exception {
            int partition = getRuntimeContext().getIndexOfThisSubtask();
            for (int topicId = 0; topicId < numTopics; topicId++) {
                for (int i = 0; i < numElements; i++) {
                    // f2 carries the name of the topic the record belongs to
                    ctx.collect(new Tuple3<>(partition, i, topicNamePrefix + topicId));
                }
            }
        }

        @Override
        public void cancel() {
            // nothing to do: run() is a finite loop
        }
    });
    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    if (useLegacySchema) {
        Tuple2WithTopicSchema schema = new Tuple2WithTopicSchema(env.getConfig());
        kafkaServer.produceIntoKafka(stream, "dummy", schema, props, null);
    } else {
        TestDeserializer schema = new TestDeserializer(env.getConfig());
        kafkaServer.produceIntoKafka(stream, "dummy", schema, props);
    }
    env.execute("Write to topics");

    // run second job consuming from multiple topics
    env = StreamExecutionEnvironment.getExecutionEnvironment();
    if (useLegacySchema) {
        Tuple2WithTopicSchema schema = new Tuple2WithTopicSchema(env.getConfig());
        stream = getStream(env, topics, schema, props);
    } else {
        TestDeserializer schema = new TestDeserializer(env.getConfig());
        stream = getStream(env, topics, schema, props);
    }
    stream.flatMap(new FlatMapFunction<Tuple3<Integer, Integer, String>, Integer>() {
        private final Map<String, Integer> countPerTopic = new HashMap<>(numTopics);

        @Override
        public void flatMap(Tuple3<Integer, Integer, String> value, Collector<Integer> out) throws Exception {
            Integer count = countPerTopic.get(value.f2);
            if (count == null) {
                count = 1;
            } else {
                count++;
            }
            countPerTopic.put(value.f2, count);

            // check map: success requires every topic to be present with exactly numElements
            boolean allTopicsComplete = countPerTopic.size() == numTopics;
            for (Map.Entry<String, Integer> el : countPerTopic.entrySet()) {
                if (el.getValue() > numElements) {
                    throw new RuntimeException("There is a failure in the test. I've read " + el.getValue() + " from topic " + el.getKey());
                }
                if (el.getValue() < numElements) {
                    // not enough yet; keep scanning so over-reads on other topics are still detected
                    allTopicsComplete = false;
                }
            }
            if (allTopicsComplete) {
                // we've seen all messages from all topics
                throw new SuccessException();
            }
        }
    }).setParallelism(1);
    tryExecute(env, "Count elements from the topics");

    // delete all topics again
    for (int i = 0; i < numTopics; i++) {
        final String topic = topicNamePrefix + i;
        deleteTestTopic(topic);
    }
}
use of org.apache.flink.api.common.functions.FlatMapFunction in project flink by apache.
the class DataSetAllroundTestProgram method main.
/**
 * Entry point of the batch program: builds a DataSet pipeline consisting of a join, a co-group,
 * a second join with replication, a bulk iteration and a final grouped aggregation, then writes
 * the sorted result as text.
 *
 * <p>Recognized arguments (parsed with {@code ParameterTool}):
 * <ul>
 *   <li>{@code loadFactor} (required): integer scaling the number of generated keys
 *       ({@code loadFactor * 128 * 1024}).</li>
 *   <li>{@code outputPath} (required): path the result is written to; existing output is
 *       overwritten.</li>
 *   <li>{@code infinite} (optional, default {@code false}): if set, the {@code x1Keys} input is
 *       created with {@code Generator.generateInfinitely}.</li>
 * </ul>
 *
 * @param args command line arguments as described above
 * @throws Exception if parameter parsing or job execution fails
 */
@SuppressWarnings("Convert2Lambda")
public static void main(String[] args) throws Exception {
// get parameters
ParameterTool params = ParameterTool.fromArgs(args);
int loadFactor = Integer.parseInt(params.getRequired("loadFactor"));
String outputPath = params.getRequired("outputPath");
boolean infinite = params.getBoolean("infinite", false);
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
int numKeys = loadFactor * 128 * 1024;
// three generated inputs, each read with parallelism 4
// NOTE(review): the second argument of Generator.generate presumably controls how often each
// key is emitted (1x / 2x / 8x, matching the variable names) -- confirm against Generator.
DataSet<Tuple2<String, Integer>> x1Keys;
DataSet<Tuple2<String, Integer>> x2Keys = env.createInput(Generator.generate(numKeys * 32, 2)).setParallelism(4);
DataSet<Tuple2<String, Integer>> x8Keys = env.createInput(Generator.generate(numKeys, 8)).setParallelism(4);
if (infinite) {
x1Keys = env.createInput(Generator.generateInfinitely(numKeys)).setParallelism(4);
} else {
x1Keys = env.createInput(Generator.generate(numKeys, 1)).setParallelism(4);
}
// widen x2Keys records to 4-tuples (key in field 3), join them with x8Keys on the key,
// emit (key, 1) per matching pair, then group by key and sum the ones.
// result: (key, number of matching pairs for that key)
// NOTE(review): the anonymous KeySelector (instead of a lambda) looks deliberate given the
// "Convert2Lambda" suppression on this method -- confirm before simplifying.
DataSet<Tuple2<String, Integer>> joined = x2Keys.map(x -> Tuple4.of("0-0", 0L, 1, x.f0)).returns(Types.TUPLE(Types.STRING, Types.LONG, Types.INT, Types.STRING)).join(x8Keys).where(3).equalTo(0).with((l, r) -> Tuple2.of(l.f3, 1)).returns(Types.TUPLE(Types.STRING, Types.INT)).groupBy(new KeySelector<Tuple2<String, Integer>, String>() {
@Override
public String getKey(Tuple2<String, Integer> value) {
return value.f0;
}
}).reduce((value1, value2) -> Tuple2.of(value1.f0, value1.f1 + value2.f1));
// co-group two datasets on their primary keys.
// we filter both inputs such that only 6.25% of the keys overlap.
// result: (key, cnt), #keys records with unique keys, cnt = (6.25%: 2, 93.75%: 1)
DataSet<Tuple2<String, Integer>> coGrouped = x1Keys.filter(x -> x.f1 > 59).coGroup(x1Keys.filter(x -> x.f1 < 68)).where("f0").equalTo("f0").with((CoGroupFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, Tuple2<String, Integer>>) (l, r, out) -> {
// cnt is the total number of records on both sides of the group; key is taken from
// whichever record was seen last
int cnt = 0;
String key = "";
for (Tuple2<String, Integer> t : l) {
cnt++;
key = t.f0;
}
for (Tuple2<String, Integer> t : r) {
cnt++;
key = t.f0;
}
out.collect(Tuple2.of(key, cnt));
}).returns(Types.TUPLE(Types.STRING, Types.INT));
// join datasets on keys (1-1 join) and replicate by 16 (previously computed count)
// result: (key, cnt), 16 * #keys records, all keys preserved, cnt = (6.25%: 2, 93.75%: 1)
DataSet<Tuple2<String, Integer>> joined2 = joined.join(coGrouped, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE).where(0).equalTo("f0").flatMap((FlatMapFunction<Tuple2<Tuple2<String, Integer>, Tuple2<String, Integer>>, Tuple2<String, Integer>>) (p, out) -> {
// emit (key, co-group count) once per unit of the count computed by the first join
for (int i = 0; i < p.f0.f1; i++) {
out.collect(Tuple2.of(p.f0.f0, p.f1.f1));
}
}).returns(Types.TUPLE(Types.STRING, Types.INT));
// iteration. double the count field until all counts are at 32 or more
// result: (key, cnt), 16 * #keys records, all keys preserved, cnt = (6.25%: 64, 93.75%: 32)
// the iteration is bounded by at most 16 supersteps
IterativeDataSet<Tuple2<String, Integer>> initial = joined2.iterate(16);
DataSet<Tuple2<String, Integer>> iteration = initial.map(x -> Tuple2.of(x.f0, x.f1 * 2)).returns(Types.TUPLE(Types.STRING, Types.INT));
// termination data set: contains one record for every count that is still below 32
DataSet<Boolean> termination = iteration.flatMap((FlatMapFunction<Tuple2<String, Integer>, Boolean>) (x, out) -> {
if (x.f1 < 32) {
out.collect(false);
}
}).returns(Types.BOOLEAN);
// group the iteration result by the count field and emit (count value, number of records with
// that count); the record count is then divided by (loadFactor * 128)
DataSet<Tuple2<Integer, Integer>> result = initial.closeWith(iteration, termination).groupBy(1).reduceGroup((GroupReduceFunction<Tuple2<String, Integer>, Tuple2<Integer, Integer>>) (g, out) -> {
int key = 0;
int cnt = 0;
for (Tuple2<String, Integer> r : g) {
key = r.f1;
cnt++;
}
out.collect(Tuple2.of(key, cnt));
}).returns(Types.TUPLE(Types.INT, Types.INT)).map(x -> Tuple2.of(x.f0, x.f1 / (loadFactor * 128))).returns(Types.TUPLE(Types.INT, Types.INT));
// sort and emit result
// sorting and writing both run with parallelism 1
result.sortPartition(0, Order.ASCENDING).setParallelism(1).writeAsText(outputPath, FileSystem.WriteMode.OVERWRITE).setParallelism(1);
env.execute();
}
use of org.apache.flink.api.common.functions.FlatMapFunction in project flink by apache.
the class CommonExecLookupJoin method createAsyncLookupJoin.
/**
 * Builds the operator factory that performs the lookup join asynchronously.
 *
 * <p>The method generates the async lookup function and the result-future (collector) code,
 * wraps them into either an {@code AsyncLookupJoinWithCalcRunner} (when a calc exists on the
 * temporal table) or an {@code AsyncLookupJoinRunner}, and returns an
 * {@code AsyncWaitOperatorFactory} configured with the async buffer capacity and timeout read
 * from {@code config}.
 *
 * @param temporalTable the looked-up table; its qualified name is passed to the code generator
 * @param config configuration providing the async buffer capacity, timeout and table config
 * @param allLookupKeys lookup keys by field index
 * @param asyncLookupFunction the user-facing async table function to invoke
 * @param relBuilder used to obtain the {@code DataTypeFactory}
 * @param inputRowType row type of the join input
 * @param tableSourceRowType row type produced by the table source
 * @param resultRowType row type of the join result
 * @param isLeftOuterJoin whether the join is a left outer join
 * @return the factory of the async wait operator running the lookup join
 */
@SuppressWarnings("unchecked")
private StreamOperatorFactory<RowData> createAsyncLookupJoin(RelOptTable temporalTable, ExecNodeConfig config, Map<Integer, LookupJoinUtil.LookupKey> allLookupKeys, AsyncTableFunction<Object> asyncLookupFunction, RelBuilder relBuilder, RowType inputRowType, RowType tableSourceRowType, RowType resultRowType, boolean isLeftOuterJoin) {
    final int bufferCapacity = config.get(ExecutionConfigOptions.TABLE_EXEC_ASYNC_LOOKUP_BUFFER_CAPACITY);
    final long timeoutMillis = config.get(ExecutionConfigOptions.TABLE_EXEC_ASYNC_LOOKUP_TIMEOUT).toMillis();

    final DataTypeFactory typeFactory = ShortcutUtils.unwrapContext(relBuilder).getCatalogManager().getDataTypeFactory();
    final String qualifiedTableName = StringUtils.join(temporalTable.getQualifiedName(), ".");

    // generated wrapper around the user's async lookup function, plus its produced data type
    final LookupJoinCodeGenerator.GeneratedTableFunctionWithDataType<AsyncFunction<RowData, Object>> lookupFunc =
            LookupJoinCodeGenerator.generateAsyncLookupFunction(
                    config.getTableConfig(),
                    typeFactory,
                    inputRowType,
                    tableSourceRowType,
                    resultRowType,
                    allLookupKeys,
                    LookupJoinUtil.getOrderedLookupKeys(allLookupKeys.keySet()),
                    asyncLookupFunction,
                    qualifiedTableName);

    // right side row type: the temporal table output type if one is set, else the source type
    final RowType temporalRowType =
            Optional.ofNullable(temporalTableOutputType)
                    .map(FlinkTypeFactory::toLogicalRowType)
                    .orElse(tableSourceRowType);

    // generated result future, receiving the optional join condition
    final GeneratedResultFuture<TableFunctionResultFuture<RowData>> resultFutureCode =
            LookupJoinCodeGenerator.generateTableAsyncCollector(
                    config.getTableConfig(),
                    "TableFunctionResultFuture",
                    inputRowType,
                    temporalRowType,
                    JavaScalaConversionUtil.toScala(Optional.ofNullable(joinCondition)));

    // the converter is obtained with wildcard types and narrowed by an unchecked cast
    final DataStructureConverter<?, ?> rawConverter = DataStructureConverters.getConverter(lookupFunc.dataType());
    final DataStructureConverter<RowData, Object> fetcherConverter = (DataStructureConverter<RowData, Object>) rawConverter;

    final AsyncFunction<RowData, RowData> runner;
    if (!existCalcOnTemporalTable) {
        // right type is the same as table source row type, because no calc after temporal table
        runner = new AsyncLookupJoinRunner(
                lookupFunc.tableFunc(),
                fetcherConverter,
                resultFutureCode,
                InternalSerializers.create(temporalRowType),
                isLeftOuterJoin,
                bufferCapacity);
    } else {
        // a projection or filter after table source scan
        final GeneratedFunction<FlatMapFunction<RowData, RowData>> calcCode =
                LookupJoinCodeGenerator.generateCalcMapFunction(
                        config.getTableConfig(),
                        JavaScalaConversionUtil.toScala(projectionOnTemporalTable),
                        filterOnTemporalTable,
                        temporalTableOutputType,
                        tableSourceRowType);
        runner = new AsyncLookupJoinWithCalcRunner(
                lookupFunc.tableFunc(),
                fetcherConverter,
                calcCode,
                resultFutureCode,
                InternalSerializers.create(temporalRowType),
                isLeftOuterJoin,
                bufferCapacity);
    }

    // The output mode is fixed to ORDERED.
    // NOTE(review): the original trailing comment ("when the downstream do not need orderness")
    // hints that UNORDERED could be used in that case -- confirm intent before changing.
    return new AsyncWaitOperatorFactory<>(runner, timeoutMillis, bufferCapacity, AsyncDataStream.OutputMode.ORDERED);
}
use of org.apache.flink.api.common.functions.FlatMapFunction in project flink by apache.
the class SolutionSetDuplicatesITCase method testProgram.
/**
 * Runs a delta iteration whose initial solution set contains every key three times and checks
 * that the collected result contains each of the keys 0..10 exactly once.
 *
 * <p>Any exception raised while building or executing the program is rethrown as an
 * {@link AssertionError} that keeps the original exception as its cause, so the full stack trace
 * appears in the test report.
 */
@Test
public void testProgram() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // every value of the sequence 0..10 is emitted three times as (value, value)
        DataSet<Tuple2<Long, Long>> data = env.generateSequence(0, 10).flatMap(new FlatMapFunction<Long, Tuple2<Long, Long>>() {
            @Override
            public void flatMap(Long value, Collector<Tuple2<Long, Long>> out) {
                out.collect(new Tuple2<Long, Long>(value, value));
                out.collect(new Tuple2<Long, Long>(value, value));
                out.collect(new Tuple2<Long, Long>(value, value));
            }
        }).rebalance();

        // the same data set is used as initial solution set and workset; key field is 0
        DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iter = data.iterateDelta(data, 10, 0);
        List<Integer> result = iter.closeWith(iter.getWorkset(), iter.getWorkset()).map(new MapFunction<Tuple2<Long, Long>, Integer>() {
            @Override
            public Integer map(Tuple2<Long, Long> value) {
                return value.f0.intValue();
            }
        }).collect();

        assertEquals(11, result.size());
        Collections.sort(result);
        assertEquals(Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), result);
    } catch (Exception e) {
        // keep the cause attached instead of printing it and failing with a bare (possibly null) message
        throw new AssertionError("Test program failed: " + e.getMessage(), e);
    }
}
Aggregations