Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.
The class StreamOperatorChainingTest, method testMultiChainingWithSplit.
/**
* Verify that multi-chaining works with object reuse enabled.
*/
private void testMultiChainingWithSplit(StreamExecutionEnvironment env) throws Exception {
    // set parallelism to 2 to avoid chaining with the source when only one processor is available
    env.setParallelism(2);
    // the actual elements will not be used
    DataStream<Integer> input = env.fromElements(1, 2, 3);
    sink1Results = new ArrayList<>();
    sink2Results = new ArrayList<>();
    sink3Results = new ArrayList<>();
    input = input.map(value -> value);
    OutputTag<Integer> oneOutput = new OutputTag<Integer>("one") {};
    OutputTag<Integer> otherOutput = new OutputTag<Integer>("other") {};
    SingleOutputStreamOperator<Object> split = input.process(new ProcessFunction<Integer, Object>() {
        private static final long serialVersionUID = 1L;

        @Override
        public void processElement(Integer value, Context ctx, Collector<Object> out) throws Exception {
            if (value.equals(1)) {
                ctx.output(oneOutput, value);
            } else {
                ctx.output(otherOutput, value);
            }
        }
    });
    split.getSideOutput(oneOutput)
            .map(value -> "First 1: " + value)
            .addSink(new SinkFunction<String>() {
                @Override
                public void invoke(String value, Context ctx) throws Exception {
                    sink1Results.add(value);
                }
            });
    split.getSideOutput(oneOutput)
            .map(value -> "First 2: " + value)
            .addSink(new SinkFunction<String>() {
                @Override
                public void invoke(String value, Context ctx) throws Exception {
                    sink2Results.add(value);
                }
            });
    split.getSideOutput(otherOutput)
            .map(value -> "Second: " + value)
            .addSink(new SinkFunction<String>() {
                @Override
                public void invoke(String value, Context ctx) throws Exception {
                    sink3Results.add(value);
                }
            });
    // now we build our own StreamTask and OperatorChain
    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    Assert.assertTrue(jobGraph.getVerticesSortedTopologicallyFromSources().size() == 2);
    JobVertex chainedVertex = jobGraph.getVerticesSortedTopologicallyFromSources().get(1);
    Configuration configuration = chainedVertex.getConfiguration();
    StreamConfig streamConfig = new StreamConfig(configuration);
    StreamMap<Integer, Integer> headOperator =
            streamConfig.getStreamOperator(Thread.currentThread().getContextClassLoader());
    try (MockEnvironment environment = createMockEnvironment(chainedVertex.getName())) {
        StreamTask<Integer, StreamMap<Integer, Integer>> mockTask =
                createMockTask(streamConfig, environment);
        OperatorChain<Integer, StreamMap<Integer, Integer>> operatorChain =
                createOperatorChain(streamConfig, environment, mockTask);
        headOperator.setup(mockTask, streamConfig, operatorChain.getMainOperatorOutput());
        operatorChain.initializeStateAndOpenOperators(null);
        headOperator.processElement(new StreamRecord<>(1));
        headOperator.processElement(new StreamRecord<>(2));
        headOperator.processElement(new StreamRecord<>(3));
        assertThat(sink1Results, contains("First 1: 1"));
        assertThat(sink2Results, contains("First 2: 1"));
        assertThat(sink3Results, contains("Second: 2", "Second: 3"));
    }
}
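The side-output pattern used above (OutputTag plus ProcessFunction, the replacement for the removed split()/select() API) also works outside of the test harness. The following is a minimal sketch; the tag name, element values, and job name are illustrative, and it assumes execution from a static main:
// Minimal sketch (illustrative): route elements to a side output with OutputTag.
OutputTag<Integer> oddTag = new OutputTag<Integer>("odd") {};
StreamExecutionEnvironment sketchEnv = StreamExecutionEnvironment.getExecutionEnvironment();
SingleOutputStreamOperator<Integer> evens =
        sketchEnv.fromElements(1, 2, 3, 4)
                .process(new ProcessFunction<Integer, Integer>() {
                    @Override
                    public void processElement(Integer value, Context ctx, Collector<Integer> out) {
                        if (value % 2 == 0) {
                            // main output
                            out.collect(value);
                        } else {
                            // side output
                            ctx.output(oddTag, value);
                        }
                    }
                });
// prints 2, 4
evens.print();
// prints 1, 3
evens.getSideOutput(oddTag).print();
sketchEnv.execute("side-output-sketch");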
Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.
The class DataStreamJavaITCase, method getComplexUnifiedPipeline.
// --------------------------------------------------------------------------------------------
// Helper methods
// --------------------------------------------------------------------------------------------
private Table getComplexUnifiedPipeline(StreamExecutionEnvironment env) {
    final DataStream<String> allowedNamesStream = env.fromElements("Bob", "Alice");
    final StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
    tableEnv.createTemporaryView(
            "AllowedNamesTable", tableEnv.fromDataStream(allowedNamesStream).as("allowedName"));
    final Table nameCountTable = tableEnv.sqlQuery(
            "SELECT name, COUNT(*) AS c "
                    + "FROM (VALUES ('Bob'), ('Alice'), ('Greg'), ('Bob')) AS NameTable(name) "
                    + "WHERE name IN (SELECT allowedName FROM AllowedNamesTable) "
                    + "GROUP BY name");
    final DataStream<Row> nameCountStream = tableEnv.toChangelogStream(nameCountTable);
    final DataStream<Tuple2<String, Long>> updatesPerNameStream = nameCountStream
            .keyBy(r -> r.<String>getFieldAs("name"))
            .process(new KeyedProcessFunction<String, Row, Tuple2<String, Long>>() {
                ValueState<Long> count;

                @Override
                public void open(Configuration parameters) {
                    count = getRuntimeContext()
                            .getState(new ValueStateDescriptor<>("count", Long.class));
                }

                @Override
                public void processElement(Row r, Context ctx, Collector<Tuple2<String, Long>> out)
                        throws IOException {
                    Long currentCount = count.value();
                    if (currentCount == null) {
                        currentCount = 0L;
                    }
                    final long updatedCount = currentCount + 1;
                    count.update(updatedCount);
                    out.collect(Tuple2.of(ctx.getCurrentKey(), updatedCount));
                }
            });
    tableEnv.createTemporaryView("UpdatesPerName", updatesPerNameStream);
    return tableEnv.sqlQuery("SELECT DISTINCT f0, f1 FROM UpdatesPerName");
}
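The helper only builds the mixed Table/DataStream pipeline and returns a Table. One minimal way a caller might materialize it, assuming the caller simply wants to print the bounded result (the calling code is not part of this excerpt):
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
Table result = getComplexUnifiedPipeline(env);
// execute() submits the unified pipeline; print() blocks until the bounded job finishes
result.execute().print();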
Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.
The class DataStreamJavaITCase, method testFromAndToChangelogStreamEventTime.
@Test
public void testFromAndToChangelogStreamEventTime() throws Exception {
    final StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
    final DataStream<Tuple3<Long, Integer, String>> dataStream = getWatermarkedDataStream();
    final DataStream<Row> changelogStream =
            dataStream.map(t -> Row.ofKind(RowKind.INSERT, t.f1, t.f2))
                    .returns(Types.ROW(Types.INT, Types.STRING));
    // derive physical columns and add a rowtime
    final Table table = tableEnv.fromChangelogStream(
            changelogStream,
            Schema.newBuilder()
                    .columnByMetadata("rowtime", TIMESTAMP_LTZ(3))
                    .columnByExpression("computed", $("f1").upperCase())
                    .watermark("rowtime", sourceWatermark())
                    .build());
    tableEnv.createTemporaryView("t", table);
    // access and reorder columns
    final Table reordered = tableEnv.sqlQuery("SELECT computed, rowtime, f0 FROM t");
    // write out the rowtime column with a fully declared schema
    final DataStream<Row> result = tableEnv.toChangelogStream(
            reordered,
            Schema.newBuilder()
                    .column("f1", STRING())
                    .columnByMetadata("rowtime", TIMESTAMP_LTZ(3))
                    .columnByExpression("ignored", $("f1").upperCase())
                    .column("f0", INT())
                    .build());
    // test event time window and field access
    testResult(
            result.keyBy(k -> k.getField("f1"))
                    .window(TumblingEventTimeWindows.of(Time.milliseconds(5)))
                    .<Row>apply((key, window, input, out) -> {
                        int sum = 0;
                        for (Row row : input) {
                            sum += row.<Integer>getFieldAs("f0");
                        }
                        out.collect(Row.of(key, sum));
                    })
                    .returns(Types.ROW(Types.STRING, Types.INT)),
            Row.of("A", 47), Row.of("C", 1000), Row.of("C", 1000));
}
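The getWatermarkedDataStream() helper is not part of this excerpt. A hypothetical stand-in, with illustrative sample data that would fall into the 5 ms tumbling windows checked above, could assign event-time timestamps from the first tuple field like this (this is a sketch, not the original helper):
private DataStream<Tuple3<Long, Integer, String>> getWatermarkedDataStream() {
    // hypothetical stand-in: timestamps are taken from f0, watermarks advance monotonically
    return env.fromElements(
                    Tuple3.of(1L, 42, "A"),
                    Tuple3.of(2L, 5, "A"),
                    Tuple3.of(5L, 1000, "C"),
                    Tuple3.of(10L, 1000, "C"))
            .assignTimestampsAndWatermarks(
                    WatermarkStrategy.<Tuple3<Long, Integer, String>>forMonotonousTimestamps()
                            .withTimestampAssigner((t, ts) -> t.f0));
}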
Use of org.apache.flink.streaming.api.datastream.DataStream in project beam by apache.
The class FlinkStreamingPortablePipelineTranslator, method translateFlatten.
private <T> void translateFlatten(
        String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
    RunnerApi.PTransform transform = pipeline.getComponents().getTransformsOrThrow(id);
    Map<String, String> allInputs = transform.getInputsMap();
    if (allInputs.isEmpty()) {
        // Create an empty dummy source to satisfy downstream operations. We cannot create an
        // empty source in Flink, so we add a flatMap that never forwards its single element.
        long shutdownAfterIdleSourcesMs =
                context.getPipelineOptions().getShutdownSourcesAfterIdleMs();
        DataStreamSource<WindowedValue<byte[]>> dummySource =
                context.getExecutionEnvironment()
                        .addSource(new ImpulseSourceFunction(shutdownAfterIdleSourcesMs));
        DataStream<WindowedValue<T>> result =
                dummySource
                        .<WindowedValue<T>>flatMap((s, collector) -> {
                            // never return anything
                        })
                        .returns(new CoderTypeInformation<>(
                                WindowedValue.getFullCoder(
                                        (Coder<T>) VoidCoder.of(), GlobalWindow.Coder.INSTANCE),
                                context.getPipelineOptions()));
        context.addDataStream(
                Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
    } else {
        DataStream<T> result = null;
        // Determine which DataStreams are used as input several times. Those inputs need to be
        // made distinct, because Flink seems to swallow watermarks when a stream is unioned
        // with itself.
        HashMultiset<DataStream<T>> inputCounts = HashMultiset.create();
        for (String input : allInputs.values()) {
            DataStream<T> current = context.getDataStreamOrThrow(input);
            inputCounts.add(current, 1);
        }
        for (String input : allInputs.values()) {
            DataStream<T> current = context.getDataStreamOrThrow(input);
            final int timesRequired = inputCounts.count(current);
            if (timesRequired > 1) {
                current = current.flatMap(new FlatMapFunction<T, T>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public void flatMap(T t, Collector<T> collector) {
                        collector.collect(t);
                    }
                });
            }
            result = (result == null) ? current : result.union(current);
        }
        context.addDataStream(
                Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
    }
}
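The identity flatMap above exists only to make a duplicated input distinct before the union. A minimal sketch of the same idea in plain Flink (stream name and contents are illustrative):
DataStream<String> words = env.fromElements("a", "b");
// routing one copy through an identity flatMap yields a distinct stream, so the union below is
// not a union of a stream with itself (which, as noted above, can swallow watermarks)
DataStream<String> distinctCopy =
        words.flatMap((String w, Collector<String> out) -> out.collect(w)).returns(Types.STRING);
DataStream<String> doubled = words.union(distinctCopy);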
Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.
The class KafkaTableSinkTestBase, method testKafkaTableSink.
@Test
@SuppressWarnings("unchecked")
public void testKafkaTableSink() throws Exception {
    DataStream dataStream = mock(DataStream.class);
    KafkaTableSink kafkaTableSink = spy(createTableSink());
    kafkaTableSink.emitDataStream(dataStream);
    verify(dataStream).addSink(eq(PRODUCER));
    verify(kafkaTableSink)
            .createKafkaProducer(
                    eq(TOPIC), eq(PROPERTIES), any(getSerializationSchema().getClass()), eq(PARTITIONER));
}
Aggregations