Example 51 with DataStream

Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.

The class WordCount, method main.

// *************************************************************************
//     PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    // set up the execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // get input data
    DataStream<String> text = getTextDataStream(env);
    DataStream<Tuple2<String, Integer>> counts =
            // normalize and split each line
            text.map(line -> line.toLowerCase().split("\\W+"))
                    .flatMap((String[] tokens, Collector<Tuple2<String, Integer>> out) -> {
                        // emit the pairs with non-zero-length words
                        Arrays.stream(tokens)
                                .filter(t -> t.length() > 0)
                                .forEach(t -> out.collect(new Tuple2<>(t, 1)));
                    })
                    .keyBy(0)
                    .sum(1);
    // emit result
    if (fileOutput) {
        counts.writeAsCsv(outputPath);
    } else {
        counts.print();
    }
    // execute program
    env.execute("Streaming WordCount Example");
}
Also used : DataStream(org.apache.flink.streaming.api.datastream.DataStream) Arrays(java.util.Arrays) WordCountData(org.apache.flink.examples.java.wordcount.util.WordCountData) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Collector(org.apache.flink.util.Collector) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
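The snippet above relies on implicit type extraction for the lambdas and on the positional keyBy(0)/sum(1) calls, both of which were deprecated in later Flink releases. As a rough sketch only, reusing the text stream from the example and assuming a Flink 1.12+ API plus org.apache.flink.api.common.typeinfo.Types, the same pipeline can be written with an explicit result type and a key selector:

// Hedged sketch, not part of the original example: the same word count with an explicit
// output type and a key selector instead of positional keys (assumes Flink 1.12+).
DataStream<Tuple2<String, Integer>> counts =
        text.flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
                    for (String token : line.toLowerCase().split("\\W+")) {
                        if (token.length() > 0) {
                            out.collect(Tuple2.of(token, 1));
                        }
                    }
                })
                // type erasure hides the lambda's output type, so declare it explicitly
                .returns(Types.TUPLE(Types.STRING, Types.INT))
                // key by the word itself rather than the deprecated tuple position 0
                .keyBy(tuple -> tuple.f0)
                .sum(1);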

Example 52 with DataStream

Use of org.apache.flink.streaming.api.datastream.DataStream in project camel by apache.

The class DataStreamFlinkProducer, method collectResults.

protected void collectResults(Exchange exchange, Object result) {
    if (result instanceof DataStream) {
        DataStream dsResults = (DataStream) result;
        if (getEndpoint().isCollect()) {
            throw new IllegalArgumentException("collect mode not supported for Flink DataStreams.");
        } else {
            exchange.getIn().setBody(result);
            exchange.getIn().setHeader(FlinkConstants.FLINK_DATASTREAM_HEADER, result);
        }
    } else {
        exchange.getIn().setBody(result);
    }
}
Also used : DataStream(org.apache.flink.streaming.api.datastream.DataStream)
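A consumer further down the Camel route can pick the DataStream back up from either the header set by collectResults or the message body. The following Processor is a minimal, hypothetical sketch; only FlinkConstants.FLINK_DATASTREAM_HEADER comes from the example, while the processor class, the route wiring, and the assumed org.apache.camel.component.flink package for FlinkConstants are illustrative assumptions.

// Hypothetical downstream processor, shown only to illustrate how the header populated by
// collectResults might be consumed; it is not part of the camel-flink component itself.
import org.apache.camel.Exchange;
import org.apache.camel.Processor;
import org.apache.camel.component.flink.FlinkConstants; // assumed package for FlinkConstants
import org.apache.flink.streaming.api.datastream.DataStream;

public class DataStreamReader implements Processor {
    @Override
    public void process(Exchange exchange) throws Exception {
        DataStream<?> stream =
                exchange.getIn().getHeader(FlinkConstants.FLINK_DATASTREAM_HEADER, DataStream.class);
        if (stream == null) {
            // collectResults also places the stream in the body, so fall back to it
            stream = exchange.getIn().getBody(DataStream.class);
        }
        // ... attach further Flink operators or a sink to the stream here ...
    }
}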

Example 53 with DataStream

Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.

The class SavepointWriterWindowITCase, method testSlideWindow.

@Test
public void testSlideWindow() throws Exception {
    final String savepointPath = getTempDirPath(new AbstractID().toHexString());
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStateBackend(stateBackend);
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
    DataStream<Tuple2<String, Integer>> bootstrapData =
            env.fromCollection(WORDS)
                    .map(word -> Tuple2.of(word, 1), TUPLE_TYPE_INFO)
                    .assignTimestampsAndWatermarks(
                            WatermarkStrategy.<Tuple2<String, Integer>>noWatermarks()
                                    .withTimestampAssigner((record, ts) -> 2L));
    WindowedStateTransformation<Tuple2<String, Integer>, String, TimeWindow> transformation =
            OperatorTransformation.bootstrapWith(bootstrapData)
                    .keyBy(tuple -> tuple.f0, Types.STRING)
                    .window(SlidingEventTimeWindows.of(Time.milliseconds(5), Time.milliseconds(1)));
    SavepointWriter.newSavepoint(stateBackend, 128)
            .withOperator(UID, windowBootstrap.bootstrap(transformation))
            .write(savepointPath);
    env.execute("write state");
    WindowedStream<Tuple2<String, Integer>, String, TimeWindow> stream =
            env.addSource(new MaxWatermarkSource<Tuple2<String, Integer>>())
                    .returns(TUPLE_TYPE_INFO)
                    .keyBy(tuple -> tuple.f0)
                    .window(SlidingEventTimeWindows.of(Time.milliseconds(5), Time.milliseconds(1)));
    DataStream<Tuple2<String, Integer>> windowed = windowStream.window(stream).uid(UID);
    CompletableFuture<Collection<Tuple2<String, Integer>>> future = collector.collect(windowed);
    submitJob(savepointPath, env);
    Collection<Tuple2<String, Integer>> results = future.get().stream().distinct().collect(Collectors.toList());
    Assert.assertThat("Incorrect results from bootstrapped windows", results, STANDARD_MATCHER);
}
Also used : Arrays(java.util.Arrays) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Tuple2(org.apache.flink.api.java.tuple.Tuple2) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) RunWith(org.junit.runner.RunWith) CompletableFuture(java.util.concurrent.CompletableFuture) CountEvictor(org.apache.flink.streaming.api.windowing.evictors.CountEvictor) EmbeddedRocksDBStateBackend(org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) ArrayList(java.util.ArrayList) AggregateFunction(org.apache.flink.api.common.functions.AggregateFunction) StateBackend(org.apache.flink.runtime.state.StateBackend) StreamCollector(org.apache.flink.streaming.util.StreamCollector) WindowedStream(org.apache.flink.streaming.api.datastream.WindowedStream) Collector(org.apache.flink.util.Collector) TimeWindow(org.apache.flink.streaming.api.windowing.windows.TimeWindow) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) ReduceFunction(org.apache.flink.api.common.functions.ReduceFunction) Parameterized(org.junit.runners.Parameterized) AbstractTestBase(org.apache.flink.test.util.AbstractTestBase) Types(org.apache.flink.api.common.typeinfo.Types) Time(org.apache.flink.streaming.api.windowing.time.Time) Iterator(java.util.Iterator) AbstractID(org.apache.flink.util.AbstractID) Collection(java.util.Collection) SingleOutputStreamOperator(org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator) SlidingEventTimeWindows(org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows) Matchers(org.hamcrest.Matchers) WatermarkStrategy(org.apache.flink.api.common.eventtime.WatermarkStrategy) Test(org.junit.Test) MaxWatermarkSource(org.apache.flink.state.api.utils.MaxWatermarkSource) ProcessWindowFunction(org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction) Collectors(java.util.stream.Collectors) DataStream(org.apache.flink.streaming.api.datastream.DataStream) WindowFunction(org.apache.flink.streaming.api.functions.windowing.WindowFunction) List(java.util.List) Rule(org.junit.Rule) TumblingEventTimeWindows(org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows) ClusterClient(org.apache.flink.client.program.ClusterClient) HashMapStateBackend(org.apache.flink.runtime.state.hashmap.HashMapStateBackend) Matcher(org.hamcrest.Matcher) SerializedThrowable(org.apache.flink.util.SerializedThrowable) Optional(java.util.Optional) Assert(org.junit.Assert) RuntimeExecutionMode(org.apache.flink.api.common.RuntimeExecutionMode) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
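submitJob is a helper on the test class and is not shown in this excerpt; its role is to start the verification topology from the savepoint that was just written. A plausible sketch of such a helper, built only from the JobGraph, SavepointRestoreSettings and ClusterClient types that already appear in the import list above (the exact body and the clusterClient field are assumptions, not the real test code):

// Hedged sketch of a submitJob(savepointPath, env) test helper; the actual implementation
// in SavepointWriterWindowITCase may differ. 'clusterClient' is assumed to come from the
// test's MiniCluster resource.
private void submitJob(String savepointPath, StreamExecutionEnvironment env) throws Exception {
    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    // restore from the freshly bootstrapped savepoint; fail if any state cannot be mapped
    jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, false));
    clusterClient.submitJob(jobGraph).get();
}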

Example 54 with DataStream

Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.

The class WritableSavepointWindowITCase, method testSlideWindow.

@Test
public void testSlideWindow() throws Exception {
    final String savepointPath = getTempDirPath(new AbstractID().toHexString());
    ExecutionEnvironment bEnv = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<String, Integer>> bootstrapData =
            bEnv.fromCollection(WORDS).map(word -> Tuple2.of(word, 1)).returns(TUPLE_TYPE_INFO);
    WindowedOperatorTransformation<Tuple2<String, Integer>, String, TimeWindow> transformation =
            OperatorTransformation.bootstrapWith(bootstrapData)
                    .assignTimestamps(record -> 2L)
                    .keyBy(tuple -> tuple.f0, Types.STRING)
                    .window(SlidingEventTimeWindows.of(Time.milliseconds(5), Time.milliseconds(1)));
    Savepoint.create(new MemoryStateBackend(), 128)
            .withOperator(UID, windowBootstrap.bootstrap(transformation))
            .write(savepointPath);
    bEnv.execute("write state");
    StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    WindowedStream<Tuple2<String, Integer>, String, TimeWindow> stream =
            sEnv.addSource(new MaxWatermarkSource<Tuple2<String, Integer>>())
                    .returns(TUPLE_TYPE_INFO)
                    .keyBy(tuple -> tuple.f0)
                    .window(SlidingEventTimeWindows.of(Time.milliseconds(5), Time.milliseconds(1)));
    DataStream<Tuple2<String, Integer>> windowed = windowStream.window(stream).uid(UID);
    CompletableFuture<Collection<Tuple2<String, Integer>>> future = collector.collect(windowed);
    submitJob(savepointPath, sEnv);
    Collection<Tuple2<String, Integer>> results = future.get();
    Assert.assertEquals("Incorrect number of results", 15, results.size());
    Assert.assertThat("Incorrect bootstrap state", new HashSet<>(results), STANDARD_MATCHER);
}
Also used : Arrays(java.util.Arrays) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Tuple2(org.apache.flink.api.java.tuple.Tuple2) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) EmbeddedRocksDBStateBackend(org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) AggregateFunction(org.apache.flink.api.common.functions.AggregateFunction) DataSet(org.apache.flink.api.java.DataSet) StateBackend(org.apache.flink.runtime.state.StateBackend) StreamCollector(org.apache.flink.streaming.util.StreamCollector) WindowedStream(org.apache.flink.streaming.api.datastream.WindowedStream) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Parameterized(org.junit.runners.Parameterized) AbstractTestBase(org.apache.flink.test.util.AbstractTestBase) AbstractID(org.apache.flink.util.AbstractID) Collection(java.util.Collection) MemoryStateBackend(org.apache.flink.runtime.state.memory.MemoryStateBackend) List(java.util.List) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) TumblingEventTimeWindows(org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows) ClusterClient(org.apache.flink.client.program.ClusterClient) SerializedThrowable(org.apache.flink.util.SerializedThrowable) Optional(java.util.Optional) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) RunWith(org.junit.runner.RunWith) CompletableFuture(java.util.concurrent.CompletableFuture) CountEvictor(org.apache.flink.streaming.api.windowing.evictors.CountEvictor) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Collector(org.apache.flink.util.Collector) TimeWindow(org.apache.flink.streaming.api.windowing.windows.TimeWindow) ReduceFunction(org.apache.flink.api.common.functions.ReduceFunction) Types(org.apache.flink.api.common.typeinfo.Types) Time(org.apache.flink.streaming.api.windowing.time.Time) Iterator(java.util.Iterator) SingleOutputStreamOperator(org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator) SlidingEventTimeWindows(org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows) Matchers(org.hamcrest.Matchers) Test(org.junit.Test) MaxWatermarkSource(org.apache.flink.state.api.utils.MaxWatermarkSource) ProcessWindowFunction(org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction) RocksDBStateBackend(org.apache.flink.contrib.streaming.state.RocksDBStateBackend) DataStream(org.apache.flink.streaming.api.datastream.DataStream) WindowFunction(org.apache.flink.streaming.api.functions.windowing.WindowFunction) Rule(org.junit.Rule) HashMapStateBackend(org.apache.flink.runtime.state.hashmap.HashMapStateBackend) Matcher(org.hamcrest.Matcher) Assert(org.junit.Assert) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings)
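The window tests reference the shared fixtures WORDS, UID, TUPLE_TYPE_INFO, windowBootstrap, windowStream and collector, which belong to the surrounding test class rather than to the methods shown. As an illustration, here is a likely shape for the type-information constant, built from the TypeInformation and TypeHint classes in the import list above (the exact fixture definition in the real test is an assumption):

// Hedged sketch of the TUPLE_TYPE_INFO fixture referenced on both the bootstrap and read sides.
private static final TypeInformation<Tuple2<String, Integer>> TUPLE_TYPE_INFO =
        TypeInformation.of(new TypeHint<Tuple2<String, Integer>>() {});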

Example 55 with DataStream

Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.

The class WritableSavepointWindowITCase, method testSlideWindowWithEvictor.

@Test
public void testSlideWindowWithEvictor() throws Exception {
    final String savepointPath = getTempDirPath(new AbstractID().toHexString());
    ExecutionEnvironment bEnv = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<String, Integer>> bootstrapData =
            bEnv.fromCollection(WORDS).map(word -> Tuple2.of(word, 1)).returns(TUPLE_TYPE_INFO);
    WindowedOperatorTransformation<Tuple2<String, Integer>, String, TimeWindow> transformation =
            OperatorTransformation.bootstrapWith(bootstrapData)
                    .assignTimestamps(record -> 2L)
                    .keyBy(tuple -> tuple.f0, Types.STRING)
                    .window(SlidingEventTimeWindows.of(Time.milliseconds(5), Time.milliseconds(1)))
                    .evictor(CountEvictor.of(1));
    Savepoint.create(new MemoryStateBackend(), 128)
            .withOperator(UID, windowBootstrap.bootstrap(transformation))
            .write(savepointPath);
    bEnv.execute("write state");
    StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    WindowedStream<Tuple2<String, Integer>, String, TimeWindow> stream =
            sEnv.addSource(new MaxWatermarkSource<Tuple2<String, Integer>>())
                    .returns(TUPLE_TYPE_INFO)
                    .keyBy(tuple -> tuple.f0)
                    .window(SlidingEventTimeWindows.of(Time.milliseconds(5), Time.milliseconds(1)))
                    .evictor(CountEvictor.of(1));
    DataStream<Tuple2<String, Integer>> windowed = windowStream.window(stream).uid(UID);
    CompletableFuture<Collection<Tuple2<String, Integer>>> future = collector.collect(windowed);
    submitJob(savepointPath, sEnv);
    Collection<Tuple2<String, Integer>> results = future.get();
    Assert.assertEquals("Incorrect number of results", 15, results.size());
    Assert.assertThat("Incorrect bootstrap state", new HashSet<>(results), EVICTOR_MATCHER);
}
Also used : Arrays(java.util.Arrays) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Tuple2(org.apache.flink.api.java.tuple.Tuple2) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) EmbeddedRocksDBStateBackend(org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) AggregateFunction(org.apache.flink.api.common.functions.AggregateFunction) DataSet(org.apache.flink.api.java.DataSet) StateBackend(org.apache.flink.runtime.state.StateBackend) StreamCollector(org.apache.flink.streaming.util.StreamCollector) WindowedStream(org.apache.flink.streaming.api.datastream.WindowedStream) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Parameterized(org.junit.runners.Parameterized) AbstractTestBase(org.apache.flink.test.util.AbstractTestBase) AbstractID(org.apache.flink.util.AbstractID) Collection(java.util.Collection) MemoryStateBackend(org.apache.flink.runtime.state.memory.MemoryStateBackend) List(java.util.List) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) TumblingEventTimeWindows(org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows) ClusterClient(org.apache.flink.client.program.ClusterClient) SerializedThrowable(org.apache.flink.util.SerializedThrowable) Optional(java.util.Optional) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) RunWith(org.junit.runner.RunWith) CompletableFuture(java.util.concurrent.CompletableFuture) CountEvictor(org.apache.flink.streaming.api.windowing.evictors.CountEvictor) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Collector(org.apache.flink.util.Collector) TimeWindow(org.apache.flink.streaming.api.windowing.windows.TimeWindow) ReduceFunction(org.apache.flink.api.common.functions.ReduceFunction) Types(org.apache.flink.api.common.typeinfo.Types) Time(org.apache.flink.streaming.api.windowing.time.Time) Iterator(java.util.Iterator) SingleOutputStreamOperator(org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator) SlidingEventTimeWindows(org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows) Matchers(org.hamcrest.Matchers) Test(org.junit.Test) MaxWatermarkSource(org.apache.flink.state.api.utils.MaxWatermarkSource) ProcessWindowFunction(org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction) RocksDBStateBackend(org.apache.flink.contrib.streaming.state.RocksDBStateBackend) DataStream(org.apache.flink.streaming.api.datastream.DataStream) WindowFunction(org.apache.flink.streaming.api.functions.windowing.WindowFunction) Rule(org.junit.Rule) HashMapStateBackend(org.apache.flink.runtime.state.hashmap.HashMapStateBackend) Matcher(org.hamcrest.Matcher) Assert(org.junit.Assert) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings)

Aggregations

DataStream (org.apache.flink.streaming.api.datastream.DataStream): 87
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 78
Test (org.junit.Test): 70
List (java.util.List): 62
Collector (org.apache.flink.util.Collector): 60
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 50
SingleOutputStreamOperator (org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator): 48
Arrays (java.util.Arrays): 46
ArrayList (java.util.ArrayList): 40
TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation): 40
Assert.assertEquals (org.junit.Assert.assertEquals): 38
WatermarkStrategy (org.apache.flink.api.common.eventtime.WatermarkStrategy): 36
Configuration (org.apache.flink.configuration.Configuration): 36
Assert.assertTrue (org.junit.Assert.assertTrue): 33
BasicTypeInfo (org.apache.flink.api.common.typeinfo.BasicTypeInfo): 32
StreamOperator (org.apache.flink.streaming.api.operators.StreamOperator): 32
Types (org.apache.flink.api.common.typeinfo.Types): 31
Assert (org.junit.Assert): 31
ReduceFunction (org.apache.flink.api.common.functions.ReduceFunction): 29
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 29