
Example 6 with DataStream

use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.

the class AvroTypesITCase method testAvroStringAccess.

@Test
public void testAvroStringAccess() {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
    DataStream<User> ds = testData(env);
    Table t = tEnv.fromDataStream(ds, selectFields(ds));
    Table result = t.select($("name"));
    List<Utf8> results = CollectionUtil.iteratorToList(result.execute().collect()).stream().map(row -> (Utf8) row.getField(0)).collect(Collectors.toList());
    String expected = "Charlie\n" + "Terminator\n" + "Whatever";
    TestBaseUtils.compareResultAsText(results, expected);
}
Also used : PojoTypeInfo(org.apache.flink.api.java.typeutils.PojoTypeInfo) StreamTableEnvironment(org.apache.flink.table.api.bridge.java.StreamTableEnvironment) Arrays(java.util.Arrays) HashMap(java.util.HashMap) Expression(org.apache.flink.table.expressions.Expression) ByteBuffer(java.nio.ByteBuffer) ArrayList(java.util.ArrayList) Expressions(org.apache.flink.table.api.Expressions) BigDecimal(java.math.BigDecimal) Colors(org.apache.flink.formats.avro.generated.Colors) LocalTime(java.time.LocalTime) AbstractTestBase(org.apache.flink.test.util.AbstractTestBase) Utf8(org.apache.avro.util.Utf8) Fixed16(org.apache.flink.formats.avro.generated.Fixed16) Expressions.$(org.apache.flink.table.api.Expressions.$) DataStreamUtils(org.apache.flink.streaming.api.datastream.DataStreamUtils) Test(org.junit.Test) Table(org.apache.flink.table.api.Table) CollectionUtil(org.apache.flink.util.CollectionUtil) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) DataStream(org.apache.flink.streaming.api.datastream.DataStream) List(java.util.List) Fixed2(org.apache.flink.formats.avro.generated.Fixed2) ChronoUnit(java.time.temporal.ChronoUnit) Address(org.apache.flink.formats.avro.generated.Address) LocalDate(java.time.LocalDate) User(org.apache.flink.formats.avro.generated.User) Row(org.apache.flink.types.Row) Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) TestBaseUtils(org.apache.flink.test.util.TestBaseUtils)
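
The selectFields(ds) helper used above is not reproduced in this snippet. Judging from the imports listed (PojoTypeInfo, Arrays, Expressions, Expression), a plausible sketch of it is shown below: it derives one $("field") expression per POJO field from the stream's type information. Treat this as an illustrative reconstruction, not the exact helper from AvroTypesITCase.

// Illustrative sketch only: build one field expression per POJO field of the stream.
private static Expression[] selectFields(DataStream<User> ds) {
    PojoTypeInfo<?> pojoType = (PojoTypeInfo<?>) ds.getType();
    return Arrays.stream(pojoType.getFieldNames())
            .map(Expressions::$)
            .toArray(Expression[]::new);
}

Note also that Avro deserializes string fields as org.apache.avro.util.Utf8 rather than java.lang.String, which is why the test casts row.getField(0) to Utf8 before comparing.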

Example 7 with DataStream

use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.

the class WindowWordCount method main.

// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    final CLI params = CLI.fromArgs(args);
    // Create the execution environment. This is the main entrypoint
    // to building a Flink application.
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Apache Flink’s unified approach to stream and batch processing means that a DataStream
    // application executed over bounded input will produce the same final results regardless
    // of the configured execution mode. It is important to note what final means here: a job
    // executing in STREAMING mode might produce incremental updates (think upserts in
    // a database) while a BATCH job would only produce one final result at the end. The final
    // result will be the same if interpreted correctly, but getting there can be different.
    // 
    // The “classic” execution behavior of the DataStream API is called STREAMING execution
    // mode. Applications should use streaming execution for unbounded jobs that require
    // continuous incremental processing and are expected to stay online indefinitely.
    // 
    // By enabling BATCH execution, we allow Flink to apply additional optimizations that we
    // can only do when we know that our input is bounded. For example, different
    // join/aggregation strategies can be used, in addition to a different shuffle
    // implementation that allows more efficient task scheduling and failure recovery behavior.
    // 
    // By setting the runtime mode to AUTOMATIC, Flink will choose BATCH if all sources
    // are bounded and otherwise STREAMING.
    env.setRuntimeMode(params.getExecutionMode());
    // This optional step makes the input parameters
    // available in the Flink UI.
    env.getConfig().setGlobalJobParameters(params);
    DataStream<String> text;
    if (params.getInputs().isPresent()) {
        // Create a new file source that will read files from a given set of directories.
        // Each file will be processed as plain text and split based on newlines.
        FileSource.FileSourceBuilder<String> builder = FileSource.forRecordStreamFormat(new TextLineInputFormat(), params.getInputs().get());
        // If a discovery interval is provided, the source will
        // continuously watch the given directories for new files.
        params.getDiscoveryInterval().ifPresent(builder::monitorContinuously);
        text = env.fromSource(builder.build(), WatermarkStrategy.noWatermarks(), "file-input");
    } else {
        text = env.fromElements(WordCountData.WORDS).name("in-memory-input");
    }
    int windowSize = params.getInt("window").orElse(250);
    int slideSize = params.getInt("slide").orElse(150);
    DataStream<Tuple2<String, Integer>> counts = // will output each word as a 2-tuple containing (word, 1)
    text.flatMap(new WordCount.Tokenizer()).name("tokenizer").keyBy(value -> value.f0).countWindow(windowSize, slideSize).sum(1).name("counter");
    if (params.getOutput().isPresent()) {
        // Given an output directory, Flink will write the results to a file
        // using a simple string encoding. In a production environment, this might
        // be something more structured like CSV, Avro, JSON, or Parquet.
        counts.sinkTo(FileSink.<Tuple2<String, Integer>>forRowFormat(params.getOutput().get(), new SimpleStringEncoder<>()).withRollingPolicy(DefaultRollingPolicy.builder().withMaxPartSize(MemorySize.ofMebiBytes(1)).withRolloverInterval(Duration.ofSeconds(10)).build()).build()).name("file-sink");
    } else {
        counts.print().name("print-sink");
    }
    // Apache Flink applications are composed lazily. Calling execute
    // submits the Job and begins processing.
    env.execute("WindowWordCount");
}
Also used : Tuple2(org.apache.flink.api.java.tuple.Tuple2) WordCount(org.apache.flink.streaming.examples.wordcount.WordCount) WatermarkStrategy(org.apache.flink.api.common.eventtime.WatermarkStrategy) FileSink(org.apache.flink.connector.file.sink.FileSink) MemorySize(org.apache.flink.configuration.MemorySize) FileSource(org.apache.flink.connector.file.src.FileSource) DataStream(org.apache.flink.streaming.api.datastream.DataStream) TextLineInputFormat(org.apache.flink.connector.file.src.reader.TextLineInputFormat) SimpleStringEncoder(org.apache.flink.api.common.serialization.SimpleStringEncoder) DefaultRollingPolicy(org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy) Duration(java.time.Duration) CLI(org.apache.flink.streaming.examples.wordcount.util.CLI) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) WordCountData(org.apache.flink.streaming.examples.wordcount.util.WordCountData)
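
The comments in this example describe STREAMING, BATCH, and AUTOMATIC execution. For completeness, here is a minimal standalone sketch of pinning the mode explicitly instead of deriving it from the CLI; RuntimeExecutionMode lives in org.apache.flink.api.common, and the snippet is a small illustration rather than part of WindowWordCount itself.

// Choose the runtime mode directly. BATCH is only valid when every source is
// bounded; AUTOMATIC lets Flink pick BATCH for bounded inputs and STREAMING otherwise.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);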

Example 8 with DataStream

use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.

the class Elasticsearch7SinkExample method main.

public static void main(String[] args) throws Exception {
    final ParameterTool parameterTool = ParameterTool.fromArgs(args);
    if (parameterTool.getNumberOfParameters() < 3) {
        System.out.println("Missing parameters!\n" + "Usage: --numRecords <numRecords> --index <index> --type <type>");
        return;
    }
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(5000);
    DataStream<String> source = env.fromSequence(0, parameterTool.getInt("numRecords") - 1).map((MapFunction<Long, String>) value -> "message #" + value);
    source.sinkTo(new Elasticsearch7SinkBuilder<String>().setBulkFlushMaxActions(1).setHosts(new HttpHost("127.0.0.1", 9200, "http")).setEmitter((element, context, indexer) -> indexer.add(createIndexRequest(element, parameterTool))).build());
    env.execute("Elasticsearch7.x end to end sink test example");
}
Also used : ParameterTool(org.apache.flink.api.java.utils.ParameterTool) DataStream(org.apache.flink.streaming.api.datastream.DataStream) IndexRequest(org.elasticsearch.action.index.IndexRequest) Map(java.util.Map) HashMap(java.util.HashMap) Elasticsearch7SinkBuilder(org.apache.flink.connector.elasticsearch.sink.Elasticsearch7SinkBuilder) Requests(org.elasticsearch.client.Requests) HttpHost(org.apache.http.HttpHost) MapFunction(org.apache.flink.api.common.functions.MapFunction) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
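
The snippet above calls a createIndexRequest helper that is not shown. Based on the imports listed (Requests, IndexRequest, HashMap, ParameterTool), one plausible shape for it is the sketch below; treat it as an illustration rather than the exact helper from the Flink example.

// Illustrative sketch only: wrap the element in a one-field JSON document and
// target the index passed on the command line.
private static IndexRequest createIndexRequest(String element, ParameterTool parameterTool) {
    Map<String, Object> json = new HashMap<>();
    json.put("data", element);
    return Requests.indexRequest()
            .index(parameterTool.getRequired("index"))
            .id(element)
            .source(json);
}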

Example 9 with DataStream

use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.

the class HiveTableSink method createBatchSink.

private DataStreamSink<Row> createBatchSink(DataStream<RowData> dataStream, DataStructureConverter converter, StorageDescriptor sd, HiveWriterFactory recordWriterFactory, OutputFileConfig fileNaming, final int parallelism) throws IOException {
    FileSystemOutputFormat.Builder<Row> builder = new FileSystemOutputFormat.Builder<>();
    builder.setPartitionComputer(new HiveRowPartitionComputer(hiveShim, JobConfUtils.getDefaultPartitionName(jobConf), tableSchema.getFieldNames(), tableSchema.getFieldDataTypes(), getPartitionKeyArray()));
    builder.setDynamicGrouped(dynamicGrouping);
    builder.setPartitionColumns(getPartitionKeyArray());
    builder.setFileSystemFactory(fsFactory());
    builder.setFormatFactory(new HiveOutputFormatFactory(recordWriterFactory));
    builder.setMetaStoreFactory(msFactory());
    builder.setOverwrite(overwrite);
    builder.setStaticPartitions(staticPartitionSpec);
    builder.setTempPath(new org.apache.flink.core.fs.Path(toStagingDir(sd.getLocation(), jobConf)));
    builder.setOutputFileConfig(fileNaming);
    return dataStream.map((MapFunction<RowData, Row>) value -> (Row) converter.toExternal(value)).writeUsingOutputFormat(builder.build()).setParallelism(parallelism);
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) HiveMetastoreClientFactory(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientFactory) HiveBulkWriterFactory(org.apache.flink.connectors.hive.write.HiveBulkWriterFactory) CatalogTable(org.apache.flink.table.catalog.CatalogTable) LoggerFactory(org.slf4j.LoggerFactory) JobConfUtils(org.apache.flink.connectors.hive.util.JobConfUtils) MapFunction(org.apache.flink.api.common.functions.MapFunction) OrcSplitReaderUtil(org.apache.flink.orc.OrcSplitReaderUtil) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) SupportsPartitioning(org.apache.flink.table.connector.sink.abilities.SupportsPartitioning) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) SINK_ROLLING_POLICY_CHECK_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_CHECK_INTERVAL) StreamingFileSink(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink) Path(org.apache.hadoop.fs.Path) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory) PartFileInfo(org.apache.flink.streaming.api.functions.sink.filesystem.PartFileInfo) CheckpointRollingPolicy(org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.CheckpointRollingPolicy) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) HiveShimLoader(org.apache.flink.table.catalog.hive.client.HiveShimLoader) HiveCatalogFactoryOptions(org.apache.flink.table.catalog.hive.factories.HiveCatalogFactoryOptions) DynamicTableSink(org.apache.flink.table.connector.sink.DynamicTableSink) SINK_ROLLING_POLICY_ROLLOVER_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_ROLLOVER_INTERVAL) TableSchema(org.apache.flink.table.api.TableSchema) CompactOperator.convertToUncompacted(org.apache.flink.connector.file.table.stream.compact.CompactOperator.convertToUncompacted) UUID(java.util.UUID) HiveOutputFormat(org.apache.hadoop.hive.ql.io.HiveOutputFormat) Preconditions(org.apache.flink.util.Preconditions) StringUtils(org.apache.flink.util.StringUtils) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) HiveReflectionUtils(org.apache.flink.table.catalog.hive.util.HiveReflectionUtils) LogicalType(org.apache.flink.table.types.logical.LogicalType) DataStreamSinkProvider(org.apache.flink.table.connector.sink.DataStreamSinkProvider) Optional(java.util.Optional) Row(org.apache.flink.types.Row) ObjectIdentifier(org.apache.flink.table.catalog.ObjectIdentifier) ChangelogMode(org.apache.flink.table.connector.ChangelogMode) RowType(org.apache.flink.table.types.logical.RowType) HiveShim(org.apache.flink.table.catalog.hive.client.HiveShim) ParquetRowDataBuilder(org.apache.flink.formats.parquet.row.ParquetRowDataBuilder) BucketsBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink.BucketsBuilder) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) LinkedHashMap(java.util.LinkedHashMap) ReadableConfig(org.apache.flink.configuration.ReadableConfig) ThreadLocalClassLoaderConfiguration(org.apache.flink.orc.writer.ThreadLocalClassLoaderConfiguration) FileSystemConnectorOptions(org.apache.flink.connector.file.table.FileSystemConnectorOptions) SINK_ROLLING_POLICY_INACTIVITY_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_INACTIVITY_INTERVAL) SupportsOverwrite(org.apache.flink.table.connector.sink.abilities.SupportsOverwrite) 
HiveMetastoreClientWrapper(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientWrapper) Nullable(javax.annotation.Nullable) StreamingSink(org.apache.flink.connector.file.table.stream.StreamingSink) DataStreamSink(org.apache.flink.streaming.api.datastream.DataStreamSink) HiveConfUtils(org.apache.flink.connectors.hive.util.HiveConfUtils) HiveCompactReaderFactory(org.apache.flink.connectors.hive.read.HiveCompactReaderFactory) RowData(org.apache.flink.table.data.RowData) Logger(org.slf4j.Logger) Properties(java.util.Properties) ProviderContext(org.apache.flink.table.connector.ProviderContext) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) BulkWriter(org.apache.flink.api.common.serialization.BulkWriter) HiveConf(org.apache.hadoop.hive.conf.HiveConf) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory) TypeDescription(org.apache.orc.TypeDescription) TException(org.apache.thrift.TException) IOException(java.io.IOException) HadoopPathBasedBulkFormatBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.HadoopPathBasedBulkFormatBuilder) Table(org.apache.hadoop.hive.metastore.api.Table) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) DataStream(org.apache.flink.streaming.api.datastream.DataStream) JobConf(org.apache.hadoop.mapred.JobConf) TableBucketAssigner(org.apache.flink.connector.file.table.FileSystemTableSink.TableBucketAssigner) CompactReader(org.apache.flink.connector.file.table.stream.compact.CompactReader) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) FileSystemTableSink(org.apache.flink.connector.file.table.FileSystemTableSink) TableSchemaUtils(org.apache.flink.table.utils.TableSchemaUtils) FileSystemOutputFormat(org.apache.flink.connector.file.table.FileSystemOutputFormat) CatalogException(org.apache.flink.table.catalog.exceptions.CatalogException) SINK_ROLLING_POLICY_FILE_SIZE(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_FILE_SIZE) HiveTableUtil.checkAcidTable(org.apache.flink.table.catalog.hive.util.HiveTableUtil.checkAcidTable)

Example 10 with DataStream

use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.

the class OrcBulkWriterITCase method testOrcBulkWriter.

@Test
public void testOrcBulkWriter() throws Exception {
    final File outDir = TEMPORARY_FOLDER.newFolder();
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    final Properties writerProps = new Properties();
    writerProps.setProperty("orc.compress", "LZ4");
    final OrcBulkWriterFactory<Record> factory = new OrcBulkWriterFactory<>(new RecordVectorizer(schema), writerProps, new Configuration());
    env.setParallelism(1);
    env.enableCheckpointing(100);
    DataStream<Record> stream = env.addSource(new FiniteTestSource<>(testData), TypeInformation.of(Record.class));
    stream.map(str -> str).addSink(StreamingFileSink.forBulkFormat(new Path(outDir.toURI()), factory).withBucketAssigner(new UniqueBucketAssigner<>("test")).build());
    env.execute();
    OrcBulkWriterTestUtil.validate(outDir, testData);
}
Also used : Arrays(java.util.Arrays) Properties(java.util.Properties) FiniteTestSource(org.apache.flink.streaming.util.FiniteTestSource) Test(org.junit.Test) File(java.io.File) DataStream(org.apache.flink.streaming.api.datastream.DataStream) List(java.util.List) UniqueBucketAssigner(org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.UniqueBucketAssigner) Path(org.apache.flink.core.fs.Path) OrcBulkWriterTestUtil(org.apache.flink.orc.util.OrcBulkWriterTestUtil) Configuration(org.apache.hadoop.conf.Configuration) StreamingFileSink(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink) TestLogger(org.apache.flink.util.TestLogger) Record(org.apache.flink.orc.data.Record) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) ClassRule(org.junit.ClassRule) RecordVectorizer(org.apache.flink.orc.vector.RecordVectorizer) TemporaryFolder(org.junit.rules.TemporaryFolder) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
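
The OrcBulkWriterFactory above is driven by a Vectorizer implementation (RecordVectorizer, not shown). Here is a minimal sketch of what such a class can look like, using a hypothetical Person type with a single name field; it is not the test's RecordVectorizer.

import java.nio.charset.StandardCharsets;
import org.apache.flink.orc.vector.Vectorizer;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

// Hypothetical POJO used only for this illustration.
class Person {
    private final String name;
    Person(String name) { this.name = name; }
    String getName() { return name; }
}

// Copies each record into the next row of the ORC VectorizedRowBatch.
class PersonVectorizer extends Vectorizer<Person> {
    PersonVectorizer(String schema) {
        super(schema);
    }

    @Override
    public void vectorize(Person element, VectorizedRowBatch batch) {
        BytesColumnVector nameCol = (BytesColumnVector) batch.cols[0];
        int row = batch.size++;
        nameCol.setVal(row, element.getName().getBytes(StandardCharsets.UTF_8));
    }
}

A factory for this type would then be created with something like new OrcBulkWriterFactory<>(new PersonVectorizer("struct<name:string>"), writerProps, new Configuration()).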

Aggregations

DataStream (org.apache.flink.streaming.api.datastream.DataStream) 87
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) 78
Test (org.junit.Test) 70
List (java.util.List) 62
Collector (org.apache.flink.util.Collector) 60
Tuple2 (org.apache.flink.api.java.tuple.Tuple2) 50
SingleOutputStreamOperator (org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator) 48
Arrays (java.util.Arrays) 46
ArrayList (java.util.ArrayList) 40
TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation) 40
Assert.assertEquals (org.junit.Assert.assertEquals) 38
WatermarkStrategy (org.apache.flink.api.common.eventtime.WatermarkStrategy) 36
Configuration (org.apache.flink.configuration.Configuration) 36
Assert.assertTrue (org.junit.Assert.assertTrue) 33
BasicTypeInfo (org.apache.flink.api.common.typeinfo.BasicTypeInfo) 32
StreamOperator (org.apache.flink.streaming.api.operators.StreamOperator) 32
Types (org.apache.flink.api.common.typeinfo.Types) 31
Assert (org.junit.Assert) 31
ReduceFunction (org.apache.flink.api.common.functions.ReduceFunction) 29
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph) 29