Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.
The class AvroTypesITCase, method testAvroStringAccess.
@Test
public void testAvroStringAccess() {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
DataStream<User> ds = testData(env);
Table t = tEnv.fromDataStream(ds, selectFields(ds));
Table result = t.select($("name"));
List<Utf8> results =
        CollectionUtil.iteratorToList(result.execute().collect()).stream()
                .map(row -> (Utf8) row.getField(0))
                .collect(Collectors.toList());
String expected = "Charlie\n" + "Terminator\n" + "Whatever";
TestBaseUtils.compareResultAsText(results, expected);
}
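The helpers testData and selectFields are defined elsewhere in AvroTypesITCase and are not shown here. As a rough, self-contained sketch of the same fromDataStream / select / collect pattern, using a plain POJO in place of the Avro-generated User type (all names below are hypothetical and not part of the test):
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import org.apache.flink.util.CloseableIterator;

import static org.apache.flink.table.api.Expressions.$;

public class FromDataStreamSketch {
    // Simple POJO standing in for the Avro-generated User class.
    public static class Person {
        public String name;
        public int age;
        public Person() {}
        public Person(String name, int age) { this.name = name; this.age = age; }
    }

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

        DataStream<Person> ds = env.fromElements(new Person("Charlie", 42), new Person("Terminator", 1));

        // Convert the DataStream into a Table and project a single field.
        Table t = tEnv.fromDataStream(ds);
        Table result = t.select($("name"));

        // Collect the result back to the client, as the test above does.
        try (CloseableIterator<Row> it = result.execute().collect()) {
            it.forEachRemaining(row -> System.out.println(row.getField(0)));
        }
    }
}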
Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.
The class WindowWordCount, method main.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
final CLI params = CLI.fromArgs(args);
// Create the execution environment. This is the main entrypoint
// to building a Flink application.
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Apache Flink’s unified approach to stream and batch processing means that a DataStream
// application executed over bounded input will produce the same final results regardless
// of the configured execution mode. It is important to note what final means here: a job
// executing in STREAMING mode might produce incremental updates (think upserts in
// a database) while a BATCH job would only produce one final result at the end. The final
// result will be the same if interpreted correctly, but getting there can be different.
//
// The “classic” execution behavior of the DataStream API is called STREAMING execution
// mode. Applications should use streaming execution for unbounded jobs that require
// continuous incremental processing and are expected to stay online indefinitely.
//
// By enabling BATCH execution, we allow Flink to apply additional optimizations that we
// can only do when we know that our input is bounded. For example, different
// join/aggregation strategies can be used, in addition to a different shuffle
// implementation that allows more efficient task scheduling and failure recovery behavior.
//
// By setting the runtime mode to AUTOMATIC, Flink will choose BATCH if all sources
// are bounded and otherwise STREAMING.
env.setRuntimeMode(params.getExecutionMode());
// This optional step makes the input parameters
// available in the Flink UI.
env.getConfig().setGlobalJobParameters(params);
DataStream<String> text;
if (params.getInputs().isPresent()) {
// Create a new file source that will read files from a given set of directories.
// Each file will be processed as plain text and split based on newlines.
FileSource.FileSourceBuilder<String> builder =
        FileSource.forRecordStreamFormat(new TextLineInputFormat(), params.getInputs().get());
// If a discovery interval is provided, the source will
// continuously watch the given directories for new files.
params.getDiscoveryInterval().ifPresent(builder::monitorContinuously);
text = env.fromSource(builder.build(), WatermarkStrategy.noWatermarks(), "file-input");
} else {
text = env.fromElements(WordCountData.WORDS).name("in-memory-input");
}
int windowSize = params.getInt("window").orElse(250);
int slideSize = params.getInt("slide").orElse(150);
// will output each word as a (2-tuple) containing (word, 1)
DataStream<Tuple2<String, Integer>> counts =
        text.flatMap(new WordCount.Tokenizer())
                .name("tokenizer")
                .keyBy(value -> value.f0)
                .countWindow(windowSize, slideSize)
                .sum(1)
                .name("counter");
if (params.getOutput().isPresent()) {
// Given an output directory, Flink will write the results to a file
// using a simple string encoding. In a production environment, this might
// be something more structured like CSV, Avro, JSON, or Parquet.
counts.sinkTo(
        FileSink.<Tuple2<String, Integer>>forRowFormat(params.getOutput().get(), new SimpleStringEncoder<>())
                .withRollingPolicy(
                        DefaultRollingPolicy.builder()
                                .withMaxPartSize(MemorySize.ofMebiBytes(1))
                                .withRolloverInterval(Duration.ofSeconds(10))
                                .build())
                .build())
        .name("file-sink");
} else {
counts.print().name("print-sink");
}
// Apache Flink applications are composed lazily. Calling execute
// submits the Job and begins processing.
env.execute("WindowWordCount");
}
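The WordCount.Tokenizer reused above is defined in the companion WordCount example and is not shown here. A tokenizer of this shape is typically a FlatMapFunction that lower-cases each line, splits it on non-word characters, and emits (word, 1) pairs; the following is a sketch under that assumption, not necessarily the exact Flink source:
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

/** Splits lines into lower-case words and emits (word, 1) for each one (sketch). */
public final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {
    @Override
    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
        // split the line on any run of non-word characters
        for (String token : value.toLowerCase().split("\\W+")) {
            if (!token.isEmpty()) {
                out.collect(new Tuple2<>(token, 1));
            }
        }
    }
}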
Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.
The class Elasticsearch7SinkExample, method main.
public static void main(String[] args) throws Exception {
final ParameterTool parameterTool = ParameterTool.fromArgs(args);
if (parameterTool.getNumberOfParameters() < 3) {
System.out.println("Missing parameters!\n" + "Usage: --numRecords <numRecords> --index <index> --type <type>");
return;
}
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(5000);
DataStream<String> source =
        env.fromSequence(0, parameterTool.getInt("numRecords") - 1)
                .map((MapFunction<Long, String>) value -> "message #" + value);
source.sinkTo(
        new Elasticsearch7SinkBuilder<String>()
                .setBulkFlushMaxActions(1)
                .setHosts(new HttpHost("127.0.0.1", 9200, "http"))
                .setEmitter((element, context, indexer) -> indexer.add(createIndexRequest(element, parameterTool)))
                .build());
env.execute("Elasticsearch7.x end to end sink test example");
}
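The createIndexRequest helper referenced by the emitter is defined elsewhere in the example class. A minimal sketch of what such a helper might look like, assuming the record is wrapped in a single "data" field, the element doubles as the document id, and the index name comes from the --index parameter:
import java.util.HashMap;
import java.util.Map;

import org.apache.flink.api.java.utils.ParameterTool;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;

// Sketch of the helper used by the emitter above; the document layout ("data" -> element)
// and the use of the element as the document id are assumptions, not the exact example code.
private static IndexRequest createIndexRequest(String element, ParameterTool parameterTool) {
    Map<String, Object> json = new HashMap<>();
    json.put("data", element);
    return Requests.indexRequest()
            .index(parameterTool.getRequired("index"))
            .id(element)
            .source(json);
}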
Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.
The class HiveTableSink, method createBatchSink.
private DataStreamSink<Row> createBatchSink(
        DataStream<RowData> dataStream,
        DataStructureConverter converter,
        StorageDescriptor sd,
        HiveWriterFactory recordWriterFactory,
        OutputFileConfig fileNaming,
        final int parallelism)
        throws IOException {
FileSystemOutputFormat.Builder<Row> builder = new FileSystemOutputFormat.Builder<>();
builder.setPartitionComputer(
        new HiveRowPartitionComputer(
                hiveShim,
                JobConfUtils.getDefaultPartitionName(jobConf),
                tableSchema.getFieldNames(),
                tableSchema.getFieldDataTypes(),
                getPartitionKeyArray()));
builder.setDynamicGrouped(dynamicGrouping);
builder.setPartitionColumns(getPartitionKeyArray());
builder.setFileSystemFactory(fsFactory());
builder.setFormatFactory(new HiveOutputFormatFactory(recordWriterFactory));
builder.setMetaStoreFactory(msFactory());
builder.setOverwrite(overwrite);
builder.setStaticPartitions(staticPartitionSpec);
builder.setTempPath(new org.apache.flink.core.fs.Path(toStagingDir(sd.getLocation(), jobConf)));
builder.setOutputFileConfig(fileNaming);
return dataStream
        .map((MapFunction<RowData, Row>) value -> (Row) converter.toExternal(value))
        .writeUsingOutputFormat(builder.build())
        .setParallelism(parallelism);
}
Use of org.apache.flink.streaming.api.datastream.DataStream in project flink by apache.
The class OrcBulkWriterITCase, method testOrcBulkWriter.
@Test
public void testOrcBulkWriter() throws Exception {
final File outDir = TEMPORARY_FOLDER.newFolder();
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
final Properties writerProps = new Properties();
writerProps.setProperty("orc.compress", "LZ4");
final OrcBulkWriterFactory<Record> factory = new OrcBulkWriterFactory<>(new RecordVectorizer(schema), writerProps, new Configuration());
env.setParallelism(1);
env.enableCheckpointing(100);
DataStream<Record> stream = env.addSource(new FiniteTestSource<>(testData), TypeInformation.of(Record.class));
stream.map(str -> str)
        .addSink(
                StreamingFileSink.forBulkFormat(new Path(outDir.toURI()), factory)
                        .withBucketAssigner(new UniqueBucketAssigner<>("test"))
                        .build());
env.execute();
OrcBulkWriterTestUtil.validate(outDir, testData);
}
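The Record type and its RecordVectorizer are test fixtures that are not shown here. The general pattern for OrcBulkWriterFactory is to extend org.apache.flink.orc.vector.Vectorizer and fill the VectorizedRowBatch column by column; the following sketch assumes a Record with a String name and an int age and the schema struct<name:string,age:int>, which may differ from the actual fixtures:
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.flink.orc.vector.Vectorizer;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

// Hypothetical record type standing in for the test's Record class.
class Record {
    private final String name;
    private final int age;
    Record(String name, int age) { this.name = name; this.age = age; }
    String getName() { return name; }
    int getAge() { return age; }
}

// Sketch of a vectorizer for the schema "struct<name:string,age:int>"; the test's
// RecordVectorizer may differ in detail.
class RecordVectorizerSketch extends Vectorizer<Record> {

    RecordVectorizerSketch(String schema) {
        super(schema);
    }

    @Override
    public void vectorize(Record element, VectorizedRowBatch batch) throws IOException {
        // Fill one row per element: column 0 holds the name, column 1 the age.
        BytesColumnVector nameCol = (BytesColumnVector) batch.cols[0];
        LongColumnVector ageCol = (LongColumnVector) batch.cols[1];
        int row = batch.size++;
        nameCol.setVal(row, element.getName().getBytes(StandardCharsets.UTF_8));
        ageCol.vector[row] = element.getAge();
    }
}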