Example 1 with DataStreamSink

Use of org.apache.flink.streaming.api.datastream.DataStreamSink in project flink by apache.

The class HiveTableSink, method createBatchSink.

private DataStreamSink<Row> createBatchSink(DataStream<RowData> dataStream, DataStructureConverter converter, StorageDescriptor sd, HiveWriterFactory recordWriterFactory, OutputFileConfig fileNaming, final int parallelism) throws IOException {
    FileSystemOutputFormat.Builder<Row> builder = new FileSystemOutputFormat.Builder<>();
    builder.setPartitionComputer(new HiveRowPartitionComputer(hiveShim, JobConfUtils.getDefaultPartitionName(jobConf), tableSchema.getFieldNames(), tableSchema.getFieldDataTypes(), getPartitionKeyArray()));
    builder.setDynamicGrouped(dynamicGrouping);
    builder.setPartitionColumns(getPartitionKeyArray());
    builder.setFileSystemFactory(fsFactory());
    builder.setFormatFactory(new HiveOutputFormatFactory(recordWriterFactory));
    builder.setMetaStoreFactory(msFactory());
    builder.setOverwrite(overwrite);
    builder.setStaticPartitions(staticPartitionSpec);
    builder.setTempPath(new org.apache.flink.core.fs.Path(toStagingDir(sd.getLocation(), jobConf)));
    builder.setOutputFileConfig(fileNaming);
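    // convert internal RowData records to external Rows, then write them through the configured batch output format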
    return dataStream.map((MapFunction<RowData, Row>) value -> (Row) converter.toExternal(value)).writeUsingOutputFormat(builder.build()).setParallelism(parallelism);
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) HiveMetastoreClientFactory(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientFactory) HiveBulkWriterFactory(org.apache.flink.connectors.hive.write.HiveBulkWriterFactory) CatalogTable(org.apache.flink.table.catalog.CatalogTable) LoggerFactory(org.slf4j.LoggerFactory) JobConfUtils(org.apache.flink.connectors.hive.util.JobConfUtils) MapFunction(org.apache.flink.api.common.functions.MapFunction) OrcSplitReaderUtil(org.apache.flink.orc.OrcSplitReaderUtil) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) SupportsPartitioning(org.apache.flink.table.connector.sink.abilities.SupportsPartitioning) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) SINK_ROLLING_POLICY_CHECK_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_CHECK_INTERVAL) StreamingFileSink(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink) Path(org.apache.hadoop.fs.Path) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory) PartFileInfo(org.apache.flink.streaming.api.functions.sink.filesystem.PartFileInfo) CheckpointRollingPolicy(org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.CheckpointRollingPolicy) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) HiveShimLoader(org.apache.flink.table.catalog.hive.client.HiveShimLoader) HiveCatalogFactoryOptions(org.apache.flink.table.catalog.hive.factories.HiveCatalogFactoryOptions) DynamicTableSink(org.apache.flink.table.connector.sink.DynamicTableSink) SINK_ROLLING_POLICY_ROLLOVER_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_ROLLOVER_INTERVAL) TableSchema(org.apache.flink.table.api.TableSchema) CompactOperator.convertToUncompacted(org.apache.flink.connector.file.table.stream.compact.CompactOperator.convertToUncompacted) UUID(java.util.UUID) HiveOutputFormat(org.apache.hadoop.hive.ql.io.HiveOutputFormat) Preconditions(org.apache.flink.util.Preconditions) StringUtils(org.apache.flink.util.StringUtils) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) HiveReflectionUtils(org.apache.flink.table.catalog.hive.util.HiveReflectionUtils) LogicalType(org.apache.flink.table.types.logical.LogicalType) DataStreamSinkProvider(org.apache.flink.table.connector.sink.DataStreamSinkProvider) Optional(java.util.Optional) Row(org.apache.flink.types.Row) ObjectIdentifier(org.apache.flink.table.catalog.ObjectIdentifier) ChangelogMode(org.apache.flink.table.connector.ChangelogMode) RowType(org.apache.flink.table.types.logical.RowType) HiveShim(org.apache.flink.table.catalog.hive.client.HiveShim) ParquetRowDataBuilder(org.apache.flink.formats.parquet.row.ParquetRowDataBuilder) BucketsBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink.BucketsBuilder) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) LinkedHashMap(java.util.LinkedHashMap) ReadableConfig(org.apache.flink.configuration.ReadableConfig) ThreadLocalClassLoaderConfiguration(org.apache.flink.orc.writer.ThreadLocalClassLoaderConfiguration) FileSystemConnectorOptions(org.apache.flink.connector.file.table.FileSystemConnectorOptions) SINK_ROLLING_POLICY_INACTIVITY_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_INACTIVITY_INTERVAL) SupportsOverwrite(org.apache.flink.table.connector.sink.abilities.SupportsOverwrite) 
HiveMetastoreClientWrapper(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientWrapper) Nullable(javax.annotation.Nullable) StreamingSink(org.apache.flink.connector.file.table.stream.StreamingSink) DataStreamSink(org.apache.flink.streaming.api.datastream.DataStreamSink) HiveConfUtils(org.apache.flink.connectors.hive.util.HiveConfUtils) HiveCompactReaderFactory(org.apache.flink.connectors.hive.read.HiveCompactReaderFactory) RowData(org.apache.flink.table.data.RowData) Logger(org.slf4j.Logger) Properties(java.util.Properties) ProviderContext(org.apache.flink.table.connector.ProviderContext) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) BulkWriter(org.apache.flink.api.common.serialization.BulkWriter) HiveConf(org.apache.hadoop.hive.conf.HiveConf) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory) TypeDescription(org.apache.orc.TypeDescription) TException(org.apache.thrift.TException) IOException(java.io.IOException) HadoopPathBasedBulkFormatBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.HadoopPathBasedBulkFormatBuilder) Table(org.apache.hadoop.hive.metastore.api.Table) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) DataStream(org.apache.flink.streaming.api.datastream.DataStream) JobConf(org.apache.hadoop.mapred.JobConf) TableBucketAssigner(org.apache.flink.connector.file.table.FileSystemTableSink.TableBucketAssigner) CompactReader(org.apache.flink.connector.file.table.stream.compact.CompactReader) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) FileSystemTableSink(org.apache.flink.connector.file.table.FileSystemTableSink) TableSchemaUtils(org.apache.flink.table.utils.TableSchemaUtils) FileSystemOutputFormat(org.apache.flink.connector.file.table.FileSystemOutputFormat) CatalogException(org.apache.flink.table.catalog.exceptions.CatalogException) SINK_ROLLING_POLICY_FILE_SIZE(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_FILE_SIZE) HiveTableUtil.checkAcidTable(org.apache.flink.table.catalog.hive.util.HiveTableUtil.checkAcidTable)
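
The batch path reduces to three steps: convert internal RowData records to external Rows, write them through a FileSystemOutputFormat built for the Hive table, and pin the sink parallelism. Below is a minimal sketch of the same map/writeUsingOutputFormat/setParallelism pattern; it substitutes a plain TextOutputFormat and a local path for the Hive-specific builder wiring, and the class name, path, and job name are hypothetical.

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.io.TextOutputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class BatchSinkSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStream<Long> records = env.generateSequence(1L, 100L);
        DataStreamSink<String> sink = records
                // the map stands in for converter.toExternal: internal record -> external representation
                .map((MapFunction<Long, String>) String::valueOf)
                // writeUsingOutputFormat attaches an OutputFormat-based sink, as createBatchSink does
                .writeUsingOutputFormat(new TextOutputFormat<>(new Path("/tmp/sink-demo")))
                // fix the sink parallelism explicitly, mirroring setParallelism(parallelism) above
                .setParallelism(1);
        env.execute("batch-sink-sketch");
    }
}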

Example 2 with DataStreamSink

Use of org.apache.flink.streaming.api.datastream.DataStreamSink in project flink by apache.

The class StreamGraphCoLocationConstraintTest, method testSettingCoLocationConstraint.

@Test
public void testSettingCoLocationConstraint() throws Exception {
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(7);
    // set up the test program
    DataStream<Long> source = env.generateSequence(1L, 10_000_000);
    source.getTransformation().setCoLocationGroupKey("group1");
    DataStream<Long> step1 = source.keyBy(v -> v).map(v -> v);
    step1.getTransformation().setCoLocationGroupKey("group2");
    DataStream<Long> step2 = step1.keyBy(v -> v).map(v -> v);
    step2.getTransformation().setCoLocationGroupKey("group1");
    DataStreamSink<Long> result = step2.keyBy(v -> v).addSink(new DiscardingSink<>());
    result.getTransformation().setCoLocationGroupKey("group2");
    // get the graph
    final JobGraph jobGraph = env.getStreamGraph().getJobGraph();
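    // each keyBy introduces a network shuffle that breaks operator chaining, so source, both maps, and the sink become four separate job vertices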
    assertEquals(4, jobGraph.getNumberOfVertices());
    List<JobVertex> vertices = jobGraph.getVerticesSortedTopologicallyFromSources();
    for (JobVertex vertex : vertices) {
        assertNotNull(vertex.getCoLocationGroup());
    }
    assertEquals(vertices.get(0).getCoLocationGroup(), vertices.get(2).getCoLocationGroup());
    assertEquals(vertices.get(1).getCoLocationGroup(), vertices.get(3).getCoLocationGroup());
}
Also used : DataStream(org.apache.flink.streaming.api.datastream.DataStream) DataStreamSink(org.apache.flink.streaming.api.datastream.DataStreamSink) List(java.util.List) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) DiscardingSink(org.apache.flink.streaming.api.functions.sink.DiscardingSink) Assert.assertNotNull(org.junit.Assert.assertNotNull) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Test(org.junit.Test) Assert.fail(org.junit.Assert.fail) Assert.assertEquals(org.junit.Assert.assertEquals) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)

Example 3 with DataStreamSink

Use of org.apache.flink.streaming.api.datastream.DataStreamSink in project flink by apache.

The class StreamGraphCoLocationConstraintTest, method testCoLocateDifferenSharingGroups.

@Test
public void testCoLocateDifferenSharingGroups() throws Exception {
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(7);
    // set up the test program
    DataStream<Long> source = env.generateSequence(1L, 10_000_000);
    source.getTransformation().setSlotSharingGroup("ssg1");
    source.getTransformation().setCoLocationGroupKey("co1");
    DataStream<Long> step1 = source.keyBy(v -> v).map(v -> v);
    step1.getTransformation().setSlotSharingGroup("ssg2");
    step1.getTransformation().setCoLocationGroupKey("co2");
    DataStream<Long> step2 = step1.keyBy(v -> v).map(v -> v);
    step2.getTransformation().setSlotSharingGroup("ssg3");
    step2.getTransformation().setCoLocationGroupKey("co1");
    DataStreamSink<Long> result = step2.keyBy(v -> v).addSink(new DiscardingSink<>());
    result.getTransformation().setSlotSharingGroup("ssg4");
    result.getTransformation().setCoLocationGroupKey("co2");
    // get the graph
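    // building the job graph must fail: co-located tasks have to share one slot sharing group, but "co1" spans ssg1 and ssg3, and "co2" spans ssg2 and ssg4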
    try {
        env.getStreamGraph().getJobGraph();
        fail("exception expected");
    } catch (IllegalStateException ignored) {
    }
}
Also used : DataStream(org.apache.flink.streaming.api.datastream.DataStream) DataStreamSink(org.apache.flink.streaming.api.datastream.DataStreamSink) List(java.util.List) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) DiscardingSink(org.apache.flink.streaming.api.functions.sink.DiscardingSink) Assert.assertNotNull(org.junit.Assert.assertNotNull) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Test(org.junit.Test) Assert.fail(org.junit.Assert.fail) Assert.assertEquals(org.junit.Assert.assertEquals) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
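
For contrast, here is a minimal sketch (same APIs as the tests above; the method name is hypothetical) in which each co-location group stays entirely inside a single slot sharing group, so building the job graph succeeds:

@Test
public void testCoLocationWithinSingleSharingGroup() throws Exception {
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(7);
    DataStream<Long> source = env.generateSequence(1L, 10_000_000);
    source.getTransformation().setSlotSharingGroup("ssg1");
    source.getTransformation().setCoLocationGroupKey("co1");
    DataStream<Long> step1 = source.keyBy(v -> v).map(v -> v);
    step1.getTransformation().setSlotSharingGroup("ssg1");
    step1.getTransformation().setCoLocationGroupKey("co1");
    DataStreamSink<Long> result = step1.keyBy(v -> v).addSink(new DiscardingSink<>());
    result.getTransformation().setSlotSharingGroup("ssg2");
    result.getTransformation().setCoLocationGroupKey("co2");
    // "co1" lives entirely in ssg1 and "co2" entirely in ssg2, so no IllegalStateException is thrown
    env.getStreamGraph().getJobGraph();
}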

Example 4 with DataStreamSink

Use of org.apache.flink.streaming.api.datastream.DataStreamSink in project flink by apache.

The class CommonExecLegacySink, method translateToPlanInternal.

@SuppressWarnings("unchecked")
@Override
protected Transformation<T> translateToPlanInternal(PlannerBase planner, ExecNodeConfig config) {
    if (tableSink instanceof StreamTableSink) {
        final Transformation<T> transform;
        if (tableSink instanceof RetractStreamTableSink) {
            transform = translateToTransformation(planner, config, true);
        } else if (tableSink instanceof UpsertStreamTableSink) {
            UpsertStreamTableSink<T> upsertSink = (UpsertStreamTableSink<T>) tableSink;
            final boolean isAppendOnlyTable = !needRetraction;
            upsertSink.setIsAppendOnly(isAppendOnlyTable);
            if (upsertKeys != null) {
                upsertSink.setKeyFields(upsertKeys);
            } else {
                if (isAppendOnlyTable) {
                    upsertSink.setKeyFields(null);
                } else {
                    throw new TableException("UpsertStreamTableSink requires that Table has a full primary keys if it is updated.");
                }
            }
            transform = translateToTransformation(planner, config, true);
        } else if (tableSink instanceof AppendStreamTableSink) {
            // verify table is an insert-only (append-only) table
            if (needRetraction) {
                throw new TableException("AppendStreamTableSink requires that Table has only insert changes.");
            }
            transform = translateToTransformation(planner, config, false);
        } else {
            if (isStreaming) {
                throw new TableException("Stream Tables can only be emitted by AppendStreamTableSink, " + "RetractStreamTableSink, or UpsertStreamTableSink.");
            } else {
                transform = translateToTransformation(planner, config, false);
            }
        }
        final DataStream<T> dataStream = new DataStream<T>(planner.getExecEnv(), transform);
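        // the legacy contract: consumeDataStream must attach a sink to the stream and return the resulting DataStreamSink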
        final DataStreamSink<T> dsSink = (DataStreamSink<T>) ((StreamTableSink<T>) tableSink).consumeDataStream(dataStream);
        if (dsSink == null) {
            throw new TableException(String.format("The StreamTableSink#consumeDataStream(DataStream) must be implemented " + "and return the sink transformation DataStreamSink. " + "However, %s doesn't implement this method.", tableSink.getClass().getCanonicalName()));
        }
        return dsSink.getLegacyTransformation();
    } else if (tableSink instanceof DataStreamTableSink) {
        // This is not a real table sink, so we only need to translate its input to a Transformation.
        return translateToTransformation(planner, config, ((DataStreamTableSink<T>) tableSink).withChangeFlag());
    } else {
        throw new TableException(String.format("Only Support StreamTableSink! However %s is not a StreamTableSink.", tableSink.getClass().getCanonicalName()));
    }
}
Also used : TableException(org.apache.flink.table.api.TableException) UpsertStreamTableSink(org.apache.flink.table.sinks.UpsertStreamTableSink) DataStream(org.apache.flink.streaming.api.datastream.DataStream) DataStreamTableSink(org.apache.flink.table.planner.sinks.DataStreamTableSink) AppendStreamTableSink(org.apache.flink.table.sinks.AppendStreamTableSink) StreamTableSink(org.apache.flink.table.sinks.StreamTableSink) RetractStreamTableSink(org.apache.flink.table.sinks.RetractStreamTableSink) DataStreamSink(org.apache.flink.streaming.api.datastream.DataStreamSink)
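
The contract enforced above is that a legacy StreamTableSink#consumeDataStream attaches a sink and returns the resulting DataStreamSink instead of null. A minimal sketch of a conforming implementation follows; it is a fragment of a hypothetical legacy sink class, and only the consumeDataStream shape matters.

// fragment of a hypothetical legacy sink implementing StreamTableSink<Row>
@Override
public DataStreamSink<?> consumeDataStream(DataStream<Row> dataStream) {
    // attach a concrete sink and return it, so the planner can later call
    // getLegacyTransformation() on the returned DataStreamSink
    return dataStream.addSink(new DiscardingSink<>()).name("demo-sink");
}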

Aggregations

DataStream (org.apache.flink.streaming.api.datastream.DataStream)4 DataStreamSink (org.apache.flink.streaming.api.datastream.DataStreamSink)4 List (java.util.List)3 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)2 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)2 StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)2 DiscardingSink (org.apache.flink.streaming.api.functions.sink.DiscardingSink)2 Assert.assertEquals (org.junit.Assert.assertEquals)2 Assert.assertNotNull (org.junit.Assert.assertNotNull)2 Assert.fail (org.junit.Assert.fail)2 Test (org.junit.Test)2 IOException (java.io.IOException)1 UncheckedIOException (java.io.UncheckedIOException)1 LinkedHashMap (java.util.LinkedHashMap)1 Map (java.util.Map)1 Optional (java.util.Optional)1 Properties (java.util.Properties)1 UUID (java.util.UUID)1 Nullable (javax.annotation.Nullable)1 VisibleForTesting (org.apache.flink.annotation.VisibleForTesting)1