
Example 1 with PartitionCommitInfo

use of org.apache.flink.connector.file.table.stream.PartitionCommitInfo in project flink by apache.

the class FileSystemTableSink method createStreamingSink.

private DataStreamSink<?> createStreamingSink(ProviderContext providerContext, DataStream<RowData> dataStream, Context sinkContext, final int parallelism) {
    FileSystemFactory fsFactory = FileSystem::get;
    RowDataPartitionComputer computer = partitionComputer();
    boolean autoCompaction = tableOptions.getBoolean(AUTO_COMPACTION);
    Object writer = createWriter(sinkContext);
    boolean isEncoder = writer instanceof Encoder;
    TableBucketAssigner assigner = new TableBucketAssigner(computer);
    TableRollingPolicy rollingPolicy = new TableRollingPolicy(
            !isEncoder || autoCompaction,
            tableOptions.get(SINK_ROLLING_POLICY_FILE_SIZE).getBytes(),
            tableOptions.get(SINK_ROLLING_POLICY_ROLLOVER_INTERVAL).toMillis(),
            tableOptions.get(SINK_ROLLING_POLICY_INACTIVITY_INTERVAL).toMillis());
    String randomPrefix = "part-" + UUID.randomUUID().toString();
    OutputFileConfig.OutputFileConfigBuilder fileNamingBuilder = OutputFileConfig.builder();
    fileNamingBuilder = autoCompaction ? fileNamingBuilder.withPartPrefix(convertToUncompacted(randomPrefix)) : fileNamingBuilder.withPartPrefix(randomPrefix);
    OutputFileConfig fileNamingConfig = fileNamingBuilder.build();
    BucketsBuilder<RowData, String, ? extends BucketsBuilder<RowData, ?, ?>> bucketsBuilder;
    if (isEncoder) {
        // noinspection unchecked
        bucketsBuilder = StreamingFileSink.forRowFormat(
                        path, new ProjectionEncoder((Encoder<RowData>) writer, computer))
                .withBucketAssigner(assigner)
                .withOutputFileConfig(fileNamingConfig)
                .withRollingPolicy(rollingPolicy);
    } else {
        // noinspection unchecked
        bucketsBuilder = StreamingFileSink.forBulkFormat(
                        path, new ProjectionBulkFactory((BulkWriter.Factory<RowData>) writer, computer))
                .withBucketAssigner(assigner)
                .withOutputFileConfig(fileNamingConfig)
                .withRollingPolicy(rollingPolicy);
    }
    long bucketCheckInterval = tableOptions.get(SINK_ROLLING_POLICY_CHECK_INTERVAL).toMillis();
    DataStream<PartitionCommitInfo> writerStream;
    if (autoCompaction) {
        long compactionSize = tableOptions.getOptional(COMPACTION_FILE_SIZE)
                .orElse(tableOptions.get(SINK_ROLLING_POLICY_FILE_SIZE)).getBytes();
        CompactReader.Factory<RowData> reader = createCompactReaderFactory(sinkContext)
                .orElseThrow(() -> new TableException(
                        "Please implement available reader for compaction: BulkFormat, FileInputFormat."));
        writerStream = StreamingSink.compactionWriter(providerContext, dataStream, bucketCheckInterval,
                bucketsBuilder, fsFactory, path, reader, compactionSize, parallelism);
    } else {
        writerStream = StreamingSink.writer(providerContext, dataStream, bucketCheckInterval, bucketsBuilder, parallelism, partitionKeys, tableOptions);
    }
    return StreamingSink.sink(providerContext, writerStream, path, tableIdentifier, partitionKeys, new EmptyMetaStoreFactory(path), fsFactory, tableOptions);
}
Also used : TableException(org.apache.flink.table.api.TableException) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) RowData(org.apache.flink.table.data.RowData) CompactReader(org.apache.flink.connector.file.table.stream.compact.CompactReader) Encoder(org.apache.flink.api.common.serialization.Encoder)
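
For orientation, the rolling-policy and compaction values read in createStreamingSink come from the filesystem table's WITH options. Below is a minimal, hypothetical sketch (not from the Flink sources) of a table declaration that would exercise the bulk-format and auto-compaction branches of this method; the table name, path, format, and size/interval values are placeholder assumptions, while the option keys are the documented filesystem connector options.

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class FileSystemSinkOptionsSketch {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // The streaming file sink finalizes and commits files on checkpoints.
        env.enableCheckpointing(60_000);
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

        tEnv.executeSql(
                "CREATE TABLE fs_sink (id BIGINT, dt STRING) PARTITIONED BY (dt) WITH ("
                        + " 'connector' = 'filesystem',"
                        + " 'path' = '/tmp/fs_sink',"                            // placeholder path
                        + " 'format' = 'parquet',"                               // bulk writer -> forBulkFormat branch
                        + " 'auto-compaction' = 'true',"                         // selects StreamingSink.compactionWriter
                        + " 'compaction.file-size' = '128MB',"                   // COMPACTION_FILE_SIZE
                        + " 'sink.rolling-policy.file-size' = '128MB',"          // SINK_ROLLING_POLICY_FILE_SIZE
                        + " 'sink.rolling-policy.rollover-interval' = '30 min',"
                        + " 'sink.rolling-policy.check-interval' = '1 min'"
                        + ")");
    }
}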

Example 2 with PartitionCommitInfo

use of org.apache.flink.connector.file.table.stream.PartitionCommitInfo in project flink by apache.

the class HiveTableSink method createStreamSink.

private DataStreamSink<?> createStreamSink(
        ProviderContext providerContext, DataStream<RowData> dataStream, StorageDescriptor sd,
        Properties tableProps, HiveWriterFactory recordWriterFactory,
        OutputFileConfig.OutputFileConfigBuilder fileNamingBuilder, final int parallelism) {
    org.apache.flink.configuration.Configuration conf = new org.apache.flink.configuration.Configuration();
    catalogTable.getOptions().forEach(conf::setString);
    String commitPolicies = conf.getString(FileSystemConnectorOptions.SINK_PARTITION_COMMIT_POLICY_KIND);
    if (!getPartitionKeys().isEmpty() && StringUtils.isNullOrWhitespaceOnly(commitPolicies)) {
        throw new FlinkHiveException(String.format(
                "Streaming write to partitioned hive table %s without providing a commit policy. "
                        + "Make sure to set a proper value for %s",
                identifier, FileSystemConnectorOptions.SINK_PARTITION_COMMIT_POLICY_KIND.key()));
    }
    HiveRowDataPartitionComputer partComputer = new HiveRowDataPartitionComputer(
            hiveShim, JobConfUtils.getDefaultPartitionName(jobConf), tableSchema.getFieldNames(),
            tableSchema.getFieldDataTypes(), getPartitionKeyArray());
    TableBucketAssigner assigner = new TableBucketAssigner(partComputer);
    HiveRollingPolicy rollingPolicy = new HiveRollingPolicy(
            conf.get(SINK_ROLLING_POLICY_FILE_SIZE).getBytes(),
            conf.get(SINK_ROLLING_POLICY_ROLLOVER_INTERVAL).toMillis(),
            conf.get(SINK_ROLLING_POLICY_INACTIVITY_INTERVAL).toMillis());
    boolean autoCompaction = conf.getBoolean(FileSystemConnectorOptions.AUTO_COMPACTION);
    if (autoCompaction) {
        fileNamingBuilder.withPartPrefix(convertToUncompacted(fileNamingBuilder.build().getPartPrefix()));
    }
    OutputFileConfig outputFileConfig = fileNamingBuilder.build();
    org.apache.flink.core.fs.Path path = new org.apache.flink.core.fs.Path(sd.getLocation());
    BucketsBuilder<RowData, String, ? extends BucketsBuilder<RowData, ?, ?>> builder;
    if (flinkConf.get(HiveOptions.TABLE_EXEC_HIVE_FALLBACK_MAPRED_WRITER)) {
        builder = bucketsBuilderForMRWriter(recordWriterFactory, sd, assigner, rollingPolicy, outputFileConfig);
        LOG.info("Hive streaming sink: Use MapReduce RecordWriter writer.");
    } else {
        Optional<BulkWriter.Factory<RowData>> bulkFactory = createBulkWriterFactory(getPartitionKeyArray(), sd);
        if (bulkFactory.isPresent()) {
            builder = StreamingFileSink.forBulkFormat(
                            path, new FileSystemTableSink.ProjectionBulkFactory(bulkFactory.get(), partComputer))
                    .withBucketAssigner(assigner)
                    .withRollingPolicy(rollingPolicy)
                    .withOutputFileConfig(outputFileConfig);
            LOG.info("Hive streaming sink: Use native parquet&orc writer.");
        } else {
            builder = bucketsBuilderForMRWriter(recordWriterFactory, sd, assigner, rollingPolicy, outputFileConfig);
            LOG.info("Hive streaming sink: Use MapReduce RecordWriter writer because BulkWriter Factory not available.");
        }
    }
    long bucketCheckInterval = conf.get(SINK_ROLLING_POLICY_CHECK_INTERVAL).toMillis();
    DataStream<PartitionCommitInfo> writerStream;
    if (autoCompaction) {
        long compactionSize = conf.getOptional(FileSystemConnectorOptions.COMPACTION_FILE_SIZE)
                .orElse(conf.get(SINK_ROLLING_POLICY_FILE_SIZE)).getBytes();
        writerStream = StreamingSink.compactionWriter(providerContext, dataStream, bucketCheckInterval,
                builder, fsFactory(), path, createCompactReaderFactory(sd, tableProps), compactionSize, parallelism);
    } else {
        writerStream = StreamingSink.writer(providerContext, dataStream, bucketCheckInterval, builder, parallelism, getPartitionKeys(), conf);
    }
    return StreamingSink.sink(providerContext, writerStream, path, identifier, getPartitionKeys(), msFactory(), fsFactory(), conf);
}
Also used : TableBucketAssigner(org.apache.flink.connector.file.table.FileSystemTableSink.TableBucketAssigner) Configuration(org.apache.hadoop.conf.Configuration) ThreadLocalClassLoaderConfiguration(org.apache.flink.orc.writer.ThreadLocalClassLoaderConfiguration) HiveMetastoreClientFactory(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientFactory) HiveBulkWriterFactory(org.apache.flink.connectors.hive.write.HiveBulkWriterFactory) LoggerFactory(org.slf4j.LoggerFactory) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory) HiveCompactReaderFactory(org.apache.flink.connectors.hive.read.HiveCompactReaderFactory) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) RowData(org.apache.flink.table.data.RowData) Path(org.apache.hadoop.fs.Path) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) FileSystemTableSink(org.apache.flink.connector.file.table.FileSystemTableSink)
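
The guard at the top of createStreamSink rejects a streaming write to a partitioned Hive table that has no partition-commit policy configured. Below is a hypothetical sketch (not from the Flink sources) of the table-side setup such a sink expects; the catalog name, database, Hive conf dir, and property values are placeholder assumptions, while the property keys are the documented partition-commit and compaction options.

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;

public class HiveStreamingSinkSketch {
    public static void main(String[] args) {
        TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());
        // Placeholder catalog name, default database, and Hive conf directory.
        tEnv.registerCatalog("myhive", new HiveCatalog("myhive", "default", "/opt/hive-conf"));
        tEnv.useCatalog("myhive");

        // Hive-dialect DDL; the TBLPROPERTIES end up in catalogTable.getOptions() read above.
        tEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
        tEnv.executeSql(
                "CREATE TABLE hive_sink (id BIGINT, name STRING)"
                        + " PARTITIONED BY (dt STRING) STORED AS parquet TBLPROPERTIES ("
                        + " 'sink.partition-commit.trigger' = 'partition-time',"
                        + " 'sink.partition-commit.delay' = '1 h',"
                        + " 'sink.partition-commit.policy.kind' = 'metastore,success-file',"  // satisfies the commit-policy check
                        + " 'auto-compaction' = 'true'"                                       // selects the compactionWriter branch
                        + ")");
    }
}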

Example 3 with PartitionCommitInfo

use of org.apache.flink.connector.file.table.stream.PartitionCommitInfo in project flink by apache.

the class CompactOperatorTest method testCompactOperator.

@Test
public void testCompactOperator() throws Exception {
    AtomicReference<OperatorSubtaskState> state = new AtomicReference<>();
    Path f0 = newFile(".uncompacted-f0", 3);
    Path f1 = newFile(".uncompacted-f1", 2);
    Path f2 = newFile(".uncompacted-f2", 2);
    Path f3 = newFile(".uncompacted-f3", 5);
    Path f4 = newFile(".uncompacted-f4", 1);
    Path f5 = newFile(".uncompacted-f5", 5);
    Path f6 = newFile(".uncompacted-f6", 4);
    FileSystem fs = f0.getFileSystem();
    runCompact(harness -> {
        harness.setup();
        harness.open();
        harness.processElement(new CompactionUnit(0, "p0", Arrays.asList(f0, f1, f4)), 0);
        harness.processElement(new CompactionUnit(1, "p0", Collections.singletonList(f3)), 0);
        harness.processElement(new CompactionUnit(2, "p1", Arrays.asList(f2, f5)), 0);
        harness.processElement(new CompactionUnit(3, "p0", Collections.singletonList(f6)), 0);
        harness.processElement(new EndCompaction(1), 0);
        state.set(harness.snapshot(2, 0));
        // check output commit info
        List<PartitionCommitInfo> outputs = harness.extractOutputValues();
        Assert.assertEquals(1, outputs.size());
        Assert.assertEquals(1, outputs.get(0).getCheckpointId());
        Assert.assertEquals(Arrays.asList("p0", "p1"), outputs.get(0).getPartitions());
        // check all compacted file generated
        Assert.assertTrue(fs.exists(new Path(folder, "compacted-f0")));
        Assert.assertTrue(fs.exists(new Path(folder, "compacted-f2")));
        Assert.assertTrue(fs.exists(new Path(folder, "compacted-f3")));
        Assert.assertTrue(fs.exists(new Path(folder, "compacted-f6")));
        // check one compacted file
        byte[] bytes = FileUtils.readAllBytes(new File(folder.getPath(), "compacted-f0").toPath());
        Arrays.sort(bytes);
        Assert.assertArrayEquals(new byte[] { 0, 0, 0, 1, 1, 2 }, bytes);
    });
    runCompact(harness -> {
        harness.setup();
        harness.initializeState(state.get());
        harness.open();
        harness.notifyOfCompletedCheckpoint(2);
        // check all temp files have been deleted
        Assert.assertFalse(fs.exists(f0));
        Assert.assertFalse(fs.exists(f1));
        Assert.assertFalse(fs.exists(f2));
        Assert.assertFalse(fs.exists(f3));
        Assert.assertFalse(fs.exists(f4));
        Assert.assertFalse(fs.exists(f5));
        Assert.assertFalse(fs.exists(f6));
    });
}
Also used : Path(org.apache.flink.core.fs.Path) FileSystem(org.apache.flink.core.fs.FileSystem) CompactionUnit(org.apache.flink.connector.file.table.stream.compact.CompactMessages.CompactionUnit) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) EndCompaction(org.apache.flink.connector.file.table.stream.compact.CompactMessages.EndCompaction) AtomicReference(java.util.concurrent.atomic.AtomicReference) File(java.io.File) OperatorSubtaskState(org.apache.flink.runtime.checkpoint.OperatorSubtaskState) Test(org.junit.Test)
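
The test depends on helpers from its base class that are not shown here (newFile, runCompact, create, and the folder field). Below is a plausible reconstruction of newFile, labeled as an assumption: a file of length len contains the bytes 0 … len-1, which is what makes the sorted contents of compacted-f0 (the merge of f0, f1, and f4) equal {0, 0, 0, 1, 1, 2} in the assertion above.

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.flink.core.fs.Path;

// Hypothetical sketch of the helper; 'folder' is assumed to be the test's temporary directory.
private Path newFile(String name, int len) throws IOException {
    Path path = new Path(folder, name);
    File file = new File(path.getPath());
    file.delete();
    file.createNewFile();
    // Write the bytes 0, 1, ..., len - 1 so merged files have predictable contents.
    try (FileOutputStream out = new FileOutputStream(file)) {
        for (int i = 0; i < len; i++) {
            out.write(i);
        }
    }
    return path;
}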

Example 4 with PartitionCommitInfo

use of org.apache.flink.connector.file.table.stream.PartitionCommitInfo in project flink by apache.

the class CompactOperatorTest method testUnitSelection.

@Test
public void testUnitSelection() throws Exception {
    OneInputStreamOperatorTestHarness<CoordinatorOutput, PartitionCommitInfo> harness0 = create(2, 0);
    harness0.setup();
    harness0.open();
    OneInputStreamOperatorTestHarness<CoordinatorOutput, PartitionCommitInfo> harness1 = create(2, 1);
    harness1.setup();
    harness1.open();
    Path f0 = newFile(".uncompacted-f0", 3);
    Path f1 = newFile(".uncompacted-f1", 2);
    Path f2 = newFile(".uncompacted-f2", 2);
    Path f3 = newFile(".uncompacted-f3", 5);
    Path f4 = newFile(".uncompacted-f4", 1);
    Path f5 = newFile(".uncompacted-f5", 5);
    Path f6 = newFile(".uncompacted-f6", 4);
    FileSystem fs = f0.getFileSystem();
    // broadcast
    harness0.processElement(new CompactionUnit(0, "p0", Arrays.asList(f0, f1, f4)), 0);
    harness0.processElement(new CompactionUnit(1, "p0", Collections.singletonList(f3)), 0);
    harness0.processElement(new CompactionUnit(2, "p0", Arrays.asList(f2, f5)), 0);
    harness0.processElement(new CompactionUnit(3, "p0", Collections.singletonList(f6)), 0);
    harness0.processElement(new EndCompaction(1), 0);
    // check compacted file generated
    Assert.assertTrue(fs.exists(new Path(folder, "compacted-f0")));
    Assert.assertTrue(fs.exists(new Path(folder, "compacted-f2")));
    // f3 and f6 are in the charge of another task
    Assert.assertFalse(fs.exists(new Path(folder, "compacted-f3")));
    Assert.assertFalse(fs.exists(new Path(folder, "compacted-f6")));
    harness1.processElement(new CompactionUnit(0, "p0", Arrays.asList(f0, f1, f4)), 0);
    harness1.processElement(new CompactionUnit(1, "p0", Collections.singletonList(f3)), 0);
    harness1.processElement(new CompactionUnit(2, "p0", Arrays.asList(f2, f5)), 0);
    harness1.processElement(new CompactionUnit(3, "p0", Collections.singletonList(f6)), 0);
    harness1.processElement(new EndCompaction(1), 0);
    // check compacted file generated
    Assert.assertTrue(fs.exists(new Path(folder, "compacted-f3")));
    Assert.assertTrue(fs.exists(new Path(folder, "compacted-f6")));
    harness0.close();
    harness1.close();
}
Also used : Path(org.apache.flink.core.fs.Path) CoordinatorOutput(org.apache.flink.connector.file.table.stream.compact.CompactMessages.CoordinatorOutput) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) FileSystem(org.apache.flink.core.fs.FileSystem) CompactionUnit(org.apache.flink.connector.file.table.stream.compact.CompactMessages.CompactionUnit) EndCompaction(org.apache.flink.connector.file.table.stream.compact.CompactMessages.EndCompaction) Test(org.junit.Test)
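
Why harness0 produces compacted-f0 and compacted-f2 while harness1 produces compacted-f3 and compacted-f6: every CompactionUnit is broadcast to both subtasks, but each subtask only compacts the units it owns. The sketch below states the ownership rule implied by the assertions above (unit id modulo the number of tasks); it is a paraphrase, not the operator's literal code.

// Each subtask sees every CompactionUnit but acts only on the ones it owns.
static boolean ownsUnit(int unitId, int numberOfTasks, int taskId) {
    return unitId % numberOfTasks == taskId;
}

// With create(2, 0) and create(2, 1) above:
//   harness0 (taskId 0) owns units 0 and 2 -> compacted-f0 and compacted-f2
//   harness1 (taskId 1) owns units 1 and 3 -> compacted-f3 and compacted-f6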

Aggregations

PartitionCommitInfo (org.apache.flink.connector.file.table.stream.PartitionCommitInfo) 4
CompactionUnit (org.apache.flink.connector.file.table.stream.compact.CompactMessages.CompactionUnit) 2
EndCompaction (org.apache.flink.connector.file.table.stream.compact.CompactMessages.EndCompaction) 2
FileSystem (org.apache.flink.core.fs.FileSystem) 2
Path (org.apache.flink.core.fs.Path) 2
OutputFileConfig (org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) 2
RowData (org.apache.flink.table.data.RowData) 2
Test (org.junit.Test) 2
File (java.io.File) 1
AtomicReference (java.util.concurrent.atomic.AtomicReference) 1
Encoder (org.apache.flink.api.common.serialization.Encoder) 1
FileSystemTableSink (org.apache.flink.connector.file.table.FileSystemTableSink) 1
TableBucketAssigner (org.apache.flink.connector.file.table.FileSystemTableSink.TableBucketAssigner) 1
CoordinatorOutput (org.apache.flink.connector.file.table.stream.compact.CompactMessages.CoordinatorOutput) 1
CompactReader (org.apache.flink.connector.file.table.stream.compact.CompactReader) 1
HiveCompactReaderFactory (org.apache.flink.connectors.hive.read.HiveCompactReaderFactory) 1
HiveBulkWriterFactory (org.apache.flink.connectors.hive.write.HiveBulkWriterFactory) 1
HiveOutputFormatFactory (org.apache.flink.connectors.hive.write.HiveOutputFormatFactory) 1
HiveWriterFactory (org.apache.flink.connectors.hive.write.HiveWriterFactory) 1
ThreadLocalClassLoaderConfiguration (org.apache.flink.orc.writer.ThreadLocalClassLoaderConfiguration) 1