Use of org.apache.flink.connector.file.table.stream.PartitionCommitInfo in project flink by Apache.
From the class FileSystemTableSink, the method createStreamingSink:
private DataStreamSink<?> createStreamingSink(
        ProviderContext providerContext, DataStream<RowData> dataStream,
        Context sinkContext, final int parallelism) {
    FileSystemFactory fsFactory = FileSystem::get;
    RowDataPartitionComputer computer = partitionComputer();
    boolean autoCompaction = tableOptions.getBoolean(AUTO_COMPACTION);
    Object writer = createWriter(sinkContext);
    boolean isEncoder = writer instanceof Encoder;
    TableBucketAssigner assigner = new TableBucketAssigner(computer);
    TableRollingPolicy rollingPolicy = new TableRollingPolicy(
            !isEncoder || autoCompaction,
            tableOptions.get(SINK_ROLLING_POLICY_FILE_SIZE).getBytes(),
            tableOptions.get(SINK_ROLLING_POLICY_ROLLOVER_INTERVAL).toMillis(),
            tableOptions.get(SINK_ROLLING_POLICY_INACTIVITY_INTERVAL).toMillis());
    String randomPrefix = "part-" + UUID.randomUUID().toString();
    OutputFileConfig.OutputFileConfigBuilder fileNamingBuilder = OutputFileConfig.builder();
    fileNamingBuilder = autoCompaction
            ? fileNamingBuilder.withPartPrefix(convertToUncompacted(randomPrefix))
            : fileNamingBuilder.withPartPrefix(randomPrefix);
    OutputFileConfig fileNamingConfig = fileNamingBuilder.build();
    BucketsBuilder<RowData, String, ? extends BucketsBuilder<RowData, ?, ?>> bucketsBuilder;
    if (isEncoder) {
        // noinspection unchecked
        bucketsBuilder = StreamingFileSink
                .forRowFormat(path, new ProjectionEncoder((Encoder<RowData>) writer, computer))
                .withBucketAssigner(assigner)
                .withOutputFileConfig(fileNamingConfig)
                .withRollingPolicy(rollingPolicy);
    } else {
        // noinspection unchecked
        bucketsBuilder = StreamingFileSink
                .forBulkFormat(path, new ProjectionBulkFactory((BulkWriter.Factory<RowData>) writer, computer))
                .withBucketAssigner(assigner)
                .withOutputFileConfig(fileNamingConfig)
                .withRollingPolicy(rollingPolicy);
    }
    long bucketCheckInterval = tableOptions.get(SINK_ROLLING_POLICY_CHECK_INTERVAL).toMillis();
    DataStream<PartitionCommitInfo> writerStream;
    if (autoCompaction) {
        long compactionSize = tableOptions.getOptional(COMPACTION_FILE_SIZE)
                .orElse(tableOptions.get(SINK_ROLLING_POLICY_FILE_SIZE))
                .getBytes();
        CompactReader.Factory<RowData> reader = createCompactReaderFactory(sinkContext)
                .orElseThrow(() -> new TableException(
                        "Please implement available reader for compaction: BulkFormat, FileInputFormat."));
        writerStream = StreamingSink.compactionWriter(
                providerContext, dataStream, bucketCheckInterval, bucketsBuilder,
                fsFactory, path, reader, compactionSize, parallelism);
    } else {
        writerStream = StreamingSink.writer(
                providerContext, dataStream, bucketCheckInterval, bucketsBuilder,
                parallelism, partitionKeys, tableOptions);
    }
    return StreamingSink.sink(
            providerContext, writerStream, path, tableIdentifier, partitionKeys,
            new EmptyMetaStoreFactory(path), fsFactory, tableOptions);
}
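The writer (or compaction writer) operators built above emit one PartitionCommitInfo record per checkpoint, and the final StreamingSink.sink call attaches the partition committer that consumes them. A minimal sketch of that record, assuming a four-argument constructor (checkpoint id, subtask index, subtask count, touched partitions); only the getters used in the tests further down are confirmed by this page:

// Hedged sketch: the constructor shape is an assumption; getCheckpointId() and
// getPartitions() are the accessors exercised by the tests below.
PartitionCommitInfo info = new PartitionCommitInfo(
        1L,                                  // checkpoint id that produced the pending files
        0,                                   // index of the emitting writer subtask (assumed)
        1,                                   // total number of writer subtasks (assumed)
        java.util.Arrays.asList("p0"));      // partitions written since the last checkpoint
long checkpointId = info.getCheckpointId();  // read by the downstream partition committer
java.util.List<String> partitions = info.getPartitions();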
Use of org.apache.flink.connector.file.table.stream.PartitionCommitInfo in project flink by Apache.
From the class HiveTableSink, the method createStreamSink:
private DataStreamSink<?> createStreamSink(
        ProviderContext providerContext, DataStream<RowData> dataStream, StorageDescriptor sd,
        Properties tableProps, HiveWriterFactory recordWriterFactory,
        OutputFileConfig.OutputFileConfigBuilder fileNamingBuilder, final int parallelism) {
    org.apache.flink.configuration.Configuration conf = new org.apache.flink.configuration.Configuration();
    catalogTable.getOptions().forEach(conf::setString);
    String commitPolicies = conf.getString(FileSystemConnectorOptions.SINK_PARTITION_COMMIT_POLICY_KIND);
    if (!getPartitionKeys().isEmpty() && StringUtils.isNullOrWhitespaceOnly(commitPolicies)) {
        throw new FlinkHiveException(String.format(
                "Streaming write to partitioned hive table %s without providing a commit policy. "
                        + "Make sure to set a proper value for %s",
                identifier, FileSystemConnectorOptions.SINK_PARTITION_COMMIT_POLICY_KIND.key()));
    }
    HiveRowDataPartitionComputer partComputer = new HiveRowDataPartitionComputer(
            hiveShim,
            JobConfUtils.getDefaultPartitionName(jobConf),
            tableSchema.getFieldNames(),
            tableSchema.getFieldDataTypes(),
            getPartitionKeyArray());
    TableBucketAssigner assigner = new TableBucketAssigner(partComputer);
    HiveRollingPolicy rollingPolicy = new HiveRollingPolicy(
            conf.get(SINK_ROLLING_POLICY_FILE_SIZE).getBytes(),
            conf.get(SINK_ROLLING_POLICY_ROLLOVER_INTERVAL).toMillis(),
            conf.get(SINK_ROLLING_POLICY_INACTIVITY_INTERVAL).toMillis());
    boolean autoCompaction = conf.getBoolean(FileSystemConnectorOptions.AUTO_COMPACTION);
    if (autoCompaction) {
        fileNamingBuilder.withPartPrefix(convertToUncompacted(fileNamingBuilder.build().getPartPrefix()));
    }
    OutputFileConfig outputFileConfig = fileNamingBuilder.build();
    org.apache.flink.core.fs.Path path = new org.apache.flink.core.fs.Path(sd.getLocation());
    BucketsBuilder<RowData, String, ? extends BucketsBuilder<RowData, ?, ?>> builder;
    if (flinkConf.get(HiveOptions.TABLE_EXEC_HIVE_FALLBACK_MAPRED_WRITER)) {
        builder = bucketsBuilderForMRWriter(recordWriterFactory, sd, assigner, rollingPolicy, outputFileConfig);
        LOG.info("Hive streaming sink: Use MapReduce RecordWriter writer.");
    } else {
        Optional<BulkWriter.Factory<RowData>> bulkFactory = createBulkWriterFactory(getPartitionKeyArray(), sd);
        if (bulkFactory.isPresent()) {
            builder = StreamingFileSink
                    .forBulkFormat(path, new FileSystemTableSink.ProjectionBulkFactory(bulkFactory.get(), partComputer))
                    .withBucketAssigner(assigner)
                    .withRollingPolicy(rollingPolicy)
                    .withOutputFileConfig(outputFileConfig);
            LOG.info("Hive streaming sink: Use native parquet&orc writer.");
        } else {
            builder = bucketsBuilderForMRWriter(recordWriterFactory, sd, assigner, rollingPolicy, outputFileConfig);
            LOG.info("Hive streaming sink: Use MapReduce RecordWriter writer because BulkWriter Factory not available.");
        }
    }
    long bucketCheckInterval = conf.get(SINK_ROLLING_POLICY_CHECK_INTERVAL).toMillis();
    DataStream<PartitionCommitInfo> writerStream;
    if (autoCompaction) {
        long compactionSize = conf.getOptional(FileSystemConnectorOptions.COMPACTION_FILE_SIZE)
                .orElse(conf.get(SINK_ROLLING_POLICY_FILE_SIZE))
                .getBytes();
        writerStream = StreamingSink.compactionWriter(
                providerContext, dataStream, bucketCheckInterval, builder, fsFactory(), path,
                createCompactReaderFactory(sd, tableProps), compactionSize, parallelism);
    } else {
        writerStream = StreamingSink.writer(
                providerContext, dataStream, bucketCheckInterval, builder,
                parallelism, getPartitionKeys(), conf);
    }
    return StreamingSink.sink(
            providerContext, writerStream, path, identifier, getPartitionKeys(),
            msFactory(), fsFactory(), conf);
}
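The guard at the top of this method rejects streaming writes to a partitioned Hive table when no partition commit policy is configured. A minimal sketch of satisfying that check through the table options; the Map-based setup is illustrative, as in practice the option is set in the table DDL or catalog properties:

// Illustrative only: "metastore,success-file" commits finished partitions to the Hive
// metastore and additionally writes a _SUCCESS marker file into the partition directory.
java.util.Map<String, String> options = new java.util.HashMap<>();
options.put(
        FileSystemConnectorOptions.SINK_PARTITION_COMMIT_POLICY_KIND.key(), // "sink.partition-commit.policy.kind"
        "metastore,success-file");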
Use of org.apache.flink.connector.file.table.stream.PartitionCommitInfo in project flink by Apache.
From the class CompactOperatorTest, the method testCompactOperator:
@Test
public void testCompactOperator() throws Exception {
    AtomicReference<OperatorSubtaskState> state = new AtomicReference<>();
    Path f0 = newFile(".uncompacted-f0", 3);
    Path f1 = newFile(".uncompacted-f1", 2);
    Path f2 = newFile(".uncompacted-f2", 2);
    Path f3 = newFile(".uncompacted-f3", 5);
    Path f4 = newFile(".uncompacted-f4", 1);
    Path f5 = newFile(".uncompacted-f5", 5);
    Path f6 = newFile(".uncompacted-f6", 4);
    FileSystem fs = f0.getFileSystem();
    runCompact(
            harness -> {
                harness.setup();
                harness.open();
                harness.processElement(new CompactionUnit(0, "p0", Arrays.asList(f0, f1, f4)), 0);
                harness.processElement(new CompactionUnit(1, "p0", Collections.singletonList(f3)), 0);
                harness.processElement(new CompactionUnit(2, "p1", Arrays.asList(f2, f5)), 0);
                harness.processElement(new CompactionUnit(3, "p0", Collections.singletonList(f6)), 0);
                harness.processElement(new EndCompaction(1), 0);
                state.set(harness.snapshot(2, 0));
                // check the emitted commit info
                List<PartitionCommitInfo> outputs = harness.extractOutputValues();
                Assert.assertEquals(1, outputs.size());
                Assert.assertEquals(1, outputs.get(0).getCheckpointId());
                Assert.assertEquals(Arrays.asList("p0", "p1"), outputs.get(0).getPartitions());
                // check that all compacted files were generated
                Assert.assertTrue(fs.exists(new Path(folder, "compacted-f0")));
                Assert.assertTrue(fs.exists(new Path(folder, "compacted-f2")));
                Assert.assertTrue(fs.exists(new Path(folder, "compacted-f3")));
                Assert.assertTrue(fs.exists(new Path(folder, "compacted-f6")));
                // check the contents of one compacted file
                byte[] bytes = FileUtils.readAllBytes(new File(folder.getPath(), "compacted-f0").toPath());
                Arrays.sort(bytes);
                Assert.assertArrayEquals(new byte[] {0, 0, 0, 1, 1, 2}, bytes);
            });
    runCompact(
            harness -> {
                harness.setup();
                harness.initializeState(state.get());
                harness.open();
                harness.notifyOfCompletedCheckpoint(2);
                // check that all temp files have been deleted
                Assert.assertFalse(fs.exists(f0));
                Assert.assertFalse(fs.exists(f1));
                Assert.assertFalse(fs.exists(f2));
                Assert.assertFalse(fs.exists(f3));
                Assert.assertFalse(fs.exists(f4));
                Assert.assertFalse(fs.exists(f5));
                Assert.assertFalse(fs.exists(f6));
            });
}
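The sorted-byte assertion on compacted-f0 only holds if newFile(name, len) writes the bytes 0 through len - 1, so that merging f0 (3 bytes), f1 (2 bytes), and f4 (1 byte) yields {0, 0, 0, 1, 1, 2} after sorting. A hypothetical sketch of such a helper; the actual helper in the test base class may differ:

// Hypothetical helper assumed by the assertion above: creates <folder>/<name>
// containing the bytes 0, 1, ..., len - 1.
private Path newFile(String name, int len) throws java.io.IOException {
    Path path = new Path(folder, name);
    try (org.apache.flink.core.fs.FSDataOutputStream out =
            path.getFileSystem().create(path, FileSystem.WriteMode.OVERWRITE)) {
        for (int i = 0; i < len; i++) {
            out.write(i);
        }
    }
    return path;
}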
Use of org.apache.flink.connector.file.table.stream.PartitionCommitInfo in project flink by Apache.
From the class CompactOperatorTest, the method testUnitSelection:
@Test
public void testUnitSelection() throws Exception {
    OneInputStreamOperatorTestHarness<CoordinatorOutput, PartitionCommitInfo> harness0 = create(2, 0);
    harness0.setup();
    harness0.open();
    OneInputStreamOperatorTestHarness<CoordinatorOutput, PartitionCommitInfo> harness1 = create(2, 1);
    harness1.setup();
    harness1.open();
    Path f0 = newFile(".uncompacted-f0", 3);
    Path f1 = newFile(".uncompacted-f1", 2);
    Path f2 = newFile(".uncompacted-f2", 2);
    Path f3 = newFile(".uncompacted-f3", 5);
    Path f4 = newFile(".uncompacted-f4", 1);
    Path f5 = newFile(".uncompacted-f5", 5);
    Path f6 = newFile(".uncompacted-f6", 4);
    FileSystem fs = f0.getFileSystem();
    // the compaction units are broadcast to every subtask
    harness0.processElement(new CompactionUnit(0, "p0", Arrays.asList(f0, f1, f4)), 0);
    harness0.processElement(new CompactionUnit(1, "p0", Collections.singletonList(f3)), 0);
    harness0.processElement(new CompactionUnit(2, "p0", Arrays.asList(f2, f5)), 0);
    harness0.processElement(new CompactionUnit(3, "p0", Collections.singletonList(f6)), 0);
    harness0.processElement(new EndCompaction(1), 0);
    // check that the compacted files owned by subtask 0 were generated
    Assert.assertTrue(fs.exists(new Path(folder, "compacted-f0")));
    Assert.assertTrue(fs.exists(new Path(folder, "compacted-f2")));
    // f3 and f6 are handled by the other subtask
    Assert.assertFalse(fs.exists(new Path(folder, "compacted-f3")));
    Assert.assertFalse(fs.exists(new Path(folder, "compacted-f6")));
    harness1.processElement(new CompactionUnit(0, "p0", Arrays.asList(f0, f1, f4)), 0);
    harness1.processElement(new CompactionUnit(1, "p0", Collections.singletonList(f3)), 0);
    harness1.processElement(new CompactionUnit(2, "p0", Arrays.asList(f2, f5)), 0);
    harness1.processElement(new CompactionUnit(3, "p0", Collections.singletonList(f6)), 0);
    harness1.processElement(new EndCompaction(1), 0);
    // check that the compacted files owned by subtask 1 were generated
    Assert.assertTrue(fs.exists(new Path(folder, "compacted-f3")));
    Assert.assertTrue(fs.exists(new Path(folder, "compacted-f6")));
    harness0.close();
    harness1.close();
}
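Both harnesses receive the same four CompactionUnits, yet each generates only the compacted files for the units it owns. The assertions are consistent with units being distributed round-robin by unit id modulo the number of tasks; the rule below is an assumption about the operator's selection logic, not its literal code:

// Assumed selection rule: with 2 subtasks, subtask 0 owns units 0 and 2 (compacted-f0,
// compacted-f2) and subtask 1 owns units 1 and 3 (compacted-f3, compacted-f6).
int numberOfTasks = 2;
for (int unitId = 0; unitId < 4; unitId++) {
    int owningSubtask = unitId % numberOfTasks;
    System.out.println("unit " + unitId + " -> compacted by subtask " + owningSubtask);
}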