Use of org.apache.flink.connectors.hive.write.HiveWriterFactory in project flink by apache.
Class HiveTableSink, method createStreamSink:
private DataStreamSink<?> createStreamSink(
        ProviderContext providerContext,
        DataStream<RowData> dataStream,
        StorageDescriptor sd,
        Properties tableProps,
        HiveWriterFactory recordWriterFactory,
        OutputFileConfig.OutputFileConfigBuilder fileNamingBuilder,
        final int parallelism) {
    org.apache.flink.configuration.Configuration conf = new org.apache.flink.configuration.Configuration();
    catalogTable.getOptions().forEach(conf::setString);
    String commitPolicies = conf.getString(FileSystemConnectorOptions.SINK_PARTITION_COMMIT_POLICY_KIND);
    if (!getPartitionKeys().isEmpty() && StringUtils.isNullOrWhitespaceOnly(commitPolicies)) {
        throw new FlinkHiveException(String.format(
                "Streaming write to partitioned hive table %s without providing a commit policy. "
                        + "Make sure to set a proper value for %s",
                identifier, FileSystemConnectorOptions.SINK_PARTITION_COMMIT_POLICY_KIND.key()));
    }
    HiveRowDataPartitionComputer partComputer = new HiveRowDataPartitionComputer(
            hiveShim,
            JobConfUtils.getDefaultPartitionName(jobConf),
            tableSchema.getFieldNames(),
            tableSchema.getFieldDataTypes(),
            getPartitionKeyArray());
    TableBucketAssigner assigner = new TableBucketAssigner(partComputer);
    HiveRollingPolicy rollingPolicy = new HiveRollingPolicy(
            conf.get(SINK_ROLLING_POLICY_FILE_SIZE).getBytes(),
            conf.get(SINK_ROLLING_POLICY_ROLLOVER_INTERVAL).toMillis(),
            conf.get(SINK_ROLLING_POLICY_INACTIVITY_INTERVAL).toMillis());
    boolean autoCompaction = conf.getBoolean(FileSystemConnectorOptions.AUTO_COMPACTION);
    if (autoCompaction) {
        fileNamingBuilder.withPartPrefix(convertToUncompacted(fileNamingBuilder.build().getPartPrefix()));
    }
    OutputFileConfig outputFileConfig = fileNamingBuilder.build();
    org.apache.flink.core.fs.Path path = new org.apache.flink.core.fs.Path(sd.getLocation());
    BucketsBuilder<RowData, String, ? extends BucketsBuilder<RowData, ?, ?>> builder;
    if (flinkConf.get(HiveOptions.TABLE_EXEC_HIVE_FALLBACK_MAPRED_WRITER)) {
        builder = bucketsBuilderForMRWriter(recordWriterFactory, sd, assigner, rollingPolicy, outputFileConfig);
        LOG.info("Hive streaming sink: Use MapReduce RecordWriter writer.");
    } else {
        Optional<BulkWriter.Factory<RowData>> bulkFactory = createBulkWriterFactory(getPartitionKeyArray(), sd);
        if (bulkFactory.isPresent()) {
            builder = StreamingFileSink.forBulkFormat(
                            path, new FileSystemTableSink.ProjectionBulkFactory(bulkFactory.get(), partComputer))
                    .withBucketAssigner(assigner)
                    .withRollingPolicy(rollingPolicy)
                    .withOutputFileConfig(outputFileConfig);
            LOG.info("Hive streaming sink: Use native parquet&orc writer.");
        } else {
            builder = bucketsBuilderForMRWriter(recordWriterFactory, sd, assigner, rollingPolicy, outputFileConfig);
            LOG.info("Hive streaming sink: Use MapReduce RecordWriter writer because BulkWriter Factory not available.");
        }
    }
    long bucketCheckInterval = conf.get(SINK_ROLLING_POLICY_CHECK_INTERVAL).toMillis();
    DataStream<PartitionCommitInfo> writerStream;
    if (autoCompaction) {
        long compactionSize = conf.getOptional(FileSystemConnectorOptions.COMPACTION_FILE_SIZE)
                .orElse(conf.get(SINK_ROLLING_POLICY_FILE_SIZE))
                .getBytes();
        writerStream = StreamingSink.compactionWriter(
                providerContext,
                dataStream,
                bucketCheckInterval,
                builder,
                fsFactory(),
                path,
                createCompactReaderFactory(sd, tableProps),
                compactionSize,
                parallelism);
    } else {
        writerStream = StreamingSink.writer(
                providerContext, dataStream, bucketCheckInterval, builder, parallelism, getPartitionKeys(), conf);
    }
    return StreamingSink.sink(
            providerContext, writerStream, path, identifier, getPartitionKeys(), msFactory(), fsFactory(), conf);
}
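The streaming path above is driven entirely by the table options that are copied from catalogTable.getOptions() into a Flink Configuration. A minimal sketch of the options it consults follows, using the FileSystemConnectorOptions keys referenced in the method; the values are illustrative assumptions, not defaults taken from the source.

// Illustrative sketch: table options read by createStreamSink (keys per
// FileSystemConnectorOptions; the values below are example choices, not defaults).
private static org.apache.flink.configuration.Configuration exampleStreamingSinkOptions() {
    org.apache.flink.configuration.Configuration conf = new org.apache.flink.configuration.Configuration();
    // Required when writing a partitioned table in streaming mode; otherwise the
    // method above throws FlinkHiveException.
    conf.setString("sink.partition-commit.policy.kind", "metastore,success-file");
    // Feeds the HiveRollingPolicy built above.
    conf.setString("sink.rolling-policy.file-size", "128MB");
    conf.setString("sink.rolling-policy.rollover-interval", "30 min");
    conf.setString("sink.rolling-policy.inactivity-interval", "30 min");
    conf.setString("sink.rolling-policy.check-interval", "1 min");
    // Optional compaction of small files before the partition is committed.
    conf.setString("auto-compaction", "true");
    conf.setString("compaction.file-size", "128MB");
    return conf;
}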
Use of org.apache.flink.connectors.hive.write.HiveWriterFactory in project flink by apache.
Class HiveTableSink, method consume:
private DataStreamSink<?> consume(
        ProviderContext providerContext,
        DataStream<RowData> dataStream,
        boolean isBounded,
        DataStructureConverter converter) {
    checkAcidTable(catalogTable.getOptions(), identifier.toObjectPath());
    try (HiveMetastoreClientWrapper client =
            HiveMetastoreClientFactory.create(HiveConfUtils.create(jobConf), hiveVersion)) {
        Table table = client.getTable(identifier.getDatabaseName(), identifier.getObjectName());
        StorageDescriptor sd = table.getSd();
        Class hiveOutputFormatClz = hiveShim.getHiveOutputFormatClass(Class.forName(sd.getOutputFormat()));
        boolean isCompressed = jobConf.getBoolean(HiveConf.ConfVars.COMPRESSRESULT.varname, false);
        HiveWriterFactory writerFactory = new HiveWriterFactory(
                jobConf,
                hiveOutputFormatClz,
                sd.getSerdeInfo(),
                tableSchema,
                getPartitionKeyArray(),
                HiveReflectionUtils.getTableMetadata(hiveShim, table),
                hiveShim,
                isCompressed);
        String extension = Utilities.getFileExtension(
                jobConf, isCompressed, (HiveOutputFormat<?, ?>) hiveOutputFormatClz.newInstance());
        OutputFileConfig.OutputFileConfigBuilder fileNamingBuilder = OutputFileConfig.builder()
                .withPartPrefix("part-" + UUID.randomUUID().toString())
                .withPartSuffix(extension == null ? "" : extension);
        final int parallelism = Optional.ofNullable(configuredParallelism).orElse(dataStream.getParallelism());
        if (isBounded) {
            OutputFileConfig fileNaming = fileNamingBuilder.build();
            return createBatchSink(dataStream, converter, sd, writerFactory, fileNaming, parallelism);
        } else {
            if (overwrite) {
                throw new IllegalStateException("Streaming mode not support overwrite.");
            }
            Properties tableProps = HiveReflectionUtils.getTableMetadata(hiveShim, table);
            return createStreamSink(
                    providerContext, dataStream, sd, tableProps, writerFactory, fileNamingBuilder, parallelism);
        }
    } catch (TException e) {
        throw new CatalogException("Failed to query Hive metaStore", e);
    } catch (IOException e) {
        throw new FlinkRuntimeException("Failed to create staging dir", e);
    } catch (ClassNotFoundException e) {
        throw new FlinkHiveException("Failed to get output format class", e);
    } catch (IllegalAccessException | InstantiationException e) {
        throw new FlinkHiveException("Failed to instantiate output format instance", e);
    }
}
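The HiveWriterFactory built in consume bundles everything a task needs to write files in the table's own format: the JobConf, the Hive output-format class resolved from the StorageDescriptor, the SerDe info, the Flink table schema, the partition columns, the table properties, the HiveShim, and the compression flag. Below is a minimal standalone sketch of that construction, mirroring the constructor call above and the test setup further down the page; the text output format and the empty schema/properties are placeholder assumptions, and imports are elided as in the listings above.

// Sketch only: constructing a HiveWriterFactory outside HiveTableSink,
// with placeholder arguments in place of the real table metadata.
HiveWriterFactory exampleWriterFactory = new HiveWriterFactory(
        new JobConf(),                        // Hadoop configuration used for the write
        HiveIgnoreKeyTextOutputFormat.class,  // Hive OutputFormat class (normally resolved from sd.getOutputFormat())
        new SerDeInfo(),                      // serialization info from the table's StorageDescriptor
        TableSchema.builder().build(),        // Flink table schema
        new String[0],                        // partition columns
        new Properties(),                     // table properties (HiveReflectionUtils.getTableMetadata)
        HiveShimLoader.loadHiveShim(HiveShimLoader.getHiveVersion()),
        false);                               // isCompressed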
Use of org.apache.flink.connectors.hive.write.HiveWriterFactory in project flink by apache.
Class HiveTableSink, method createBatchSink:
private DataStreamSink<Row> createBatchSink(
        DataStream<RowData> dataStream,
        DataStructureConverter converter,
        StorageDescriptor sd,
        HiveWriterFactory recordWriterFactory,
        OutputFileConfig fileNaming,
        final int parallelism) throws IOException {
    FileSystemOutputFormat.Builder<Row> builder = new FileSystemOutputFormat.Builder<>();
    builder.setPartitionComputer(new HiveRowPartitionComputer(
            hiveShim,
            JobConfUtils.getDefaultPartitionName(jobConf),
            tableSchema.getFieldNames(),
            tableSchema.getFieldDataTypes(),
            getPartitionKeyArray()));
    builder.setDynamicGrouped(dynamicGrouping);
    builder.setPartitionColumns(getPartitionKeyArray());
    builder.setFileSystemFactory(fsFactory());
    builder.setFormatFactory(new HiveOutputFormatFactory(recordWriterFactory));
    builder.setMetaStoreFactory(msFactory());
    builder.setOverwrite(overwrite);
    builder.setStaticPartitions(staticPartitionSpec);
    builder.setTempPath(new org.apache.flink.core.fs.Path(toStagingDir(sd.getLocation(), jobConf)));
    builder.setOutputFileConfig(fileNaming);
    return dataStream
            .map((MapFunction<RowData, Row>) value -> (Row) converter.toExternal(value))
            .writeUsingOutputFormat(builder.build())
            .setParallelism(parallelism);
}
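In this batch path the HiveWriterFactory is not used by the sink operator directly; it is wrapped in a HiveOutputFormatFactory, which the FileSystemOutputFormat asks for an output format per target file. A minimal sketch of that per-file step, assuming a writerFactory built as in consume; the path is a hypothetical example.

// Sketch: per-file use of the format factory set on the builder above.
// writerFactory is assumed to be constructed as in consume(); the path is made up.
HiveOutputFormatFactory formatFactory = new HiveOutputFormatFactory(writerFactory);
org.apache.flink.core.fs.Path partFile =
        new org.apache.flink.core.fs.Path("/tmp/warehouse/db/table/.staging/part-0-0"); // example path
org.apache.flink.api.common.io.OutputFormat<Row> format = formatFactory.createOutputFormat(partFile);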
Use of org.apache.flink.connectors.hive.write.HiveWriterFactory in project flink by apache.
Class HiveDeserializeExceptionTest, method parameters:
@Parameterized.Parameters(name = "{1}")
public static Object[] parameters() {
    HiveWriterFactory writerFactory = new HiveWriterFactory(
            new JobConf(), HiveIgnoreKeyTextOutputFormat.class, new SerDeInfo(), TableSchema.builder().build(),
            new String[0], new Properties(), HiveShimLoader.loadHiveShim(HiveShimLoader.getHiveVersion()), false);
    HiveCompactReaderFactory compactReaderFactory = new HiveCompactReaderFactory(
            new StorageDescriptor(), new Properties(), new JobConf(),
            new CatalogTableImpl(TableSchema.builder().build(), Collections.emptyMap(), null),
            HiveShimLoader.getHiveVersion(), RowType.of(DataTypes.INT().getLogicalType()), false);
    HiveSourceBuilder builder = new HiveSourceBuilder(
            new JobConf(), new Configuration(), new ObjectPath("default", "foo"), HiveShimLoader.getHiveVersion(),
            new CatalogTableImpl(
                    TableSchema.builder().field("i", DataTypes.INT()).build(), Collections.emptyMap(), null));
    builder.setPartitions(
            Collections.singletonList(new HiveTablePartition(new StorageDescriptor(), new Properties())));
    HiveSource<RowData> hiveSource = builder.buildWithDefaultBulkFormat();
    return new Object[][] {
        new Object[] {writerFactory, writerFactory.getClass().getSimpleName()},
        new Object[] {compactReaderFactory, compactReaderFactory.getClass().getSimpleName()},
        new Object[] {hiveSource, hiveSource.getClass().getSimpleName()}
    };
}
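All three parameterized objects are Serializable and are shipped to the cluster as part of the job; the test exercises what happens when deserialization fails. As an illustration of the round trip involved (not the test body itself, which is not shown on this page), using Flink's InstantiationUtil:

// Illustration only: serialize and deserialize one of the parameterized objects the
// way Flink ships it to task managers. The actual test body is not reproduced here.
private static void roundTrip(HiveWriterFactory writerFactory) throws Exception {
    byte[] bytes = org.apache.flink.util.InstantiationUtil.serializeObject(writerFactory);
    HiveWriterFactory copy = org.apache.flink.util.InstantiationUtil.deserializeObject(
            bytes, Thread.currentThread().getContextClassLoader());
    // With a compatible classpath the copy is usable; the test's focus is the
    // error reporting when this step fails.
}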
Use of org.apache.flink.connectors.hive.write.HiveWriterFactory in project flink by apache.
Class HiveOutputFormatFactoryTest, method testCreateOutputFormat:
@Test
public void testCreateOutputFormat() {
    TableSchema schema = TableSchema.builder().field("x", DataTypes.INT()).build();
    SerDeInfo serDeInfo = new SerDeInfo("name", LazySimpleSerDe.class.getName(), Collections.emptyMap());
    HiveWriterFactory writerFactory = new HiveWriterFactory(
            new JobConf(), VerifyURIOutputFormat.class, serDeInfo, schema, new String[0], new Properties(),
            HiveShimLoader.loadHiveShim(HiveShimLoader.getHiveVersion()), false);
    HiveOutputFormatFactory factory = new HiveOutputFormatFactory(writerFactory);
    org.apache.flink.core.fs.Path path =
            new org.apache.flink.core.fs.Path(TEST_URI_SCHEME, TEST_URI_AUTHORITY, "/foo/path");
    factory.createOutputFormat(path);
}
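VerifyURIOutputFormat is a helper defined in the test class and not shown on this page; judging by its name and by the path built from TEST_URI_SCHEME and TEST_URI_AUTHORITY, the test checks that the scheme and authority of the Flink path survive through the factory to Hive's record writer. The property can be illustrated in isolation; the scheme and authority below are example values standing in for the test constants.

// Illustration: the three-argument Path constructor keeps scheme and authority
// in the URI that eventually reaches the Hive output format. Example values only.
org.apache.flink.core.fs.Path p = new org.apache.flink.core.fs.Path("hdfs", "fake-authority", "/foo/path");
assert "hdfs".equals(p.toUri().getScheme());
assert "fake-authority".equals(p.toUri().getAuthority());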