Search in sources:

Example 1 with HiveOutputFormatFactory

Use of org.apache.flink.connectors.hive.write.HiveOutputFormatFactory in the Apache Flink project.

From the class HiveTableSink, method createBatchSink.

/**
 * Assembles a {@link FileSystemOutputFormat} from this sink's settings and attaches it to the
 * given stream as a batch sink.
 *
 * <p>Every incoming {@code RowData} is first turned into a {@code Row} through
 * {@code converter.toExternal}; the converted stream is then written with the built output
 * format, and the requested parallelism is applied to the resulting sink.
 *
 * @param dataStream stream of internal rows to write
 * @param converter converter used to map each {@code RowData} to an external {@code Row}
 * @param sd storage descriptor whose location is used to derive the staging (temp) path
 * @param recordWriterFactory writer factory wrapped into a {@code HiveOutputFormatFactory}
 * @param fileNaming output file configuration passed to the output format builder
 * @param parallelism parallelism set on the returned sink
 * @return the sink created by {@code writeUsingOutputFormat}
 * @throws IOException declared by the signature; NOTE(review): presumably raised while
 *     resolving the staging directory - confirm against {@code toStagingDir}
 */
private DataStreamSink<Row> createBatchSink(DataStream<RowData> dataStream, DataStructureConverter converter, StorageDescriptor sd, HiveWriterFactory recordWriterFactory, OutputFileConfig fileNaming, final int parallelism) throws IOException {
    final FileSystemOutputFormat.Builder<Row> formatBuilder = new FileSystemOutputFormat.Builder<>();

    // Partitioning setup.
    final HiveRowPartitionComputer partitionComputer =
            new HiveRowPartitionComputer(
                    hiveShim,
                    JobConfUtils.getDefaultPartitionName(jobConf),
                    tableSchema.getFieldNames(),
                    tableSchema.getFieldDataTypes(),
                    getPartitionKeyArray());
    formatBuilder.setPartitionComputer(partitionComputer);
    formatBuilder.setDynamicGrouped(dynamicGrouping);
    formatBuilder.setPartitionColumns(getPartitionKeyArray());

    // Factories for the file system, the record format and the metastore.
    formatBuilder.setFileSystemFactory(fsFactory());
    final HiveOutputFormatFactory hiveFormatFactory = new HiveOutputFormatFactory(recordWriterFactory);
    formatBuilder.setFormatFactory(hiveFormatFactory);
    formatBuilder.setMetaStoreFactory(msFactory());

    // Write-mode options and output location.
    formatBuilder.setOverwrite(overwrite);
    formatBuilder.setStaticPartitions(staticPartitionSpec);
    final org.apache.flink.core.fs.Path stagingPath =
            new org.apache.flink.core.fs.Path(toStagingDir(sd.getLocation(), jobConf));
    formatBuilder.setTempPath(stagingPath);
    formatBuilder.setOutputFileConfig(fileNaming);

    // Convert internal rows to external rows, then write them with the assembled format.
    final MapFunction<RowData, Row> toExternalRow = rowData -> (Row) converter.toExternal(rowData);
    final DataStream<Row> externalRows = dataStream.map(toExternalRow);
    final DataStreamSink<Row> batchSink = externalRows.writeUsingOutputFormat(formatBuilder.build());
    return batchSink.setParallelism(parallelism);
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) HiveMetastoreClientFactory(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientFactory) HiveBulkWriterFactory(org.apache.flink.connectors.hive.write.HiveBulkWriterFactory) CatalogTable(org.apache.flink.table.catalog.CatalogTable) LoggerFactory(org.slf4j.LoggerFactory) JobConfUtils(org.apache.flink.connectors.hive.util.JobConfUtils) MapFunction(org.apache.flink.api.common.functions.MapFunction) OrcSplitReaderUtil(org.apache.flink.orc.OrcSplitReaderUtil) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) SupportsPartitioning(org.apache.flink.table.connector.sink.abilities.SupportsPartitioning) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) SINK_ROLLING_POLICY_CHECK_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_CHECK_INTERVAL) StreamingFileSink(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink) Path(org.apache.hadoop.fs.Path) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory) PartFileInfo(org.apache.flink.streaming.api.functions.sink.filesystem.PartFileInfo) CheckpointRollingPolicy(org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.CheckpointRollingPolicy) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) HiveShimLoader(org.apache.flink.table.catalog.hive.client.HiveShimLoader) HiveCatalogFactoryOptions(org.apache.flink.table.catalog.hive.factories.HiveCatalogFactoryOptions) DynamicTableSink(org.apache.flink.table.connector.sink.DynamicTableSink) SINK_ROLLING_POLICY_ROLLOVER_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_ROLLOVER_INTERVAL) TableSchema(org.apache.flink.table.api.TableSchema) CompactOperator.convertToUncompacted(org.apache.flink.connector.file.table.stream.compact.CompactOperator.convertToUncompacted) UUID(java.util.UUID) 
HiveOutputFormat(org.apache.hadoop.hive.ql.io.HiveOutputFormat) Preconditions(org.apache.flink.util.Preconditions) StringUtils(org.apache.flink.util.StringUtils) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) HiveReflectionUtils(org.apache.flink.table.catalog.hive.util.HiveReflectionUtils) LogicalType(org.apache.flink.table.types.logical.LogicalType) DataStreamSinkProvider(org.apache.flink.table.connector.sink.DataStreamSinkProvider) Optional(java.util.Optional) Row(org.apache.flink.types.Row) ObjectIdentifier(org.apache.flink.table.catalog.ObjectIdentifier) ChangelogMode(org.apache.flink.table.connector.ChangelogMode) RowType(org.apache.flink.table.types.logical.RowType) HiveShim(org.apache.flink.table.catalog.hive.client.HiveShim) ParquetRowDataBuilder(org.apache.flink.formats.parquet.row.ParquetRowDataBuilder) BucketsBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink.BucketsBuilder) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) LinkedHashMap(java.util.LinkedHashMap) ReadableConfig(org.apache.flink.configuration.ReadableConfig) ThreadLocalClassLoaderConfiguration(org.apache.flink.orc.writer.ThreadLocalClassLoaderConfiguration) FileSystemConnectorOptions(org.apache.flink.connector.file.table.FileSystemConnectorOptions) SINK_ROLLING_POLICY_INACTIVITY_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_INACTIVITY_INTERVAL) SupportsOverwrite(org.apache.flink.table.connector.sink.abilities.SupportsOverwrite) HiveMetastoreClientWrapper(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientWrapper) Nullable(javax.annotation.Nullable) StreamingSink(org.apache.flink.connector.file.table.stream.StreamingSink) DataStreamSink(org.apache.flink.streaming.api.datastream.DataStreamSink) HiveConfUtils(org.apache.flink.connectors.hive.util.HiveConfUtils) HiveCompactReaderFactory(org.apache.flink.connectors.hive.read.HiveCompactReaderFactory) 
RowData(org.apache.flink.table.data.RowData) Logger(org.slf4j.Logger) Properties(java.util.Properties) ProviderContext(org.apache.flink.table.connector.ProviderContext) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) BulkWriter(org.apache.flink.api.common.serialization.BulkWriter) HiveConf(org.apache.hadoop.hive.conf.HiveConf) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory) TypeDescription(org.apache.orc.TypeDescription) TException(org.apache.thrift.TException) IOException(java.io.IOException) HadoopPathBasedBulkFormatBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.HadoopPathBasedBulkFormatBuilder) Table(org.apache.hadoop.hive.metastore.api.Table) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) DataStream(org.apache.flink.streaming.api.datastream.DataStream) JobConf(org.apache.hadoop.mapred.JobConf) TableBucketAssigner(org.apache.flink.connector.file.table.FileSystemTableSink.TableBucketAssigner) CompactReader(org.apache.flink.connector.file.table.stream.compact.CompactReader) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) FileSystemTableSink(org.apache.flink.connector.file.table.FileSystemTableSink) TableSchemaUtils(org.apache.flink.table.utils.TableSchemaUtils) FileSystemOutputFormat(org.apache.flink.connector.file.table.FileSystemOutputFormat) CatalogException(org.apache.flink.table.catalog.exceptions.CatalogException) SINK_ROLLING_POLICY_FILE_SIZE(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_FILE_SIZE) HiveTableUtil.checkAcidTable(org.apache.flink.table.catalog.hive.util.HiveTableUtil.checkAcidTable) RowData(org.apache.flink.table.data.RowData) FileSystemOutputFormat(org.apache.flink.connector.file.table.FileSystemOutputFormat) ParquetRowDataBuilder(org.apache.flink.formats.parquet.row.ParquetRowDataBuilder) 
BucketsBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink.BucketsBuilder) HadoopPathBasedBulkFormatBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.HadoopPathBasedBulkFormatBuilder) Row(org.apache.flink.types.Row) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory)

Example 2 with HiveOutputFormatFactory

Use of org.apache.flink.connectors.hive.write.HiveOutputFormatFactory in the Apache Flink project.

From the class HiveOutputFormatFactoryTest, method testCreateOutputFormat.

/**
 * Builds a {@code HiveOutputFormatFactory} on top of a {@code HiveWriterFactory} configured with
 * {@code VerifyURIOutputFormat} and asks it to create an output format for a path made from
 * {@code TEST_URI_SCHEME}, {@code TEST_URI_AUTHORITY} and {@code "/foo/path"}.
 *
 * <p>The method itself contains no assertions. NOTE(review): presumably
 * {@code VerifyURIOutputFormat} performs the actual verification of the received path -
 * confirm against its definition.
 */
@Test
public void testCreateOutputFormat() {
    // Fixture: a single-column schema and a SerDe description with no parameters.
    final TableSchema singleColumnSchema =
            TableSchema.builder()
                    .field("x", DataTypes.INT())
                    .build();
    final SerDeInfo serDe =
            new SerDeInfo("name", LazySimpleSerDe.class.getName(), Collections.emptyMap());

    final HiveWriterFactory hiveWriterFactory =
            new HiveWriterFactory(
                    new JobConf(),
                    VerifyURIOutputFormat.class,
                    serDe,
                    singleColumnSchema,
                    new String[0],
                    new Properties(),
                    HiveShimLoader.loadHiveShim(HiveShimLoader.getHiveVersion()),
                    false);
    final HiveOutputFormatFactory outputFormatFactory = new HiveOutputFormatFactory(hiveWriterFactory);

    // Target path carrying the test scheme and authority.
    final org.apache.flink.core.fs.Path targetPath =
            new org.apache.flink.core.fs.Path(TEST_URI_SCHEME, TEST_URI_AUTHORITY, "/foo/path");
    outputFormatFactory.createOutputFormat(targetPath);
}
Also used : Path(org.apache.hadoop.fs.Path) TableSchema(org.apache.flink.table.api.TableSchema) LazySimpleSerDe(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory) Properties(java.util.Properties) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)

Aggregations

Properties (java.util.Properties)2 HiveOutputFormatFactory (org.apache.flink.connectors.hive.write.HiveOutputFormatFactory)2 HiveWriterFactory (org.apache.flink.connectors.hive.write.HiveWriterFactory)2 IOException (java.io.IOException)1 UncheckedIOException (java.io.UncheckedIOException)1 LinkedHashMap (java.util.LinkedHashMap)1 List (java.util.List)1 Map (java.util.Map)1 Optional (java.util.Optional)1 UUID (java.util.UUID)1 Nullable (javax.annotation.Nullable)1 VisibleForTesting (org.apache.flink.annotation.VisibleForTesting)1 MapFunction (org.apache.flink.api.common.functions.MapFunction)1 BulkWriter (org.apache.flink.api.common.serialization.BulkWriter)1 ReadableConfig (org.apache.flink.configuration.ReadableConfig)1 FileSystemConnectorOptions (org.apache.flink.connector.file.table.FileSystemConnectorOptions)1 SINK_ROLLING_POLICY_CHECK_INTERVAL (org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_CHECK_INTERVAL)1 SINK_ROLLING_POLICY_FILE_SIZE (org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_FILE_SIZE)1 SINK_ROLLING_POLICY_INACTIVITY_INTERVAL (org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_INACTIVITY_INTERVAL)1 SINK_ROLLING_POLICY_ROLLOVER_INTERVAL (org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_ROLLOVER_INTERVAL)1