Search in sources :

Example 1 with HiveWriterFactory

use of org.apache.flink.connectors.hive.write.HiveWriterFactory in project flink by apache.

the class HiveTableSink method createStreamSink.

private DataStreamSink<?> createStreamSink(ProviderContext providerContext, DataStream<RowData> dataStream, StorageDescriptor sd, Properties tableProps, HiveWriterFactory recordWriterFactory, OutputFileConfig.OutputFileConfigBuilder fileNamingBuilder, final int parallelism) {
    org.apache.flink.configuration.Configuration conf = new org.apache.flink.configuration.Configuration();
    catalogTable.getOptions().forEach(conf::setString);
    String commitPolicies = conf.getString(FileSystemConnectorOptions.SINK_PARTITION_COMMIT_POLICY_KIND);
    if (!getPartitionKeys().isEmpty() && StringUtils.isNullOrWhitespaceOnly(commitPolicies)) {
        throw new FlinkHiveException(String.format("Streaming write to partitioned hive table %s without providing a commit policy. " + "Make sure to set a proper value for %s", identifier, FileSystemConnectorOptions.SINK_PARTITION_COMMIT_POLICY_KIND.key()));
    }
    HiveRowDataPartitionComputer partComputer = new HiveRowDataPartitionComputer(hiveShim, JobConfUtils.getDefaultPartitionName(jobConf), tableSchema.getFieldNames(), tableSchema.getFieldDataTypes(), getPartitionKeyArray());
    TableBucketAssigner assigner = new TableBucketAssigner(partComputer);
    HiveRollingPolicy rollingPolicy = new HiveRollingPolicy(conf.get(SINK_ROLLING_POLICY_FILE_SIZE).getBytes(), conf.get(SINK_ROLLING_POLICY_ROLLOVER_INTERVAL).toMillis(), conf.get(SINK_ROLLING_POLICY_INACTIVITY_INTERVAL).toMillis());
    boolean autoCompaction = conf.getBoolean(FileSystemConnectorOptions.AUTO_COMPACTION);
    if (autoCompaction) {
        fileNamingBuilder.withPartPrefix(convertToUncompacted(fileNamingBuilder.build().getPartPrefix()));
    }
    OutputFileConfig outputFileConfig = fileNamingBuilder.build();
    org.apache.flink.core.fs.Path path = new org.apache.flink.core.fs.Path(sd.getLocation());
    BucketsBuilder<RowData, String, ? extends BucketsBuilder<RowData, ?, ?>> builder;
    if (flinkConf.get(HiveOptions.TABLE_EXEC_HIVE_FALLBACK_MAPRED_WRITER)) {
        builder = bucketsBuilderForMRWriter(recordWriterFactory, sd, assigner, rollingPolicy, outputFileConfig);
        LOG.info("Hive streaming sink: Use MapReduce RecordWriter writer.");
    } else {
        Optional<BulkWriter.Factory<RowData>> bulkFactory = createBulkWriterFactory(getPartitionKeyArray(), sd);
        if (bulkFactory.isPresent()) {
            builder = StreamingFileSink.forBulkFormat(path, new FileSystemTableSink.ProjectionBulkFactory(bulkFactory.get(), partComputer)).withBucketAssigner(assigner).withRollingPolicy(rollingPolicy).withOutputFileConfig(outputFileConfig);
            LOG.info("Hive streaming sink: Use native parquet&orc writer.");
        } else {
            builder = bucketsBuilderForMRWriter(recordWriterFactory, sd, assigner, rollingPolicy, outputFileConfig);
            LOG.info("Hive streaming sink: Use MapReduce RecordWriter writer because BulkWriter Factory not available.");
        }
    }
    long bucketCheckInterval = conf.get(SINK_ROLLING_POLICY_CHECK_INTERVAL).toMillis();
    DataStream<PartitionCommitInfo> writerStream;
    if (autoCompaction) {
        long compactionSize = conf.getOptional(FileSystemConnectorOptions.COMPACTION_FILE_SIZE).orElse(conf.get(SINK_ROLLING_POLICY_FILE_SIZE)).getBytes();
        writerStream = StreamingSink.compactionWriter(providerContext, dataStream, bucketCheckInterval, builder, fsFactory(), path, createCompactReaderFactory(sd, tableProps), compactionSize, parallelism);
    } else {
        writerStream = StreamingSink.writer(providerContext, dataStream, bucketCheckInterval, builder, parallelism, getPartitionKeys(), conf);
    }
    return StreamingSink.sink(providerContext, writerStream, path, identifier, getPartitionKeys(), msFactory(), fsFactory(), conf);
}
Also used : TableBucketAssigner(org.apache.flink.connector.file.table.FileSystemTableSink.TableBucketAssigner) Configuration(org.apache.hadoop.conf.Configuration) ThreadLocalClassLoaderConfiguration(org.apache.flink.orc.writer.ThreadLocalClassLoaderConfiguration) HiveMetastoreClientFactory(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientFactory) HiveBulkWriterFactory(org.apache.flink.connectors.hive.write.HiveBulkWriterFactory) LoggerFactory(org.slf4j.LoggerFactory) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory) HiveCompactReaderFactory(org.apache.flink.connectors.hive.read.HiveCompactReaderFactory) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) RowData(org.apache.flink.table.data.RowData) Path(org.apache.hadoop.fs.Path) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) FileSystemTableSink(org.apache.flink.connector.file.table.FileSystemTableSink)

Example 2 with HiveWriterFactory

use of org.apache.flink.connectors.hive.write.HiveWriterFactory in project flink by apache.

the class HiveTableSink method consume.

private DataStreamSink<?> consume(ProviderContext providerContext, DataStream<RowData> dataStream, boolean isBounded, DataStructureConverter converter) {
    checkAcidTable(catalogTable.getOptions(), identifier.toObjectPath());
    try (HiveMetastoreClientWrapper client = HiveMetastoreClientFactory.create(HiveConfUtils.create(jobConf), hiveVersion)) {
        Table table = client.getTable(identifier.getDatabaseName(), identifier.getObjectName());
        StorageDescriptor sd = table.getSd();
        Class hiveOutputFormatClz = hiveShim.getHiveOutputFormatClass(Class.forName(sd.getOutputFormat()));
        boolean isCompressed = jobConf.getBoolean(HiveConf.ConfVars.COMPRESSRESULT.varname, false);
        HiveWriterFactory writerFactory = new HiveWriterFactory(jobConf, hiveOutputFormatClz, sd.getSerdeInfo(), tableSchema, getPartitionKeyArray(), HiveReflectionUtils.getTableMetadata(hiveShim, table), hiveShim, isCompressed);
        String extension = Utilities.getFileExtension(jobConf, isCompressed, (HiveOutputFormat<?, ?>) hiveOutputFormatClz.newInstance());
        OutputFileConfig.OutputFileConfigBuilder fileNamingBuilder = OutputFileConfig.builder().withPartPrefix("part-" + UUID.randomUUID().toString()).withPartSuffix(extension == null ? "" : extension);
        final int parallelism = Optional.ofNullable(configuredParallelism).orElse(dataStream.getParallelism());
        if (isBounded) {
            OutputFileConfig fileNaming = fileNamingBuilder.build();
            return createBatchSink(dataStream, converter, sd, writerFactory, fileNaming, parallelism);
        } else {
            if (overwrite) {
                throw new IllegalStateException("Streaming mode not support overwrite.");
            }
            Properties tableProps = HiveReflectionUtils.getTableMetadata(hiveShim, table);
            return createStreamSink(providerContext, dataStream, sd, tableProps, writerFactory, fileNamingBuilder, parallelism);
        }
    } catch (TException e) {
        throw new CatalogException("Failed to query Hive metaStore", e);
    } catch (IOException e) {
        throw new FlinkRuntimeException("Failed to create staging dir", e);
    } catch (ClassNotFoundException e) {
        throw new FlinkHiveException("Failed to get output format class", e);
    } catch (IllegalAccessException | InstantiationException e) {
        throw new FlinkHiveException("Failed to instantiate output format instance", e);
    }
}
Also used : TException(org.apache.thrift.TException) CatalogTable(org.apache.flink.table.catalog.CatalogTable) Table(org.apache.hadoop.hive.metastore.api.Table) HiveTableUtil.checkAcidTable(org.apache.flink.table.catalog.hive.util.HiveTableUtil.checkAcidTable) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) CatalogException(org.apache.flink.table.catalog.exceptions.CatalogException) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) Properties(java.util.Properties) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) HiveMetastoreClientWrapper(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientWrapper) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory)

Example 3 with HiveWriterFactory

use of org.apache.flink.connectors.hive.write.HiveWriterFactory in project flink by apache.

the class HiveTableSink method createBatchSink.

private DataStreamSink<Row> createBatchSink(DataStream<RowData> dataStream, DataStructureConverter converter, StorageDescriptor sd, HiveWriterFactory recordWriterFactory, OutputFileConfig fileNaming, final int parallelism) throws IOException {
    FileSystemOutputFormat.Builder<Row> builder = new FileSystemOutputFormat.Builder<>();
    builder.setPartitionComputer(new HiveRowPartitionComputer(hiveShim, JobConfUtils.getDefaultPartitionName(jobConf), tableSchema.getFieldNames(), tableSchema.getFieldDataTypes(), getPartitionKeyArray()));
    builder.setDynamicGrouped(dynamicGrouping);
    builder.setPartitionColumns(getPartitionKeyArray());
    builder.setFileSystemFactory(fsFactory());
    builder.setFormatFactory(new HiveOutputFormatFactory(recordWriterFactory));
    builder.setMetaStoreFactory(msFactory());
    builder.setOverwrite(overwrite);
    builder.setStaticPartitions(staticPartitionSpec);
    builder.setTempPath(new org.apache.flink.core.fs.Path(toStagingDir(sd.getLocation(), jobConf)));
    builder.setOutputFileConfig(fileNaming);
    return dataStream.map((MapFunction<RowData, Row>) value -> (Row) converter.toExternal(value)).writeUsingOutputFormat(builder.build()).setParallelism(parallelism);
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) HiveMetastoreClientFactory(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientFactory) HiveBulkWriterFactory(org.apache.flink.connectors.hive.write.HiveBulkWriterFactory) CatalogTable(org.apache.flink.table.catalog.CatalogTable) LoggerFactory(org.slf4j.LoggerFactory) JobConfUtils(org.apache.flink.connectors.hive.util.JobConfUtils) MapFunction(org.apache.flink.api.common.functions.MapFunction) OrcSplitReaderUtil(org.apache.flink.orc.OrcSplitReaderUtil) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) SupportsPartitioning(org.apache.flink.table.connector.sink.abilities.SupportsPartitioning) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) SINK_ROLLING_POLICY_CHECK_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_CHECK_INTERVAL) StreamingFileSink(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink) Path(org.apache.hadoop.fs.Path) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory) PartFileInfo(org.apache.flink.streaming.api.functions.sink.filesystem.PartFileInfo) CheckpointRollingPolicy(org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.CheckpointRollingPolicy) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) HiveShimLoader(org.apache.flink.table.catalog.hive.client.HiveShimLoader) HiveCatalogFactoryOptions(org.apache.flink.table.catalog.hive.factories.HiveCatalogFactoryOptions) DynamicTableSink(org.apache.flink.table.connector.sink.DynamicTableSink) SINK_ROLLING_POLICY_ROLLOVER_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_ROLLOVER_INTERVAL) TableSchema(org.apache.flink.table.api.TableSchema) CompactOperator.convertToUncompacted(org.apache.flink.connector.file.table.stream.compact.CompactOperator.convertToUncompacted) UUID(java.util.UUID) HiveOutputFormat(org.apache.hadoop.hive.ql.io.HiveOutputFormat) Preconditions(org.apache.flink.util.Preconditions) StringUtils(org.apache.flink.util.StringUtils) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) HiveReflectionUtils(org.apache.flink.table.catalog.hive.util.HiveReflectionUtils) LogicalType(org.apache.flink.table.types.logical.LogicalType) DataStreamSinkProvider(org.apache.flink.table.connector.sink.DataStreamSinkProvider) Optional(java.util.Optional) Row(org.apache.flink.types.Row) ObjectIdentifier(org.apache.flink.table.catalog.ObjectIdentifier) ChangelogMode(org.apache.flink.table.connector.ChangelogMode) RowType(org.apache.flink.table.types.logical.RowType) HiveShim(org.apache.flink.table.catalog.hive.client.HiveShim) ParquetRowDataBuilder(org.apache.flink.formats.parquet.row.ParquetRowDataBuilder) BucketsBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink.BucketsBuilder) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) LinkedHashMap(java.util.LinkedHashMap) ReadableConfig(org.apache.flink.configuration.ReadableConfig) ThreadLocalClassLoaderConfiguration(org.apache.flink.orc.writer.ThreadLocalClassLoaderConfiguration) FileSystemConnectorOptions(org.apache.flink.connector.file.table.FileSystemConnectorOptions) SINK_ROLLING_POLICY_INACTIVITY_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_INACTIVITY_INTERVAL) SupportsOverwrite(org.apache.flink.table.connector.sink.abilities.SupportsOverwrite) HiveMetastoreClientWrapper(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientWrapper) Nullable(javax.annotation.Nullable) StreamingSink(org.apache.flink.connector.file.table.stream.StreamingSink) DataStreamSink(org.apache.flink.streaming.api.datastream.DataStreamSink) HiveConfUtils(org.apache.flink.connectors.hive.util.HiveConfUtils) HiveCompactReaderFactory(org.apache.flink.connectors.hive.read.HiveCompactReaderFactory) RowData(org.apache.flink.table.data.RowData) Logger(org.slf4j.Logger) Properties(java.util.Properties) ProviderContext(org.apache.flink.table.connector.ProviderContext) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) BulkWriter(org.apache.flink.api.common.serialization.BulkWriter) HiveConf(org.apache.hadoop.hive.conf.HiveConf) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory) TypeDescription(org.apache.orc.TypeDescription) TException(org.apache.thrift.TException) IOException(java.io.IOException) HadoopPathBasedBulkFormatBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.HadoopPathBasedBulkFormatBuilder) Table(org.apache.hadoop.hive.metastore.api.Table) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) DataStream(org.apache.flink.streaming.api.datastream.DataStream) JobConf(org.apache.hadoop.mapred.JobConf) TableBucketAssigner(org.apache.flink.connector.file.table.FileSystemTableSink.TableBucketAssigner) CompactReader(org.apache.flink.connector.file.table.stream.compact.CompactReader) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) FileSystemTableSink(org.apache.flink.connector.file.table.FileSystemTableSink) TableSchemaUtils(org.apache.flink.table.utils.TableSchemaUtils) FileSystemOutputFormat(org.apache.flink.connector.file.table.FileSystemOutputFormat) CatalogException(org.apache.flink.table.catalog.exceptions.CatalogException) SINK_ROLLING_POLICY_FILE_SIZE(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_FILE_SIZE) HiveTableUtil.checkAcidTable(org.apache.flink.table.catalog.hive.util.HiveTableUtil.checkAcidTable) RowData(org.apache.flink.table.data.RowData) FileSystemOutputFormat(org.apache.flink.connector.file.table.FileSystemOutputFormat) ParquetRowDataBuilder(org.apache.flink.formats.parquet.row.ParquetRowDataBuilder) BucketsBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink.BucketsBuilder) HadoopPathBasedBulkFormatBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.HadoopPathBasedBulkFormatBuilder) Row(org.apache.flink.types.Row) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory)

Example 4 with HiveWriterFactory

use of org.apache.flink.connectors.hive.write.HiveWriterFactory in project flink by apache.

the class HiveDeserializeExceptionTest method parameters.

@Parameterized.Parameters(name = "{1}")
public static Object[] parameters() {
    HiveWriterFactory writerFactory = new HiveWriterFactory(new JobConf(), HiveIgnoreKeyTextOutputFormat.class, new SerDeInfo(), TableSchema.builder().build(), new String[0], new Properties(), HiveShimLoader.loadHiveShim(HiveShimLoader.getHiveVersion()), false);
    HiveCompactReaderFactory compactReaderFactory = new HiveCompactReaderFactory(new StorageDescriptor(), new Properties(), new JobConf(), new CatalogTableImpl(TableSchema.builder().build(), Collections.emptyMap(), null), HiveShimLoader.getHiveVersion(), RowType.of(DataTypes.INT().getLogicalType()), false);
    HiveSourceBuilder builder = new HiveSourceBuilder(new JobConf(), new Configuration(), new ObjectPath("default", "foo"), HiveShimLoader.getHiveVersion(), new CatalogTableImpl(TableSchema.builder().field("i", DataTypes.INT()).build(), Collections.emptyMap(), null));
    builder.setPartitions(Collections.singletonList(new HiveTablePartition(new StorageDescriptor(), new Properties())));
    HiveSource<RowData> hiveSource = builder.buildWithDefaultBulkFormat();
    return new Object[][] { new Object[] { writerFactory, writerFactory.getClass().getSimpleName() }, new Object[] { compactReaderFactory, compactReaderFactory.getClass().getSimpleName() }, new Object[] { hiveSource, hiveSource.getClass().getSimpleName() } };
}
Also used : ObjectPath(org.apache.flink.table.catalog.ObjectPath) Configuration(org.apache.flink.configuration.Configuration) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) HiveCompactReaderFactory(org.apache.flink.connectors.hive.read.HiveCompactReaderFactory) Properties(java.util.Properties) RowData(org.apache.flink.table.data.RowData) CatalogTableImpl(org.apache.flink.table.catalog.CatalogTableImpl) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory) JobConf(org.apache.hadoop.mapred.JobConf)

Example 5 with HiveWriterFactory

use of org.apache.flink.connectors.hive.write.HiveWriterFactory in project flink by apache.

the class HiveOutputFormatFactoryTest method testCreateOutputFormat.

@Test
public void testCreateOutputFormat() {
    TableSchema schema = TableSchema.builder().field("x", DataTypes.INT()).build();
    SerDeInfo serDeInfo = new SerDeInfo("name", LazySimpleSerDe.class.getName(), Collections.emptyMap());
    HiveWriterFactory writerFactory = new HiveWriterFactory(new JobConf(), VerifyURIOutputFormat.class, serDeInfo, schema, new String[0], new Properties(), HiveShimLoader.loadHiveShim(HiveShimLoader.getHiveVersion()), false);
    HiveOutputFormatFactory factory = new HiveOutputFormatFactory(writerFactory);
    org.apache.flink.core.fs.Path path = new org.apache.flink.core.fs.Path(TEST_URI_SCHEME, TEST_URI_AUTHORITY, "/foo/path");
    factory.createOutputFormat(path);
}
Also used : Path(org.apache.hadoop.fs.Path) TableSchema(org.apache.flink.table.api.TableSchema) LazySimpleSerDe(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory) Properties(java.util.Properties) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)

Aggregations

HiveWriterFactory (org.apache.flink.connectors.hive.write.HiveWriterFactory)5 Properties (java.util.Properties)4 HiveCompactReaderFactory (org.apache.flink.connectors.hive.read.HiveCompactReaderFactory)3 HiveOutputFormatFactory (org.apache.flink.connectors.hive.write.HiveOutputFormatFactory)3 OutputFileConfig (org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig)3 RowData (org.apache.flink.table.data.RowData)3 Path (org.apache.hadoop.fs.Path)3 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)3 IOException (java.io.IOException)2 UncheckedIOException (java.io.UncheckedIOException)2 FileSystemTableSink (org.apache.flink.connector.file.table.FileSystemTableSink)2 TableBucketAssigner (org.apache.flink.connector.file.table.FileSystemTableSink.TableBucketAssigner)2 PartitionCommitInfo (org.apache.flink.connector.file.table.stream.PartitionCommitInfo)2 HiveBulkWriterFactory (org.apache.flink.connectors.hive.write.HiveBulkWriterFactory)2 ThreadLocalClassLoaderConfiguration (org.apache.flink.orc.writer.ThreadLocalClassLoaderConfiguration)2 CatalogTable (org.apache.flink.table.catalog.CatalogTable)2 CatalogException (org.apache.flink.table.catalog.exceptions.CatalogException)2 HiveMetastoreClientFactory (org.apache.flink.table.catalog.hive.client.HiveMetastoreClientFactory)2 HiveMetastoreClientWrapper (org.apache.flink.table.catalog.hive.client.HiveMetastoreClientWrapper)2 HiveTableUtil.checkAcidTable (org.apache.flink.table.catalog.hive.util.HiveTableUtil.checkAcidTable)2