
Example 71 with StorageDescriptor

Use of org.apache.hadoop.hive.metastore.api.StorageDescriptor in project flink by apache.

From class PartitionMonitorTest, method preparePartitionMonitor:

private void preparePartitionMonitor() {
    List<List<String>> seenPartitionsSinceOffset = new ArrayList<>();
    JobConf jobConf = new JobConf();
    Configuration configuration = new Configuration();
    ObjectPath tablePath = new ObjectPath("testDb", "testTable");
    configuration.setString("streaming-source.consume-order", "create-time");
    HiveContinuousPartitionContext<Partition, Long> fetcherContext = new HiveContinuousPartitionContext<Partition, Long>() {

        @Override
        public HiveTablePartition toHiveTablePartition(Partition partition) {
            StorageDescriptor sd = partition.getSd();
            Map<String, String> partitionColValues = new HashMap<>();
            for (String partCol : partition.getValues()) {
                String[] arr = partCol.split("=");
                Asserts.check(arr.length == 2, "partition string should be key=value format");
                partitionColValues.put(arr[0], arr[1]);
            }
            return new HiveTablePartition(sd, partitionColValues, new Properties());
        }

        @Override
        public ObjectPath getTablePath() {
            return null;
        }

        @Override
        public TypeSerializer<Long> getTypeSerializer() {
            return null;
        }

        @Override
        public Long getConsumeStartOffset() {
            return null;
        }

        @Override
        public void open() throws Exception {
        }

        @Override
        public Optional<Partition> getPartition(List<String> partValues) throws Exception {
            return Optional.empty();
        }

        @Override
        public List<ComparablePartitionValue> getComparablePartitionValueList() throws Exception {
            return null;
        }

        @Override
        public void close() throws Exception {
        }
    };
    ContinuousPartitionFetcher<Partition, Long> continuousPartitionFetcher = new ContinuousPartitionFetcher<Partition, Long>() {

        private static final long serialVersionUID = 1L;

        @Override
        public List<Tuple2<Partition, Long>> fetchPartitions(Context<Partition, Long> context, Long previousOffset) throws Exception {
            return testPartitionWithOffset.stream()
                    .filter(p -> (long) p.getCreateTime() >= previousOffset)
                    .map(p -> Tuple2.of(p, (long) p.getCreateTime()))
                    .collect(Collectors.toList());
        }

        @Override
        public List<Partition> fetch(PartitionFetcher.Context<Partition> context) throws Exception {
            return null;
        }
    };
    partitionMonitor = new ContinuousHiveSplitEnumerator.PartitionMonitor<>(0L, seenPartitionsSinceOffset, tablePath, configuration, jobConf, continuousPartitionFetcher, fetcherContext);
}
Also used : Arrays(java.util.Arrays) TypeSerializer(org.apache.flink.api.common.typeutils.TypeSerializer) Properties(java.util.Properties) PartitionFetcher(org.apache.flink.connector.file.table.PartitionFetcher) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Collection(java.util.Collection) Configuration(org.apache.flink.configuration.Configuration) Assert.assertTrue(org.junit.Assert.assertTrue) Asserts(org.apache.http.util.Asserts) Test(org.junit.Test) HashMap(java.util.HashMap) ObjectPath(org.apache.flink.table.catalog.ObjectPath) Partition(org.apache.hadoop.hive.metastore.api.Partition) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) JobConf(org.apache.hadoop.mapred.JobConf) HiveContinuousPartitionContext(org.apache.flink.connectors.hive.read.HiveContinuousPartitionContext) List(java.util.List) ContinuousPartitionFetcher(org.apache.flink.connector.file.table.ContinuousPartitionFetcher) Map(java.util.Map) Assert.assertArrayEquals(org.junit.Assert.assertArrayEquals) Optional(java.util.Optional) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) Collections(java.util.Collections)
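
The test above relies on a testPartitionWithOffset list that is populated elsewhere in PartitionMonitorTest. A minimal sketch of how such metastore Partition objects could be built for this kind of test follows; the helper name, paths, and formats are illustrative, and the single partition value is kept in the key=value form that toHiveTablePartition parses.

import java.util.Collections;

import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;

// Hypothetical helper: builds a metastore Partition whose single value is a
// "key=value" string, matching what toHiveTablePartition() above splits on '='.
public class TestPartitionBuilder {

    static Partition createTestPartition(String partSpec, String location, int createTime) {
        StorageDescriptor sd = new StorageDescriptor();
        sd.setLocation(location);
        sd.setInputFormat("org.apache.hadoop.mapred.TextInputFormat");
        sd.setOutputFormat("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat");
        sd.setSerdeInfo(new SerDeInfo());

        Partition partition = new Partition();
        partition.setDbName("testDb");
        partition.setTableName("testTable");
        // keep the "key=value" shape expected by the fetcher context above
        partition.setValues(Collections.singletonList(partSpec));
        partition.setSd(sd);
        partition.setCreateTime(createTime);
        return partition;
    }

    public static void main(String[] args) {
        Partition p = createTestPartition("pt_year=2021", "/tmp/testTable/pt_year=2021", 1000);
        System.out.println(p.getValues() + " @ " + p.getCreateTime());
    }
}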

Example 72 with StorageDescriptor

Use of org.apache.hadoop.hive.metastore.api.StorageDescriptor in project flink by apache.

From class HiveSourceFileEnumerator, method getNumFiles:

public static int getNumFiles(List<HiveTablePartition> partitions, JobConf jobConf) throws IOException {
    int numFiles = 0;
    for (HiveTablePartition partition : partitions) {
        StorageDescriptor sd = partition.getStorageDescriptor();
        org.apache.hadoop.fs.Path inputPath = new org.apache.hadoop.fs.Path(sd.getLocation());
        FileSystem fs = inputPath.getFileSystem(jobConf);
        // it's possible a partition exists in metastore but the data has been removed
        if (!fs.exists(inputPath)) {
            continue;
        }
        numFiles += fs.listStatus(inputPath).length;
    }
    return numFiles;
}
Also used : Path(org.apache.flink.core.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor)
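
A possible caller, shown as a sketch: it assumes HiveTablePartition and HiveSourceFileEnumerator are importable from org.apache.flink.connectors.hive (as in the Flink Hive connector) and uses an illustrative location; a missing path simply yields a count of 0 because of the existence check above.

import java.io.IOException;
import java.util.Collections;
import java.util.Properties;

import org.apache.flink.connectors.hive.HiveSourceFileEnumerator;
import org.apache.flink.connectors.hive.HiveTablePartition;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.mapred.JobConf;

public class FileCountSketch {

    public static void main(String[] args) throws IOException {
        StorageDescriptor sd = new StorageDescriptor();
        // illustrative location; any Hadoop-visible URI works here
        sd.setLocation("file:///tmp/warehouse/testTable/pt=1");

        HiveTablePartition partition =
                new HiveTablePartition(sd, Collections.emptyMap(), new Properties());

        int numFiles = HiveSourceFileEnumerator.getNumFiles(
                Collections.singletonList(partition), new JobConf());
        System.out.println("files under partition: " + numFiles);
    }
}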

Example 73 with StorageDescriptor

Use of org.apache.hadoop.hive.metastore.api.StorageDescriptor in project flink by apache.

From class HiveTableSink, method consume:

private DataStreamSink<?> consume(ProviderContext providerContext, DataStream<RowData> dataStream, boolean isBounded, DataStructureConverter converter) {
    checkAcidTable(catalogTable.getOptions(), identifier.toObjectPath());
    try (HiveMetastoreClientWrapper client = HiveMetastoreClientFactory.create(HiveConfUtils.create(jobConf), hiveVersion)) {
        Table table = client.getTable(identifier.getDatabaseName(), identifier.getObjectName());
        StorageDescriptor sd = table.getSd();
        Class hiveOutputFormatClz = hiveShim.getHiveOutputFormatClass(Class.forName(sd.getOutputFormat()));
        boolean isCompressed = jobConf.getBoolean(HiveConf.ConfVars.COMPRESSRESULT.varname, false);
        HiveWriterFactory writerFactory = new HiveWriterFactory(jobConf, hiveOutputFormatClz, sd.getSerdeInfo(), tableSchema, getPartitionKeyArray(), HiveReflectionUtils.getTableMetadata(hiveShim, table), hiveShim, isCompressed);
        String extension = Utilities.getFileExtension(jobConf, isCompressed, (HiveOutputFormat<?, ?>) hiveOutputFormatClz.newInstance());
        OutputFileConfig.OutputFileConfigBuilder fileNamingBuilder = OutputFileConfig.builder().withPartPrefix("part-" + UUID.randomUUID().toString()).withPartSuffix(extension == null ? "" : extension);
        final int parallelism = Optional.ofNullable(configuredParallelism).orElse(dataStream.getParallelism());
        if (isBounded) {
            OutputFileConfig fileNaming = fileNamingBuilder.build();
            return createBatchSink(dataStream, converter, sd, writerFactory, fileNaming, parallelism);
        } else {
            if (overwrite) {
                throw new IllegalStateException("Streaming mode not support overwrite.");
            }
            Properties tableProps = HiveReflectionUtils.getTableMetadata(hiveShim, table);
            return createStreamSink(providerContext, dataStream, sd, tableProps, writerFactory, fileNamingBuilder, parallelism);
        }
    } catch (TException e) {
        throw new CatalogException("Failed to query Hive metaStore", e);
    } catch (IOException e) {
        throw new FlinkRuntimeException("Failed to create staging dir", e);
    } catch (ClassNotFoundException e) {
        throw new FlinkHiveException("Failed to get output format class", e);
    } catch (IllegalAccessException | InstantiationException e) {
        throw new FlinkHiveException("Failed to instantiate output format instance", e);
    }
}
Also used : TException(org.apache.thrift.TException) CatalogTable(org.apache.flink.table.catalog.CatalogTable) Table(org.apache.hadoop.hive.metastore.api.Table) HiveTableUtil.checkAcidTable(org.apache.flink.table.catalog.hive.util.HiveTableUtil.checkAcidTable) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) CatalogException(org.apache.flink.table.catalog.exceptions.CatalogException) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) Properties(java.util.Properties) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) HiveMetastoreClientWrapper(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientWrapper) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory)
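
For orientation, a small sketch (a hypothetical helper, not part of HiveTableSink) printing the StorageDescriptor fields that consume() depends on: the output format class name handed to the Hive shim and the SerDeInfo passed into HiveWriterFactory.

import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;

public class SinkStorageInfo {

    // Dumps the storage metadata that the sink reads from the metastore table.
    static void describe(Table table) {
        StorageDescriptor sd = table.getSd();
        SerDeInfo serdeInfo = sd.getSerdeInfo();
        System.out.println("location      : " + sd.getLocation());
        System.out.println("output format : " + sd.getOutputFormat());
        System.out.println("serde lib     : " + serdeInfo.getSerializationLib());
        System.out.println("serde params  : " + serdeInfo.getParameters());
    }
}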

Example 74 with StorageDescriptor

Use of org.apache.hadoop.hive.metastore.api.StorageDescriptor in project flink by apache.

From class HiveTableSink, method createBatchSink:

private DataStreamSink<Row> createBatchSink(DataStream<RowData> dataStream, DataStructureConverter converter, StorageDescriptor sd, HiveWriterFactory recordWriterFactory, OutputFileConfig fileNaming, final int parallelism) throws IOException {
    FileSystemOutputFormat.Builder<Row> builder = new FileSystemOutputFormat.Builder<>();
    builder.setPartitionComputer(new HiveRowPartitionComputer(hiveShim, JobConfUtils.getDefaultPartitionName(jobConf), tableSchema.getFieldNames(), tableSchema.getFieldDataTypes(), getPartitionKeyArray()));
    builder.setDynamicGrouped(dynamicGrouping);
    builder.setPartitionColumns(getPartitionKeyArray());
    builder.setFileSystemFactory(fsFactory());
    builder.setFormatFactory(new HiveOutputFormatFactory(recordWriterFactory));
    builder.setMetaStoreFactory(msFactory());
    builder.setOverwrite(overwrite);
    builder.setStaticPartitions(staticPartitionSpec);
    builder.setTempPath(new org.apache.flink.core.fs.Path(toStagingDir(sd.getLocation(), jobConf)));
    builder.setOutputFileConfig(fileNaming);
    return dataStream
            .map((MapFunction<RowData, Row>) value -> (Row) converter.toExternal(value))
            .writeUsingOutputFormat(builder.build())
            .setParallelism(parallelism);
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) HiveMetastoreClientFactory(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientFactory) HiveBulkWriterFactory(org.apache.flink.connectors.hive.write.HiveBulkWriterFactory) CatalogTable(org.apache.flink.table.catalog.CatalogTable) LoggerFactory(org.slf4j.LoggerFactory) JobConfUtils(org.apache.flink.connectors.hive.util.JobConfUtils) MapFunction(org.apache.flink.api.common.functions.MapFunction) OrcSplitReaderUtil(org.apache.flink.orc.OrcSplitReaderUtil) PartitionCommitInfo(org.apache.flink.connector.file.table.stream.PartitionCommitInfo) SupportsPartitioning(org.apache.flink.table.connector.sink.abilities.SupportsPartitioning) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) SINK_ROLLING_POLICY_CHECK_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_CHECK_INTERVAL) StreamingFileSink(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink) Path(org.apache.hadoop.fs.Path) HiveWriterFactory(org.apache.flink.connectors.hive.write.HiveWriterFactory) PartFileInfo(org.apache.flink.streaming.api.functions.sink.filesystem.PartFileInfo) CheckpointRollingPolicy(org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.CheckpointRollingPolicy) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) HiveShimLoader(org.apache.flink.table.catalog.hive.client.HiveShimLoader) HiveCatalogFactoryOptions(org.apache.flink.table.catalog.hive.factories.HiveCatalogFactoryOptions) DynamicTableSink(org.apache.flink.table.connector.sink.DynamicTableSink) SINK_ROLLING_POLICY_ROLLOVER_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_ROLLOVER_INTERVAL) TableSchema(org.apache.flink.table.api.TableSchema) CompactOperator.convertToUncompacted(org.apache.flink.connector.file.table.stream.compact.CompactOperator.convertToUncompacted) UUID(java.util.UUID) HiveOutputFormat(org.apache.hadoop.hive.ql.io.HiveOutputFormat) Preconditions(org.apache.flink.util.Preconditions) StringUtils(org.apache.flink.util.StringUtils) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) HiveReflectionUtils(org.apache.flink.table.catalog.hive.util.HiveReflectionUtils) LogicalType(org.apache.flink.table.types.logical.LogicalType) DataStreamSinkProvider(org.apache.flink.table.connector.sink.DataStreamSinkProvider) Optional(java.util.Optional) Row(org.apache.flink.types.Row) ObjectIdentifier(org.apache.flink.table.catalog.ObjectIdentifier) ChangelogMode(org.apache.flink.table.connector.ChangelogMode) RowType(org.apache.flink.table.types.logical.RowType) HiveShim(org.apache.flink.table.catalog.hive.client.HiveShim) ParquetRowDataBuilder(org.apache.flink.formats.parquet.row.ParquetRowDataBuilder) BucketsBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink.BucketsBuilder) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) LinkedHashMap(java.util.LinkedHashMap) ReadableConfig(org.apache.flink.configuration.ReadableConfig) ThreadLocalClassLoaderConfiguration(org.apache.flink.orc.writer.ThreadLocalClassLoaderConfiguration) FileSystemConnectorOptions(org.apache.flink.connector.file.table.FileSystemConnectorOptions) SINK_ROLLING_POLICY_INACTIVITY_INTERVAL(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_INACTIVITY_INTERVAL) SupportsOverwrite(org.apache.flink.table.connector.sink.abilities.SupportsOverwrite) 
HiveMetastoreClientWrapper(org.apache.flink.table.catalog.hive.client.HiveMetastoreClientWrapper) Nullable(javax.annotation.Nullable) StreamingSink(org.apache.flink.connector.file.table.stream.StreamingSink) DataStreamSink(org.apache.flink.streaming.api.datastream.DataStreamSink) HiveConfUtils(org.apache.flink.connectors.hive.util.HiveConfUtils) HiveCompactReaderFactory(org.apache.flink.connectors.hive.read.HiveCompactReaderFactory) RowData(org.apache.flink.table.data.RowData) Logger(org.slf4j.Logger) Properties(java.util.Properties) ProviderContext(org.apache.flink.table.connector.ProviderContext) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) BulkWriter(org.apache.flink.api.common.serialization.BulkWriter) HiveConf(org.apache.hadoop.hive.conf.HiveConf) HiveOutputFormatFactory(org.apache.flink.connectors.hive.write.HiveOutputFormatFactory) TypeDescription(org.apache.orc.TypeDescription) TException(org.apache.thrift.TException) IOException(java.io.IOException) HadoopPathBasedBulkFormatBuilder(org.apache.flink.streaming.api.functions.sink.filesystem.HadoopPathBasedBulkFormatBuilder) Table(org.apache.hadoop.hive.metastore.api.Table) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) DataStream(org.apache.flink.streaming.api.datastream.DataStream) JobConf(org.apache.hadoop.mapred.JobConf) TableBucketAssigner(org.apache.flink.connector.file.table.FileSystemTableSink.TableBucketAssigner) CompactReader(org.apache.flink.connector.file.table.stream.compact.CompactReader) OutputFileConfig(org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig) FileSystemTableSink(org.apache.flink.connector.file.table.FileSystemTableSink) TableSchemaUtils(org.apache.flink.table.utils.TableSchemaUtils) FileSystemOutputFormat(org.apache.flink.connector.file.table.FileSystemOutputFormat) CatalogException(org.apache.flink.table.catalog.exceptions.CatalogException) SINK_ROLLING_POLICY_FILE_SIZE(org.apache.flink.connector.file.table.FileSystemConnectorOptions.SINK_ROLLING_POLICY_FILE_SIZE) HiveTableUtil.checkAcidTable(org.apache.flink.table.catalog.hive.util.HiveTableUtil.checkAcidTable)
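
The staging path handed to setTempPath comes from toStagingDir(sd.getLocation(), jobConf), a private helper of HiveTableSink that is not shown here. A rough sketch of what such a helper could look like, assuming a timestamp-suffixed directory under the table location; the real implementation may differ in detail.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class StagingDirSketch {

    // Hypothetical stand-in for a staging-dir helper: creates a scratch
    // directory under the table location for the batch sink to write into.
    static String toStagingDir(String tableLocation, JobConf conf) throws IOException {
        Path stagingDir = new Path(tableLocation, ".staging_" + System.currentTimeMillis());
        FileSystem fs = stagingDir.getFileSystem(conf);
        fs.mkdirs(stagingDir);
        // best-effort cleanup when the JVM exits
        fs.deleteOnExit(stagingDir);
        return stagingDir.toString();
    }
}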

Example 75 with StorageDescriptor

Use of org.apache.hadoop.hive.metastore.api.StorageDescriptor in project flink by apache.

From class HiveCatalog, method instantiateHivePartition:

private Partition instantiateHivePartition(Table hiveTable, CatalogPartitionSpec partitionSpec, CatalogPartition catalogPartition) throws PartitionSpecInvalidException {
    List<String> partCols = getFieldNames(hiveTable.getPartitionKeys());
    List<String> partValues = getOrderedFullPartitionValues(partitionSpec, partCols, new ObjectPath(hiveTable.getDbName(), hiveTable.getTableName()));
    // validate partition values
    for (int i = 0; i < partCols.size(); i++) {
        if (isNullOrWhitespaceOnly(partValues.get(i))) {
            throw new PartitionSpecInvalidException(getName(), partCols, new ObjectPath(hiveTable.getDbName(), hiveTable.getTableName()), partitionSpec);
        }
    }
    // TODO: handle GenericCatalogPartition
    StorageDescriptor sd = hiveTable.getSd().deepCopy();
    sd.setLocation(catalogPartition.getProperties().remove(SqlCreateHiveTable.TABLE_LOCATION_URI));
    Map<String, String> properties = new HashMap<>(catalogPartition.getProperties());
    String comment = catalogPartition.getComment();
    if (comment != null) {
        properties.put(HiveCatalogConfig.COMMENT, comment);
    }
    return HiveTableUtil.createHivePartition(hiveTable.getDbName(), hiveTable.getTableName(), partValues, sd, properties);
}
Also used : ObjectPath(org.apache.flink.table.catalog.ObjectPath) HashMap(java.util.HashMap) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) UniqueConstraint(org.apache.flink.table.api.constraints.UniqueConstraint) PartitionSpecInvalidException(org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException)
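
The key StorageDescriptor pattern here is deepCopy() followed by setLocation(): the partition reuses the table-level storage settings but points at its own directory, without mutating the table's descriptor. A standalone sketch of that pattern, with illustrative paths:

import org.apache.hadoop.hive.metastore.api.StorageDescriptor;

public class PartitionSdSketch {

    // Clone the table-level descriptor and override only the partition location.
    static StorageDescriptor partitionSd(StorageDescriptor tableSd, String partitionLocation) {
        StorageDescriptor sd = tableSd.deepCopy();  // Thrift-generated deep copy
        sd.setLocation(partitionLocation);
        return sd;
    }

    public static void main(String[] args) {
        StorageDescriptor tableSd = new StorageDescriptor();
        tableSd.setLocation("/warehouse/db/tbl");

        StorageDescriptor partSd = partitionSd(tableSd, "/warehouse/db/tbl/pt=2021");
        System.out.println(tableSd.getLocation()); // unchanged: /warehouse/db/tbl
        System.out.println(partSd.getLocation());  // /warehouse/db/tbl/pt=2021
    }
}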

Aggregations

StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor) 284
SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo) 163
Table (org.apache.hadoop.hive.metastore.api.Table) 159
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema) 155
ArrayList (java.util.ArrayList) 134
Test (org.junit.Test) 131
Partition (org.apache.hadoop.hive.metastore.api.Partition) 97
HashMap (java.util.HashMap) 61
MetaException (org.apache.hadoop.hive.metastore.api.MetaException) 38
List (java.util.List) 35
Order (org.apache.hadoop.hive.metastore.api.Order) 33
Path (org.apache.hadoop.fs.Path) 30
ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics) 30
ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) 30
ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) 29
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) 29
AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats) 27
Database (org.apache.hadoop.hive.metastore.api.Database) 25
SkewedInfo (org.apache.hadoop.hive.metastore.api.SkewedInfo) 23
IOException (java.io.IOException) 15
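
The co-occurrence counts above reflect how a StorageDescriptor is normally assembled: columns as FieldSchema, a SerDeInfo for serialization, and a Table (or Partition) that owns the descriptor. A generic, illustrative construction (not taken from any of the projects above) might look like this:

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;

import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;

public class StorageDescriptorSketch {

    public static void main(String[] args) {
        // serialization settings for the table's data files
        SerDeInfo serde = new SerDeInfo();
        serde.setSerializationLib("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe");
        serde.setParameters(new HashMap<>());

        // physical layout: columns, location, input/output formats, serde
        StorageDescriptor sd = new StorageDescriptor();
        sd.setCols(Arrays.asList(
                new FieldSchema("id", "bigint", "row id"),
                new FieldSchema("name", "string", null)));
        sd.setLocation("/warehouse/demo_db/demo_table");
        sd.setInputFormat("org.apache.hadoop.mapred.TextInputFormat");
        sd.setOutputFormat("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat");
        sd.setSerdeInfo(serde);

        // the Table owns the descriptor and adds partition keys and parameters
        Table table = new Table();
        table.setDbName("demo_db");
        table.setTableName("demo_table");
        table.setSd(sd);
        table.setPartitionKeys(Collections.singletonList(new FieldSchema("pt", "string", null)));
        table.setParameters(new HashMap<>());

        System.out.println(table);
    }
}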