Search in sources :

Example 1 with PartitionReader

Use of org.apache.flink.connector.file.table.PartitionReader in the Apache Flink project.

Method getLookupFunction of the class HiveLookupTableSource.

/**
 * Builds the table function used to look up rows of this Hive table.
 *
 * <p>Which partitions get loaded depends on the table layout and the read mode:
 *
 * <ul>
 *   <li>table without partition keys: the single partition obtained for an empty partition spec;
 *   <li>partitioned table read as a streaming source: only the partition that sorts first in
 *       descending comparator order, i.e. the latest one;
 *   <li>partitioned table read as a bounded source: every partition reported by the context.
 * </ul>
 *
 * @param keys lookup keys, passed through unchanged to the lookup function
 * @return a {@link FileSystemLookupFunction} wired with the partition fetcher, its context and
 *     the partition reader
 */
private TableFunction<RowData> getLookupFunction(int[] keys) {
    final String defaultPartitionName = JobConfUtils.getDefaultPartitionName(jobConf);
    PartitionFetcher.Context<HiveTablePartition> fetcherContext =
            new HiveTablePartitionFetcherContext(
                    tablePath,
                    hiveShim,
                    new JobConfWrapper(jobConf),
                    catalogTable.getPartitionKeys(),
                    getProducedTableSchema().getFieldDataTypes(),
                    getProducedTableSchema().getFieldNames(),
                    configuration,
                    defaultPartitionName);

    // Copy the path into a local so that the lambdas below reference only this value
    // (avoid lambda capture of anything else).
    final ObjectPath tableFullPath = tablePath;

    final PartitionFetcher<HiveTablePartition> partitionFetcher;
    if (catalogTable.getPartitionKeys().isEmpty()) {
        // Non-partitioned table: fetch the one partition that represents the whole table.
        partitionFetcher =
                ctx -> {
                    List<HiveTablePartition> fetched = new ArrayList<>();
                    fetched.add(
                            ctx.getPartition(new ArrayList<>())
                                    .orElseThrow(
                                            () ->
                                                    new IllegalArgumentException(
                                                            String.format(
                                                                    "Fetch partition fail for hive table %s.",
                                                                    tableFullPath))));
                    return fetched;
                };
    } else if (isStreamingSource()) {
        // Streaming read of a partitioned table: fetch only the latest partition.
        partitionFetcher =
                ctx -> {
                    List<PartitionFetcher.Context.ComparablePartitionValue> candidates =
                            ctx.getComparablePartitionValueList();
                    if (candidates.isEmpty()) {
                        // Nothing to pick the latest partition from.
                        throw new IllegalArgumentException(
                                String.format(
                                        "At least one partition is required when set '%s' to 'latest' in temporal join,"
                                                + " but actual partition number is '%s' for hive table %s",
                                        STREAMING_SOURCE_PARTITION_INCLUDE.key(),
                                        candidates.size(),
                                        tableFullPath));
                    }
                    // Descending order puts the latest partition at index 0.
                    candidates.sort(
                            (left, right) ->
                                    right.getComparator().compareTo(left.getComparator()));
                    PartitionFetcher.Context.ComparablePartitionValue latest = candidates.get(0);

                    List<HiveTablePartition> fetched = new ArrayList<>();
                    fetched.add(
                            ctx.getPartition((List<String>) latest.getPartitionValue())
                                    .orElseThrow(
                                            () ->
                                                    new IllegalArgumentException(
                                                            String.format(
                                                                    "Fetch partition fail for hive table %s.",
                                                                    tableFullPath))));
                    return fetched;
                };
    } else {
        // Bounded read of a partitioned table: fetch every partition of the table.
        partitionFetcher =
                ctx -> {
                    List<HiveTablePartition> fetched = new ArrayList<>();
                    for (PartitionFetcher.Context.ComparablePartitionValue candidate :
                            ctx.getComparablePartitionValueList()) {
                        fetched.add(
                                ctx.getPartition((List<String>) candidate.getPartitionValue())
                                        .orElseThrow(
                                                () ->
                                                        new IllegalArgumentException(
                                                                String.format(
                                                                        "Fetch partition fail for hive table %s.",
                                                                        tableFullPath))));
                    }
                    return fetched;
                };
    }

    PartitionReader<HiveTablePartition, RowData> partitionReader =
            new HiveInputFormatPartitionReader(
                    flinkConf,
                    jobConf,
                    hiveVersion,
                    tablePath,
                    getProducedTableSchema().getFieldDataTypes(),
                    getProducedTableSchema().getFieldNames(),
                    catalogTable.getPartitionKeys(),
                    projectedFields,
                    flinkConf.get(HiveOptions.TABLE_EXEC_HIVE_FALLBACK_MAPRED_READER));

    return new FileSystemLookupFunction<>(
            partitionFetcher,
            fetcherContext,
            partitionReader,
            (RowType) getProducedTableSchema().toRowDataType().getLogicalType(),
            keys,
            hiveTableReloadInterval);
}
Also used : HivePartitionUtils(org.apache.flink.connectors.hive.util.HivePartitionUtils) TableFunction(org.apache.flink.table.functions.TableFunction) PartitionReader(org.apache.flink.connector.file.table.PartitionReader) DataType(org.apache.flink.table.types.DataType) CatalogTable(org.apache.flink.table.catalog.CatalogTable) LoggerFactory(org.slf4j.LoggerFactory) STREAMING_SOURCE_PARTITION_INCLUDE(org.apache.flink.connectors.hive.HiveOptions.STREAMING_SOURCE_PARTITION_INCLUDE) HiveInputFormatPartitionReader(org.apache.flink.connectors.hive.read.HiveInputFormatPartitionReader) JobConfUtils(org.apache.flink.connectors.hive.util.JobConfUtils) RowType(org.apache.flink.table.types.logical.RowType) ObjectPath(org.apache.flink.table.catalog.ObjectPath) Partition(org.apache.hadoop.hive.metastore.api.Partition) HiveShim(org.apache.flink.table.catalog.hive.client.HiveShim) ArrayList(java.util.ArrayList) LookupTableSource(org.apache.flink.table.connector.source.LookupTableSource) ReadableConfig(org.apache.flink.configuration.ReadableConfig) Duration(java.time.Duration) HivePartitionFetcherContextBase(org.apache.flink.connectors.hive.read.HivePartitionFetcherContextBase) RowData(org.apache.flink.table.data.RowData) Logger(org.slf4j.Logger) STREAMING_SOURCE_CONSUME_START_OFFSET(org.apache.flink.connectors.hive.HiveOptions.STREAMING_SOURCE_CONSUME_START_OFFSET) PartitionFetcher(org.apache.flink.connector.file.table.PartitionFetcher) Configuration(org.apache.flink.configuration.Configuration) Preconditions(org.apache.flink.util.Preconditions) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) JobConf(org.apache.hadoop.mapred.JobConf) LOOKUP_JOIN_CACHE_TTL(org.apache.flink.connectors.hive.HiveOptions.LOOKUP_JOIN_CACHE_TTL) List(java.util.List) Optional(java.util.Optional) TableFunctionProvider(org.apache.flink.table.connector.source.TableFunctionProvider) 
STREAMING_SOURCE_MONITOR_INTERVAL(org.apache.flink.connectors.hive.HiveOptions.STREAMING_SOURCE_MONITOR_INTERVAL) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException) ObjectPath(org.apache.flink.table.catalog.ObjectPath) HiveInputFormatPartitionReader(org.apache.flink.connectors.hive.read.HiveInputFormatPartitionReader) ArrayList(java.util.ArrayList) RowData(org.apache.flink.table.data.RowData) PartitionFetcher(org.apache.flink.connector.file.table.PartitionFetcher) ArrayList(java.util.ArrayList) List(java.util.List)

Example 2 with PartitionReader

Use of org.apache.flink.connector.file.table.PartitionReader in the Apache Flink project.

Method testPartitionFetcherAndReader of the class HiveLookupJoinITCase.

/**
 * Checks that the partition fetcher of the lookup function built for {@code partition_table}
 * returns exactly one (the latest) partition, and that the partition reader yields exactly the
 * rows stored in that partition.
 */
@Test
public void testPartitionFetcherAndReader() throws Exception {
    // constructs test data using dynamic partition
    TableEnvironment batchEnv = HiveTestUtils.createTableEnvInBatchMode(SqlDialect.HIVE);
    batchEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
    batchEnv.useCatalog(hiveCatalog.getName());
    batchEnv.executeSql("insert overwrite partition_table values " + "(1,'a',08,2019,'08','01')," + "(1,'a',10,2020,'08','31')," + "(2,'a',21,2020,'08','31')," + "(2,'b',22,2020,'08','31')," + "(3,'c',33,2020,'09','31')").await();

    FileSystemLookupFunction<HiveTablePartition> lookupFunction = getLookupFunction("partition_table");
    lookupFunction.open(null);

    PartitionFetcher<HiveTablePartition> fetcher = lookupFunction.getPartitionFetcher();
    PartitionFetcher.Context<HiveTablePartition> context = lookupFunction.getFetcherContext();
    List<HiveTablePartition> partitions = fetcher.fetch(context);
    // fetch latest partition by partition-name
    assertEquals(1, partitions.size());

    ObjectIdentifier tableIdentifier = ObjectIdentifier.of(hiveCatalog.getName(), "default", "partition_table");
    CatalogTable catalogTable = (CatalogTable) hiveCatalog.getTable(tableIdentifier.toObjectPath());
    GenericRowData reuse = new GenericRowData(catalogTable.getSchema().getFieldCount());
    TypeSerializer<RowData> serializer = InternalSerializers.create(catalogTable.getSchema().toRowDataType().getLogicalType());

    List<RowData> res = new ArrayList<>();
    // try-with-resources guarantees the reader is closed even if reading or an assertion
    // fails; previously the opened reader was never closed.
    try (PartitionReader<HiveTablePartition, RowData> reader = lookupFunction.getPartitionReader()) {
        reader.open(partitions);
        RowData row;
        while ((row = reader.read(reuse)) != null) {
            // copy each row because the reader is handed the same reuse object on every call
            res.add(serializer.copy(row));
        }
    }

    res.sort(Comparator.comparingInt(o -> o.getInt(0)));
    assertEquals("[+I(3,c,33,2020,09,31)]", res.toString());
}
Also used : PartitionReader(org.apache.flink.connector.file.table.PartitionReader) PARTITION_TIME_EXTRACTOR_TIMESTAMP_PATTERN(org.apache.flink.connector.file.table.FileSystemConnectorOptions.PARTITION_TIME_EXTRACTOR_TIMESTAMP_PATTERN) Arrays(java.util.Arrays) ObjectIdentifier(org.apache.flink.table.catalog.ObjectIdentifier) BeforeClass(org.junit.BeforeClass) PARTITION_TIME_EXTRACTOR_KIND(org.apache.flink.connector.file.table.FileSystemConnectorOptions.PARTITION_TIME_EXTRACTOR_KIND) CatalogTable(org.apache.flink.table.catalog.CatalogTable) STREAMING_SOURCE_PARTITION_INCLUDE(org.apache.flink.connectors.hive.HiveOptions.STREAMING_SOURCE_PARTITION_INCLUDE) HiveCatalog(org.apache.flink.table.catalog.hive.HiveCatalog) ArrayList(java.util.ArrayList) GenericRowData(org.apache.flink.table.data.GenericRowData) InternalSerializers(org.apache.flink.table.runtime.typeutils.InternalSerializers) Duration(java.time.Duration) DynamicTableSourceFactory(org.apache.flink.table.factories.DynamicTableSourceFactory) STREAMING_SOURCE_PARTITION_ORDER(org.apache.flink.connectors.hive.HiveOptions.STREAMING_SOURCE_PARTITION_ORDER) TableEnvironment(org.apache.flink.table.api.TableEnvironment) AfterClass(org.junit.AfterClass) TypeSerializer(org.apache.flink.api.common.typeutils.TypeSerializer) RowData(org.apache.flink.table.data.RowData) PartitionFetcher(org.apache.flink.connector.file.table.PartitionFetcher) TestValuesTableFactory(org.apache.flink.table.planner.factories.TestValuesTableFactory) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) CollectionUtil(org.apache.flink.util.CollectionUtil) TableImpl(org.apache.flink.table.api.internal.TableImpl) TestCollectionTableFactory(org.apache.flink.table.planner.factories.utils.TestCollectionTableFactory) HiveTestUtils(org.apache.flink.table.catalog.hive.HiveTestUtils) STREAMING_SOURCE_ENABLE(org.apache.flink.connectors.hive.HiveOptions.STREAMING_SOURCE_ENABLE) List(java.util.List) 
FactoryUtil(org.apache.flink.table.factories.FactoryUtil) SqlDialect(org.apache.flink.table.api.SqlDialect) EnvironmentSettings(org.apache.flink.table.api.EnvironmentSettings) Row(org.apache.flink.types.Row) Comparator(java.util.Comparator) STREAMING_SOURCE_MONITOR_INTERVAL(org.apache.flink.connectors.hive.HiveOptions.STREAMING_SOURCE_MONITOR_INTERVAL) TableEnvironmentInternal(org.apache.flink.table.api.internal.TableEnvironmentInternal) Assert.assertEquals(org.junit.Assert.assertEquals) ArrayList(java.util.ArrayList) TableEnvironment(org.apache.flink.table.api.TableEnvironment) CatalogTable(org.apache.flink.table.catalog.CatalogTable) GenericRowData(org.apache.flink.table.data.GenericRowData) RowData(org.apache.flink.table.data.RowData) PartitionFetcher(org.apache.flink.connector.file.table.PartitionFetcher) GenericRowData(org.apache.flink.table.data.GenericRowData) ObjectIdentifier(org.apache.flink.table.catalog.ObjectIdentifier) Test(org.junit.Test)

Aggregations

Duration (java.time.Duration)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 PartitionFetcher (org.apache.flink.connector.file.table.PartitionFetcher)2 PartitionReader (org.apache.flink.connector.file.table.PartitionReader)2 STREAMING_SOURCE_MONITOR_INTERVAL (org.apache.flink.connectors.hive.HiveOptions.STREAMING_SOURCE_MONITOR_INTERVAL)2 STREAMING_SOURCE_PARTITION_INCLUDE (org.apache.flink.connectors.hive.HiveOptions.STREAMING_SOURCE_PARTITION_INCLUDE)2 CatalogTable (org.apache.flink.table.catalog.CatalogTable)2 RowData (org.apache.flink.table.data.RowData)2 Arrays (java.util.Arrays)1 Comparator (java.util.Comparator)1 Optional (java.util.Optional)1 VisibleForTesting (org.apache.flink.annotation.VisibleForTesting)1 TypeSerializer (org.apache.flink.api.common.typeutils.TypeSerializer)1 Configuration (org.apache.flink.configuration.Configuration)1 ReadableConfig (org.apache.flink.configuration.ReadableConfig)1 PARTITION_TIME_EXTRACTOR_KIND (org.apache.flink.connector.file.table.FileSystemConnectorOptions.PARTITION_TIME_EXTRACTOR_KIND)1 PARTITION_TIME_EXTRACTOR_TIMESTAMP_PATTERN (org.apache.flink.connector.file.table.FileSystemConnectorOptions.PARTITION_TIME_EXTRACTOR_TIMESTAMP_PATTERN)1 LOOKUP_JOIN_CACHE_TTL (org.apache.flink.connectors.hive.HiveOptions.LOOKUP_JOIN_CACHE_TTL)1 STREAMING_SOURCE_CONSUME_START_OFFSET (org.apache.flink.connectors.hive.HiveOptions.STREAMING_SOURCE_CONSUME_START_OFFSET)1