Search in sources :

Example 1 with SplitMetadata

Use of io.prestosql.spi.heuristicindex.SplitMetadata in project hetu-core by openlookeng.

The method nextPage of the class OrcRecordReader.

/**
 * Reads the next batch of rows and returns it as a {@link Page} whose blocks are created
 * through {@code blockFactory}, with the column read deferred into the supplied loader.
 *
 * <p>When {@code pageMetadataEnabled} is set, the page additionally carries properties
 * describing the page number, the current stripe (number, offset, length), the index level
 * and, if {@code splitMetadata} is available, the file path and modification time.
 *
 * @return the next page, or {@code null} when {@code prepareNextBatch()} returns a negative size
 * @throws IOException propagated from the underlying batch preparation and read calls
 */
public Page nextPage() throws IOException {
    ColumnReader[] readers = getColumnReaders();
    int rowCount = prepareNextBatch();
    if (rowCount < 0) {
        return null;
    }

    // Readers may be absent for some channels; only the present ones are prepared.
    for (int i = 0; i < readers.length; i++) {
        ColumnReader reader = readers[i];
        if (reader == null) {
            continue;
        }
        reader.prepareNextRead(rowCount);
    }
    batchRead(rowCount);
    matchingRowsInBatchArray = null;
    validateWritePageChecksum(rowCount);

    // Build the lazy page: each block reads its column only when the loader is invoked.
    blockFactory.nextPage();
    Arrays.fill(currentBytesPerCell, 0);
    Block[] lazyBlocks = new Block[readers.length];
    for (int i = 0; i < lazyBlocks.length; i++) {
        final int channel = i;
        lazyBlocks[channel] = blockFactory.createBlock(
                rowCount,
                () -> filterRows(readers[channel].readBlock()),
                loaded -> blockLoaded(channel, loaded));
    }

    // Page metadata is optional; without it the plain page is returned.
    if (!pageMetadataEnabled) {
        return new Page(rowCount, lazyBlocks);
    }

    Properties metadata = new Properties();
    pageCount++;
    metadata.setProperty(DATASOURCE_PAGE_NUMBER, String.valueOf(pageCount));
    if (isCurrentStripeFinished()) {
        // The total page count is published only with the last page of a stripe, so the
        // presence of this property tells the consumer that the stripe is complete.
        metadata.setProperty(DATASOURCE_TOTAL_PAGES, String.valueOf(pageCount));
        pageCount = 0;
    }
    metadata.setProperty(DATASOURCE_STRIPE_NUMBER, String.valueOf(currentStripe));
    metadata.setProperty(DATASOURCE_STRIPE_OFFSET, String.valueOf(stripes.get(currentStripe).getOffset()));
    metadata.setProperty(DATASOURCE_STRIPE_LENGTH, String.valueOf(stripes.get(currentStripe).getTotalLength()));
    if (splitMetadata != null) {
        // splitMetadata is null in tests, in which case the file properties are skipped.
        metadata.setProperty(DATASOURCE_FILE_PATH, splitMetadata.getSplitIdentity());
        metadata.setProperty(DATASOURCE_FILE_MODIFICATION, String.valueOf(splitMetadata.getLastModifiedTime()));
    }
    metadata.setProperty(DATASOURCE_INDEX_LEVEL, "STRIPE");
    return new Page(rowCount, metadata, lazyBlocks);
}
Also used : IntStream(java.util.stream.IntStream) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) ColumnReaders.createColumnReader(io.prestosql.orc.reader.ColumnReaders.createColumnReader) DATASOURCE_TOTAL_PAGES(io.prestosql.spi.HetuConstant.DATASOURCE_TOTAL_PAGES) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) DATASOURCE_FILE_PATH(io.prestosql.spi.HetuConstant.DATASOURCE_FILE_PATH) PeekingIterator(com.google.common.collect.PeekingIterator) Function(java.util.function.Function) ArrayList(java.util.ArrayList) DATASOURCE_STRIPE_NUMBER(io.prestosql.spi.HetuConstant.DATASOURCE_STRIPE_NUMBER) DATASOURCE_STRIPE_OFFSET(io.prestosql.spi.HetuConstant.DATASOURCE_STRIPE_OFFSET) Slices(io.airlift.slice.Slices) Map(java.util.Map) AggregatedMemoryContext(io.prestosql.memory.context.AggregatedMemoryContext) Type(io.prestosql.spi.type.Type) Math.toIntExact(java.lang.Math.toIntExact) Block(io.prestosql.spi.block.Block) ColumnReaders(io.prestosql.orc.reader.ColumnReaders) Properties(java.util.Properties) ImmutableMap(com.google.common.collect.ImmutableMap) DATASOURCE_FILE_MODIFICATION(io.prestosql.spi.HetuConstant.DATASOURCE_FILE_MODIFICATION) OrcType(io.prestosql.orc.metadata.OrcType) HiveWriterVersion(io.prestosql.orc.metadata.PostScript.HiveWriterVersion) Page(io.prestosql.spi.Page) IOException(java.io.IOException) Maps(com.google.common.collect.Maps) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) DATASOURCE_INDEX_LEVEL(io.prestosql.spi.HetuConstant.DATASOURCE_INDEX_LEVEL) ColumnReader(io.prestosql.orc.reader.ColumnReader) DATASOURCE_PAGE_NUMBER(io.prestosql.spi.HetuConstant.DATASOURCE_PAGE_NUMBER) MetadataReader(io.prestosql.orc.metadata.MetadataReader) StripeInformation(io.prestosql.orc.metadata.StripeInformation) DataSize(io.airlift.units.DataSize) List(java.util.List) ClassLayout(org.openjdk.jol.info.ClassLayout) Domain(io.prestosql.spi.predicate.Domain) 
DATASOURCE_STRIPE_LENGTH(io.prestosql.spi.HetuConstant.DATASOURCE_STRIPE_LENGTH) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) Optional(java.util.Optional) VisibleForTesting(com.google.common.annotations.VisibleForTesting) IndexMetadata(io.prestosql.spi.heuristicindex.IndexMetadata) SplitMetadata(io.prestosql.spi.heuristicindex.SplitMetadata) Block(io.prestosql.spi.block.Block) Page(io.prestosql.spi.Page) ColumnReaders.createColumnReader(io.prestosql.orc.reader.ColumnReaders.createColumnReader) ColumnReader(io.prestosql.orc.reader.ColumnReader) Properties(java.util.Properties)

Example 2 with SplitMetadata

Use of io.prestosql.spi.heuristicindex.SplitMetadata in project hetu-core by openlookeng.

The method createPageSourceInternal of the class HivePageSourceProvider.

/**
 * Creates the page source used to read a single Hive split.
 *
 * <p>The method proceeds in the following steps:
 * <ol>
 *   <li>returns an empty {@link FixedPageSource} when the split's partition keys are filtered out
 *       by the supplied dynamic filters;</li>
 *   <li>derives the list of columns considered missing from the schema's column properties;</li>
 *   <li>collects heuristic indexes from {@code indexCache} when it is configured and enabled for the session;</li>
 *   <li>derives an extra predicate from the dynamic filter when exactly one filter map is supplied;</li>
 *   <li>delegates to the selective page source when {@code hiveTable.isSuitableToPush()} is true,
 *       otherwise to the regular Hive page source.</li>
 * </ol>
 *
 * @param session the connector session
 * @param dynamicFilterSupplier optional supplier of dynamic filters, forwarded to the page source
 * @param dynamicFilters dynamic filters per column; may be {@code null}
 * @param hiveTable the table handle the split belongs to
 * @param hiveColumns the columns to read
 * @param hiveSplit the split to read
 * @return the page source for the split
 * @throws RuntimeException if no file reader can be found for the split
 */
private ConnectorPageSource createPageSourceInternal(ConnectorSession session, Optional<DynamicFilterSupplier> dynamicFilterSupplier, List<Map<ColumnHandle, DynamicFilter>> dynamicFilters, HiveTableHandle hiveTable, List<HiveColumnHandle> hiveColumns, HiveSplit hiveSplit) {
    Path path = new Path(hiveSplit.getPath());
    List<Set<DynamicFilter>> dynamicFilterList = new ArrayList<>();
    if (dynamicFilters != null) {
        for (Map<ColumnHandle, DynamicFilter> df : dynamicFilters) {
            Set<DynamicFilter> values = df.values().stream().collect(Collectors.toSet());
            dynamicFilterList.add(values);
        }
    }
    // Filter out splits using partition values and dynamic filters
    if (dynamicFilters != null && !dynamicFilters.isEmpty() && isPartitionFiltered(hiveSplit.getPartitionKeys(), dynamicFilterList, typeManager)) {
        return new FixedPageSource(ImmutableList.of());
    }
    Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsEnvironment.HdfsContext(session, hiveSplit.getDatabase(), hiveSplit.getTable()), path);
    Properties schema = hiveSplit.getSchema();
    String columnNameDelimiter = schema.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? schema.getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA);
    List<String> partitionColumnNames;
    if (schema.containsKey(META_PARTITION_COLUMNS)) {
        partitionColumnNames = Arrays.asList(schema.getProperty(META_PARTITION_COLUMNS).split(columnNameDelimiter));
    } else if (schema.containsKey(META_TABLE_COLUMNS)) {
        partitionColumnNames = Arrays.asList(schema.getProperty(META_TABLE_COLUMNS).split(columnNameDelimiter));
    } else {
        partitionColumnNames = new ArrayList<>();
    }
    // Requested columns beyond the number of names found in the schema are passed on as "missing".
    List<String> tableColumns = hiveColumns.stream().map(HiveColumnHandle::getName).collect(toList());
    List<String> missingColumns = tableColumns.stream().skip(partitionColumnNames.size()).collect(toList());
    List<IndexMetadata> indexes = new ArrayList<>();
    if (indexCache != null && session.isHeuristicIndexFilterEnabled()) {
        indexes.addAll(this.indexCache.getIndices(session.getCatalog().orElse(null), hiveTable.getSchemaTableName().toString(), hiveSplit, hiveTable.getCompactEffectivePredicate(), hiveTable.getPartitionColumns()));
        /* Bloom/Bitmap indices are checked for given table and added to the possible matchers for pushdown. */
        if (hiveTable.getDisjunctCompactEffectivePredicate().isPresent() && hiveTable.getDisjunctCompactEffectivePredicate().get().size() > 0) {
            hiveTable.getDisjunctCompactEffectivePredicate().get().forEach(orPredicate -> indexes.addAll(this.indexCache.getIndices(session.getCatalog().orElse(null), hiveTable.getSchemaTableName().toString(), hiveSplit, orPredicate, hiveTable.getPartitionColumns())));
        }
    }
    // indexes is a local list and therefore never null; only emptiness needs checking.
    Optional<List<IndexMetadata>> indexOptional = indexes.isEmpty() ? Optional.empty() : Optional.of(indexes);
    URI splitUri = URI.create(URIUtil.encodePath(hiveSplit.getPath()));
    SplitMetadata splitMetadata = new SplitMetadata(splitUri.getRawPath(), hiveSplit.getLastModifiedTime());
    TupleDomain<HiveColumnHandle> predicate = TupleDomain.all();
    // A dynamic-filter predicate is only derived when exactly one filter map is supplied.
    if (dynamicFilterSupplier.isPresent() && dynamicFilters != null && dynamicFilters.size() == 1) {
        Map<ColumnHandle, DynamicFilter> singleFilter = dynamicFilters.get(0);
        List<HiveColumnHandle> filteredHiveColumnHandles = hiveColumns.stream().filter(singleFilter::containsKey).collect(toList());
        // Guard against no projected column matching the filter; previously this threw
        // IndexOutOfBoundsException. Without a matching column the predicate stays unrestricted.
        if (!filteredHiveColumnHandles.isEmpty()) {
            HiveColumnHandle hiveColumnHandle = filteredHiveColumnHandles.get(0);
            Type type = hiveColumnHandle.getColumnMetadata(typeManager).getType();
            predicate = getPredicate(singleFilter.get(hiveColumnHandle), type, hiveColumnHandle);
            if (predicate.isNone()) {
                predicate = TupleDomain.all();
            }
        }
    }
    /*
     * This is the main logical division point to process the filter pushdown enabled case (aka selective read flow).
     * If user configuration orc_predicate_pushdown_enabled is true and all clauses of the query can be handled by
     * the hive selective read flow, then hiveTable.isSuitableToPush() will be enabled.
     * (Refer HiveMetadata.checkIfSuitableToPush).
     */
    if (hiveTable.isSuitableToPush()) {
        return createSelectivePageSource(selectivePageSourceFactories, configuration, session, hiveSplit, assignUniqueIndicesToPartitionColumns(hiveColumns), typeManager, dynamicFilterSupplier, hiveSplit.getDeleteDeltaLocations(), hiveSplit.getStartRowOffsetOfFile(), indexOptional, hiveSplit.isCacheable(), hiveTable.getCompactEffectivePredicate(), hiveTable.getPredicateColumns(), hiveTable.getDisjunctCompactEffectivePredicate(), hiveSplit.getBucketConversion(), hiveSplit.getBucketNumber(), hiveSplit.getLastModifiedTime(), missingColumns);
    }
    Optional<ConnectorPageSource> pageSource = createHivePageSource(cursorProviders, pageSourceFactories, configuration, session, path, hiveSplit.getBucketNumber(), hiveSplit.getStart(), hiveSplit.getLength(), hiveSplit.getFileSize(), hiveSplit.getSchema(), hiveTable.getCompactEffectivePredicate().intersect(predicate), hiveColumns, hiveSplit.getPartitionKeys(), typeManager, hiveSplit.getColumnCoercions(), hiveSplit.getBucketConversion(), hiveSplit.isS3SelectPushdownEnabled(), dynamicFilterSupplier, hiveSplit.getDeleteDeltaLocations(), hiveSplit.getStartRowOffsetOfFile(), indexOptional, splitMetadata, hiveSplit.isCacheable(), hiveSplit.getLastModifiedTime(), hiveSplit.getCustomSplitInfo(), missingColumns);
    if (pageSource.isPresent()) {
        return pageSource.get();
    }
    throw new RuntimeException("Could not find a file reader for split " + hiveSplit);
}
Also used : Arrays(java.util.Arrays) DynamicFilter(io.prestosql.spi.dynamicfilter.DynamicFilter) BuiltInFunctionHandle(io.prestosql.spi.function.BuiltInFunctionHandle) ValueSet(io.prestosql.spi.predicate.ValueSet) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) META_PARTITION_COLUMNS(io.prestosql.plugin.hive.metastore.MetastoreUtil.META_PARTITION_COLUMNS) CallExpression(io.prestosql.spi.relation.CallExpression) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) HiveCoercer.createCoercer(io.prestosql.plugin.hive.coercions.HiveCoercer.createCoercer) BucketingVersion(io.prestosql.plugin.hive.HiveBucketing.BucketingVersion) FilteredDynamicFilter(io.prestosql.spi.dynamicfilter.FilteredDynamicFilter) Slices(io.airlift.slice.Slices) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) Type(io.prestosql.spi.type.Type) URI(java.net.URI) MAX_PARTITION_KEY_COLUMN_INDEX(io.prestosql.plugin.hive.HiveColumnHandle.MAX_PARTITION_KEY_COLUMN_INDEX) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) org.apache.hadoop.hive.serde.serdeConstants(org.apache.hadoop.hive.serde.serdeConstants) Set(java.util.Set) Collectors(java.util.stream.Collectors) Preconditions.checkState(com.google.common.base.Preconditions.checkState) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ConnectorPageSource(io.prestosql.spi.connector.ConnectorPageSource) Domain(io.prestosql.spi.predicate.Domain) ConnectorTransactionHandle(io.prestosql.spi.connector.ConnectorTransactionHandle) URIUtil(org.eclipse.jetty.util.URIUtil) Optional(java.util.Optional) IndexMetadata(io.prestosql.spi.heuristicindex.IndexMetadata) 
SplitMetadata(io.prestosql.spi.heuristicindex.SplitMetadata) Slice(io.airlift.slice.Slice) FixedPageSource(io.prestosql.spi.connector.FixedPageSource) ConnectorSplit(io.prestosql.spi.connector.ConnectorSplit) META_TABLE_COLUMNS(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS) OptionalInt(java.util.OptionalInt) ArrayList(java.util.ArrayList) Inject(javax.inject.Inject) HashSet(java.util.HashSet) REGULAR(io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR) ImmutableList(com.google.common.collect.ImmutableList) Range(io.prestosql.spi.predicate.Range) HiveCoercer(io.prestosql.plugin.hive.coercions.HiveCoercer) Objects.requireNonNull(java.util.Objects.requireNonNull) DynamicFilterSupplier(io.prestosql.spi.dynamicfilter.DynamicFilterSupplier) RecordCursor(io.prestosql.spi.connector.RecordCursor) Signature(io.prestosql.spi.function.Signature) SerDeUtils(org.apache.hadoop.hive.serde2.SerDeUtils) Properties(java.util.Properties) ConnectorTableHandle(io.prestosql.spi.connector.ConnectorTableHandle) TupleDomain(io.prestosql.spi.predicate.TupleDomain) TypeManager(io.prestosql.spi.type.TypeManager) HiveUtil.isPartitionFiltered(io.prestosql.plugin.hive.HiveUtil.isPartitionFiltered) CombinedDynamicFilter(io.prestosql.spi.dynamicfilter.CombinedDynamicFilter) Collectors.toList(java.util.stream.Collectors.toList) IndexCache(io.prestosql.plugin.hive.util.IndexCache) ColumnMapping.toColumnHandles(io.prestosql.plugin.hive.HivePageSourceProvider.ColumnMapping.toColumnHandles) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) RowExpression(io.prestosql.spi.relation.RowExpression) RecordPageSource(io.prestosql.spi.connector.RecordPageSource) ConnectorPageSourceProvider(io.prestosql.spi.connector.ConnectorPageSourceProvider) OrcConcatPageSource(io.prestosql.plugin.hive.orc.OrcConcatPageSource) ValueSet(io.prestosql.spi.predicate.ValueSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) 
Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) Properties(java.util.Properties) ConnectorPageSource(io.prestosql.spi.connector.ConnectorPageSource) URI(java.net.URI) SplitMetadata(io.prestosql.spi.heuristicindex.SplitMetadata) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) IndexMetadata(io.prestosql.spi.heuristicindex.IndexMetadata) Path(org.apache.hadoop.fs.Path) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) DynamicFilter(io.prestosql.spi.dynamicfilter.DynamicFilter) FilteredDynamicFilter(io.prestosql.spi.dynamicfilter.FilteredDynamicFilter) CombinedDynamicFilter(io.prestosql.spi.dynamicfilter.CombinedDynamicFilter) FixedPageSource(io.prestosql.spi.connector.FixedPageSource) Type(io.prestosql.spi.type.Type)

Example 3 with SplitMetadata

Use of io.prestosql.spi.heuristicindex.SplitMetadata in project hetu-core by openlookeng.

The method createOrcPageSource of the class OrcPageSourceFactory.

/**
 * Creates an {@link OrcPageSource} for the byte range {@code [start, start + length)} of an ORC file.
 *
 * <p>The method
 * <ol>
 *   <li>checks that every requested column is a regular column or the row-id column and that the
 *       effective predicate is not "none";</li>
 *   <li>wraps a lazily opened HDFS stream in an {@link HdfsOrcDataSource};</li>
 *   <li>obtains the file tail, through the file tail cache when
 *       {@code orcCacheProperties.isFileTailCacheEnabled()} is true, falling back to a direct read
 *       if the cache load fails;</li>
 *   <li>for full-ACID tables whose file name contains {@code "bucket"}, verifies the ACID schema and
 *       reads the five ACID metadata columns ahead of the data columns;</li>
 *   <li>maps each requested column to a file column by name (when {@code useOrcColumnNames} or
 *       {@code isFullAcid}) or by Hive column index, substituting a null column when no file column
 *       is found;</li>
 *   <li>creates the record reader and wraps it, together with the deleted-rows tracker, in the page source.</li>
 * </ol>
 *
 * @param path the ORC file to read
 * @param start offset of the split within the file
 * @param length length of the split
 * @param fileSize total size of the file
 * @param columns the columns to read; each must be regular or the row-id column
 * @param useOrcColumnNames whether columns are matched by the names stored in the ORC file
 * @param isFullAcid whether the table is a full-ACID table
 * @param effectivePredicate predicate pushed to the ORC reader; must not be none
 * @param indexes optional heuristic indexes passed to the record reader
 * @param splitMetadata split identity passed to the record reader
 * @param pageMetadataEnabled whether the record reader attaches page metadata
 * @return the page source for the requested range
 * @throws PrestoException with {@code HIVE_CANNOT_OPEN_SPLIT} if the data source or reader cannot be
 *         created, or {@code HIVE_MISSING_DATA} if a {@link BlockMissingException} occurs
 */
public static OrcPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, String sessionUser, Configuration configuration, Path path, long start, long length, long fileSize, List<HiveColumnHandle> columns, boolean useOrcColumnNames, boolean isFullAcid, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone legacyFileTimeZone, TypeManager typeManager, DataSize maxMergeDistance, DataSize maxBufferSize, DataSize streamBufferSize, DataSize tinyStripeThreshold, DataSize maxReadBlockSize, boolean lazyReadSmallRanges, boolean orcBloomFiltersEnabled, FileFormatDataSourceStats stats, Optional<DynamicFilterSupplier> dynamicFilters, Optional<DeleteDeltaLocations> deleteDeltaLocations, Optional<Long> startRowOffsetOfFile, Optional<List<IndexMetadata>> indexes, SplitMetadata splitMetadata, OrcCacheStore orcCacheStore, OrcCacheProperties orcCacheProperties, int domainCompactionThreshold, boolean pageMetadataEnabled, long dataSourceLastModifiedTime) {
    for (HiveColumnHandle column : columns) {
        checkArgument(column.getColumnType() == HiveColumnHandle.ColumnType.REGULAR || column.getHiveColumnIndex() == HiveColumnHandle.ROW_ID__COLUMN_INDEX, "column type must be regular: %s", column);
    }
    checkArgument(!effectivePredicate.isNone());
    OrcDataSource orcDataSource;
    try {
        // Always create a lazy Stream. HDFS stream opened only when required.
        FSDataInputStream inputStream = new FSDataInputStream(new LazyFSInputStream(() -> {
            FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration);
            return hdfsEnvironment.doAs(sessionUser, () -> fileSystem.open(path));
        }));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, maxMergeDistance, maxBufferSize, streamBufferSize, lazyReadSmallRanges, inputStream, stats, dataSourceLastModifiedTime);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext();
    try {
        OrcDataSource readerLocalDataSource = OrcReader.wrapWithCacheIfTiny(orcDataSource, tinyStripeThreshold);
        OrcFileTail fileTail;
        if (orcCacheProperties.isFileTailCacheEnabled()) {
            try {
                OrcDataSourceIdWithTimeStamp orcDataSourceIdWithTimeStamp = new OrcDataSourceIdWithTimeStamp(readerLocalDataSource.getId(), readerLocalDataSource.getLastModifiedTime());
                fileTail = orcCacheStore.getFileTailCache().get(new OrcFileTailCacheKey(orcDataSourceIdWithTimeStamp), () -> OrcPageSourceFactory.createFileTail(orcDataSource));
            } catch (UncheckedExecutionException | ExecutionException executionException) {
                handleCacheLoadException(executionException);
                log.debug(executionException.getCause(), "Error while caching the Orc file tail. Falling back to default flow");
                fileTail = OrcPageSourceFactory.createFileTail(orcDataSource);
            }
        } else {
            fileTail = OrcPageSourceFactory.createFileTail(orcDataSource);
        }
        OrcReader reader = new OrcReader(readerLocalDataSource, fileTail, maxMergeDistance, tinyStripeThreshold, maxReadBlockSize);
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        // Full-ACID reads prepend five ACID metadata columns, hence the extra capacity.
        List<OrcColumn> fileReadColumns = isFullAcid ? new ArrayList<>(columns.size() + 5) : new ArrayList<>(columns.size());
        List<Type> fileReadTypes = isFullAcid ? new ArrayList<>(columns.size() + 5) : new ArrayList<>(columns.size());
        ImmutableList<String> acidColumnNames = null;
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());
        // Only Hive ACID files will begin with bucket_ (the check itself is a substring match on the file name)
        boolean fileNameContainsBucket = path.getName().contains("bucket");
        if (isFullAcid && fileNameContainsBucket) {
            // Skip the acid schema check in case of non-ACID files
            acidColumnNames = ImmutableList.<String>builder().add(ACID_COLUMN_ORIGINAL_TRANSACTION, ACID_COLUMN_BUCKET, ACID_COLUMN_ROW_ID, ACID_COLUMN_CURRENT_TRANSACTION, ACID_COLUMN_OPERATION).build();
            verifyAcidSchema(reader, path);
            Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
            if (AcidUtils.isDeleteDelta(path.getParent())) {
                // Avoid reading column data from delete_delta files.
                // Call will come here in case of Minor VACUUM where all delete_delta files are merged together.
                fileColumns = ImmutableList.of();
            } else {
                fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(ACID_COLUMN_ROW_STRUCT).getNestedColumns(), columns);
            }
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_CURRENT_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_OPERATION.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
        }
        Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            verifyFileHasColumnNames(fileColumns, path);
            // Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore
            fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
        }
        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(orcBloomFiltersEnabled);
        Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        for (HiveColumnHandle column : columns) {
            OrcColumn orcColumn = null;
            if (useOrcColumnNames || isFullAcid) {
                orcColumn = fileColumnsByName.get(column.getName());
            } else if (column.getHiveColumnIndex() >= 0 && column.getHiveColumnIndex() < fileColumns.size()) {
                orcColumn = fileColumns.get(column.getHiveColumnIndex());
            }
            Type readType = typeManager.getType(column.getTypeSignature());
            if (orcColumn != null) {
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                Domain domain = effectivePredicateDomains.get(column);
                if (domain != null) {
                    predicateBuilder.addColumn(orcColumn.getColumnId(), domain);
                }
            } else if (isFullAcid && readType instanceof RowType && column.getName().equalsIgnoreCase(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME)) {
                HiveType hiveType = column.getHiveType();
                StructTypeInfo structTypeInfo = (StructTypeInfo) hiveType.getTypeInfo();
                ArrayList<String> fieldNames = structTypeInfo.getAllStructFieldNames();
                // NOTE(review): acidColumnNames is only assigned when the file name contains "bucket".
                // If this branch is reached otherwise, the method reference below throws
                // NullPointerException - confirm that callers cannot hit this combination.
                List<ColumnAdaptation> adaptations = fieldNames.stream().map(acidColumnNames::indexOf).map(c -> ColumnAdaptation.sourceColumn(c, false)).collect(Collectors.toList());
                columnAdaptations.add(ColumnAdaptation.structColumn(structTypeInfo, adaptations));
            } else {
                // No matching file column: the page source produces nulls for this column.
                columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
            }
        }
        // Reuse the domains already extracted above instead of calling Optional.get() a second time.
        Map<String, Domain> domains = effectivePredicateDomains.entrySet().stream().collect(toMap(entry -> entry.getKey().getName(), Map.Entry::getValue));
        OrcRecordReader recordReader = reader.createRecordReader(fileReadColumns, fileReadTypes, predicateBuilder.build(), start, length, legacyFileTimeZone, systemMemoryUsage, INITIAL_BATCH_SIZE, exception -> handleException(orcDataSource.getId(), exception), indexes, splitMetadata, domains, orcCacheStore, orcCacheProperties, pageMetadataEnabled);
        OrcDeletedRows deletedRows = new OrcDeletedRows(path.getName(), deleteDeltaLocations, new OrcDeleteDeltaPageSourceFactory(sessionUser, configuration, hdfsEnvironment, maxMergeDistance, maxBufferSize, streamBufferSize, maxReadBlockSize, tinyStripeThreshold, lazyReadSmallRanges, orcBloomFiltersEnabled, stats), sessionUser, configuration, hdfsEnvironment, startRowOffsetOfFile);
        boolean eagerload = false;
        if (indexes.isPresent()) {
            eagerload = indexes.get().stream().anyMatch(indexMetadata -> EAGER_LOAD_INDEX_ID.contains(indexMetadata.getIndex().getId()));
        }
        return new OrcPageSource(recordReader, columnAdaptations, orcDataSource, deletedRows, eagerload, systemMemoryUsage, stats);
    } catch (Exception e) {
        try {
            orcDataSource.close();
        } catch (IOException ignored) {
            // Best-effort close: the original failure below is the one reported to the caller.
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof BlockMissingException) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : OrcReader(io.prestosql.orc.OrcReader) DateTimeZone(org.joda.time.DateTimeZone) LONG(io.prestosql.orc.metadata.OrcType.OrcTypeKind.LONG) TupleDomainOrcPredicate(io.prestosql.orc.TupleDomainOrcPredicate) FileSystem(org.apache.hadoop.fs.FileSystem) HiveSessionProperties.isOrcStripeFooterCacheEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isOrcStripeFooterCacheEnabled) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) HiveSessionProperties.isOrcRowDataCacheEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isOrcRowDataCacheEnabled) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) AggregatedMemoryContext(io.prestosql.memory.context.AggregatedMemoryContext) RowType(io.prestosql.spi.type.RowType) HiveSessionProperties.getOrcStreamBufferSize(io.prestosql.plugin.hive.HiveSessionProperties.getOrcStreamBufferSize) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ColumnAdaptation(io.prestosql.plugin.hive.orc.OrcPageSource.ColumnAdaptation) FileFormatDataSourceStats(io.prestosql.plugin.hive.FileFormatDataSourceStats) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ConnectorPageSource(io.prestosql.spi.connector.ConnectorPageSource) HIVE_BAD_DATA(io.prestosql.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) OrcCacheProperties(io.prestosql.orc.OrcCacheProperties) Domain(io.prestosql.spi.predicate.Domain) SplitMetadata(io.prestosql.spi.heuristicindex.SplitMetadata) HiveSessionProperties.getOrcMaxBufferSize(io.prestosql.plugin.hive.HiveSessionProperties.getOrcMaxBufferSize) TupleDomainOrcPredicateBuilder(io.prestosql.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) ArrayList(java.util.ArrayList) INITIAL_BATCH_SIZE(io.prestosql.orc.OrcReader.INITIAL_BATCH_SIZE) 
HdfsEnvironment(io.prestosql.plugin.hive.HdfsEnvironment) DynamicFilterSupplier(io.prestosql.spi.dynamicfilter.DynamicFilterSupplier) HiveSessionProperties.getOrcMaxReadBlockSize(io.prestosql.plugin.hive.HiveSessionProperties.getOrcMaxReadBlockSize) HiveSessionProperties.isOrcFileTailCacheEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isOrcFileTailCacheEnabled) HivePageSourceFactory(io.prestosql.plugin.hive.HivePageSourceFactory) Properties(java.util.Properties) DeleteDeltaLocations(io.prestosql.plugin.hive.DeleteDeltaLocations) TypeManager(io.prestosql.spi.type.TypeManager) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) HiveSessionProperties.isOrcBloomFiltersEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isOrcBloomFiltersEnabled) OrcPageSource.handleException(io.prestosql.plugin.hive.orc.OrcPageSource.handleException) STRUCT(io.prestosql.orc.metadata.OrcType.OrcTypeKind.STRUCT) HiveSessionProperties.getOrcLazyReadSmallRanges(io.prestosql.plugin.hive.HiveSessionProperties.getOrcLazyReadSmallRanges) OrcRecordReader(io.prestosql.orc.OrcRecordReader) HiveColumnHandle(io.prestosql.plugin.hive.HiveColumnHandle) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) Seekable(org.apache.hadoop.fs.Seekable) HiveSessionProperties.getOrcTinyStripeThreshold(io.prestosql.plugin.hive.HiveSessionProperties.getOrcTinyStripeThreshold) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) Collectors.toMap(java.util.stream.Collectors.toMap) HiveConfig(io.prestosql.plugin.hive.HiveConfig) OrcDataSourceIdWithTimeStamp(io.prestosql.orc.OrcDataSourceIdWithTimeStamp) Path(org.apache.hadoop.fs.Path) Type(io.prestosql.spi.type.Type) BIGINT(io.prestosql.spi.type.BigintType.BIGINT) PrestoException(io.prestosql.spi.PrestoException) OrcFileTail(io.prestosql.orc.OrcFileTail) OrcTypeKind(io.prestosql.orc.metadata.OrcType.OrcTypeKind) 
ImmutableMap(com.google.common.collect.ImmutableMap) INT(io.prestosql.orc.metadata.OrcType.OrcTypeKind.INT) AcidUtils.isFullAcidTable(org.apache.hadoop.hive.ql.io.AcidUtils.isFullAcidTable) HIVE_FILE_MISSING_COLUMN_NAMES(io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILE_MISSING_COLUMN_NAMES) HiveSessionProperties.isOrcBloomFiltersCacheEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isOrcBloomFiltersCacheEnabled) OrcDataSource(io.prestosql.orc.OrcDataSource) HiveType(io.prestosql.plugin.hive.HiveType) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) OrcColumn(io.prestosql.orc.OrcColumn) OrcFileTailCacheKey(io.prestosql.orc.OrcFileTailCacheKey) DataSize(io.airlift.units.DataSize) List(java.util.List) HiveSessionProperties.getOrcMaxMergeDistance(io.prestosql.plugin.hive.HiveSessionProperties.getOrcMaxMergeDistance) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) Pattern(java.util.regex.Pattern) IndexMetadata(io.prestosql.spi.heuristicindex.IndexMetadata) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.prestosql.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) HIVE_CANNOT_OPEN_SPLIT(io.prestosql.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) Logger(io.airlift.log.Logger) FixedPageSource(io.prestosql.spi.connector.FixedPageSource) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) HashMap(java.util.HashMap) INTEGER(io.prestosql.spi.type.IntegerType.INTEGER) Inject(javax.inject.Inject) HIVE_MISSING_DATA(io.prestosql.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) ImmutableList(com.google.common.collect.ImmutableList) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) Objects.requireNonNull(java.util.Objects.requireNonNull) PositionedReadable(org.apache.hadoop.fs.PositionedReadable) 
HiveSessionProperties.isOrcRowIndexCacheEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isOrcRowIndexCacheEnabled) HiveUtil(io.prestosql.plugin.hive.HiveUtil) TupleDomain(io.prestosql.spi.predicate.TupleDomain) Maps(com.google.common.collect.Maps) OrcCacheStore(io.prestosql.orc.OrcCacheStore) OrcDataSourceId(io.prestosql.orc.OrcDataSourceId) OrcReader.handleCacheLoadException(io.prestosql.orc.OrcReader.handleCacheLoadException) InputStream(java.io.InputStream) TupleDomainOrcPredicateBuilder(io.prestosql.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) OrcFileTail(io.prestosql.orc.OrcFileTail) FileSystem(org.apache.hadoop.fs.FileSystem) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) OrcColumn(io.prestosql.orc.OrcColumn) OrcReader(io.prestosql.orc.OrcReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) OrcFileTailCacheKey(io.prestosql.orc.OrcFileTailCacheKey) Domain(io.prestosql.spi.predicate.Domain) TupleDomain(io.prestosql.spi.predicate.TupleDomain) HiveType(io.prestosql.plugin.hive.HiveType) Map(java.util.Map) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Collectors.toMap(java.util.stream.Collectors.toMap) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) ColumnAdaptation(io.prestosql.plugin.hive.orc.OrcPageSource.ColumnAdaptation) ImmutableList(com.google.common.collect.ImmutableList) RowType(io.prestosql.spi.type.RowType) PrestoException(io.prestosql.spi.PrestoException) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ExecutionException(java.util.concurrent.ExecutionException) 
UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) HiveColumnHandle(io.prestosql.plugin.hive.HiveColumnHandle) OrcDataSource(io.prestosql.orc.OrcDataSource) OrcDataSourceId(io.prestosql.orc.OrcDataSourceId) OrcDataSourceIdWithTimeStamp(io.prestosql.orc.OrcDataSourceIdWithTimeStamp) IOException(java.io.IOException) OrcRecordReader(io.prestosql.orc.OrcRecordReader) AggregatedMemoryContext(io.prestosql.memory.context.AggregatedMemoryContext) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.prestosql.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) OrcPageSource.handleException(io.prestosql.plugin.hive.orc.OrcPageSource.handleException) PrestoException(io.prestosql.spi.PrestoException) FileNotFoundException(java.io.FileNotFoundException) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) OrcReader.handleCacheLoadException(io.prestosql.orc.OrcReader.handleCacheLoadException) RowType(io.prestosql.spi.type.RowType) Type(io.prestosql.spi.type.Type) HiveType(io.prestosql.plugin.hive.HiveType) TupleDomainOrcPredicateBuilder(io.prestosql.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder)

Aggregations

ImmutableMap (com.google.common.collect.ImmutableMap)3 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)2 ImmutableList (com.google.common.collect.ImmutableList)2 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)2 Maps (com.google.common.collect.Maps)2 Maps.uniqueIndex (com.google.common.collect.Maps.uniqueIndex)2 Logger (io.airlift.log.Logger)2 Slice (io.airlift.slice.Slice)2 Slices (io.airlift.slice.Slices)2 DataSize (io.airlift.units.DataSize)2 AggregatedMemoryContext (io.prestosql.memory.context.AggregatedMemoryContext)2 IndexMetadata (io.prestosql.spi.heuristicindex.IndexMetadata)2 SplitMetadata (io.prestosql.spi.heuristicindex.SplitMetadata)2 Domain (io.prestosql.spi.predicate.Domain)2 Type (io.prestosql.spi.type.Type)2 ArrayList (java.util.ArrayList)2 Arrays (java.util.Arrays)2 List (java.util.List)2 Map (java.util.Map)2 Optional (java.util.Optional)2