Example 1 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in the trino project by trinodb.

From class HivePageSourceProvider, method createPageSource:

@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit split, ConnectorTableHandle tableHandle, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
    HiveTableHandle hiveTable = (HiveTableHandle) tableHandle;
    HiveSplit hiveSplit = (HiveSplit) split;
    if (shouldSkipBucket(hiveTable, hiveSplit, dynamicFilter)) {
        return new EmptyPageSource();
    }
    List<HiveColumnHandle> hiveColumns = columns.stream().map(HiveColumnHandle.class::cast).collect(toList());
    List<HiveColumnHandle> dependencyColumns = hiveColumns.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
    if (hiveTable.isAcidUpdate()) {
        hiveColumns = hiveTable.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("update processor not present")).mergeWithNonUpdatedColumns(hiveColumns);
    }
    Path path = new Path(hiveSplit.getPath());
    boolean originalFile = ORIGINAL_FILE_PATH_MATCHER.matcher(path.toString()).matches();
    List<ColumnMapping> columnMappings = ColumnMapping.buildColumnMappings(hiveSplit.getPartitionName(), hiveSplit.getPartitionKeys(), hiveColumns, hiveSplit.getBucketConversion().map(BucketConversion::getBucketColumnHandles).orElse(ImmutableList.of()), hiveSplit.getTableToPartitionMapping(), path, hiveSplit.getBucketNumber(), hiveSplit.getEstimatedFileSize(), hiveSplit.getFileModifiedTime());
    // This can happen when dynamic filters are collected after partition splits were listed.
    if (shouldSkipSplit(columnMappings, dynamicFilter)) {
        return new EmptyPageSource();
    }
    Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsContext(session), path);
    TupleDomain<HiveColumnHandle> simplifiedDynamicFilter = dynamicFilter.getCurrentPredicate().transformKeys(HiveColumnHandle.class::cast).simplify(domainCompactionThreshold);
    Optional<ConnectorPageSource> pageSource = createHivePageSource(pageSourceFactories, cursorProviders, configuration, session, path, hiveSplit.getBucketNumber(), hiveSplit.getStart(), hiveSplit.getLength(), hiveSplit.getEstimatedFileSize(), hiveSplit.getSchema(), hiveTable.getCompactEffectivePredicate().intersect(simplifiedDynamicFilter), hiveColumns, typeManager, hiveSplit.getBucketConversion(), hiveSplit.getBucketValidation(), hiveSplit.isS3SelectPushdownEnabled(), hiveSplit.getAcidInfo(), originalFile, hiveTable.getTransaction(), columnMappings);
    if (pageSource.isPresent()) {
        ConnectorPageSource source = pageSource.get();
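        // ACID DELETE/UPDATE: resolve the ACID row struct type by its OrcColumnId and wrap the source so each split gets a disjoint row-ID range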
        if (hiveTable.isAcidDelete() || hiveTable.isAcidUpdate()) {
            checkArgument(orcFileWriterFactory.isPresent(), "orcFileWriterFactory not supplied but required for DELETE and UPDATE");
            HivePageSource hivePageSource = (HivePageSource) source;
            OrcPageSource orcPageSource = (OrcPageSource) hivePageSource.getDelegate();
            ColumnMetadata<OrcType> columnMetadata = orcPageSource.getColumnTypes();
            int acidRowColumnId = originalFile ? 0 : ACID_ROW_STRUCT_COLUMN_ID;
            HiveType rowType = fromOrcTypeToHiveType(columnMetadata.get(new OrcColumnId(acidRowColumnId)), columnMetadata);
            long currentSplitNumber = hiveSplit.getSplitNumber();
            if (currentSplitNumber >= MAX_NUMBER_OF_SPLITS) {
                throw new TrinoException(GENERIC_INSUFFICIENT_RESOURCES, format("Number of splits is higher than maximum possible number of splits %d", MAX_NUMBER_OF_SPLITS));
            }
            long initialRowId = currentSplitNumber << PER_SPLIT_ROW_ID_BITS;
            return new HiveUpdatablePageSource(hiveTable, hiveSplit.getPartitionName(), hiveSplit.getStatementId(), source, typeManager, hiveSplit.getBucketNumber(), path, originalFile, orcFileWriterFactory.get(), configuration, session, rowType, dependencyColumns, hiveTable.getTransaction().getOperation(), initialRowId, MAX_NUMBER_OF_ROWS_PER_SPLIT);
        }
        return source;
    }
    throw new RuntimeException("Could not find a file reader for split " + hiveSplit);
}
Also used : OrcColumnId(io.trino.orc.metadata.OrcColumnId) Configuration(org.apache.hadoop.conf.Configuration) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) Path(org.apache.hadoop.fs.Path) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) OrcType(io.trino.orc.metadata.OrcType) TrinoException(io.trino.spi.TrinoException) OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType(io.trino.plugin.hive.orc.OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType) BucketConversion(io.trino.plugin.hive.HiveSplit.BucketConversion)
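
The split-number guard in this example exists because DELETE and UPDATE row IDs are built by packing the split number into the high bits of a 64-bit value, giving every split its own disjoint range. A minimal sketch of that scheme follows; the bit width and limits here are illustrative assumptions, not the constants defined next to HiveUpdatablePageSource.

public final class SplitRowIdRanges
{
    // Illustrative constants only; the real PER_SPLIT_ROW_ID_BITS and MAX_* values live in Trino's Hive connector.
    private static final int PER_SPLIT_ROW_ID_BITS = 40;
    private static final long MAX_NUMBER_OF_ROWS_PER_SPLIT = 1L << PER_SPLIT_ROW_ID_BITS;
    private static final long MAX_NUMBER_OF_SPLITS = 1L << (62 - PER_SPLIT_ROW_ID_BITS);

    private SplitRowIdRanges() {}

    public static long initialRowId(long splitNumber)
    {
        if (splitNumber >= MAX_NUMBER_OF_SPLITS) {
            // mirrors the GENERIC_INSUFFICIENT_RESOURCES check in createPageSource above
            throw new IllegalStateException("Number of splits is higher than maximum possible number of splits " + MAX_NUMBER_OF_SPLITS);
        }
        // rows of this split receive IDs in [initialRowId, initialRowId + MAX_NUMBER_OF_ROWS_PER_SPLIT)
        return splitNumber << PER_SPLIT_ROW_ID_BITS;
    }
}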

Example 2 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in the trino project by trinodb.

From class ListColumnWriter, method finishRowGroup:

@Override
public Map<OrcColumnId, ColumnStatistics> finishRowGroup() {
    checkState(!closed);
    ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null, null);
    rowGroupColumnStatistics.add(statistics);
    nonNullValueCount = 0;
    ImmutableMap.Builder<OrcColumnId, ColumnStatistics> columnStatistics = ImmutableMap.builder();
    columnStatistics.put(columnId, statistics);
    columnStatistics.putAll(elementWriter.finishRowGroup());
    return columnStatistics.buildOrThrow();
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.trino.orc.metadata.OrcColumnId) ImmutableMap(com.google.common.collect.ImmutableMap)
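
The same merge pattern generalizes to any compound writer: it reports its own row-group statistics under its OrcColumnId and folds in whatever its child writers return. A hedged sketch of that step, written as a standalone helper so the writer's fields become parameters (the types are those listed under "Also used" above, plus ColumnWriter, List, and Map):

private static Map<OrcColumnId, ColumnStatistics> mergeRowGroupStatistics(OrcColumnId columnId, long nonNullValueCount, List<ColumnWriter> childWriters)
{
    // the parent records only its non-null value count, exactly as ListColumnWriter does above
    ColumnStatistics parentStatistics = new ColumnStatistics(nonNullValueCount, 0, null, null, null, null, null, null, null, null, null);
    ImmutableMap.Builder<OrcColumnId, ColumnStatistics> statistics = ImmutableMap.builder();
    statistics.put(columnId, parentStatistics);
    for (ColumnWriter child : childWriters) {
        // each child contributes entries for itself and all of its descendants
        statistics.putAll(child.finishRowGroup());
    }
    return statistics.buildOrThrow();
}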

Example 3 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in the trino project by trinodb.

From class ColumnWriters, method createColumnWriter:

public static ColumnWriter createColumnWriter(OrcColumnId columnId, ColumnMetadata<OrcType> orcTypes, Type type, CompressionKind compression, int bufferSize, DataSize stringStatisticsLimit, Supplier<BloomFilterBuilder> bloomFilterBuilder) {
    requireNonNull(type, "type is null");
    OrcType orcType = orcTypes.get(columnId);
    if (type instanceof TimeType) {
        TimeType timeType = (TimeType) type;
        checkArgument(timeType.getPrecision() == 6, "%s not supported for ORC writer", type);
        checkArgument(orcType.getOrcTypeKind() == LONG, "wrong ORC type %s for type %s", orcType, type);
        checkArgument("TIME".equals(orcType.getAttributes().get("iceberg.long-type")), "wrong attributes %s for type %s", orcType.getAttributes(), type);
        return new TimeColumnWriter(columnId, type, compression, bufferSize, () -> new IntegerStatisticsBuilder(bloomFilterBuilder.get()));
    }
    switch(orcType.getOrcTypeKind()) {
        case BOOLEAN:
            return new BooleanColumnWriter(columnId, type, compression, bufferSize);
        case FLOAT:
            return new FloatColumnWriter(columnId, type, compression, bufferSize, () -> new DoubleStatisticsBuilder(bloomFilterBuilder.get()));
        case DOUBLE:
            return new DoubleColumnWriter(columnId, type, compression, bufferSize, () -> new DoubleStatisticsBuilder(bloomFilterBuilder.get()));
        case BYTE:
            return new ByteColumnWriter(columnId, type, compression, bufferSize);
        case DATE:
            return new LongColumnWriter(columnId, type, compression, bufferSize, () -> new DateStatisticsBuilder(bloomFilterBuilder.get()));
        case SHORT:
        case INT:
        case LONG:
            return new LongColumnWriter(columnId, type, compression, bufferSize, () -> new IntegerStatisticsBuilder(bloomFilterBuilder.get()));
        case DECIMAL:
            return new DecimalColumnWriter(columnId, type, compression, bufferSize);
        case TIMESTAMP:
        case TIMESTAMP_INSTANT:
            return new TimestampColumnWriter(columnId, type, compression, bufferSize, () -> new TimestampStatisticsBuilder(bloomFilterBuilder.get()));
        case BINARY:
            return new SliceDirectColumnWriter(columnId, type, compression, bufferSize, BinaryStatisticsBuilder::new);
        case CHAR:
        case VARCHAR:
        case STRING:
            return new SliceDictionaryColumnWriter(columnId, type, compression, bufferSize, () -> new StringStatisticsBuilder(toIntExact(stringStatisticsLimit.toBytes()), bloomFilterBuilder.get()));
        case LIST:
            {
                OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(0);
                Type fieldType = type.getTypeParameters().get(0);
                ColumnWriter elementWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder);
                return new ListColumnWriter(columnId, compression, bufferSize, elementWriter);
            }
        case MAP:
            {
                ColumnWriter keyWriter = createColumnWriter(orcType.getFieldTypeIndex(0), orcTypes, type.getTypeParameters().get(0), compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder);
                ColumnWriter valueWriter = createColumnWriter(orcType.getFieldTypeIndex(1), orcTypes, type.getTypeParameters().get(1), compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder);
                return new MapColumnWriter(columnId, compression, bufferSize, keyWriter, valueWriter);
            }
        case STRUCT:
            {
                ImmutableList.Builder<ColumnWriter> fieldWriters = ImmutableList.builder();
                for (int fieldId = 0; fieldId < orcType.getFieldCount(); fieldId++) {
                    OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(fieldId);
                    Type fieldType = type.getTypeParameters().get(fieldId);
                    fieldWriters.add(createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder));
                }
                return new StructColumnWriter(columnId, compression, bufferSize, fieldWriters.build());
            }
        case UNION:
    }
    throw new IllegalArgumentException("Unsupported type: " + type);
}
Also used : StringStatisticsBuilder(io.trino.orc.metadata.statistics.StringStatisticsBuilder) OrcColumnId(io.trino.orc.metadata.OrcColumnId) TimestampStatisticsBuilder(io.trino.orc.metadata.statistics.TimestampStatisticsBuilder) IntegerStatisticsBuilder(io.trino.orc.metadata.statistics.IntegerStatisticsBuilder) DoubleStatisticsBuilder(io.trino.orc.metadata.statistics.DoubleStatisticsBuilder) BinaryStatisticsBuilder(io.trino.orc.metadata.statistics.BinaryStatisticsBuilder) BloomFilterBuilder(io.trino.orc.metadata.statistics.BloomFilterBuilder) DateStatisticsBuilder(io.trino.orc.metadata.statistics.DateStatisticsBuilder) TimeType(io.trino.spi.type.TimeType) OrcType(io.trino.orc.metadata.OrcType) Type(io.trino.spi.type.Type)
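
createColumnWriter recurses through the ORC type tree via getFieldTypeIndex, so every nested column gets a writer keyed by its own OrcColumnId. A small sketch of the same depth-first walk, using only the accessors that appear above, for callers that just need the reachable column ids; starting from the root column id (0 in an ORC file) it visits every column:

private static void collectColumnIds(OrcColumnId columnId, ColumnMetadata<OrcType> orcTypes, ImmutableList.Builder<OrcColumnId> output)
{
    output.add(columnId);
    OrcType orcType = orcTypes.get(columnId);
    // primitive types report no fields; LIST, MAP, and STRUCT expose their children here
    for (int field = 0; field < orcType.getFieldCount(); field++) {
        collectColumnIds(orcType.getFieldTypeIndex(field), orcTypes, output);
    }
}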

Example 4 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in the trino project by trinodb.

From class PresentOutputStream, method getStreamDataOutput:

public Optional<StreamDataOutput> getStreamDataOutput(OrcColumnId columnId) {
    checkArgument(closed);
    if (booleanOutputStream == null) {
        return Optional.empty();
    }
    StreamDataOutput streamDataOutput = booleanOutputStream.getStreamDataOutput(columnId);
    // rewrite the DATA stream created by the boolean output stream to a PRESENT stream
    Stream stream = new Stream(columnId, PRESENT, toIntExact(streamDataOutput.size()), streamDataOutput.getStream().isUseVInts());
    return Optional.of(new StreamDataOutput(sliceOutput -> {
        streamDataOutput.writeData(sliceOutput);
        return stream.getLength();
    }, stream));
}
Also used : OrcOutputBuffer(io.trino.orc.OrcOutputBuffer) PRESENT(io.trino.orc.metadata.Stream.StreamKind.PRESENT) BooleanStreamCheckpoint(io.trino.orc.checkpoint.BooleanStreamCheckpoint) CompressionKind(io.trino.orc.metadata.CompressionKind) Stream(io.trino.orc.metadata.Stream) ArrayList(java.util.ArrayList) Preconditions.checkState(com.google.common.base.Preconditions.checkState) List(java.util.List) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ClassLayout(org.openjdk.jol.info.ClassLayout) Optional(java.util.Optional) Math.toIntExact(java.lang.Math.toIntExact) Nullable(javax.annotation.Nullable) OrcColumnId(io.trino.orc.metadata.OrcColumnId)
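
A column writer that may or may not have recorded nulls typically places this optional PRESENT stream ahead of its data streams. A hedged sketch of such a call site, where presentStream, columnId, and dataStreams are assumed fields of a hypothetical writer:

List<StreamDataOutput> outputStreams = new ArrayList<>();
// the PRESENT stream is omitted entirely when the column contains no nulls
presentStream.getStreamDataOutput(columnId).ifPresent(outputStreams::add);
outputStreams.addAll(dataStreams);

Rewrapping the boolean DATA stream as a PRESENT stream, as the method above does, lets the class reuse BooleanOutputStream's encoding and only relabel the stream metadata.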

Example 5 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in the trino project by trinodb.

From class TestOrcReaderPositions, method testRowGroupSkipping:

@Test
public void testRowGroupSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        // create single strip file with multiple row groups
        int rowCount = 142_000;
        createSequentialFile(tempFile.getFile(), rowCount);
        // test reading two row groups from middle of file
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows == rowCount) {
                return true;
            }
            IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            return (stats.getMin() == 50_000) || (stats.getMin() == 60_000);
        };
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getFileRowCount(), rowCount);
            assertEquals(reader.getReaderRowCount(), rowCount);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);
            long position = 50_000;
            while (true) {
                Page page = reader.nextPage();
                if (page == null) {
                    break;
                }
                page = page.getLoadedPage();
                Block block = page.getBlock(0);
                for (int i = 0; i < block.getPositionCount(); i++) {
                    assertEquals(BIGINT.getLong(block, i), position + i);
                }
                assertEquals(reader.getFilePosition(), position);
                assertEquals(reader.getReaderPosition(), position);
                position += page.getPositionCount();
            }
            assertEquals(position, 70_000);
            assertEquals(reader.getFilePosition(), rowCount);
            assertEquals(reader.getReaderPosition(), rowCount);
        }
    }
}
Also used : OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) Slice(io.airlift.slice.Slice) Assert.assertNull(org.testng.Assert.assertNull) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics) Page(io.trino.spi.Page) Assert.assertEquals(org.testng.Assert.assertEquals) Writable(org.apache.hadoop.io.Writable) Test(org.testng.annotations.Test) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) ByteBuffer(java.nio.ByteBuffer) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) BATCH_SIZE_GROWTH_FACTOR(io.trino.orc.OrcReader.BATCH_SIZE_GROWTH_FACTOR) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) Block(io.trino.spi.block.Block) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) READER_OPTIONS(io.trino.orc.OrcTester.READER_OPTIONS) ORC_12(io.trino.orc.OrcTester.Format.ORC_12) NullMemoryManager(org.apache.orc.NullMemoryManager) ImmutableMap(com.google.common.collect.ImmutableMap) UTF_8(java.nio.charset.StandardCharsets.UTF_8) SNAPPY(org.apache.hadoop.hive.ql.io.orc.CompressionKind.SNAPPY) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) Assert.fail(org.testng.Assert.fail) IOException(java.io.IOException) OrcTester.createCustomOrcRecordReader(io.trino.orc.OrcTester.createCustomOrcRecordReader) OrcTester.createSettableStructObjectInspector(io.trino.orc.OrcTester.createSettableStructObjectInspector) Math.min(java.lang.Math.min) Field(java.lang.reflect.Field) Maps(com.google.common.collect.Maps) CompressionKind(io.trino.orc.metadata.CompressionKind) File(java.io.File) Footer(io.trino.orc.metadata.Footer) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) MAX_BATCH_SIZE(io.trino.orc.OrcReader.MAX_BATCH_SIZE) BIGINT(io.trino.spi.type.BigintType.BIGINT) Serializer(org.apache.hadoop.hive.serde2.Serializer) OrcTester.createOrcRecordWriter(io.trino.orc.OrcTester.createOrcRecordWriter) Assert.assertTrue(org.testng.Assert.assertTrue) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) OrcColumnId(io.trino.orc.metadata.OrcColumnId) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField)
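
Outside of tests, the same mechanism drives row-group pruning: the predicate receives statistics keyed by OrcColumnId for the file, stripe, or row group under consideration and decides whether it could match. A hedged sketch of a min/max pruning predicate for the sequential BIGINT column used above, assuming IntegerStatistics exposes getMax() alongside the getMin() already shown:

long target = 123_456L;
OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
    IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
    if (stats == null || stats.getMin() == null || stats.getMax() == null) {
        // without statistics nothing can be pruned, so read this unit
        return true;
    }
    return stats.getMin() <= target && target <= stats.getMax();
};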

Aggregations

OrcColumnId (io.trino.orc.metadata.OrcColumnId): 24
ImmutableMap (com.google.common.collect.ImmutableMap): 10
Stream (io.trino.orc.metadata.Stream): 9
ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics): 9
ArrayList (java.util.ArrayList): 9
OrcType (io.trino.orc.metadata.OrcType): 8
List (java.util.List): 8
ImmutableList (com.google.common.collect.ImmutableList): 7
Slice (io.airlift.slice.Slice): 5
CompressionKind (io.trino.orc.metadata.CompressionKind): 5
Map (java.util.Map): 5
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 4
Footer (io.trino.orc.metadata.Footer): 4
OrcInputStream (io.trino.orc.stream.OrcInputStream): 4
Page (io.trino.spi.Page): 4
IOException (java.io.IOException): 4
InputStream (java.io.InputStream): 4
ByteBuffer (java.nio.ByteBuffer): 3
Configuration (org.apache.hadoop.conf.Configuration): 3
Path (org.apache.hadoop.fs.Path): 3