
Example 6 with OrcDataSource

Use of io.trino.orc.OrcDataSource in the trino project by trinodb.

From the class TestOrcFileRewriter, method testRewriteWithoutMetadata:

@Test
public void testRewriteWithoutMetadata() throws Exception {
    List<Long> columnIds = ImmutableList.of(3L, 7L);
    List<Type> columnTypes = ImmutableList.of(BIGINT, createVarcharType(20));
    File file = temporary.resolve(randomUUID().toString()).toFile();
    try (OrcFileWriter writer = new OrcFileWriter(columnIds, columnTypes, file, false)) {
        List<Page> pages = rowPagesBuilder(columnTypes).row(123L, "hello").row(777L, "sky").build();
        writer.appendPages(pages);
    }
    try (OrcDataSource dataSource = fileOrcDataSource(file)) {
        OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes);
        assertEquals(reader.getReaderRowCount(), 2);
        assertEquals(reader.getFileRowCount(), 2);
        assertEquals(reader.getSplitLength(), file.length());
        Page page = reader.nextPage();
        assertEquals(page.getPositionCount(), 2);
        Block column0 = page.getBlock(0);
        assertEquals(column0.getPositionCount(), 2);
        for (int i = 0; i < 2; i++) {
            assertEquals(column0.isNull(i), false);
        }
        assertEquals(BIGINT.getLong(column0, 0), 123L);
        assertEquals(BIGINT.getLong(column0, 1), 777L);
        Block column1 = page.getBlock(1);
        assertEquals(column1.getPositionCount(), 2);
        for (int i = 0; i < 2; i++) {
            assertEquals(column1.isNull(i), false);
        }
        assertEquals(createVarcharType(20).getSlice(column1, 0), utf8Slice("hello"));
        assertEquals(createVarcharType(20).getSlice(column1, 1), utf8Slice("sky"));
        assertFalse(reader.getUserMetadata().containsKey(OrcFileMetadata.KEY));
    }
    BitSet rowsToDelete = new BitSet(5);
    rowsToDelete.set(1);
    File newFile = temporary.resolve(randomUUID().toString()).toFile();
    OrcFileInfo info = OrcFileRewriter.rewrite(file, newFile, rowsToDelete);
    assertEquals(info.getRowCount(), 1);
    assertEquals(info.getUncompressedSize(), 13);
    try (OrcDataSource dataSource = fileOrcDataSource(newFile)) {
        OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes);
        assertEquals(reader.getReaderRowCount(), 1);
        assertEquals(reader.getFileRowCount(), 1);
        assertEquals(reader.getSplitLength(), newFile.length());
        Page page = reader.nextPage();
        assertEquals(page.getPositionCount(), 1);
        Block column0 = page.getBlock(0);
        assertEquals(column0.getPositionCount(), 1);
        assertEquals(column0.isNull(0), false);
        assertEquals(BIGINT.getLong(column0, 0), 123L);
        Block column1 = page.getBlock(1);
        assertEquals(column1.getPositionCount(), 1);
        assertEquals(column1.isNull(0), false);
        assertEquals(createVarcharType(20).getSlice(column1, 0), utf8Slice("hello"));
        assertFalse(reader.getUserMetadata().containsKey(OrcFileMetadata.KEY));
    }
}
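
The test relies on the Raptor test helper fileOrcDataSource(file). For readers who want to open an ORC file without those helpers, here is a minimal sketch of the same idea; the FileOrcDataSource(File, OrcReaderOptions) constructor is an assumption based on recent Trino versions, and the class name OrcDataSourceSketch is purely illustrative.

// Hedged sketch: open a local ORC file as an OrcDataSource and create a reader.
// Assumes FileOrcDataSource(File, OrcReaderOptions) and OrcReader.createOrcReader(...)
// as found in recent Trino releases; adjust to the API of your Trino version.
import io.trino.orc.FileOrcDataSource;
import io.trino.orc.OrcDataSource;
import io.trino.orc.OrcReader;
import io.trino.orc.OrcReaderOptions;

import java.io.File;
import java.io.IOException;
import java.util.Optional;

public final class OrcDataSourceSketch
{
    private OrcDataSourceSketch() {}

    public static void inspect(File file)
            throws IOException
    {
        OrcReaderOptions options = new OrcReaderOptions();
        try (OrcDataSource dataSource = new FileOrcDataSource(file, options)) {
            // createOrcReader returns Optional.empty() for a zero-length file
            Optional<OrcReader> reader = OrcReader.createOrcReader(dataSource, options);
            reader.ifPresent(r -> System.out.println("ORC columns: " + r.getRootColumn().getNestedColumns()));
        }
    }
}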

Example 7 with OrcDataSource

Use of io.trino.orc.OrcDataSource in the trino project by trinodb.

From the class TestRaptorStorageManager, method testWriter:

@Test
public void testWriter() throws Exception {
    RaptorStorageManager manager = createRaptorStorageManager();
    List<Long> columnIds = ImmutableList.of(3L, 7L);
    List<Type> columnTypes = ImmutableList.of(BIGINT, createVarcharType(10));
    StoragePageSink sink = createStoragePageSink(manager, columnIds, columnTypes);
    List<Page> pages = rowPagesBuilder(columnTypes).row(123L, "hello").row(456L, "bye").build();
    sink.appendPages(pages);
    // shard is not recorded until flush
    assertEquals(shardRecorder.getShards().size(), 0);
    sink.flush();
    // shard is recorded after flush
    List<RecordedShard> recordedShards = shardRecorder.getShards();
    assertEquals(recordedShards.size(), 1);
    List<ShardInfo> shards = getFutureValue(sink.commit());
    assertEquals(shards.size(), 1);
    ShardInfo shardInfo = Iterables.getOnlyElement(shards);
    UUID shardUuid = shardInfo.getShardUuid();
    File file = storageService.getStorageFile(shardUuid);
    File backupFile = fileBackupStore.getBackupFile(shardUuid);
    assertEquals(recordedShards.get(0).getTransactionId(), TRANSACTION_ID);
    assertEquals(recordedShards.get(0).getShardUuid(), shardUuid);
    assertEquals(shardInfo.getRowCount(), 2);
    assertEquals(shardInfo.getCompressedSize(), file.length());
    assertEquals(shardInfo.getXxhash64(), xxhash64(file));
    // verify primary and backup shard exist
    assertFile(file, "primary shard");
    assertFile(backupFile, "backup shard");
    assertFileEquals(file, backupFile);
    // remove primary shard to force recovery from backup
    assertTrue(file.delete());
    assertTrue(file.getParentFile().delete());
    assertFalse(file.exists());
    recoveryManager.restoreFromBackup(shardUuid, shardInfo.getCompressedSize(), OptionalLong.of(shardInfo.getXxhash64()));
    try (OrcDataSource dataSource = manager.openShard(shardUuid, READER_OPTIONS)) {
        OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes);
        Page page = reader.nextPage();
        assertEquals(page.getPositionCount(), 2);
        Block column0 = page.getBlock(0);
        assertEquals(column0.isNull(0), false);
        assertEquals(column0.isNull(1), false);
        assertEquals(BIGINT.getLong(column0, 0), 123L);
        assertEquals(BIGINT.getLong(column0, 1), 456L);
        Block column1 = page.getBlock(1);
        assertEquals(createVarcharType(10).getSlice(column1, 0), utf8Slice("hello"));
        assertEquals(createVarcharType(10).getSlice(column1, 1), utf8Slice("bye"));
        assertNull(reader.nextPage());
    }
}
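
This test reads a single page and then asserts that the reader is exhausted; in general an OrcRecordReader is drained by looping until nextPage() returns null. A minimal sketch of that loop follows, reusing fileOrcDataSource and createReader, which are assumed to live together in the Raptor test class OrcTestingUtil; the class name OrcRowCounter is illustrative only.

// Hedged sketch: drain an OrcRecordReader page by page and count rows.
// fileOrcDataSource(...) and createReader(...) are the Raptor test helpers used
// in the examples above (createReader's location is assumed); only nextPage()
// and getPositionCount() are relied on here.
import io.trino.orc.OrcDataSource;
import io.trino.orc.OrcRecordReader;
import io.trino.spi.Page;
import io.trino.spi.type.Type;

import java.io.File;
import java.util.List;

import static io.trino.plugin.raptor.legacy.storage.OrcTestingUtil.createReader;
import static io.trino.plugin.raptor.legacy.storage.OrcTestingUtil.fileOrcDataSource;

final class OrcRowCounter
{
    private OrcRowCounter() {}

    static long countRows(File file, List<Long> columnIds, List<Type> columnTypes)
            throws Exception
    {
        long rows = 0;
        try (OrcDataSource dataSource = fileOrcDataSource(file)) {
            OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes);
            for (Page page = reader.nextPage(); page != null; page = reader.nextPage()) {
                rows += page.getPositionCount();
            }
        }
        return rows;
    }
}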

Example 8 with OrcDataSource

Use of io.trino.orc.OrcDataSource in the trino project by trinodb.

From the class OrcFileWriterFactory, method createFileWriter:

@Override
public Optional<FileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, OptionalInt bucketNumber, AcidTransaction transaction, boolean useAcidSchema, WriterKind writerKind) {
    if (!OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    CompressionKind compression = getCompression(schema, configuration);
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session))).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    if (transaction.isAcidDeleteOperation(writerKind)) {
        // For delete, set the "row" column to -1
        fileInputColumnIndexes[fileInputColumnIndexes.length - 1] = -1;
    }
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
        OrcDataSink orcDataSink = createOrcDataSink(fileSystem, path);
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSystem.getFileStatus(path).getLen(), new OrcReaderOptions(), fileSystem.open(path), readStats);
                } catch (IOException e) {
                    throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        if (transaction.isInsert() && useAcidSchema) {
            // Only add the ACID columns if the request is for insert-type operations; for delete operations,
            // the columns are added by the caller.  This is because the ACID columns for delete operations
            // depend on the rows being deleted, whereas the ACID columns for INSERT are completely determined
            // by bucket and writeId.
            Type rowType = createRowType(fileColumnNames, fileColumnTypes);
            fileColumnNames = ACID_COLUMN_NAMES;
            fileColumnTypes = createAcidColumnPrestoTypes(rowType);
        }
        return Optional.of(new OrcFileWriter(
                orcDataSink,
                writerKind,
                transaction,
                useAcidSchema,
                bucketNumber,
                rollbackAction,
                fileColumnNames,
                fileColumnTypes,
                createRootOrcType(fileColumnNames, fileColumnTypes),
                compression,
                getOrcWriterOptions(schema, orcWriterOptions)
                        .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                        .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                        .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                        .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)),
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .buildOrThrow(),
                validationInputFactory,
                getOrcOptimizedWriterValidateMode(session),
                stats));
    } catch (IOException e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
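
The fileInputColumnIndexes array built above is the subtle piece: for each column in file order it records that column's position in the writer's input, with -1 marking a column the input does not provide (and, for ACID deletes, the "row" column). A self-contained sketch of the same technique in plain Java follows; the class ColumnIndexMapping and the sample column names are purely illustrative.

// Hedged sketch of the column-reordering index used above: for each file column,
// find its position in the incoming page's column order; -1 means "not provided".
import java.util.Arrays;
import java.util.List;

final class ColumnIndexMapping
{
    private ColumnIndexMapping() {}

    static int[] buildIndexes(List<String> fileColumnNames, List<String> inputColumnNames)
    {
        // indexOf returns -1 for names missing from the input, matching the
        // sentinel the writer uses for absent (or ACID "row") columns.
        return fileColumnNames.stream()
                .mapToInt(inputColumnNames::indexOf)
                .toArray();
    }

    public static void main(String[] args)
    {
        int[] indexes = buildIndexes(
                List.of("ds", "user_id", "value"),   // column order in the file schema
                List.of("value", "user_id"));        // column order provided by the writer
        // indexes == [-1, 1, 0]: "ds" is absent, "user_id" is input column 1, "value" is 0
        System.out.println(Arrays.toString(indexes));
    }
}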

Example 9 with OrcDataSource

Use of io.trino.orc.OrcDataSource in the trino project by trinodb.

From the class OrcDeleteDeltaPageSource, method createOrcDeleteDeltaPageSource:

public static Optional<ConnectorPageSource> createOrcDeleteDeltaPageSource(Path path, long fileSize, OrcReaderOptions options, ConnectorIdentity identity, Configuration configuration, HdfsEnvironment hdfsEnvironment, FileFormatDataSourceStats stats) {
    OrcDataSource orcDataSource;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options, inputStream, stats);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, openError(e, path), e);
    }
    try {
        Optional<OrcReader> orcReader = createOrcReader(orcDataSource, options);
        if (orcReader.isPresent()) {
            return Optional.of(new OrcDeleteDeltaPageSource(path, fileSize, orcReader.get(), orcDataSource, stats));
        }
        return Optional.empty();
    } catch (Exception e) {
        try {
            orcDataSource.close();
        } catch (IOException ex) {
            e.addSuppressed(ex);
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = openError(e, path);
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
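
The second try/catch above follows a common cleanup pattern: if building the reader fails after the data source is already open, close the data source and attach any close failure as a suppressed exception so that neither error is lost. A generic sketch of that pattern, using only java.io types, is shown below; the class CloseOnFailure and its Builder interface are illustrative, not part of Trino.

// Hedged sketch of the "close on failure, keep both errors" pattern used above.
import java.io.Closeable;
import java.io.IOException;

final class CloseOnFailure
{
    private CloseOnFailure() {}

    static <T extends Closeable, R> R buildOrClose(T resource, Builder<T, R> builder)
            throws IOException
    {
        try {
            return builder.build(resource);
        }
        catch (Exception e) {
            try {
                resource.close();
            }
            catch (IOException closeFailure) {
                // Preserve the close failure without masking the original error
                e.addSuppressed(closeFailure);
            }
            if (e instanceof IOException) {
                throw (IOException) e;
            }
            throw new RuntimeException(e);
        }
    }

    interface Builder<T, R>
    {
        R build(T resource) throws Exception;
    }
}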

Example 10 with OrcDataSource

Use of io.trino.orc.OrcDataSource in the trino project by trinodb.

From the class IcebergPageSourceProvider, method createOrcPageSource:

private static ReaderPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long fileSize, List<IcebergColumnHandle> columns, TupleDomain<IcebergColumnHandle> effectivePredicate, OrcReaderOptions options, FileFormatDataSourceStats stats, TypeManager typeManager, Optional<NameMapping> nameMapping) {
    OrcDataSource orcDataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options, inputStream, stats);
        OrcReader reader = OrcReader.createOrcReader(orcDataSource, options).orElseThrow(() -> new TrinoException(ICEBERG_BAD_DATA, "ORC file is zero length"));
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        if (nameMapping.isPresent() && !hasIds(reader.getRootColumn())) {
            fileColumns = fileColumns.stream().map(orcColumn -> setMissingFieldIds(orcColumn, nameMapping.get(), ImmutableList.of(orcColumn.getColumnName()))).collect(toImmutableList());
        }
        Map<Integer, OrcColumn> fileColumnsByIcebergId = mapIdsToOrcFileColumns(fileColumns);
        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(options.isBloomFiltersEnabled());
        Map<IcebergColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        Optional<ReaderColumns> columnProjections = projectColumns(columns);
        Map<Integer, List<List<Integer>>> projectionsByFieldId = columns.stream().collect(groupingBy(column -> column.getBaseColumnIdentity().getId(), mapping(IcebergColumnHandle::getPath, toUnmodifiableList())));
        List<IcebergColumnHandle> readColumns = columnProjections.map(readerColumns -> (List<IcebergColumnHandle>) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())).orElse(columns);
        List<OrcColumn> fileReadColumns = new ArrayList<>(readColumns.size());
        List<Type> fileReadTypes = new ArrayList<>(readColumns.size());
        List<ProjectedLayout> projectedLayouts = new ArrayList<>(readColumns.size());
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(readColumns.size());
        for (IcebergColumnHandle column : readColumns) {
            verify(column.isBaseColumn(), "Column projections must be based from a root column");
            OrcColumn orcColumn = fileColumnsByIcebergId.get(column.getId());
            if (orcColumn != null) {
                Type readType = getOrcReadType(column.getType(), typeManager);
                if (column.getType() == UUID && !"UUID".equals(orcColumn.getAttributes().get(ICEBERG_BINARY_TYPE))) {
                    throw new TrinoException(ICEBERG_BAD_DATA, format("Expected ORC column for UUID data to be annotated with %s=UUID: %s", ICEBERG_BINARY_TYPE, orcColumn));
                }
                List<List<Integer>> fieldIdProjections = projectionsByFieldId.get(column.getId());
                ProjectedLayout projectedLayout = IcebergOrcProjectedLayout.createProjectedLayout(orcColumn, fieldIdProjections);
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                projectedLayouts.add(projectedLayout);
                for (Map.Entry<IcebergColumnHandle, Domain> domainEntry : effectivePredicateDomains.entrySet()) {
                    IcebergColumnHandle predicateColumn = domainEntry.getKey();
                    OrcColumn predicateOrcColumn = fileColumnsByIcebergId.get(predicateColumn.getId());
                    if (predicateOrcColumn != null && column.getColumnIdentity().equals(predicateColumn.getBaseColumnIdentity())) {
                        predicateBuilder.addColumn(predicateOrcColumn.getColumnId(), domainEntry.getValue());
                    }
                }
            } else {
                columnAdaptations.add(ColumnAdaptation.nullColumn(column.getType()));
            }
        }
        AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
        OrcDataSourceId orcDataSourceId = orcDataSource.getId();
        OrcRecordReader recordReader = reader.createRecordReader(fileReadColumns, fileReadTypes, projectedLayouts, predicateBuilder.build(), start, length, UTC, memoryUsage, INITIAL_BATCH_SIZE, exception -> handleException(orcDataSourceId, exception), new IdBasedFieldMapperFactory(readColumns));
        return new ReaderPageSource(new OrcPageSource(recordReader, columnAdaptations, orcDataSource, Optional.empty(), Optional.empty(), memoryUsage, stats), columnProjections);
    } catch (Exception e) {
        if (orcDataSource != null) {
            try {
                orcDataSource.close();
            } catch (IOException ignored) {
            }
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
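
The projectionsByFieldId line above groups each requested column's dereference path under its base column id using Collectors.groupingBy and mapping. A stripped-down sketch of that same combination follows, with a plain record standing in for IcebergColumnHandle; the class ProjectionGrouping, the record, and the sample ids are illustrative only.

// Hedged sketch of grouping projection paths by base column id, as done above
// with IcebergColumnHandle; here a simple record stands in for the handle.
import java.util.List;
import java.util.Map;

import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.mapping;
import static java.util.stream.Collectors.toUnmodifiableList;

final class ProjectionGrouping
{
    private ProjectionGrouping() {}

    // baseId identifies the top-level column; path is the field-id path of a nested projection
    record ColumnProjection(int baseId, List<Integer> path) {}

    static Map<Integer, List<List<Integer>>> byBaseColumn(List<ColumnProjection> columns)
    {
        return columns.stream()
                .collect(groupingBy(ColumnProjection::baseId,
                        mapping(ColumnProjection::path, toUnmodifiableList())));
    }

    public static void main(String[] args)
    {
        Map<Integer, List<List<Integer>>> grouped = byBaseColumn(List.of(
                new ColumnProjection(1, List.of()),        // whole column 1
                new ColumnProjection(2, List.of(4)),       // column 2, nested field 4
                new ColumnProjection(2, List.of(5, 7))));  // column 2, nested field 5.7
        System.out.println(grouped); // e.g. {1=[[]], 2=[[4], [5, 7]]} (map order may vary)
    }
}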
