Search in sources :

Example 6 with ConnectorPageSource

use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

the class ParquetPageSourceFactory method createPageSource.

/**
 * Opens a Parquet file and builds a page source over the part of it selected by one split.
 * <p>
 * The method reads the file footer, derives the requested Parquet schema from {@code columns},
 * keeps only the blocks whose first data page offset lies in {@code [start, start + length)}
 * and that pass {@code predicateMatches}, creates a {@link ParquetReader} over those blocks and
 * wraps it in a {@link ParquetPageSource}.
 * <p>
 * This method is public and static so that other callers can use it directly.
 *
 * @param path location of the Parquet file to open
 * @param start lower bound (inclusive) used when selecting blocks by first data page offset
 * @param length size of the selection window; the upper bound {@code start + length} is exclusive
 * @param estimatedFileSize file size estimate handed to the {@code HdfsParquetDataSource}
 * @param columns the columns requested by the caller
 * @param effectivePredicate predicate used for block pruning; entries on non-base columns are dropped
 * @param useColumnNames when {@code true} the Parquet field is looked up by base column name,
 *        otherwise by the base Hive column index within the file schema
 * @param hdfsEnvironment used to obtain the file system and to open the file as {@code identity}
 * @param configuration Hadoop configuration passed to {@code hdfsEnvironment.getFileSystem}
 * @param identity identity under which the file is opened
 * @param timeZone passed to {@code buildPredicate} and to the {@link ParquetReader}
 * @param stats passed to the {@code HdfsParquetDataSource}
 * @param options reader options; when {@code isIgnoreStatistics()} is set no tuple domain is applied
 * @return a {@link ReaderPageSource} holding the Parquet page source and the projections returned
 *         by {@code projectBaseColumns}
 * @throws TrinoException with {@code HIVE_BAD_DATA} for a {@link ParquetCorruptionException},
 *         {@code HIVE_MISSING_DATA} for a {@link BlockMissingException}, and
 *         {@code HIVE_CANNOT_OPEN_SPLIT} for any other failure while opening the file;
 *         a {@link TrinoException} raised while opening is rethrown unchanged
 * @throws IllegalArgumentException if a base column is neither {@code PARQUET_ROW_INDEX_COLUMN}
 *         nor of type {@code REGULAR}
 */
public static ReaderPageSource createPageSource(Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, boolean useColumnNames, HdfsEnvironment hdfsEnvironment, Configuration configuration, ConnectorIdentity identity, DateTimeZone timeZone, FileFormatDataSourceStats stats, ParquetReaderOptions options) {
    // Ignore predicates on partial columns for now: only predicates on base columns are kept.
    effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());
    // These are assigned inside the try block but are needed after it to build the page source.
    MessageType fileSchema;
    MessageType requestedSchema;
    MessageColumnIO messageColumn;
    ParquetReader parquetReader;
    // Declared outside the try block so the catch block can close it when opening fails.
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), estimatedFileSize, inputStream, stats, options);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        fileSchema = fileMetaData.getSchema();
        // Take the projected columns if projectSufficientColumns returns a projection, otherwise the
        // requested columns as-is. Each REGULAR column for which getColumnType yields a type becomes
        // a single-field MessageType, and those are unioned into one requested schema.
        Optional<MessageType> message = projectSufficientColumns(columns).map(projection -> projection.get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns).stream().filter(column -> column.getColumnType() == REGULAR).map(column -> getColumnType(column, fileSchema, useColumnNames)).filter(Optional::isPresent).map(Optional::get).map(type -> new MessageType(fileSchema.getName(), type)).reduce(MessageType::union);
        // Fall back to a schema without fields when no column type could be resolved.
        requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        messageColumn = getColumnIO(fileSchema, requestedSchema);
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        // With statistics ignored, the tuple domain accepts everything.
        TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics() ? TupleDomain.all() : getParquetTupleDomain(descriptorsByPath, effectivePredicate, fileSchema, useColumnNames);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone);
        // Running total of the row counts of all blocks seen so far; recorded for each selected block.
        // NOTE(review): presumably the file-level index of the block's first row - confirm against ParquetReader.
        long nextStart = 0;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        ImmutableList.Builder<Long> blockStarts = ImmutableList.builder();
        ImmutableList.Builder<Optional<ColumnIndexStore>> columnIndexes = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            // The first data page offset of the block's first column decides which split owns the block.
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
            // Keep the block only if it starts inside [start, start + length) and matches the predicate.
            if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndex)) {
                blocks.add(block);
                blockStarts.add(nextStart);
                columnIndexes.add(columnIndex);
            }
            // Advanced for every block, including the ones that were not selected.
            nextStart += block.getRowCount();
        }
        parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumn, blocks.build(), Optional.of(blockStarts.build()), dataSource, timeZone, newSimpleAggregatedMemoryContext(), options, parquetPredicate, columnIndexes.build());
    } catch (Exception e) {
        // Release the data source before translating the failure.
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
            // A failure to close is dropped so that the original exception is the one reported.
        }
        // Already a TrinoException: propagate unchanged.
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, e);
        }
        // A closed file system (recognized by its message) or a missing file is reported without the split details.
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
    // From here on the reader exists; what remains is mapping the base columns to Trino types and Parquet fields.
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    List<HiveColumnHandle> baseColumns = readerProjections.map(projection -> projection.get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns);
    for (HiveColumnHandle column : baseColumns) {
        // The row index column is recognized by reference identity; every other column must be REGULAR.
        checkArgument(column == PARQUET_ROW_INDEX_COLUMN || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column);
    }
    // The three builders are filled in lockstep: one entry per base column, in the same order.
    ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
    ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
    ImmutableList.Builder<Boolean> rowIndexColumns = ImmutableList.builder();
    for (HiveColumnHandle column : baseColumns) {
        trinoTypes.add(column.getBaseType());
        rowIndexColumns.add(column == PARQUET_ROW_INDEX_COLUMN);
        if (column == PARQUET_ROW_INDEX_COLUMN) {
            // The row index column has no backing Parquet field.
            internalFields.add(Optional.empty());
        } else {
            // If getParquetType returns null the field stays empty; otherwise the column is resolved
            // by name or by position in the file schema, depending on useColumnNames.
            internalFields.add(Optional.ofNullable(getParquetType(column, fileSchema, useColumnNames)).flatMap(field -> {
                String columnName = useColumnNames ? column.getBaseColumnName() : fileSchema.getFields().get(column.getBaseHiveColumnIndex()).getName();
                return constructField(column.getBaseType(), lookupColumnByName(messageColumn, columnName));
            }));
        }
    }
    ConnectorPageSource parquetPageSource = new ParquetPageSource(parquetReader, trinoTypes.build(), rowIndexColumns.build(), internalFields.build());
    return new ReaderPageSource(parquetPageSource, readerProjections);
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) HIVE_MISSING_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CANNOT_OPEN_SPLIT(io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) MetadataReader(io.trino.parquet.reader.MetadataReader) HiveSessionProperties.isParquetUseColumnIndex(io.trino.plugin.hive.HiveSessionProperties.isParquetUseColumnIndex) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) HiveSessionProperties.isUseParquetColumnNames(io.trino.plugin.hive.HiveSessionProperties.isUseParquetColumnNames) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) HiveParquetColumnIOConverter.constructField(io.trino.plugin.hive.parquet.HiveParquetColumnIOConverter.constructField) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HiveUtil.getDeserializerClassName(io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) ImmutableSet(com.google.common.collect.ImmutableSet) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) Domain(io.trino.spi.predicate.Domain) ParquetReader(io.trino.parquet.reader.ParquetReader) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) 
ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) HiveSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.hive.HiveSessionProperties.getParquetMaxReadBlockSize) BIGINT(io.trino.spi.type.BigintType.BIGINT) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Entry(java.util.Map.Entry) Optional(java.util.Optional) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) OptionalInt(java.util.OptionalInt) HiveSessionProperties.isParquetIgnoreStatistics(io.trino.plugin.hive.HiveSessionProperties.isParquetIgnoreStatistics) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) Inject(javax.inject.Inject) HashSet(java.util.HashSet) HiveType(io.trino.plugin.hive.HiveType) ParquetTypeUtils.lookupColumnByName(io.trino.parquet.ParquetTypeUtils.lookupColumnByName) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ParquetTypeUtils.getParquetTypeByName(io.trino.parquet.ParquetTypeUtils.getParquetTypeByName) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) Predicate(io.trino.parquet.predicate.Predicate) 
HIVE_BAD_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) HivePageSourceProvider.projectSufficientColumns(io.trino.plugin.hive.HivePageSourceProvider.projectSufficientColumns) Properties(java.util.Properties) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) AcidInfo(io.trino.plugin.hive.AcidInfo) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) Field(io.trino.parquet.Field) ParquetDataSource(io.trino.parquet.ParquetDataSource) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ImmutableList(com.google.common.collect.ImmutableList) FileNotFoundException(java.io.FileNotFoundException) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Predicate(io.trino.parquet.predicate.Predicate) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) FileSystem(org.apache.hadoop.fs.FileSystem) 
ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) List(java.util.List) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) ImmutableList(com.google.common.collect.ImmutableList) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) MessageType(org.apache.parquet.schema.MessageType) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) ParquetDataSource(io.trino.parquet.ParquetDataSource) Optional(java.util.Optional) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ParquetReader(io.trino.parquet.reader.ParquetReader) IOException(java.io.IOException) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) TrinoException(io.trino.spi.TrinoException) FileNotFoundException(java.io.FileNotFoundException) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) IOException(java.io.IOException) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(io.trino.spi.type.Type) HiveType(io.trino.plugin.hive.HiveType) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) TrinoException(io.trino.spi.TrinoException) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ReaderColumns(io.trino.plugin.hive.ReaderColumns)

Example 7 with ConnectorPageSource

use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

the class RcFilePageSourceFactory method createPageSource.

/**
 * Creates a page source for one split of an RCFile, provided the table's deserializer is one
 * this factory handles.
 * <p>
 * Files whose estimated size is below {@code BUFFER_SIZE} are read into memory through their
 * tail, which also corrects an imprecise file length; larger files are read through the open
 * HDFS stream. Every stream or data source opened here is closed again on all paths that do
 * not hand it over to the returned page source.
 *
 * @param configuration Hadoop configuration used to obtain the file system and the codec class loader
 * @param session session providing the identity and the timestamp precision
 * @param path location of the RCFile
 * @param start offset of the split within the file
 * @param length length of the split; clamped to the actual file size once that is known
 * @param estimatedFileSize file size estimate, used to choose between in-memory and streamed reading
 * @param schema table properties from which the deserializer class name is read
 * @param columns the columns requested by the caller
 * @param effectivePredicate not used by this implementation
 * @param acidInfo must be empty; ACID is not supported
 * @param bucketNumber not used by this implementation
 * @param originalFile not used by this implementation
 * @param transaction not used by this implementation
 * @return {@link Optional#empty()} if the deserializer is neither {@code LazyBinaryColumnarSerDe}
 *         nor {@code ColumnarSerDe}; an empty page source if the split contains no data;
 *         otherwise a page source reading the split
 * @throws TrinoException with {@code HIVE_CANNOT_OPEN_SPLIT}, {@code HIVE_BAD_DATA} or
 *         {@code HIVE_MISSING_DATA} when the file cannot be opened or read
 * @throws IllegalArgumentException if {@code acidInfo} is present
 */
@Override
public Optional<ReaderPageSource> createPageSource(Configuration configuration, ConnectorSession session, Path path, long start, long length, long estimatedFileSize, Properties schema, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, Optional<AcidInfo> acidInfo, OptionalInt bucketNumber, boolean originalFile, AcidTransaction transaction) {
    RcFileEncoding rcFileEncoding;
    String deserializerClassName = getDeserializerClassName(schema);
    if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
        rcFileEncoding = new BinaryRcFileEncoding(timeZone);
    } else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
        rcFileEncoding = createTextVectorEncoding(schema);
    } else {
        // Not an RCFile serde this factory handles.
        return Optional.empty();
    }
    checkArgument(acidInfo.isEmpty(), "Acid is not supported");
    List<HiveColumnHandle> projectedReaderColumns = columns;
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    if (readerProjections.isPresent()) {
        projectedReaderColumns = readerProjections.get().get().stream().map(HiveColumnHandle.class::cast).collect(toImmutableList());
    }
    RcFileDataSource dataSource;
    // Declared outside the try block so that the catch block can release the stream when a later
    // step (file status lookup, data source construction) fails.
    FSDataInputStream inputStream = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
        inputStream = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(path));
        if (estimatedFileSize < BUFFER_SIZE.toBytes()) {
            // Handle potentially imprecise file lengths by reading the footer
            try {
                FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(path.toString(), estimatedFileSize, inputStream, toIntExact(BUFFER_SIZE.toBytes()));
                dataSource = new MemoryRcFileDataSource(new RcFileDataSourceId(path.toString()), fileTail.getTailSlice());
            } finally {
                // The tail is fully in memory (or reading failed); the stream is no longer needed.
                inputStream.close();
            }
        } else {
            long fileSize = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(path).getLen());
            dataSource = new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats);
        }
    } catch (Exception e) {
        // No data source took ownership of the stream, so it has to be released here. Closing an
        // already closed stream has no effect, which makes this safe for the in-memory branch too.
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (IOException ignored) {
                // Best effort: the original failure is the one that gets reported.
            }
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    length = min(dataSource.getSize() - start, length);
    // Split may be empty now that the correct file size is known
    if (length <= 0) {
        // Nothing will ever read from the data source, so release it before returning.
        try {
            dataSource.close();
        } catch (IOException ignored) {
            // Best effort: there is no data to read, so a close failure does not affect the result.
        }
        return Optional.of(noProjectionAdaptation(new EmptyPageSource()));
    }
    try {
        ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
        HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
        for (HiveColumnHandle column : projectedReaderColumns) {
            readColumns.put(column.getBaseHiveColumnIndex(), column.getHiveType().getType(typeManager, timestampPrecision));
        }
        RcFileReader rcFileReader = new RcFileReader(dataSource, rcFileEncoding, readColumns.buildOrThrow(), new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())), start, length, BUFFER_SIZE);
        ConnectorPageSource pageSource = new RcFilePageSource(rcFileReader, projectedReaderColumns);
        return Optional.of(new ReaderPageSource(pageSource, readerProjections));
    } catch (Throwable e) {
        // The reader could not be created; release the data source before translating the failure.
        try {
            dataSource.close();
        } catch (IOException ignored) {
            // Best effort: the original failure is the one that gets reported.
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof RcFileCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) BinaryRcFileEncoding(io.trino.rcfile.binary.BinaryRcFileEncoding) TextRcFileEncoding(io.trino.rcfile.text.TextRcFileEncoding) RcFileEncoding(io.trino.rcfile.RcFileEncoding) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) RcFileCorruptionException(io.trino.rcfile.RcFileCorruptionException) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) RcFileDataSourceId(io.trino.rcfile.RcFileDataSourceId) HiveTimestampPrecision(io.trino.plugin.hive.HiveTimestampPrecision) IOException(java.io.IOException) FSDataInputStreamTail(io.trino.plugin.hive.util.FSDataInputStreamTail) RcFileReader(io.trino.rcfile.RcFileReader) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) TrinoException(io.trino.spi.TrinoException) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) RcFileCorruptionException(io.trino.rcfile.RcFileCorruptionException) ImmutableMap(com.google.common.collect.ImmutableMap) Type(io.trino.spi.type.Type) MemoryRcFileDataSource(io.trino.rcfile.MemoryRcFileDataSource) HadoopCodecFactory(io.trino.rcfile.HadoopCodecFactory) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) TrinoException(io.trino.spi.TrinoException) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ReaderColumns(io.trino.plugin.hive.ReaderColumns) BinaryRcFileEncoding(io.trino.rcfile.binary.BinaryRcFileEncoding) AircompressorCodecFactory(io.trino.rcfile.AircompressorCodecFactory) RcFileDataSource(io.trino.rcfile.RcFileDataSource) MemoryRcFileDataSource(io.trino.rcfile.MemoryRcFileDataSource)

Example 8 with ConnectorPageSource

use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

the class AbstractTestHive method assertGetRecords.

/**
 * Opens a page source over the split returned by {@code getHiveSplit} for the given table and
 * hands it, together with the table metadata and column handles, to the overload that takes
 * the page source.
 *
 * @param tableName name of the table inside the test database
 * @param hiveStorageFormat storage format forwarded to the verifying overload
 * @throws Exception if any step of the lookup or the verification fails
 */
protected void assertGetRecords(String tableName, HiveStorageFormat hiveStorageFormat) throws Exception {
    try (Transaction tx = newTransaction()) {
        ConnectorSession connectorSession = newSession();
        ConnectorMetadata connectorMetadata = tx.getMetadata();
        connectorMetadata.beginQuery(connectorSession);

        // Resolve the table and everything needed to read it
        SchemaTableName qualifiedName = new SchemaTableName(database, tableName);
        ConnectorTableHandle handle = getTableHandle(connectorMetadata, qualifiedName);
        ConnectorTableMetadata tableInfo = connectorMetadata.getTableMetadata(connectorSession, handle);
        HiveSplit split = getHiveSplit(handle, tx, connectorSession);
        List<ColumnHandle> allColumns = ImmutableList.copyOf(connectorMetadata.getColumnHandles(connectorSession, handle).values());

        // Read the split and delegate the actual checks
        ConnectorPageSource source = pageSourceProvider.createPageSource(tx.getTransactionHandle(), connectorSession, split, handle, allColumns, DynamicFilter.EMPTY);
        assertGetRecords(hiveStorageFormat, tableInfo, split, source, allColumns);
    }
}
Also used : HiveColumnHandle.bucketColumnHandle(io.trino.plugin.hive.HiveColumnHandle.bucketColumnHandle) ColumnHandle(io.trino.spi.connector.ColumnHandle) ConnectorSession(io.trino.spi.connector.ConnectorSession) TestingConnectorSession(io.trino.testing.TestingConnectorSession) ConnectorMetadata(io.trino.spi.connector.ConnectorMetadata) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) CatalogSchemaTableName(io.trino.spi.connector.CatalogSchemaTableName) SchemaTableName(io.trino.spi.connector.SchemaTableName) ConnectorTableMetadata(io.trino.spi.connector.ConnectorTableMetadata) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle)

Example 9 with ConnectorPageSource

use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

the class AbstractTestHive method doTestBucketSortedTables.

/**
 * Creates a bucketed table sorted by {@code value_asc ASC, value_desc DESC}, writes 50 pages of
 * 1000 random rows into it, and verifies that
 * <ul>
 * <li>enough temporary sort files are produced to require multiple merge passes,</li>
 * <li>no temporary sort files remain after the sink has finished,</li>
 * <li>the table exposes its local sorting properties only when the corresponding session
 * properties are set, and</li>
 * <li>every split is read back in sort order and the total row count matches.</li>
 * </ul>
 *
 * @param table name of the table to create
 * @throws IOException if reading the table back fails
 */
private void doTestBucketSortedTables(SchemaTableName table) throws IOException {
    int bucketCount = 3;
    int expectedRowCount = 0;
    try (Transaction transaction = newTransaction()) {
        ConnectorSession session = newSession();
        ConnectorMetadata metadata = transaction.getMetadata();
        // begin creating the table
        ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(table, ImmutableList.<ColumnMetadata>builder().add(new ColumnMetadata("id", VARCHAR)).add(new ColumnMetadata("value_asc", VARCHAR)).add(new ColumnMetadata("value_desc", BIGINT)).add(new ColumnMetadata("ds", VARCHAR)).build(), ImmutableMap.<String, Object>builder().put(STORAGE_FORMAT_PROPERTY, RCBINARY).put(PARTITIONED_BY_PROPERTY, ImmutableList.of("ds")).put(BUCKETED_BY_PROPERTY, ImmutableList.of("id")).put(BUCKET_COUNT_PROPERTY, bucketCount).put(SORTED_BY_PROPERTY, ImmutableList.builder().add(new SortingColumn("value_asc", ASCENDING)).add(new SortingColumn("value_desc", DESCENDING)).build()).buildOrThrow());
        ConnectorOutputTableHandle outputHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty(), NO_RETRIES);
        // write the data
        ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, outputHandle);
        List<Type> types = tableMetadata.getColumns().stream().map(ColumnMetadata::getType).collect(toList());
        ThreadLocalRandom random = ThreadLocalRandom.current();
        for (int i = 0; i < 50; i++) {
            MaterializedResult.Builder builder = MaterializedResult.resultBuilder(session, types);
            for (int j = 0; j < 1000; j++) {
                // value_asc has only 100 distinct values, so runs of equal value_asc are common
                builder.row(sha256().hashLong(random.nextLong()).toString(), "test" + random.nextInt(100), random.nextLong(100_000), "2018-04-01");
                expectedRowCount++;
            }
            sink.appendPage(builder.build().toPage());
        }
        HdfsContext context = new HdfsContext(session);
        // verify we have enough temporary files per bucket to require multiple passes
        Path stagingPathRoot;
        if (isTemporaryStagingDirectoryEnabled(session)) {
            stagingPathRoot = new Path(getTemporaryStagingDirectoryPath(session).replace("${USER}", context.getIdentity().getUser()));
        } else {
            stagingPathRoot = getStagingPathRoot(outputHandle);
        }
        assertThat(listAllDataFiles(context, stagingPathRoot)).filteredOn(file -> file.contains(".tmp-sort.")).size().isGreaterThan(bucketCount * getHiveConfig().getMaxOpenSortFiles() * 2);
        // finish the write
        Collection<Slice> fragments = getFutureValue(sink.finish());
        // verify there are no temporary files
        for (String file : listAllDataFiles(context, stagingPathRoot)) {
            assertThat(file).doesNotContain(".tmp-sort.");
        }
        // finish creating table
        metadata.finishCreateTable(session, outputHandle, fragments, ImmutableList.of());
        transaction.commit();
    }
    // verify that bucket files are sorted
    try (Transaction transaction = newTransaction()) {
        ConnectorMetadata metadata = transaction.getMetadata();
        ConnectorSession session = newSession();
        metadata.beginQuery(session);
        ConnectorTableHandle tableHandle = getTableHandle(metadata, table);
        List<ColumnHandle> columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values());
        // verify local sorting property
        ConnectorTableProperties properties = metadata.getTableProperties(newSession(ImmutableMap.of("propagate_table_scan_sorting_properties", true, "bucket_execution_enabled", false)), tableHandle);
        Map<String, Integer> columnIndex = indexColumns(columnHandles);
        assertEquals(properties.getLocalProperties(), ImmutableList.of(new SortingProperty<>(columnHandles.get(columnIndex.get("value_asc")), ASC_NULLS_FIRST), new SortingProperty<>(columnHandles.get(columnIndex.get("value_desc")), DESC_NULLS_LAST)));
        // with default session properties the sorting is not exposed
        assertThat(metadata.getTableProperties(newSession(), tableHandle).getLocalProperties()).isEmpty();
        List<ConnectorSplit> splits = getAllSplits(tableHandle, transaction, session);
        assertThat(splits).hasSize(bucketCount);
        int actualRowCount = 0;
        for (ConnectorSplit split : splits) {
            try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles, DynamicFilter.EMPTY)) {
                // Values of the previous row within this split; lastValueDesc is only read once
                // lastValueAsc is non-null, i.e. after the first row has been recorded.
                String lastValueAsc = null;
                long lastValueDesc = -1;
                while (!pageSource.isFinished()) {
                    Page page = pageSource.getNextPage();
                    if (page == null) {
                        continue;
                    }
                    // The blocks are the same for every row of the page, so look them up once
                    Block blockAsc = page.getBlock(1);
                    Block blockDesc = page.getBlock(2);
                    for (int i = 0; i < page.getPositionCount(); i++) {
                        assertFalse(blockAsc.isNull(i));
                        assertFalse(blockDesc.isNull(i));
                        String valueAsc = VARCHAR.getSlice(blockAsc, i).toStringUtf8();
                        long valueDesc = BIGINT.getLong(blockDesc, i);
                        if (lastValueAsc != null) {
                            // primary sort key: ascending
                            assertGreaterThanOrEqual(valueAsc, lastValueAsc);
                            if (valueAsc.equals(lastValueAsc)) {
                                // secondary sort key: descending within a run of equal primary keys.
                                // The previous row's value is always tracked, so the first pair of
                                // each run is compared as well.
                                assertLessThanOrEqual(valueDesc, lastValueDesc);
                            }
                        }
                        lastValueAsc = valueAsc;
                        lastValueDesc = valueDesc;
                        actualRowCount++;
                    }
                }
            }
        }
        assertThat(actualRowCount).isEqualTo(expectedRowCount);
    }
}
Also used : ColumnMetadata(io.trino.spi.connector.ColumnMetadata) Page(io.trino.spi.Page) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) ConnectorSession(io.trino.spi.connector.ConnectorSession) TestingConnectorSession(io.trino.testing.TestingConnectorSession) ConnectorMetadata(io.trino.spi.connector.ConnectorMetadata) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) ConnectorTableMetadata(io.trino.spi.connector.ConnectorTableMetadata) Path(org.apache.hadoop.fs.Path) HiveSessionProperties.getTemporaryStagingDirectoryPath(io.trino.plugin.hive.HiveSessionProperties.getTemporaryStagingDirectoryPath) HiveColumnHandle.bucketColumnHandle(io.trino.plugin.hive.HiveColumnHandle.bucketColumnHandle) ColumnHandle(io.trino.spi.connector.ColumnHandle) SortingColumn(io.trino.plugin.hive.metastore.SortingColumn) SortingProperty(io.trino.spi.connector.SortingProperty) Constraint(io.trino.spi.connector.Constraint) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) MapType(io.trino.spi.type.MapType) HiveTestUtils.mapType(io.trino.plugin.hive.HiveTestUtils.mapType) VarcharType.createVarcharType(io.trino.spi.type.VarcharType.createVarcharType) HiveTestUtils.arrayType(io.trino.plugin.hive.HiveTestUtils.arrayType) HiveTestUtils.rowType(io.trino.plugin.hive.HiveTestUtils.rowType) CharType.createCharType(io.trino.spi.type.CharType.createCharType) HiveType.toHiveType(io.trino.plugin.hive.HiveType.toHiveType) DecimalType.createDecimalType(io.trino.spi.type.DecimalType.createDecimalType) CharType(io.trino.spi.type.CharType) TableType(org.apache.hadoop.hive.metastore.TableType) RowType(io.trino.spi.type.RowType) ArrayType(io.trino.spi.type.ArrayType) Type(io.trino.spi.type.Type) VarcharType.createUnboundedVarcharType(io.trino.spi.type.VarcharType.createUnboundedVarcharType) VarcharType(io.trino.spi.type.VarcharType) ConnectorOutputTableHandle(io.trino.spi.connector.ConnectorOutputTableHandle) 
Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) Block(io.trino.spi.block.Block) ConnectorPageSink(io.trino.spi.connector.ConnectorPageSink) MaterializedResult(io.trino.testing.MaterializedResult) ConnectorTableProperties(io.trino.spi.connector.ConnectorTableProperties) ConnectorSplit(io.trino.spi.connector.ConnectorSplit)

Example 10 with ConnectorPageSource

use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

the class AbstractTestHiveFileSystem method createTable.

/**
 * Creates a table with a single {@code id} BIGINT column in the given storage format, writes
 * three rows into it, points the metastore entry at the actual write location, and then reads
 * the table back to check both its column metadata and its contents.
 *
 * @param tableName name of the table to create
 * @param storageFormat storage format passed to {@code createTableProperties}
 * @throws Exception if creating, writing or reading the table fails
 */
private void createTable(SchemaTableName tableName, HiveStorageFormat storageFormat) throws Exception {
    List<ColumnMetadata> columns = ImmutableList.of(new ColumnMetadata("id", BIGINT));
    MaterializedResult data = MaterializedResult.resultBuilder(newSession(), BIGINT)
            .row(1L)
            .row(3L)
            .row(2L)
            .build();

    // Phase 1: create the table and write the rows
    try (Transaction writeTx = newTransaction()) {
        ConnectorMetadata writeMetadata = writeTx.getMetadata();
        ConnectorSession writeSession = newSession();

        // begin creating the table
        ConnectorTableMetadata newTable = new ConnectorTableMetadata(tableName, columns, createTableProperties(storageFormat));
        ConnectorOutputTableHandle outputHandle = writeMetadata.beginCreateTable(writeSession, newTable, Optional.empty(), NO_RETRIES);

        // write the records
        ConnectorPageSink pageSink = pageSinkProvider.createPageSink(writeTx.getTransactionHandle(), writeSession, outputHandle);
        pageSink.appendPage(data.toPage());
        Collection<Slice> writtenFragments = getFutureValue(pageSink.finish());

        // commit the table
        writeMetadata.finishCreateTable(writeSession, outputHandle, writtenFragments, ImmutableList.of());
        writeTx.commit();

        // Hack to work around the metastore not being configured for S3 or other FS.
        // The metastore tries to validate the location when creating the
        // table, which fails without explicit configuration for file system.
        // We work around that by using a dummy location when creating the
        // table and update it here to the correct location.
        String simpleTableName = tableName.getTableName();
        HiveOutputTableHandle hiveOutputHandle = (HiveOutputTableHandle) outputHandle;
        String actualLocation = locationService.getTableWriteInfo(hiveOutputHandle.getLocationHandle(), false)
                .getTargetPath()
                .toString();
        metastoreClient.updateTableLocation(database, simpleTableName, actualLocation);
    }

    // Phase 2: read the table back and compare
    try (Transaction readTx = newTransaction()) {
        ConnectorMetadata readMetadata = readTx.getMetadata();
        ConnectorSession readSession = newSession();

        // load the new table
        ConnectorTableHandle loadedHandle = getTableHandle(readMetadata, tableName);
        List<ColumnHandle> visibleColumns = filterNonHiddenColumnHandles(readMetadata.getColumnHandles(readSession, loadedHandle).values());

        // verify the metadata
        ConnectorTableMetadata loadedTable = readMetadata.getTableMetadata(readSession, getTableHandle(readMetadata, tableName));
        assertEquals(filterNonHiddenColumnMetadata(loadedTable.getColumns()), columns);

        // verify the data
        readMetadata.beginQuery(readSession);
        ConnectorSplitSource splitSource = getSplits(splitManager, readTx, readSession, loadedHandle);
        ConnectorSplit onlySplit = getOnlyElement(getAllSplits(splitSource));
        try (ConnectorPageSource source = pageSourceProvider.createPageSource(readTx.getTransactionHandle(), readSession, onlySplit, loadedHandle, visibleColumns, DynamicFilter.EMPTY)) {
            MaterializedResult readBack = materializeSourceDataStream(readSession, source, getTypes(visibleColumns));
            assertEqualsIgnoreOrder(readBack.getMaterializedRows(), data.getMaterializedRows());
        }
        readMetadata.cleanupQuery(readSession);
    }
}
Also used : ColumnHandle(io.trino.spi.connector.ColumnHandle) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) AbstractTestHive.filterNonHiddenColumnMetadata(io.trino.plugin.hive.AbstractTestHive.filterNonHiddenColumnMetadata) ConnectorSplitSource(io.trino.spi.connector.ConnectorSplitSource) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) ConnectorOutputTableHandle(io.trino.spi.connector.ConnectorOutputTableHandle) Transaction(io.trino.plugin.hive.AbstractTestHive.Transaction) HiveTransaction(io.trino.plugin.hive.AbstractTestHive.HiveTransaction) Slice(io.airlift.slice.Slice) ConnectorSession(io.trino.spi.connector.ConnectorSession) ConnectorMetadata(io.trino.spi.connector.ConnectorMetadata) MaterializedResult(io.trino.testing.MaterializedResult) ConnectorPageSink(io.trino.spi.connector.ConnectorPageSink) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) ConnectorTableMetadata(io.trino.spi.connector.ConnectorTableMetadata)

Aggregations

ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource): 50, ConnectorSession (io.trino.spi.connector.ConnectorSession): 23, Page (io.trino.spi.Page): 18, Type (io.trino.spi.type.Type): 18, Test (org.testng.annotations.Test): 17, ImmutableList (com.google.common.collect.ImmutableList): 16, MaterializedResult (io.trino.testing.MaterializedResult): 14, ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 13, ColumnHandle (io.trino.spi.connector.ColumnHandle): 13, ConnectorTableHandle (io.trino.spi.connector.ConnectorTableHandle): 13, List (java.util.List): 12, Optional (java.util.Optional): 12, ConnectorSplit (io.trino.spi.connector.ConnectorSplit): 11, ImmutableMap (com.google.common.collect.ImmutableMap): 10, TestingConnectorSession (io.trino.testing.TestingConnectorSession): 10, File (java.io.File): 10, Path (org.apache.hadoop.fs.Path): 10, TupleDomain (io.trino.spi.predicate.TupleDomain): 9, IOException (java.io.IOException): 9, ArrayList (java.util.ArrayList): 9