Example 21 with ConnectorSplit

Use of io.trino.spi.connector.ConnectorSplit in project trino by trinodb.

From class CassandraSplitManager, method getSplitsByTokenRange.

private List<ConnectorSplit> getSplitsByTokenRange(CassandraTable table, String partitionId, Optional<Long> sessionSplitsPerNode) {
    String schema = table.getTableHandle().getSchemaName();
    String tableName = table.getTableHandle().getTableName();
    String tokenExpression = table.getTokenExpression();
    ImmutableList.Builder<ConnectorSplit> builder = ImmutableList.builder();
    List<CassandraTokenSplitManager.TokenSplit> tokenSplits = tokenSplitMgr.getSplits(schema, tableName, sessionSplitsPerNode);
    for (CassandraTokenSplitManager.TokenSplit tokenSplit : tokenSplits) {
        String condition = buildTokenCondition(tokenExpression, tokenSplit.getStartToken(), tokenSplit.getEndToken());
        List<HostAddress> addresses = new HostAddressFactory().hostAddressNamesToHostAddressList(tokenSplit.getHosts());
        CassandraSplit split = new CassandraSplit(partitionId, condition, addresses);
        builder.add(split);
    }
    return builder.build();
}
Also used : HostAddressFactory(io.trino.plugin.cassandra.util.HostAddressFactory) ImmutableList(com.google.common.collect.ImmutableList) HostAddress(io.trino.spi.HostAddress) ConnectorSplit(io.trino.spi.connector.ConnectorSplit)
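
The buildTokenCondition helper called above is not shown in this snippet. A minimal sketch of what such a helper can look like, given the string start and end tokens used in the loop (the exact Trino implementation may differ), is:

// Hypothetical sketch: renders a CQL token-range predicate such as
// "token(key) > 100 AND token(key) <= 200" from the table's token expression.
private static String buildTokenCondition(String tokenExpression, String startToken, String endToken) {
    return tokenExpression + " > " + startToken + " AND " + tokenExpression + " <= " + endToken;
}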

Example 22 with ConnectorSplit

Use of io.trino.spi.connector.ConnectorSplit in project trino by trinodb.

From class TestCassandraConnector, method testGetRecords.

@Test
public void testGetRecords() {
    ConnectorTableHandle tableHandle = getTableHandle(table);
    ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(SESSION, tableHandle);
    List<ColumnHandle> columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(SESSION, tableHandle).values());
    Map<String, Integer> columnIndex = indexColumns(columnHandles);
    ConnectorTransactionHandle transaction = CassandraTransactionHandle.INSTANCE;
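    // apply an always-true constraint and use the resulting table handle for split generation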
    tableHandle = metadata.applyFilter(SESSION, tableHandle, Constraint.alwaysTrue()).get().getHandle();
    List<ConnectorSplit> splits = getAllSplits(splitManager.getSplits(transaction, SESSION, tableHandle, UNGROUPED_SCHEDULING, DynamicFilter.EMPTY));
    long rowNumber = 0;
    for (ConnectorSplit split : splits) {
        CassandraSplit cassandraSplit = (CassandraSplit) split;
        long completedBytes = 0;
        try (RecordCursor cursor = recordSetProvider.getRecordSet(transaction, SESSION, cassandraSplit, tableHandle, columnHandles).cursor()) {
            while (cursor.advanceNextPosition()) {
                try {
                    assertReadFields(cursor, tableMetadata.getColumns());
                } catch (RuntimeException e) {
                    throw new RuntimeException("row " + rowNumber, e);
                }
                rowNumber++;
                String keyValue = cursor.getSlice(columnIndex.get("key")).toStringUtf8();
                assertTrue(keyValue.startsWith("key "));
                int rowId = Integer.parseInt(keyValue.substring(4));
                assertEquals(keyValue, "key " + rowId);
                assertEquals(Bytes.toHexString(cursor.getSlice(columnIndex.get("typebytes")).getBytes()), format("0x%08X", rowId));
                // VARINT is returned as a string
                assertEquals(cursor.getSlice(columnIndex.get("typeinteger")).toStringUtf8(), String.valueOf(rowId));
                assertEquals(cursor.getLong(columnIndex.get("typelong")), 1000 + rowId);
                assertEquals(trinoUuidToJavaUuid(cursor.getSlice(columnIndex.get("typeuuid"))).toString(), format("00000000-0000-0000-0000-%012d", rowId));
                assertEquals(cursor.getLong(columnIndex.get("typetimestamp")), packDateTimeWithZone(DATE.getTime(), UTC_KEY));
                long newCompletedBytes = cursor.getCompletedBytes();
                assertTrue(newCompletedBytes >= completedBytes);
                completedBytes = newCompletedBytes;
            }
        }
    }
    assertEquals(rowNumber, 9);
}
Also used : ColumnHandle(io.trino.spi.connector.ColumnHandle) RecordCursor(io.trino.spi.connector.RecordCursor) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle) Constraint(io.trino.spi.connector.Constraint) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) ConnectorTableMetadata(io.trino.spi.connector.ConnectorTableMetadata) Test(org.testng.annotations.Test)
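
The getAllSplits helper used above is not shown. A plausible sketch, assuming it simply drains the ConnectorSplitSource returned by the split manager (the actual utility in the Trino test code may differ), is:

// Assumed sketch of a helper that drains a split source into a list.
// NOT_PARTITIONED is io.trino.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED.
private static List<ConnectorSplit> getAllSplits(ConnectorSplitSource splitSource) throws Exception {
    ImmutableList.Builder<ConnectorSplit> splits = ImmutableList.builder();
    while (!splitSource.isFinished()) {
        splits.addAll(splitSource.getNextBatch(NOT_PARTITIONED, 1000).get().getSplits());
    }
    return splits.build();
}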

Example 23 with ConnectorSplit

Use of io.trino.spi.connector.ConnectorSplit in project trino by trinodb.

From class DeltaLakePageSourceProvider, method createPageSource.

@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit connectorSplit, ConnectorTableHandle connectorTable, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
    DeltaLakeSplit split = (DeltaLakeSplit) connectorSplit;
    DeltaLakeTableHandle table = (DeltaLakeTableHandle) connectorTable;
    // We reach here when we could not prune the split using file level stats, table predicate
    // and the dynamic filter in the coordinator during split generation. The file level stats
    // in DeltaLakeSplit#filePredicate could help to prune this split when a more selective dynamic filter
    // is available now, without having to access parquet file footer for row-group stats.
    // We avoid sending DeltaLakeSplit#splitPredicate to workers by using table.getPredicate() here.
    TupleDomain<DeltaLakeColumnHandle> filteredSplitPredicate = TupleDomain.intersect(ImmutableList.of(
            table.getNonPartitionConstraint(),
            split.getStatisticsPredicate(),
            dynamicFilter.getCurrentPredicate().transformKeys(DeltaLakeColumnHandle.class::cast)));
    if (filteredSplitPredicate.isNone()) {
        return new EmptyPageSource();
    }
    List<DeltaLakeColumnHandle> deltaLakeColumns = columns.stream().map(DeltaLakeColumnHandle.class::cast).collect(toImmutableList());
    Map<String, Optional<String>> partitionKeys = split.getPartitionKeys();
    List<DeltaLakeColumnHandle> regularColumns = deltaLakeColumns.stream().filter(column -> column.getColumnType() == REGULAR).collect(toImmutableList());
    List<HiveColumnHandle> hiveColumnHandles = regularColumns.stream().map(DeltaLakeColumnHandle::toHiveColumnHandle).collect(toImmutableList());
    Path path = new Path(split.getPath());
    HdfsContext hdfsContext = new HdfsContext(session);
    TupleDomain<HiveColumnHandle> parquetPredicate = getParquetTupleDomain(filteredSplitPredicate.simplify(domainCompactionThreshold));
    if (table.getWriteType().isPresent()) {
        return new DeltaLakeUpdatablePageSource(
                table, deltaLakeColumns, partitionKeys,
                split.getPath(), split.getFileSize(), split.getFileModifiedTime(),
                session, executorService, hdfsEnvironment, hdfsContext,
                parquetDateTimeZone, parquetReaderOptions, parquetPredicate,
                typeManager, updateResultJsonCodec);
    }
    ReaderPageSource pageSource = ParquetPageSourceFactory.createPageSource(
            path, split.getStart(), split.getLength(), split.getFileSize(),
            hiveColumnHandles, parquetPredicate, true,
            hdfsEnvironment, hdfsEnvironment.getConfiguration(hdfsContext, path), session.getIdentity(),
            parquetDateTimeZone, fileFormatDataSourceStats,
            parquetReaderOptions
                    .withMaxReadBlockSize(getParquetMaxReadBlockSize(session))
                    .withUseColumnIndex(isParquetUseColumnIndex(session)));
    verify(pageSource.getReaderColumns().isEmpty(), "All columns expected to be base columns");
    return new DeltaLakePageSource(deltaLakeColumns, partitionKeys, pageSource.get(), split.getPath(), split.getFileSize(), split.getFileModifiedTime());
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) HiveSessionProperties.isParquetUseColumnIndex(io.trino.plugin.hive.HiveSessionProperties.isParquetUseColumnIndex) Inject(javax.inject.Inject) ParquetPageSourceFactory(io.trino.plugin.hive.parquet.ParquetPageSourceFactory) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) Path(org.apache.hadoop.fs.Path) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) ExecutorService(java.util.concurrent.ExecutorService) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableMap(com.google.common.collect.ImmutableMap) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ConnectorPageSourceProvider(io.trino.spi.connector.ConnectorPageSourceProvider) StandardTypes(io.trino.spi.type.StandardTypes) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) DeltaLakeSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetMaxReadBlockSize) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) TypeManager(io.trino.spi.type.TypeManager) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR) JsonCodec(io.airlift.json.JsonCodec) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle)
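
The early return on filteredSplitPredicate.isNone() is what lets a now-more-selective dynamic filter prune the split without reading any Parquet footers: if the intersected domains are contradictory, no row in the split can match. A minimal illustration of that behavior, using string column keys and assuming static imports of BIGINT (io.trino.spi.type.BigintType.BIGINT) and Guava's ImmutableMap:

// Illustrative only: contradictory single-value domains on the same column
// intersect to a "none" TupleDomain, so the corresponding split can be skipped.
TupleDomain<String> fileStats = TupleDomain.withColumnDomains(ImmutableMap.of("x", Domain.singleValue(BIGINT, 1L)));
TupleDomain<String> filterDomain = TupleDomain.withColumnDomains(ImmutableMap.of("x", Domain.singleValue(BIGINT, 2L)));
assert fileStats.intersect(filterDomain).isNone();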

Example 24 with ConnectorSplit

Use of io.trino.spi.connector.ConnectorSplit in project trino by trinodb.

From class TestHiveSplitSource, method testReaderWaitsForSplits.

@Test
public void testReaderWaitsForSplits() throws Exception {
    HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce(
            SESSION, "database", "table",
            10, 10, DataSize.of(1, MEGABYTE), Integer.MAX_VALUE,
            new TestingHiveSplitLoader(), Executors.newFixedThreadPool(5), new CounterStat(), false);
    SettableFuture<ConnectorSplit> splits = SettableFuture.create();
    // create a thread that will get a split
    CountDownLatch started = new CountDownLatch(1);
    Thread getterThread = new Thread(() -> {
        try {
            started.countDown();
            List<ConnectorSplit> batch = getSplits(hiveSplitSource, 1);
            assertEquals(batch.size(), 1);
            splits.set(batch.get(0));
        } catch (Throwable e) {
            splits.setException(e);
        }
    });
    getterThread.start();
    try {
        // wait for the thread to be started
        assertTrue(started.await(1, TimeUnit.SECONDS));
        // sleep for a bit and check that the getter thread is still blocked
        TimeUnit.MILLISECONDS.sleep(200);
        assertFalse(splits.isDone());
        // add a split
        hiveSplitSource.addToQueue(new TestSplit(33));
        // wait for thread to get the split
        ConnectorSplit split = splits.get(800, TimeUnit.MILLISECONDS);
        assertEquals(((HiveSplit) split).getSchema().getProperty("id"), "33");
    } finally {
        // make sure the thread exits
        getterThread.interrupt();
    }
}
Also used : CounterStat(io.airlift.stats.CounterStat) CountDownLatch(java.util.concurrent.CountDownLatch) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) Test(org.testng.annotations.Test)
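
The getSplits helper is not shown above. A plausible sketch, assuming it requests a single batch of up to maxSize splits from the source and blocks until that batch is available (the real helper in TestHiveSplitSource may differ), is:

// Assumed sketch: fetch one batch of splits from the split source.
// NOT_PARTITIONED is io.trino.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED.
private static List<ConnectorSplit> getSplits(ConnectorSplitSource source, int maxSize) throws Exception {
    return source.getNextBatch(NOT_PARTITIONED, maxSize).get().getSplits();
}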

Example 25 with ConnectorSplit

Use of io.trino.spi.connector.ConnectorSplit in project trino by trinodb.

From class IcebergSplitSource, method getNextBatch.

@Override
public CompletableFuture<ConnectorSplitBatch> getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize) {
    long timeLeft = dynamicFilteringWaitTimeoutMillis - dynamicFilterWaitStopwatch.elapsed(MILLISECONDS);
    if (dynamicFilter.isAwaitable() && timeLeft > 0) {
        return dynamicFilter.isBlocked().thenApply(ignored -> EMPTY_BATCH).completeOnTimeout(EMPTY_BATCH, timeLeft, MILLISECONDS);
    }
    if (combinedScanIterable == null) {
        // Used to avoid duplicating work if the Dynamic Filter was already pushed down to the Iceberg API
        this.pushedDownDynamicFilterPredicate = dynamicFilter.getCurrentPredicate().transformKeys(IcebergColumnHandle.class::cast);
        TupleDomain<IcebergColumnHandle> fullPredicate = tableHandle.getUnenforcedPredicate().intersect(pushedDownDynamicFilterPredicate);
        // TODO: (https://github.com/trinodb/trino/issues/9743): Consider removing TupleDomain#simplify
        TupleDomain<IcebergColumnHandle> simplifiedPredicate = fullPredicate.simplify(ICEBERG_DOMAIN_COMPACTION_THRESHOLD);
        if (!simplifiedPredicate.equals(fullPredicate)) {
            // Pushed down predicate was simplified, always evaluate it against individual splits
            this.pushedDownDynamicFilterPredicate = TupleDomain.all();
        }
        TupleDomain<IcebergColumnHandle> effectivePredicate = tableHandle.getEnforcedPredicate().intersect(simplifiedPredicate);
        if (effectivePredicate.isNone()) {
            finish();
            return completedFuture(NO_MORE_SPLITS_BATCH);
        }
        Expression filterExpression = toIcebergExpression(effectivePredicate);
        this.combinedScanIterable = tableScan.filter(filterExpression).includeColumnStats().planTasks();
        this.fileScanIterator = Streams.stream(combinedScanIterable).map(CombinedScanTask::files).flatMap(Collection::stream).iterator();
    }
    TupleDomain<IcebergColumnHandle> dynamicFilterPredicate = dynamicFilter.getCurrentPredicate().transformKeys(IcebergColumnHandle.class::cast);
    if (dynamicFilterPredicate.isNone()) {
        finish();
        return completedFuture(NO_MORE_SPLITS_BATCH);
    }
    Iterator<FileScanTask> fileScanTasks = Iterators.limit(fileScanIterator, maxSize);
    ImmutableList.Builder<ConnectorSplit> splits = ImmutableList.builder();
    while (fileScanTasks.hasNext()) {
        FileScanTask scanTask = fileScanTasks.next();
        if (!scanTask.deletes().isEmpty()) {
            throw new TrinoException(NOT_SUPPORTED, "Iceberg tables with delete files are not supported: " + tableHandle.getSchemaTableName());
        }
        if (maxScannedFileSizeInBytes.isPresent() && scanTask.file().fileSizeInBytes() > maxScannedFileSizeInBytes.get()) {
            continue;
        }
        IcebergSplit icebergSplit = toIcebergSplit(scanTask);
        Schema fileSchema = scanTask.spec().schema();
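        // identity-partitioned columns of this file's spec; their values come directly from the split's partition keys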
        Set<IcebergColumnHandle> identityPartitionColumns = icebergSplit.getPartitionKeys().keySet().stream()
                .map(fieldId -> getColumnHandle(fileSchema.findField(fieldId), typeManager))
                .collect(toImmutableSet());
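        // lazily deserialize the partition values; memoized so the work runs at most once per split, and only if a predicate check needs it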
        Supplier<Map<ColumnHandle, NullableValue>> partitionValues = memoize(() -> {
            Map<ColumnHandle, NullableValue> bindings = new HashMap<>();
            for (IcebergColumnHandle partitionColumn : identityPartitionColumns) {
                Object partitionValue = deserializePartitionValue(partitionColumn.getType(), icebergSplit.getPartitionKeys().get(partitionColumn.getId()).orElse(null), partitionColumn.getName());
                NullableValue bindingValue = new NullableValue(partitionColumn.getType(), partitionValue);
                bindings.put(partitionColumn, bindingValue);
            }
            return bindings;
        });
        if (!dynamicFilterPredicate.isAll() && !dynamicFilterPredicate.equals(pushedDownDynamicFilterPredicate)) {
            if (!partitionMatchesPredicate(identityPartitionColumns, partitionValues, dynamicFilterPredicate)) {
                continue;
            }
            if (!fileMatchesPredicate(fieldIdToType, dynamicFilterPredicate, scanTask.file().lowerBounds(), scanTask.file().upperBounds(), scanTask.file().nullValueCounts())) {
                continue;
            }
        }
        if (!partitionMatchesConstraint(identityPartitionColumns, partitionValues, constraint)) {
            continue;
        }
        if (recordScannedFiles) {
            scannedFiles.add(scanTask.file());
        }
        splits.add(icebergSplit);
    }
    return completedFuture(new ConnectorSplitBatch(splits.build(), isFinished()));
}
Also used : IcebergUtil.getPartitionKeys(io.trino.plugin.iceberg.IcebergUtil.getPartitionKeys) CompletableFuture.completedFuture(java.util.concurrent.CompletableFuture.completedFuture) ByteBuffer(java.nio.ByteBuffer) TypeConverter.toIcebergType(io.trino.plugin.iceberg.TypeConverter.toIcebergType) Duration(io.airlift.units.Duration) ICEBERG_DOMAIN_COMPACTION_THRESHOLD(io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) Expression(org.apache.iceberg.expressions.Expression) ConnectorPartitionHandle(io.trino.spi.connector.ConnectorPartitionHandle) Map(java.util.Map) FileScanTask(org.apache.iceberg.FileScanTask) DataFile(org.apache.iceberg.DataFile) IcebergUtil.getColumnHandle(io.trino.plugin.iceberg.IcebergUtil.getColumnHandle) ImmutableSet(com.google.common.collect.ImmutableSet) CloseableIterable(org.apache.iceberg.io.CloseableIterable) Range(io.trino.spi.predicate.Range) Domain(io.trino.spi.predicate.Domain) Collection(java.util.Collection) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) TableScan(org.apache.iceberg.TableScan) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) Streams(com.google.common.collect.Streams) Schema(org.apache.iceberg.Schema) CombinedScanTask(org.apache.iceberg.CombinedScanTask) ValueSet(io.trino.spi.predicate.ValueSet) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Type(org.apache.iceberg.types.Type) UncheckedIOException(java.io.UncheckedIOException) DataSize(io.airlift.units.DataSize) List(java.util.List) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) Constraint(io.trino.spi.connector.Constraint) IcebergUtil.deserializePartitionValue(io.trino.plugin.iceberg.IcebergUtil.deserializePartitionValue) NullableValue(io.trino.spi.predicate.NullableValue) Stopwatch(com.google.common.base.Stopwatch) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) Supplier(java.util.function.Supplier) ExpressionConverter.toIcebergExpression(io.trino.plugin.iceberg.ExpressionConverter.toIcebergExpression) Iterators(com.google.common.collect.Iterators) IcebergTypes.convertIcebergValueToTrino(io.trino.plugin.iceberg.IcebergTypes.convertIcebergValueToTrino) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) Suppliers.memoize(com.google.common.base.Suppliers.memoize) Nullable(javax.annotation.Nullable) Iterator(java.util.Iterator) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) ConnectorSplitSource(io.trino.spi.connector.ConnectorSplitSource) IOException(java.io.IOException) TupleDomain(io.trino.spi.predicate.TupleDomain) Conversions.fromByteBuffer(org.apache.iceberg.types.Conversions.fromByteBuffer) Sets.intersection(com.google.common.collect.Sets.intersection) IcebergUtil.primitiveFieldTypes(io.trino.plugin.iceberg.IcebergUtil.primitiveFieldTypes) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TypeManager(io.trino.spi.type.TypeManager) Collections(java.util.Collections)
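
The first branch of getNextBatch bounds how long split generation waits for dynamic filters: it returns a future that yields EMPTY_BATCH either when the filter becomes ready or when the remaining wait time elapses, whichever comes first. A standalone illustration of that completeOnTimeout pattern (toy future and values, not Trino code), assuming java.util.concurrent.CompletableFuture and TimeUnit:

// Illustrative only: if filterReady never completes within the deadline,
// the dependent future is completed with the fallback value instead of blocking forever.
CompletableFuture<String> filterReady = new CompletableFuture<>();
CompletableFuture<String> batch = filterReady
        .thenApply(ignored -> "batch built with the narrowed filter")
        .completeOnTimeout("empty batch, try again later", 100, TimeUnit.MILLISECONDS);
System.out.println(batch.join()); // prints the fallback after roughly 100 ms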

Aggregations

ConnectorSplit (io.trino.spi.connector.ConnectorSplit): 44 uses
ColumnHandle (io.trino.spi.connector.ColumnHandle): 21 uses
ImmutableList (com.google.common.collect.ImmutableList): 19 uses
ConnectorTableHandle (io.trino.spi.connector.ConnectorTableHandle): 18 uses
ConnectorSession (io.trino.spi.connector.ConnectorSession): 17 uses
List (java.util.List): 14 uses
Test (org.testng.annotations.Test): 14 uses
ConnectorSplitSource (io.trino.spi.connector.ConnectorSplitSource): 13 uses
ConnectorTransactionHandle (io.trino.spi.connector.ConnectorTransactionHandle): 13 uses
FixedSplitSource (io.trino.spi.connector.FixedSplitSource): 12 uses
ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource): 11 uses
Objects.requireNonNull (java.util.Objects.requireNonNull): 11 uses
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 9 uses
Inject (javax.inject.Inject): 9 uses
HostAddress (io.trino.spi.HostAddress): 8 uses
ConnectorMetadata (io.trino.spi.connector.ConnectorMetadata): 7 uses
Constraint (io.trino.spi.connector.Constraint): 7 uses
DynamicFilter (io.trino.spi.connector.DynamicFilter): 7 uses
MaterializedResult (io.trino.testing.MaterializedResult): 7 uses
ArrayList (java.util.ArrayList): 7 uses