use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
the class OrcPageSourceFactory method createPageSource.
@Override
public Optional<ReaderPageSource> createPageSource(Configuration configuration, ConnectorSession session, Path path, long start, long length, long estimatedFileSize, Properties schema, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, Optional<AcidInfo> acidInfo, OptionalInt bucketNumber, boolean originalFile, AcidTransaction transaction) {
if (!isDeserializerClass(schema, OrcSerde.class)) {
return Optional.empty();
}
List<HiveColumnHandle> readerColumnHandles = columns;
Optional<ReaderColumns> readerColumns = projectBaseColumns(columns);
if (readerColumns.isPresent()) {
readerColumnHandles = readerColumns.get().get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList());
}
ConnectorPageSource orcPageSource = createOrcPageSource(hdfsEnvironment, session.getIdentity(), configuration, path, start, length, estimatedFileSize, readerColumnHandles, columns, isUseOrcColumnNames(session), isFullAcidTable(Maps.fromProperties(schema)), effectivePredicate, legacyTimeZone, orcReaderOptions.withMaxMergeDistance(getOrcMaxMergeDistance(session)).withMaxBufferSize(getOrcMaxBufferSize(session)).withStreamBufferSize(getOrcStreamBufferSize(session)).withTinyStripeThreshold(getOrcTinyStripeThreshold(session)).withMaxReadBlockSize(getOrcMaxReadBlockSize(session)).withLazyReadSmallRanges(getOrcLazyReadSmallRanges(session)).withNestedLazy(isOrcNestedLazy(session)).withBloomFiltersEnabled(isOrcBloomFiltersEnabled(session)), acidInfo, bucketNumber, originalFile, transaction, stats);
return Optional.of(new ReaderPageSource(orcPageSource, readerColumns));
}
use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
the class ParquetTester method assertMaxReadBytes.
void assertMaxReadBytes(List<ObjectInspector> objectInspectors, Iterable<?>[] writeValues, Iterable<?>[] readValues, List<String> columnNames, List<Type> columnTypes, Optional<MessageType> parquetSchema, DataSize maxReadBlockSize) throws Exception {
CompressionCodecName compressionCodecName = UNCOMPRESSED;
HiveSessionProperties hiveSessionProperties = new HiveSessionProperties(new HiveConfig().setHiveStorageFormat(HiveStorageFormat.PARQUET).setUseParquetColumnNames(false), new OrcReaderConfig(), new OrcWriterConfig(), new ParquetReaderConfig().setMaxReadBlockSize(maxReadBlockSize), new ParquetWriterConfig());
ConnectorSession session = TestingConnectorSession.builder().setPropertyMetadata(hiveSessionProperties.getSessionProperties()).build();
try (TempFile tempFile = new TempFile("test", "parquet")) {
JobConf jobConf = new JobConf();
jobConf.setEnum(COMPRESSION, compressionCodecName);
jobConf.setBoolean(ENABLE_DICTIONARY, true);
jobConf.setEnum(WRITER_VERSION, PARQUET_1_0);
writeParquetColumn(jobConf, tempFile.getFile(), compressionCodecName, createTableProperties(columnNames, objectInspectors), getStandardStructObjectInspector(columnNames, objectInspectors), getIterators(writeValues), parquetSchema, false);
Iterator<?>[] expectedValues = getIterators(readValues);
try (ConnectorPageSource pageSource = fileFormat.createFileFormatReader(session, HDFS_ENVIRONMENT, tempFile.getFile(), columnNames, columnTypes)) {
assertPageSource(columnTypes, expectedValues, pageSource, Optional.of(getParquetMaxReadBlockSize(session).toBytes()));
assertFalse(stream(expectedValues).allMatch(Iterator::hasNext));
}
}
}
use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
the class DeltaLakePageSourceProvider method createPageSource.
@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit connectorSplit, ConnectorTableHandle connectorTable, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
DeltaLakeSplit split = (DeltaLakeSplit) connectorSplit;
DeltaLakeTableHandle table = (DeltaLakeTableHandle) connectorTable;
// We reach here when we could not prune the split using file level stats, table predicate
// and the dynamic filter in the coordinator during split generation. The file level stats
// in DeltaLakeSplit#filePredicate could help to prune this split when a more selective dynamic filter
// is available now, without having to access parquet file footer for row-group stats.
// We avoid sending DeltaLakeSplit#splitPredicate to workers by using table.getPredicate() here.
TupleDomain<DeltaLakeColumnHandle> filteredSplitPredicate = TupleDomain.intersect(ImmutableList.of(table.getNonPartitionConstraint(), split.getStatisticsPredicate(), dynamicFilter.getCurrentPredicate().transformKeys(DeltaLakeColumnHandle.class::cast)));
if (filteredSplitPredicate.isNone()) {
return new EmptyPageSource();
}
List<DeltaLakeColumnHandle> deltaLakeColumns = columns.stream().map(DeltaLakeColumnHandle.class::cast).collect(toImmutableList());
Map<String, Optional<String>> partitionKeys = split.getPartitionKeys();
List<DeltaLakeColumnHandle> regularColumns = deltaLakeColumns.stream().filter(column -> column.getColumnType() == REGULAR).collect(toImmutableList());
List<HiveColumnHandle> hiveColumnHandles = regularColumns.stream().map(DeltaLakeColumnHandle::toHiveColumnHandle).collect(toImmutableList());
Path path = new Path(split.getPath());
HdfsContext hdfsContext = new HdfsContext(session);
TupleDomain<HiveColumnHandle> parquetPredicate = getParquetTupleDomain(filteredSplitPredicate.simplify(domainCompactionThreshold));
if (table.getWriteType().isPresent()) {
return new DeltaLakeUpdatablePageSource(table, deltaLakeColumns, partitionKeys, split.getPath(), split.getFileSize(), split.getFileModifiedTime(), session, executorService, hdfsEnvironment, hdfsContext, parquetDateTimeZone, parquetReaderOptions, parquetPredicate, typeManager, updateResultJsonCodec);
}
ReaderPageSource pageSource = ParquetPageSourceFactory.createPageSource(path, split.getStart(), split.getLength(), split.getFileSize(), hiveColumnHandles, parquetPredicate, true, hdfsEnvironment, hdfsEnvironment.getConfiguration(hdfsContext, path), session.getIdentity(), parquetDateTimeZone, fileFormatDataSourceStats, parquetReaderOptions.withMaxReadBlockSize(getParquetMaxReadBlockSize(session)).withUseColumnIndex(isParquetUseColumnIndex(session)));
verify(pageSource.getReaderColumns().isEmpty(), "All columns expected to be base columns");
return new DeltaLakePageSource(deltaLakeColumns, partitionKeys, pageSource.get(), split.getPath(), split.getFileSize(), split.getFileModifiedTime());
}
use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
the class TestHivePageSink method writeTestFile.
private static long writeTestFile(HiveConfig config, HiveMetastore metastore, String outputPath) {
HiveTransactionHandle transaction = new HiveTransactionHandle(false);
HiveWriterStats stats = new HiveWriterStats();
ConnectorPageSink pageSink = createPageSink(transaction, config, metastore, new Path("file:///" + outputPath), stats);
List<LineItemColumn> columns = getTestColumns();
List<Type> columnTypes = columns.stream().map(LineItemColumn::getType).map(TestHivePageSink::getHiveType).map(hiveType -> hiveType.getType(TESTING_TYPE_MANAGER)).collect(toList());
PageBuilder pageBuilder = new PageBuilder(columnTypes);
int rows = 0;
for (LineItem lineItem : new LineItemGenerator(0.01, 1, 1)) {
rows++;
if (rows >= NUM_ROWS) {
break;
}
pageBuilder.declarePosition();
for (int i = 0; i < columns.size(); i++) {
LineItemColumn column = columns.get(i);
BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(i);
switch(column.getType().getBase()) {
case IDENTIFIER:
BIGINT.writeLong(blockBuilder, column.getIdentifier(lineItem));
break;
case INTEGER:
INTEGER.writeLong(blockBuilder, column.getInteger(lineItem));
break;
case DATE:
DATE.writeLong(blockBuilder, column.getDate(lineItem));
break;
case DOUBLE:
DOUBLE.writeDouble(blockBuilder, column.getDouble(lineItem));
break;
case VARCHAR:
createUnboundedVarcharType().writeSlice(blockBuilder, Slices.utf8Slice(column.getString(lineItem)));
break;
default:
throw new IllegalArgumentException("Unsupported type " + column.getType());
}
}
}
Page page = pageBuilder.build();
pageSink.appendPage(page);
getFutureValue(pageSink.finish());
File outputDir = new File(outputPath);
List<File> files = ImmutableList.copyOf(outputDir.listFiles((dir, name) -> !name.endsWith(".crc")));
File outputFile = getOnlyElement(files);
long length = outputFile.length();
ConnectorPageSource pageSource = createPageSource(transaction, config, outputFile);
List<Page> pages = new ArrayList<>();
while (!pageSource.isFinished()) {
Page nextPage = pageSource.getNextPage();
if (nextPage != null) {
pages.add(nextPage.getLoadedPage());
}
}
MaterializedResult expectedResults = toMaterializedResult(getHiveSession(config), columnTypes, ImmutableList.of(page));
MaterializedResult results = toMaterializedResult(getHiveSession(config), columnTypes, pages);
assertEquals(results, expectedResults);
assertEquals(round(stats.getInputPageSizeInBytes().getAllTime().getMax()), page.getRetainedSizeInBytes());
return length;
}
use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
the class TestNodeLocalDynamicSplitPruning method testDynamicPartitionPruning.
@Test
public void testDynamicPartitionPruning() throws IOException {
HiveConfig config = new HiveConfig();
HiveTransactionHandle transaction = new HiveTransactionHandle(false);
try (TempFile tempFile = new TempFile()) {
ConnectorPageSource emptyPageSource = createTestingPageSource(transaction, config, tempFile.file(), getDynamicFilter(getTupleDomainForPartitionSplitPruning()));
assertEquals(emptyPageSource.getClass(), EmptyPageSource.class);
ConnectorPageSource nonEmptyPageSource = createTestingPageSource(transaction, config, tempFile.file(), getDynamicFilter(getNonSelectivePartitionTupleDomain()));
assertEquals(nonEmptyPageSource.getClass(), HivePageSource.class);
}
}
Aggregations