Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
Example from the class ParquetPageSourceFactory, method createPageSource.
/**
* This method is available for other callers to use directly.
*/
public static ReaderPageSource createPageSource(
        Path path,
        long start,
        long length,
        long estimatedFileSize,
        List<HiveColumnHandle> columns,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        boolean useColumnNames,
        HdfsEnvironment hdfsEnvironment,
        Configuration configuration,
        ConnectorIdentity identity,
        DateTimeZone timeZone,
        FileFormatDataSourceStats stats,
        ParquetReaderOptions options) {
// Ignore predicates on partial columns for now.
effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());
MessageType fileSchema;
MessageType requestedSchema;
MessageColumnIO messageColumn;
ParquetReader parquetReader;
ParquetDataSource dataSource = null;
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), estimatedFileSize, inputStream, stats, options);
ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
fileSchema = fileMetaData.getSchema();
Optional<MessageType> message = projectSufficientColumns(columns)
        .map(projection -> projection.get().stream()
                .map(HiveColumnHandle.class::cast)
                .collect(toUnmodifiableList()))
        .orElse(columns)
        .stream()
        .filter(column -> column.getColumnType() == REGULAR)
        .map(column -> getColumnType(column, fileSchema, useColumnNames))
        .filter(Optional::isPresent)
        .map(Optional::get)
        .map(type -> new MessageType(fileSchema.getName(), type))
        .reduce(MessageType::union);
requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
messageColumn = getColumnIO(fileSchema, requestedSchema);
Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
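// When ignore-statistics is enabled, skip statistics-based pruning by accepting all values.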
TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics()
        ? TupleDomain.all()
        : getParquetTupleDomain(descriptorsByPath, effectivePredicate, fileSchema, useColumnNames);
Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone);
long nextStart = 0;
ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
ImmutableList.Builder<Long> blockStarts = ImmutableList.builder();
ImmutableList.Builder<Optional<ColumnIndexStore>> columnIndexes = ImmutableList.builder();
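// Keep only the row groups whose first data page offset falls inside this split and that may
// match the predicate, consulting the column index when one is available.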
for (BlockMetaData block : parquetMetadata.getBlocks()) {
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndex)) {
blocks.add(block);
blockStarts.add(nextStart);
columnIndexes.add(columnIndex);
}
nextStart += block.getRowCount();
}
parquetReader = new ParquetReader(
        Optional.ofNullable(fileMetaData.getCreatedBy()),
        messageColumn,
        blocks.build(),
        Optional.of(blockStarts.build()),
        dataSource,
        timeZone,
        newSimpleAggregatedMemoryContext(),
        options,
        parquetPredicate,
        columnIndexes.build());
} catch (Exception e) {
try {
if (dataSource != null) {
dataSource.close();
}
} catch (IOException ignored) {
}
if (e instanceof TrinoException) {
throw (TrinoException) e;
}
if (e instanceof ParquetCorruptionException) {
throw new TrinoException(HIVE_BAD_DATA, e);
}
if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
}
String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
if (e instanceof BlockMissingException) {
throw new TrinoException(HIVE_MISSING_DATA, message, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
List<HiveColumnHandle> baseColumns = readerProjections
        .map(projection -> projection.get().stream()
                .map(HiveColumnHandle.class::cast)
                .collect(toUnmodifiableList()))
        .orElse(columns);
for (HiveColumnHandle column : baseColumns) {
checkArgument(column == PARQUET_ROW_INDEX_COLUMN || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column);
}
ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
ImmutableList.Builder<Boolean> rowIndexColumns = ImmutableList.builder();
for (HiveColumnHandle column : baseColumns) {
trinoTypes.add(column.getBaseType());
rowIndexColumns.add(column == PARQUET_ROW_INDEX_COLUMN);
if (column == PARQUET_ROW_INDEX_COLUMN) {
internalFields.add(Optional.empty());
} else {
internalFields.add(Optional.ofNullable(getParquetType(column, fileSchema, useColumnNames)).flatMap(field -> {
String columnName = useColumnNames ? column.getBaseColumnName() : fileSchema.getFields().get(column.getBaseHiveColumnIndex()).getName();
return constructField(column.getBaseType(), lookupColumnByName(messageColumn, columnName));
}));
}
}
ConnectorPageSource parquetPageSource = new ParquetPageSource(parquetReader, trinoTypes.build(), rowIndexColumns.build(), internalFields.build());
return new ReaderPageSource(parquetPageSource, readerProjections);
}
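A hedged sketch of how a caller might invoke this factory directly, as the Javadoc above invites. The file path is hypothetical, and every non-literal argument (splitLength, hiveColumns, effectivePredicate, hdfsEnvironment, configuration, session, stats, parquetReaderOptions) is assumed to already exist in the caller's context rather than taken from Trino itself:
// Illustrative only: all non-literal variables below are assumed to be in scope.
ReaderPageSource readerPageSource = ParquetPageSourceFactory.createPageSource(
        new Path("hdfs://namenode/warehouse/orders/part-00000.parquet"), // hypothetical file
        0,                          // split start
        splitLength,                // split length
        splitLength,                // estimated file size
        hiveColumns,                // List<HiveColumnHandle> to read
        effectivePredicate,         // TupleDomain<HiveColumnHandle> pushed down by the engine
        true,                       // useColumnNames: match Parquet columns by name
        hdfsEnvironment,
        configuration,
        session.getIdentity(),
        DateTimeZone.UTC,
        stats,
        parquetReaderOptions);
// The result wraps the ConnectorPageSource together with any required column projections.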
Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
Example from the class RcFilePageSourceFactory, method createPageSource.
@Override
public Optional<ReaderPageSource> createPageSource(
        Configuration configuration,
        ConnectorSession session,
        Path path,
        long start,
        long length,
        long estimatedFileSize,
        Properties schema,
        List<HiveColumnHandle> columns,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        Optional<AcidInfo> acidInfo,
        OptionalInt bucketNumber,
        boolean originalFile,
        AcidTransaction transaction) {
RcFileEncoding rcFileEncoding;
String deserializerClassName = getDeserializerClassName(schema);
if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
rcFileEncoding = new BinaryRcFileEncoding(timeZone);
} else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
rcFileEncoding = createTextVectorEncoding(schema);
} else {
return Optional.empty();
}
checkArgument(acidInfo.isEmpty(), "Acid is not supported");
List<HiveColumnHandle> projectedReaderColumns = columns;
Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
if (readerProjections.isPresent()) {
projectedReaderColumns = readerProjections.get().get().stream().map(HiveColumnHandle.class::cast).collect(toImmutableList());
}
RcFileDataSource dataSource;
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(path));
if (estimatedFileSize < BUFFER_SIZE.toBytes()) {
// Handle potentially imprecise file lengths by reading the footer
try {
FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(path.toString(), estimatedFileSize, inputStream, toIntExact(BUFFER_SIZE.toBytes()));
dataSource = new MemoryRcFileDataSource(new RcFileDataSourceId(path.toString()), fileTail.getTailSlice());
} finally {
inputStream.close();
}
} else {
long fileSize = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(path).getLen());
dataSource = new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats);
}
} catch (Exception e) {
if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
}
length = min(dataSource.getSize() - start, length);
// Split may be empty now that the correct file size is known
if (length <= 0) {
return Optional.of(noProjectionAdaptation(new EmptyPageSource()));
}
try {
ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
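// Key each projected column by its base Hive ordinal, mapped to its Trino type at the session's timestamp precision.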
for (HiveColumnHandle column : projectedReaderColumns) {
readColumns.put(column.getBaseHiveColumnIndex(), column.getHiveType().getType(typeManager, timestampPrecision));
}
RcFileReader rcFileReader = new RcFileReader(
        dataSource,
        rcFileEncoding,
        readColumns.buildOrThrow(),
        new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())),
        start,
        length,
        BUFFER_SIZE);
ConnectorPageSource pageSource = new RcFilePageSource(rcFileReader, projectedReaderColumns);
return Optional.of(new ReaderPageSource(pageSource, readerProjections));
} catch (Throwable e) {
try {
dataSource.close();
} catch (IOException ignored) {
}
if (e instanceof TrinoException) {
throw (TrinoException) e;
}
String message = splitError(e, path, start, length);
if (e instanceof RcFileCorruptionException) {
throw new TrinoException(HIVE_BAD_DATA, message, e);
}
if (e instanceof BlockMissingException) {
throw new TrinoException(HIVE_MISSING_DATA, message, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
}
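The Optional result is empty when the file's SerDe is neither LazyBinaryColumnarSerDe nor ColumnarSerDe, so a caller can fall through to another reader. A minimal sketch of that dispatch, assuming a hypothetical list of factories sharing this createPageSource signature (in Trino this role is played by the HivePageSourceFactory interface):
// Sketch: pageSourceFactories is assumed to be a List<HivePageSourceFactory> with this same signature.
for (HivePageSourceFactory factory : pageSourceFactories) {
    Optional<ReaderPageSource> readerPageSource = factory.createPageSource(
            configuration, session, path, start, length, estimatedFileSize,
            schema, columns, effectivePredicate, acidInfo, bucketNumber, originalFile, transaction);
    if (readerPageSource.isPresent()) {
        return readerPageSource;
    }
}
// An empty result from every factory means no reader handled the file format.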
Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
Example from the class AbstractTestHive, method assertGetRecords.
protected void assertGetRecords(String tableName, HiveStorageFormat hiveStorageFormat) throws Exception {
try (Transaction transaction = newTransaction()) {
ConnectorSession session = newSession();
ConnectorMetadata metadata = transaction.getMetadata();
metadata.beginQuery(session);
ConnectorTableHandle tableHandle = getTableHandle(metadata, new SchemaTableName(database, tableName));
ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(session, tableHandle);
HiveSplit hiveSplit = getHiveSplit(tableHandle, transaction, session);
List<ColumnHandle> columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values());
ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, hiveSplit, tableHandle, columnHandles, DynamicFilter.EMPTY);
assertGetRecords(hiveStorageFormat, tableMetadata, hiveSplit, pageSource, columnHandles);
}
}
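A ConnectorPageSource like the one created here is drained with the isFinished()/getNextPage() loop shown in the next example; condensed into a minimal, self-contained sketch (block index 0 and the VARCHAR type are illustrative placeholders):
// Minimal drain loop for a ConnectorPageSource; column 0 and VARCHAR are placeholder choices.
try (ConnectorPageSource source = pageSource) {
    while (!source.isFinished()) {
        Page page = source.getNextPage();
        if (page == null) {
            continue;
        }
        Block block = page.getBlock(0);
        for (int position = 0; position < page.getPositionCount(); position++) {
            if (!block.isNull(position)) {
                String value = VARCHAR.getSlice(block, position).toStringUtf8();
                // process value
            }
        }
    }
}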
Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
Example from the class AbstractTestHive, method doTestBucketSortedTables.
private void doTestBucketSortedTables(SchemaTableName table) throws IOException {
int bucketCount = 3;
int expectedRowCount = 0;
try (Transaction transaction = newTransaction()) {
ConnectorSession session = newSession();
ConnectorMetadata metadata = transaction.getMetadata();
// begin creating the table
ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(
        table,
        ImmutableList.<ColumnMetadata>builder()
                .add(new ColumnMetadata("id", VARCHAR))
                .add(new ColumnMetadata("value_asc", VARCHAR))
                .add(new ColumnMetadata("value_desc", BIGINT))
                .add(new ColumnMetadata("ds", VARCHAR))
                .build(),
        ImmutableMap.<String, Object>builder()
                .put(STORAGE_FORMAT_PROPERTY, RCBINARY)
                .put(PARTITIONED_BY_PROPERTY, ImmutableList.of("ds"))
                .put(BUCKETED_BY_PROPERTY, ImmutableList.of("id"))
                .put(BUCKET_COUNT_PROPERTY, bucketCount)
                .put(SORTED_BY_PROPERTY, ImmutableList.builder()
                        .add(new SortingColumn("value_asc", ASCENDING))
                        .add(new SortingColumn("value_desc", DESCENDING))
                        .build())
                .buildOrThrow());
ConnectorOutputTableHandle outputHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty(), NO_RETRIES);
// write the data
ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, outputHandle);
List<Type> types = tableMetadata.getColumns().stream().map(ColumnMetadata::getType).collect(toList());
ThreadLocalRandom random = ThreadLocalRandom.current();
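// Write 50 pages of 1,000 random rows each, all into partition ds=2018-04-01.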
for (int i = 0; i < 50; i++) {
MaterializedResult.Builder builder = MaterializedResult.resultBuilder(session, types);
for (int j = 0; j < 1000; j++) {
builder.row(sha256().hashLong(random.nextLong()).toString(), "test" + random.nextInt(100), random.nextLong(100_000), "2018-04-01");
expectedRowCount++;
}
sink.appendPage(builder.build().toPage());
}
HdfsContext context = new HdfsContext(session);
// verify we have enough temporary files per bucket to require multiple passes
Path stagingPathRoot;
if (isTemporaryStagingDirectoryEnabled(session)) {
stagingPathRoot = new Path(getTemporaryStagingDirectoryPath(session).replace("${USER}", context.getIdentity().getUser()));
} else {
stagingPathRoot = getStagingPathRoot(outputHandle);
}
assertThat(listAllDataFiles(context, stagingPathRoot))
        .filteredOn(file -> file.contains(".tmp-sort."))
        .size()
        .isGreaterThan(bucketCount * getHiveConfig().getMaxOpenSortFiles() * 2);
// finish the write
Collection<Slice> fragments = getFutureValue(sink.finish());
// verify there are no temporary files
for (String file : listAllDataFiles(context, stagingPathRoot)) {
assertThat(file).doesNotContain(".tmp-sort.");
}
// finish creating table
metadata.finishCreateTable(session, outputHandle, fragments, ImmutableList.of());
transaction.commit();
}
// verify that bucket files are sorted
try (Transaction transaction = newTransaction()) {
ConnectorMetadata metadata = transaction.getMetadata();
ConnectorSession session = newSession();
metadata.beginQuery(session);
ConnectorTableHandle tableHandle = getTableHandle(metadata, table);
List<ColumnHandle> columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values());
// verify local sorting property
ConnectorTableProperties properties = metadata.getTableProperties(
        newSession(ImmutableMap.of("propagate_table_scan_sorting_properties", true, "bucket_execution_enabled", false)),
        tableHandle);
Map<String, Integer> columnIndex = indexColumns(columnHandles);
assertEquals(properties.getLocalProperties(), ImmutableList.of(
        new SortingProperty<>(columnHandles.get(columnIndex.get("value_asc")), ASC_NULLS_FIRST),
        new SortingProperty<>(columnHandles.get(columnIndex.get("value_desc")), DESC_NULLS_LAST)));
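// With default session properties the sorting is not propagated, so no local properties are exposed.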
assertThat(metadata.getTableProperties(newSession(), tableHandle).getLocalProperties()).isEmpty();
List<ConnectorSplit> splits = getAllSplits(tableHandle, transaction, session);
assertThat(splits).hasSize(bucketCount);
int actualRowCount = 0;
for (ConnectorSplit split : splits) {
try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles, DynamicFilter.EMPTY)) {
String lastValueAsc = null;
long lastValueDesc = -1;
while (!pageSource.isFinished()) {
Page page = pageSource.getNextPage();
if (page == null) {
continue;
}
for (int i = 0; i < page.getPositionCount(); i++) {
Block blockAsc = page.getBlock(1);
Block blockDesc = page.getBlock(2);
assertFalse(blockAsc.isNull(i));
assertFalse(blockDesc.isNull(i));
String valueAsc = VARCHAR.getSlice(blockAsc, i).toStringUtf8();
if (lastValueAsc != null) {
assertGreaterThanOrEqual(valueAsc, lastValueAsc);
if (valueAsc.equals(lastValueAsc)) {
long valueDesc = BIGINT.getLong(blockDesc, i);
if (lastValueDesc != -1) {
assertLessThanOrEqual(valueDesc, lastValueDesc);
}
lastValueDesc = valueDesc;
} else {
lastValueDesc = -1;
}
}
lastValueAsc = valueAsc;
actualRowCount++;
}
}
}
}
assertThat(actualRowCount).isEqualTo(expectedRowCount);
}
}
Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.
Example from the class AbstractTestHiveFileSystem, method createTable.
private void createTable(SchemaTableName tableName, HiveStorageFormat storageFormat) throws Exception {
List<ColumnMetadata> columns = ImmutableList.<ColumnMetadata>builder().add(new ColumnMetadata("id", BIGINT)).build();
MaterializedResult data = MaterializedResult.resultBuilder(newSession(), BIGINT).row(1L).row(3L).row(2L).build();
try (Transaction transaction = newTransaction()) {
ConnectorMetadata metadata = transaction.getMetadata();
ConnectorSession session = newSession();
// begin creating the table
ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, createTableProperties(storageFormat));
ConnectorOutputTableHandle outputHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty(), NO_RETRIES);
// write the records
ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, outputHandle);
sink.appendPage(data.toPage());
Collection<Slice> fragments = getFutureValue(sink.finish());
// commit the table
metadata.finishCreateTable(session, outputHandle, fragments, ImmutableList.of());
transaction.commit();
// Hack to work around the metastore not being configured for S3 or other FS.
// The metastore tries to validate the location when creating the
// table, which fails without explicit configuration for file system.
// We work around that by using a dummy location when creating the
// table and update it here to the correct location.
metastoreClient.updateTableLocation(
        database,
        tableName.getTableName(),
        locationService.getTableWriteInfo(((HiveOutputTableHandle) outputHandle).getLocationHandle(), false).getTargetPath().toString());
}
try (Transaction transaction = newTransaction()) {
ConnectorMetadata metadata = transaction.getMetadata();
ConnectorSession session = newSession();
// load the new table
ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName);
List<ColumnHandle> columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values());
// verify the metadata
ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(session, getTableHandle(metadata, tableName));
assertEquals(filterNonHiddenColumnMetadata(tableMetadata.getColumns()), columns);
// verify the data
metadata.beginQuery(session);
ConnectorSplitSource splitSource = getSplits(splitManager, transaction, session, tableHandle);
ConnectorSplit split = getOnlyElement(getAllSplits(splitSource));
try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles, DynamicFilter.EMPTY)) {
MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles));
assertEqualsIgnoreOrder(result.getMaterializedRows(), data.getMaterializedRows());
}
metadata.cleanupQuery(session);
}
}
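Every test example above opens the ConnectorPageSource through the page source provider and closes it with try-with-resources; the shared pattern, condensed into one hedged sketch (the variable names mirror the tests and are assumed to be in scope):
// Shared pattern from the examples above; transaction, session, split, tableHandle and columnHandles are assumed.
try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(
        transaction.getTransactionHandle(), session, split, tableHandle, columnHandles, DynamicFilter.EMPTY)) {
    MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles));
    // assertions on result.getMaterializedRows() go here
}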