Use of io.trino.spi.type.Type in project trino by trinodb.
The class ParquetPageSource, method getNextPage.
@Override
public Page getNextPage() {
    try {
        batchId++;
        int batchSize = parquetReader.nextBatch();
        if (closed || batchSize <= 0) {
            close();
            return null;
        }
        completedPositions += batchSize;
        Block[] blocks = new Block[fields.size()];
        for (int column = 0; column < blocks.length; column++) {
            if (isIndexColumn(column)) {
                blocks[column] = getRowIndexColumn(parquetReader.lastBatchStartRow(), batchSize);
            } else {
                Type type = types.get(column);
                // Columns backed by a parquet field are loaded lazily; columns missing
                // from the file are filled with nulls via a run-length-encoded block.
                blocks[column] = fields.get(column)
                        .<Block>map(field -> new LazyBlock(batchSize, new ParquetBlockLoader(field)))
                        .orElseGet(() -> RunLengthEncodedBlock.create(type, null, batchSize));
            }
        }
        return new Page(batchSize, blocks);
    } catch (TrinoException e) {
        closeAllSuppress(e, this);
        throw e;
    } catch (RuntimeException e) {
        closeAllSuppress(e, this);
        throw new TrinoException(HIVE_CURSOR_ERROR, e);
    }
}
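For context, a caller typically drains a ConnectorPageSource in a loop until it reports completion. A minimal sketch of such a consumption loop (the real Trino driver adds yielding, memory accounting, and metrics; the drain helper name is made up here):

static void drain(ConnectorPageSource pageSource)
        throws IOException {
    // ConnectorPageSource is Closeable, so release the underlying reader when done
    try (pageSource) {
        while (!pageSource.isFinished()) {
            Page page = pageSource.getNextPage();
            if (page == null) {
                continue; // no batch ready yet; a real driver would yield here
            }
            // consume page.getPositionCount() rows across page.getChannelCount() blocks
        }
    }
}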
Use of io.trino.spi.type.Type in project trino by trinodb.
The class ParquetPageSourceFactory, method createPageSource.
/**
* This method is available for other callers to use directly.
*/
public static ReaderPageSource createPageSource(
        Path path,
        long start,
        long length,
        long estimatedFileSize,
        List<HiveColumnHandle> columns,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        boolean useColumnNames,
        HdfsEnvironment hdfsEnvironment,
        Configuration configuration,
        ConnectorIdentity identity,
        DateTimeZone timeZone,
        FileFormatDataSourceStats stats,
        ParquetReaderOptions options) {
    // Ignore predicates on partial columns for now.
    effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());
    MessageType fileSchema;
    MessageType requestedSchema;
    MessageColumnIO messageColumn;
    ParquetReader parquetReader;
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), estimatedFileSize, inputStream, stats, options);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        fileSchema = fileMetaData.getSchema();
        Optional<MessageType> message = projectSufficientColumns(columns)
                .map(projection -> projection.get().stream()
                        .map(HiveColumnHandle.class::cast)
                        .collect(toUnmodifiableList()))
                .orElse(columns)
                .stream()
                .filter(column -> column.getColumnType() == REGULAR)
                .map(column -> getColumnType(column, fileSchema, useColumnNames))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .map(type -> new MessageType(fileSchema.getName(), type))
                .reduce(MessageType::union);
        requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        messageColumn = getColumnIO(fileSchema, requestedSchema);
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics()
                ? TupleDomain.all()
                : getParquetTupleDomain(descriptorsByPath, effectivePredicate, fileSchema, useColumnNames);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone);
        // Select only the row groups whose first data page offset falls within this split
        // and which the predicate cannot rule out.
        long nextStart = 0;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        ImmutableList.Builder<Long> blockStarts = ImmutableList.builder();
        ImmutableList.Builder<Optional<ColumnIndexStore>> columnIndexes = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
            if (start <= firstDataPage && firstDataPage < start + length
                    && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndex)) {
                blocks.add(block);
                blockStarts.add(nextStart);
                columnIndexes.add(columnIndex);
            }
            nextStart += block.getRowCount();
        }
        parquetReader = new ParquetReader(
                Optional.ofNullable(fileMetaData.getCreatedBy()),
                messageColumn,
                blocks.build(),
                Optional.of(blockStarts.build()),
                dataSource,
                timeZone,
                newSimpleAggregatedMemoryContext(),
                options,
                parquetPredicate,
                columnIndexes.build());
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    List<HiveColumnHandle> baseColumns = readerProjections
            .map(projection -> projection.get().stream()
                    .map(HiveColumnHandle.class::cast)
                    .collect(toUnmodifiableList()))
            .orElse(columns);
    for (HiveColumnHandle column : baseColumns) {
        checkArgument(column == PARQUET_ROW_INDEX_COLUMN || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column);
    }
    ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
    ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
    ImmutableList.Builder<Boolean> rowIndexColumns = ImmutableList.builder();
    for (HiveColumnHandle column : baseColumns) {
        trinoTypes.add(column.getBaseType());
        rowIndexColumns.add(column == PARQUET_ROW_INDEX_COLUMN);
        if (column == PARQUET_ROW_INDEX_COLUMN) {
            internalFields.add(Optional.empty());
        } else {
            internalFields.add(Optional.ofNullable(getParquetType(column, fileSchema, useColumnNames))
                    .flatMap(field -> {
                        String columnName = useColumnNames
                                ? column.getBaseColumnName()
                                : fileSchema.getFields().get(column.getBaseHiveColumnIndex()).getName();
                        return constructField(column.getBaseType(), lookupColumnByName(messageColumn, columnName));
                    }));
        }
    }
    ConnectorPageSource parquetPageSource = new ParquetPageSource(
            parquetReader,
            trinoTypes.build(),
            rowIndexColumns.build(),
            internalFields.build());
    return new ReaderPageSource(parquetPageSource, readerProjections);
}
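The row-group loop above assigns each row group to exactly one split: a group is read by the split whose byte range contains the offset of its first data page. A toy illustration of that ownership check, with invented offsets:

// Split covers bytes [start, start + length); a row group belongs to this split
// iff its first data page offset falls inside that half-open range.
long start = 64L * 1024 * 1024;          // hypothetical split start
long length = 64L * 1024 * 1024;         // hypothetical split length
long firstDataPage = 70L * 1024 * 1024;  // hypothetical first data page offset of a row group
boolean ownedByThisSplit = start <= firstDataPage && firstDataPage < start + length; // true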
Use of io.trino.spi.type.Type in project trino by trinodb.
The class ParquetPageSourceFactory, method getColumnType.
public static Optional<org.apache.parquet.schema.Type> getColumnType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames) {
    Optional<org.apache.parquet.schema.Type> columnType = getParquetType(messageType, useParquetColumnNames, column);
    if (columnType.isEmpty() || column.getHiveColumnProjectionInfo().isEmpty()) {
        return columnType;
    }
    GroupType baseType = columnType.get().asGroupType();
    ImmutableList.Builder<org.apache.parquet.schema.Type> typeBuilder = ImmutableList.builder();
    org.apache.parquet.schema.Type parentType = baseType;
    // Walk the dereference path, collecting each nested field along the way
    for (String name : column.getHiveColumnProjectionInfo().get().getDereferenceNames()) {
        org.apache.parquet.schema.Type childType = getParquetTypeByName(name, parentType.asGroupType());
        if (childType == null) {
            return Optional.empty();
        }
        typeBuilder.add(childType);
        parentType = childType;
    }
    // Rebuild the nesting from the innermost field outwards, keeping only the selected branch
    List<org.apache.parquet.schema.Type> subfieldTypes = typeBuilder.build();
    org.apache.parquet.schema.Type type = subfieldTypes.get(subfieldTypes.size() - 1);
    for (int i = subfieldTypes.size() - 2; i >= 0; --i) {
        GroupType groupType = subfieldTypes.get(i).asGroupType();
        type = new GroupType(groupType.getRepetition(), groupType.getName(), ImmutableList.of(type));
    }
    return Optional.of(new GroupType(baseType.getRepetition(), baseType.getName(), ImmutableList.of(type)));
}
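For a dereference such as address.zip on a struct column, the result is the base group with only the selected branch retained. A minimal sketch of the input and output shapes using the parquet-mr schema builder (the column and field names here are invented for illustration):

// File schema: message hive_schema { optional group address { optional binary street; optional int32 zip; } }
MessageType fileSchema = Types.buildMessage()
        .addField(Types.optionalGroup()
                .addField(Types.optional(PrimitiveTypeName.BINARY).named("street"))
                .addField(Types.optional(PrimitiveTypeName.INT32).named("zip"))
                .named("address"))
        .named("hive_schema");

// Shape of what getColumnType would return for the projection address.zip:
// optional group address { optional int32 zip; }
GroupType address = fileSchema.getType("address").asGroupType();
org.apache.parquet.schema.Type zip = address.getType("zip");
GroupType pruned = new GroupType(address.getRepetition(), address.getName(), ImmutableList.of(zip));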
Use of io.trino.spi.type.Type in project trino by trinodb.
The class RcFilePageSourceFactory, method createPageSource.
@Override
public Optional<ReaderPageSource> createPageSource(
        Configuration configuration,
        ConnectorSession session,
        Path path,
        long start,
        long length,
        long estimatedFileSize,
        Properties schema,
        List<HiveColumnHandle> columns,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        Optional<AcidInfo> acidInfo,
        OptionalInt bucketNumber,
        boolean originalFile,
        AcidTransaction transaction) {
    RcFileEncoding rcFileEncoding;
    String deserializerClassName = getDeserializerClassName(schema);
    if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
        rcFileEncoding = new BinaryRcFileEncoding(timeZone);
    } else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
        rcFileEncoding = createTextVectorEncoding(schema);
    } else {
        return Optional.empty();
    }
    checkArgument(acidInfo.isEmpty(), "Acid is not supported");
    List<HiveColumnHandle> projectedReaderColumns = columns;
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    if (readerProjections.isPresent()) {
        projectedReaderColumns = readerProjections.get().get().stream()
                .map(HiveColumnHandle.class::cast)
                .collect(toImmutableList());
    }
    RcFileDataSource dataSource;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(path));
        if (estimatedFileSize < BUFFER_SIZE.toBytes()) {
            // Handle potentially imprecise file lengths by reading the footer
            try {
                FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(path.toString(), estimatedFileSize, inputStream, toIntExact(BUFFER_SIZE.toBytes()));
                dataSource = new MemoryRcFileDataSource(new RcFileDataSourceId(path.toString()), fileTail.getTailSlice());
            } finally {
                inputStream.close();
            }
        } else {
            long fileSize = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(path).getLen());
            dataSource = new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats);
        }
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    length = min(dataSource.getSize() - start, length);
    // Split may be empty now that the correct file size is known
    if (length <= 0) {
        return Optional.of(noProjectionAdaptation(new EmptyPageSource()));
    }
    try {
        ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
        HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
        for (HiveColumnHandle column : projectedReaderColumns) {
            readColumns.put(column.getBaseHiveColumnIndex(), column.getHiveType().getType(typeManager, timestampPrecision));
        }
        RcFileReader rcFileReader = new RcFileReader(
                dataSource,
                rcFileEncoding,
                readColumns.buildOrThrow(),
                new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())),
                start,
                length,
                BUFFER_SIZE);
        ConnectorPageSource pageSource = new RcFilePageSource(rcFileReader, projectedReaderColumns);
        return Optional.of(new ReaderPageSource(pageSource, readerProjections));
    } catch (Throwable e) {
        try {
            dataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof RcFileCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
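The length recomputation above handles splits that were generated from an overestimated file size. A toy walk-through of that arithmetic, with invented values:

long fileSize = 1_000;  // real size reported by the data source
long start = 800;       // split offset
long length = 500;      // split length derived from an overestimated file size
length = Math.min(fileSize - start, length); // trimmed to 200
// If start were past the real end of the file, the result would be <= 0 and the
// factory would return an EmptyPageSource instead of constructing a reader.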
Use of io.trino.spi.type.Type in project trino by trinodb.
The class HiveBucketingV2, method hash.
private static int hash(TypeInfo type, Block block, int position) {
    // Hive bucketing hashes NULL values to 0 regardless of type
    if (block.isNull(position)) {
        return 0;
    }
    switch (type.getCategory()) {
        case PRIMITIVE:
            PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) type;
            PrimitiveCategory primitiveCategory = typeInfo.getPrimitiveCategory();
            Type trinoType = requireNonNull(HiveTypeTranslator.fromPrimitiveType(typeInfo));
            switch (primitiveCategory) {
                case BOOLEAN:
                    return trinoType.getBoolean(block, position) ? 1 : 0;
                case BYTE:
                    return SignedBytes.checkedCast(trinoType.getLong(block, position));
                case SHORT:
                    return Murmur3.hash32(bytes(Shorts.checkedCast(trinoType.getLong(block, position))));
                case INT:
                    return Murmur3.hash32(bytes(toIntExact(trinoType.getLong(block, position))));
                case LONG:
                    return Murmur3.hash32(bytes(trinoType.getLong(block, position)));
                case FLOAT:
                    // https://github.com/apache/hive/blob/7dc47faddba9f079bbe2698aaa4d8712e7654f87/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java#L830
                    return Murmur3.hash32(bytes(floatToRawIntBits(floatToIntBits(intBitsToFloat(toIntExact(trinoType.getLong(block, position)))))));
                case DOUBLE:
                    // https://github.com/apache/hive/blob/7dc47faddba9f079bbe2698aaa4d8712e7654f87/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java#L836
                    return Murmur3.hash32(bytes(doubleToRawLongBits(doubleToLongBits(trinoType.getDouble(block, position)))));
                case STRING:
                    return Murmur3.hash32(trinoType.getSlice(block, position).getBytes());
                case VARCHAR:
                    return Murmur3.hash32(trinoType.getSlice(block, position).getBytes());
                case DATE:
                    // day offset from 1970-01-01
                    return Murmur3.hash32(bytes(toIntExact(trinoType.getLong(block, position))));
                case TIMESTAMP:
                    // We do not support bucketing on timestamp
                    break;
                case DECIMAL:
                case CHAR:
                case BINARY:
                case TIMESTAMPLOCALTZ:
                case INTERVAL_YEAR_MONTH:
                case INTERVAL_DAY_TIME:
                    // TODO
                    break;
                case VOID:
                case UNKNOWN:
                    break;
            }
            throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive primitive category: " + primitiveCategory);
        case LIST:
            return hashOfList((ListTypeInfo) type, block.getObject(position, Block.class));
        case MAP:
            return hashOfMap((MapTypeInfo) type, block.getObject(position, Block.class));
        case STRUCT:
        case UNION:
            // fall through to the unsupported-category exception below
    }
    throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive category: " + type.getCategory());
}
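These per-column hashes are then folded into a single bucket number. A minimal standalone sketch of that combination, mirroring Hive's bucketing scheme (the method names here are illustrative, not the connector's API):

// Combine column hashes the way Hive does: result = result * 31 + columnHash
static int getBucketHashCode(int[] columnHashes) {
    int result = 0;
    for (int columnHash : columnHashes) {
        result = result * 31 + columnHash;
    }
    return result;
}

// Map the combined hash to a bucket, masking the sign bit so the modulo is non-negative
static int getBucketNumber(int hashCode, int bucketCount) {
    return (hashCode & Integer.MAX_VALUE) % bucketCount;
}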