Use of io.trino.orc.metadata.OrcType in project trino by trinodb.
The class HivePageSourceProvider, method createPageSource.
@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit split, ConnectorTableHandle tableHandle, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
HiveTableHandle hiveTable = (HiveTableHandle) tableHandle;
HiveSplit hiveSplit = (HiveSplit) split;
if (shouldSkipBucket(hiveTable, hiveSplit, dynamicFilter)) {
return new EmptyPageSource();
}
List<HiveColumnHandle> hiveColumns = columns.stream().map(HiveColumnHandle.class::cast).collect(toList());
List<HiveColumnHandle> dependencyColumns = hiveColumns.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
if (hiveTable.isAcidUpdate()) {
hiveColumns = hiveTable.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("update processor not present")).mergeWithNonUpdatedColumns(hiveColumns);
}
Path path = new Path(hiveSplit.getPath());
boolean originalFile = ORIGINAL_FILE_PATH_MATCHER.matcher(path.toString()).matches();
List<ColumnMapping> columnMappings = ColumnMapping.buildColumnMappings(
        hiveSplit.getPartitionName(),
        hiveSplit.getPartitionKeys(),
        hiveColumns,
        hiveSplit.getBucketConversion().map(BucketConversion::getBucketColumnHandles).orElse(ImmutableList.of()),
        hiveSplit.getTableToPartitionMapping(),
        path,
        hiveSplit.getBucketNumber(),
        hiveSplit.getEstimatedFileSize(),
        hiveSplit.getFileModifiedTime());
// This can happen when dynamic filters are collected after partition splits were listed.
if (shouldSkipSplit(columnMappings, dynamicFilter)) {
return new EmptyPageSource();
}
Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsContext(session), path);
TupleDomain<HiveColumnHandle> simplifiedDynamicFilter = dynamicFilter.getCurrentPredicate().transformKeys(HiveColumnHandle.class::cast).simplify(domainCompactionThreshold);
Optional<ConnectorPageSource> pageSource = createHivePageSource(
        pageSourceFactories,
        cursorProviders,
        configuration,
        session,
        path,
        hiveSplit.getBucketNumber(),
        hiveSplit.getStart(),
        hiveSplit.getLength(),
        hiveSplit.getEstimatedFileSize(),
        hiveSplit.getSchema(),
        hiveTable.getCompactEffectivePredicate().intersect(simplifiedDynamicFilter),
        hiveColumns,
        typeManager,
        hiveSplit.getBucketConversion(),
        hiveSplit.getBucketValidation(),
        hiveSplit.isS3SelectPushdownEnabled(),
        hiveSplit.getAcidInfo(),
        originalFile,
        hiveTable.getTransaction(),
        columnMappings);
if (pageSource.isPresent()) {
ConnectorPageSource source = pageSource.get();
if (hiveTable.isAcidDelete() || hiveTable.isAcidUpdate()) {
checkArgument(orcFileWriterFactory.isPresent(), "orcFileWriterFactory not supplied but required for DELETE and UPDATE");
HivePageSource hivePageSource = (HivePageSource) source;
OrcPageSource orcPageSource = (OrcPageSource) hivePageSource.getDelegate();
ColumnMetadata<OrcType> columnMetadata = orcPageSource.getColumnTypes();
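// For "original" (non-ACID) files the row data is the file's root struct (column 0); otherwise it is the dedicated ACID "row" struct column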
int acidRowColumnId = originalFile ? 0 : ACID_ROW_STRUCT_COLUMN_ID;
HiveType rowType = fromOrcTypeToHiveType(columnMetadata.get(new OrcColumnId(acidRowColumnId)), columnMetadata);
long currentSplitNumber = hiveSplit.getSplitNumber();
if (currentSplitNumber >= MAX_NUMBER_OF_SPLITS) {
throw new TrinoException(GENERIC_INSUFFICIENT_RESOURCES, format("Number of splits is higher than maximum possible number of splits %d", MAX_NUMBER_OF_SPLITS));
}
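// Give each split a disjoint row ID range: the split number occupies the high bits, leaving PER_SPLIT_ROW_ID_BITS bits for rows within the split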
long initialRowId = currentSplitNumber << PER_SPLIT_ROW_ID_BITS;
return new HiveUpdatablePageSource(
        hiveTable,
        hiveSplit.getPartitionName(),
        hiveSplit.getStatementId(),
        source,
        typeManager,
        hiveSplit.getBucketNumber(),
        path,
        originalFile,
        orcFileWriterFactory.get(),
        configuration,
        session,
        rowType,
        dependencyColumns,
        hiveTable.getTransaction().getOperation(),
        initialRowId,
        MAX_NUMBER_OF_ROWS_PER_SPLIT);
}
return source;
}
throw new RuntimeException("Could not find a file reader for split " + hiveSplit);
}
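The split-number guard and the shift into initialRowId partition the row ID space so that every split writes row IDs from its own disjoint range. Below is a minimal standalone sketch of that arithmetic; the bit width and the class name are illustrative stand-ins, not the constants Trino defines.

// Standalone sketch of the per-split row ID scheme used above.
// PER_SPLIT_ROW_ID_BITS below is a hypothetical value, not Trino's actual constant.
public final class RowIdPartitioningSketch {
    private static final int PER_SPLIT_ROW_ID_BITS = 40;
    private static final long MAX_NUMBER_OF_ROWS_PER_SPLIT = 1L << PER_SPLIT_ROW_ID_BITS;
    private static final long MAX_NUMBER_OF_SPLITS = 1L << (63 - PER_SPLIT_ROW_ID_BITS);

    private RowIdPartitioningSketch() {}

    // First row ID owned by a split; the next split starts MAX_NUMBER_OF_ROWS_PER_SPLIT
    // IDs later, so the ranges of different splits never overlap.
    static long initialRowId(long splitNumber) {
        if (splitNumber >= MAX_NUMBER_OF_SPLITS) {
            throw new IllegalStateException("Too many splits: " + splitNumber);
        }
        return splitNumber << PER_SPLIT_ROW_ID_BITS;
    }

    public static void main(String[] args) {
        System.out.println(initialRowId(0)); // 0
        System.out.println(initialRowId(1)); // 1099511627776 (2^40)
        System.out.println(initialRowId(2)); // 2199023255552
    }
}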
Use of io.trino.orc.metadata.OrcType in project trino by trinodb.
The class ColumnWriters, method createColumnWriter.
public static ColumnWriter createColumnWriter(OrcColumnId columnId, ColumnMetadata<OrcType> orcTypes, Type type, CompressionKind compression, int bufferSize, DataSize stringStatisticsLimit, Supplier<BloomFilterBuilder> bloomFilterBuilder) {
requireNonNull(type, "type is null");
OrcType orcType = orcTypes.get(columnId);
if (type instanceof TimeType) {
TimeType timeType = (TimeType) type;
checkArgument(timeType.getPrecision() == 6, "%s not supported for ORC writer", type);
checkArgument(orcType.getOrcTypeKind() == LONG, "wrong ORC type %s for type %s", orcType, type);
checkArgument("TIME".equals(orcType.getAttributes().get("iceberg.long-type")), "wrong attributes %s for type %s", orcType.getAttributes(), type);
return new TimeColumnWriter(columnId, type, compression, bufferSize, () -> new IntegerStatisticsBuilder(bloomFilterBuilder.get()));
}
switch(orcType.getOrcTypeKind()) {
case BOOLEAN:
return new BooleanColumnWriter(columnId, type, compression, bufferSize);
case FLOAT:
return new FloatColumnWriter(columnId, type, compression, bufferSize, () -> new DoubleStatisticsBuilder(bloomFilterBuilder.get()));
case DOUBLE:
return new DoubleColumnWriter(columnId, type, compression, bufferSize, () -> new DoubleStatisticsBuilder(bloomFilterBuilder.get()));
case BYTE:
return new ByteColumnWriter(columnId, type, compression, bufferSize);
case DATE:
return new LongColumnWriter(columnId, type, compression, bufferSize, () -> new DateStatisticsBuilder(bloomFilterBuilder.get()));
case SHORT:
case INT:
case LONG:
return new LongColumnWriter(columnId, type, compression, bufferSize, () -> new IntegerStatisticsBuilder(bloomFilterBuilder.get()));
case DECIMAL:
return new DecimalColumnWriter(columnId, type, compression, bufferSize);
case TIMESTAMP:
case TIMESTAMP_INSTANT:
return new TimestampColumnWriter(columnId, type, compression, bufferSize, () -> new TimestampStatisticsBuilder(bloomFilterBuilder.get()));
case BINARY:
return new SliceDirectColumnWriter(columnId, type, compression, bufferSize, BinaryStatisticsBuilder::new);
case CHAR:
case VARCHAR:
case STRING:
return new SliceDictionaryColumnWriter(columnId, type, compression, bufferSize, () -> new StringStatisticsBuilder(toIntExact(stringStatisticsLimit.toBytes()), bloomFilterBuilder.get()));
case LIST:
{
OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(0);
Type fieldType = type.getTypeParameters().get(0);
ColumnWriter elementWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder);
return new ListColumnWriter(columnId, compression, bufferSize, elementWriter);
}
case MAP:
{
ColumnWriter keyWriter = createColumnWriter(orcType.getFieldTypeIndex(0), orcTypes, type.getTypeParameters().get(0), compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder);
ColumnWriter valueWriter = createColumnWriter(orcType.getFieldTypeIndex(1), orcTypes, type.getTypeParameters().get(1), compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder);
return new MapColumnWriter(columnId, compression, bufferSize, keyWriter, valueWriter);
}
case STRUCT:
{
ImmutableList.Builder<ColumnWriter> fieldWriters = ImmutableList.builder();
for (int fieldId = 0; fieldId < orcType.getFieldCount(); fieldId++) {
OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(fieldId);
Type fieldType = type.getTypeParameters().get(fieldId);
fieldWriters.add(createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder));
}
return new StructColumnWriter(columnId, compression, bufferSize, fieldWriters.build());
}
case UNION:
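// UNION has no writer; fall out of the switch to the "Unsupported type" exception below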
}
throw new IllegalArgumentException("Unsupported type: " + type);
}
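createColumnWriter is a recursive factory: scalar ORC kinds map directly to a writer, while LIST, MAP, and STRUCT recurse into their children using the child column IDs from the ORC type. The sketch below shows the same dispatch-and-recurse pattern on a toy type model; ToyType, ToyWriter, and the related classes are invented for illustration and are not Trino classes.

// Toy illustration of the recursive writer-factory pattern above (not Trino's API).
import java.util.ArrayList;
import java.util.List;

record ToyType(int columnId, String kind, List<ToyType> children) {
    static ToyType leaf(int id, String kind) { return new ToyType(id, kind, List.of()); }
}

interface ToyWriter {}
record LeafWriter(int columnId, String kind) implements ToyWriter {}
record ContainerWriter(int columnId, String kind, List<ToyWriter> children) implements ToyWriter {}

final class ToyColumnWriters {
    private ToyColumnWriters() {}

    // Scalars get a leaf writer; containers recurse into each child first,
    // mirroring the LIST/MAP/STRUCT cases of createColumnWriter.
    static ToyWriter createWriter(ToyType type) {
        switch (type.kind()) {
            case "LONG", "DOUBLE", "STRING":
                return new LeafWriter(type.columnId(), type.kind());
            case "LIST", "MAP", "STRUCT": {
                List<ToyWriter> children = new ArrayList<>();
                for (ToyType child : type.children()) {
                    children.add(createWriter(child));
                }
                return new ContainerWriter(type.columnId(), type.kind(), children);
            }
            default:
                throw new IllegalArgumentException("Unsupported type: " + type.kind());
        }
    }

    public static void main(String[] args) {
        // struct<id bigint, tags array<varchar>> as a toy tree
        ToyType schema = new ToyType(0, "STRUCT", List.of(
                ToyType.leaf(1, "LONG"),
                new ToyType(2, "LIST", List.of(ToyType.leaf(3, "STRING")))));
        System.out.println(createWriter(schema));
    }
}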
Use of io.trino.orc.metadata.OrcType in project trino by trinodb.
The class StripeReader, method getRowGroupStatistics.
private static ColumnMetadata<ColumnStatistics> getRowGroupStatistics(ColumnMetadata<OrcType> types, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup) {
requireNonNull(columnIndexes, "columnIndexes is null");
checkArgument(rowGroup >= 0, "rowGroup is negative");
Map<Integer, List<RowGroupIndex>> rowGroupIndexesByColumn = columnIndexes.entrySet().stream().collect(toImmutableMap(entry -> entry.getKey().getColumnId().getId(), Entry::getValue));
List<ColumnStatistics> statistics = new ArrayList<>(types.size());
for (int columnIndex = 0; columnIndex < types.size(); columnIndex++) {
List<RowGroupIndex> rowGroupIndexes = rowGroupIndexesByColumn.get(columnIndex);
if (rowGroupIndexes != null) {
statistics.add(rowGroupIndexes.get(rowGroup).getColumnStatistics());
} else {
statistics.add(null);
}
}
return new ColumnMetadata<>(statistics);
}
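getRowGroupStatistics expands a sparse map of per-column row group indexes into a dense list ordered by column ID, filling null where a column has no index stream. The same alignment pattern in isolation, with illustrative names:

// Self-contained sketch of the alignment done above: a sparse per-column map is
// expanded into a dense list ordered by column ID, with null for missing columns.
// Names here are illustrative, not Trino's.
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

final class RowGroupStatsAlignmentSketch {
    private RowGroupStatsAlignmentSketch() {}

    static <T> List<T> align(int columnCount, Map<Integer, T> statsByColumn) {
        List<T> dense = new ArrayList<>(columnCount);
        for (int column = 0; column < columnCount; column++) {
            dense.add(statsByColumn.get(column)); // null when the column has no index stream
        }
        return dense;
    }

    public static void main(String[] args) {
        // Columns 0 and 2 have statistics; column 1 does not.
        System.out.println(align(3, Map.of(0, "stats-0", 2, "stats-2")));
        // -> [stats-0, null, stats-2]
    }
}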
Use of io.trino.orc.metadata.OrcType in project trino by trinodb.
The class TypeConverter, method toOrcMapType.
private static List<OrcType> toOrcMapType(int nextFieldTypeIndex, Types.MapType mapType, Map<String, String> attributes) {
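// The incoming index is the MAP node's own column ID; the key subtree starts at the next ID and the value subtree starts after the whole key subtree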
nextFieldTypeIndex++;
Map<String, String> keyAttributes = ImmutableMap.<String, String>builder().put(ORC_ICEBERG_ID_KEY, Integer.toString(mapType.keyId())).put(ORC_ICEBERG_REQUIRED_KEY, Boolean.toString(true)).buildOrThrow();
List<OrcType> keyTypes = toOrcType(nextFieldTypeIndex, mapType.keyType(), keyAttributes);
Map<String, String> valueAttributes = ImmutableMap.<String, String>builder().put(ORC_ICEBERG_ID_KEY, Integer.toString(mapType.valueId())).put(ORC_ICEBERG_REQUIRED_KEY, Boolean.toString(mapType.isValueRequired())).buildOrThrow();
List<OrcType> valueTypes = toOrcType(nextFieldTypeIndex + keyTypes.size(), mapType.valueType(), valueAttributes);
List<OrcType> orcTypes = new ArrayList<>();
orcTypes.add(new OrcType(
        OrcTypeKind.MAP,
        ImmutableList.of(new OrcColumnId(nextFieldTypeIndex), new OrcColumnId(nextFieldTypeIndex + keyTypes.size())),
        ImmutableList.of("key", "value"),
        Optional.empty(),
        Optional.empty(),
        Optional.empty(),
        attributes));
orcTypes.addAll(keyTypes);
orcTypes.addAll(valueTypes);
return orcTypes;
}
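The column IDs are assigned depth first: the MAP node takes the incoming index, its key subtree starts at the next index, and its value subtree starts after every key subtype. A minimal sketch of that arithmetic, with made-up indexes:

// Minimal sketch of the column ID arithmetic in toOrcMapType (names and values are illustrative).
final class MapColumnIdSketch {
    private MapColumnIdSketch() {}

    public static void main(String[] args) {
        int mapColumnId = 3;          // hypothetical position of the MAP node
        int keySubtreeSize = 1;       // e.g. the key is a plain STRING
        int keyColumnId = mapColumnId + 1;                  // key follows the map node
        int valueColumnId = keyColumnId + keySubtreeSize;   // value follows the whole key subtree
        System.out.printf("map=%d key=%d value=%d%n", mapColumnId, keyColumnId, valueColumnId);
        // -> map=3 key=4 value=5
    }
}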
Use of io.trino.orc.metadata.OrcType in project trino by trinodb.
The class IcebergOrcFileWriter, method computeMetrics.
private static Metrics computeMetrics(MetricsConfig metricsConfig, Schema icebergSchema, ColumnMetadata<OrcType> orcColumns, long fileRowCount, Optional<ColumnMetadata<ColumnStatistics>> columnStatistics) {
if (columnStatistics.isEmpty()) {
return new Metrics(fileRowCount, null, null, null, null, null, null);
}
// Columns that are descendants of LIST or MAP types are excluded because:
// 1. Their stats are not used by Apache Iceberg to filter out data files
// 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
// See https://github.com/apache/iceberg/pull/199#discussion_r429443627
Set<OrcColumnId> excludedColumns = getExcludedColumns(orcColumns);
ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();
// OrcColumnId(0) is the root column that represents file-level schema
for (int i = 1; i < orcColumns.size(); i++) {
OrcColumnId orcColumnId = new OrcColumnId(i);
if (excludedColumns.contains(orcColumnId)) {
continue;
}
OrcType orcColumn = orcColumns.get(orcColumnId);
ColumnStatistics orcColumnStats = columnStatistics.get().get(orcColumnId);
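// The Iceberg field ID was attached to each ORC type as an attribute when the schema was converted (see ORC_ICEBERG_ID_KEY in TypeConverter above); getIcebergId reads it back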
int icebergId = getIcebergId(orcColumn);
Types.NestedField icebergField = icebergSchema.findField(icebergId);
MetricsModes.MetricsMode metricsMode = MetricsUtil.metricsMode(icebergSchema, metricsConfig, icebergId);
if (metricsMode.equals(MetricsModes.None.get())) {
continue;
}
verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
valueCountsBuilder.put(icebergId, fileRowCount);
if (orcColumnStats.hasNumberOfValues()) {
nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
}
if (!metricsMode.equals(MetricsModes.Counts.get())) {
toIcebergMinMax(orcColumnStats, icebergField.type(), metricsMode).ifPresent(minMax -> {
lowerBoundsBuilder.put(icebergId, minMax.getMin());
upperBoundsBuilder.put(icebergId, minMax.getMax());
});
}
}
Map<Integer, Long> valueCounts = valueCountsBuilder.buildOrThrow();
Map<Integer, Long> nullCounts = nullCountsBuilder.buildOrThrow();
Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.buildOrThrow();
Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.buildOrThrow();
return new Metrics(
        fileRowCount,
        null, // TODO: Add column size accounting to ORC column writers
        valueCounts.isEmpty() ? null : valueCounts,
        nullCounts.isEmpty() ? null : nullCounts,
        null, // TODO: Add nanValueCounts to ORC writer
        lowerBounds.isEmpty() ? null : lowerBounds,
        upperBounds.isEmpty() ? null : upperBounds);
}
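Null counts are derived indirectly from the ORC column statistics: a column's statistics carry the number of non-null values, so computeMetrics records fileRowCount minus that number. A minimal sketch of that derivation with made-up field IDs and counts:

// Minimal sketch of the null-count derivation used above (field IDs and counts are made up).
import java.util.LinkedHashMap;
import java.util.Map;

final class NullCountSketch {
    private NullCountSketch() {}

    // ORC column statistics record how many non-null values a column has, so
    // nulls = total rows in the file - non-null values.
    static Map<Integer, Long> nullCounts(long fileRowCount, Map<Integer, Long> nonNullValuesByFieldId) {
        Map<Integer, Long> nulls = new LinkedHashMap<>();
        nonNullValuesByFieldId.forEach((fieldId, nonNull) -> nulls.put(fieldId, fileRowCount - nonNull));
        return nulls;
    }

    public static void main(String[] args) {
        // 1000 rows; field 1 has 1000 non-null values, field 2 has 750.
        System.out.println(nullCounts(1000, Map.of(1, 1000L, 2, 750L)));
        // e.g. {1=0, 2=250} (Map.of iteration order is unspecified)
    }
}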