Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.
The class HivePageSourceProvider, method createPageSource.
@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit split,
        ConnectorTableHandle tableHandle, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
    HiveTableHandle hiveTable = (HiveTableHandle) tableHandle;
    HiveSplit hiveSplit = (HiveSplit) split;
    if (shouldSkipBucket(hiveTable, hiveSplit, dynamicFilter)) {
        return new EmptyPageSource();
    }
    List<HiveColumnHandle> hiveColumns = columns.stream().map(HiveColumnHandle.class::cast).collect(toList());
    List<HiveColumnHandle> dependencyColumns = hiveColumns.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
    if (hiveTable.isAcidUpdate()) {
        hiveColumns = hiveTable.getUpdateProcessor()
                .orElseThrow(() -> new IllegalArgumentException("update processor not present"))
                .mergeWithNonUpdatedColumns(hiveColumns);
    }
    Path path = new Path(hiveSplit.getPath());
    boolean originalFile = ORIGINAL_FILE_PATH_MATCHER.matcher(path.toString()).matches();
    List<ColumnMapping> columnMappings = ColumnMapping.buildColumnMappings(
            hiveSplit.getPartitionName(), hiveSplit.getPartitionKeys(), hiveColumns,
            hiveSplit.getBucketConversion().map(BucketConversion::getBucketColumnHandles).orElse(ImmutableList.of()),
            hiveSplit.getTableToPartitionMapping(), path, hiveSplit.getBucketNumber(),
            hiveSplit.getEstimatedFileSize(), hiveSplit.getFileModifiedTime());
    // This can happen when dynamic filters are collected after partition splits were listed.
    if (shouldSkipSplit(columnMappings, dynamicFilter)) {
        return new EmptyPageSource();
    }
    Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsContext(session), path);
    TupleDomain<HiveColumnHandle> simplifiedDynamicFilter = dynamicFilter.getCurrentPredicate()
            .transformKeys(HiveColumnHandle.class::cast)
            .simplify(domainCompactionThreshold);
    Optional<ConnectorPageSource> pageSource = createHivePageSource(
            pageSourceFactories, cursorProviders, configuration, session, path,
            hiveSplit.getBucketNumber(), hiveSplit.getStart(), hiveSplit.getLength(), hiveSplit.getEstimatedFileSize(),
            hiveSplit.getSchema(), hiveTable.getCompactEffectivePredicate().intersect(simplifiedDynamicFilter),
            hiveColumns, typeManager, hiveSplit.getBucketConversion(), hiveSplit.getBucketValidation(),
            hiveSplit.isS3SelectPushdownEnabled(), hiveSplit.getAcidInfo(), originalFile,
            hiveTable.getTransaction(), columnMappings);
    if (pageSource.isPresent()) {
        ConnectorPageSource source = pageSource.get();
        if (hiveTable.isAcidDelete() || hiveTable.isAcidUpdate()) {
            checkArgument(orcFileWriterFactory.isPresent(), "orcFileWriterFactory not supplied but required for DELETE and UPDATE");
            HivePageSource hivePageSource = (HivePageSource) source;
            OrcPageSource orcPageSource = (OrcPageSource) hivePageSource.getDelegate();
            ColumnMetadata<OrcType> columnMetadata = orcPageSource.getColumnTypes();
            int acidRowColumnId = originalFile ? 0 : ACID_ROW_STRUCT_COLUMN_ID;
            HiveType rowType = fromOrcTypeToHiveType(columnMetadata.get(new OrcColumnId(acidRowColumnId)), columnMetadata);
            long currentSplitNumber = hiveSplit.getSplitNumber();
            if (currentSplitNumber >= MAX_NUMBER_OF_SPLITS) {
                throw new TrinoException(GENERIC_INSUFFICIENT_RESOURCES,
                        format("Number of splits is higher than maximum possible number of splits %d", MAX_NUMBER_OF_SPLITS));
            }
            long initialRowId = currentSplitNumber << PER_SPLIT_ROW_ID_BITS;
            return new HiveUpdatablePageSource(
                    hiveTable, hiveSplit.getPartitionName(), hiveSplit.getStatementId(), source, typeManager,
                    hiveSplit.getBucketNumber(), path, originalFile, orcFileWriterFactory.get(), configuration, session,
                    rowType, dependencyColumns, hiveTable.getTransaction().getOperation(), initialRowId, MAX_NUMBER_OF_ROWS_PER_SPLIT);
        }
        return source;
    }
    throw new RuntimeException("Could not find a file reader for split " + hiveSplit);
}
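In this page source, the initial row id handed to HiveUpdatablePageSource is derived purely from the split number: the split number is shifted into the high bits, leaving the low bits free for per-row offsets within the split. A minimal standalone sketch of that arithmetic follows; the constant values are illustrative stand-ins, not the ones actually defined in HiveUpdatablePageSource.
public class SplitRowIdSketch {
    // Illustrative values only; the real constants live in HiveUpdatablePageSource.
    private static final int PER_SPLIT_ROW_ID_BITS = 40;
    private static final long MAX_NUMBER_OF_ROWS_PER_SPLIT = 1L << PER_SPLIT_ROW_ID_BITS;
    private static final long MAX_NUMBER_OF_SPLITS = 1L << (63 - PER_SPLIT_ROW_ID_BITS);
    static long initialRowId(long splitNumber) {
        if (splitNumber >= MAX_NUMBER_OF_SPLITS) {
            throw new IllegalStateException("too many splits: " + splitNumber);
        }
        // shifting the split number into the high bits leaves the low bits for row offsets
        return splitNumber << PER_SPLIT_ROW_ID_BITS;
    }
    public static void main(String[] args) {
        System.out.println(initialRowId(0)); // 0
        System.out.println(initialRowId(3)); // 3 * 2^40
        System.out.println(initialRowId(3) + MAX_NUMBER_OF_ROWS_PER_SPLIT - 1); // last id split 3 may assign
    }
}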
Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.
The class ListColumnWriter, method finishRowGroup.
@Override
public Map<OrcColumnId, ColumnStatistics> finishRowGroup() {
    checkState(!closed);
    ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null, null);
    rowGroupColumnStatistics.add(statistics);
    nonNullValueCount = 0;
    ImmutableMap.Builder<OrcColumnId, ColumnStatistics> columnStatistics = ImmutableMap.builder();
    columnStatistics.put(columnId, statistics);
    columnStatistics.putAll(elementWriter.finishRowGroup());
    return columnStatistics.buildOrThrow();
}
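Each column writer reports statistics for its own OrcColumnId and folds in the map returned by its child writer, so the caller ends up with one entry per column in the subtree. A framework-free sketch of that merge pattern, using plain java.util maps and a stand-in Stats record instead of Guava's ImmutableMap and Trino's ColumnStatistics:
import java.util.LinkedHashMap;
import java.util.Map;
public class StatsMergeSketch {
    // stand-in for ColumnStatistics: just tracks a non-null value count
    record Stats(long nonNullCount) {}
    // what a container writer (e.g. a list writer) would return for one row group
    static Map<Integer, Stats> finishRowGroup(int columnId, long nonNullCount, Map<Integer, Stats> childStats) {
        Map<Integer, Stats> result = new LinkedHashMap<>();
        result.put(columnId, new Stats(nonNullCount)); // this column's own statistics
        result.putAll(childStats);                     // statistics of every nested column
        return result;
    }
    public static void main(String[] args) {
        Map<Integer, Stats> element = Map.of(3, new Stats(950)); // element column has id 3
        System.out.println(finishRowGroup(2, 1000, element));    // {2=Stats[nonNullCount=1000], 3=Stats[nonNullCount=950]}
    }
}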
Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.
The class ColumnWriters, method createColumnWriter.
public static ColumnWriter createColumnWriter(OrcColumnId columnId, ColumnMetadata<OrcType> orcTypes, Type type, CompressionKind compression,
        int bufferSize, DataSize stringStatisticsLimit, Supplier<BloomFilterBuilder> bloomFilterBuilder) {
    requireNonNull(type, "type is null");
    OrcType orcType = orcTypes.get(columnId);
    if (type instanceof TimeType) {
        TimeType timeType = (TimeType) type;
        checkArgument(timeType.getPrecision() == 6, "%s not supported for ORC writer", type);
        checkArgument(orcType.getOrcTypeKind() == LONG, "wrong ORC type %s for type %s", orcType, type);
        checkArgument("TIME".equals(orcType.getAttributes().get("iceberg.long-type")), "wrong attributes %s for type %s", orcType.getAttributes(), type);
        return new TimeColumnWriter(columnId, type, compression, bufferSize, () -> new IntegerStatisticsBuilder(bloomFilterBuilder.get()));
    }
    switch (orcType.getOrcTypeKind()) {
        case BOOLEAN:
            return new BooleanColumnWriter(columnId, type, compression, bufferSize);
        case FLOAT:
            return new FloatColumnWriter(columnId, type, compression, bufferSize, () -> new DoubleStatisticsBuilder(bloomFilterBuilder.get()));
        case DOUBLE:
            return new DoubleColumnWriter(columnId, type, compression, bufferSize, () -> new DoubleStatisticsBuilder(bloomFilterBuilder.get()));
        case BYTE:
            return new ByteColumnWriter(columnId, type, compression, bufferSize);
        case DATE:
            return new LongColumnWriter(columnId, type, compression, bufferSize, () -> new DateStatisticsBuilder(bloomFilterBuilder.get()));
        case SHORT:
        case INT:
        case LONG:
            return new LongColumnWriter(columnId, type, compression, bufferSize, () -> new IntegerStatisticsBuilder(bloomFilterBuilder.get()));
        case DECIMAL:
            return new DecimalColumnWriter(columnId, type, compression, bufferSize);
        case TIMESTAMP:
        case TIMESTAMP_INSTANT:
            return new TimestampColumnWriter(columnId, type, compression, bufferSize, () -> new TimestampStatisticsBuilder(bloomFilterBuilder.get()));
        case BINARY:
            return new SliceDirectColumnWriter(columnId, type, compression, bufferSize, BinaryStatisticsBuilder::new);
        case CHAR:
        case VARCHAR:
        case STRING:
            return new SliceDictionaryColumnWriter(columnId, type, compression, bufferSize,
                    () -> new StringStatisticsBuilder(toIntExact(stringStatisticsLimit.toBytes()), bloomFilterBuilder.get()));
        case LIST: {
            OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(0);
            Type fieldType = type.getTypeParameters().get(0);
            ColumnWriter elementWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder);
            return new ListColumnWriter(columnId, compression, bufferSize, elementWriter);
        }
        case MAP: {
            ColumnWriter keyWriter = createColumnWriter(orcType.getFieldTypeIndex(0), orcTypes, type.getTypeParameters().get(0), compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder);
            ColumnWriter valueWriter = createColumnWriter(orcType.getFieldTypeIndex(1), orcTypes, type.getTypeParameters().get(1), compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder);
            return new MapColumnWriter(columnId, compression, bufferSize, keyWriter, valueWriter);
        }
        case STRUCT: {
            ImmutableList.Builder<ColumnWriter> fieldWriters = ImmutableList.builder();
            for (int fieldId = 0; fieldId < orcType.getFieldCount(); fieldId++) {
                OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(fieldId);
                Type fieldType = type.getTypeParameters().get(fieldId);
                fieldWriters.add(createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder));
            }
            return new StructColumnWriter(columnId, compression, bufferSize, fieldWriters.build());
        }
        case UNION:
            // unsupported; falls through to the exception below
    }
    throw new IllegalArgumentException("Unsupported type: " + type);
}
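The LIST, MAP, and STRUCT branches can recurse with orcType.getFieldTypeIndex(i) because ORC assigns column ids in a pre-order (depth-first) walk of the schema tree, so every nested column carries its own OrcColumnId. The following sketch illustrates that numbering scheme; ToyType and assignIds are illustrative stand-ins, not Trino's OrcType API.
import java.util.ArrayList;
import java.util.List;
public class OrcColumnNumberingSketch {
    static class ToyType {
        final String kind;
        final List<ToyType> children = new ArrayList<>();
        int columnId;
        ToyType(String kind, ToyType... fields) {
            this.kind = kind;
            children.addAll(List.of(fields));
        }
    }
    // pre-order walk: a node takes the next id, then its children are numbered left to right
    static int assignIds(ToyType node, int nextId) {
        node.columnId = nextId++;
        for (ToyType child : node.children) {
            nextId = assignIds(child, nextId);
        }
        return nextId;
    }
    public static void main(String[] args) {
        // struct<a:bigint, b:list<varchar>>
        ToyType root = new ToyType("struct", new ToyType("bigint"), new ToyType("list", new ToyType("varchar")));
        assignIds(root, 0);
        System.out.println(root.columnId);                                 // 0: the root struct
        System.out.println(root.children.get(0).columnId);                 // 1: field a
        System.out.println(root.children.get(1).columnId);                 // 2: the list b
        System.out.println(root.children.get(1).children.get(0).columnId); // 3: the list element
    }
}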
Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.
The class PresentOutputStream, method getStreamDataOutput.
public Optional<StreamDataOutput> getStreamDataOutput(OrcColumnId columnId) {
    checkArgument(closed);
    if (booleanOutputStream == null) {
        return Optional.empty();
    }
    StreamDataOutput streamDataOutput = booleanOutputStream.getStreamDataOutput(columnId);
    // rewrite the DATA stream created by the boolean output stream to a PRESENT stream
    Stream stream = new Stream(columnId, PRESENT, toIntExact(streamDataOutput.size()), streamDataOutput.getStream().isUseVInts());
    return Optional.of(new StreamDataOutput(sliceOutput -> {
        streamDataOutput.writeData(sliceOutput);
        return stream.getLength();
    }, stream));
}
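The bytes produced by the boolean DATA stream are reused untouched; only the stream metadata is rewritten to advertise a PRESENT stream before it is handed back. A dependency-free sketch of that relabeling pattern follows; StreamKind, StreamInfo, and TaggedOutput are illustrative stand-ins, not Trino's Stream and StreamDataOutput classes.
import java.util.function.Consumer;
public class PresentRelabelSketch {
    enum StreamKind { DATA, PRESENT }
    record StreamInfo(int columnId, StreamKind kind, int length) {}
    // pairs stream metadata with a callback that writes the payload bytes
    record TaggedOutput(StreamInfo info, Consumer<StringBuilder> payloadWriter) {}
    static TaggedOutput relabelAsPresent(TaggedOutput data) {
        // same payload writer, new metadata: the bytes themselves are untouched
        StreamInfo present = new StreamInfo(data.info().columnId(), StreamKind.PRESENT, data.info().length());
        return new TaggedOutput(present, data.payloadWriter());
    }
    public static void main(String[] args) {
        TaggedOutput dataStream = new TaggedOutput(new StreamInfo(7, StreamKind.DATA, 4), out -> out.append("1011"));
        TaggedOutput presentStream = relabelAsPresent(dataStream);
        StringBuilder sink = new StringBuilder();
        presentStream.payloadWriter().accept(sink);
        System.out.println(presentStream.info() + " payload=" + sink); // StreamInfo[columnId=7, kind=PRESENT, length=4] payload=1011
    }
}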
Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.
The class TestOrcReaderPositions, method testRowGroupSkipping.
@Test
public void testRowGroupSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        // create single strip file with multiple row groups
        int rowCount = 142_000;
        createSequentialFile(tempFile.getFile(), rowCount);
        // test reading two row groups from middle of file
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows == rowCount) {
                return true;
            }
            IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            return (stats.getMin() == 50_000) || (stats.getMin() == 60_000);
        };
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getFileRowCount(), rowCount);
            assertEquals(reader.getReaderRowCount(), rowCount);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);
            long position = 50_000;
            while (true) {
                Page page = reader.nextPage();
                if (page == null) {
                    break;
                }
                page = page.getLoadedPage();
                Block block = page.getBlock(0);
                for (int i = 0; i < block.getPositionCount(); i++) {
                    assertEquals(BIGINT.getLong(block, i), position + i);
                }
                assertEquals(reader.getFilePosition(), position);
                assertEquals(reader.getReaderPosition(), position);
                position += page.getPositionCount();
            }
            assertEquals(position, 70_000);
            assertEquals(reader.getFilePosition(), rowCount);
            assertEquals(reader.getReaderPosition(), rowCount);
        }
    }
}
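The predicate in this test keeps only row groups whose column-1 minimum is 50,000 or 60,000; since the assertions expect exactly rows 50,000 through 69,999, the file is evidently laid out in 10,000-row row groups. A small sketch of that selection logic over stand-in per-group statistics (not the real OrcPredicate or IntegerStatistics types):
import java.util.ArrayList;
import java.util.List;
public class RowGroupSkipSketch {
    // stand-in for the per-row-group IntegerStatistics consulted by the predicate
    record GroupStats(long min, long max) {}
    static boolean keepRowGroup(GroupStats stats) {
        return stats.min() == 50_000 || stats.min() == 60_000;
    }
    public static void main(String[] args) {
        // 142,000 sequential rows split into 10,000-row row groups (last one short)
        List<GroupStats> groups = new ArrayList<>();
        for (long start = 0; start < 142_000; start += 10_000) {
            groups.add(new GroupStats(start, Math.min(start + 9_999, 141_999)));
        }
        long selected = groups.stream().filter(RowGroupSkipSketch::keepRowGroup).count();
        System.out.println(selected); // 2 row groups survive, i.e. rows 50,000-69,999
    }
}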