Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.
From class TestOrcReaderPositions, method testStripeSkipping:
@Test
public void testStripeSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        createMultiStripeFile(tempFile.getFile());

        // test reading second and fourth stripes
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows == 100) {
                return true;
            }
            IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            return ((stats.getMin() == 60) && (stats.getMax() == 117)) || ((stats.getMin() == 180) && (stats.getMax() == 237));
        };

        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getFileRowCount(), 100);
            assertEquals(reader.getReaderRowCount(), 40);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);

            // second stripe
            Page page = reader.nextPage().getLoadedPage();
            assertEquals(page.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 0);
            assertEquals(reader.getFilePosition(), 20);
            assertCurrentBatch(page, 1);

            // fourth stripe
            page = reader.nextPage().getLoadedPage();
            assertEquals(page.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 20);
            assertEquals(reader.getFilePosition(), 60);
            assertCurrentBatch(page, 3);

            page = reader.nextPage();
            assertNull(page);
            assertEquals(reader.getReaderPosition(), 40);
            assertEquals(reader.getFilePosition(), 100);
        }
    }
}
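The helpers createMultiStripeFile and assertCurrentBatch are not shown on this page. The min/max pairs the predicate matches imply a file of 100 BIGINT rows carrying the values 0, 3, ..., 297, flushed every 20 rows into five stripes, so the second stripe spans 60..117 and the fourth spans 180..237. A minimal sketch of assertCurrentBatch under that assumption (not the project's actual helper):

// Hypothetical sketch, assuming row i of a zero-based stripe index carries
// the value (stripe * 20 + i) * 3; the real helper in trino may differ.
private static void assertCurrentBatch(Page page, int stripe) {
    Block block = page.getBlock(0);
    for (int i = 0; i < block.getPositionCount(); i++) {
        assertEquals(BIGINT.getLong(block, i), (stripe * 20L + i) * 3L);
    }
}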
Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.
From class StructColumnWriter, method finishRowGroup:
@Override
public Map<OrcColumnId, ColumnStatistics> finishRowGroup() {
    checkState(!closed);
    ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null, null);
    rowGroupColumnStatistics.add(statistics);
    nonNullValueCount = 0;

    ImmutableMap.Builder<OrcColumnId, ColumnStatistics> columnStatistics = ImmutableMap.builder();
    columnStatistics.put(columnId, statistics);
    structFields.stream().map(ColumnWriter::finishRowGroup).forEach(columnStatistics::putAll);
    return columnStatistics.buildOrThrow();
}
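Because every nested field reports its statistics under its own OrcColumnId, the returned map is flat: the struct column itself contributes a presence-only entry (a non-null count with no min/max), and each descendant column appends its own entry. A hedged caller-side sketch of merging such per-writer maps for a whole row group (illustrative only; columnWriters is a hypothetical collection of top-level writers):

// Illustrative sketch: merge the flat per-writer maps into one map for the row group.
ImmutableMap.Builder<OrcColumnId, ColumnStatistics> rowGroupStats = ImmutableMap.builder();
for (ColumnWriter writer : columnWriters) {
    rowGroupStats.putAll(writer.finishRowGroup());
}
Map<OrcColumnId, ColumnStatistics> statsByColumn = rowGroupStats.buildOrThrow();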
Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.
From class TestBooleanStream, method testWriteMultiple:
@Test
public void testWriteMultiple() throws IOException {
    BooleanOutputStream outputStream = createValueOutputStream();
    for (int i = 0; i < 3; i++) {
        outputStream.reset();
        BooleanList expectedValues = new BooleanArrayList(1024);
        outputStream.writeBooleans(32, true);
        expectedValues.addAll(Collections.nCopies(32, true));
        outputStream.writeBooleans(32, false);
        expectedValues.addAll(Collections.nCopies(32, false));
        outputStream.writeBooleans(1, true);
        expectedValues.add(true);
        outputStream.writeBooleans(1, false);
        expectedValues.add(false);
        outputStream.writeBooleans(34, true);
        expectedValues.addAll(Collections.nCopies(34, true));
        outputStream.writeBooleans(34, false);
        expectedValues.addAll(Collections.nCopies(34, false));
        outputStream.writeBoolean(true);
        expectedValues.add(true);
        outputStream.writeBoolean(false);
        expectedValues.add(false);
        outputStream.close();

        DynamicSliceOutput sliceOutput = new DynamicSliceOutput(1000);
        StreamDataOutput streamDataOutput = outputStream.getStreamDataOutput(new OrcColumnId(33));
        streamDataOutput.writeData(sliceOutput);
        Stream stream = streamDataOutput.getStream();
        assertEquals(stream.getStreamKind(), StreamKind.DATA);
        assertEquals(stream.getColumnId(), new OrcColumnId(33));
        assertEquals(stream.getLength(), sliceOutput.size());

        BooleanInputStream valueStream = createValueStream(sliceOutput.slice());
        for (int index = 0; index < expectedValues.size(); index++) {
            boolean expectedValue = expectedValues.getBoolean(index);
            boolean actualValue = readValue(valueStream);
            assertEquals(actualValue, expectedValue);
        }
    }
}
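The mix of run lengths is deliberate: booleans are bit-packed eight to a byte, so runs of 32 exercise whole-byte writes, single values exercise the partial-byte path, and runs of 34 force a run to straddle byte boundaries. An illustrative, self-contained sketch of MSB-first bit packing (not the BooleanOutputStream implementation, which additionally run-length encodes the packed bytes):

// Illustrative only: pack booleans most-significant-bit first, eight per byte.
static byte[] packBits(boolean[] values) {
    byte[] packed = new byte[(values.length + 7) / 8];
    for (int i = 0; i < values.length; i++) {
        if (values[i]) {
            packed[i / 8] |= (byte) (0x80 >>> (i % 8));
        }
    }
    return packed;
}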
Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.
From class AbstractTestValueStream, method testWriteValue:
protected void testWriteValue(List<List<T>> groups) throws IOException {
    W outputStream = createValueOutputStream();
    for (int i = 0; i < 3; i++) {
        outputStream.reset();
        long retainedBytes = 0;
        for (List<T> group : groups) {
            outputStream.recordCheckpoint();
            group.forEach(value -> writeValue(outputStream, value));
            assertTrue(outputStream.getRetainedBytes() >= retainedBytes);
            retainedBytes = outputStream.getRetainedBytes();
        }
        outputStream.close();

        DynamicSliceOutput sliceOutput = new DynamicSliceOutput(1000);
        StreamDataOutput streamDataOutput = outputStream.getStreamDataOutput(new OrcColumnId(33));
        streamDataOutput.writeData(sliceOutput);
        Stream stream = streamDataOutput.getStream();
        assertEquals(stream.getStreamKind(), StreamKind.DATA);
        assertEquals(stream.getColumnId(), new OrcColumnId(33));
        assertEquals(stream.getLength(), sliceOutput.size());

        List<C> checkpoints = outputStream.getCheckpoints();
        assertEquals(checkpoints.size(), groups.size());

        // read all groups sequentially
        R valueStream = createValueStream(sliceOutput.slice());
        for (List<T> group : groups) {
            int index = 0;
            for (T expectedValue : group) {
                index++;
                T actualValue = readValue(valueStream);
                if (!actualValue.equals(expectedValue)) {
                    assertEquals(actualValue, expectedValue, "index=" + index);
                }
            }
        }

        // seek to each checkpoint, in reverse order, and re-read that group
        for (int groupIndex = groups.size() - 1; groupIndex >= 0; groupIndex--) {
            valueStream.seekToCheckpoint(checkpoints.get(groupIndex));
            for (T expectedValue : groups.get(groupIndex)) {
                T actualValue = readValue(valueStream);
                if (!actualValue.equals(expectedValue)) {
                    assertEquals(actualValue, expectedValue);
                }
            }
        }
    }
}
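The test is generic over the value type T, the output stream W, its checkpoint C, and the input stream R, so concrete stream tests only supply the write/read primitives. The contract being exercised is that a checkpoint recorded at a group boundary during the write lets the reader jump back to that boundary later. A self-contained toy illustrating that contract (deliberately not the trino stream classes, whose checkpoints also capture compressed-block offsets and bit positions):

import java.util.ArrayList;
import java.util.List;

// Toy stream: checkpoints are recorded at group boundaries while writing,
// and seekToCheckpoint restores the read position to any recorded boundary.
final class ToyStream {
    private final List<Long> values = new ArrayList<>();
    private final List<Integer> checkpoints = new ArrayList<>();
    private int readPosition;

    void recordCheckpoint() {
        checkpoints.add(values.size()); // offset of the next value to be written
    }

    void write(long value) {
        values.add(value);
    }

    List<Integer> getCheckpoints() {
        return checkpoints;
    }

    void seekToCheckpoint(int checkpoint) {
        readPosition = checkpoint; // jump back to a group boundary
    }

    long next() {
        return values.get(readPosition++);
    }
}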
Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.
From class StripeReader, method readStripe:
public Stripe readStripe(StripeInformation stripe, AggregatedMemoryContext memoryUsage) throws IOException {
    // read the stripe footer
    StripeFooter stripeFooter = readStripeFooter(stripe, memoryUsage);
    ColumnMetadata<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings();
    if (writeValidation.isPresent()) {
        writeValidation.get().validateTimeZone(orcDataSource.getId(), stripeFooter.getTimeZone());
    }
    ZoneId fileTimeZone = stripeFooter.getTimeZone();

    // get streams for selected columns
    Map<StreamId, Stream> streams = new HashMap<>();
    for (Stream stream : stripeFooter.getStreams()) {
        if (includedOrcColumnIds.contains(stream.getColumnId()) && isSupportedStreamType(stream, types.get(stream.getColumnId()).getOrcTypeKind())) {
            streams.put(new StreamId(stream), stream);
        }
    }

    // handle stripes with more than one row group
    boolean invalidCheckPoint = false;
    if (rowsInRowGroup.isPresent() && stripe.getNumberOfRows() > rowsInRowGroup.getAsInt()) {
        // determine ranges of the stripe to read
        Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams());
        diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet()));

        // read the file regions
        Map<StreamId, OrcChunkLoader> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, memoryUsage);

        // read the bloom filter for each column
        Map<OrcColumnId, List<BloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData);

        // read the row index for each column
        Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes);
        if (writeValidation.isPresent()) {
            writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes);
        }

        // select the row groups matching the tuple domain
        Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);

        // if all row groups are skipped, return null
        if (selectedRowGroups.isEmpty()) {
            // set accounted memory usage to zero
            memoryUsage.close();
            return null;
        }

        // value streams
        Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);

        // build the dictionary streams
        InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);

        // build the row groups
        try {
            List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), streams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
            return new Stripe(stripe.getNumberOfRows(), fileTimeZone, columnEncodings, rowGroups, dictionaryStreamSources);
        } catch (InvalidCheckpointException e) {
            // the ORC file contains a corrupt checkpoint stream; treat the stripe as a single row group
            invalidCheckPoint = true;
        }
    }

    // stripe only has one row group
    ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
    for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) {
        StreamId streamId = entry.getKey();
        if (streams.containsKey(streamId)) {
            diskRangesBuilder.put(entry);
        }
    }
    ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.buildOrThrow();

    // read the file regions
    Map<StreamId, OrcChunkLoader> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, memoryUsage);

    long minAverageRowBytes = 0;
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        if (entry.getKey().getStreamKind() == ROW_INDEX) {
            List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, new OrcInputStream(streamsData.get(entry.getKey())));
            checkState(rowGroupIndexes.size() == 1 || invalidCheckPoint, "expect a single row group or an invalid check point");
            long totalBytes = 0;
            long totalRows = 0;
            for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
                ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics();
                if (columnStatistics.hasMinAverageValueSizeInBytes()) {
                    totalBytes += columnStatistics.getMinAverageValueSizeInBytes() * columnStatistics.getNumberOfValues();
                    totalRows += columnStatistics.getNumberOfValues();
                }
            }
            if (totalRows > 0) {
                minAverageRowBytes += totalBytes / totalRows;
            }
        }
    }

    // value streams
    Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);

    // build the dictionary streams
    InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);

    // build the row group
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, ValueInputStream<?>> entry : valueStreams.entrySet()) {
        builder.put(entry.getKey(), new ValueInputStreamSource<>(entry.getValue()));
    }
    RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), minAverageRowBytes, new InputStreamSources(builder.buildOrThrow()));
    return new Stripe(stripe.getNumberOfRows(), fileTimeZone, columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
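readStripe thus has two paths: when a stripe holds multiple row groups, the row indexes (and optional bloom filters) are read so non-matching row groups can be pruned, returning null if everything is pruned; otherwise, including when a corrupt checkpoint stream is detected, the whole stripe is read as a single row group. A hedged caller-side sketch of consuming the result (accessor names such as getRowGroups and newAggregatedMemoryContext are assumptions based on the types used above, not confirmed API):

// Hypothetical caller sketch; names outside the snippet above are assumptions.
for (StripeInformation stripeInfo : stripes) {
    Stripe stripe = stripeReader.readStripe(stripeInfo, memoryUsage.newAggregatedMemoryContext());
    if (stripe == null) {
        continue; // every row group in this stripe was pruned by the predicate
    }
    for (RowGroup rowGroup : stripe.getRowGroups()) {
        // wire the row group's stream sources into column readers and read the rows
    }
}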