Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From class ByteColumnWriter, method getIndexStreams:
@Override
public List<StreamDataOutput> getIndexStreams() throws IOException {
    checkState(closed);
    ImmutableList.Builder<RowGroupIndex> rowGroupIndexes = ImmutableList.builder();
    List<ByteStreamCheckpoint> dataCheckpoints = dataStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();
    for (int i = 0; i < rowGroupColumnStatistics.size(); i++) {
        int groupId = i;
        ColumnStatistics columnStatistics = rowGroupColumnStatistics.get(groupId);
        ByteStreamCheckpoint dataCheckpoint = dataCheckpoints.get(groupId);
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.map(checkpoints -> checkpoints.get(groupId));
        List<Integer> positions = createByteColumnPositionList(compressed, dataCheckpoint, presentCheckpoint);
        rowGroupIndexes.add(new RowGroupIndex(positions, columnStatistics));
    }
    Slice slice = metadataWriter.writeRowIndexes(rowGroupIndexes.build());
    Stream stream = new Stream(column, StreamKind.ROW_INDEX, slice.length(), false);
    return ImmutableList.of(new StreamDataOutput(slice, stream));
}
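Each RowGroupIndex pairs a position list with that row group's ColumnStatistics. For context, a position list is a flat list of integers that a reader interprets as seek offsets into the column's streams; the sketch below illustrates the idea for a compressed stream. The class and offset names are hypothetical, not Presto's representation:

import java.util.List;
import com.google.common.collect.ImmutableList;

// Illustrative only: for a compressed stream, a row group position typically
// records where the compressed block starts and the offset of the row group
// within the decompressed block; an uncompressed stream needs just one offset.
public final class PositionListSketch {
    public static List<Integer> compressedPositions(int compressedBlockOffset, int decompressedOffset) {
        return ImmutableList.of(compressedBlockOffset, decompressedOffset);
    }

    public static void main(String[] args) {
        System.out.println(compressedPositions(4096, 128)); // [4096, 128]
    }
}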
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From class TestStripeReader, method testRowSize:
@Test
public void testRowSize() {
    int numberOfEntries = 10_000;
    long numRowsInGroup = MILLION;
    IntegerStatistics integerStatistics = new IntegerStatistics(0L, 0L, 0L);
    ColumnStatistics intColumnStatistics = new IntegerColumnStatistics(numRowsInGroup, null, integerStatistics);
    ColumnStatistics mapColumnStatistics = new ColumnStatistics(numRowsInGroup, null);
    ColumnStatistics mapKeyColumnStatistics = new IntegerColumnStatistics(numRowsInGroup * numberOfEntries, null, integerStatistics);
    ColumnStatistics mapValueColumnStatistics = new IntegerColumnStatistics(numRowsInGroup * numberOfEntries, null, integerStatistics);
    StreamId intStreamId = new StreamId(1, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapStreamId = new StreamId(2, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapKeyStreamId = new StreamId(3, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapValueStreamId = new StreamId(4, 0, Stream.StreamKind.ROW_INDEX);
    Map<StreamId, List<RowGroupIndex>> columnIndexes = ImmutableMap.of(
            intStreamId, createRowGroupIndex(intColumnStatistics),
            mapStreamId, createRowGroupIndex(mapColumnStatistics),
            mapKeyStreamId, createRowGroupIndex(mapKeyColumnStatistics),
            mapValueStreamId, createRowGroupIndex(mapValueColumnStatistics));
    // Each row contains 1 integer plus 2 * numberOfEntries integers (a key and a value per map entry).
    long expectedRowSize = INTEGER_VALUE_BYTES + 2 * numberOfEntries * INTEGER_VALUE_BYTES;
    RowGroup rowGroup = StripeReader.createRowGroup(0, Long.MAX_VALUE, numRowsInGroup, columnIndexes, ImmutableMap.of(), ImmutableMap.of());
    assertEquals(expectedRowSize, rowGroup.getMinAverageRowBytes());
}
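The test relies on a createRowGroupIndex helper that is not shown in this excerpt. A plausible minimal version, reusing the RowGroupIndex(positions, columnStatistics) constructor seen in the writer snippet above (an assumption about the helper, not the actual test source):

private static List<RowGroupIndex> createRowGroupIndex(ColumnStatistics columnStatistics) {
    // Positions are irrelevant to the row-size computation under test,
    // so a single row group with an empty position list is enough.
    return ImmutableList.of(new RowGroupIndex(ImmutableList.of(), columnStatistics));
}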
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From class AggregatedOrcPageSource, method writeNonNullCount:
private void writeNonNullCount(int columnIndex, BlockBuilder blockBuilder) {
    // File-level statistics are indexed from 1; index 0 holds the root struct.
    ColumnStatistics columnStatistics = footer.getFileStats().get(columnIndex + 1);
    if (!columnStatistics.hasNumberOfValues()) {
        throw new UnsupportedOperationException("Number of values not set for orc file. Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again");
    }
    blockBuilder.writeLong(columnStatistics.getNumberOfValues());
}
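getNumberOfValues() is the non-null value count recorded in the file footer, which is exactly what a pushed-down count(column) needs; count(*) would instead use the footer's total row count. A hedged sketch of the derived null count (a standalone illustration, not Presto code):

// Illustrative only: given the footer's total row count (e.g. Footer.getNumberOfRows())
// and a column's non-null count, the null count falls out by subtraction.
private static long nullCount(long totalRows, long nonNullValues) {
    return totalRows - nonNullValues;
}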
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From class AggregatedOrcPageSource, method writeMinMax:
private void writeMinMax(int columnIndex, Type type, HiveType hiveType, BlockBuilder blockBuilder, boolean isMin) {
    ColumnStatistics columnStatistics = footer.getFileStats().get(columnIndex + 1);
    OrcType orcType = footer.getTypes().get(columnIndex + 1);
    if (type instanceof FixedWidthType) {
        completedBytes += ((FixedWidthType) type).getFixedSize();
    }
    String orcNoMinMaxMessage = "No min/max found for orc file. Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again";
    switch (orcType.getOrcTypeKind()) {
        case SHORT:
        case INT:
        case LONG: {
            Long value = isMin ? columnStatistics.getIntegerStatistics().getMin() : columnStatistics.getIntegerStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                blockBuilder.writeLong(value);
            }
            break;
        }
        case TIMESTAMP:
        case DATE: {
            Integer value = isMin ? columnStatistics.getDateStatistics().getMin() : columnStatistics.getDateStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                blockBuilder.writeLong(Long.valueOf(value));
            }
            break;
        }
        case VARCHAR:
        case CHAR:
        case STRING: {
            Slice value = isMin ? columnStatistics.getStringStatistics().getMin() : columnStatistics.getStringStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                blockBuilder.writeBytes(value, 0, value.length()).closeEntry();
                completedBytes += value.length();
            }
            break;
        }
        case FLOAT: {
            Double value = isMin ? columnStatistics.getDoubleStatistics().getMin() : columnStatistics.getDoubleStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                blockBuilder.writeLong(floatToRawIntBits(value.floatValue()));
            }
            break;
        }
        case DOUBLE: {
            Double value = isMin ? columnStatistics.getDoubleStatistics().getMin() : columnStatistics.getDoubleStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                type.writeDouble(blockBuilder, value);
            }
            break;
        }
        case DECIMAL: {
            BigDecimal value = isMin ? columnStatistics.getDecimalStatistics().getMin() : columnStatistics.getDecimalStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                Type definedType = hiveType.getType(typeManager);
                if (Decimals.isShortDecimal(definedType)) {
                    blockBuilder.writeLong(value.unscaledValue().longValue());
                }
                else {
                    type.writeSlice(blockBuilder, Decimals.encodeUnscaledValue(value.unscaledValue()));
                }
            }
            break;
        }
        case BYTE:
        case BOOLEAN:
        case BINARY:
        case UNION:
        case LIST:
        case STRUCT:
        case MAP:
        default:
            throw new IllegalArgumentException("Unsupported type: " + orcType.getOrcTypeKind());
    }
}
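Note the FLOAT case: the min/max is stored as raw int bits widened to a long, matching how REAL values live in long-backed blocks. A small standalone illustration of that round trip (plain JDK, no Presto dependencies):

import static java.lang.Float.floatToRawIntBits;
import static java.lang.Float.intBitsToFloat;

public final class RealBitsRoundTrip {
    public static void main(String[] args) {
        float min = 1.5f;
        long stored = floatToRawIntBits(min);           // what writeLong receives
        float recovered = intBitsToFloat((int) stored); // what a reader decodes
        System.out.println(recovered);                  // prints 1.5
    }
}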
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From class StripeReader, method readStripe:
public Stripe readStripe(StripeInformation stripe, OrcAggregatedMemoryContext systemMemoryUsage, Optional<DwrfEncryptionInfo> decryptors, SharedBuffer sharedDecompressionBuffer) throws IOException {
    StripeId stripeId = new StripeId(orcDataSource.getId(), stripe.getOffset());

    // read the stripe footer
    StripeFooter stripeFooter = readStripeFooter(stripeId, stripe, systemMemoryUsage);

    // get streams for selected columns
    List<List<Stream>> allStreams = new ArrayList<>();
    allStreams.add(stripeFooter.getStreams());
    Map<StreamId, Stream> includedStreams = new HashMap<>();
    boolean hasRowGroupDictionary = addIncludedStreams(stripeFooter.getColumnEncodings(), stripeFooter.getStreams(), includedStreams);
    Map<Integer, ColumnEncoding> columnEncodings = new HashMap<>();
    Map<Integer, ColumnEncoding> stripeFooterEncodings = stripeFooter.getColumnEncodings();
    columnEncodings.putAll(stripeFooterEncodings);

    // included columns may be encrypted
    if (decryptors.isPresent()) {
        List<Slice> encryptedEncryptionGroups = stripeFooter.getStripeEncryptionGroups();
        for (Integer groupId : decryptors.get().getEncryptorGroupIds()) {
            StripeEncryptionGroup stripeEncryptionGroup = getStripeEncryptionGroup(decryptors.get().getEncryptorByGroupId(groupId), encryptedEncryptionGroups.get(groupId), dwrfEncryptionGroupColumns.get(groupId), systemMemoryUsage);
            allStreams.add(stripeEncryptionGroup.getStreams());
            columnEncodings.putAll(stripeEncryptionGroup.getColumnEncodings());
            boolean encryptedHasRowGroupDictionary = addIncludedStreams(stripeEncryptionGroup.getColumnEncodings(), stripeEncryptionGroup.getStreams(), includedStreams);
            hasRowGroupDictionary = encryptedHasRowGroupDictionary || hasRowGroupDictionary;
        }
    }

    // handle stripes with more than one row group or a dictionary
    boolean invalidCheckPoint = false;
    if ((stripe.getNumberOfRows() > rowsInRowGroup) || hasRowGroupDictionary) {
        // determine ranges of the stripe to read
        Map<StreamId, DiskRange> diskRanges = getDiskRanges(allStreams);
        diskRanges = Maps.filterKeys(diskRanges, Predicates.in(includedStreams.keySet()));

        // read the file regions
        Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripeId, diskRanges, systemMemoryUsage, decryptors, sharedDecompressionBuffer);

        // read the row index for each column
        Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(includedStreams, streamsData, stripeId);
        if (writeValidation.isPresent()) {
            writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes);
        }

        // select the row groups matching the tuple domain
        Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);

        // if all row groups are skipped, return null
        if (selectedRowGroups.isEmpty()) {
            // set accounted memory usage to zero
            systemMemoryUsage.close();
            return null;
        }

        // value streams
        Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(includedStreams, streamsData, columnEncodings);

        // build the dictionary streams
        InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(includedStreams, valueStreams, columnEncodings);

        // build the row groups
        try {
            List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), includedStreams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
            return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources);
        }
        catch (InvalidCheckpointException e) {
            // we must fail because the length of the row group dictionary is contained in the checkpoint stream
            if (hasRowGroupDictionary) {
                throw new OrcCorruptionException(e, orcDataSource.getId(), "Checkpoints are corrupt");
            }
            invalidCheckPoint = true;
        }
    }

    // stripe only has one row group and no dictionary
    ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
    for (Entry<StreamId, DiskRange> entry : getDiskRanges(allStreams).entrySet()) {
        StreamId streamId = entry.getKey();
        if (includedStreams.keySet().contains(streamId)) {
            diskRangesBuilder.put(entry);
        }
    }
    ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();

    // read the file regions
    Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripeId, diskRanges, systemMemoryUsage, decryptors, sharedDecompressionBuffer);

    long totalBytes = 0;
    for (Entry<StreamId, Stream> entry : includedStreams.entrySet()) {
        if (entry.getKey().getStreamKind() == ROW_INDEX) {
            List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, streamsData.get(entry.getKey()), null);
            checkState(rowGroupIndexes.size() == 1 || invalidCheckPoint, "expect a single row group or an invalid check point");
            for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
                ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics();
                if (columnStatistics.hasMinAverageValueSizeInBytes()) {
                    totalBytes += columnStatistics.getTotalValueSizeInBytes();
                }
            }
        }
    }

    // value streams
    Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(includedStreams, streamsData, columnEncodings);

    // build the dictionary streams
    InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(includedStreams, valueStreams, columnEncodings);

    // build the row group
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, ValueInputStream<?>> entry : valueStreams.entrySet()) {
        builder.put(entry.getKey(), new ValueInputStreamSource<>(entry.getValue()));
    }
    RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), totalBytes, new InputStreamSources(builder.build()));
    return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
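One subtlety worth calling out: readStripe returns null when every row group in the stripe is pruned by the tuple domain, so callers must treat null as "skip this stripe". A hedged sketch of a consuming loop (the surrounding variables are assumptions for illustration, not Presto source):

// Illustrative only: iterate the stripes listed in the footer and skip pruned ones.
for (StripeInformation stripeInfo : footer.getStripes()) {
    Stripe stripe = stripeReader.readStripe(stripeInfo, memoryContext, decryptors, sharedDecompressionBuffer);
    if (stripe == null) {
        continue; // all row groups skipped; accounted memory was already released
    }
    // ... wire stripe.getRowGroups() and dictionary stream sources into column readers ...
}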