Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
The class AggregatedOrcPageSource, method writeMinMax.
private void writeMinMax(int columnIndex, Type type, HiveType hiveType, BlockBuilder blockBuilder, boolean isMin)
{
    // File statistics and the type list are offset by one: entry 0 is the root struct.
    ColumnStatistics columnStatistics = footer.getFileStats().get(columnIndex + 1);
    OrcType orcType = footer.getTypes().get(columnIndex + 1);
    if (type instanceof FixedWidthType) {
        completedBytes += ((FixedWidthType) type).getFixedSize();
    }
    String orcNoMinMaxMessage = "No min/max found for orc file. Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again";
    switch (orcType.getOrcTypeKind()) {
        case SHORT:
        case INT:
        case LONG: {
            Long value = isMin ? columnStatistics.getIntegerStatistics().getMin() : columnStatistics.getIntegerStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            blockBuilder.writeLong(value);
            break;
        }
        case TIMESTAMP:
        case DATE: {
            Integer value = isMin ? columnStatistics.getDateStatistics().getMin() : columnStatistics.getDateStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            blockBuilder.writeLong(value);
            break;
        }
        case VARCHAR:
        case CHAR:
        case STRING: {
            Slice value = isMin ? columnStatistics.getStringStatistics().getMin() : columnStatistics.getStringStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            blockBuilder.writeBytes(value, 0, value.length()).closeEntry();
            completedBytes += value.length();
            break;
        }
        case FLOAT: {
            Double value = isMin ? columnStatistics.getDoubleStatistics().getMin() : columnStatistics.getDoubleStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            // REAL values are stored as the float's raw int bits widened to a long
            blockBuilder.writeLong(floatToRawIntBits(value.floatValue()));
            break;
        }
        case DOUBLE: {
            Double value = isMin ? columnStatistics.getDoubleStatistics().getMin() : columnStatistics.getDoubleStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            type.writeDouble(blockBuilder, value);
            break;
        }
        case DECIMAL: {
            BigDecimal value = isMin ? columnStatistics.getDecimalStatistics().getMin() : columnStatistics.getDecimalStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            Type definedType = hiveType.getType(typeManager);
            if (Decimals.isShortDecimal(definedType)) {
                blockBuilder.writeLong(value.unscaledValue().longValue());
            }
            else {
                type.writeSlice(blockBuilder, Decimals.encodeUnscaledValue(value.unscaledValue()));
            }
            break;
        }
        case BYTE:
        case BOOLEAN:
        case BINARY:
        case UNION:
        case LIST:
        case STRUCT:
        case MAP:
        default:
            throw new IllegalArgumentException("Unsupported type: " + orcType.getOrcTypeKind());
    }
}
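The FLOAT branch above is easy to misread: Presto represents a REAL value as the float's raw int bits widened to a long, which is why a Double coming from the ORC statistics is narrowed to float and then encoded with floatToRawIntBits. A minimal standalone sketch of that round trip (the class name and values are made up for illustration):

import static java.lang.Float.floatToRawIntBits;
import static java.lang.Float.intBitsToFloat;

public class RealEncodingSketch
{
    public static void main(String[] args)
    {
        // ORC DoubleStatistics hands back min/max as a Double even for FLOAT columns.
        double statsValue = 3.25;
        // Encode: narrow to float, then reinterpret the bits as an int widened to a long.
        long encoded = floatToRawIntBits((float) statsValue);
        // Decode: narrow to int, then reinterpret the bits as a float.
        float decoded = intBitsToFloat((int) encoded);
        System.out.println(decoded); // prints 3.25
    }
}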
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
The class IcebergOrcFileWriter, method computeMetrics.
private static Metrics computeMetrics(Schema icebergSchema, List<OrcType> orcRowTypes, long fileRowCount, List<ColumnStatistics> columnStatistics)
{
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than the table-level row count, and there is no good way to calculate null counts for them
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<Integer> excludedColumns = getExcludedColumns(orcRowTypes);
    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();
    // OrcColumnId(0) is the root column that represents the file-level schema
    for (int i = 1; i < orcRowTypes.size(); i++) {
        if (excludedColumns.contains(i)) {
            continue;
        }
        OrcType orcColumn = orcRowTypes.get(i);
        ColumnStatistics orcColumnStats = columnStatistics.get(i);
        int icebergId = getIcebergId(orcColumn);
        NestedField icebergField = icebergSchema.findField(icebergId);
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        toIcebergMinMax(orcColumnStats, icebergField.type()).ifPresent(minMax -> {
            lowerBoundsBuilder.put(icebergId, minMax.getMin());
            upperBoundsBuilder.put(icebergId, minMax.getMax());
        });
    }
    Map<Integer, Long> valueCounts = valueCountsBuilder.build();
    Map<Integer, Long> nullCounts = nullCountsBuilder.build();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.build();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.build();
    return new Metrics(
            fileRowCount,
            null, // TODO: Add column size accounting to ORC column writers
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            null,
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}
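The null-count bookkeeping is the subtle part: ORC's ColumnStatistics records the number of non-null values, while Iceberg's Metrics wants the number of nulls, so the method subtracts per field ID. A minimal sketch of that arithmetic, with made-up field IDs and counts:

import java.util.LinkedHashMap;
import java.util.Map;

public class NullCountSketch
{
    public static void main(String[] args)
    {
        long fileRowCount = 1_000;
        // Hypothetical non-null counts keyed by Iceberg field ID, as ORC statistics report them.
        Map<Integer, Long> nonNullCounts = Map.of(1, 1_000L, 2, 850L);
        // Iceberg wants null counts, so subtract each non-null count from the file row count.
        Map<Integer, Long> nullCounts = new LinkedHashMap<>();
        nonNullCounts.forEach((fieldId, nonNull) -> nullCounts.put(fieldId, fileRowCount - nonNull));
        System.out.println(nullCounts); // e.g. {1=0, 2=150}
    }
}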
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
The class TestStripeReader, method testRowSize.
@Test
public void testRowSize()
{
    int numberOfEntries = 10_000;
    long numRowsInGroup = MILLION;
    IntegerStatistics integerStatistics = new IntegerStatistics(0L, 0L, 0L);
    ColumnStatistics intColumnStatistics = new IntegerColumnStatistics(numRowsInGroup, null, integerStatistics);
    ColumnStatistics mapColumnStatistics = new ColumnStatistics(numRowsInGroup, null);
    ColumnStatistics mapKeyColumnStatistics = new IntegerColumnStatistics(numRowsInGroup * numberOfEntries, null, integerStatistics);
    ColumnStatistics mapValueColumnStatistics = new IntegerColumnStatistics(numRowsInGroup * numberOfEntries, null, integerStatistics);
    StreamId intStreamId = new StreamId(1, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapStreamId = new StreamId(2, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapKeyStreamId = new StreamId(3, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapValueStreamId = new StreamId(4, 0, Stream.StreamKind.ROW_INDEX);
    Map<StreamId, List<RowGroupIndex>> columnIndexes = ImmutableMap.of(
            intStreamId, createRowGroupIndex(intColumnStatistics),
            mapStreamId, createRowGroupIndex(mapColumnStatistics),
            mapKeyStreamId, createRowGroupIndex(mapKeyColumnStatistics),
            mapValueStreamId, createRowGroupIndex(mapValueColumnStatistics));
    // Each row contains one top-level integer plus 2 * numberOfEntries integers (one key and one value per map entry).
    long expectedRowSize = INTEGER_VALUE_BYTES + 2 * numberOfEntries * INTEGER_VALUE_BYTES;
    RowGroup rowGroup = StripeReader.createRowGroup(0, Long.MAX_VALUE, numRowsInGroup, columnIndexes, ImmutableMap.of(), ImmutableMap.of());
    assertEquals(expectedRowSize, rowGroup.getMinAverageRowBytes());
}
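The expected size follows directly from the shape of the synthetic schema: each row holds one top-level integer plus a map with numberOfEntries entries, and every entry contributes an integer key and an integer value. A standalone sketch of the arithmetic, with a hypothetical per-value byte count standing in for INTEGER_VALUE_BYTES:

public class RowSizeSketch
{
    public static void main(String[] args)
    {
        // Hypothetical stand-in; the real constant comes from the ORC statistics classes.
        long integerValueBytes = 8;
        long numberOfEntries = 10_000;
        // One top-level integer per row, plus an integer key and an integer value
        // for each of the numberOfEntries map entries.
        long expectedRowSize = integerValueBytes + 2 * numberOfEntries * integerValueBytes;
        System.out.println(expectedRowSize); // 160008 under these assumptions
    }
}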
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
The class AbstractOrcRecordReader, method close.
@Override
public void close()
        throws IOException
{
    try (Closer closer = Closer.create()) {
        closer.register(orcDataSource);
        for (StreamReader column : streamReaders) {
            if (column != null) {
                closer.register(column::close);
            }
        }
    }
    rowGroups = null;
    if (writeChecksumBuilder.isPresent()) {
        OrcWriteValidation.WriteChecksum actualChecksum = writeChecksumBuilder.get().build();
        validateWrite(validation -> validation.getChecksum().getTotalRowCount() == actualChecksum.getTotalRowCount(), "Invalid row count");
        List<Long> columnHashes = actualChecksum.getColumnHashes();
        for (int i = 0; i < columnHashes.size(); i++) {
            // copy the loop variable so the lambda captures an effectively final value
            int columnIndex = i;
            validateWrite(validation -> validation.getChecksum().getColumnHashes().get(columnIndex).equals(columnHashes.get(columnIndex)), "Invalid checksum for column %s", columnIndex);
        }
        validateWrite(validation -> validation.getChecksum().getStripeHash() == actualChecksum.getStripeHash(), "Invalid stripes checksum");
    }
    if (fileStatisticsValidation.isPresent()) {
        List<ColumnStatistics> columnStatistics = fileStatisticsValidation.get().build();
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), columnStatistics);
    }
}
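The try-with-resources block leans on Guava's Closer, which closes every registered resource in reverse registration order and rethrows the first exception while suppressing later ones, so one failing stream reader cannot leak the others. A minimal sketch of the pattern, with print statements standing in for real resources:

import com.google.common.io.Closer;

import java.io.IOException;

public class CloserSketch
{
    public static void main(String[] args)
            throws IOException
    {
        try (Closer closer = Closer.create()) {
            // Registered first, closed last.
            closer.register(() -> System.out.println("closing data source"));
            // Registered last, closed first.
            closer.register(() -> System.out.println("closing stream reader"));
        }
    }
}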
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
The class OrcWriteValidation, method validateRowGroupStatistics.
public void validateRowGroupStatistics(OrcDataSourceId orcDataSourceId, long stripeOffset, Map<StreamId, List<RowGroupIndex>> actualRowGroupStatistics)
        throws OrcCorruptionException
{
    requireNonNull(actualRowGroupStatistics, "actualRowGroupStatistics is null");
    List<RowGroupStatistics> expectedRowGroupStatistics = rowGroupStatistics.get(stripeOffset);
    if (expectedRowGroupStatistics == null) {
        throw new OrcCorruptionException(orcDataSourceId, "Unexpected stripe at offset %s", stripeOffset);
    }
    int rowGroupCount = expectedRowGroupStatistics.size();
    for (Entry<StreamId, List<RowGroupIndex>> entry : actualRowGroupStatistics.entrySet()) {
        if (entry.getValue().size() != rowGroupCount) {
            throw new OrcCorruptionException(orcDataSourceId, "Unexpected row group count in stripe at offset %s", stripeOffset);
        }
    }
    for (int rowGroupIndex = 0; rowGroupIndex < expectedRowGroupStatistics.size(); rowGroupIndex++) {
        RowGroupStatistics expectedRowGroup = expectedRowGroupStatistics.get(rowGroupIndex);
        // Unless validation is hash-only, compare the full per-column statistics.
        if (expectedRowGroup.getValidationMode() != HASHED) {
            Map<Integer, ColumnStatistics> expectedStatistics = expectedRowGroup.getColumnStatistics();
            Set<Integer> actualColumns = actualRowGroupStatistics.keySet().stream()
                    .map(StreamId::getColumn)
                    .collect(Collectors.toSet());
            if (!expectedStatistics.keySet().equals(actualColumns)) {
                throw new OrcCorruptionException(orcDataSourceId, "Unexpected column in row group %s in stripe at offset %s", rowGroupIndex, stripeOffset);
            }
            for (Entry<StreamId, List<RowGroupIndex>> entry : actualRowGroupStatistics.entrySet()) {
                ColumnStatistics actual = entry.getValue().get(rowGroupIndex).getColumnStatistics();
                ColumnStatistics expected = expectedStatistics.get(entry.getKey().getColumn());
                validateColumnStatisticsEquivalent(orcDataSourceId, "Row group " + rowGroupIndex + " in stripe at offset " + stripeOffset, actual, expected);
            }
        }
        // Unless validation is detailed-only, compare the row group hash.
        if (expectedRowGroup.getValidationMode() != DETAILED) {
            RowGroupStatistics actualRowGroup = buildActualRowGroupStatistics(rowGroupIndex, actualRowGroupStatistics);
            if (expectedRowGroup.getHash() != actualRowGroup.getHash()) {
                throw new OrcCorruptionException(orcDataSourceId, "Checksum mismatch for row group %s in stripe at offset %s", rowGroupIndex, stripeOffset);
            }
        }
    }
}
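The two `!=` checks encode the validation modes: anything other than HASHED triggers the detailed per-column comparison, anything other than DETAILED triggers the hash comparison, so a combined mode runs both. A standalone sketch of that dispatch, where the enum is a stand-in for OrcWriteValidation's own mode type:

public class ValidationModeSketch
{
    enum ValidationMode { DETAILED, HASHED, BOTH }

    public static void main(String[] args)
    {
        ValidationMode mode = ValidationMode.BOTH;
        // Everything except hash-only mode compares full per-column statistics.
        if (mode != ValidationMode.HASHED) {
            System.out.println("comparing per-column statistics");
        }
        // Everything except detailed-only mode compares the row group hash.
        if (mode != ValidationMode.DETAILED) {
            System.out.println("comparing row group hash");
        }
        // With BOTH, both branches run; HASHED skips the first, DETAILED skips the second.
    }
}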