Use of org.apache.parquet.column.statistics.Statistics in project drill by axbaretto.
The class ParquetMetaStatCollector, method collectColStat:
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    // map from column path to its ColumnMetadata
    final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
    // map from column name to column statistics
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
        SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
        columnMetadataMap.put(schemaPath, columnMetadata);
    }
    for (final SchemaPath field : fields) {
        final PrimitiveType.PrimitiveTypeName primitiveType;
        final OriginalType originalType;
        final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(field.getUnIndexed());
        if (columnMetadata != null) {
            final Object min = columnMetadata.getMinValue();
            final Object max = columnMetadata.getMaxValue();
            final Long numNull = columnMetadata.getNulls();
            primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
            originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
            int precision = 0;
            int scale = 0;
            // ColumnTypeMetadata_v3 stores information about scale and precision
            if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v3) {
                Metadata.ColumnTypeMetadata_v3 columnTypeInfo = ((Metadata.ParquetTableMetadata_v3) parquetTableMetadata).getColumnTypeInfo(columnMetadata.getName());
                scale = columnTypeInfo.scale;
                precision = columnTypeInfo.precision;
            }
            statMap.put(field, getStat(min, max, numNull, primitiveType, originalType, scale, precision));
        } else {
            // the column is absent from the file: synthesize statistics for implicit columns
            final String columnName = field.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                byte[] val = implicitColValues.get(columnName).getBytes();
                stat.setMinMaxFromBytes(val, val);
                statMap.put(field, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to collect column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
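The implicit-column branch above is worth seeing in isolation: a constant partition value is wrapped in degenerate statistics where min equals max and the null count is zero. Below is a minimal, self-contained sketch using only parquet-mr; the class name and the value "dir0" are illustrative, not Drill code.

import org.apache.parquet.column.statistics.BinaryStatistics;

public class ImplicitColumnStatsSketch {
    public static void main(String[] args) {
        byte[] val = "dir0".getBytes(); // a hypothetical implicit-column value
        BinaryStatistics stat = new BinaryStatistics();
        stat.setNumNulls(0);
        // min == max encodes "every row in this file has this exact value"
        stat.setMinMaxFromBytes(val, val); // deprecated in recent parquet-mr, kept to mirror the snippet above
        System.out.println(stat.genericGetMin().toStringUsingUTF8() + " .. " + stat.genericGetMax().toStringUsingUTF8());
        // prints: dir0 .. dir0
    }
}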
Use of org.apache.parquet.column.statistics.Statistics in project presto by prestodb.
The class TestTupleDomainParquetPredicate, method stringColumnStats:
private static Statistics stringColumnStats(String minimum, String maximum) {
    Statistics.Builder builder = Statistics.getBuilderForReading(new PrimitiveType(OPTIONAL, BINARY, "testFile", UTF8));
    builder.withMin(minimum.getBytes()).withMax(maximum.getBytes()).withNumNulls(0);
    return builder.build();
}
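For context, here is a standalone sketch of the same builder API (the class name and values are illustrative): getBuilderForReading reconstructs a typed Statistics object from raw min/max bytes, which is how readers decode footer metadata.

import static org.apache.parquet.schema.OriginalType.UTF8;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.PrimitiveType;

public class StringStatsSketch {
    public static void main(String[] args) {
        Statistics<?> stats = Statistics.getBuilderForReading(new PrimitiveType(OPTIONAL, BINARY, "testFile", UTF8))
                .withMin("apple".getBytes())
                .withMax("banana".getBytes())
                .withNumNulls(0)
                .build();
        // For a BINARY column the generic accessors return org.apache.parquet.io.api.Binary
        System.out.println(((Binary) stats.genericGetMin()).toStringUsingUTF8()); // apple
        System.out.println(((Binary) stats.genericGetMax()).toStringUsingUTF8()); // banana
        System.out.println(stats.getNumNulls()); // 0
    }
}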
Use of org.apache.parquet.column.statistics.Statistics in project presto by prestodb.
The class AggregatedParquetPageSource, method getNumNulls:
private long getNumNulls(ParquetMetadata parquetMetadata, int columnIndex) {
    long numNulls = 0;
    for (BlockMetaData blockMetaData : parquetMetadata.getBlocks()) {
        Statistics statistics = blockMetaData.getColumns().get(columnIndex).getStatistics();
        if (!statistics.isNumNullsSet()) {
            throw new UnsupportedOperationException("Number of nulls not set for parquet file. Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again");
        }
        numNulls += statistics.getNumNulls();
    }
    completedBytes += INTEGER.getFixedSize();
    return numNulls;
}
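The same fold can be factored into a standalone helper. Below is a sketch under the assumption that parquet-mr's hadoop metadata classes are on the classpath; unlike the method above, it reports missing statistics with a sentinel instead of throwing (the class name and the -1 convention are illustrative).

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class NullCountSketch {
    // Sums null counts across all row groups of one column; -1 means "unknown".
    static long totalNulls(ParquetMetadata metadata, int columnIndex) {
        long numNulls = 0;
        for (BlockMetaData block : metadata.getBlocks()) {
            Statistics<?> stats = block.getColumns().get(columnIndex).getStatistics();
            if (stats == null || !stats.isNumNullsSet()) {
                return -1; // at least one row group lacks the statistic
            }
            numNulls += stats.getNumNulls();
        }
        return numNulls;
    }
}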
Use of org.apache.parquet.column.statistics.Statistics in project presto by prestodb.
The class AggregatedParquetPageSource, method writeMinMax:
private void writeMinMax(ParquetMetadata parquetMetadata, int columnIndex, BlockBuilder blockBuilder, Type type, HiveType hiveType, boolean isMin) {
    org.apache.parquet.schema.Type parquetType = parquetMetadata.getFileMetaData().getSchema().getType(columnIndex);
    if (parquetType instanceof GroupType) {
        throw new IllegalArgumentException("Unsupported type : " + parquetType.toString());
    }
    Object value = null;
    for (BlockMetaData blockMetaData : parquetMetadata.getBlocks()) {
        Statistics statistics = blockMetaData.getColumns().get(columnIndex).getStatistics();
        if (!statistics.hasNonNullValue()) {
            throw new UnsupportedOperationException("No min/max found for parquet file. Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again");
        }
        if (isMin) {
            Object currentValue = statistics.genericGetMin();
            if (currentValue != null && (value == null || ((Comparable) currentValue).compareTo(value) < 0)) {
                value = currentValue;
            }
        } else {
            Object currentValue = statistics.genericGetMax();
            if (currentValue != null && (value == null || ((Comparable) currentValue).compareTo(value) > 0)) {
                value = currentValue;
            }
        }
    }
    if (type instanceof FixedWidthType) {
        completedBytes += ((FixedWidthType) type).getFixedSize();
    }
    if (value == null) {
        blockBuilder.appendNull();
        return;
    }
    PrimitiveType.PrimitiveTypeName parquetTypeName = parquetType.asPrimitiveType().getPrimitiveTypeName();
    switch (parquetTypeName) {
        case INT32: {
            blockBuilder.writeLong(Long.valueOf((Integer) value));
            break;
        }
        case INT64: {
            blockBuilder.writeLong((Long) value);
            break;
        }
        case INT96: {
            blockBuilder.writeLong(getTimestampMillis(((Binary) value).getBytes(), 0));
            break;
        }
        case FLOAT: {
            blockBuilder.writeLong(floatToRawIntBits((Float) value));
            break;
        }
        case DOUBLE: {
            type.writeDouble(blockBuilder, (Double) value);
            break;
        }
        case FIXED_LEN_BYTE_ARRAY: {
            byte[] valBytes = ((Binary) value).getBytes();
            DecimalType decimalType = (DecimalType) hiveType.getType(typeManager);
            if (decimalType.isShort()) {
                blockBuilder.writeLong(getShortDecimalValue(valBytes));
            } else {
                BigInteger bigIntValue = new BigInteger(valBytes);
                type.writeSlice(blockBuilder, encodeUnscaledValue(bigIntValue));
            }
            break;
        }
        case BINARY: {
            Slice slice = Slices.wrappedBuffer(((Binary) value).getBytes());
            blockBuilder.writeBytes(slice, 0, slice.length()).closeEntry();
            completedBytes += slice.length();
            break;
        }
        case BOOLEAN:
        default:
            throw new IllegalArgumentException("Unexpected parquet type name: " + parquetTypeName);
    }
}
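The per-type casts in the switch above are safe because the runtime type returned by genericGetMin()/genericGetMax() tracks the column's physical type: Integer for INT32, Long for INT64, Float, Double, and Boolean for their respective types, and Binary for INT96, BINARY, and FIXED_LEN_BYTE_ARRAY. A minimal sketch with parquet-mr's typed statistics (class name illustrative; the no-arg constructor is deprecated but fine for a demo):

import org.apache.parquet.column.statistics.LongStatistics;

public class GenericAccessorSketch {
    public static void main(String[] args) {
        LongStatistics stats = new LongStatistics();
        stats.updateStats(42L);
        stats.updateStats(-7L);
        Object min = stats.genericGetMin(); // a java.lang.Long, so the (Long) cast above is safe for INT64
        Object max = stats.genericGetMax();
        System.out.println(min + " .. " + max); // -7 .. 42
    }
}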
Use of org.apache.parquet.column.statistics.Statistics in project presto by prestodb.
The class TupleDomainParquetPredicate, method getDomain:
/**
 * Get a domain for the ranges defined by each pair of elements from {@code minimums} and {@code maximums}.
 * Both lists must have the same length.
 */
private static Domain getDomain(ColumnDescriptor column, Type type, List<Object> minimums, List<Object> maximums, boolean hasNullValue) {
    checkArgument(minimums.size() == maximums.size(), "Expected minimums and maximums to have the same size");
    List<Range> ranges = new ArrayList<>();
    if (type.equals(BOOLEAN)) {
        boolean hasTrueValues = minimums.stream().anyMatch(value -> (boolean) value) || maximums.stream().anyMatch(value -> (boolean) value);
        boolean hasFalseValues = minimums.stream().anyMatch(value -> !(boolean) value) || maximums.stream().anyMatch(value -> !(boolean) value);
        if (hasTrueValues && hasFalseValues) {
            return Domain.all(type);
        }
        if (hasTrueValues) {
            return Domain.create(ValueSet.of(type, true), hasNullValue);
        }
        if (hasFalseValues) {
            return Domain.create(ValueSet.of(type, false), hasNullValue);
        }
        // All-nulls case is handled earlier
        throw new VerifyException("Impossible boolean statistics");
    }
    if (type.equals(BIGINT) || type.equals(TINYINT) || type.equals(SMALLINT) || type.equals(INTEGER)) {
        for (int i = 0; i < minimums.size(); i++) {
            long min = asLong(minimums.get(i));
            long max = asLong(maximums.get(i));
            if (isStatisticsOverflow(type, min, max)) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (type.equals(REAL)) {
        for (int i = 0; i < minimums.size(); i++) {
            Float min = (Float) minimums.get(i);
            Float max = (Float) maximums.get(i);
            if (min.isNaN() || max.isNaN()) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, (long) floatToRawIntBits(min), true, (long) floatToRawIntBits(max), true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (type.equals(DOUBLE)) {
        for (int i = 0; i < minimums.size(); i++) {
            Double min = (Double) minimums.get(i);
            Double max = (Double) maximums.get(i);
            if (min.isNaN() || max.isNaN()) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (isVarcharType(type)) {
        for (int i = 0; i < minimums.size(); i++) {
            Slice min = Slices.wrappedBuffer(((Binary) minimums.get(i)).toByteBuffer());
            Slice max = Slices.wrappedBuffer(((Binary) maximums.get(i)).toByteBuffer());
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (type.equals(DATE)) {
        for (int i = 0; i < minimums.size(); i++) {
            long min = asLong(minimums.get(i));
            long max = asLong(maximums.get(i));
            if (isStatisticsOverflow(type, min, max)) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    return Domain.create(ValueSet.all(type), hasNullValue);
}
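The NaN guards for REAL and DOUBLE exist because NaN does not participate in ordering: a row group whose recorded min or max is NaN cannot prune any range, so the domain widens to all values. A small sketch of how NaN poisons parquet-mr's double statistics (the exact min behavior varies by parquet-mr version, but the max bound becomes NaN either way):

import org.apache.parquet.column.statistics.DoubleStatistics;

public class NaNStatsSketch {
    public static void main(String[] args) {
        DoubleStatistics stats = new DoubleStatistics(); // deprecated ctor, fine for a demo
        stats.updateStats(1.0);
        stats.updateStats(Double.NaN); // NaN propagates into the running max
        System.out.println(Double.isNaN(stats.genericGetMax())); // true: bounds unusable for pruning
        // A predicate built from such bounds must fall back to Domain.create(ValueSet.all(type), hasNullValue).
    }
}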