Use of org.apache.parquet.column.ColumnDescriptor in project hive by apache.
The class VectorizedParquetRecordReader, method buildVectorizedParquetReader.
// Build VectorizedParquetColumnReader via Hive typeInfo and Parquet schema
private VectorizedColumnReader buildVectorizedParquetReader(TypeInfo typeInfo, Type type,
    PageReadStore pages, List<ColumnDescriptor> columnDescriptors, boolean skipTimestampConversion,
    ZoneId writerTimezone, boolean skipProlepticConversion, boolean legacyConversionEnabled,
    int depth) throws IOException {
  List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors);
  switch (typeInfo.getCategory()) {
    case PRIMITIVE:
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      if (fileSchema.getColumns().contains(descriptors.get(0))) {
        return new VectorizedPrimitiveColumnReader(descriptors.get(0),
            pages.getPageReader(descriptors.get(0)), skipTimestampConversion, writerTimezone,
            skipProlepticConversion, legacyConversionEnabled, type, typeInfo);
      } else {
        // The column is missing from the file schema: return a dummy reader to support schema evolution
        return new VectorizedDummyColumnReader();
      }
    case STRUCT:
      StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
      List<VectorizedColumnReader> fieldReaders = new ArrayList<>();
      List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
      List<Type> types = type.asGroupType().getFields();
      // recursively build a child reader for every struct field
      for (int i = 0; i < fieldTypes.size(); i++) {
        VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i), types.get(i),
            pages, descriptors, skipTimestampConversion, writerTimezone, skipProlepticConversion,
            legacyConversionEnabled, depth + 1);
        if (r != null) {
          fieldReaders.add(r);
        } else {
          throw new RuntimeException("Failed to build Parquet vectorized reader based on Hive type "
              + fieldTypes.get(i).getTypeName() + " and Parquet type " + types.get(i).toString());
        }
      }
      return new VectorizedStructColumnReader(fieldReaders);
    case LIST:
      checkListColumnSupport(((ListTypeInfo) typeInfo).getListElementTypeInfo());
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      return new VectorizedListColumnReader(descriptors.get(0),
          pages.getPageReader(descriptors.get(0)), skipTimestampConversion, writerTimezone,
          skipProlepticConversion, legacyConversionEnabled, getElementType(type), typeInfo);
    case MAP:
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      // Handle the different Map definitions in Parquet, e.g.
      // a definition with one group:
      //   repeated group map (MAP_KEY_VALUE) {
      //     required binary key (UTF8); optional binary value (UTF8);
      //   }
      // and a definition with two groups:
      //   optional group m1 (MAP) {
      //     repeated group map (MAP_KEY_VALUE) {
      //       required binary key (UTF8); optional binary value (UTF8);
      //     }
      //   }
      int nestGroup = 0;
      GroupType groupType = type.asGroupType();
      // unwrap single-field wrapper groups until the key/value group is reached,
      // giving up after MAP_DEFINITION_LEVEL_MAX levels
      while (groupType.getFieldCount() < 2) {
        if (nestGroup > MAP_DEFINITION_LEVEL_MAX) {
          throw new RuntimeException("More than " + MAP_DEFINITION_LEVEL_MAX
              + " levels found in Map definition; failed to get the field types for Map with type " + type);
        }
        groupType = groupType.getFields().get(0).asGroupType();
        nestGroup++;
      }
      List<Type> kvTypes = groupType.getFields();
      VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader(
          descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion,
          writerTimezone, skipProlepticConversion, legacyConversionEnabled, kvTypes.get(0), typeInfo);
      VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader(
          descriptors.get(1), pages.getPageReader(descriptors.get(1)), skipTimestampConversion,
          writerTimezone, skipProlepticConversion, legacyConversionEnabled, kvTypes.get(1), typeInfo);
      return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader);
    case UNION:
    default:
      throw new RuntimeException("Unsupported category " + typeInfo.getCategory().name());
  }
}
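The helper getAllColumnDescriptorByType is referenced above but not shown. A minimal sketch of what it plausibly does, assuming it keeps only the descriptors whose path component at the current nesting depth matches the name of the given Parquet type (the actual Hive implementation may differ):

private List<ColumnDescriptor> getAllColumnDescriptorByType(int depth, Type type,
    List<ColumnDescriptor> columns) {
  // Hypothetical sketch, not the actual Hive helper: filter the candidate
  // descriptors by matching the path element at this nesting depth.
  List<ColumnDescriptor> res = new ArrayList<>();
  for (ColumnDescriptor descriptor : columns) {
    if (depth >= descriptor.getPath().length) {
      throw new RuntimeException("Corrupted Parquet schema for type " + type);
    }
    if (type.getName().equals(descriptor.getPath()[depth])) {
      res.add(descriptor);
    }
  }
  return res;
}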
Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.
The class TestTupleDomainParquetPredicate, method testVarcharMatchesWithStatistics.
@Test
public void testVarcharMatchesWithStatistics()
        throws ParquetCorruptionException
{
    String value = "Test";
    ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, BINARY, 0, 0);
    RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = getEffectivePredicate(column, createVarcharType(255), utf8Slice(value));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));
    Statistics<?> stats = getStatsBasedOnType(column.getType());
    stats.setNumNulls(1L);
    stats.setMinMaxFromBytes(value.getBytes(), value.getBytes());
    assertTrue(parquetPredicate.matches(2, singletonMap(column, stats), ID));
}
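The stats object above comes from getStatsBasedOnType, which is not shown. A plausible sketch, assuming it simply maps the Parquet primitive type name to an empty parquet-column Statistics instance that the test then populates (the actual Presto test utility may differ):

private static Statistics<?> getStatsBasedOnType(PrimitiveType.PrimitiveTypeName typeName)
{
    // Hypothetical sketch: return a mutable, empty Statistics of the matching kind.
    switch (typeName) {
        case BOOLEAN:
            return new BooleanStatistics();
        case INT32:
            return new IntStatistics();
        case INT64:
            return new LongStatistics();
        case FLOAT:
            return new FloatStatistics();
        case DOUBLE:
            return new DoubleStatistics();
        case BINARY:
            return new BinaryStatistics();
        default:
            throw new IllegalArgumentException("Unexpected Parquet type name: " + typeName);
    }
}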
Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.
The class TestTupleDomainParquetPredicate, method testIntegerMatchesWithStatistics.
@Test(dataProvider = "typeForParquetInt32")
public void testIntegerMatchesWithStatistics(Type typeForParquetInt32)
        throws ParquetCorruptionException
{
    RichColumnDescriptor column = new RichColumnDescriptor(
            new ColumnDescriptor(new String[] {"path"}, INT32, 0, 0),
            new PrimitiveType(OPTIONAL, INT32, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = TupleDomain.withColumnDomains(
            ImmutableMap.of(column, Domain.create(ValueSet.of(typeForParquetInt32, 42L, 43L, 44L, 112L), false)));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));

    assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(32, 42)), ID));
    assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(30, 40)), ID));
    // a max of 0x10000 + 42 overflows smallint/tinyint, so for those types the stats
    // are invalid and ignored, and the predicate matches
    assertEquals(parquetPredicate.matches(2, ImmutableMap.of(column, intColumnStats(1024, 0x10000 + 42)), ID), (typeForParquetInt32 != INTEGER));
}
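intColumnStats is a small helper; a plausible sketch (assumed, since the utility itself is not shown) that builds parquet IntStatistics for a closed range with no nulls:

private static IntStatistics intColumnStats(int minimum, int maximum)
{
    // Hypothetical sketch: IntStatistics with the given min/max and zero nulls.
    IntStatistics statistics = new IntStatistics();
    statistics.setMinMax(minimum, maximum);
    return statistics;
}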
Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.
The class TestTupleDomainParquetPredicate, method testFloat.
@Test
public void testFloat()
        throws Exception
{
    ColumnDescriptor columnDescriptor = createColumnDescriptor(FLOAT, "FloatColumn");
    assertEquals(getDomain(columnDescriptor, REAL, 0, null, ID), Domain.all(REAL));

    float minimum = 4.3f;
    float maximum = 40.3f;
    assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(minimum, minimum), ID), singleValue(REAL, (long) floatToRawIntBits(minimum)));
    assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(minimum, maximum), ID), create(ValueSet.ofRanges(range(REAL, (long) floatToRawIntBits(minimum), true, (long) floatToRawIntBits(maximum), true)), false));
    assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(NaN, NaN), ID), Domain.notNull(REAL));
    assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(NaN, NaN, true), ID), Domain.all(REAL));
    assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(minimum, NaN), ID), Domain.notNull(REAL));
    assertEquals(getDomain(columnDescriptor, REAL, 10, floatColumnStats(minimum, NaN, true), ID), Domain.all(REAL));
    assertEquals(getDomain(REAL, floatDictionaryDescriptor(NaN)), Domain.all(REAL));
    assertEquals(getDomain(REAL, floatDictionaryDescriptor(minimum, NaN)), Domain.all(REAL));

    // fail on corrupted statistics
    assertThatExceptionOfType(ParquetCorruptionException.class)
            .isThrownBy(() -> getDomain(columnDescriptor, REAL, 10, floatColumnStats(maximum, minimum), ID))
            .withMessage("Corrupted statistics for column \"[] required float FloatColumn\" in Parquet file \"testFile\": [min: 40.3, max: 4.3, num_nulls: 0]");
}
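floatColumnStats appears above in two arities; a plausible sketch (assumed, the real helper may differ), where the boolean argument is taken to mark the presence of nulls, which would explain why the NaN-with-nulls cases widen to Domain.all(REAL):

private static FloatStatistics floatColumnStats(float minimum, float maximum)
{
    return floatColumnStats(minimum, maximum, false);
}

private static FloatStatistics floatColumnStats(float minimum, float maximum, boolean hasNulls)
{
    // Hypothetical sketch: FloatStatistics with the given bounds; a nonzero
    // null count signals that null must be part of the resulting domain.
    FloatStatistics statistics = new FloatStatistics();
    statistics.setMinMax(minimum, maximum);
    if (hasNulls) {
        statistics.setNumNulls(1);
    }
    return statistics;
}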
Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.
The class TestTupleDomainParquetPredicate, method testString.
@Test
public void testString()
        throws ParquetCorruptionException
{
    ColumnDescriptor columnDescriptor = createColumnDescriptor(BINARY, "StringColumn");
    assertEquals(getDomain(columnDescriptor, createUnboundedVarcharType(), 0, null, ID), Domain.all(createUnboundedVarcharType()));
    assertEquals(getDomain(columnDescriptor, createUnboundedVarcharType(), 10, stringColumnStats("taco", "taco"), ID), singleValue(createUnboundedVarcharType(), utf8Slice("taco")));
    assertEquals(getDomain(columnDescriptor, createUnboundedVarcharType(), 10, stringColumnStats("apple", "taco"), ID), create(ValueSet.ofRanges(range(createUnboundedVarcharType(), utf8Slice("apple"), true, utf8Slice("taco"), true)), false));
    assertEquals(getDomain(columnDescriptor, createUnboundedVarcharType(), 10, stringColumnStats("中国", "美利坚"), ID), create(ValueSet.ofRanges(range(createUnboundedVarcharType(), utf8Slice("中国"), true, utf8Slice("美利坚"), true)), false));

    // fail on corrupted statistics
    assertThatExceptionOfType(ParquetCorruptionException.class)
            .isThrownBy(() -> getDomain(columnDescriptor, createUnboundedVarcharType(), 10, stringColumnStats("taco", "apple"), ID))
            .withMessage("Corrupted statistics for column \"[] required binary StringColumn\" in Parquet file \"testFile\": [min: taco, max: apple, num_nulls: 0]");
}
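stringColumnStats is likewise not shown; a plausible sketch (assumed, the real helper may differ), building parquet BinaryStatistics from the UTF-8 bytes of the bounds, with UTF_8 statically imported from java.nio.charset.StandardCharsets:

private static BinaryStatistics stringColumnStats(String minimum, String maximum)
{
    // Hypothetical sketch: BinaryStatistics over the UTF-8 encoded bounds, no nulls.
    BinaryStatistics statistics = new BinaryStatistics();
    statistics.setMinMaxFromBytes(minimum.getBytes(UTF_8), maximum.getBytes(UTF_8));
    return statistics;
}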