Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.
The class ParquetPageSourceFactory, method getParquetTupleDomain:
public static TupleDomain<ColumnDescriptor> getParquetTupleDomain(
        Map<List<String>, RichColumnDescriptor> descriptorsByPath,
        TupleDomain<HiveColumnHandle> effectivePredicate) {
    if (effectivePredicate.isNone()) {
        return TupleDomain.none();
    }
    ImmutableMap.Builder<ColumnDescriptor, Domain> predicate = ImmutableMap.builder();
    for (Entry<HiveColumnHandle, Domain> entry : effectivePredicate.getDomains().get().entrySet()) {
        HiveColumnHandle columnHandle = entry.getKey();
        // skip looking up predicates for complex types as Parquet only stores stats for primitives
        if (!columnHandle.getHiveType().getCategory().equals(PRIMITIVE)) {
            continue;
        }
        RichColumnDescriptor descriptor;
        if (isPushedDownSubfield(columnHandle)) {
            Subfield pushedDownSubfield = getPushedDownSubfield(columnHandle);
            List<String> subfieldPath = columnPathFromSubfield(pushedDownSubfield);
            descriptor = descriptorsByPath.get(subfieldPath);
        } else {
            descriptor = descriptorsByPath.get(ImmutableList.of(columnHandle.getName()));
        }
        if (descriptor != null) {
            predicate.put(descriptor, entry.getValue());
        }
    }
    return TupleDomain.withColumnDomains(predicate.build());
}
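For context, descriptorsByPath keys each leaf column by its path components. Below is a minimal, self-contained sketch of building such a lookup straight from a Parquet MessageType; it uses the plain ColumnDescriptor rather than Presto's RichColumnDescriptor, and the schema string is an illustrative assumption, not taken from Presto.

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import java.util.*;

public class DescriptorsByPathSketch {
    public static Map<List<String>, ColumnDescriptor> build(MessageType schema) {
        // Key each leaf column by its path components, mirroring descriptorsByPath.
        Map<List<String>, ColumnDescriptor> byPath = new HashMap<>();
        for (ColumnDescriptor column : schema.getColumns()) {
            byPath.put(Arrays.asList(column.getPath()), column);
        }
        return byPath;
    }

    public static void main(String[] args) {
        // Illustrative schema, not from any real table.
        MessageType schema = MessageTypeParser.parseMessageType(
            "message doc { required int64 id; optional binary name (UTF8); }");
        // Arrays.asList("id") resolves the top-level id column.
        System.out.println(build(schema).get(Arrays.asList("id")));
    }
}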
Use of org.apache.parquet.column.ColumnDescriptor in project flink by apache.
The class ParquetVectorizedInputFormat, method checkSchema:
private void checkSchema(MessageType fileSchema, MessageType requestedSchema)
        throws IOException, UnsupportedOperationException {
    if (projectedFields.length != requestedSchema.getFieldCount()) {
        throw new RuntimeException(
            "The number of projected fields does not match the requested schema!");
    }
    /*
     * Check that the requested schema is supported.
     */
    for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
        Type t = requestedSchema.getFields().get(i);
        if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
            throw new UnsupportedOperationException("Complex types not supported.");
        }
        String[] colPath = requestedSchema.getPaths().get(i);
        if (fileSchema.containsPath(colPath)) {
            ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
            if (!fd.equals(requestedSchema.getColumns().get(i))) {
                throw new UnsupportedOperationException("Schema evolution not supported.");
            }
        } else {
            if (requestedSchema.getColumns().get(i).getMaxDefinitionLevel() == 0) {
                // A required column (max definition level 0) cannot be read as null: invalid.
                throw new IOException(
                    "Required column is missing in data file. Col: " + Arrays.toString(colPath));
            }
        }
    }
}
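The final branch rejects a projection that requests a required column the file does not contain, since a column whose max definition level is 0 cannot be padded with nulls. A small self-contained sketch of that situation, using only public parquet-mr APIs (both schema strings are illustrative assumptions):

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MissingRequiredColumnSketch {
    public static void main(String[] args) {
        MessageType fileSchema = MessageTypeParser.parseMessageType(
            "message m { required int32 id; }");
        MessageType requestedSchema = MessageTypeParser.parseMessageType(
            "message m { required int32 id; required binary name (UTF8); }");

        String[] missingPath = requestedSchema.getPaths().get(1);                 // ["name"]
        boolean inFile = fileSchema.containsPath(missingPath);                    // false
        int maxDef = requestedSchema.getColumns().get(1).getMaxDefinitionLevel(); // 0 => required
        // A required column absent from the file cannot be filled with nulls,
        // so checkSchema above would throw an IOException at this point.
        System.out.println("in file: " + inFile + ", max definition level: " + maxDef);
    }
}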
Use of org.apache.parquet.column.ColumnDescriptor in project hive by apache.
The class VectorizedParquetRecordReader, method wrapPathForCache:
private Path wrapPathForCache(Path path, Object fileKey, JobConf configuration,
        List<BlockMetaData> blocks, CacheTag tag) throws IOException {
    if (fileKey == null || cache == null) {
        return path;
    }
    HashSet<ColumnPath> includedCols = new HashSet<>();
    for (ColumnDescriptor col : requestedSchema.getColumns()) {
        includedCols.add(ColumnPath.get(col.getPath()));
    }
    // We could make some assumptions given how the reader currently does the work (consecutive
    // chunks, etc.; blocks and columns stored in offset order in the lists), but we won't -
    // just save all the chunk boundaries and lengths for now.
    TreeMap<Long, Long> chunkIndex = new TreeMap<>();
    for (BlockMetaData block : blocks) {
        for (ColumnChunkMetaData mc : block.getColumns()) {
            if (!includedCols.contains(mc.getPath())) {
                continue;
            }
            chunkIndex.put(mc.getStartingPos(), mc.getStartingPos() + mc.getTotalSize());
        }
    }
    // Register the cache-aware path so that the Parquet reader goes through it.
    configuration.set("fs." + LlapCacheAwareFs.SCHEME + ".impl",
        LlapCacheAwareFs.class.getCanonicalName());
    path = LlapCacheAwareFs.registerFile(cache, path, fileKey, chunkIndex, configuration, tag);
    this.cacheFsPath = path;
    return path;
}
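The boundaries go into a TreeMap, which lets a cache-aware stream find the chunk covering an arbitrary read offset with a single floor lookup; that is presumably why a sorted map is used here. A hypothetical illustration with made-up offsets (none of the values below come from a real file):

import java.util.Map;
import java.util.TreeMap;

public class ChunkIndexLookupSketch {
    public static void main(String[] args) {
        // start offset -> end offset of each included column chunk (made-up values)
        TreeMap<Long, Long> chunkIndex = new TreeMap<>();
        chunkIndex.put(4L, 1_204L);      // chunk occupying [4, 1204)
        chunkIndex.put(1_204L, 9_876L);  // chunk occupying [1204, 9876)

        long readOffset = 2_000L;
        // floorEntry finds the chunk whose start offset is at or before the read offset.
        Map.Entry<Long, Long> chunk = chunkIndex.floorEntry(readOffset);
        boolean insideChunk = chunk != null && readOffset < chunk.getValue();
        System.out.println("offset " + readOffset + " inside a cached chunk: " + insideChunk);
    }
}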
Use of org.apache.parquet.column.ColumnDescriptor in project hive by apache.
The class VectorizedParquetRecordReader, method buildVectorizedParquetReader:
// Build VectorizedParquetColumnReader via Hive typeInfo and Parquet schema
private VectorizedColumnReader buildVectorizedParquetReader(TypeInfo typeInfo, Type type,
        PageReadStore pages, List<ColumnDescriptor> columnDescriptors,
        boolean skipTimestampConversion, int depth) throws IOException {
    List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors);
    switch (typeInfo.getCategory()) {
        case PRIMITIVE:
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            if (fileSchema.getColumns().contains(descriptors.get(0))) {
                return new VectorizedPrimitiveColumnReader(descriptors.get(0),
                    pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type, typeInfo);
            } else {
                // Support for schema evolution
                return new VectorizedDummyColumnReader();
            }
        case STRUCT:
            StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
            List<VectorizedColumnReader> fieldReaders = new ArrayList<>();
            List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
            List<Type> types = type.asGroupType().getFields();
            for (int i = 0; i < fieldTypes.size(); i++) {
                VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i), types.get(i),
                    pages, descriptors, skipTimestampConversion, depth + 1);
                if (r != null) {
                    fieldReaders.add(r);
                } else {
                    throw new RuntimeException("Failed to build Parquet vectorized reader based on Hive type "
                        + fieldTypes.get(i).getTypeName() + " and Parquet type " + types.get(i).toString());
                }
            }
            return new VectorizedStructColumnReader(fieldReaders);
        case LIST:
            checkListColumnSupport(((ListTypeInfo) typeInfo).getListElementTypeInfo());
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            return new VectorizedListColumnReader(descriptors.get(0),
                pages.getPageReader(descriptors.get(0)), skipTimestampConversion, getElementType(type), typeInfo);
        case MAP:
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            // Handle the two Map layouts found in Parquet, e.g.:
            // definition with 1 group:
            //   repeated group map (MAP_KEY_VALUE)
            //     {required binary key (UTF8); optional binary value (UTF8);}
            // definition with 2 groups:
            //   optional group m1 (MAP) {
            //     repeated group map (MAP_KEY_VALUE)
            //       {required binary key (UTF8); optional binary value (UTF8);}
            //   }
            int nestGroup = 0;
            GroupType groupType = type.asGroupType();
            // Descend through single-field groups until the key/value group is reached,
            // giving up after MAP_DEFINITION_LEVEL_MAX levels.
            while (groupType.getFieldCount() < 2) {
                if (nestGroup > MAP_DEFINITION_LEVEL_MAX) {
                    throw new RuntimeException("More than " + MAP_DEFINITION_LEVEL_MAX
                        + " levels of nesting found in Map definition; failed to get the field types for Map with type " + type);
                }
                groupType = groupType.getFields().get(0).asGroupType();
                nestGroup++;
            }
            List<Type> kvTypes = groupType.getFields();
            VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader(
                descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion,
                kvTypes.get(0), typeInfo);
            VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader(
                descriptors.get(1), pages.getPageReader(descriptors.get(1)), skipTimestampConversion,
                kvTypes.get(1), typeInfo);
            return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader);
        case UNION:
        default:
            throw new RuntimeException("Unsupported category " + typeInfo.getCategory().name());
    }
}
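The MAP branch has to cope with the two legacy layouts sketched in the comment: a bare repeated key/value group, and the same group wrapped in an outer group annotated as MAP. Below is a self-contained sketch of both layouts and of the descent loop, built with parquet-mr's MessageTypeParser; the schema strings are illustrative assumptions.

import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MapLayoutSketch {
    public static void main(String[] args) {
        // Layout 1: the repeated key/value group sits directly in the message.
        MessageType oneLevel = MessageTypeParser.parseMessageType(
            "message m { repeated group map (MAP_KEY_VALUE) {"
          + " required binary key (UTF8); optional binary value (UTF8); } }");

        // Layout 2: the same group wrapped in an outer group annotated as MAP.
        MessageType twoLevel = MessageTypeParser.parseMessageType(
            "message m { optional group m1 (MAP) {"
          + " repeated group map (MAP_KEY_VALUE) {"
          + " required binary key (UTF8); optional binary value (UTF8); } } }");

        // Mirror of the while loop above: descend through single-field groups
        // until reaching the group that holds both the key and the value fields.
        GroupType group = twoLevel.getType(0).asGroupType();
        while (group.getFieldCount() < 2) {
            group = group.getFields().get(0).asGroupType();
        }
        System.out.println(oneLevel.getType(0).asGroupType().getFieldCount()); // 2
        System.out.println(group.getFieldCount());                             // 2
    }
}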
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class ParquetMetadataCommand, method printColumnChunk:
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
    String[] path = column.getPath().toArray();
    PrimitiveType type = primitive(schema, path);
    Preconditions.checkNotNull(type);
    ColumnDescriptor desc = schema.getColumnDescription(path);
    long size = column.getTotalSize();
    long count = column.getValueCount();
    float perValue = ((float) size) / count;
    CompressionCodecName codec = column.getCodec();
    Set<Encoding> encodings = column.getEncodings();
    EncodingStats encodingStats = column.getEncodingStats();
    String encodingSummary = encodingStats == null
        ? encodingsAsString(encodings, desc)
        : encodingStatsAsString(encodingStats);
    Statistics stats = column.getStatistics();
    String name = column.getPath().toDotString();
    PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();
    if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
        console.info(String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
            name, type.getTypeLength(), shortCodec(codec), encodingSummary, count, humanReadable(perValue),
            stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
            minMaxAsString(stats, type.getOriginalType())));
    } else {
        console.info(String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s",
            name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue),
            stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
            minMaxAsString(stats, type.getOriginalType())));
    }
}
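For completeness, here is a self-contained sketch of walking a file footer to reach the same per-chunk metadata that printColumnChunk formats; the fields printed are only a subset of the command's output, and the file path is assumed to come from the command line.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FooterWalkSketch {
    public static void main(String[] args) throws Exception {
        // args[0]: path to an existing Parquet file (an assumption of this sketch)
        Path path = new Path(args[0]);
        Configuration conf = new Configuration();
        try (ParquetFileReader reader =
                 ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
            ParquetMetadata footer = reader.getFooter();
            for (BlockMetaData block : footer.getBlocks()) {
                for (ColumnChunkMetaData column : block.getColumns()) {
                    // A few of the per-chunk fields the command formats above.
                    System.out.printf("%s %s %d values, %d bytes%n",
                        column.getPath().toDotString(),
                        column.getCodec(),
                        column.getValueCount(),
                        column.getTotalSize());
                }
            }
        }
    }
}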