
Example 46 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.

From the class ParquetPageSourceFactory, method getParquetTupleDomain:

public static TupleDomain<ColumnDescriptor> getParquetTupleDomain(Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<HiveColumnHandle> effectivePredicate) {
    if (effectivePredicate.isNone()) {
        return TupleDomain.none();
    }
    ImmutableMap.Builder<ColumnDescriptor, Domain> predicate = ImmutableMap.builder();
    for (Entry<HiveColumnHandle, Domain> entry : effectivePredicate.getDomains().get().entrySet()) {
        HiveColumnHandle columnHandle = entry.getKey();
        // skip looking up predicates for complex types as Parquet only stores stats for primitives
        if (!columnHandle.getHiveType().getCategory().equals(PRIMITIVE)) {
            continue;
        }
        RichColumnDescriptor descriptor;
        if (isPushedDownSubfield(columnHandle)) {
            Subfield pushedDownSubfield = getPushedDownSubfield(columnHandle);
            List<String> subfieldPath = columnPathFromSubfield(pushedDownSubfield);
            descriptor = descriptorsByPath.get(subfieldPath);
        } else {
            descriptor = descriptorsByPath.get(ImmutableList.of(columnHandle.getName()));
        }
        if (descriptor != null) {
            predicate.put(descriptor, entry.getValue());
        }
    }
    return TupleDomain.withColumnDomains(predicate.build());
}
Also used: RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), Domain (com.facebook.presto.common.predicate.Domain), TupleDomain (com.facebook.presto.common.predicate.TupleDomain), ImmutableMap (com.google.common.collect.ImmutableMap), HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle), ParquetTypeUtils.columnPathFromSubfield (com.facebook.presto.parquet.ParquetTypeUtils.columnPathFromSubfield), HiveColumnHandle.getPushedDownSubfield (com.facebook.presto.hive.HiveColumnHandle.getPushedDownSubfield), HiveColumnHandle.isPushedDownSubfield (com.facebook.presto.hive.HiveColumnHandle.isPushedDownSubfield), Subfield (com.facebook.presto.common.Subfield)
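Below is a minimal sketch of how getParquetTupleDomain could be wired up. The schema string, the TupleDomainSketch class, and the RichColumnDescriptor(ColumnDescriptor, PrimitiveType) constructor used to build the lookup map are assumptions for illustration, not code from Presto.

import com.facebook.presto.common.predicate.TupleDomain;
import com.facebook.presto.hive.parquet.ParquetPageSourceFactory;
import com.facebook.presto.parquet.RichColumnDescriptor;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TupleDomainSketch {
    public static void main(String[] args) {
        MessageType fileSchema = MessageTypeParser.parseMessageType(
                "message hive_schema { optional int64 id; optional binary name (UTF8); }");
        // Build the path -> descriptor lookup the method expects; the
        // RichColumnDescriptor constructor signature here is an assumption.
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = new HashMap<>();
        for (ColumnDescriptor descriptor : fileSchema.getColumns()) {
            descriptorsByPath.put(
                    Arrays.asList(descriptor.getPath()),
                    new RichColumnDescriptor(
                            descriptor,
                            fileSchema.getType(descriptor.getPath()).asPrimitiveType()));
        }
        // With nothing to push down, the result is simply TupleDomain.all()
        // re-expressed over Parquet column descriptors.
        TupleDomain<ColumnDescriptor> parquetDomain =
                ParquetPageSourceFactory.getParquetTupleDomain(descriptorsByPath, TupleDomain.all());
        System.out.println(parquetDomain);
    }
}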

Example 47 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project flink by apache.

From the class ParquetVectorizedInputFormat, method checkSchema:

private void checkSchema(MessageType fileSchema, MessageType requestedSchema) throws IOException, UnsupportedOperationException {
    if (projectedFields.length != requestedSchema.getFieldCount()) {
        throw new RuntimeException("The quality of field type is incompatible with the request schema!");
    }
    /*
         * Check that the requested schema is supported.
         */
    for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
        Type t = requestedSchema.getFields().get(i);
        if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
            throw new UnsupportedOperationException("Complex types not supported.");
        }
        String[] colPath = requestedSchema.getPaths().get(i);
        if (fileSchema.containsPath(colPath)) {
            ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
            if (!fd.equals(requestedSchema.getColumns().get(i))) {
                throw new UnsupportedOperationException("Schema evolution not supported.");
            }
        } else {
            if (requestedSchema.getColumns().get(i).getMaxDefinitionLevel() == 0) {
                // A required column (max definition level 0) cannot be null-filled, so the file is invalid.
                throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath));
            }
        }
    }
}
Also used: RowType (org.apache.flink.table.types.logical.RowType), GroupType (org.apache.parquet.schema.GroupType), MessageType (org.apache.parquet.schema.MessageType), LogicalType (org.apache.flink.table.types.logical.LogicalType), Type (org.apache.parquet.schema.Type), FlinkRuntimeException (org.apache.flink.util.FlinkRuntimeException), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), IOException (java.io.IOException)
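The missing-column branch is the subtle part: a column absent from the file can only be tolerated if it is nullable. A short sketch of that check using only parquet-mr schema APIs (the SchemaCheckSketch class and both schema strings are made up for illustration):

import java.util.Arrays;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class SchemaCheckSketch {
    public static void main(String[] args) {
        MessageType fileSchema = MessageTypeParser.parseMessageType(
                "message file { required int32 id; }");
        MessageType requestedSchema = MessageTypeParser.parseMessageType(
                "message requested { required int32 id; required int64 extra_col; }");
        for (String[] colPath : requestedSchema.getPaths()) {
            if (!fileSchema.containsPath(colPath)) {
                ColumnDescriptor column = requestedSchema.getColumnDescription(colPath);
                // A required column has max definition level 0, so its absence
                // cannot be represented as null: checkSchema would reject the file.
                System.out.println(Arrays.toString(colPath) + " is missing, required="
                        + (column.getMaxDefinitionLevel() == 0));
            }
        }
    }
}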

Example 48 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project hive by apache.

From the class VectorizedParquetRecordReader, method wrapPathForCache:

private Path wrapPathForCache(Path path, Object fileKey, JobConf configuration, List<BlockMetaData> blocks, CacheTag tag) throws IOException {
    if (fileKey == null || cache == null) {
        return path;
    }
    HashSet<ColumnPath> includedCols = new HashSet<>();
    for (ColumnDescriptor col : requestedSchema.getColumns()) {
        includedCols.add(ColumnPath.get(col.getPath()));
    }
    // We could make some assumptions given how the reader currently does the work (consecutive
    // chunks, etc.; blocks and columns stored in offset order in the lists), but we won't -
    // just save all the chunk boundaries and lengths for now.
    TreeMap<Long, Long> chunkIndex = new TreeMap<>();
    for (BlockMetaData block : blocks) {
        for (ColumnChunkMetaData mc : block.getColumns()) {
            if (!includedCols.contains(mc.getPath()))
                continue;
            chunkIndex.put(mc.getStartingPos(), mc.getStartingPos() + mc.getTotalSize());
        }
    }
    // Register the cache-aware path so that the Parquet reader will go through it.
    configuration.set("fs." + LlapCacheAwareFs.SCHEME + ".impl", LlapCacheAwareFs.class.getCanonicalName());
    path = LlapCacheAwareFs.registerFile(cache, path, fileKey, chunkIndex, configuration, tag);
    this.cacheFsPath = path;
    return path;
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), LlapCacheAwareFs (org.apache.hadoop.hive.llap.LlapCacheAwareFs), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), TreeMap (java.util.TreeMap), HashSet (java.util.HashSet)
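The TreeMap keyed by chunk starting offset makes it cheap to test whether a read falls inside a known chunk. A minimal JDK-only sketch of that lookup follows; how LlapCacheAwareFs actually consumes the index is an assumption here.

import java.util.Map;
import java.util.TreeMap;

public class ChunkIndexSketch {
    // floorEntry finds the chunk with the greatest starting position <= readOffset.
    static boolean insideKnownChunk(TreeMap<Long, Long> chunkIndex, long readOffset) {
        Map.Entry<Long, Long> chunk = chunkIndex.floorEntry(readOffset);
        return chunk != null && readOffset < chunk.getValue();
    }

    public static void main(String[] args) {
        TreeMap<Long, Long> chunkIndex = new TreeMap<>();
        chunkIndex.put(4L, 4L + 1024L);      // chunk starting at offset 4, 1024 bytes
        chunkIndex.put(2048L, 2048L + 512L); // chunk starting at offset 2048, 512 bytes
        System.out.println(insideKnownChunk(chunkIndex, 100L));  // true
        System.out.println(insideKnownChunk(chunkIndex, 1500L)); // false
    }
}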

Example 49 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project hive by apache.

From the class VectorizedParquetRecordReader, method buildVectorizedParquetReader:

// Build VectorizedParquetColumnReader via Hive typeInfo and Parquet schema
private VectorizedColumnReader buildVectorizedParquetReader(TypeInfo typeInfo, Type type, PageReadStore pages, List<ColumnDescriptor> columnDescriptors, boolean skipTimestampConversion, int depth) throws IOException {
    List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors);
    switch(typeInfo.getCategory()) {
        case PRIMITIVE:
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            if (fileSchema.getColumns().contains(descriptors.get(0))) {
                return new VectorizedPrimitiveColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type, typeInfo);
            } else {
                // Support for schema evolution
                return new VectorizedDummyColumnReader();
            }
        case STRUCT:
            StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
            List<VectorizedColumnReader> fieldReaders = new ArrayList<>();
            List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
            List<Type> types = type.asGroupType().getFields();
            for (int i = 0; i < fieldTypes.size(); i++) {
                VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i), types.get(i), pages, descriptors, skipTimestampConversion, depth + 1);
                if (r != null) {
                    fieldReaders.add(r);
                } else {
                    throw new RuntimeException("Fail to build Parquet vectorized reader based on Hive type " + fieldTypes.get(i).getTypeName() + " and Parquet type" + types.get(i).toString());
                }
            }
            return new VectorizedStructColumnReader(fieldReaders);
        case LIST:
            checkListColumnSupport(((ListTypeInfo) typeInfo).getListElementTypeInfo());
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            return new VectorizedListColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, getElementType(type), typeInfo);
        case MAP:
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            // Handle the different Map definitions in Parquet, e.g.:
            // definition with 1 group:
            //   repeated group map (MAP_KEY_VALUE) {
            //     required binary key (UTF8); optional binary value (UTF8);
            //   }
            // definition with 2 groups:
            //   optional group m1 (MAP) {
            //     repeated group map (MAP_KEY_VALUE) {
            //       required binary key (UTF8); optional binary value (UTF8);
            //     }
            //   }
            int nestGroup = 0;
            GroupType groupType = type.asGroupType();
            // Unwrap single-field wrapper groups until the key/value pair is reached, up to MAP_DEFINITION_LEVEL_MAX levels.
            while (groupType.getFieldCount() < 2) {
                if (nestGroup > MAP_DEFINITION_LEVEL_MAX) {
                    throw new RuntimeException("More than " + MAP_DEFINITION_LEVEL_MAX + " level is found in Map definition, " + "Failed to get the field types for Map with type " + type);
                }
                groupType = groupType.getFields().get(0).asGroupType();
                nestGroup++;
            }
            List<Type> kvTypes = groupType.getFields();
            VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, kvTypes.get(0), typeInfo);
            VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader(descriptors.get(1), pages.getPageReader(descriptors.get(1)), skipTimestampConversion, kvTypes.get(1), typeInfo);
            return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader);
        case UNION:
        default:
            throw new RuntimeException("Unsupported category " + typeInfo.getCategory().name());
    }
}
Also used: ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ArrayList (java.util.ArrayList), StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo), PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo), ListTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo), TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo), PrimitiveType (org.apache.parquet.schema.PrimitiveType), GroupType (org.apache.parquet.schema.GroupType), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type), ParquetRuntimeException (org.apache.parquet.ParquetRuntimeException)
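The two-group MAP layout described in the comments can be reproduced with parquet-mr's schema parser to see what the unwrapping loop does. The schema string and the MapUnwrapSketch class are illustrative only:

import java.util.List;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.Type;

public class MapUnwrapSketch {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m {"
                        + " optional group m1 (MAP) {"
                        + "   repeated group map (MAP_KEY_VALUE) {"
                        + "     required binary key (UTF8);"
                        + "     optional binary value (UTF8);"
                        + "   }"
                        + " }"
                        + "}");
        // Descend through single-field wrapper groups until the key/value
        // pair appears, mirroring the while loop above.
        GroupType groupType = schema.getType("m1").asGroupType();
        while (groupType.getFieldCount() < 2) {
            groupType = groupType.getFields().get(0).asGroupType();
        }
        List<Type> kvTypes = groupType.getFields();
        System.out.println("key:   " + kvTypes.get(0));
        System.out.println("value: " + kvTypes.get(1));
    }
}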

Example 50 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

From the class ParquetMetadataCommand, method printColumnChunk:

private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
    String[] path = column.getPath().toArray();
    PrimitiveType type = primitive(schema, path);
    Preconditions.checkNotNull(type);
    ColumnDescriptor desc = schema.getColumnDescription(path);
    long size = column.getTotalSize();
    long count = column.getValueCount();
    float perValue = ((float) size) / count;
    CompressionCodecName codec = column.getCodec();
    Set<Encoding> encodings = column.getEncodings();
    EncodingStats encodingStats = column.getEncodingStats();
    String encodingSummary = encodingStats == null ? encodingsAsString(encodings, desc) : encodingStatsAsString(encodingStats);
    Statistics stats = column.getStatistics();
    String name = column.getPath().toDotString();
    PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();
    if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
        console.info(String.format("%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s", name, type.getTypeLength(), shortCodec(codec), encodingSummary, count, humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()), minMaxAsString(stats, type.getOriginalType())));
    } else {
        console.info(String.format("%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s", name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()), minMaxAsString(stats, type.getOriginalType())));
    }
}
Also used: EncodingStats (org.apache.parquet.column.EncodingStats), CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), PrimitiveType (org.apache.parquet.schema.PrimitiveType), Encoding (org.apache.parquet.column.Encoding), Util.minMaxAsString (org.apache.parquet.cli.Util.minMaxAsString), Util.encodingsAsString (org.apache.parquet.cli.Util.encodingsAsString), Util.encodingStatsAsString (org.apache.parquet.cli.Util.encodingStatsAsString), Statistics (org.apache.parquet.column.statistics.Statistics)
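For context, the ColumnChunkMetaData values printed above all come from the file footer. A minimal sketch of fetching them with the standard parquet-mr reader, where "data.parquet" and the FooterSketch class are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FooterSketch {
    public static void main(String[] args) throws Exception {
        Path file = new Path("data.parquet"); // placeholder path
        try (ParquetFileReader reader =
                ParquetFileReader.open(HadoopInputFile.fromPath(file, new Configuration()))) {
            ParquetMetadata footer = reader.getFooter();
            for (BlockMetaData block : footer.getBlocks()) {
                for (ColumnChunkMetaData column : block.getColumns()) {
                    double perValue = (double) column.getTotalSize() / column.getValueCount();
                    System.out.printf("%s %s %d values, %.1f bytes/value%n",
                            column.getPath().toDotString(), column.getCodec(),
                            column.getValueCount(), perValue);
                }
            }
        }
    }
}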

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 88 usages
MessageType (org.apache.parquet.schema.MessageType): 33 usages
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 18 usages
Test (org.testng.annotations.Test): 18 usages
RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor): 16 usages
ArrayList (java.util.ArrayList): 16 usages
GroupType (org.apache.parquet.schema.GroupType): 14 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 12 usages
Test (org.junit.Test): 12 usages
Domain (com.facebook.presto.common.predicate.Domain): 11 usages
TupleDomain (com.facebook.presto.common.predicate.TupleDomain): 11 usages
Path (org.apache.hadoop.fs.Path): 11 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 11 usages
List (java.util.List): 10 usages
ImmutableList (com.google.common.collect.ImmutableList): 9 usages
HashMap (java.util.HashMap): 9 usages
Configuration (org.apache.hadoop.conf.Configuration): 9 usages
Type (org.apache.parquet.schema.Type): 9 usages
HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle): 8 usages
IOException (java.io.IOException): 7 usages