Example 1 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project hive by apache.

The class VectorizedParquetRecordReader, method buildVectorizedParquetReader.

// Build VectorizedParquetColumnReader via Hive typeInfo and Parquet schema
private VectorizedColumnReader buildVectorizedParquetReader(TypeInfo typeInfo, Type type, PageReadStore pages, List<ColumnDescriptor> columnDescriptors, String conversionTimeZone, int depth) throws IOException {
    List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors);
    switch(typeInfo.getCategory()) {
        case PRIMITIVE:
            // Guard the filtered list, since descriptors.get(0) is what is read below.
            if (descriptors == null || descriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            } else {
                return new VectorizedPrimitiveColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), conversionTimeZone, type);
            }
        case STRUCT:
            StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
            List<VectorizedColumnReader> fieldReaders = new ArrayList<>();
            List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
            List<Type> types = type.asGroupType().getFields();
            for (int i = 0; i < fieldTypes.size(); i++) {
                VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i), types.get(i), pages, descriptors, conversionTimeZone, depth + 1);
                if (r != null) {
                    fieldReaders.add(r);
                } else {
                    throw new RuntimeException("Fail to build Parquet vectorized reader based on Hive type " + fieldTypes.get(i).getTypeName() + " and Parquet type" + types.get(i).toString());
                }
            }
            return new VectorizedStructColumnReader(fieldReaders);
        case LIST:
        case MAP:
        case UNION:
        default:
            throw new RuntimeException("Unsupported category " + typeInfo.getCategory().name());
    }
}
Also used: ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) ParquetRuntimeException(org.apache.parquet.ParquetRuntimeException)
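
For readers who want the shape of this recursion without the Hive and Parquet machinery, here is a minimal, self-contained sketch of the same pattern: walk a logical type tree in lockstep with the schema, emitting one reader per primitive leaf and one composite reader per struct. Every type in it (Reader, LeafReader, StructReader, TypeNode) is a made-up stand-in, not a Hive or Parquet class.

import java.util.ArrayList;
import java.util.List;

public class ReaderTreeSketch {
    interface Reader {}                                  // stand-in for VectorizedColumnReader
    record LeafReader(String column) implements Reader {}
    record StructReader(List<Reader> fields) implements Reader {}

    // A toy logical type: either a primitive leaf or a struct of named children.
    record TypeNode(String name, List<TypeNode> children) {
        boolean isPrimitive() { return children.isEmpty(); }
    }

    static Reader build(TypeNode type, int depth) {
        if (type.isPrimitive()) {
            // In the Hive code this is where a VectorizedPrimitiveColumnReader
            // is created from the matching ColumnDescriptor and its PageReader.
            return new LeafReader(type.name());
        }
        List<Reader> fieldReaders = new ArrayList<>();
        for (TypeNode child : type.children()) {
            fieldReaders.add(build(child, depth + 1));   // recurse one level deeper
        }
        return new StructReader(fieldReaders);
    }

    public static void main(String[] args) {
        TypeNode schema = new TypeNode("s",
                List.of(new TypeNode("a", List.of()), new TypeNode("b", List.of())));
        System.out.println(build(schema, 0));
        // StructReader[fields=[LeafReader[column=a], LeafReader[column=b]]]
    }
}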

Example 2 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

The class ParquetFooterStatCollector, method collectColStat.

@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);
    // map from column path to ColumnDescriptor
    Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
    // map from column path to ColumnChunkMetaData
    final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
    // map from column path to MajorType
    final Map<SchemaPath, TypeProtos.MajorType> columnTypeMap = new HashMap<>();
    // map from column path to SchemaElement
    final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
    // map from column path to column statistics
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    final org.apache.parquet.format.FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
    for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
        final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
        if (fields.contains(schemaPath)) {
            columnDescMap.put(schemaPath, column);
        }
    }
    for (final SchemaElement se : fileMetaData.getSchema()) {
        final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
        if (fields.contains(schemaPath)) {
            schemaElementMap.put(schemaPath, se);
        }
    }
    for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
        final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
        if (fields.contains(schemaPath)) {
            columnChkMetaMap.put(schemaPath, colMetaData);
        }
    }
    for (final SchemaPath path : fields) {
        if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path) && columnChkMetaMap.containsKey(path)) {
            ColumnDescriptor columnDesc = columnDescMap.get(path);
            SchemaElement se = schemaElementMap.get(path);
            ColumnChunkMetaData metaData = columnChkMetaMap.get(path);
            TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);
            columnTypeMap.put(path, type);
            Statistics stat = metaData.getStatistics();
            if (type.getMinorType() == TypeProtos.MinorType.DATE) {
                stat = convertDateStatIfNecessary(metaData.getStatistics(), containsCorruptDates);
            }
            statMap.put(path, new ColumnStatistics(stat, type));
        } else {
            final String columnName = path.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                byte[] val = implicitColValues.get(columnName).getBytes();
                stat.setMinMaxFromBytes(val, val);
                statMap.put(path, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used: HashMap(java.util.HashMap) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) Stopwatch(com.google.common.base.Stopwatch) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) SchemaPath(org.apache.drill.common.expression.SchemaPath) SchemaElement(org.apache.parquet.format.SchemaElement) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter)
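
The method above is essentially a join across several maps keyed by the same SchemaPath, with a fallback for implicit (file-level) columns. Below is a stripped-down, hypothetical sketch of that join pattern, using plain strings instead of Drill and Parquet types; none of these names are real Drill APIs.

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class StatJoinSketch {
    public static void main(String[] args) {
        Map<String, String> descriptors  = Map.of("a", "desc(a)", "b", "desc(b)");
        Map<String, String> chunkMeta    = Map.of("a", "meta(a)");       // "b" missing here
        Map<String, String> implicitCols = Map.of("b", "file-level-value");

        Map<String, String> stats = new HashMap<>();
        for (String col : Set.of("a", "b")) {
            if (descriptors.containsKey(col) && chunkMeta.containsKey(col)) {
                // Real column: take statistics from the footer metadata.
                stats.put(col, "stats from " + chunkMeta.get(col));
            } else if (implicitCols.containsKey(col)) {
                // Implicit column: a constant value, so min == max and no nulls.
                stats.put(col, "constant " + implicitCols.get(col));
            }
        }
        System.out.println(stats);  // {a=stats from meta(a), b=constant file-level-value}
    }
}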

Example 3 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

The class DrillParquetReader, method getProjection.

public static MessageType getProjection(MessageType schema, Collection<SchemaPath> columns, List<SchemaPath> columnsNotFound) {
    MessageType projection = null;
    String messageName = schema.getName();
    List<ColumnDescriptor> schemaColumns = schema.getColumns();
    // parquet type.union() seems to lose ConvertedType info when merging two columns that are the same type. This can
    // happen when selecting two elements from an array. To work around this, we use a Set of SchemaPath to avoid
    // duplicates and then merge the types at the end.
    Set<SchemaPath> selectedSchemaPaths = Sets.newLinkedHashSet();
    // get a list of modified columns which have the array elements removed from the schema path, since the Parquet
    // schema doesn't include array elements
    List<SchemaPath> modifiedColumns = Lists.newLinkedList();
    for (SchemaPath path : columns) {
        List<String> segments = Lists.newArrayList();
        PathSegment seg = path.getRootSegment();
        do {
            if (seg.isNamed()) {
                segments.add(seg.getNameSegment().getPath());
            }
        } while ((seg = seg.getChild()) != null);
        String[] pathSegments = new String[segments.size()];
        segments.toArray(pathSegments);
        SchemaPath modifiedSchemaPath = SchemaPath.getCompoundPath(pathSegments);
        modifiedColumns.add(modifiedSchemaPath);
    }
    // convert the columns in the Parquet schema to a list of SchemaPath columns so that they can be compared in a
    // case-insensitive manner to the projection columns
    List<SchemaPath> schemaPaths = Lists.newLinkedList();
    for (ColumnDescriptor columnDescriptor : schemaColumns) {
        String[] schemaColDesc = Arrays.copyOf(columnDescriptor.getPath(), columnDescriptor.getPath().length);
        SchemaPath schemaPath = SchemaPath.getCompoundPath(schemaColDesc);
        schemaPaths.add(schemaPath);
    }
    // loop through projection columns and add any columns that are missing from the Parquet schema to the columnsNotFound list
    for (SchemaPath columnPath : modifiedColumns) {
        boolean notFound = true;
        for (SchemaPath schemaPath : schemaPaths) {
            if (schemaPath.contains(columnPath)) {
                selectedSchemaPaths.add(schemaPath);
                notFound = false;
            }
        }
        if (notFound) {
            columnsNotFound.add(columnPath);
        }
    }
    // convert each SchemaPath in selectedSchemaPaths to a Parquet type and merge it into the projection schema
    for (SchemaPath schemaPath : selectedSchemaPaths) {
        List<String> segments = Lists.newArrayList();
        PathSegment seg = schemaPath.getRootSegment();
        do {
            segments.add(seg.getNameSegment().getPath());
        } while ((seg = seg.getChild()) != null);
        String[] pathSegments = new String[segments.size()];
        segments.toArray(pathSegments);
        Type t = getType(pathSegments, 0, schema);
        if (projection == null) {
            projection = new MessageType(messageName, t);
        } else {
            projection = projection.union(new MessageType(messageName, t));
        }
    }
    return projection;
}
Also used: GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PathSegment(org.apache.drill.common.expression.PathSegment)
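
The first loop above strips array-index segments so that projection paths can be compared against Parquet column paths, which never contain indices. Here is a tiny, hypothetical sketch of that normalization, with plain strings and integers standing in for Drill's PathSegment types.

import java.util.ArrayList;
import java.util.List;

public class PathNormalizeSketch {
    // A segment is either a name (String) or an array index (Integer); indices are skipped.
    static List<String> namedSegmentsOnly(List<Object> segments) {
        List<String> out = new ArrayList<>();
        for (Object seg : segments) {
            if (seg instanceof String name) {   // named segment: keep it
                out.add(name);
            }                                   // index segment: drop it
        }
        return out;
    }

    public static void main(String[] args) {
        // A projection like col.list[2].x normalizes to the column path col.list.x.
        System.out.println(namedSegmentsOnly(List.of("col", "list", 2, "x")));
        // [col, list, x]
    }
}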

Example 4 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project hive by apache.

The class VectorizedParquetRecordReader, method wrapPathForCache.

private Path wrapPathForCache(Path path, Object fileKey, JobConf configuration, List<BlockMetaData> blocks, String tag) throws IOException {
    if (fileKey == null || cache == null) {
        return path;
    }
    HashSet<ColumnPath> includedCols = new HashSet<>();
    for (ColumnDescriptor col : requestedSchema.getColumns()) {
        includedCols.add(ColumnPath.get(col.getPath()));
    }
    // We could make some assumptions given how the reader currently does the work (consecutive
    // chunks, etc.; blocks and columns stored in offset order in the lists), but we won't -
    // just save all the chunk boundaries and lengths for now.
    TreeMap<Long, Long> chunkIndex = new TreeMap<>();
    for (BlockMetaData block : blocks) {
        for (ColumnChunkMetaData mc : block.getColumns()) {
            if (!includedCols.contains(mc.getPath())) {
                continue;
            }
            chunkIndex.put(mc.getStartingPos(), mc.getStartingPos() + mc.getTotalSize());
        }
    }
    // Register the cache-aware path so that the Parquet reader goes through it.
    configuration.set("fs." + LlapCacheAwareFs.SCHEME + ".impl", LlapCacheAwareFs.class.getCanonicalName());
    path = LlapCacheAwareFs.registerFile(cache, path, fileKey, chunkIndex, configuration, tag);
    this.cacheFsPath = path;
    return path;
}
Also used: BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) LlapCacheAwareFs(org.apache.hadoop.hive.llap.LlapCacheAwareFs) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) TreeMap(java.util.TreeMap) HashSet(java.util.HashSet)
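
The TreeMap built above maps each chunk's starting offset to its end offset. A consumer of such an index would typically use floorEntry() to find the chunk covering a given read position. The sketch below shows that lookup with made-up offsets; it is independent of any Hive or LLAP API.

import java.util.Map;
import java.util.TreeMap;

public class ChunkIndexSketch {
    public static void main(String[] args) {
        TreeMap<Long, Long> chunkIndex = new TreeMap<>();  // start offset -> end offset
        chunkIndex.put(4L, 100L);     // chunk covering [4, 100)
        chunkIndex.put(100L, 250L);   // chunk covering [100, 250)

        long pos = 130L;
        // floorEntry() returns the entry with the greatest start offset <= pos.
        Map.Entry<Long, Long> chunk = chunkIndex.floorEntry(pos);
        boolean inside = chunk != null && pos < chunk.getValue();
        System.out.println(inside
                ? "pos " + pos + " is in chunk starting at " + chunk.getKey()
                : "pos " + pos + " is outside any indexed chunk");
    }
}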

Example 5 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project drill by axbaretto.

The class ParquetSchema, method loadParquetSchema.

/**
 * Scan the Parquet footer, then map each Parquet column to the list of columns
 * we want to read. Track those to be read.
 */
private void loadParquetSchema() {
    // TODO - figure out how to deal with this better once we add nested reading; also note where this map is used below
    // store a map from column name to converted types if they are non-null
    Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
    // loop to add up the length of the fixed width columns and build the schema
    for (ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
        ParquetColumnMetadata columnMetadata = new ParquetColumnMetadata(column);
        columnMetadata.resolveDrillType(schemaElements, options);
        if (!fieldSelected(columnMetadata.field)) {
            continue;
        }
        selectedColumnMetadata.add(columnMetadata);
    }
}
Also used: ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) SchemaElement(org.apache.parquet.format.SchemaElement)
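
The loop above is a simple select-and-filter pass over the footer's columns. Here is a minimal, hypothetical sketch of the same shape, with strings and a predicate standing in for Drill's column metadata and its fieldSelected() check; none of these names come from Drill.

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;

public class ColumnSelectSketch {
    public static void main(String[] args) {
        List<String> allColumns = List.of("id", "name", "payload");
        Set<String> projected = Set.of("id", "name");
        Predicate<String> fieldSelected = projected::contains;

        List<String> selected = new ArrayList<>();
        for (String column : allColumns) {
            if (!fieldSelected.test(column)) {
                continue;                       // skip unprojected columns, as in loadParquetSchema()
            }
            selected.add(column);               // here Drill would also resolve the Drill type
        }
        System.out.println(selected);           // [id, name]
    }
}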

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor) - 88
MessageType (org.apache.parquet.schema.MessageType) - 33
PrimitiveType (org.apache.parquet.schema.PrimitiveType) - 18
Test (org.testng.annotations.Test) - 18
RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor) - 16
ArrayList (java.util.ArrayList) - 16
GroupType (org.apache.parquet.schema.GroupType) - 14
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData) - 12
Test (org.junit.Test) - 12
Domain (com.facebook.presto.common.predicate.Domain) - 11
TupleDomain (com.facebook.presto.common.predicate.TupleDomain) - 11
Path (org.apache.hadoop.fs.Path) - 11
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) - 11
List (java.util.List) - 10
ImmutableList (com.google.common.collect.ImmutableList) - 9
HashMap (java.util.HashMap) - 9
Configuration (org.apache.hadoop.conf.Configuration) - 9
Type (org.apache.parquet.schema.Type) - 9
HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle) - 8
IOException (java.io.IOException) - 7