Example 16 with MessageType

Use of org.apache.parquet.schema.MessageType in project drill by apache.

From class ParquetSchemaMerge, method main:

public static void main(String[] args) {
    MessageType message1;
    MessageType message2;
    // Note: c is constructed but never added to any group, so b remains empty in message1
    PrimitiveType c = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT32, "c");
    GroupType b = new GroupType(Repetition.REQUIRED, "b");
    GroupType a = new GroupType(Repetition.OPTIONAL, "a", b);
    message1 = new MessageType("root", a);
    PrimitiveType c2 = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT32, "d");
    GroupType b2 = new GroupType(Repetition.OPTIONAL, "b", c2);
    GroupType a2 = new GroupType(Repetition.OPTIONAL, "a", b2);
    message2 = new MessageType("root", a2);
    // union() merges the field lists of the two schemas into a single MessageType
    MessageType message3 = message1.union(message2);
    StringBuilder builder = new StringBuilder();
    message3.writeToStringBuilder(builder, "");
    System.out.println(builder);
}
Also used : GroupType(org.apache.parquet.schema.GroupType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType)
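
For comparison, here is a minimal, self-contained sketch of the same union behavior that builds the schemas from strings with parquet-mr's MessageTypeParser instead of type constructors (the schema strings and class name are illustrative):

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ParquetUnionSketch {
    public static void main(String[] args) {
        // Two schemas that share the nested group a.b but declare different leaves
        MessageType left = MessageTypeParser.parseMessageType(
            "message root { optional group a { required group b { optional int32 c; } } }");
        MessageType right = MessageTypeParser.parseMessageType(
            "message root { optional group a { required group b { optional int32 d; } } }");
        // After union(), a.b should contain both c and d
        System.out.println(left.union(right));
    }
}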

Example 17 with MessageType

Use of org.apache.parquet.schema.MessageType in project drill by apache.

From class Metadata, method getParquetFileMetadata_v3:

/**
   * Get the Parquet metadata for a single file.
   *
   * @param parquetTableMetadata table-level metadata that accumulates column type info
   * @param file the status of the file to read
   * @return the metadata for the given file
   * @throws IOException if the file footer cannot be read
   */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, FileStatus file) throws IOException {
    ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
    MessageType schema = metadata.getFileMetaData().getSchema();
    Map<SchemaPath, ColTypeInfo> colTypeInfoMap = Maps.newHashMap();
    for (String[] path : schema.getPaths()) {
        colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
    }
    List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();
    ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
    ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
    boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
    if (logger.isDebugEnabled()) {
        logger.debug(containsCorruptDates.toString());
    }
    for (BlockMetaData rowGroup : metadata.getBlocks()) {
        List<ColumnMetadata_v3> columnMetadataList = Lists.newArrayList();
        long length = 0;
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            ColumnMetadata_v3 columnMetadata;
            boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
            Statistics<?> stats = col.getStatistics();
            String[] columnName = col.getPath().toArray();
            SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
            ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);
            ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName, col.getType(), colTypeInfo.originalType, colTypeInfo.precision, colTypeInfo.scale, colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);
            if (parquetTableMetadata.columnTypeInfo == null) {
                parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
            }
            // Save the column schema info. We'll merge it into one list
            parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name), columnTypeMetadata);
            if (statsAvailable) {
                // Write stats when they are not null
                Object minValue = null;
                Object maxValue = null;
                if (stats.genericGetMax() != null && stats.genericGetMin() != null) {
                    minValue = stats.genericGetMin();
                    maxValue = stats.genericGetMax();
                    if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION && columnTypeMetadata.originalType == OriginalType.DATE) {
                        minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
                        maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
                    }
                }
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), minValue, maxValue, stats.getNumNulls());
            } else {
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), null, null, null);
            }
            columnMetadataList.add(columnMetadata);
            length += col.getTotalSize();
        }
        // Skip empty row groups; note that the schema was still read above even when a row group holds no rows
        if (rowGroup.getRowCount() == 0) {
            continue;
        }
        RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
        rowGroupMetadataList.add(rowGroupMeta);
    }
    String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
    return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ArrayList(java.util.ArrayList) SchemaPath(org.apache.drill.common.expression.SchemaPath) MessageType(org.apache.parquet.schema.MessageType)
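
The footer-reading pattern above can be reproduced outside Drill with plain parquet-mr. A minimal sketch, assuming a local file (the path is a placeholder, and readFooter is deprecated in newer parquet-mr releases):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterStatsSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point this at a real Parquet file
        ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), new Path("/tmp/example.parquet"));
        for (BlockMetaData rowGroup : footer.getBlocks()) {
            for (ColumnChunkMetaData col : rowGroup.getColumns()) {
                // Guard for absent or empty statistics, as the Drill code above does
                if (col.getStatistics() != null && !col.getStatistics().isEmpty()) {
                    System.out.println(col.getPath() + ": min=" + col.getStatistics().genericGetMin()
                        + ", max=" + col.getStatistics().genericGetMax()
                        + ", nulls=" + col.getStatistics().getNumNulls());
                }
            }
        }
    }
}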

Example 18 with MessageType

Use of org.apache.parquet.schema.MessageType in project drill by apache.

From class Metadata, method getColTypeInfo:

private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, int depth) {
    if (type.isPrimitive()) {
        PrimitiveType primitiveType = (PrimitiveType) type;
        int precision = 0;
        int scale = 0;
        if (primitiveType.getDecimalMetadata() != null) {
            precision = primitiveType.getDecimalMetadata().getPrecision();
            scale = primitiveType.getDecimalMetadata().getScale();
        }
        int repetitionLevel = schema.getMaxRepetitionLevel(path);
        int definitionLevel = schema.getMaxDefinitionLevel(path);
        return new ColTypeInfo(type.getOriginalType(), precision, scale, repetitionLevel, definitionLevel);
    }
    // Not a primitive yet: descend into the child that matches the next path segment and recurse
    Type t = ((GroupType) type).getType(path[depth]);
    return getColTypeInfo(schema, t, path, depth + 1);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType)
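
The repetition and definition levels that getColTypeInfo records fall directly out of the schema's nesting. A small sketch of how the levels are derived for a nested path (schema string and class name are illustrative):

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class LevelSketch {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
            "message root { optional group a { repeated group b { optional int32 c; } } }");
        String[] path = {"a", "b", "c"};
        // One repeated ancestor (b) gives max repetition level 1; three non-required
        // levels (optional a, repeated b, optional c) give max definition level 3
        System.out.println("rep=" + schema.getMaxRepetitionLevel(path)
            + ", def=" + schema.getMaxDefinitionLevel(path));
    }
}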

Example 19 with MessageType

Use of org.apache.parquet.schema.MessageType in project drill by apache.

From class DrillParquetReader, method getProjection:

public static MessageType getProjection(MessageType schema, Collection<SchemaPath> columns, List<SchemaPath> columnsNotFound) {
    MessageType projection = null;
    String messageName = schema.getName();
    List<ColumnDescriptor> schemaColumns = schema.getColumns();
    // Parquet Type.union() seems to lose ConvertedType info when merging two columns of the same type. This can
    // happen when selecting two elements from an array. To work around this, we use a set of SchemaPath to avoid
    // duplicates and then merge the types at the end.
    Set<SchemaPath> selectedSchemaPaths = Sets.newLinkedHashSet();
    // Build the list of projection columns with array indexes removed from the schema path, since the Parquet schema does not include array elements.
    List<SchemaPath> modifiedColumns = Lists.newLinkedList();
    for (SchemaPath path : columns) {
        List<String> segments = Lists.newArrayList();
        PathSegment seg = path.getRootSegment();
        do {
            if (seg.isNamed()) {
                segments.add(seg.getNameSegment().getPath());
            }
        } while ((seg = seg.getChild()) != null);
        String[] pathSegments = new String[segments.size()];
        segments.toArray(pathSegments);
        SchemaPath modifiedSchemaPath = SchemaPath.getCompoundPath(pathSegments);
        modifiedColumns.add(modifiedSchemaPath);
    }
    // Convert the columns in the Parquet schema to a list of SchemaPath columns so that they can be compared
    // to the projection columns in a case-insensitive manner.
    List<SchemaPath> schemaPaths = Lists.newLinkedList();
    for (ColumnDescriptor columnDescriptor : schemaColumns) {
        String[] schemaColDesc = Arrays.copyOf(columnDescriptor.getPath(), columnDescriptor.getPath().length);
        SchemaPath schemaPath = SchemaPath.getCompoundPath(schemaColDesc);
        schemaPaths.add(schemaPath);
    }
    // Loop through the projection columns and add any that are missing from the Parquet schema to the columnsNotFound list.
    for (SchemaPath columnPath : modifiedColumns) {
        boolean notFound = true;
        for (SchemaPath schemaPath : schemaPaths) {
            if (schemaPath.contains(columnPath)) {
                selectedSchemaPaths.add(schemaPath);
                notFound = false;
            }
        }
        if (notFound) {
            columnsNotFound.add(columnPath);
        }
    }
    // Convert each selected SchemaPath to a Parquet type and merge it into the projection schema.
    for (SchemaPath schemaPath : selectedSchemaPaths) {
        List<String> segments = Lists.newArrayList();
        PathSegment seg = schemaPath.getRootSegment();
        do {
            segments.add(seg.getNameSegment().getPath());
        } while ((seg = seg.getChild()) != null);
        String[] pathSegments = new String[segments.size()];
        segments.toArray(pathSegments);
        Type t = getType(pathSegments, 0, schema);
        if (projection == null) {
            projection = new MessageType(messageName, t);
        } else {
            projection = projection.union(new MessageType(messageName, t));
        }
    }
    return projection;
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PathSegment(org.apache.drill.common.expression.PathSegment)
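
A hedged usage sketch for getProjection, with an illustrative schema and column names (the package of DrillParquetReader is assumed here):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.exec.store.parquet2.DrillParquetReader;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ProjectionSketch {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
            "message root { optional int32 id; optional binary name; }");
        // Request one column that exists and one that does not
        List<SchemaPath> wanted = Arrays.asList(SchemaPath.getSimplePath("id"), SchemaPath.getSimplePath("nope"));
        List<SchemaPath> notFound = new ArrayList<>();
        MessageType projection = DrillParquetReader.getProjection(schema, wanted, notFound);
        System.out.println(projection); // contains only the id column
        System.out.println(notFound);   // holds the unmatched column "nope"
    }
}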

Example 20 with MessageType

Use of org.apache.parquet.schema.MessageType in project drill by apache.

From class DrillParquetReader, method setup:

@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
    try {
        this.operatorContext = context;
        schema = footer.getFileMetaData().getSchema();
        MessageType projection = null;
        if (isStarQuery()) {
            projection = schema;
        } else {
            columnsNotFound = new ArrayList<SchemaPath>();
            projection = getProjection(schema, getColumns(), columnsNotFound);
            if (projection == null) {
                projection = schema;
            }
            if (columnsNotFound != null && columnsNotFound.size() > 0) {
                nullFilledVectors = new ArrayList<>();
                for (SchemaPath col : columnsNotFound) {
                    nullFilledVectors.add((NullableIntVector) output.addField(MaterializedField.create(col.getAsUnescapedPath(), org.apache.drill.common.types.Types.optional(TypeProtos.MinorType.INT)), (Class<? extends ValueVector>) TypeHelper.getValueVectorClass(TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)));
                }
                if (columnsNotFound.size() == getColumns().size()) {
                    noColumnsFound = true;
                }
            }
        }
        logger.debug("Requesting schema {}", projection);
        ColumnIOFactory factory = new ColumnIOFactory(false);
        MessageColumnIO columnIO = factory.getColumnIO(projection, schema);
        Map<ColumnPath, ColumnChunkMetaData> paths = new HashMap<>();
        for (ColumnChunkMetaData md : footer.getBlocks().get(entry.getRowGroupIndex()).getColumns()) {
            paths.put(md.getPath(), md);
        }
        Path filePath = new Path(entry.getPath());
        BlockMetaData blockMetaData = footer.getBlocks().get(entry.getRowGroupIndex());
        recordCount = (int) blockMetaData.getRowCount();
        pageReadStore = new ColumnChunkIncReadStore(recordCount, CodecFactory.createDirectCodecFactory(fileSystem.getConf(), new ParquetDirectByteBufferAllocator(operatorContext.getAllocator()), 0), operatorContext.getAllocator(), fileSystem, filePath);
        for (String[] path : schema.getPaths()) {
            Type type = schema.getType(path);
            if (type.isPrimitive()) {
                ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
                pageReadStore.addColumn(schema.getColumnDescription(path), md);
            }
        }
        if (!noColumnsFound) {
            writer = new VectorContainerWriter(output);
            // Discard the columns not found in the schema when creating the DrillParquetRecordMaterializer, since they have already been added to the output.
            final Collection<SchemaPath> columns = columnsNotFound == null || columnsNotFound.size() == 0 ? getColumns() : CollectionUtils.subtract(getColumns(), columnsNotFound);
            recordMaterializer = new DrillParquetRecordMaterializer(output, writer, projection, columns, fragmentContext.getOptions(), containsCorruptedDates);
            primitiveVectors = writer.getMapVector().getPrimitiveVectors();
            recordReader = columnIO.getRecordReader(pageReadStore, recordMaterializer);
        }
    } catch (Exception e) {
        handleAndRaise("Failure in setting up reader", e);
    }
}
Also used : ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetDirectByteBufferAllocator(org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator) VectorContainerWriter(org.apache.drill.exec.vector.complex.impl.VectorContainerWriter) HashMap(java.util.HashMap) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) DrillRuntimeException(org.apache.drill.common.exceptions.DrillRuntimeException) OutOfMemoryException(org.apache.drill.exec.exception.OutOfMemoryException) ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) IOException(java.io.IOException) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) ColumnChunkIncReadStore(org.apache.parquet.hadoop.ColumnChunkIncReadStore)
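
The Drill-specific wiring in setup() follows parquet-mr's generic record-assembly pattern: build a MessageColumnIO from the projection and file schema, then ask it for a RecordReader backed by a page store and a RecordMaterializer. A standalone sketch of that pattern, using the library's example Group materializer in place of DrillParquetRecordMaterializer (the file path is a placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

public class AssemblySketch {
    public static void main(String[] args) throws Exception {
        Path file = new Path("/tmp/example.parquet"); // placeholder path
        try (ParquetFileReader reader = ParquetFileReader.open(new Configuration(), file)) {
            MessageType schema = reader.getFooter().getFileMetaData().getSchema();
            // Here the full file schema doubles as the projection
            MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
            PageReadStore rowGroup;
            while ((rowGroup = reader.readNextRowGroup()) != null) {
                RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
                for (long i = 0; i < rowGroup.getRowCount(); i++) {
                    System.out.println(recordReader.read());
                }
            }
        }
    }
}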

Aggregations

MessageType (org.apache.parquet.schema.MessageType): 46 usages
Test (org.junit.Test): 25 usages
FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate): 15 usages
SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument): 9 usages
Path (org.apache.hadoop.fs.Path): 8 usages
Type (org.apache.parquet.schema.Type): 7 usages
GroupType (org.apache.parquet.schema.GroupType): 6 usages
Configuration (org.apache.hadoop.conf.Configuration): 5 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5 usages
OriginalType (org.apache.parquet.schema.OriginalType): 4 usages
ArrayList (java.util.ArrayList): 3 usages
HashMap (java.util.HashMap): 3 usages
SchemaPath (org.apache.drill.common.expression.SchemaPath): 3 usages
HiveChar (org.apache.hadoop.hive.common.type.HiveChar): 3 usages
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 3 usages
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 3 usages
DimensionSchema (io.druid.data.input.impl.DimensionSchema): 2 usages
File (java.io.File): 2 usages
HashSet (java.util.HashSet): 2 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 2 usages