
Example 1 with BlockMetaData

Use of org.apache.parquet.hadoop.metadata.BlockMetaData in project hive by apache.

From the class VectorizedParquetRecordReader, the method initialize:

public void initialize(ParquetInputSplit split, JobConf configuration) throws IOException, InterruptedException {
    jobConf = configuration;
    ParquetMetadata footer;
    List<BlockMetaData> blocks;
    boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
    this.file = split.getPath();
    long[] rowGroupOffsets = split.getRowGroupOffsets();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    // if task.side.metadata is set, rowGroupOffsets is null
    if (rowGroupOffsets == null) {
        // TODO: check whether rowGroupOffsets can be null
        // then we need to apply the predicate push down filter
        footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
        MessageType fileSchema = footer.getFileMetaData().getSchema();
        FilterCompat.Filter filter = getFilter(configuration);
        blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    } else {
        // otherwise we find the row groups that were selected on the client
        footer = readFooter(configuration, file, NO_FILTER);
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        blocks = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            if (offsets.contains(block.getStartingPos())) {
                blocks.add(block);
            }
        }
        // verify we found them all
        if (blocks.size() != rowGroupOffsets.length) {
            long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
            for (int i = 0; i < foundRowGroupOffsets.length; i++) {
                foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
            }
            // provide a good error message in case there's a bug
            throw new IllegalStateException("All the offsets listed in the split should be found in the file." + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks + " out of: " + Arrays.toString(foundRowGroupOffsets) + " in range " + split.getStart() + ", " + split.getEnd());
        }
    }
    for (BlockMetaData block : blocks) {
        this.totalRowCount += block.getRowCount();
    }
    this.fileSchema = footer.getFileMetaData().getSchema();
    MessageType tableSchema;
    if (indexAccess) {
        List<Integer> indexSequence = new ArrayList<>();
        // Generates a sequence list of indexes
        for (int i = 0; i < columnNamesList.size(); i++) {
            indexSequence.add(i);
        }
        tableSchema = DataWritableReadSupport.getSchemaByIndex(fileSchema, columnNamesList, indexSequence);
    } else {
        tableSchema = DataWritableReadSupport.getSchemaByName(fileSchema, columnNamesList, columnTypesList);
    }
    indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
    if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
        requestedSchema = DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted);
    } else {
        requestedSchema = fileSchema;
    }
    this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), FilterCompat (org.apache.parquet.filter2.compat.FilterCompat), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), ArrayList (java.util.ArrayList), MessageType (org.apache.parquet.schema.MessageType), HashSet (java.util.HashSet)
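The branch that runs when rowGroupOffsets is non-null can be reduced to a small, self-contained sketch: read the footer, keep only the row groups whose starting positions match the offsets carried by the split, and sum their row counts. This is a minimal sketch, not Hive code; the class name, file path, and offset value below are placeholder assumptions.

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class RowGroupOffsetSelection {

    // Keeps only the row groups whose starting position is listed in rowGroupOffsets.
    static List<BlockMetaData> selectByOffset(Configuration conf, Path file, long[] rowGroupOffsets) throws IOException {
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
        Set<Long> wanted = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            wanted.add(offset);
        }
        List<BlockMetaData> selected = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            if (wanted.contains(block.getStartingPos())) {
                selected.add(block);
            }
        }
        // Mirrors the consistency check in the Hive reader: every requested offset must be found.
        if (selected.size() != rowGroupOffsets.length) {
            throw new IllegalStateException("Expected row groups at " + Arrays.toString(rowGroupOffsets)
                + " but found only " + selected.size() + " of them in " + file);
        }
        return selected;
    }

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example.parquet");  // placeholder path
        long[] offsets = { 4L };                       // placeholder: offset of the first row group
        long totalRows = 0;
        for (BlockMetaData block : selectByOffset(conf, file, offsets)) {
            totalRows += block.getRowCount();
        }
        System.out.println("Selected row groups contain " + totalRows + " rows");
    }
}

Matching on BlockMetaData.getStartingPos() works because a split built on the client side records the byte offset at which each selected row group begins; when task-side metadata is used instead, the offsets are null and the reader falls back to predicate-based row-group filtering, as in the first branch above.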

Example 2 with BlockMetaData

Use of org.apache.parquet.hadoop.metadata.BlockMetaData in project drill by apache.

From the class DrillParquetReader, the method setup:

@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
    try {
        this.operatorContext = context;
        schema = footer.getFileMetaData().getSchema();
        MessageType projection = null;
        if (isStarQuery()) {
            projection = schema;
        } else {
            columnsNotFound = new ArrayList<SchemaPath>();
            projection = getProjection(schema, getColumns(), columnsNotFound);
            if (projection == null) {
                projection = schema;
            }
            if (columnsNotFound != null && columnsNotFound.size() > 0) {
                nullFilledVectors = new ArrayList<>();
                for (SchemaPath col : columnsNotFound) {
                    nullFilledVectors.add((NullableIntVector) output.addField(MaterializedField.create(col.getAsUnescapedPath(), org.apache.drill.common.types.Types.optional(TypeProtos.MinorType.INT)), (Class<? extends ValueVector>) TypeHelper.getValueVectorClass(TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)));
                }
                if (columnsNotFound.size() == getColumns().size()) {
                    noColumnsFound = true;
                }
            }
        }
        logger.debug("Requesting schema {}", projection);
        ColumnIOFactory factory = new ColumnIOFactory(false);
        MessageColumnIO columnIO = factory.getColumnIO(projection, schema);
        Map<ColumnPath, ColumnChunkMetaData> paths = new HashMap<>();
        for (ColumnChunkMetaData md : footer.getBlocks().get(entry.getRowGroupIndex()).getColumns()) {
            paths.put(md.getPath(), md);
        }
        Path filePath = new Path(entry.getPath());
        BlockMetaData blockMetaData = footer.getBlocks().get(entry.getRowGroupIndex());
        recordCount = (int) blockMetaData.getRowCount();
        pageReadStore = new ColumnChunkIncReadStore(recordCount, CodecFactory.createDirectCodecFactory(fileSystem.getConf(), new ParquetDirectByteBufferAllocator(operatorContext.getAllocator()), 0), operatorContext.getAllocator(), fileSystem, filePath);
        for (String[] path : schema.getPaths()) {
            Type type = schema.getType(path);
            if (type.isPrimitive()) {
                ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
                pageReadStore.addColumn(schema.getColumnDescription(path), md);
            }
        }
        if (!noColumnsFound) {
            writer = new VectorContainerWriter(output);
            // Discard the columns not found in the schema when create DrillParquetRecordMaterializer, since they have been added to output already.
            final Collection<SchemaPath> columns = columnsNotFound == null || columnsNotFound.size() == 0 ? getColumns() : CollectionUtils.subtract(getColumns(), columnsNotFound);
            recordMaterializer = new DrillParquetRecordMaterializer(output, writer, projection, columns, fragmentContext.getOptions(), containsCorruptedDates);
            primitiveVectors = writer.getMapVector().getPrimitiveVectors();
            recordReader = columnIO.getRecordReader(pageReadStore, recordMaterializer);
        }
    } catch (Exception e) {
        handleAndRaise("Failure in setting up reader", e);
    }
}
Also used: ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), Path (org.apache.hadoop.fs.Path), SchemaPath (org.apache.drill.common.expression.SchemaPath), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetDirectByteBufferAllocator (org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator), VectorContainerWriter (org.apache.drill.exec.vector.complex.impl.VectorContainerWriter), HashMap (java.util.HashMap), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), MessageColumnIO (org.apache.parquet.io.MessageColumnIO), DrillRuntimeException (org.apache.drill.common.exceptions.DrillRuntimeException), OutOfMemoryException (org.apache.drill.exec.exception.OutOfMemoryException), ExecutionSetupException (org.apache.drill.common.exceptions.ExecutionSetupException), IOException (java.io.IOException), ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory), GroupType (org.apache.parquet.schema.GroupType), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type), ColumnChunkIncReadStore (org.apache.parquet.hadoop.ColumnChunkIncReadStore)
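Stripped of the Drill-specific vector setup, the row-group handling above comes down to indexing the column chunks of one BlockMetaData by ColumnPath and then resolving each schema path against that index. The sketch below keeps only that part; the class name is hypothetical, and a ParquetMetadata footer is assumed to be available already.

import java.util.HashMap;
import java.util.Map;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

public class RowGroupColumnLookup {

    // Indexes the column chunks of one row group by their column path.
    static Map<ColumnPath, ColumnChunkMetaData> indexColumns(ParquetMetadata footer, int rowGroupIndex) {
        BlockMetaData rowGroup = footer.getBlocks().get(rowGroupIndex);
        Map<ColumnPath, ColumnChunkMetaData> byPath = new HashMap<>();
        for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
            byPath.put(chunk.getPath(), chunk);
        }
        return byPath;
    }

    // Resolves every path declared in the file schema against that index, as the Drill reader
    // does before registering each primitive column with its page read store.
    static void printColumnSizes(ParquetMetadata footer, int rowGroupIndex) {
        MessageType schema = footer.getFileMetaData().getSchema();
        Map<ColumnPath, ColumnChunkMetaData> byPath = indexColumns(footer, rowGroupIndex);
        for (String[] path : schema.getPaths()) {
            ColumnChunkMetaData chunk = byPath.get(ColumnPath.get(path));
            if (chunk != null) {
                System.out.println(String.join(".", path) + " -> " + chunk.getTotalSize() + " bytes");
            }
        }
    }
}

The batch size used by the reader comes straight from BlockMetaData.getRowCount(), exactly as in the recordCount assignment above.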

Example 3 with BlockMetaData

Use of org.apache.parquet.hadoop.metadata.BlockMetaData in project drill by apache.

From the class ReadState, the method buildReader:

/**
   * Create the readers needed to read columns: fixed-length or variable length.
   *
   * @param reader the parent Parquet record reader that the column readers are attached to
   * @param output the output mutator used to create the value vectors
   * @throws Exception if a column reader or its backing vector cannot be created
   */
@SuppressWarnings("unchecked")
public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception {
    final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
    // initialize all of the column read status objects
    BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
    Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
    for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
        ColumnDescriptor column = columnMetadata.column;
        columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
        columnMetadata.buildVector(output);
        if (!columnMetadata.isFixedLength()) {
            // create a reader and add it to the appropriate list
            varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
        } else if (columnMetadata.isRepeated()) {
            varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader, schema.getRecordsPerBatch()));
        } else {
            columnReaders.add(columnMetadata.makeFixedWidthReader(reader, schema.getRecordsPerBatch()));
        }
    }
    varLengthReader = new VarLenBinaryReader(reader, varLengthColumns);
    if (!schema.isStarQuery()) {
        schema.createNonExistentColumns(output, nullFilledVectors);
    }
}
Also used: ValueVector (org.apache.drill.exec.vector.ValueVector), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ArrayList (java.util.ArrayList)
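schema.buildChunkMap and ParquetColumnMetadata are Drill internals, so the lookup keyed on Arrays.toString(column.getPath()) is easiest to see in isolation. Below is a plausible minimal equivalent built only on the Parquet metadata API; the class name is hypothetical, and that Drill's helper behaves exactly like this is an assumption.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;

public class ChunkMapSketch {

    // Maps the stringified column path to the chunk's position in the row group's column list.
    static Map<String, Integer> buildChunkMap(BlockMetaData rowGroupMetadata) {
        Map<String, Integer> positions = new HashMap<>();
        int index = 0;
        for (ColumnChunkMetaData chunk : rowGroupMetadata.getColumns()) {
            positions.put(Arrays.toString(chunk.getPath().toArray()), index);
            index++;
        }
        return positions;
    }

    // Looks up the chunk backing a given column descriptor, the way columnChunkMetaData is
    // resolved for each column in buildReader above.
    static ColumnChunkMetaData chunkFor(BlockMetaData rowGroupMetadata, ColumnDescriptor column) {
        Integer position = buildChunkMap(rowGroupMetadata).get(Arrays.toString(column.getPath()));
        return position == null ? null : rowGroupMetadata.getColumns().get(position);
    }
}

Keying on the stringified path works because ColumnDescriptor.getPath() and ColumnChunkMetaData.getPath().toArray() both yield the same String[] for the same column.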

Example 4 with BlockMetaData

Use of org.apache.parquet.hadoop.metadata.BlockMetaData in project drill by apache.

From the class Metadata, the method getParquetFileMetadata_v3:

/**
   * Get the metadata for a single file.
   *
   * @param parquetTableMetadata the table-level metadata that accumulates the column type information
   * @param file the status of the Parquet file whose footer is read
   * @return the per-file metadata, with one entry per non-empty row group
   * @throws IOException if the footer cannot be read
   */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, FileStatus file) throws IOException {
    ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
    MessageType schema = metadata.getFileMetaData().getSchema();
    //    Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
    Map<SchemaPath, ColTypeInfo> colTypeInfoMap = Maps.newHashMap();
    schema.getPaths();
    for (String[] path : schema.getPaths()) {
        colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
    }
    List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();
    ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
    ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
    boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
    if (logger.isDebugEnabled()) {
        logger.debug(containsCorruptDates.toString());
    }
    for (BlockMetaData rowGroup : metadata.getBlocks()) {
        List<ColumnMetadata_v3> columnMetadataList = Lists.newArrayList();
        long length = 0;
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            ColumnMetadata_v3 columnMetadata;
            boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
            Statistics<?> stats = col.getStatistics();
            String[] columnName = col.getPath().toArray();
            SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
            ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);
            ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName, col.getType(), colTypeInfo.originalType, colTypeInfo.precision, colTypeInfo.scale, colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);
            if (parquetTableMetadata.columnTypeInfo == null) {
                parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
            }
            // Save the column schema info. We'll merge it into one list
            parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name), columnTypeMetadata);
            if (statsAvailable) {
                // Write stats when they are not null
                Object minValue = null;
                Object maxValue = null;
                if (stats.genericGetMax() != null && stats.genericGetMin() != null) {
                    minValue = stats.genericGetMin();
                    maxValue = stats.genericGetMax();
                    if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION && columnTypeMetadata.originalType == OriginalType.DATE) {
                        minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
                        maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
                    }
                }
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), minValue, maxValue, stats.getNumNulls());
            } else {
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), null, null, null);
            }
            columnMetadataList.add(columnMetadata);
            length += col.getTotalSize();
        }
        // Note we still read the schema even if there are no values in the RowGroup
        if (rowGroup.getRowCount() == 0) {
            continue;
        }
        RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
        rowGroupMetadataList.add(rowGroupMeta);
    }
    String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
    return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ArrayList (java.util.ArrayList), SchemaPath (org.apache.drill.common.expression.SchemaPath), MessageType (org.apache.parquet.schema.MessageType)
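Setting aside Drill's versioned metadata classes, the heart of the method is a walk over the footer that collects, per row group, the starting position, the summed column chunk sizes, the row count, and any available column statistics. The standalone sketch below prints that information; the class name is hypothetical and it assumes args[0] points at an existing Parquet file.

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterSummary {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path file = new Path(args[0]);
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file);
        int rowGroupIndex = 0;
        for (BlockMetaData rowGroup : footer.getBlocks()) {
            long length = 0;
            for (ColumnChunkMetaData col : rowGroup.getColumns()) {
                // Row-group length is accumulated from the size of each column chunk.
                length += col.getTotalSize();
                Statistics<?> stats = col.getStatistics();
                if (stats != null && !stats.isEmpty()) {
                    System.out.println("  " + Arrays.toString(col.getPath().toArray())
                        + " min=" + stats.genericGetMin()
                        + " max=" + stats.genericGetMax()
                        + " nulls=" + stats.getNumNulls());
                }
            }
            System.out.println("row group " + rowGroupIndex + ": startingPos=" + rowGroup.getStartingPos()
                + " rows=" + rowGroup.getRowCount() + " length=" + length);
            rowGroupIndex++;
        }
    }
}

The date handling above is Drill-specific: when detectCorruptDates reports corruption, the DATE min/max values are adjusted before being cached, so the sketch deliberately leaves statistics untouched.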

Example 5 with BlockMetaData

Use of org.apache.parquet.hadoop.metadata.BlockMetaData in project hive by apache.

From the class ParquetRecordReaderBase, the method getSplit:

/**
   * Gets a ParquetInputSplit corresponding to a split given by Hive.
   *
   * @param oldSplit The split given by Hive
   * @param conf The JobConf of the Hive job
   * @return a ParquetInputSplit corresponding to the oldSplit
   * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
   */
@SuppressWarnings("deprecation")
protected ParquetInputSplit getSplit(final org.apache.hadoop.mapred.InputSplit oldSplit, final JobConf conf) throws IOException {
    ParquetInputSplit split;
    if (oldSplit == null) {
        return null;
    }
    if (oldSplit instanceof FileSplit) {
        final Path finalPath = ((FileSplit) oldSplit).getPath();
        jobConf = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent());
        // TODO enable MetadataFilter by using readFooter(Configuration configuration, Path file,
        // MetadataFilter filter) API
        final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
        final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
        final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        final ReadSupport.ReadContext readContext = new DataWritableReadSupport().init(new InitContext(jobConf, null, fileMetaData.getSchema()));
        // Compute stats
        for (BlockMetaData bmd : blocks) {
            serDeStats.setRowCount(serDeStats.getRowCount() + bmd.getRowCount());
            serDeStats.setRawDataSize(serDeStats.getRawDataSize() + bmd.getTotalByteSize());
        }
        schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();
        final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
        final long splitStart = ((FileSplit) oldSplit).getStart();
        final long splitLength = ((FileSplit) oldSplit).getLength();
        for (final BlockMetaData block : blocks) {
            final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
                splitGroup.add(block);
            }
        }
        if (splitGroup.isEmpty()) {
            LOG.warn("Skipping split, could not find row group in: " + oldSplit);
            return null;
        }
        FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
        if (filter != null) {
            filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
            if (filtedBlocks.isEmpty()) {
                LOG.debug("All row groups are dropped due to filter predicates");
                return null;
            }
            long droppedBlocks = splitGroup.size() - filtedBlocks.size();
            if (droppedBlocks > 0) {
                LOG.debug("Dropping " + droppedBlocks + " row groups that do not pass filter predicate");
            }
        } else {
            filtedBlocks = splitGroup;
        }
        split = new ParquetInputSplit(finalPath, splitStart, splitLength, oldSplit.getLocations(), filtedBlocks, readContext.getRequestedSchema().toString(), fileMetaData.getSchema().toString(), fileMetaData.getKeyValueMetaData(), readContext.getReadSupportMetadata());
        return split;
    } else {
        throw new IllegalArgumentException("Unknown split type: " + oldSplit);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), DataWritableReadSupport (org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), FilterCompat (org.apache.parquet.filter2.compat.FilterCompat), ArrayList (java.util.ArrayList), FileSplit (org.apache.hadoop.mapred.FileSplit), InitContext (org.apache.parquet.hadoop.api.InitContext), ReadSupport (org.apache.parquet.hadoop.api.ReadSupport), ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData)
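Two pieces of this method are generic enough to isolate: picking the row groups whose first data page falls inside the split's byte range, and accumulating row counts and byte sizes for the serde stats. The sketch below shows both as plain helpers over the Parquet metadata API; the class name is hypothetical and no Hive types are involved.

import java.util.ArrayList;
import java.util.List;
import org.apache.parquet.hadoop.metadata.BlockMetaData;

public class SplitRowGroupSelection {

    // Keeps the row groups whose first data page starts inside [splitStart, splitStart + splitLength).
    static List<BlockMetaData> rowGroupsInSplit(List<BlockMetaData> blocks, long splitStart, long splitLength) {
        List<BlockMetaData> splitGroup = new ArrayList<>();
        for (BlockMetaData block : blocks) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
                splitGroup.add(block);
            }
        }
        return splitGroup;
    }

    // Accumulates the row count and total byte size, matching how the serde stats
    // are updated in getSplit above.
    static long[] countRowsAndBytes(List<BlockMetaData> blocks) {
        long rows = 0;
        long bytes = 0;
        for (BlockMetaData block : blocks) {
            rows += block.getRowCount();
            bytes += block.getTotalByteSize();
        }
        return new long[] { rows, bytes };
    }
}

Anchoring membership on the first data page of the first column helps ensure that each row group is claimed by at most one file split.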

Aggregations

BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 7 usages
ArrayList (java.util.ArrayList): 4 usages
MessageType (org.apache.parquet.schema.MessageType): 4 usages
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 3 usages
SchemaPath (org.apache.drill.common.expression.SchemaPath): 2 usages
Path (org.apache.hadoop.fs.Path): 2 usages
FilterCompat (org.apache.parquet.filter2.compat.FilterCompat): 2 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 2 usages
IOException (java.io.IOException): 1 usage
HashMap (java.util.HashMap): 1 usage
HashSet (java.util.HashSet): 1 usage
DrillRuntimeException (org.apache.drill.common.exceptions.DrillRuntimeException): 1 usage
ExecutionSetupException (org.apache.drill.common.exceptions.ExecutionSetupException): 1 usage
OutOfMemoryException (org.apache.drill.exec.exception.OutOfMemoryException): 1 usage
ParquetDirectByteBufferAllocator (org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator): 1 usage
ValueVector (org.apache.drill.exec.vector.ValueVector): 1 usage
VectorContainerWriter (org.apache.drill.exec.vector.complex.impl.VectorContainerWriter): 1 usage
Configuration (org.apache.hadoop.conf.Configuration): 1 usage
DataWritableReadSupport (org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport): 1 usage
FileSplit (org.apache.hadoop.mapred.FileSplit): 1 usage