
Example 1 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project hive by apache.

the class VectorizedParquetRecordReader method initialize.

public void initialize(ParquetInputSplit split, JobConf configuration) throws IOException, InterruptedException {
    jobConf = configuration;
    ParquetMetadata footer;
    List<BlockMetaData> blocks;
    boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
    this.file = split.getPath();
    long[] rowGroupOffsets = split.getRowGroupOffsets();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    // if task.side.metadata is set, rowGroupOffsets is null
    if (rowGroupOffsets == null) {
        // TODO: check whether rowGroupOffsets can be null
        // then we need to apply the predicate push down filter
        footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
        MessageType fileSchema = footer.getFileMetaData().getSchema();
        FilterCompat.Filter filter = getFilter(configuration);
        blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    } else {
        // otherwise we find the row groups that were selected on the client
        footer = readFooter(configuration, file, NO_FILTER);
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        blocks = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            if (offsets.contains(block.getStartingPos())) {
                blocks.add(block);
            }
        }
        // verify we found them all
        if (blocks.size() != rowGroupOffsets.length) {
            long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
            for (int i = 0; i < foundRowGroupOffsets.length; i++) {
                foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
            }
            // provide a good error message in case there's a bug
            throw new IllegalStateException("All the offsets listed in the split should be found in the file." + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks + " out of: " + Arrays.toString(foundRowGroupOffsets) + " in range " + split.getStart() + ", " + split.getEnd());
        }
    }
    for (BlockMetaData block : blocks) {
        this.totalRowCount += block.getRowCount();
    }
    this.fileSchema = footer.getFileMetaData().getSchema();
    MessageType tableSchema;
    if (indexAccess) {
        List<Integer> indexSequence = new ArrayList<>();
        // Generates a sequence list of indexes
        for (int i = 0; i < columnNamesList.size(); i++) {
            indexSequence.add(i);
        }
        tableSchema = DataWritableReadSupport.getSchemaByIndex(fileSchema, columnNamesList, indexSequence);
    } else {
        tableSchema = DataWritableReadSupport.getSchemaByName(fileSchema, columnNamesList, columnTypesList);
    }
    indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
    if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
        requestedSchema = DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted);
    } else {
        requestedSchema = fileSchema;
    }
    this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FilterCompat(org.apache.parquet.filter2.compat.FilterCompat) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ArrayList(java.util.ArrayList) MessageType(org.apache.parquet.schema.MessageType) HashSet(java.util.HashSet)
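
The core ParquetMetadata pattern above is reading a footer restricted to a byte range and then locating row groups by their starting offset. A minimal standalone sketch of that pattern, assuming only a Hadoop Configuration and a Path to an existing Parquet file (class and method names here are illustrative, not taken from the Hive class), might look like:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterRangeSketch {
    /** Returns the row groups whose starting offsets fall inside [start, end). */
    static List<BlockMetaData> rowGroupsInRange(Configuration conf, Path file,
                                                long start, long end) throws IOException {
        // Read only the footer entries relevant to the given byte range.
        ParquetMetadata footer =
            ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.range(start, end));
        List<BlockMetaData> selected = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            // getStartingPos() is the same offset the split carries in rowGroupOffsets above.
            if (block.getStartingPos() >= start && block.getStartingPos() < end) {
                selected.add(block);
            }
        }
        return selected;
    }
}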

Example 2 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project hive by apache.

the class ParquetRecordReaderBase method setTimeZoneConversion.

/**
   * Sets the TimeZone conversion for Parquet timestamp columns.
   *
   * @param configuration Configuration object where to get and set the TimeZone conversion
   * @param finalPath     path to the parquet file
   */
protected void setTimeZoneConversion(Configuration configuration, Path finalPath) {
    ParquetMetadata parquetMetadata;
    String timeZoneID;
    try {
        parquetMetadata = ParquetFileReader.readFooter(configuration, finalPath, ParquetMetadataConverter.NO_FILTER);
    } catch (IOException e) {
        // If an error occurred while reading the file, then we just skip the TimeZone setting.
        // This error will most likely surface again elsewhere in the code.
        LOG.debug("Could not read parquet file footer at " + finalPath + ". Cannot determine " + "parquet file timezone", e);
        return;
    }
    boolean skipConversion = HiveConf.getBoolVar(configuration, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION);
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr") || skipConversion) {
        // Impala writes timestamp values using GMT only. We should not try to convert Impala
        // files to other types of timezones.
        timeZoneID = ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE;
    } else {
        // TABLE_PARQUET_INT96_TIMEZONE is a table property used to detect what timezone conversion
        // to use when reading Parquet timestamps.
        timeZoneID = configuration.get(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE);
        if (!Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZoneID)) {
            throw new IllegalStateException("Unexpected timezone id found for parquet int96 conversion: " + timeZoneID);
        }
    }
    // 'timeZoneID' should be valid, since we did not throw exception above
    configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getTimeZone(timeZoneID).getID());
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) IOException(java.io.IOException) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData)
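
The key ParquetMetadata call in this example is getFileMetaData(): the created-by string distinguishes parquet-mr output from files written by other engines such as Impala. A minimal sketch of just that check, with the class and method names chosen here purely for illustration:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class WriterCheckSketch {
    /** Returns true when the footer reports the file was written by parquet-mr. */
    static boolean writtenByParquetMr(Configuration conf, Path file) throws IOException {
        ParquetMetadata footer =
            ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
        FileMetaData fileMetaData = footer.getFileMetaData();
        String createdBy = fileMetaData.getCreatedBy();
        // createdBy can be null for files from writers that do not set it.
        return createdBy != null && createdBy.startsWith("parquet-mr");
    }
}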

Example 3 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

the class HiveDrillNativeScanBatchCreator method getBatch.

@Override
public ScanBatch getBatch(FragmentContext context, HiveDrillNativeParquetSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
    final HiveTableWithColumnCache table = config.getTable();
    final List<InputSplit> splits = config.getInputSplits();
    final List<HivePartition> partitions = config.getPartitions();
    final List<SchemaPath> columns = config.getColumns();
    final String partitionDesignator = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val;
    List<Map<String, String>> implicitColumns = Lists.newLinkedList();
    boolean selectAllQuery = AbstractRecordReader.isStarQuery(columns);
    final boolean hasPartitions = (partitions != null && partitions.size() > 0);
    final List<String[]> partitionColumns = Lists.newArrayList();
    final List<Integer> selectedPartitionColumns = Lists.newArrayList();
    List<SchemaPath> newColumns = columns;
    if (!selectAllQuery) {
        // Separate out the partition and non-partition columns. Non-partition columns are passed directly to the
        // ParquetRecordReader. Partition columns are passed to ScanBatch.
        newColumns = Lists.newArrayList();
        Pattern pattern = Pattern.compile(String.format("%s[0-9]+", partitionDesignator));
        for (SchemaPath column : columns) {
            Matcher m = pattern.matcher(column.getAsUnescapedPath());
            if (m.matches()) {
                selectedPartitionColumns.add(Integer.parseInt(column.getAsUnescapedPath().substring(partitionDesignator.length())));
            } else {
                newColumns.add(column);
            }
        }
    }
    final OperatorContext oContext = context.newOperatorContext(config);
    int currentPartitionIndex = 0;
    final List<RecordReader> readers = Lists.newArrayList();
    final HiveConf conf = config.getHiveConf();
    // TODO: In future we can get this cache from Metadata cached on filesystem.
    final Map<String, ParquetMetadata> footerCache = Maps.newHashMap();
    Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
    try {
        for (InputSplit split : splits) {
            final FileSplit fileSplit = (FileSplit) split;
            final Path finalPath = fileSplit.getPath();
            final JobConf cloneJob = new ProjectionPusher().pushProjectionsAndFilters(new JobConf(conf), finalPath.getParent());
            final FileSystem fs = finalPath.getFileSystem(cloneJob);
            ParquetMetadata parquetMetadata = footerCache.get(finalPath.toString());
            if (parquetMetadata == null) {
                parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath);
                footerCache.put(finalPath.toString(), parquetMetadata);
            }
            final List<Integer> rowGroupNums = getRowGroupNumbersFromFileSplit(fileSplit, parquetMetadata);
            for (int rowGroupNum : rowGroupNums) {
                // DRILL-5009: Skip the row group if the row count is zero
                if (parquetMetadata.getBlocks().get(rowGroupNum).getRowCount() == 0) {
                    continue;
                }
                // Drill has only ever written a single row group per file, only detect corruption
                // in the first row group
                ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(parquetMetadata, config.getColumns(), true);
                if (logger.isDebugEnabled()) {
                    logger.debug(containsCorruptDates.toString());
                }
                readers.add(new ParquetRecordReader(context, Path.getPathWithoutSchemeAndAuthority(finalPath).toString(), rowGroupNum, fs, CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0), parquetMetadata, newColumns, containsCorruptDates));
                Map<String, String> implicitValues = Maps.newLinkedHashMap();
                if (hasPartitions) {
                    List<String> values = partitions.get(currentPartitionIndex).getValues();
                    for (int i = 0; i < values.size(); i++) {
                        if (selectAllQuery || selectedPartitionColumns.contains(i)) {
                            implicitValues.put(partitionDesignator + i, values.get(i));
                        }
                    }
                }
                implicitColumns.add(implicitValues);
                if (implicitValues.size() > mapWithMaxColumns.size()) {
                    mapWithMaxColumns = implicitValues;
                }
            }
            currentPartitionIndex++;
        }
    } catch (final IOException | RuntimeException e) {
        AutoCloseables.close(e, readers);
        throw new ExecutionSetupException("Failed to create RecordReaders. " + e.getMessage(), e);
    }
    // all readers should have the same number of implicit columns, add missing ones with value null
    mapWithMaxColumns = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
        map.putAll(Maps.difference(map, mapWithMaxColumns).entriesOnlyOnRight());
    }
    // create an empty RecordReader to output the schema
    if (readers.size() == 0) {
        readers.add(new HiveDefaultReader(table, null, null, columns, context, conf, ImpersonationUtil.createProxyUgi(config.getUserName(), context.getQueryUserName())));
    }
    return new ScanBatch(config, context, oContext, readers.iterator(), implicitColumns);
}
Also used : ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) Matcher(java.util.regex.Matcher) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ProjectionPusher(org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) RecordReader(org.apache.drill.exec.store.RecordReader) AbstractRecordReader(org.apache.drill.exec.store.AbstractRecordReader) FileSplit(org.apache.hadoop.mapred.FileSplit) SchemaPath(org.apache.drill.common.expression.SchemaPath) OperatorContext(org.apache.drill.exec.ops.OperatorContext) FileSystem(org.apache.hadoop.fs.FileSystem) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) HiveConf(org.apache.hadoop.hive.conf.HiveConf) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) Pattern(java.util.regex.Pattern) ParquetDirectByteBufferAllocator(org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator) IOException(java.io.IOException) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) Map(java.util.Map)
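
Two ParquetMetadata details carry most of the weight here: footers are cached per path, since each readFooter call means reopening the file, and row groups with a zero row count are skipped. A stripped-down sketch of that caching logic, assuming a plain Configuration rather than Drill's JobConf and ProjectionPusher plumbing (names are illustrative):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterCacheSketch {
    private final Map<String, ParquetMetadata> footerCache = new HashMap<>();

    /** Reads a footer at most once per path and counts its non-empty row groups. */
    int countNonEmptyRowGroups(Configuration conf, Path path) throws IOException {
        ParquetMetadata footer = footerCache.get(path.toString());
        if (footer == null) {
            footer = ParquetFileReader.readFooter(conf, path);
            footerCache.put(path.toString(), footer);
        }
        int nonEmpty = 0;
        for (BlockMetaData block : footer.getBlocks()) {
            // Mirrors the DRILL-5009 guard above: empty row groups are not worth a reader.
            if (block.getRowCount() > 0) {
                nonEmpty++;
            }
        }
        return nonEmpty;
    }
}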

Example 4 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project h2o-3 by h2oai.

the class ParquetParser method parseChunk.

@Override
protected final ParseWriter parseChunk(int cidx, ParseReader din, ParseWriter dout) {
    if (!(din instanceof FVecParseReader)) {
        // TODO: Should we modify the interface to expose the underlying chunk for non-streaming parsers?
        throw new IllegalStateException("We only accept parser readers backed by a Vec (no streaming support!).");
    }
    Chunk chunk = ((FVecParseReader) din).getChunk();
    Vec vec = chunk.vec();
    // extract metadata, we want to read only the row groups that have centers in this chunk
    ParquetMetadataConverter.MetadataFilter chunkFilter = ParquetMetadataConverter.range(chunk.start(), chunk.start() + chunk.len());
    ParquetMetadata metadata = VecParquetReader.readFooter(_metadata, chunkFilter);
    if (metadata.getBlocks().isEmpty()) {
        Log.trace("Chunk #", cidx, " doesn't contain any Parquet block center.");
        return dout;
    }
    Log.info("Processing ", metadata.getBlocks().size(), " blocks of chunk #", cidx);
    VecParquetReader reader = new VecParquetReader(vec, metadata, dout, _setup.getColumnTypes());
    try {
        Integer recordNumber;
        do {
            recordNumber = reader.read();
        } while (recordNumber != null);
    } catch (IOException e) {
        throw new RuntimeException("Failed to parse records", e);
    }
    return dout;
}
Also used : ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter) ByteVec(water.fvec.ByteVec) Vec(water.fvec.Vec) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) VecParquetReader(org.apache.parquet.hadoop.VecParquetReader) IOException(java.io.IOException) Chunk(water.fvec.Chunk)
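
The trick in this parser is that ParquetMetadataConverter.range turns a chunk's byte range into a MetadataFilter, so the returned footer only lists row groups whose data falls in that range; an empty getBlocks() means the chunk has nothing to parse. A minimal sketch of that check, using the standard ParquetFileReader.readFooter call on a Configuration and Path instead of the h2o-internal VecParquetReader (names are illustrative):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ChunkRangeSketch {
    /** True when at least one row group of the file lies inside [start, start + len). */
    static boolean rangeHasRowGroups(Configuration conf, Path file,
                                     long start, long len) throws IOException {
        ParquetMetadataConverter.MetadataFilter filter =
            ParquetMetadataConverter.range(start, start + len);
        // The filter prunes footer blocks outside the byte range before they are returned.
        ParquetMetadata metadata = ParquetFileReader.readFooter(conf, file, filter);
        return !metadata.getBlocks().isEmpty();
    }
}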

Example 5 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

the class Metadata method getParquetFileMetadata_v3.

/**
   * Gets the Parquet metadata for a single file and converts it into Drill's v3 file metadata,
   * accumulating column type information in the table-level metadata along the way.
   *
   * @param parquetTableMetadata table-level metadata that collects column type info
   * @param file                 status of the Parquet file whose footer is read
   * @return the per-file metadata (path, length and row group metadata)
   * @throws IOException if the footer cannot be read
   */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, FileStatus file) throws IOException {
    ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
    MessageType schema = metadata.getFileMetaData().getSchema();
    //    Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
    Map<SchemaPath, ColTypeInfo> colTypeInfoMap = Maps.newHashMap();
    for (String[] path : schema.getPaths()) {
        colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
    }
    List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();
    ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
    ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
    boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
    if (logger.isDebugEnabled()) {
        logger.debug(containsCorruptDates.toString());
    }
    for (BlockMetaData rowGroup : metadata.getBlocks()) {
        List<ColumnMetadata_v3> columnMetadataList = Lists.newArrayList();
        long length = 0;
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            ColumnMetadata_v3 columnMetadata;
            boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
            Statistics<?> stats = col.getStatistics();
            String[] columnName = col.getPath().toArray();
            SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
            ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);
            ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName, col.getType(), colTypeInfo.originalType, colTypeInfo.precision, colTypeInfo.scale, colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);
            if (parquetTableMetadata.columnTypeInfo == null) {
                parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
            }
            // Save the column schema info. We'll merge it into one list
            parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name), columnTypeMetadata);
            if (statsAvailable) {
                // Write stats when they are not null
                Object minValue = null;
                Object maxValue = null;
                if (stats.genericGetMax() != null && stats.genericGetMin() != null) {
                    minValue = stats.genericGetMin();
                    maxValue = stats.genericGetMax();
                    if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION && columnTypeMetadata.originalType == OriginalType.DATE) {
                        minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
                        maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
                    }
                }
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), minValue, maxValue, stats.getNumNulls());
            } else {
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), null, null, null);
            }
            columnMetadataList.add(columnMetadata);
            length += col.getTotalSize();
        }
        // Note we still read the schema even if there are no values in the RowGroup
        if (rowGroup.getRowCount() == 0) {
            continue;
        }
        RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
        rowGroupMetadataList.add(rowGroupMeta);
    }
    String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
    return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ArrayList(java.util.ArrayList) SchemaPath(org.apache.drill.common.expression.SchemaPath) MessageType(org.apache.parquet.schema.MessageType)
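
Most of this method is navigation through the footer object graph: ParquetMetadata to BlockMetaData to ColumnChunkMetaData to Statistics. A compact sketch of just that traversal, printing per-column min, max and null counts (class name, method name and output format are illustrative):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ColumnStatsSketch {
    /** Prints min, max and null count for every column chunk in every row group. */
    static void printColumnStats(Configuration conf, Path file) throws IOException {
        ParquetMetadata metadata = ParquetFileReader.readFooter(conf, file);
        for (BlockMetaData rowGroup : metadata.getBlocks()) {
            for (ColumnChunkMetaData col : rowGroup.getColumns()) {
                Statistics<?> stats = col.getStatistics();
                if (stats == null || stats.isEmpty()) {
                    continue; // some writers omit statistics entirely
                }
                System.out.println(String.join(".", col.getPath().toArray())
                    + " min=" + stats.genericGetMin()
                    + " max=" + stats.genericGetMax()
                    + " nulls=" + stats.getNumNulls());
            }
        }
    }
}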

Aggregations

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 76
Path (org.apache.hadoop.fs.Path): 39
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 27
Configuration (org.apache.hadoop.conf.Configuration): 21
MessageType (org.apache.parquet.schema.MessageType): 21
ArrayList (java.util.ArrayList): 19
IOException (java.io.IOException): 18
Test (org.junit.Test): 17
FileSystem (org.apache.hadoop.fs.FileSystem): 16
Map (java.util.Map): 11
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 11
File (java.io.File): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 9
HashMap (java.util.HashMap): 8
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7
List (java.util.List): 6
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 6
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 6