Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.
Class TestParquetFilterPushDown, method getParquetMetaData:
private ParquetMetadata getParquetMetaData(String filePathStr) throws IOException {
  Configuration fsConf = new Configuration();
  ParquetMetadata footer = ParquetFileReader.readFooter(fsConf, new Path(filePathStr));
  return footer;
}
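This Drill test helper is a thin wrapper around ParquetFileReader.readFooter. Below is a minimal standalone sketch of the same pattern that also walks the row-group metadata in the footer; it assumes a path resolvable through a default Hadoop Configuration, and FooterInspector/inspect are illustrative names, not part of either project. (In newer parquet-mr releases readFooter is deprecated in favor of opening the file and calling getFooter(), but the older call matches the snippets shown here.)

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterInspector {

  // Read the footer of a Parquet file and print its schema and per-row-group row counts.
  public static void inspect(String filePathStr) throws IOException {
    Configuration conf = new Configuration();
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(filePathStr));

    // The file schema lives in the FileMetaData part of the footer.
    System.out.println(footer.getFileMetaData().getSchema());

    // Each BlockMetaData describes one row group: its starting offset and row count.
    for (BlockMetaData block : footer.getBlocks()) {
      System.out.println("row group @" + block.getStartingPos() + " rows=" + block.getRowCount());
    }
  }
}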
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project hive by apache.
Class VectorizedParquetRecordReader, method initialize:
@SuppressWarnings("deprecation")
public void initialize(InputSplit oldSplit, JobConf configuration) throws IOException, InterruptedException {
  // the oldSplit may be null during the split phase
  if (oldSplit == null) {
    return;
  }
  ParquetMetadata footer;
  List<BlockMetaData> blocks;
  ParquetInputSplit split = (ParquetInputSplit) oldSplit;
  boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
  this.file = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
  String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
  columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
  // if task.side.metadata is set, rowGroupOffsets is null
  Object cacheKey = null;
  String cacheTag = null;
  // TODO: also support fileKey in splits, like OrcSplit does
  if (metadataCache != null) {
    cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file,
        HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID),
        HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID));
  }
  if (cacheKey != null) {
    if (HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_TRACK_CACHE_USAGE)) {
      cacheTag = LlapUtil.getDbAndTableNameForMetrics(file, true);
    }
    // If we are going to use cache, change the path to depend on file ID for extra consistency.
    FileSystem fs = file.getFileSystem(configuration);
    if (cacheKey instanceof Long && HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
      file = HdfsUtils.getFileIdPath(fs, file, (long) cacheKey);
    }
  }
  if (rowGroupOffsets == null) {
    // TODO check whether rowGroupOffsets can be null
    // then we need to apply the predicate push down filter
    footer = readSplitFooter(configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(configuration);
    blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    // otherwise we find the row groups that were selected on the client
    footer = readSplitFooter(configuration, file, cacheKey, NO_FILTER, cacheTag);
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    blocks = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        blocks.add(block);
      }
    }
    // verify we found them all
    if (blocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      // provide a good error message in case there's a bug
      throw new IllegalStateException("All the offsets listed in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks
          + " out of: " + Arrays.toString(foundRowGroupOffsets)
          + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  for (BlockMetaData block : blocks) {
    this.totalRowCount += block.getRowCount();
  }
  this.fileSchema = footer.getFileMetaData().getSchema();
  colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
  requestedSchema = DataWritableReadSupport.getRequestedSchema(indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
  Path path = wrapPathForCache(file, cacheKey, configuration, blocks, cacheTag);
  this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
}
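Two footer-driven decisions stand out above: when the split carries no row-group offsets, the reader filters row groups with the pushed-down predicate; when offsets are present, it keeps only the blocks whose starting positions match. A minimal sketch of that second, offset-based selection in isolation follows; selectByOffsets and RowGroupSelection are illustrative names, and only the standard parquet-hadoop metadata classes are assumed.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

class RowGroupSelection {

  // Keep only the row groups whose starting offsets were listed in the split.
  static List<BlockMetaData> selectByOffsets(ParquetMetadata footer, long[] rowGroupOffsets) {
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    List<BlockMetaData> selected = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        selected.add(block);
      }
    }
    return selected;
  }
}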
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto.
Class HiveDrillNativeScanBatchCreator, method getBatch:
@Override
public ScanBatch getBatch(ExecutorFragmentContext context, HiveDrillNativeParquetSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
  final HiveTableWithColumnCache table = config.getTable();
  final List<List<InputSplit>> splits = config.getInputSplits();
  final List<HivePartition> partitions = config.getPartitions();
  final List<SchemaPath> columns = config.getColumns();
  final String partitionDesignator = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val;
  List<Map<String, String>> implicitColumns = Lists.newLinkedList();
  boolean selectAllQuery = Utilities.isStarQuery(columns);
  final boolean hasPartitions = (partitions != null && partitions.size() > 0);
  final List<String[]> partitionColumns = Lists.newArrayList();
  final List<Integer> selectedPartitionColumns = Lists.newArrayList();
  List<SchemaPath> tableColumns = columns;
  if (!selectAllQuery) {
    // Separate out the partition and non-partition columns. Non-partition columns are passed directly to the
    // ParquetRecordReader. Partition columns are passed to ScanBatch.
    tableColumns = Lists.newArrayList();
    Pattern pattern = Pattern.compile(String.format("%s[0-9]+", partitionDesignator));
    for (SchemaPath column : columns) {
      Matcher m = pattern.matcher(column.getRootSegmentPath());
      if (m.matches()) {
        selectedPartitionColumns.add(Integer.parseInt(column.getRootSegmentPath().substring(partitionDesignator.length())));
      } else {
        tableColumns.add(column);
      }
    }
  }
  final OperatorContext oContext = context.newOperatorContext(config);
  int currentPartitionIndex = 0;
  final List<RecordReader> readers = new LinkedList<>();
  final HiveConf conf = config.getHiveConf();
  // TODO: In future we can get this cache from Metadata cached on filesystem.
  final Map<String, ParquetMetadata> footerCache = Maps.newHashMap();
  Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
  try {
    for (List<InputSplit> splitGroups : splits) {
      for (InputSplit split : splitGroups) {
        final FileSplit fileSplit = (FileSplit) split;
        final Path finalPath = fileSplit.getPath();
        final JobConf cloneJob = new ProjectionPusher().pushProjectionsAndFilters(new JobConf(conf), finalPath.getParent());
        final FileSystem fs = finalPath.getFileSystem(cloneJob);
        ParquetMetadata parquetMetadata = footerCache.get(finalPath.toString());
        if (parquetMetadata == null) {
          parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath);
          footerCache.put(finalPath.toString(), parquetMetadata);
        }
        final List<Integer> rowGroupNums = getRowGroupNumbersFromFileSplit(fileSplit, parquetMetadata);
        for (int rowGroupNum : rowGroupNums) {
          // DRILL-5009 : Skip the row group if the row count is zero
          if (parquetMetadata.getBlocks().get(rowGroupNum).getRowCount() == 0) {
            continue;
          }
          // Drill has only ever written a single row group per file, only detect corruption
          // in the first row group
          ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
              ParquetReaderUtility.detectCorruptDates(parquetMetadata, config.getColumns(), true);
          if (logger.isDebugEnabled()) {
            logger.debug(containsCorruptDates.toString());
          }
          readers.add(new ParquetRecordReader(context,
              Path.getPathWithoutSchemeAndAuthority(finalPath).toString(),
              rowGroupNum, fs,
              CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0),
              parquetMetadata, tableColumns, containsCorruptDates));
          Map<String, String> implicitValues = Maps.newLinkedHashMap();
          if (hasPartitions) {
            List<String> values = partitions.get(currentPartitionIndex).getValues();
            for (int i = 0; i < values.size(); i++) {
              if (selectAllQuery || selectedPartitionColumns.contains(i)) {
                implicitValues.put(partitionDesignator + i, values.get(i));
              }
            }
          }
          implicitColumns.add(implicitValues);
          if (implicitValues.size() > mapWithMaxColumns.size()) {
            mapWithMaxColumns = implicitValues;
          }
        }
        currentPartitionIndex++;
      }
    }
  } catch (final IOException | RuntimeException e) {
    AutoCloseables.close(e, readers);
    throw new ExecutionSetupException("Failed to create RecordReaders. " + e.getMessage(), e);
  }
  // all readers should have the same number of implicit columns, add missing ones with value null
  mapWithMaxColumns = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
  for (Map<String, String> map : implicitColumns) {
    map.putAll(Maps.difference(map, mapWithMaxColumns).entriesOnlyOnRight());
  }
  // create an empty RecordReader to output the schema
  if (readers.size() == 0) {
    readers.add(new HiveDefaultReader(table, null, null, tableColumns, context, conf,
        ImpersonationUtil.createProxyUgi(config.getUserName(), context.getQueryUserName())));
  }
  return new ScanBatch(context, oContext, readers, implicitColumns);
}
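The footerCache map above implements a simple read-once policy: each file's footer is parsed a single time even when many splits point into the same file. A minimal sketch of that caching pattern on its own follows, assuming only the Hadoop and parquet-hadoop APIs already used in the snippet; FooterCache and footerFor are illustrative names.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

class FooterCache {

  private final Map<String, ParquetMetadata> footers = new HashMap<>();

  // Return the cached footer for the path, reading it from the file only on the first request.
  ParquetMetadata footerFor(Configuration conf, Path path) throws IOException {
    ParquetMetadata footer = footers.get(path.toString());
    if (footer == null) {
      footer = ParquetFileReader.readFooter(conf, path);
      footers.put(path.toString(), footer);
    }
    return footer;
  }
}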
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto.
Class FooterGatherer, method readFooter:
/**
 * An updated footer reader that tries to read the entire footer without knowing its length up front.
 * This should reduce the number of seek/read round trips in most workloads.
 * @param config the Hadoop configuration used to resolve the file system
 * @param status the {@link FileStatus} of the Parquet file whose footer should be read
 * @return the {@link Footer} (path plus parsed {@link ParquetMetadata}) of the file
 * @throws IOException if the file cannot be read or is not a valid Parquet file
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try (FSDataInputStream file = fs.open(status.getPath())) {
    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());
    int len = (int) Math.min(fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);
    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);
    if (size > footerBytes.length - FOOTER_METADATA_SIZE) {
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;
      footerBytes = new byte[size];
      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    } else {
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(new ByteArrayInputStream(footerBytes));
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
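The FOOTER_METADATA_SIZE arithmetic above relies on the Parquet file layout: a file ends with a 4-byte little-endian footer length followed by the 4-byte magic PAR1, so the last 8 bytes tell the reader how far back the serialized footer starts. A minimal sketch that reads just that trailer follows; it assumes an already-open FSDataInputStream, and FooterTrailer/readFooterLength are illustrative names.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.parquet.bytes.BytesUtils;

class FooterTrailer {

  // Read the 8-byte trailer: a 4-byte little-endian footer length, then the magic "PAR1".
  static int readFooterLength(FSDataInputStream in, long fileLength) throws IOException {
    byte[] trailer = new byte[8];
    in.readFully(fileLength - 8, trailer, 0, 8);
    // Verify the magic bytes before trusting the length field.
    if (trailer[4] != 'P' || trailer[5] != 'A' || trailer[6] != 'R' || trailer[7] != '1') {
      throw new IOException("Not a Parquet file: bad magic bytes at end of file");
    }
    return BytesUtils.readIntLittleEndian(trailer, 0);
  }
}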
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto.
Class ParquetScanBatchCreator, method getBatch:
@SuppressWarnings("resource")
@Override
public ScanBatch getBatch(ExecutorFragmentContext context, ParquetRowGroupScan rowGroupScan, List<RecordBatch> children) throws ExecutionSetupException {
  Preconditions.checkArgument(children.isEmpty());
  OperatorContext oContext = context.newOperatorContext(rowGroupScan);
  final ColumnExplorer columnExplorer = new ColumnExplorer(context.getOptions(), rowGroupScan.getColumns());
  if (!columnExplorer.isStarQuery()) {
    rowGroupScan = new ParquetRowGroupScan(rowGroupScan.getUserName(), rowGroupScan.getStorageEngine(),
        rowGroupScan.getRowGroupReadEntries(), columnExplorer.getTableColumns(),
        rowGroupScan.getSelectionRoot(), rowGroupScan.getFilter());
    rowGroupScan.setOperatorId(rowGroupScan.getOperatorId());
  }
  DrillFileSystem fs;
  try {
    boolean useAsyncPageReader = context.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_ASYNC).bool_val;
    if (useAsyncPageReader) {
      fs = oContext.newNonTrackingFileSystem(rowGroupScan.getStorageEngine().getFsConf());
    } else {
      fs = oContext.newFileSystem(rowGroupScan.getStorageEngine().getFsConf());
    }
  } catch (IOException e) {
    throw new ExecutionSetupException(String.format("Failed to create DrillFileSystem: %s", e.getMessage()), e);
  }
  Configuration conf = new Configuration(fs.getConf());
  conf.setBoolean(ENABLE_BYTES_READ_COUNTER, false);
  conf.setBoolean(ENABLE_BYTES_TOTAL_COUNTER, false);
  conf.setBoolean(ENABLE_TIME_READ_COUNTER, false);
  // keep footers in a map to avoid re-reading them
  Map<String, ParquetMetadata> footers = Maps.newHashMap();
  List<RecordReader> readers = new LinkedList<>();
  List<Map<String, String>> implicitColumns = Lists.newArrayList();
  Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
  for (RowGroupReadEntry e : rowGroupScan.getRowGroupReadEntries()) {
    /*
     * Here we could store a map from file names to footers, to prevent re-reading the footer for each
     * row group in a file.
     * TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the
     * ParquetStorageEngine), we should add more information to the RowGroupInfo that will be populated upon
     * the first read to provide the reader with all of the file meta-data it needs.
     * These fields will be added to the constructor below.
     */
    try {
      Stopwatch timer = Stopwatch.createUnstarted();
      if (!footers.containsKey(e.getPath())) {
        timer.start();
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(e.getPath()));
        long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
        logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", e.getPath(), "", 0, 0, 0, timeToRead);
        footers.put(e.getPath(), footer);
      }
      boolean autoCorrectCorruptDates = rowGroupScan.getFormatConfig().areCorruptDatesAutoCorrected();
      ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
          ParquetReaderUtility.detectCorruptDates(footers.get(e.getPath()), rowGroupScan.getColumns(), autoCorrectCorruptDates);
      if (logger.isDebugEnabled()) {
        logger.debug(containsCorruptDates.toString());
      }
      if (!context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER) && !isComplex(footers.get(e.getPath()))) {
        readers.add(new ParquetRecordReader(context, e.getPath(), e.getRowGroupIndex(), e.getNumRecordsToRead(), fs,
            CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0),
            footers.get(e.getPath()), rowGroupScan.getColumns(), containsCorruptDates));
      } else {
        ParquetMetadata footer = footers.get(e.getPath());
        readers.add(new DrillParquetReader(context, footer, e, columnExplorer.getTableColumns(), fs, containsCorruptDates));
      }
      Map<String, String> implicitValues = columnExplorer.populateImplicitColumns(e, rowGroupScan.getSelectionRoot());
      implicitColumns.add(implicitValues);
      if (implicitValues.size() > mapWithMaxColumns.size()) {
        mapWithMaxColumns = implicitValues;
      }
    } catch (IOException e1) {
      throw new ExecutionSetupException(e1);
    }
  }
  // all readers should have the same number of implicit columns, add missing ones with value null
  Map<String, String> diff = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
  for (Map<String, String> map : implicitColumns) {
    map.putAll(Maps.difference(map, diff).entriesOnlyOnRight());
  }
  return new ScanBatch(context, oContext, readers, implicitColumns);
}
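The isComplex(...) call above is a Drill helper whose body is not shown here; its effect is to route footers with nested schemas to DrillParquetReader instead of ParquetRecordReader. Purely as an illustration of how such a check could be expressed against the footer's schema, here is a hypothetical helper (hasNestedColumns is not Drill's actual implementation) built only on the Parquet schema API:

import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

class SchemaShape {

  // Hypothetical check, not Drill's isComplex(): treat a schema as "nested" if any
  // top-level field is a group type rather than a primitive column.
  static boolean hasNestedColumns(ParquetMetadata footer) {
    MessageType schema = footer.getFileMetaData().getSchema();
    for (Type field : schema.getFields()) {
      if (!field.isPrimitive()) {
        return true;
      }
    }
    return false;
  }
}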