Example 1 with ParquetInputSplit

Use of org.apache.parquet.hadoop.ParquetInputSplit in project parquet-mr by apache.

From the class ParquetRecordReaderWrapper, method getSplit.

/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(final InputSplit oldSplit, final JobConf conf) throws IOException {
    if (oldSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) oldSplit;
        final long splitStart = fileSplit.getStart();
        final long splitLength = fileSplit.getLength();
        final Path finalPath = fileSplit.getPath();
        final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());
        final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
        final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        final ReadContext readContext = new DataWritableReadSupport().init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
        schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount();
        return new ParquetInputSplit(finalPath, splitStart, splitStart + splitLength, splitLength, fileSplit.getLocations(), null);
    } else {
        throw new IllegalArgumentException("Unknown split type: " + oldSplit);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ReadContext(org.apache.parquet.hadoop.api.ReadSupport.ReadContext) ParquetInputSplit(org.apache.parquet.hadoop.ParquetInputSplit) FileSplit(org.apache.hadoop.mapred.FileSplit) JobConf(org.apache.hadoop.mapred.JobConf) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData)
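For orientation, the deprecated ParquetInputSplit constructor called above takes the split's end offset (start + length) rather than its length as the third argument, and a null rowGroupOffsets array, which defers row-group resolution to the task side. A minimal sketch of just that mapping, with an illustrative class and method name:

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.parquet.hadoop.ParquetInputSplit;

public class FileSplitToParquetSplit {

    // Hypothetical helper: converts a mapred FileSplit into the deprecated
    // ParquetInputSplit form used in the example above.
    @SuppressWarnings("deprecation")
    static ParquetInputSplit toParquetSplit(FileSplit fileSplit) throws IOException {
        final Path path = fileSplit.getPath();
        final long start = fileSplit.getStart();
        final long length = fileSplit.getLength();
        // Arguments: file, start, end (= start + length), length, hosts,
        // rowGroupOffsets (null means the row groups are resolved later from the footer).
        return new ParquetInputSplit(path, start, start + length, length,
            fileSplit.getLocations(), null);
    }
}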

Example 2 with ParquetInputSplit

Use of org.apache.parquet.hadoop.ParquetInputSplit in project hive by apache.

From the class VectorizedParquetRecordReader, method initialize.

@SuppressWarnings("deprecation")
public void initialize(InputSplit oldSplit, JobConf configuration) throws IOException, InterruptedException, HiveException {
    // the oldSplit may be null during the split phase
    if (oldSplit == null) {
        return;
    }
    ParquetMetadata footer;
    List<BlockMetaData> blocks;
    MapWork mapWork = LlapHiveUtils.findMapWork(jobConf);
    if (mapWork != null) {
        parts = mapWork.getPathToPartitionInfo();
    }
    ParquetInputSplit split = (ParquetInputSplit) oldSplit;
    boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
    this.file = split.getPath();
    long[] rowGroupOffsets = split.getRowGroupOffsets();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    // if task.side.metadata is set, rowGroupOffsets is null
    Object cacheKey = null;
    CacheTag cacheTag = null;
    // TODO: also support fileKey in splits, like OrcSplit does
    if (metadataCache != null) {
        if (cacheKey == null) {
            cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file, HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID), HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID), !HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH));
        }
    }
    if (cacheKey != null) {
        if (HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_TRACK_CACHE_USAGE)) {
            PartitionDesc partitionDesc = LlapHiveUtils.partitionDescForPath(split.getPath(), parts);
            cacheTag = LlapHiveUtils.getDbAndTableNameForMetrics(file, true, partitionDesc);
        }
        // If we are going to use cache, change the path to depend on file ID for extra consistency.
        FileSystem fs = file.getFileSystem(configuration);
        if (cacheKey instanceof Long && HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
            file = HdfsUtils.getFileIdPath(file, (long) cacheKey);
        }
    }
    if (rowGroupOffsets == null) {
        // TODO check whether rowGroupOffSets can be null
        // then we need to apply the predicate push down filter
        footer = readSplitFooter(configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
        MessageType fileSchema = footer.getFileMetaData().getSchema();
        FilterCompat.Filter filter = getFilter(configuration);
        blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    } else {
        // otherwise we find the row groups that were selected on the client
        footer = readSplitFooter(configuration, file, cacheKey, NO_FILTER, cacheTag);
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        blocks = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            if (offsets.contains(block.getStartingPos())) {
                blocks.add(block);
            }
        }
        // verify we found them all
        if (blocks.size() != rowGroupOffsets.length) {
            long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
            for (int i = 0; i < foundRowGroupOffsets.length; i++) {
                foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
            }
            // provide a good error message in case there's a bug
            throw new IllegalStateException("All the offsets listed in the split should be found in the file." + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks + " out of: " + Arrays.toString(foundRowGroupOffsets) + " in range " + split.getStart() + ", " + split.getEnd());
        }
    }
    for (BlockMetaData block : blocks) {
        this.totalRowCount += block.getRowCount();
    }
    this.fileSchema = footer.getFileMetaData().getSchema();
    this.writerTimezone = DataWritableReadSupport.getWriterTimeZoneId(footer.getFileMetaData().getKeyValueMetaData());
    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
    requestedSchema = DataWritableReadSupport.getRequestedSchema(indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
    Path path = wrapPathForCache(file, cacheKey, configuration, blocks, cacheTag);
    this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
}
Also used : Path(org.apache.hadoop.fs.Path) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FilterCompat(org.apache.parquet.filter2.compat.FilterCompat) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetInputSplit(org.apache.parquet.hadoop.ParquetInputSplit) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) CacheTag(org.apache.hadoop.hive.common.io.CacheTag) MessageType(org.apache.parquet.schema.MessageType) HashSet(java.util.HashSet)
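The row-group matching step in the else branch above can be read in isolation: collect the offsets stored in the split, then keep the footer blocks whose starting positions appear in that set. A minimal standalone sketch of the same loop, with an illustrative class and method name:

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.parquet.hadoop.metadata.BlockMetaData;

public class RowGroupSelector {

    // Hypothetical helper mirroring the selection loop above: returns the footer
    // blocks whose starting positions match the offsets recorded in the split.
    static List<BlockMetaData> selectByOffsets(long[] rowGroupOffsets, List<BlockMetaData> footerBlocks) {
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        List<BlockMetaData> selected = new ArrayList<>();
        for (BlockMetaData block : footerBlocks) {
            if (offsets.contains(block.getStartingPos())) {
                selected.add(block);
            }
        }
        // Callers are expected to verify that selected.size() == rowGroupOffsets.length,
        // as the example above does, to catch a mismatch between split and footer.
        return selected;
    }
}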

Example 3 with ParquetInputSplit

Use of org.apache.parquet.hadoop.ParquetInputSplit in project hive by apache.

From the class VectorizedColumnReaderTestBase, method createParquetReader.

protected VectorizedParquetRecordReader createParquetReader(String schemaString, Configuration conf) throws IOException, InterruptedException, HiveException {
    conf.set(PARQUET_READ_SCHEMA, schemaString);
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
    HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
    Job vectorJob = new Job(conf, "read vector");
    ParquetInputFormat.setInputPaths(vectorJob, file);
    ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
    ParquetInputSplit split = (ParquetInputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
    initialVectorizedRowBatchCtx(conf);
    return new VectorizedParquetRecordReader(split, new JobConf(conf));
}
Also used : VectorizedParquetRecordReader(org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader) ParquetInputSplit(org.apache.parquet.hadoop.ParquetInputSplit) ParquetInputFormat(org.apache.parquet.hadoop.ParquetInputFormat) Job(org.apache.hadoop.mapreduce.Job) JobConf(org.apache.hadoop.mapred.JobConf)
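A typical way to drive the reader returned by createParquetReader is the old mapred RecordReader loop. The sketch below assumes the createValue/next/close contract with NullWritable keys and VectorizedRowBatch values, which is how the Hive test base consumes the vectorized reader; the schema string and helper name are illustrative, not taken from the example:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.io.NullWritable;

// Illustrative test helper built on createParquetReader above; the schema is an assumption.
private void readInt32Column(Configuration conf) throws IOException, InterruptedException, HiveException {
    VectorizedParquetRecordReader reader =
        createParquetReader("message test { required int32 int32_field; }", conf);
    try {
        VectorizedRowBatch batch = reader.createValue();
        while (reader.next(NullWritable.get(), batch)) {
            LongColumnVector vector = (LongColumnVector) batch.cols[0];
            for (int i = 0; i < batch.size; i++) {
                // consume vector.vector[i] here (int32 values surface as longs)
            }
        }
    } finally {
        reader.close();
    }
}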

Example 4 with ParquetInputSplit

Use of org.apache.parquet.hadoop.ParquetInputSplit in project parquet-mr by apache.

From the class DeprecatedParquetInputFormat, method getSplits.

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    if (isTaskSideMetaData(job)) {
        return super.getSplits(job, numSplits);
    }
    List<Footer> footers = getFooters(job);
    List<ParquetInputSplit> splits = realInputFormat.getSplits(job, footers);
    if (splits == null) {
        return null;
    }
    InputSplit[] resultSplits = new InputSplit[splits.size()];
    int i = 0;
    for (ParquetInputSplit split : splits) {
        resultSplits[i++] = new ParquetInputSplitWrapper(split);
    }
    return resultSplits;
}
Also used : Footer(org.apache.parquet.hadoop.Footer) ParquetInputSplit(org.apache.parquet.hadoop.ParquetInputSplit) InputSplit(org.apache.hadoop.mapred.InputSplit)
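The branch taken above is controlled by the task-side metadata switch: when it is on (the default), footers are read in the tasks and the superclass returns plain splits; when it is off, footers are read on the client and each ParquetInputSplit is wrapped for the old mapred API as shown. A minimal sketch of turning the switch off, assuming the parquet.task.side.metadata key exposed as ParquetInputFormat.TASK_SIDE_METADATA; the class and method names are illustrative:

import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class ClientSideMetadataConf {

    // Hypothetical helper: force client-side footer reading so that getSplits()
    // goes through the ParquetInputSplit wrapping branch shown above.
    static JobConf withClientSideMetadata(JobConf job) {
        job.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, false);
        return job;
    }
}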

Example 5 with ParquetInputSplit

Use of org.apache.parquet.hadoop.ParquetInputSplit in project hive by apache.

From the class ParquetRecordReaderBase, method getSplit.

/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
@SuppressWarnings("deprecation")
protected ParquetInputSplit getSplit(final org.apache.hadoop.mapred.InputSplit oldSplit, final JobConf conf) throws IOException {
    if (oldSplit.getLength() == 0) {
        return null;
    }
    ParquetInputSplit split;
    if (oldSplit instanceof FileSplit) {
        final Path finalPath = ((FileSplit) oldSplit).getPath();
        jobConf = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent());
        // TODO enable MetadataFilter by using readFooter(Configuration configuration, Path file,
        // MetadataFilter filter) API
        final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
        final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
        final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        final ReadSupport.ReadContext readContext = new DataWritableReadSupport().init(new InitContext(jobConf, null, fileMetaData.getSchema()));
        // Compute stats
        for (BlockMetaData bmd : blocks) {
            serDeStats.setRowCount(serDeStats.getRowCount() + bmd.getRowCount());
            serDeStats.setRawDataSize(serDeStats.getRawDataSize() + bmd.getTotalByteSize());
        }
        schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();
        final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
        final long splitStart = ((FileSplit) oldSplit).getStart();
        final long splitLength = ((FileSplit) oldSplit).getLength();
        for (final BlockMetaData block : blocks) {
            final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
                splitGroup.add(block);
            }
        }
        if (splitGroup.isEmpty()) {
            LOG.warn("Skipping split, could not find row group in: " + oldSplit);
            return null;
        }
        FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
        if (filter != null) {
            filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
            if (filtedBlocks.isEmpty()) {
                LOG.debug("All row groups are dropped due to filter predicates");
                return null;
            }
            long droppedBlocks = splitGroup.size() - filtedBlocks.size();
            if (droppedBlocks > 0) {
                LOG.debug("Dropping " + droppedBlocks + " row groups that do not pass filter predicate");
            }
        } else {
            filtedBlocks = splitGroup;
        }
        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION)) {
            skipTimestampConversion = !Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr");
        }
        skipProlepticConversion = DataWritableReadSupport.getWriterDateProleptic(fileMetaData.getKeyValueMetaData());
        if (skipProlepticConversion == null) {
            skipProlepticConversion = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT);
        }
        legacyConversionEnabled = HiveConf.getBoolVar(conf, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED);
        if (fileMetaData.getKeyValueMetaData().containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
            legacyConversionEnabled = Boolean.parseBoolean(fileMetaData.getKeyValueMetaData().get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY));
        }
        split = new ParquetInputSplit(finalPath, splitStart, splitLength, oldSplit.getLocations(), filtedBlocks, readContext.getRequestedSchema().toString(), fileMetaData.getSchema().toString(), fileMetaData.getKeyValueMetaData(), readContext.getReadSupportMetadata());
        return split;
    } else {
        throw new IllegalArgumentException("Unknown split type: " + oldSplit);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) DataWritableReadSupport(org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FilterCompat(org.apache.parquet.filter2.compat.FilterCompat) ArrayList(java.util.ArrayList) FileSplit(org.apache.hadoop.mapred.FileSplit) ReadSupport(org.apache.parquet.hadoop.api.ReadSupport) DataWritableReadSupport(org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport) InitContext(org.apache.parquet.hadoop.api.InitContext) ParquetInputSplit(org.apache.parquet.hadoop.ParquetInputSplit) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData)
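The row-group assignment loop above keeps a block only when its first data page offset lies in [splitStart, splitStart + splitLength), so each row group is claimed by exactly one of the non-overlapping file splits. A minimal standalone sketch of that check, with an illustrative class and method name:

import java.util.ArrayList;
import java.util.List;

import org.apache.parquet.hadoop.metadata.BlockMetaData;

public class SplitRowGroups {

    // Hypothetical helper mirroring the loop above: a row group belongs to the split
    // whose byte range contains the offset of its first data page.
    static List<BlockMetaData> blocksForSplit(List<BlockMetaData> blocks, long splitStart, long splitLength) {
        List<BlockMetaData> splitGroup = new ArrayList<>();
        for (BlockMetaData block : blocks) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
                splitGroup.add(block);
            }
        }
        return splitGroup;
    }
}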

Aggregations

ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit): 5
Path (org.apache.hadoop.fs.Path): 3
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 3
FileSplit (org.apache.hadoop.mapred.FileSplit): 2
JobConf (org.apache.hadoop.mapred.JobConf): 2
FilterCompat (org.apache.parquet.filter2.compat.FilterCompat): 2
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 2
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 2
ArrayList (java.util.ArrayList): 1
HashSet (java.util.HashSet): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
CacheTag (org.apache.hadoop.hive.common.io.CacheTag): 1
DataWritableReadSupport (org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport): 1
VectorizedParquetRecordReader (org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader): 1
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 1
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 1
InputSplit (org.apache.hadoop.mapred.InputSplit): 1
Job (org.apache.hadoop.mapreduce.Job): 1
Footer (org.apache.parquet.hadoop.Footer): 1
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 1