Use of org.apache.parquet.hadoop.ParquetInputSplit in project parquet-mr by apache.
Class ParquetRecordReaderWrapper, method getSplit.
/**
* gets a ParquetInputSplit corresponding to a split given by Hive
*
* @param oldSplit The split given by Hive
* @param conf The JobConf of the Hive job
* @return a ParquetInputSplit corresponding to the oldSplit
* @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
*/
protected ParquetInputSplit getSplit(final InputSplit oldSplit, final JobConf conf) throws IOException {
  if (oldSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) oldSplit;
    final long splitStart = fileSplit.getStart();
    final long splitLength = fileSplit.getLength();
    final Path finalPath = fileSplit.getPath();
    final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());
    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    final ReadContext readContext = new DataWritableReadSupport().init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
    schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount();
    return new ParquetInputSplit(finalPath, splitStart, splitStart + splitLength, splitLength, fileSplit.getLocations(), null);
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
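For orientation, here is a minimal, self-contained sketch of the same conversion performed by getSplit() above, using the deprecated six-argument ParquetInputSplit constructor that appears in the return statement; the file path and host name are hypothetical placeholders, not values from the original project.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.parquet.hadoop.ParquetInputSplit;

public class SplitConversionSketch {
  // Mirrors the conversion in getSplit() above.
  // Constructor arguments: (file, start, end, length, hosts, rowGroupOffsets);
  // a null rowGroupOffsets defers row-group selection to the task side.
  static ParquetInputSplit toParquetSplit(FileSplit fileSplit) throws IOException {
    return new ParquetInputSplit(
        fileSplit.getPath(),
        fileSplit.getStart(),
        fileSplit.getStart() + fileSplit.getLength(),
        fileSplit.getLength(),
        fileSplit.getLocations(),
        null);
  }

  public static void main(String[] args) throws IOException {
    // Hypothetical FileSplit standing in for the split Hive hands to getSplit().
    FileSplit hiveSplit = new FileSplit(new Path("/tmp/example.parquet"), 0L, 1024L, new String[] { "host1" });
    System.out.println(toParquetSplit(hiveSplit).getEnd());
  }
}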
Use of org.apache.parquet.hadoop.ParquetInputSplit in project hive by apache.
Class VectorizedParquetRecordReader, method initialize.
@SuppressWarnings("deprecation")
public void initialize(InputSplit oldSplit, JobConf configuration) throws IOException, InterruptedException, HiveException {
  // the oldSplit may be null during the split phase
  if (oldSplit == null) {
    return;
  }
  ParquetMetadata footer;
  List<BlockMetaData> blocks;
  MapWork mapWork = LlapHiveUtils.findMapWork(jobConf);
  if (mapWork != null) {
    parts = mapWork.getPathToPartitionInfo();
  }
  ParquetInputSplit split = (ParquetInputSplit) oldSplit;
  boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
  this.file = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
  String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
  columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
  // if task.side.metadata is set, rowGroupOffsets is null
  Object cacheKey = null;
  CacheTag cacheTag = null;
  // TODO: also support fileKey in splits, like OrcSplit does
  if (metadataCache != null) {
    if (cacheKey == null) {
      cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file, HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID), HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID), !HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH));
    }
  }
  if (cacheKey != null) {
    if (HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_TRACK_CACHE_USAGE)) {
      PartitionDesc partitionDesc = LlapHiveUtils.partitionDescForPath(split.getPath(), parts);
      cacheTag = LlapHiveUtils.getDbAndTableNameForMetrics(file, true, partitionDesc);
    }
    // If we are going to use cache, change the path to depend on file ID for extra consistency.
    FileSystem fs = file.getFileSystem(configuration);
    if (cacheKey instanceof Long && HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
      file = HdfsUtils.getFileIdPath(file, (long) cacheKey);
    }
  }
  if (rowGroupOffsets == null) {
    // TODO check whether rowGroupOffSets can be null
    // then we need to apply the predicate push down filter
    footer = readSplitFooter(configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(configuration);
    blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    // otherwise we find the row groups that were selected on the client
    footer = readSplitFooter(configuration, file, cacheKey, NO_FILTER, cacheTag);
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    blocks = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        blocks.add(block);
      }
    }
    // verify we found them all
    if (blocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      // provide a good error message in case there's a bug
      throw new IllegalStateException("All the offsets listed in the split should be found in the file." + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks + " out of: " + Arrays.toString(foundRowGroupOffsets) + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  for (BlockMetaData block : blocks) {
    this.totalRowCount += block.getRowCount();
  }
  this.fileSchema = footer.getFileMetaData().getSchema();
  this.writerTimezone = DataWritableReadSupport.getWriterTimeZoneId(footer.getFileMetaData().getKeyValueMetaData());
  colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
  requestedSchema = DataWritableReadSupport.getRequestedSchema(indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
  Path path = wrapPathForCache(file, cacheKey, configuration, blocks, cacheTag);
  this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
}
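Once initialize() has run, the reader is consumed like any mapred RecordReader that produces VectorizedRowBatch values. A minimal sketch of that consumption, assuming an already-initialized reader (the helper class and method name are hypothetical):

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader;
import org.apache.hadoop.io.NullWritable;

public class VectorizedReadSketch {
  // Drains an already-initialized reader and returns the number of rows it produced.
  static long countRows(VectorizedParquetRecordReader reader) throws IOException {
    NullWritable key = reader.createKey();
    VectorizedRowBatch batch = reader.createValue();
    long rows = 0;
    while (reader.next(key, batch)) {
      rows += batch.size; // each successful next() fills one vectorized batch
    }
    reader.close();
    return rows;
  }
}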
Use of org.apache.parquet.hadoop.ParquetInputSplit in project hive by apache.
Class VectorizedColumnReaderTestBase, method createParquetReader.
protected VectorizedParquetRecordReader createParquetReader(String schemaString, Configuration conf) throws IOException, InterruptedException, HiveException {
  conf.set(PARQUET_READ_SCHEMA, schemaString);
  HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
  HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
  Job vectorJob = new Job(conf, "read vector");
  ParquetInputFormat.setInputPaths(vectorJob, file);
  ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
  ParquetInputSplit split = (ParquetInputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
  initialVectorizedRowBatchCtx(conf);
  return new VectorizedParquetRecordReader(split, new JobConf(conf));
}
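The schemaString handed to this helper is a Parquet message type in the same textual syntax that MessageTypeParser parses in the getSplit() methods elsewhere in this page. A small, self-contained sketch (the schema and field name are hypothetical examples, not taken from the Hive tests):

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ReadSchemaSketch {
  public static void main(String[] args) {
    // Hypothetical read schema of the kind passed to createParquetReader() as schemaString.
    String schemaString = "message test { required int32 int32_field; }";
    MessageType schema = MessageTypeParser.parseMessageType(schemaString);
    System.out.println(schema.getFieldCount()); // 1
  }
}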
Use of org.apache.parquet.hadoop.ParquetInputSplit in project parquet-mr by apache.
Class DeprecatedParquetInputFormat, method getSplits.
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  if (isTaskSideMetaData(job)) {
    return super.getSplits(job, numSplits);
  }
  List<Footer> footers = getFooters(job);
  List<ParquetInputSplit> splits = realInputFormat.getSplits(job, footers);
  if (splits == null) {
    return null;
  }
  InputSplit[] resultSplits = new InputSplit[splits.size()];
  int i = 0;
  for (ParquetInputSplit split : splits) {
    resultSplits[i++] = new ParquetInputSplitWrapper(split);
  }
  return resultSplits;
}
Use of org.apache.parquet.hadoop.ParquetInputSplit in project hive by apache.
Class ParquetRecordReaderBase, method getSplit.
/**
* gets a ParquetInputSplit corresponding to a split given by Hive
*
* @param oldSplit The split given by Hive
* @param conf The JobConf of the Hive job
* @return a ParquetInputSplit corresponding to the oldSplit
* @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
*/
@SuppressWarnings("deprecation")
protected ParquetInputSplit getSplit(final org.apache.hadoop.mapred.InputSplit oldSplit, final JobConf conf) throws IOException {
  if (oldSplit.getLength() == 0) {
    return null;
  }
  ParquetInputSplit split;
  if (oldSplit instanceof FileSplit) {
    final Path finalPath = ((FileSplit) oldSplit).getPath();
    jobConf = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent());
    // TODO enable MetadataFilter by using readFooter(Configuration configuration, Path file,
    // MetadataFilter filter) API
    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
    final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    final ReadSupport.ReadContext readContext = new DataWritableReadSupport().init(new InitContext(jobConf, null, fileMetaData.getSchema()));
    // Compute stats
    for (BlockMetaData bmd : blocks) {
      serDeStats.setRowCount(serDeStats.getRowCount() + bmd.getRowCount());
      serDeStats.setRawDataSize(serDeStats.getRawDataSize() + bmd.getTotalByteSize());
    }
    schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();
    final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
    final long splitStart = ((FileSplit) oldSplit).getStart();
    final long splitLength = ((FileSplit) oldSplit).getLength();
    for (final BlockMetaData block : blocks) {
      final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
      if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
        splitGroup.add(block);
      }
    }
    if (splitGroup.isEmpty()) {
      LOG.warn("Skipping split, could not find row group in: " + oldSplit);
      return null;
    }
    FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
    if (filter != null) {
      filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
      if (filtedBlocks.isEmpty()) {
        LOG.debug("All row groups are dropped due to filter predicates");
        return null;
      }
      long droppedBlocks = splitGroup.size() - filtedBlocks.size();
      if (droppedBlocks > 0) {
        LOG.debug("Dropping " + droppedBlocks + " row groups that do not pass filter predicate");
      }
    } else {
      filtedBlocks = splitGroup;
    }
    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION)) {
      skipTimestampConversion = !Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr");
    }
    skipProlepticConversion = DataWritableReadSupport.getWriterDateProleptic(fileMetaData.getKeyValueMetaData());
    if (skipProlepticConversion == null) {
      skipProlepticConversion = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT);
    }
    legacyConversionEnabled = HiveConf.getBoolVar(conf, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED);
    if (fileMetaData.getKeyValueMetaData().containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
      legacyConversionEnabled = Boolean.parseBoolean(fileMetaData.getKeyValueMetaData().get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY));
    }
    split = new ParquetInputSplit(finalPath, splitStart, splitLength, oldSplit.getLocations(), filtedBlocks, readContext.getRequestedSchema().toString(), fileMetaData.getSchema().toString(), fileMetaData.getKeyValueMetaData(), readContext.getReadSupportMetadata());
    return split;
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
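The row-group selection inside this method can be read in isolation: a row group belongs to the split when its first column's first data page offset falls inside [splitStart, splitStart + splitLength). A minimal sketch of that same check, extracted into a standalone helper (class and method names are hypothetical):

import java.util.ArrayList;
import java.util.List;
import org.apache.parquet.hadoop.metadata.BlockMetaData;

public class RowGroupSelectionSketch {
  // Same criterion as the loop in getSplit() above: keep the row groups whose first
  // data page starts inside the byte range covered by the Hive FileSplit.
  static List<BlockMetaData> selectRowGroups(List<BlockMetaData> blocks, long splitStart, long splitLength) {
    List<BlockMetaData> selected = new ArrayList<>();
    for (BlockMetaData block : blocks) {
      long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
      if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
        selected.add(block);
      }
    }
    return selected;
  }
}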