Use of org.apache.hadoop.hive.ql.io.parquet.VectorizedParquetInputFormat in project hive by apache.
Example: the parquetRecordReader method of the HiveVectorizedReader class. It wraps the requested file range in a FileSplit, prunes the Parquet file schema down to the expected schema taken from the scan task (adding fallback field IDs when the file has none), publishes the resulting column names through IOConstants.COLUMNS, and delegates to VectorizedParquetInputFormat to obtain a vectorized record reader.
private static RecordReader<NullWritable, VectorizedRowBatch> parquetRecordReader(JobConf job, Reporter reporter,
    FileScanTask task, Path path, long start, long length) throws IOException {
  InputSplit split = new FileSplit(path, start, length, job);
  VectorizedParquetInputFormat inputFormat = new VectorizedParquetInputFormat();

  // Read the Parquet footer to obtain the file's physical schema.
  MessageType fileSchema = ParquetFileReader.readFooter(job, path).getFileMetaData().getSchema();
  MessageType typeWithIds = null;
  Schema expectedSchema = task.spec().schema();

  // Prune the file schema down to the expected schema; add fallback field IDs if the file has none.
  if (ParquetSchemaUtil.hasIds(fileSchema)) {
    typeWithIds = ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema);
  } else {
    typeWithIds = ParquetSchemaUtil.pruneColumnsFallback(ParquetSchemaUtil.addFallbackIds(fileSchema), expectedSchema);
  }

  // Collect the column names to project and pass them to the vectorized reader via the JobConf.
  ParquetSchemaFieldNameVisitor psv = new ParquetSchemaFieldNameVisitor(fileSchema);
  TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), typeWithIds, psv);
  job.set(IOConstants.COLUMNS, psv.retrieveColumnNameList());

  return inputFormat.getRecordReader(split, job, reporter);
}
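
The returned reader is an old-style mapred RecordReader that refills VectorizedRowBatch objects rather than single rows. Below is a minimal consumption sketch, assuming a reader obtained as above; parquetRecordReader itself is private, so this only illustrates how such a reader is typically driven, and countBatchedRows is a hypothetical helper, not part of HiveVectorizedReader.

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

public class VectorizedBatchExample {
  // Hypothetical helper: drains a vectorized reader and counts the rows it produced.
  static long countBatchedRows(RecordReader<NullWritable, VectorizedRowBatch> reader) throws java.io.IOException {
    NullWritable key = reader.createKey();
    VectorizedRowBatch batch = reader.createValue();
    long rows = 0;
    try {
      // next() refills the batch in place; batch.size is the number of valid rows it holds.
      while (reader.next(key, batch)) {
        rows += batch.size;
      }
    } finally {
      reader.close();
    }
    return rows;
  }
}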