Use of org.apache.hive.iceberg.org.apache.parquet.schema.MessageType in the Apache Hive project.
Class HiveVectorizedReader, method parquetRecordReader.
/**
 * Creates a vectorized Parquet record reader for a byte range of a single file.
 *
 * <p>The file's footer schema is pruned against the schema of the task's partition spec, the
 * resulting column name list is written into the job configuration under
 * {@code IOConstants.COLUMNS}, and the reader is then obtained from a
 * {@code VectorizedParquetInputFormat}.
 *
 * @param job job configuration; its {@code IOConstants.COLUMNS} entry is overwritten by this call
 * @param reporter reporter handed through to the underlying input format
 * @param task scan task supplying the expected schema (via {@code task.spec().schema()})
 * @param path location of the Parquet file
 * @param start offset of the split within the file
 * @param length length of the split
 * @return the record reader produced by the input format for the split
 * @throws IOException if reading the footer or creating the reader fails
 */
private static RecordReader<NullWritable, VectorizedRowBatch> parquetRecordReader(JobConf job, Reporter reporter, FileScanTask task, Path path, long start, long length) throws IOException {
  InputSplit split = new FileSplit(path, start, length, job);

  MessageType fileSchema = ParquetFileReader.readFooter(job, path).getFileMetaData().getSchema();
  Schema expectedSchema = task.spec().schema();

  // Prune by field IDs when the file carries them; otherwise assign fallback IDs first.
  MessageType prunedSchema = ParquetSchemaUtil.hasIds(fileSchema)
      ? ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema)
      : ParquetSchemaUtil.pruneColumnsFallback(ParquetSchemaUtil.addFallbackIds(fileSchema), expectedSchema);

  // Walk the expected schema alongside the pruned file schema to collect the column names to read.
  ParquetSchemaFieldNameVisitor nameVisitor = new ParquetSchemaFieldNameVisitor(fileSchema);
  TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), prunedSchema, nameVisitor);
  job.set(IOConstants.COLUMNS, nameVisitor.retrieveColumnNameList());

  return new VectorizedParquetInputFormat().getRecordReader(split, job, reporter);
}
Use of org.apache.hive.iceberg.org.apache.parquet.schema.MessageType in the Apache Hive project.
Class ParquetSchemaFieldNameVisitor, method struct.
/**
 * Visits a struct: records one column name per expected (non-metadata) field and rebuilds the
 * Parquet group from the fields already registered in {@code typesById}.
 *
 * <p>For each expected field, the name appended to the column name list is:
 * <ul>
 *   <li>the Parquet field's own name, when a type is registered under the field's ID;</li>
 *   <li>the expected field's name, when no type is registered and the original file schema has no
 *       field of that name;</li>
 *   <li>{@code DUMMY_COL_NAME}, when no type is registered but the original file schema does
 *       contain a field of that name.</li>
 * </ul>
 *
 * @param expected the expected struct type; {@code null} is treated as having no fields
 * @param struct the Parquet group being visited; a {@code MessageType} marks the top level
 * @param fields visited child types (not used by this implementation)
 * @return a new {@code MessageType} named "table" for the top level, otherwise a new repeated
 *     {@code GroupType} which is also stored in {@code typesById} under the struct's ID
 */
@Override
public Type struct(Types.StructType expected, GroupType struct, List<Type> fields) {
  boolean topLevel = struct instanceof MessageType;

  List<Types.NestedField> expectedFields;
  if (expected == null) {
    expectedFields = ImmutableList.of();
  } else {
    expectedFields = expected.fields();
  }

  List<Type> keptTypes = Lists.newArrayListWithExpectedSize(expectedFields.size());
  for (Types.NestedField expectedField : expectedFields) {
    int fieldId = expectedField.fieldId();
    // Metadata columns are skipped entirely.
    if (!MetadataColumns.metadataFieldIds().contains(fieldId)) {
      Type prunedFileField = typesById.get(fieldId);
      if (prunedFileField != null) {
        // The column exists in this parquet file: keep its type and use its original name.
        keptTypes.add(prunedFileField);
        appendToColNamesList(topLevel, prunedFileField.getName());
      } else if (originalFileSchema.containsField(expectedField.name())) {
        // Same name but a different ID in the file, so the field must have been recreated since.
        // A dummy column name forces the Hive Parquet reader to return null for this column.
        appendToColNamesList(topLevel, DUMMY_COL_NAME);
      } else {
        // A new field that is not in this parquet file yet: add its name instead of null.
        appendToColNamesList(topLevel, expectedField.name());
      }
    }
  }

  if (topLevel) {
    return new MessageType("table", keptTypes);
  }

  // Nested struct: register the rebuilt group under the struct's ID before returning it.
  GroupType rebuiltGroup = new GroupType(Type.Repetition.REPEATED, fieldNames.peek(), keptTypes);
  typesById.put(struct.getId().intValue(), rebuiltGroup);
  return rebuiltGroup;
}
Aggregations