Use of org.apache.parquet.schema.Type in project hive by apache.
The class DataWritableReadSupport, method projectLeafTypes.
private static List<Type> projectLeafTypes(List<Type> types, List<FieldNode> nodes) {
  List<Type> res = new ArrayList<>();
  if (nodes.isEmpty()) {
    return res;
  }
  Map<String, FieldNode> fieldMap = new HashMap<>();
  for (FieldNode n : nodes) {
    fieldMap.put(n.getFieldName().toLowerCase(), n);
  }
  for (Type type : types) {
    String tn = type.getName().toLowerCase();
    if (fieldMap.containsKey(tn)) {
      FieldNode f = fieldMap.get(tn);
      if (f.getNodes().isEmpty()) {
        // no children: keep the whole field, no pruning needed
        res.add(type);
      } else if (type instanceof GroupType) {
        GroupType groupType = type.asGroupType();
        List<Type> ts = projectLeafTypes(groupType.getFields(), f.getNodes());
        GroupType g = buildProjectedGroupType(groupType, ts);
        if (g != null) {
          res.add(g);
        }
      } else {
        throw new RuntimeException("Primitive type " + f.getFieldName()
            + " doesn't match FieldNode " + f);
      }
    }
  }
  return res;
}
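The buildProjectedGroupType helper called above is not shown on this page. A minimal sketch of what it might look like (an assumption; the actual Hive implementation may differ):

private static GroupType buildProjectedGroupType(GroupType original, List<Type> projectedFields) {
  if (projectedFields == null || projectedFields.isEmpty()) {
    // nothing survived the pruning, so the caller drops this group entirely
    return null;
  }
  // GroupType.withNewFields keeps the name and repetition but swaps the field list
  return original.withNewFields(projectedFields);
}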
Use of org.apache.parquet.schema.Type in project hive by apache.
The class VectorizedParquetRecordReader, method checkEndOfRowGroup.
private void checkEndOfRowGroup() throws IOException {
  if (rowsReturned != totalCountLoadedSoFar) {
    return;
  }
  PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read "
        + rowsReturned + " out of " + totalRowCount);
  }
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<Type> types = requestedSchema.getFields();
  columnReaders = new VectorizedColumnReader[columns.size()];
  if (!ColumnProjectionUtils.isReadAllColumns(jobConf)) {
    // Reading a projection: if colsToInclude is non-empty, initialize a
    // columnReader for each included column.
    if (!colsToInclude.isEmpty()) {
      for (int i = 0; i < types.size(); ++i) {
        columnReaders[i] = buildVectorizedParquetReader(
            columnTypesList.get(colsToInclude.get(i)), types.get(i), pages,
            requestedSchema.getColumns(), skipTimestampConversion, 0);
      }
    }
  } else {
    for (int i = 0; i < types.size(); ++i) {
      columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(i),
          types.get(i), pages, requestedSchema.getColumns(),
          skipTimestampConversion, 0);
    }
  }
  totalCountLoadedSoFar += pages.getRowCount();
}
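For context, a hedged sketch of how this bookkeeping is typically consumed: a nextBatch-style method (names and shapes here are illustrative, not the exact Hive code) checks for row-group exhaustion before filling each batch.

boolean nextBatch(VectorizedRowBatch batch) throws IOException {
  batch.reset();
  if (rowsReturned >= totalRowCount) {
    return false;  // every row in the file has been returned
  }
  checkEndOfRowGroup();  // may load the next row group and rebuild columnReaders
  int num = (int) Math.min(batch.getMaxSize(), totalCountLoadedSoFar - rowsReturned);
  // ... read `num` values from each columnReader into the batch's column vectors ...
  rowsReturned += num;
  batch.size = num;
  return true;
}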
Use of org.apache.parquet.schema.Type in project hive by apache.
The class VectorizedParquetRecordReader, method buildVectorizedParquetReader.
// Builds a VectorizedColumnReader from a Hive TypeInfo and a Parquet schema type.
private VectorizedColumnReader buildVectorizedParquetReader(TypeInfo typeInfo, Type type,
    PageReadStore pages, List<ColumnDescriptor> columnDescriptors,
    boolean skipTimestampConversion, int depth) throws IOException {
  List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors);
  switch (typeInfo.getCategory()) {
    case PRIMITIVE:
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      if (fileSchema.getColumns().contains(descriptors.get(0))) {
        return new VectorizedPrimitiveColumnReader(descriptors.get(0),
            pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type, typeInfo);
      } else {
        // Support for schema evolution: the column is in the table schema
        // but not in this file, so return a reader that yields nulls.
        return new VectorizedDummyColumnReader();
      }
    case STRUCT:
      StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
      List<VectorizedColumnReader> fieldReaders = new ArrayList<>();
      List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
      List<Type> types = type.asGroupType().getFields();
      for (int i = 0; i < fieldTypes.size(); i++) {
        VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i),
            types.get(i), pages, descriptors, skipTimestampConversion, depth + 1);
        if (r != null) {
          fieldReaders.add(r);
        } else {
          throw new RuntimeException("Failed to build Parquet vectorized reader based on Hive type "
              + fieldTypes.get(i).getTypeName() + " and Parquet type " + types.get(i));
        }
      }
      return new VectorizedStructColumnReader(fieldReaders);
    case LIST:
      checkListColumnSupport(((ListTypeInfo) typeInfo).getListElementTypeInfo());
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      return new VectorizedListColumnReader(descriptors.get(0),
          pages.getPageReader(descriptors.get(0)), skipTimestampConversion,
          getElementType(type), typeInfo);
    case MAP:
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      // Handle the two different Map definitions allowed in Parquet, e.g.:
      // a definition with one group:
      //   repeated group map (MAP_KEY_VALUE)
      //     {required binary key (UTF8); optional binary value (UTF8);}
      // and a definition with two groups:
      //   optional group m1 (MAP) {
      //     repeated group map (MAP_KEY_VALUE)
      //       {required binary key (UTF8); optional binary value (UTF8);}
      //   }
      // Drill into nested single-field groups until the key/value pair is
      // reached, but no deeper than MAP_DEFINITION_LEVEL_MAX levels.
      int nestGroup = 0;
      GroupType groupType = type.asGroupType();
      while (groupType.getFieldCount() < 2) {
        if (nestGroup > MAP_DEFINITION_LEVEL_MAX) {
          throw new RuntimeException("More than " + MAP_DEFINITION_LEVEL_MAX
              + " levels found in the Map definition; failed to get the field types for Map with type " + type);
        }
        groupType = groupType.getFields().get(0).asGroupType();
        nestGroup++;
      }
      List<Type> kvTypes = groupType.getFields();
      VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader(
          descriptors.get(0), pages.getPageReader(descriptors.get(0)),
          skipTimestampConversion, kvTypes.get(0), typeInfo);
      VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader(
          descriptors.get(1), pages.getPageReader(descriptors.get(1)),
          skipTimestampConversion, kvTypes.get(1), typeInfo);
      return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader);
    case UNION:
    default:
      throw new RuntimeException("Unsupported category " + typeInfo.getCategory().name());
  }
}
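The getAllColumnDescriptorByType helper invoked at the top of this method is not shown here. One plausible sketch (an assumption, not the verbatim Hive code) keeps the descriptors whose path component at the current nesting depth matches the field name:

private List<ColumnDescriptor> getAllColumnDescriptorByType(
    int depth, Type type, List<ColumnDescriptor> columns) {
  List<ColumnDescriptor> res = new ArrayList<>();
  for (ColumnDescriptor descriptor : columns) {
    // a descriptor belongs to this field if its path at `depth` is the field name
    if (depth < descriptor.getPath().length
        && descriptor.getPath()[depth].equals(type.getName())) {
      res.add(descriptor);
    }
  }
  return res;
}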
Use of org.apache.parquet.schema.Type in project parquet-mr by apache.
The class DataWritableWriter, method writeData.
private void writeData(final ArrayWritable arr, final GroupType type) {
  if (arr == null) {
    return;
  }
  final int fieldCount = type.getFieldCount();
  Writable[] values = arr.get();
  for (int field = 0; field < fieldCount; ++field) {
    final Type fieldType = type.getType(field);
    final String fieldName = fieldType.getName();
    final Writable value = values[field];
    if (value == null) {
      continue;
    }
    recordConsumer.startField(fieldName, field);
    if (fieldType.isPrimitive()) {
      writePrimitive(value);
    } else {
      recordConsumer.startGroup();
      if (value instanceof ArrayWritable) {
        if (fieldType.asGroupType().getRepetition().equals(Type.Repetition.REPEATED)) {
          writeArray((ArrayWritable) value, fieldType.asGroupType());
        } else {
          writeData((ArrayWritable) value, fieldType.asGroupType());
        }
      } else {
        // value is known non-null here (null fields are skipped above)
        throw new ParquetEncodingException("This should be an ArrayWritable or MapWritable: " + value);
      }
      recordConsumer.endGroup();
    }
    recordConsumer.endField(fieldName, field);
  }
}
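A hedged sketch of the record-level entry point that drives writeData (illustrative; the real DataWritableWriter may differ): the record consumer brackets each record with startMessage()/endMessage(), and `schema` is assumed to be the file's root GroupType.

public void write(ArrayWritable record) {
  recordConsumer.startMessage();
  writeData(record, schema);  // recurse through the root group's fields
  recordConsumer.endMessage();
}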
Use of org.apache.parquet.schema.Type in project parquet-mr by apache.
The class AvroSchemaConverter, method convertFields.
private Schema convertFields(String name, List<Type> parquetFields) {
  List<Schema.Field> fields = new ArrayList<Schema.Field>();
  for (Type parquetType : parquetFields) {
    Schema fieldSchema = convertField(parquetType);
    if (parquetType.isRepetition(REPEATED)) {
      throw new UnsupportedOperationException(
          "REPEATED not supported outside LIST or MAP. Type: " + parquetType);
    } else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) {
      fields.add(new Schema.Field(parquetType.getName(), optional(fieldSchema), null, NULL_VALUE));
    } else {
      // REQUIRED
      fields.add(new Schema.Field(parquetType.getName(), fieldSchema, null, (Object) null));
    }
  }
  Schema schema = Schema.createRecord(name, null, null, false);
  schema.setFields(fields);
  return schema;
}
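A short usage sketch showing how convertFields is reached in practice: AvroSchemaConverter.convert(MessageType) walks the Parquet fields and produces the corresponding Avro record schema (the schema string below is illustrative).

MessageType parquet = MessageTypeParser.parseMessageType(
    "message example { required binary name (UTF8); optional int32 age; }");
Schema avro = new AvroSchemaConverter().convert(parquet);
// -> Avro record "example" with a required string `name` and a nullable int `age`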