Use of org.apache.parquet.schema.Type in project hive by apache.
Class DataWritableReadSupport, method getProjectedType.
private static Type getProjectedType(TypeInfo colType, Type fieldType) {
  switch (colType.getCategory()) {
    case STRUCT:
      // Project only the requested struct fields, preserving the group's
      // repetition and name.
      List<Type> groupFields = getProjectedGroupFields(
          fieldType.asGroupType(),
          ((StructTypeInfo) colType).getAllStructFieldNames(),
          ((StructTypeInfo) colType).getAllStructFieldTypeInfos());
      Type[] typesArray = groupFields.toArray(new Type[0]);
      return Types.buildGroup(fieldType.getRepetition())
          .addFields(typesArray)
          .named(fieldType.getName());
    case LIST:
      TypeInfo elemType = ((ListTypeInfo) colType).getListElementTypeInfo();
      if (elemType.getCategory() == ObjectInspector.Category.STRUCT) {
        Type subFieldType = fieldType.asGroupType().getType(0);
        if (!subFieldType.isPrimitive()) {
          String subFieldName = subFieldType.getName();
          Text name = new Text(subFieldName);
          if (name.equals(ParquetHiveSerDe.ARRAY) || name.equals(ParquetHiveSerDe.LIST)) {
            // Standard LIST layout: recurse into the repeated wrapper group.
            subFieldType = new GroupType(Repetition.REPEATED, subFieldName,
                getProjectedType(elemType, subFieldType.asGroupType().getType(0)));
          } else {
            subFieldType = getProjectedType(elemType, subFieldType);
          }
          return Types.buildGroup(Repetition.OPTIONAL)
              .as(OriginalType.LIST)
              .addFields(subFieldType)
              .named(fieldType.getName());
        }
      }
      break;
    default:
      // Primitive and map columns fall through and are returned unchanged.
  }
  return fieldType;
}
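For reference, a minimal standalone sketch of the STRUCT branch's projection idea, using only the parquet-java Types builder; the schema and field names here (person, id, name, score) are invented for illustration:

import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Type.Repetition;
import org.apache.parquet.schema.Types;

public class ProjectedTypeDemo {
  public static void main(String[] args) {
    // Full schema of a hypothetical "person" struct with three fields.
    GroupType full = Types.buildGroup(Repetition.OPTIONAL)
        .required(PrimitiveTypeName.INT64).named("id")
        .required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("name")
        .required(PrimitiveTypeName.DOUBLE).named("score")
        .named("person");
    // Projection as in the STRUCT branch: keep the group's repetition and
    // name, but only the requested subset of fields.
    Type projected = Types.buildGroup(full.getRepetition())
        .addFields(full.getType("id"), full.getType("name"))
        .named(full.getName());
    System.out.println(projected);
  }
}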
Use of org.apache.parquet.schema.Type in project hive by apache.
Class DataWritableReadSupport, method projectLeafTypes.
private static List<Type> projectLeafTypes(List<Type> types, List<FieldNode> nodes) {
  List<Type> res = new ArrayList<>();
  if (nodes.isEmpty()) {
    return res;
  }
  // Index the requested field nodes by lower-cased name for case-insensitive lookup.
  Map<String, FieldNode> fieldMap = new HashMap<>();
  for (FieldNode n : nodes) {
    fieldMap.put(n.getFieldName().toLowerCase(), n);
  }
  for (Type type : types) {
    String tn = type.getName().toLowerCase();
    if (fieldMap.containsKey(tn)) {
      FieldNode f = fieldMap.get(tn);
      if (f.getNodes().isEmpty()) {
        // No child nodes: keep the whole field, nothing to prune.
        res.add(type);
      } else {
        if (type instanceof GroupType) {
          // Recurse into the group and keep only the requested leaves.
          GroupType groupType = type.asGroupType();
          List<Type> ts = projectLeafTypes(groupType.getFields(), f.getNodes());
          GroupType g = buildProjectedGroupType(groupType, ts);
          if (g != null) {
            res.add(g);
          }
        } else {
          throw new RuntimeException("Primitive type " + f.getFieldName()
              + " should not have child nodes: " + f);
        }
      }
    }
  }
  return res;
}
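The same pruning loop may be easier to follow in a flattened form. The sketch below is a simplified, hypothetical variant (projectByName is not a Hive API) that matches on plain lower-cased names instead of Hive's FieldNode tree, so it handles only one level of nesting:

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.parquet.schema.Type;

public class LeafProjectionSketch {
  // Flat variant of the pruning loop: keep only the Parquet fields whose
  // lower-cased names appear in the requested set. Hive's FieldNode version
  // additionally recurses into GroupTypes for nested structs.
  static List<Type> projectByName(List<Type> types, Set<String> wanted) {
    List<Type> res = new ArrayList<>();
    for (Type t : types) {
      if (wanted.contains(t.getName().toLowerCase())) {
        res.add(t);
      }
    }
    return res;
  }
}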
Use of org.apache.parquet.schema.Type in project hive by apache.
Class VectorizedParquetRecordReader, method checkEndOfRowGroup.
private void checkEndOfRowGroup() throws IOException {
  if (rowsReturned != totalCountLoadedSoFar) {
    return;
  }
  PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read "
        + rowsReturned + " out of " + totalRowCount);
  }
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<Type> types = requestedSchema.getFields();
  columnReaders = new VectorizedColumnReader[columns.size()];
  String timeZoneId = jobConf.get(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY);
  if (!ColumnProjectionUtils.isReadAllColumns(jobConf) && !indexColumnsWanted.isEmpty()) {
    // Column pruning is in effect: map each requested column index back to
    // its Hive type before building the reader.
    for (int i = 0; i < types.size(); ++i) {
      columnReaders[i] = buildVectorizedParquetReader(
          columnTypesList.get(indexColumnsWanted.get(i)), types.get(i), pages,
          requestedSchema.getColumns(), timeZoneId, 0);
    }
  } else {
    for (int i = 0; i < types.size(); ++i) {
      columnReaders[i] = buildVectorizedParquetReader(
          columnTypesList.get(i), types.get(i), pages,
          requestedSchema.getColumns(), timeZoneId, 0);
    }
  }
  totalCountLoadedSoFar += pages.getRowCount();
}
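The readNextRowGroup() contract driving this method, one PageReadStore per row group and null after the last block, can be exercised directly. A minimal sketch, assuming parquet-hadoop is on the classpath and the file path arrives as a program argument:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class RowGroupScan {
  public static void main(String[] args) throws Exception {
    // Walk the row groups of a file the same way checkEndOfRowGroup does:
    // readNextRowGroup() returns null once the last block has been consumed.
    Path path = new Path(args[0]);
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
      PageReadStore pages;
      while ((pages = reader.readNextRowGroup()) != null) {
        System.out.println("row group with " + pages.getRowCount() + " rows");
      }
    }
  }
}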
Use of org.apache.parquet.schema.Type in project drill by apache.
Class ParquetRecordWriter, method newSchema.
private void newSchema() throws IOException {
  List<Type> types = Lists.newArrayList();
  for (MaterializedField field : batchSchema) {
    // The partition comparator field is internal to the writer and is not
    // part of the on-disk schema.
    if (field.getPath().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
      continue;
    }
    types.add(getType(field));
  }
  schema = new MessageType("root", types);
  int initialBlockBufferSize = max(MINIMUM_BUFFER_SIZE, blockSize / this.schema.getColumns().size() / 5);
  pageStore = ColumnChunkPageWriteStoreExposer.newColumnChunkPageWriteStore(
      this.oContext, codecFactory.getCompressor(codec), schema);
  int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
  store = new ColumnWriteStoreV1(pageStore, pageSize, initialPageBufferSize,
      enableDictionary, writerVersion, new ParquetDirectByteBufferAllocator(oContext));
  MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
  consumer = columnIO.getRecordWriter(store);
  setUp(schema, consumer);
}
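The schema assembly step here is just the two-argument MessageType constructor over a list of field Types. A minimal sketch with invented field names:

import java.util.ArrayList;
import java.util.List;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Type.Repetition;

public class RootSchemaDemo {
  public static void main(String[] args) {
    // Collect one Parquet Type per field, then wrap the list in a "root"
    // message, exactly as newSchema does with the batch schema.
    List<Type> types = new ArrayList<>();
    types.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT32, "a"));
    types.add(new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.BINARY, "b"));
    MessageType schema = new MessageType("root", types);
    System.out.println(schema);
  }
}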
Use of org.apache.parquet.schema.Type in project drill by apache.
Class ParquetRecordWriter, method getType.
private Type getType(MaterializedField field) {
  MinorType minorType = field.getType().getMinorType();
  DataMode dataMode = field.getType().getMode();
  switch (minorType) {
    case MAP:
      // Maps become Parquet groups; recurse over the children.
      List<Type> types = Lists.newArrayList();
      for (MaterializedField childField : field.getChildren()) {
        types.add(getType(childField));
      }
      return new GroupType(dataMode == DataMode.REPEATED ? Repetition.REPEATED : Repetition.OPTIONAL,
          field.getLastName(), types);
    case LIST:
      throw new UnsupportedOperationException("Unsupported type " + minorType);
    default:
      return getPrimitiveType(field);
  }
}
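The MAP branch's GroupType construction can be seen in isolation below; the field names lat, lon, and location are invented for illustration, and the Repetition choice mirrors the dataMode test above:

import java.util.Arrays;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Type.Repetition;

public class NestedGroupDemo {
  public static void main(String[] args) {
    // Children are converted first, then wrapped in a GroupType whose
    // repetition depends on whether the parent field is repeated.
    Type lat = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.DOUBLE, "lat");
    Type lon = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.DOUBLE, "lon");
    GroupType location = new GroupType(Repetition.OPTIONAL, "location", Arrays.asList(lat, lon));
    System.out.println(location);
  }
}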