Use of org.apache.parquet.schema.Type in the project Drill by Apache: the class ParquetRecordWriter, method getType.
/**
 * Maps a Drill {@code MaterializedField} to the corresponding Parquet schema
 * {@link Type}. Complex Drill types (MAP, DICT, LIST) become Parquet group
 * types; NULL columns are written as optional INT; everything else is
 * delegated to {@code getPrimitiveType}.
 *
 * @param field Drill field to convert
 * @return Parquet type describing {@code field}
 */
private Type getType(MaterializedField field) {
  TypeProtos.MajorType majorType = field.getType();
  MinorType minorType = majorType.getMinorType();
  DataMode mode = majorType.getMode();
  switch (minorType) {
    case MAP: {
      Repetition repetition = mode == DataMode.REPEATED ? Repetition.REPEATED : Repetition.OPTIONAL;
      return new GroupType(repetition, field.getName(), getChildrenTypes(field));
    }
    case DICT: {
      MaterializedField dictField = field;
      if (mode == DataMode.REPEATED) {
        // RepeatedDictVector has DictVector as its data vector, so take the
        // first child in the REPEATED case to reach the map's key and value fields.
        dictField = ((List<MaterializedField>) field.getChildren()).get(0);
      }
      GroupType keyValueGroup =
          new GroupType(Repetition.REPEATED, GROUP_KEY_VALUE_NAME, getChildrenTypes(dictField));
      if (mode != DataMode.REPEATED) {
        return org.apache.parquet.schema.Types.buildGroup(Repetition.OPTIONAL)
            .as(OriginalType.MAP)
            .addField(keyValueGroup)
            .named(field.getName());
      }
      // Parquet's MAP repetition must be either optional or required, so a
      // repeated DICT is nested inside Parquet's LIST type instead.
      GroupType mapElement = org.apache.parquet.schema.Types.buildGroup(Repetition.OPTIONAL)
          .as(OriginalType.MAP)
          .addField(keyValueGroup)
          .named(LIST);
      GroupType repeatedList = new GroupType(Repetition.REPEATED, LIST, mapElement);
      return org.apache.parquet.schema.Types.buildGroup(Repetition.OPTIONAL)
          .as(OriginalType.LIST)
          .addField(repeatedList)
          .named(field.getName());
    }
    case LIST: {
      Repetition listRepetition =
          mode == DataMode.OPTIONAL ? Repetition.OPTIONAL : Repetition.REQUIRED;
      ListBuilder<GroupType> listBuilder = org.apache.parquet.schema.Types.list(listRepetition);
      addElementType(listBuilder, getDataField(field));
      return listBuilder.named(field.getName());
    }
    case NULL: {
      // Untyped NULL columns carry no data; persist them as optional INT.
      TypeProtos.MajorType intType = TypeProtos.MajorType.newBuilder()
          .setMinorType(MinorType.INT)
          .setMode(DataMode.OPTIONAL)
          .build();
      return getPrimitiveType(field.withType(intType));
    }
    default:
      return getPrimitiveType(field);
  }
}
Use of org.apache.parquet.schema.Type in the project Drill by Apache: the class ParquetRecordWriter, method addElementType.
/**
 * Adds the Parquet element type derived from Drill's {@code elementField}
 * to {@code listBuilder}, renaming the element to {@code ELEMENT} as the
 * Parquet list schema requires. A REPEATED element is represented as a
 * nested required list, recursing for non-MAP element types.
 *
 * @param listBuilder list schema builder being populated
 * @param elementField Drill's type of list elements
 */
private void addElementType(ListBuilder<GroupType> listBuilder, MaterializedField elementField) {
  if (elementField.getDataMode() != DataMode.REPEATED) {
    Type element = getType(elementField);
    // Rebuild the type under the name 'element' per the Parquet list schema.
    if (element.isPrimitive()) {
      PrimitiveType primitive = element.asPrimitiveType();
      element = new PrimitiveType(primitive.getRepetition(), primitive.getPrimitiveTypeName(),
          ELEMENT, primitive.getOriginalType());
    } else {
      GroupType group = element.asGroupType();
      element = new GroupType(group.getRepetition(), ELEMENT, group.getFields());
    }
    listBuilder.element(element);
    return;
  }
  // REPEATED element: wrap it in a nested required list.
  ListBuilder<GroupType> nestedList = org.apache.parquet.schema.Types.requiredList();
  if (elementField.getType().getMinorType() == MinorType.MAP) {
    GroupType mapGroup = new GroupType(Repetition.REQUIRED, ELEMENT, getChildrenTypes(elementField));
    nestedList.element(mapGroup);
  } else {
    addElementType(nestedList, getDataField(elementField));
  }
  listBuilder.setElementType(nestedList.named(ELEMENT));
}
Use of org.apache.parquet.schema.Type in the project Drill by Apache: the class ParquetRecordWriter, method newSchema.
/**
 * Rebuilds the Parquet {@link MessageType} from the current batch schema and
 * re-creates the page store, column write store, and record consumer for it.
 *
 * @throws IOException if the underlying stores cannot be set up
 */
private void newSchema() throws IOException {
  // Convert every Drill column; the partition comparator column is internal
  // writer bookkeeping and is not persisted.
  List<Type> fields = new ArrayList<>();
  for (MaterializedField field : batchSchema) {
    if (!field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
      fields.add(getType(field));
    }
  }
  schema = new MessageType("root", fields);

  // The block buffer should not be too small; ideally the block is divided
  // equally across the columns, although they are unlikely to be equal sizes.
  // The value is expected to stay below Integer.MAX_VALUE (2 GB) even though
  // rowGroupSize is a long, because the underlying byte-array allocation
  // limits the size to an int.
  int columnCount = schema.getColumns().size();
  int initialBlockBufferSize = columnCount > 0
      ? max(MINIMUM_BUFFER_SIZE, blockSize / columnCount / 5)
      : MINIMUM_BUFFER_SIZE;
  // The page buffer should not be too small either: slightly bigger than the
  // page size, but never bigger than the block buffer.
  int initialPageBufferSize =
      max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));

  ValuesWriterFactory valuesWriterFactory;
  if (writerVersion == WriterVersion.PARQUET_1_0) {
    valuesWriterFactory = new DefaultV1ValuesWriterFactory();
  } else {
    valuesWriterFactory = new DefaultV2ValuesWriterFactory();
  }
  ParquetProperties parquetProperties = ParquetProperties.builder()
      .withPageSize(pageSize)
      .withDictionaryEncoding(enableDictionary)
      .withDictionaryPageSize(initialPageBufferSize)
      .withAllocator(new ParquetDirectByteBufferAllocator(oContext))
      .withValuesWriterFactory(valuesWriterFactory)
      .withWriterVersion(writerVersion)
      .build();

  // TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore
  // from the parquet library once DRILL-7906 (PARQUET-1006) is resolved.
  pageStore = new ParquetColumnChunkPageWriteStore(
      codecFactory.getCompressor(codec),
      schema,
      parquetProperties.getInitialSlabSize(),
      pageSize,
      parquetProperties.getAllocator(),
      parquetProperties.getColumnIndexTruncateLength(),
      parquetProperties.getPageWriteChecksumEnabled());
  if (writerVersion == WriterVersion.PARQUET_1_0) {
    store = new ColumnWriteStoreV1(schema, pageStore, parquetProperties);
  } else {
    store = new ColumnWriteStoreV2(schema, pageStore, parquetProperties);
  }

  MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(schema);
  consumer = columnIO.getRecordWriter(store);
  setUp(schema, consumer);
}
Aggregations