Use of org.apache.parquet.schema.Type in project parquet-mr by apache: class DataWritableReadSupport, method resolveSchemaAccess.
/**
 * Determine the file column names based on the position within the requested columns and
 * use that as the requested schema.
 */
private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType fileSchema,
    Configuration configuration) {
  if (configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) {
    final List<String> listColumns = getColumns(configuration.get(IOConstants.COLUMNS));
    List<Type> requestedTypes = new ArrayList<Type>();
    for (Type t : requestedSchema.getFields()) {
      int index = listColumns.indexOf(t.getName());
      requestedTypes.add(fileSchema.getType(index));
    }
    requestedSchema = new MessageType(requestedSchema.getName(), requestedTypes);
  }
  return requestedSchema;
}
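A minimal, self-contained sketch of the positional lookup above, assuming hypothetical schemas and a hard-coded column list in place of the Configuration and getColumns helper (both belong to the surrounding class):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class IndexAccessSketch {
  public static void main(String[] args) {
    // Columns as they physically appear in the file.
    MessageType fileSchema = Types.buildMessage()
        .addField(Types.required(PrimitiveTypeName.INT32).named("id"))
        .addField(Types.optional(PrimitiveTypeName.BINARY).named("name"))
        .named("file_schema");
    // The requested column names; each name's position indexes into the file schema.
    List<String> listColumns = Arrays.asList("id", "name");
    MessageType requestedSchema = Types.buildMessage()
        .addField(Types.optional(PrimitiveTypeName.BINARY).named("name"))
        .named("requested");
    List<Type> requestedTypes = new ArrayList<Type>();
    for (Type t : requestedSchema.getFields()) {
      int index = listColumns.indexOf(t.getName());
      requestedTypes.add(fileSchema.getType(index));  // lookup by position, not by name
    }
    // Prints the file's definition of "name" under the requested schema's name.
    System.out.println(new MessageType(requestedSchema.getName(), requestedTypes));
  }
}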
Use of org.apache.parquet.schema.Type in project parquet-mr by apache: class AvroSchemaConverter, method convertUnion.
private Type convertUnion(String fieldName, Schema schema, Type.Repetition repetition) {
  List<Schema> nonNullSchemas = new ArrayList<Schema>(schema.getTypes().size());
  for (Schema childSchema : schema.getTypes()) {
    if (childSchema.getType().equals(Schema.Type.NULL)) {
      if (Type.Repetition.REQUIRED == repetition) {
        repetition = Type.Repetition.OPTIONAL;
      }
    } else {
      nonNullSchemas.add(childSchema);
    }
  }
  // If the union holds only a null and one other type, it collapses to a simple optional field;
  // otherwise construct a union container.
  switch (nonNullSchemas.size()) {
    case 0:
      throw new UnsupportedOperationException("Cannot convert Avro union of only nulls");
    case 1:
      return convertField(fieldName, nonNullSchemas.get(0), repetition);
    default:
      // complex union type
      List<Type> unionTypes = new ArrayList<Type>(nonNullSchemas.size());
      int index = 0;
      for (Schema childSchema : nonNullSchemas) {
        unionTypes.add(convertField("member" + index++, childSchema, Type.Repetition.OPTIONAL));
      }
      return new GroupType(repetition, fieldName, unionTypes);
  }
}
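A usage sketch exercising this logic through the public AvroSchemaConverter, which routes union fields through convertUnion. The record and field names here are made up for illustration; a ["null", "string"] union collapses to a single optional field, while a three-branch union becomes an optional group of memberN fields:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.schema.MessageType;

public class UnionSketch {
  public static void main(String[] args) {
    Schema record = SchemaBuilder.record("r").fields()
        // ["null", "string"]: becomes a simple optional binary field
        .name("simple").type().unionOf().nullType().and().stringType().endUnion().nullDefault()
        // ["null", "string", "int"]: becomes an optional group with member0 and member1
        .name("complex").type().unionOf().nullType().and().stringType().and().intType().endUnion().nullDefault()
        .endRecord();
    MessageType parquet = new AvroSchemaConverter().convert(record);
    System.out.println(parquet);
  }
}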
Use of org.apache.parquet.schema.Type in project parquet-mr by apache: class TestPigSchemaConverter, method testListsOfPrimitive.
@Test
public void testListsOfPrimitive() throws Exception {
  for (Type.Repetition repetition : Type.Repetition.values()) {
    for (Type.Repetition valueRepetition : Type.Repetition.values()) {
      for (PrimitiveType.PrimitiveTypeName primitiveTypeName : PrimitiveType.PrimitiveTypeName.values()) {
        if (primitiveTypeName != PrimitiveType.PrimitiveTypeName.INT96) {
          // INT96 is NYI
          Types.PrimitiveBuilder<PrimitiveType> value = Types.primitive(primitiveTypeName, valueRepetition);
          if (primitiveTypeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
            value.length(1);
          }
          GroupType type = Types.buildGroup(repetition)
              .addField(value.named("b"))
              .as(OriginalType.LIST)
              .named("a");
          // no exceptions, please
          pigSchemaConverter.convertField(type);
        }
      }
    }
  }
}
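For context, here is one concrete instance of the schema shapes those nested loops generate, built with the same public Types builder (the pigSchemaConverter field belongs to the test class and is not shown):

import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class ListShapeSketch {
  public static void main(String[] args) {
    // An optional group "a" annotated as LIST, holding a repeated int32 field "b".
    GroupType type = Types.buildGroup(Type.Repetition.OPTIONAL)
        .addField(Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REPEATED).named("b"))
        .as(OriginalType.LIST)
        .named("a");
    System.out.println(type);  // prints the group schema with its LIST annotation
  }
}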
Use of org.apache.parquet.schema.Type in project parquet-mr by apache: class GroupWriter, method writeGroup.
private void writeGroup(Group group, GroupType type) {
  int fieldCount = type.getFieldCount();
  for (int field = 0; field < fieldCount; ++field) {
    int valueCount = group.getFieldRepetitionCount(field);
    if (valueCount > 0) {
      Type fieldType = type.getType(field);
      String fieldName = fieldType.getName();
      recordConsumer.startField(fieldName, field);
      for (int index = 0; index < valueCount; ++index) {
        if (fieldType.isPrimitive()) {
          group.writeValue(field, index, recordConsumer);
        } else {
          recordConsumer.startGroup();
          writeGroup(group.getGroup(field, index), fieldType.asGroupType());
          recordConsumer.endGroup();
        }
      }
      recordConsumer.endField(fieldName, field);
    }
  }
}
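A sketch of the kind of data this method walks, assuming the parquet example module's Group and SimpleGroupFactory (the recordConsumer field is supplied by the surrounding writer and is not reproduced here):

import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GroupWalkSketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message doc { required binary title; repeated group links { required int64 id; } }");
    Group doc = new SimpleGroupFactory(schema).newGroup();
    doc.append("title", "hello");
    doc.addGroup("links").append("id", 7L);
    // For this group, writeGroup would call startField("title", 0), write the value,
    // endField, then startField("links", 1), startGroup, recurse, endGroup, endField.
    System.out.println(doc);
  }
}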
Use of org.apache.parquet.schema.Type in project drill by axbaretto: class ParquetRecordWriter, method newSchema.
private void newSchema() throws IOException {
  List<Type> types = Lists.newArrayList();
  for (MaterializedField field : batchSchema) {
    if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
      continue;
    }
    types.add(getType(field));
  }
  schema = new MessageType("root", types);
  // We don't want this number to be too small; ideally we divide the block equally across the columns.
  // It is unlikely all columns are going to be the same size.
  // The value is likely below Integer.MAX_VALUE (2GB), although rowGroupSize is a long type.
  // Therefore this size is cast to int, since allocating the byte array in the underlying
  // layer must limit the array size to the int range.
  int initialBlockBufferSize = max(MINIMUM_BUFFER_SIZE, blockSize / this.schema.getColumns().size() / 5);
  // We don't want this number to be too small either. Ideally it is slightly bigger than the page size,
  // but not bigger than the block buffer.
  int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
  // TODO: Use initialSlabSize from ParquetProperties once Drill is updated to the latest version of the Parquet library.
  int initialSlabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(64, pageSize, 10);
  // TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore from the parquet library
  // once PARQUET-1006 is resolved.
  pageStore = new ParquetColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, initialSlabSize,
      pageSize, new ParquetDirectByteBufferAllocator(oContext));
  store = new ColumnWriteStoreV1(pageStore, pageSize, initialPageBufferSize, enableDictionary, writerVersion,
      new ParquetDirectByteBufferAllocator(oContext));
  MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
  consumer = columnIO.getRecordWriter(store);
  setUp(schema, consumer);
}
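The two buffer heuristics in isolation, as a runnable sketch. The MINIMUM_BUFFER_SIZE value and the example block, page, and column figures are assumptions for illustration, not Drill's actual constants:

import static java.lang.Math.max;
import static java.lang.Math.min;

public class BufferHeuristicSketch {
  // Hypothetical floor; Drill defines its own MINIMUM_BUFFER_SIZE constant.
  private static final int MINIMUM_BUFFER_SIZE = 64 * 1024;

  public static void main(String[] args) {
    int blockSize = 128 * 1024 * 1024;  // 128 MB row group
    int pageSize = 1024 * 1024;         // 1 MB pages
    int columnCount = 10;
    // A fifth of each column's equal share of the block, floored at the minimum.
    int initialBlockBufferSize = max(MINIMUM_BUFFER_SIZE, blockSize / columnCount / 5);
    // About 10% larger than a page, but never larger than the block buffer.
    int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
    System.out.println(initialBlockBufferSize + " bytes / " + initialPageBufferSize + " bytes");
  }
}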