use of org.apache.parquet.schema.Type in project drill by axbaretto.
the class ParquetRecordWriter method getType.
/**
 * Builds the Parquet schema type that corresponds to a Drill field.
 *
 * <p>A MAP field becomes a {@link GroupType} whose members are the recursively
 * converted child fields; the group is REPEATED when the field's data mode is
 * REPEATED and OPTIONAL for every other mode. LIST fields are rejected. Every
 * other minor type is delegated to {@code getPrimitiveType}.
 *
 * @param field the Drill field to convert
 * @return the Parquet type for {@code field}
 * @throws UnsupportedOperationException if the field's minor type is LIST
 */
private Type getType(MaterializedField field) {
  final MinorType fieldKind = field.getType().getMinorType();
  final DataMode mode = field.getType().getMode();

  switch (fieldKind) {
    case LIST:
      throw new UnsupportedOperationException("Unsupported type " + fieldKind);

    case MAP: {
      // Convert the children first, then wrap them in a single group.
      final List<Type> memberTypes = Lists.newArrayList();
      for (MaterializedField member : field.getChildren()) {
        memberTypes.add(getType(member));
      }

      final Repetition repetition;
      if (mode == DataMode.REPEATED) {
        repetition = Repetition.REPEATED;
      } else {
        repetition = Repetition.OPTIONAL;
      }
      return new GroupType(repetition, field.getName(), memberTypes);
    }

    default:
      return getPrimitiveType(field);
  }
}
use of org.apache.parquet.schema.Type in project drill by axbaretto.
the class Metadata method getColTypeInfo.
/**
 * Resolves the column type information for the column addressed by {@code path}.
 *
 * <p>While {@code type} is a group, the method descends into the child named
 * {@code path[depth]} and recurses with {@code depth + 1}. Once a primitive type
 * is reached, its original type, decimal precision/scale (both 0 when the type
 * carries no decimal metadata) and the maximum repetition and definition levels
 * reported by {@code schema} for {@code path} are packed into a {@link ColTypeInfo}.
 *
 * @param schema the message schema used to look up repetition/definition levels
 * @param type the type at the current depth of the traversal
 * @param path the full column path
 * @param depth index into {@code path} of the next segment to descend into
 * @return the type information of the primitive column at {@code path}
 */
private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, int depth) {
  if (!type.isPrimitive()) {
    // Not at a leaf yet: step into the next path segment.
    Type child = ((GroupType) type).getType(path[depth]);
    return getColTypeInfo(schema, child, path, depth + 1);
  }

  PrimitiveType leaf = (PrimitiveType) type;
  boolean hasDecimalInfo = leaf.getDecimalMetadata() != null;
  int precision = hasDecimalInfo ? leaf.getDecimalMetadata().getPrecision() : 0;
  int scale = hasDecimalInfo ? leaf.getDecimalMetadata().getScale() : 0;

  int maxRepetition = schema.getMaxRepetitionLevel(path);
  int maxDefinition = schema.getMaxDefinitionLevel(path);
  return new ColTypeInfo(type.getOriginalType(), precision, scale, maxRepetition, maxDefinition);
}
use of org.apache.parquet.schema.Type in project drill by axbaretto.
the class DrillParquetReader method setup.
/**
 * Prepares this reader for the row group identified by {@code entry}.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>stores the operator context and reads the file schema from the footer;</li>
 * <li>picks the projection: the whole schema for a star query, otherwise the result of
 * {@code getProjection}, falling back to the whole schema when that result is {@code null};</li>
 * <li>for every requested column missing from the file, registers a nullable INT vector on
 * {@code output} and remembers it in {@code nullFilledVectors}; when every requested
 * column is missing, {@code noColumnsFound} is set;</li>
 * <li>creates the column IO, indexes the row group's column chunk metadata by path, sets
 * {@code recordCount} from the row group and creates the page read store;</li>
 * <li>unless {@code noColumnsFound} is set, creates the record materializer and record reader.</li>
 * </ol>
 *
 * <p>Any {@link Exception} raised along the way is passed to {@code handleAndRaise}.
 *
 * @param context operator context; its allocator is used for the codec factory and the read store
 * @param output mutator that receives the vectors created for columns missing from the file
 * @throws ExecutionSetupException declared by the signature; presumably raised through
 * {@code handleAndRaise} -- confirm against that helper
 */
@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
try {
this.operatorContext = context;
schema = footer.getFileMetaData().getSchema();
MessageType projection;
if (isStarQuery()) {
projection = schema;
} else {
columnsNotFound = new ArrayList<>();
projection = getProjection(schema, getColumns(), columnsNotFound);
// getProjection returned no projection at all: read the full file schema instead.
if (projection == null) {
projection = schema;
}
if (columnsNotFound != null && columnsNotFound.size() > 0) {
nullFilledVectors = new ArrayList<>();
for (SchemaPath col : columnsNotFound) {
// col.toExpr() is used here as field name since we don't want to see these fields in the existing maps
nullFilledVectors.add((NullableIntVector) output.addField(MaterializedField.create(col.toExpr(), org.apache.drill.common.types.Types.optional(TypeProtos.MinorType.INT)), (Class<? extends ValueVector>) TypeHelper.getValueVectorClass(TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)));
}
// Every requested column is absent from the file; the record reader is not created below.
if (columnsNotFound.size() == getColumns().size()) {
noColumnsFound = true;
}
}
}
logger.debug("Requesting schema {}", projection);
ColumnIOFactory factory = new ColumnIOFactory(false);
MessageColumnIO columnIO = factory.getColumnIO(projection, schema);
// Index the column chunk metadata of this reader's row group by column path.
Map<ColumnPath, ColumnChunkMetaData> paths = new HashMap<>();
for (ColumnChunkMetaData md : footer.getBlocks().get(entry.getRowGroupIndex()).getColumns()) {
paths.put(md.getPath(), md);
}
Path filePath = new Path(entry.getPath());
BlockMetaData blockMetaData = footer.getBlocks().get(entry.getRowGroupIndex());
// The row count is a long in the metadata and is narrowed to int here.
recordCount = (int) blockMetaData.getRowCount();
pageReadStore = new ColumnChunkIncReadStore(recordCount, CodecFactory.createDirectCodecFactory(fileSystem.getConf(), new ParquetDirectByteBufferAllocator(operatorContext.getAllocator()), 0), operatorContext.getAllocator(), fileSystem, filePath);
// Note: this walks the paths of the full file schema, not of the projection.
for (String[] path : schema.getPaths()) {
Type type = schema.getType(path);
if (type.isPrimitive()) {
ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
pageReadStore.addColumn(schema.getColumnDescription(path), md);
}
}
if (!noColumnsFound) {
// Discard the columns not found in the schema when create DrillParquetRecordMaterializer, since they have been added to output already.
@SuppressWarnings("unchecked") final Collection<SchemaPath> columns = columnsNotFound == null || columnsNotFound.size() == 0 ? getColumns() : CollectionUtils.subtract(getColumns(), columnsNotFound);
recordMaterializer = new DrillParquetRecordMaterializer(output, projection, columns, fragmentContext.getOptions(), containsCorruptedDates);
recordReader = columnIO.getRecordReader(pageReadStore, recordMaterializer);
}
} catch (Exception e) {
// Single exit point for every failure in the setup sequence above.
handleAndRaise("Failure in setting up reader", e);
}
}
use of org.apache.parquet.schema.Type in project drill by axbaretto.
the class DrillParquetReader method getProjection.
/**
 * Derives the Parquet projection schema for the requested columns.
 *
 * <p>Array segments are dropped from every requested path, the result is matched
 * against the column paths of {@code schema}, and the Parquet types of all matching
 * schema columns are merged into one {@link MessageType} carrying the schema's name.
 * Requested paths (with array segments removed) that match no schema column are
 * appended to {@code columnsNotFound}.
 *
 * @param schema the Parquet file schema
 * @param columns the requested projection columns
 * @param columnsNotFound output list receiving the requested paths absent from {@code schema}
 * @return the merged projection, or {@code null} when no requested column matched
 */
public static MessageType getProjection(MessageType schema, Collection<SchemaPath> columns, List<SchemaPath> columnsNotFound) {
  final String messageName = schema.getName();
  final List<ColumnDescriptor> schemaColumns = schema.getColumns();

  // Parquet schema paths carry no array elements, so strip array segments from the requested paths.
  final List<SchemaPath> requestedPaths = Lists.newLinkedList();
  for (SchemaPath requested : columns) {
    final List<String> names = Lists.newArrayList();
    PathSegment current = requested.getRootSegment();
    do {
      if (current.isNamed()) {
        names.add(current.getNameSegment().getPath());
      }
      current = current.getChild();
    } while (current != null);
    requestedPaths.add(SchemaPath.getCompoundPath(names.toArray(new String[names.size()])));
  }

  // Express the file's columns as SchemaPaths so they can be compared with the requested paths
  // (the original note states the intent is a case-insensitive comparison).
  final List<SchemaPath> filePaths = Lists.newLinkedList();
  for (ColumnDescriptor descriptor : schemaColumns) {
    final String[] descriptorPath = descriptor.getPath();
    filePaths.add(SchemaPath.getCompoundPath(Arrays.copyOf(descriptorPath, descriptorPath.length)));
  }

  // parquet type.union() seems to lose ConvertedType info when merging two columns that are the same
  // type (e.g. two elements selected from one array), so matches are de-duplicated in a set first
  // and only merged at the very end.
  final Set<SchemaPath> matchedPaths = Sets.newLinkedHashSet();
  for (SchemaPath requested : requestedPaths) {
    boolean found = false;
    for (SchemaPath candidate : filePaths) {
      if (candidate.contains(requested)) {
        matchedPaths.add(candidate);
        found = true;
      }
    }
    if (!found) {
      columnsNotFound.add(requested);
    }
  }

  // Turn every matched path into its Parquet type and fold the types into one projection schema.
  MessageType projection = null;
  for (SchemaPath matched : matchedPaths) {
    final List<String> names = Lists.newArrayList();
    PathSegment current = matched.getRootSegment();
    do {
      names.add(current.getNameSegment().getPath());
      current = current.getChild();
    } while (current != null);

    final Type columnType = getType(names.toArray(new String[names.size()]), 0, schema);
    final MessageType single = new MessageType(messageName, columnType);
    projection = (projection == null) ? single : projection.union(single);
  }
  return projection;
}
use of org.apache.parquet.schema.Type in project drill by apache.
the class DrillParquetReader method adaptColumnsToParquetSchema.
/**
 * Rewrites the projection columns so that they line up with the column paths of the
 * given Parquet schema. For every column the path is walked segment by segment and:
 * <ul>
 * <li>array segments are left out of the resulting path;</li>
 * <li>the walk stops at a Parquet logical MAP, so that EvaluationVisitor can manage the
 * get-by-key logic;</li>
 * <li>for a logical list, the list name and element name are appended, because they exist
 * in the schema but are absent from the original projection column.</li>
 * </ul>
 *
 * @param columns original projection columns
 * @param schema Parquet file schema
 * @return adjusted projection columns, one per input column and in the same order
 */
private static List<SchemaPath> adaptColumnsToParquetSchema(Collection<SchemaPath> columns, MessageType schema) {
  List<SchemaPath> adapted = new LinkedList<>();
  for (SchemaPath column : columns) {
    List<String> names = new ArrayList<>();
    Type currentType = schema;
    PathSegment segment = column.getRootSegment();
    while (segment != null) {
      if (segment.isNamed()) {
        names.add(segment.getNameSegment().getPath());
      }
      currentType = getSegmentType(currentType, segment);
      if (currentType != null && !currentType.isPrimitive()) {
        GroupType group = currentType.asGroupType();
        if (ParquetReaderUtility.isLogicalMapType(group)) {
          // Stop at a logical MAP: values obtained from the dict by key differ from the
          // actual column's path, so deeper segments are not added to the projection.
          break;
        }
        if (ParquetReaderUtility.isLogicalListType(group)) {
          Type listWrapper = group.getType(0);
          // 'list' or 'bag'
          String listName = listWrapper.getName();
          // 'element' or 'array_element'
          String elementName = listWrapper.asGroupType().getType(0).getName();
          names.add(listName);
          names.add(elementName);
        }
      }
      segment = segment.getChild();
    }
    String[] adaptedPath = names.toArray(new String[0]);
    adapted.add(SchemaPath.getCompoundPath(adaptedPath));
  }
  return adapted;
}
Aggregations