use of org.apache.parquet.schema.Type in project presto by prestodb.
the class SingleLevelArraySchemaConverter method convertMapType.
// Converts a Hive map type into the Parquet map group produced by
// ConversionPatterns.mapType. The group takes the caller-supplied repetition and
// name; its key field is always converted as REQUIRED, while the value field is
// converted through the two-argument convertType overload.
private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo, final Repetition repetition) {
    // Key first, then value: the same evaluation order as the nested calls it replaces.
    final String keyFieldName = ParquetHiveSerDe.MAP_KEY.toString();
    final Type mapKey = convertType(keyFieldName, typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED);

    final String valueFieldName = ParquetHiveSerDe.MAP_VALUE.toString();
    final Type mapValue = convertType(valueFieldName, typeInfo.getMapValueTypeInfo());

    return ConversionPatterns.mapType(repetition, name, mapKey, mapValue);
}
use of org.apache.parquet.schema.Type in project presto by prestodb.
the class ParquetTypeVisitor method visit.
/**
 * Dispatches {@code type} to the matching callback of {@code visitor} and returns that
 * callback's result.
 *
 * <ul>
 *   <li>A {@link MessageType} goes to {@code visitor.message} with the results of its fields.</li>
 *   <li>A primitive goes to {@code visitor.primitive}.</li>
 *   <li>A group annotated LIST or MAP is validated, its single repeated child is unwrapped, and
 *       the element (or key/value) results are passed to {@code visitor.list} /
 *       {@code visitor.map}. Missing element, key, or value fields are reported as {@code null}.</li>
 *   <li>Any other group goes to {@code visitor.struct} with the results of its fields.</li>
 * </ul>
 *
 * While the children of a list or map are visited, the name of the repeated inner group is
 * pushed on {@code visitor.fieldNames}; it is popped again even if a callback throws.
 *
 * @param type the Parquet type to visit
 * @param visitor the visitor receiving the callbacks
 * @param <T> the result type produced by the visitor
 * @return whatever the selected visitor callback returns
 */
public static <T> T visit(Type type, ParquetTypeVisitor<T> visitor) {
    if (type instanceof MessageType) {
        return visitor.message((MessageType) type, visitFields(type.asGroupType(), visitor));
    }
    if (type.isPrimitive()) {
        return visitor.primitive(type.asPrimitiveType());
    }

    // Neither a message nor a primitive: treat it as a group.
    final GroupType group = type.asGroupType();
    final OriginalType logicalKind = group.getOriginalType();

    if (logicalKind == LIST) {
        checkArgument(!group.isRepetition(REPEATED), "Invalid list: top-level group is repeated: " + group);
        checkArgument(group.getFieldCount() == 1, "Invalid list: does not contain single repeated field: " + group);
        final GroupType repeated = group.getFields().get(0).asGroupType();
        checkArgument(repeated.isRepetition(REPEATED), "Invalid list: inner group is not repeated");
        checkArgument(repeated.getFieldCount() <= 1, "Invalid list: repeated group is not a single field: " + group);

        visitor.fieldNames.push(repeated.getName());
        try {
            // An empty repeated group has no element to visit; the visitor then receives null.
            final T element = repeated.getFieldCount() > 0
                    ? visitField(repeated.getType(0), visitor)
                    : null;
            return visitor.list(group, element);
        } finally {
            visitor.fieldNames.pop();
        }
    }

    if (logicalKind == MAP) {
        checkArgument(!group.isRepetition(REPEATED), "Invalid map: top-level group is repeated: " + group);
        checkArgument(group.getFieldCount() == 1, "Invalid map: does not contain single repeated field: " + group);
        final GroupType entries = group.getType(0).asGroupType();
        checkArgument(entries.isRepetition(REPEATED), "Invalid map: inner group is not repeated");
        checkArgument(entries.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields");

        visitor.fieldNames.push(entries.getName());
        try {
            T keyResult = null;
            T valueResult = null;
            switch (entries.getFieldCount()) {
                case 2:
                    keyResult = visitField(entries.getType(0), visitor);
                    valueResult = visitField(entries.getType(1), visitor);
                    break;
                case 1:
                    // Only one side is present; the field name decides whether it is the key.
                    final Type onlyField = entries.getType(0);
                    if (onlyField.getName().equalsIgnoreCase("key")) {
                        keyResult = visitField(onlyField, visitor);
                    } else {
                        valueResult = visitField(onlyField, visitor);
                    }
                    break;
                default:
                    // No fields: both results stay null.
                    break;
            }
            return visitor.map(group, keyResult, valueResult);
        } finally {
            visitor.fieldNames.pop();
        }
    }

    return visitor.struct(group, visitFields(group, visitor));
}
use of org.apache.parquet.schema.Type in project flink by apache.
the class ParquetVectorizedInputFormat method checkSchema.
/**
 * Verifies that {@code requestedSchema} can be read from a file written with
 * {@code fileSchema}.
 *
 * <p>Every requested field must be a non-repeated primitive. A requested column that exists in
 * the file must have a column descriptor equal to the file's. A requested column that is absent
 * from the file is tolerated only when its maximum definition level is greater than zero.
 *
 * @param fileSchema schema stored in the data file
 * @param requestedSchema schema the reader was asked to produce
 * @throws IOException if a requested column with maximum definition level 0 is missing from the file
 * @throws UnsupportedOperationException if a requested field is a group or repeated, or if a
 *     column's descriptor differs between the two schemas
 * @throws RuntimeException if the number of projected fields differs from the number of fields
 *     in {@code requestedSchema}
 */
private void checkSchema(MessageType fileSchema, MessageType requestedSchema) throws IOException, UnsupportedOperationException {
    final int requestedFieldCount = requestedSchema.getFieldCount();
    if (projectedFields.length != requestedFieldCount) {
        throw new RuntimeException("The number of projected fields (" + projectedFields.length
                + ") is incompatible with the requested schema (" + requestedFieldCount + " fields)!");
    }

    // Fetch these once: MessageType rebuilds the path and column-descriptor lists on every
    // call, so asking for them per iteration makes the check quadratic in the field count.
    final java.util.List<Type> fields = requestedSchema.getFields();
    final java.util.List<String[]> paths = requestedSchema.getPaths();
    final java.util.List<ColumnDescriptor> columns = requestedSchema.getColumns();

    /*
     * Check that the requested schema is supported.
     */
    for (int i = 0; i < requestedFieldCount; ++i) {
        final Type field = fields.get(i);
        if (!field.isPrimitive() || field.isRepetition(Type.Repetition.REPEATED)) {
            throw new UnsupportedOperationException("Complex types not supported.");
        }

        // All fields up to and including i are primitives, so leaf index i is field i.
        final String[] colPath = paths.get(i);
        final ColumnDescriptor requested = columns.get(i);
        if (fileSchema.containsPath(colPath)) {
            final ColumnDescriptor inFile = fileSchema.getColumnDescription(colPath);
            if (!inFile.equals(requested)) {
                throw new UnsupportedOperationException("Schema evolution not supported.");
            }
        } else if (requested.getMaxDefinitionLevel() == 0) {
            // A column with definition level 0 cannot be absent from the file.
            throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath));
        }
    }
}
use of org.apache.parquet.schema.Type in project hive by apache.
the class ParquetFilterPredicateConverter method translate.
/**
 * Recursively converts an expression tree into a Parquet {@link FilterPredicate}.
 *
 * <p>A {@code null} result means "no predicate": it is returned for CONSTANT nodes, for leaves
 * whose column is not a key of {@code columns}, for a NOT whose operand produced no predicate,
 * and for an OR as soon as any child produced no predicate. An AND drops children that
 * produced no predicate and combines the rest.
 *
 * @param root the node to translate
 * @param leaves the predicate leaves, indexed by {@code root.getLeaf()}
 * @param columns column name to Hive type; only leaves on these columns are translated
 * @param schema the Parquet schema used to look up each leaf column's type
 * @return the translated predicate, or {@code null} when none applies
 * @throws IllegalStateException if the node's operator is not one of the handled kinds
 * @throws Exception declared by the signature; this block does not show which callee raises it
 */
private static FilterPredicate translate(ExpressionTree root, List<PredicateLeaf> leaves, Map<String, TypeInfo> columns, MessageType schema) throws Exception {
    switch (root.getOperator()) {
        case OR: {
            FilterPredicate disjunction = null;
            for (ExpressionTree child : root.getChildren()) {
                final FilterPredicate branch = translate(child, leaves, columns, schema);
                if (branch == null) {
                    // One untranslatable branch makes the whole OR untranslatable.
                    return null;
                }
                disjunction = (disjunction == null) ? branch : FilterApi.or(disjunction, branch);
            }
            return disjunction;
        }
        case AND: {
            FilterPredicate conjunction = null;
            for (ExpressionTree child : root.getChildren()) {
                final FilterPredicate branch = translate(child, leaves, columns, schema);
                if (conjunction == null) {
                    conjunction = branch;
                } else if (branch != null) {
                    // A null branch means no filter for that child, so it is skipped.
                    conjunction = FilterApi.and(conjunction, branch);
                }
            }
            return conjunction;
        }
        case NOT: {
            final FilterPredicate operand = translate(root.getChildren().get(0), leaves, columns, schema);
            return (operand == null) ? null : FilterApi.not(operand);
        }
        case LEAF: {
            final PredicateLeaf leaf = leaves.get(root.getLeaf());
            final String columnName = leaf.getColumnName();
            if (!columns.containsKey(columnName)) {
                // Do not create predicate if the leaf is not on the passed schema.
                return null;
            }
            final Type parquetType = schema.getType(columnName);
            final TypeInfo hiveType = columns.get(columnName);
            return buildFilterPredicateFromPredicateLeaf(leaf, parquetType, hiveType);
        }
        case CONSTANT:
            // no filter will be executed for constant
            return null;
        default:
            throw new IllegalStateException("Unknown operator: " + root.getOperator());
    }
}
use of org.apache.parquet.schema.Type in project hive by apache.
the class DataWritableReadSupport method getProjectedSchema.
/**
 * Builds the projected schema for the requested column indexes.
 *
 * <p>For each index in {@code colIndexes}:
 * <ul>
 *   <li>an index at or beyond {@code colNames.size()} is skipped;</li>
 *   <li>an index beyond the file schema's field count contributes an optional BINARY
 *       placeholder named {@code "_mask_" + colNames.get(index)};</li>
 *   <li>a field that is primitive, or whose lower-cased name has no entry among the pruned
 *       nested columns, is added as is;</li>
 *   <li>any other (group) field is replaced by the types returned from
 *       {@code projectLeafTypes} for its pruned column node.</li>
 * </ul>
 *
 * @param schema original schema
 * @param colNames names of the table columns, indexed by column position
 * @param colIndexes the index of needed columns
 * @param nestedColumnPaths the paths for nested columns
 * @return a new message type carrying {@code schema}'s name and the projected fields
 */
public static MessageType getProjectedSchema(MessageType schema, List<String> colNames, List<Integer> colIndexes, Set<String> nestedColumnPaths) {
    final List<Type> projected = new ArrayList<>();
    final Map<String, FieldNode> prunedByName = getPrunedNestedColumns(nestedColumnPaths);

    for (int index : colIndexes) {
        if (index >= colNames.size()) {
            continue;
        }

        if (index >= schema.getFieldCount()) {
            // prefixing with '_mask_' to ensure no conflict with named
            // columns in the file schema
            projected.add(Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(index)));
            continue;
        }

        final Type field = schema.getType(index);
        final String lowerName = field.getName().toLowerCase();
        if (!prunedByName.containsKey(lowerName) || field.isPrimitive()) {
            // Not pruned, or a primitive: nothing to project, add the field itself.
            projected.add(field);
            continue;
        }

        // Group type with pruning info: keep only the required leaves.
        projected.addAll(projectLeafTypes(Arrays.asList(field), Arrays.asList(prunedByName.get(lowerName))));
    }

    return new MessageType(schema.getName(), projected);
}
Aggregations