Use of org.apache.parquet.schema.Type in project parquet-mr by apache.
The class MetadataUtils, method showDetails:
private static void showDetails(PrettyPrintWriter out, GroupType type, int depth, MessageType container, List<String> cpath) {
    // Print the group header (one leading dot per nesting level, then its
    // repetition and field count), then recurse into each child field while
    // tracking the current column path.
    String name = Strings.repeat(".", depth) + type.getName();
    Repetition rep = type.getRepetition();
    int fcount = type.getFieldCount();
    out.format("%s: %s F:%d%n", name, rep, fcount);
    cpath.add(type.getName());
    for (Type ftype : type.getFields()) {
        showDetails(out, ftype, depth + 1, container, cpath);
    }
    cpath.remove(cpath.size() - 1);
}
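For context, a minimal sketch of what this walk prints for a small schema. The class name ShowDetailsDemo and the sample schema string are illustrative; MessageTypeParser is the standard parquet-mr schema parser, and the demo reproduces the formatted line directly rather than going through the tool-internal PrettyPrintWriter:

import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ShowDetailsDemo {
    public static void main(String[] args) {
        // For the group "links" at depth 1, showDetails would print ".links: OPTIONAL F:1".
        MessageType schema = MessageTypeParser.parseMessageType(
            "message doc { optional group links { repeated int64 backward; } }");
        GroupType links = schema.getType("links").asGroupType();
        System.out.format(".%s: %s F:%d%n", links.getName(), links.getRepetition(), links.getFieldCount());
    }
}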
Use of org.apache.parquet.schema.Type in project parquet-mr by apache.
The class TupleWriteSupport, method write:
@Override
public void write(TupleEntry record) {
    recordConsumer.startMessage();
    final List<Type> fields = rootSchema.getFields();
    for (int i = 0; i < fields.size(); i++) {
        Type field = fields.get(i);
        // A missing optional field is encoded by not writing it at all,
        // so null values are simply skipped.
        if (record == null || record.getObject(field.getName()) == null) {
            continue;
        }
        recordConsumer.startField(field.getName(), i);
        if (field.isPrimitive()) {
            writePrimitive(record, field.asPrimitiveType());
        } else {
            throw new UnsupportedOperationException("Complex type not implemented");
        }
        recordConsumer.endField(field.getName(), i);
    }
    recordConsumer.endMessage();
}
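To make the RecordConsumer protocol above concrete, here is a minimal sketch of the call sequence write would produce for a record with two non-null primitive fields. The field names and values are illustrative, and the addBinary/addInteger calls stand in for what writePrimitive would plausibly emit:

import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;

void writeExample(RecordConsumer recordConsumer) {
    recordConsumer.startMessage();
    recordConsumer.startField("name", 0);
    recordConsumer.addBinary(Binary.fromString("alice")); // assumed string emission
    recordConsumer.endField("name", 0);
    recordConsumer.startField("age", 1);
    recordConsumer.addInteger(42); // assumed int32 emission
    recordConsumer.endField("age", 1);
    recordConsumer.endMessage();
}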
Use of org.apache.parquet.schema.Type in project parquet-mr by apache.
The class DataWritableReadSupport, method init:
/**
 * Creates the readContext for the Parquet side with the requested schema during the init phase.
 *
 * @param configuration needed to get the wanted columns
 * @param keyValueMetaData unused
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration, final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
    final String columns = configuration.get(IOConstants.COLUMNS);
    final Map<String, String> contextMetadata = new HashMap<String, String>();
    if (columns != null) {
        final List<String> listColumns = getColumns(columns);
        final List<Type> typeListTable = new ArrayList<Type>();
        for (final String col : listColumns) {
            // listColumns contains partition columns, which are metadata only
            if (fileSchema.containsField(col)) {
                typeListTable.add(fileSchema.getType(col));
            } else {
                // allows schema evolution: columns absent from the file become optional binary
                typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
            }
        }
        MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
        contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());
        MessageType requestedSchemaByUser = tableSchema;
        final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
        final List<Type> typeListWanted = new ArrayList<Type>();
        for (final Integer idx : indexColumnsWanted) {
            typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
        }
        requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(), typeListWanted), fileSchema, configuration);
        return new ReadContext(requestedSchemaByUser, contextMetadata);
    } else {
        contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
        return new ReadContext(fileSchema, contextMetadata);
    }
}
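A minimal sketch of the projection idea at the heart of init: build a requested schema from a subset of the file schema's fields, preserving the schema name. The class name, schema string, and column names are illustrative:

import java.util.Arrays;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ProjectionDemo {
    public static void main(String[] args) {
        MessageType fileSchema = MessageTypeParser.parseMessageType(
            "message hive_schema { optional binary name; optional int32 age; optional binary city; }");
        // Keep only the wanted columns, as init does when building requestedSchemaByUser.
        MessageType requested = new MessageType(fileSchema.getName(),
            Arrays.asList(fileSchema.getType("name"), fileSchema.getType("city")));
        System.out.println(requested);
    }
}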
Use of org.apache.parquet.schema.Type in project parquet-mr by apache.
The class HiveSchemaConverter, method convertMapType:
// Builds an optional group containing a repeated anonymous group "map" with
// two fields: "key" and "value".
private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo) {
    final Type keyType = convertType(ParquetHiveSerDe.MAP_KEY.toString(), typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED);
    final Type valueType = convertType(ParquetHiveSerDe.MAP_VALUE.toString(), typeInfo.getMapValueTypeInfo());
    return ConversionPatterns.mapType(Repetition.OPTIONAL, name, keyType, valueType);
}
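A minimal sketch of the shape ConversionPatterns.mapType produces for a map<string,int>. The group name "my_map" and the demo class are illustrative; "key"/"value" mirror ParquetHiveSerDe.MAP_KEY and MAP_VALUE:

import org.apache.parquet.schema.ConversionPatterns;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Type.Repetition;

public class MapTypeDemo {
    public static void main(String[] args) {
        Type key = new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.BINARY, "key");
        Type value = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT32, "value");
        // Prints the optional group wrapping a repeated inner "map" group that
        // holds "key" and "value", as described in the comment above.
        GroupType map = ConversionPatterns.mapType(Repetition.OPTIONAL, "my_map", key, value);
        System.out.println(map);
    }
}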
Use of org.apache.parquet.schema.Type in project drill by apache.
The class ParquetReaderUtility, method containsComplexColumn:
/**
 * Checks whether any of the columns in the given list is either nested or repeated.
 *
 * @param footer Parquet file footer (metadata)
 * @param columns list of query SchemaPath objects
 * @return true if any column is nested or repeated, false otherwise
 */
public static boolean containsComplexColumn(ParquetMetadata footer, List<SchemaPath> columns) {
    MessageType schema = footer.getFileMetaData().getSchema();
    if (Utilities.isStarQuery(columns)) {
        for (Type type : schema.getFields()) {
            if (!type.isPrimitive()) {
                return true;
            }
        }
        for (ColumnDescriptor col : schema.getColumns()) {
            if (col.getMaxRepetitionLevel() > 0) {
                return true;
            }
        }
        return false;
    } else {
        Map<String, ColumnDescriptor> colDescMap = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
        Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
        for (SchemaPath schemaPath : columns) {
            // A non-leaf schema path is a complex column
            if (!schemaPath.isLeaf()) {
                logger.trace("rowGroupScan contains complex column: {}", schemaPath.getUnIndexed().toString());
                return true;
            }
            // A failed column descriptor lookup can mean two things, depending on the
            // subsequent SchemaElement lookup:
            // 1. success: the queried column is complex, i.e. a GroupType
            // 2. failure: the queried column is not in the schema and thus is non-complex
            ColumnDescriptor column = colDescMap.get(schemaPath.getUnIndexed().toString().toLowerCase());
            if (column == null) {
                SchemaElement schemaElement = schemaElements.get(schemaPath.getUnIndexed().toString().toLowerCase());
                if (schemaElement != null) {
                    return true;
                }
            } else {
                if (column.getMaxRepetitionLevel() > 0) {
                    logger.trace("rowGroupScan contains repetitive column: {}", schemaPath.getUnIndexed().toString());
                    return true;
                }
            }
        }
    }
    return false;
}
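A minimal sketch of why the star-query branch needs its second check: a repeated primitive field keeps every Type in the schema primitive, yet its column still has a max repetition level above zero. The schema string and class name are illustrative:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class RepetitionDemo {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
            "message m { repeated int32 ids; }");
        for (ColumnDescriptor col : schema.getColumns()) {
            // Prints 1: the repeated column trips the ColumnDescriptor check
            // even though every field in the schema is primitive.
            System.out.println(col.getMaxRepetitionLevel());
        }
    }
}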