Use of org.apache.parquet.schema.Type in project hive by apache.
The class VectorizedParquetRecordReader, method buildVectorizedParquetReader:
// Build a VectorizedColumnReader from the Hive typeInfo and the Parquet schema type
private VectorizedColumnReader buildVectorizedParquetReader(TypeInfo typeInfo, Type type, PageReadStore pages,
    List<ColumnDescriptor> columnDescriptors, String conversionTimeZone, int depth) throws IOException {
  List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors);
  switch (typeInfo.getCategory()) {
    case PRIMITIVE:
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      } else {
        return new VectorizedPrimitiveColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)),
            conversionTimeZone, type);
      }
    case STRUCT:
      StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
      List<VectorizedColumnReader> fieldReaders = new ArrayList<>();
      List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
      List<Type> types = type.asGroupType().getFields();
      for (int i = 0; i < fieldTypes.size(); i++) {
        VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i), types.get(i), pages, descriptors,
            conversionTimeZone, depth + 1);
        if (r != null) {
          fieldReaders.add(r);
        } else {
          throw new RuntimeException("Failed to build Parquet vectorized reader for Hive type "
              + fieldTypes.get(i).getTypeName() + " and Parquet type " + types.get(i).toString());
        }
      }
      return new VectorizedStructColumnReader(fieldReaders);
    case LIST:
    case MAP:
    case UNION:
    default:
      throw new RuntimeException("Unsupported category " + typeInfo.getCategory().name());
  }
}
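The STRUCT case pairs each Hive struct field with the Parquet group field at the same position and recurses with depth + 1. Below is a minimal, standalone sketch of that positional descent using only the Parquet schema API; the schema string and the class name SchemaWalk are made up for illustration and are not part of the Hive reader.

import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.Type;

public class SchemaWalk {
  // Visit every field of a Parquet type, recursing into groups the same way
  // buildVectorizedParquetReader recurses into Hive STRUCT fields.
  static void walk(Type type, int depth) {
    StringBuilder indent = new StringBuilder();
    for (int i = 0; i < depth; i++) {
      indent.append("  ");
    }
    if (type.isPrimitive()) {
      System.out.println(indent + "leaf: " + type.getName());
      return;
    }
    GroupType group = type.asGroupType();
    System.out.println(indent + "group: " + group.getName());
    for (Type field : group.getFields()) {
      walk(field, depth + 1); // same depth + 1 recursion as the reader
    }
  }

  public static void main(String[] args) {
    // Hypothetical schema used only for this example.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message doc { required group s { required int32 a; optional binary b; } }");
    for (Type field : schema.getFields()) {
      walk(field, 0);
    }
  }
}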
Use of org.apache.parquet.schema.Type in project hive by apache.
The class HiveParquetSchemaTestUtils, method testConversion:
public static void testConversion(final String columnNamesStr, final String columnsTypeStr,
    final String actualSchema) throws Exception {
  final List<String> columnNames = createHiveColumnsFrom(columnNamesStr);
  final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(columnsTypeStr);
  final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);
  final MessageType expectedMT = MessageTypeParser.parseMessageType(actualSchema);
  assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + actualSchema,
      expectedMT, messageTypeFound);
  // Check the original types manually, because PrimitiveType.equals does not compare them.
  List<Type> expectedFields = expectedMT.getFields();
  List<Type> actualFields = messageTypeFound.getFields();
  for (int i = 0, n = expectedFields.size(); i < n; ++i) {
    OriginalType exp = expectedFields.get(i).getOriginalType();
    OriginalType act = actualFields.get(i).getOriginalType();
    assertEquals("Original types of the field do not match", exp, act);
  }
}
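A plausible call from a test case is sketched below. The expected Parquet schema string is an assumption about how HiveSchemaConverter renders a single int column (optional int32 under a root message named hive_schema); the exact string may differ between Hive versions.

// Hypothetical invocation; the expected schema string is an assumption, not copied from Hive's tests.
testConversion(
    "a",
    "int",
    "message hive_schema {\n  optional int32 a;\n}\n");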
Use of org.apache.parquet.schema.Type in project druid by druid-io.
The class DruidParquetReadSupport, method getPartialReadSchema:
private MessageType getPartialReadSchema(InitContext context) {
  MessageType fullSchema = context.getFileSchema();
  String name = fullSchema.getName();
  HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  String tsField = config.getParser().getParseSpec().getTimestampSpec().getTimestampColumn();
  List<DimensionSchema> dimensionSchema = config.getParser().getParseSpec().getDimensionsSpec().getDimensions();
  Set<String> dimensions = Sets.newHashSet();
  for (DimensionSchema dim : dimensionSchema) {
    dimensions.add(dim.getName());
  }
  Set<String> metricsFields = Sets.newHashSet();
  for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
    metricsFields.addAll(agg.requiredFields());
  }
  List<Type> partialFields = Lists.newArrayList();
  for (Type type : fullSchema.getFields()) {
    if (tsField.equals(type.getName()) || metricsFields.contains(type.getName())
        || (dimensions.size() > 0 && dimensions.contains(type.getName())) || dimensions.size() == 0) {
      partialFields.add(type);
    }
  }
  return new MessageType(name, partialFields);
}
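The pruning step at the end amounts to "keep the top-level fields whose names we care about and rebuild the MessageType with the same root name". A minimal sketch of that technique in isolation is shown below; SchemaPruner and its argument names are hypothetical, not part of Druid.

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

public final class SchemaPruner {
  // Keep only the top-level columns named in `wanted`, preserving field order.
  static MessageType prune(MessageType fullSchema, Set<String> wanted) {
    List<Type> kept = new ArrayList<>();
    for (Type field : fullSchema.getFields()) {
      if (wanted.contains(field.getName())) {
        kept.add(field);
      }
    }
    return new MessageType(fullSchema.getName(), kept);
  }
}

Unlike the Druid method, this sketch drops everything not listed; getPartialReadSchema additionally keeps the timestamp column, metric inputs, and falls back to all columns when no dimensions are configured.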
Use of org.apache.parquet.schema.Type in project drill by apache.
The class Metadata, method getColTypeInfo:
private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, int depth) {
  if (type.isPrimitive()) {
    PrimitiveType primitiveType = (PrimitiveType) type;
    int precision = 0;
    int scale = 0;
    if (primitiveType.getDecimalMetadata() != null) {
      precision = primitiveType.getDecimalMetadata().getPrecision();
      scale = primitiveType.getDecimalMetadata().getScale();
    }
    int repetitionLevel = schema.getMaxRepetitionLevel(path);
    int definitionLevel = schema.getMaxDefinitionLevel(path);
    return new ColTypeInfo(type.getOriginalType(), precision, scale, repetitionLevel, definitionLevel);
  }
  Type t = ((GroupType) type).getType(path[depth]);
  return getColTypeInfo(schema, t, path, depth + 1);
}
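A small self-contained sketch of the two things this method combines: resolving a column path to its primitive leaf one segment at a time, and asking the root MessageType for the path's max repetition and definition levels. The schema and class name ColLevelsDemo are made up for illustration.

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.PrimitiveType;

public class ColLevelsDemo {
  public static void main(String[] args) {
    // Hypothetical schema: an optional group containing a repeated int32 leaf.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message doc { optional group outer { repeated int32 inner; } }");
    String[] path = {"outer", "inner"};
    // Descend one path segment at a time, as getColTypeInfo does.
    PrimitiveType leaf = schema.getType("outer").asGroupType().getType("inner").asPrimitiveType();
    int maxRep = schema.getMaxRepetitionLevel(path); // 1: one repeated type on the path
    int maxDef = schema.getMaxDefinitionLevel(path); // 2: optional outer + repeated inner
    System.out.println(leaf.getPrimitiveTypeName() + " rep=" + maxRep + " def=" + maxDef);
  }
}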
Use of org.apache.parquet.schema.Type in project drill by apache.
The class DrillParquetReader, method getProjection:
public static MessageType getProjection(MessageType schema, Collection<SchemaPath> columns,
    List<SchemaPath> columnsNotFound) {
  MessageType projection = null;
  String messageName = schema.getName();
  List<ColumnDescriptor> schemaColumns = schema.getColumns();
  // Parquet's type.union() seems to lose ConvertedType info when merging two columns of the same type, which can
  // happen when selecting two elements from an array. To work around this, we collect a set of SchemaPath to avoid
  // duplicates and merge the types at the end.
  Set<SchemaPath> selectedSchemaPaths = Sets.newLinkedHashSet();
  // Build a list of modified columns with array elements removed from the schema path, since the Parquet schema
  // doesn't include array elements.
  List<SchemaPath> modifiedColumns = Lists.newLinkedList();
  for (SchemaPath path : columns) {
    List<String> segments = Lists.newArrayList();
    PathSegment seg = path.getRootSegment();
    do {
      if (seg.isNamed()) {
        segments.add(seg.getNameSegment().getPath());
      }
    } while ((seg = seg.getChild()) != null);
    String[] pathSegments = new String[segments.size()];
    segments.toArray(pathSegments);
    SchemaPath modifiedSchemaPath = SchemaPath.getCompoundPath(pathSegments);
    modifiedColumns.add(modifiedSchemaPath);
  }
  // Convert the columns in the Parquet schema to a list of SchemaPath columns so that they can be compared to the
  // projection columns in a case-insensitive manner.
  List<SchemaPath> schemaPaths = Lists.newLinkedList();
  for (ColumnDescriptor columnDescriptor : schemaColumns) {
    String[] schemaColDesc = Arrays.copyOf(columnDescriptor.getPath(), columnDescriptor.getPath().length);
    SchemaPath schemaPath = SchemaPath.getCompoundPath(schemaColDesc);
    schemaPaths.add(schemaPath);
  }
  // Loop through the projection columns and add any columns missing from the Parquet schema to the columnsNotFound list.
  for (SchemaPath columnPath : modifiedColumns) {
    boolean notFound = true;
    for (SchemaPath schemaPath : schemaPaths) {
      if (schemaPath.contains(columnPath)) {
        selectedSchemaPaths.add(schemaPath);
        notFound = false;
      }
    }
    if (notFound) {
      columnsNotFound.add(columnPath);
    }
  }
  // Convert each SchemaPath in selectedSchemaPaths to a Parquet type and merge it into the projection schema.
  for (SchemaPath schemaPath : selectedSchemaPaths) {
    List<String> segments = Lists.newArrayList();
    PathSegment seg = schemaPath.getRootSegment();
    do {
      segments.add(seg.getNameSegment().getPath());
    } while ((seg = seg.getChild()) != null);
    String[] pathSegments = new String[segments.size()];
    segments.toArray(pathSegments);
    Type t = getType(pathSegments, 0, schema);
    if (projection == null) {
      projection = new MessageType(messageName, t);
    } else {
      projection = projection.union(new MessageType(messageName, t));
    }
  }
  return projection;
}
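The final loop turns each selected column into a one-field MessageType with the original root name and unions it into the accumulated projection. The sketch below shows just that union step against a made-up schema and column list; ProjectionUnionDemo, the schema string, and the column names are assumptions for illustration only.

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ProjectionUnionDemo {
  public static void main(String[] args) {
    // Hypothetical file schema and selected columns, for illustration only.
    MessageType full = MessageTypeParser.parseMessageType(
        "message root { optional int64 id; optional binary name; optional double price; }");
    MessageType projection = null;
    for (String col : new String[] {"id", "price"}) {
      // Each selected column becomes a single-field message with the original root name.
      MessageType single = new MessageType(full.getName(), full.getType(col));
      projection = (projection == null) ? single : projection.union(single);
    }
    System.out.println(projection); // message root with only id and price
  }
}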