Use of org.apache.parquet.schema.Type in project druid by druid-io.
From the class DruidParquetAvroReadSupport, method getPartialReadSchema.
/**
 * Builds the projection of the parquet file schema that the ingestion job actually needs.
 *
 * <p>If the parse spec is an {@code AvroParseSpec} carrying a non-null flatten spec, the full
 * file schema is returned untouched. Otherwise a top-level field is kept when it is the
 * timestamp column, is required by one of the aggregators, or is one of the listed dimensions;
 * when no dimensions are listed at all, every field is kept.
 *
 * @param context The context of the file to be read
 *
 * @return a schema with the same name as the file schema, holding only the selected fields
 */
private MessageType getPartialReadSchema(InitContext context) {
  final MessageType fileSchema = context.getFileSchema();
  final HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  final ParseSpec parseSpec = config.getParser().getParseSpec();

  // A flatten spec is present: do not trim anything, hand back the complete schema.
  if (parseSpec instanceof AvroParseSpec && ((AvroParseSpec) parseSpec).getFlattenSpec() != null) {
    return fileSchema;
  }

  final String timestampColumn = parseSpec.getTimestampSpec().getTimestampColumn();

  final Set<String> dimensionNames = new HashSet<>();
  for (DimensionSchema dimension : parseSpec.getDimensionsSpec().getDimensions()) {
    dimensionNames.add(dimension.getName());
  }

  final Set<String> aggregatorInputs = new HashSet<>();
  for (AggregatorFactory aggregator : config.getSchema().getDataSchema().getAggregators()) {
    aggregatorInputs.addAll(aggregator.requiredFields());
  }

  // An empty dimension list means "no explicit selection", so every column is retained.
  final boolean keepEverything = dimensionNames.isEmpty();

  final List<Type> selected = new ArrayList<>();
  for (Type field : fileSchema.getFields()) {
    final String fieldName = field.getName();
    final boolean wanted = timestampColumn.equals(fieldName)
                           || aggregatorInputs.contains(fieldName)
                           || keepEverything
                           || dimensionNames.contains(fieldName);
    if (wanted) {
      selected.add(field);
    }
  }
  return new MessageType(fileSchema.getName(), selected);
}
Use of org.apache.parquet.schema.Type in project druid by druid-io.
From the class ParquetGroupConverter, method convertField.
/**
 * See {@link ParquetGroupConverter#convertField(Group, String)}
 *
 * <p>Returns {@code null} when the group's type has no field called {@code fieldName} or when
 * the field has no values in this group. Repeated fields become a list, non-repeated primitives
 * are converted directly, logical map/list groups are converted through their dedicated helpers,
 * and any other group is returned as-is.
 */
@Nullable
private static Object convertField(Group g, String fieldName, boolean binaryAsString) {
  if (!g.getType().containsField(fieldName)) {
    return null;
  }

  final int idx = g.getType().getFieldIndex(fieldName);
  if (g.getFieldRepetitionCount(idx) <= 0) {
    return null;
  }

  final Type type = g.getType().getFields().get(idx);

  // Repeated fields, primitive or group, are gathered element by element into a list.
  if (type.isRepetition(Type.Repetition.REPEATED)) {
    return convertRepeatedFieldToList(g, idx, binaryAsString);
  }

  // Single primitive value.
  if (type.isPrimitive()) {
    return convertPrimitiveField(g, idx, binaryAsString);
  }

  if (isLogicalMapType(type)) {
    return convertLogicalMap(g.getGroup(idx, 0), binaryAsString);
  }

  if (isLogicalListType(type)) {
    return convertLogicalList(g.getGroup(idx, 0), binaryAsString);
  }

  // Neither a list nor a primitive: hand back the nested group itself.
  return g.getGroup(idx, 0);
}
Use of org.apache.parquet.schema.Type in project druid by druid-io.
From the class ParquetGroupConverter, method convertRepeatedFieldToList.
/**
 * Collects every occurrence of a repeated field into a list.
 *
 * <p>Primitive occurrences are run through {@code convertPrimitiveField}; group occurrences are
 * added to the list unconverted. The field at {@code fieldIndex} is asserted to be repeated.
 */
private static List<Object> convertRepeatedFieldToList(Group g, int fieldIndex, boolean binaryAsString) {
  final Type fieldType = g.getType().getFields().get(fieldIndex);
  assert fieldType.getRepetition().equals(Type.Repetition.REPEATED);

  final int occurrences = g.getFieldRepetitionCount(fieldIndex);
  // The field's kind does not change between occurrences, so decide once up front.
  final boolean primitive = fieldType.isPrimitive();

  final List<Object> converted = new ArrayList<>();
  for (int pos = 0; pos < occurrences; pos++) {
    final Object element = primitive
                           ? convertPrimitiveField(g, fieldIndex, pos, binaryAsString)
                           : g.getGroup(fieldIndex, pos);
    converted.add(element);
  }
  return converted;
}
Aggregations