use of org.apache.parquet.schema.Type in project presto by prestodb.
the class TestDataWritableWriter method writeMap.
/**
 * It writes a map type and its key-pair values to the Parquet RecordConsumer.
 * This is called when the original type (MAP) is detected by writeValue().
 * This function assumes the following schema:
 *    optional group mapCol (MAP) {
 *      repeated group map (MAP_KEY_VALUE) {
 *        required TYPE key;
 *        optional TYPE value;
 *      }
 *    }
 *
 * @param value The object that contains the map key-values.
 * @param inspector The object inspector used to get the correct value type.
 * @param type Type that contains information about the group (MAP) schema.
 */
private void writeMap(final Object value, final MapObjectInspector inspector, final GroupType type) {
    // Get the internal map structure (MAP_KEY_VALUE)
    GroupType repeatedType = type.getType(0).asGroupType();
    recordConsumer.startGroup();
    Map<?, ?> mapValues = inspector.getMap(value);
    if (mapValues != null && mapValues.size() > 0) {
        recordConsumer.startField(repeatedType.getName(), 0);
        Type keyType = repeatedType.getType(0);
        String keyName = keyType.getName();
        ObjectInspector keyInspector = inspector.getMapKeyObjectInspector();
        Type valuetype = repeatedType.getType(1);
        String valueName = valuetype.getName();
        ObjectInspector valueInspector = inspector.getMapValueObjectInspector();
        for (Map.Entry<?, ?> keyValue : mapValues.entrySet()) {
            recordConsumer.startGroup();
            if (keyValue != null) {
                // write key element
                Object keyElement = keyValue.getKey();
                recordConsumer.startField(keyName, 0);
                writeValue(keyElement, keyInspector, keyType);
                recordConsumer.endField(keyName, 0);
                // write value element
                Object valueElement = keyValue.getValue();
                if (valueElement != null) {
                    recordConsumer.startField(valueName, 1);
                    writeValue(valueElement, valueInspector, valuetype);
                    recordConsumer.endField(valueName, 1);
                }
            }
            recordConsumer.endGroup();
        }
        recordConsumer.endField(repeatedType.getName(), 0);
    }
    recordConsumer.endGroup();
}
use of org.apache.parquet.schema.Type in project flink by apache.
the class ParquetVectorizedInputFormat method clipParquetSchema.
/**
 * Clips `parquetSchema` according to the projected fields.
 */
private MessageType clipParquetSchema(GroupType parquetSchema) {
    Type[] types = new Type[projectedFields.length];
    if (isCaseSensitive) {
        for (int i = 0; i < projectedFields.length; ++i) {
            String fieldName = projectedFields[i];
            if (!parquetSchema.containsField(fieldName)) {
                LOG.warn("{} does not exist in {}, will fill the field with null.", fieldName, parquetSchema);
                types[i] = ParquetSchemaConverter.convertToParquetType(fieldName, projectedTypes[i]);
                unknownFieldsIndices.add(i);
            } else {
                types[i] = parquetSchema.getType(fieldName);
            }
        }
    } else {
        Map<String, Type> caseInsensitiveFieldMap = new HashMap<>();
        for (Type type : parquetSchema.getFields()) {
            caseInsensitiveFieldMap.compute(type.getName().toLowerCase(Locale.ROOT), (key, previousType) -> {
                if (previousType != null) {
                    throw new FlinkRuntimeException("Parquet with case insensitive mode should have no duplicate key: " + key);
                }
                return type;
            });
        }
        for (int i = 0; i < projectedFields.length; ++i) {
            Type type = caseInsensitiveFieldMap.get(projectedFields[i].toLowerCase(Locale.ROOT));
            if (type == null) {
                LOG.warn("{} does not exist in {}, will fill the field with null.", projectedFields[i], parquetSchema);
                type = ParquetSchemaConverter.convertToParquetType(projectedFields[i].toLowerCase(Locale.ROOT), projectedTypes[i]);
                unknownFieldsIndices.add(i);
            }
            // TODO clip for array,map,row types.
            types[i] = type;
        }
    }
    return Types.buildMessage().addFields(types).named("flink-parquet");
}
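To illustrate the clipping pattern in isolation, here is a small self-contained sketch of the case-sensitive path. The file schema, column names, and projection are invented for the example, and the Flink-specific fallback (converting missing fields and tracking unknownFieldsIndices) is left out.

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class ClipSchemaSketch {
    public static void main(String[] args) {
        // A hypothetical file schema with three columns.
        MessageType fileSchema = Types.buildMessage()
                .addField(Types.required(PrimitiveTypeName.INT64).named("id"))
                .addField(Types.optional(PrimitiveTypeName.BINARY).named("name"))
                .addField(Types.optional(PrimitiveTypeName.DOUBLE).named("score"))
                .named("file-schema");

        // Project only "score" and "id", preserving the requested order.
        String[] projectedFields = {"score", "id"};
        Type[] types = new Type[projectedFields.length];
        for (int i = 0; i < projectedFields.length; i++) {
            types[i] = fileSchema.getType(projectedFields[i]);
        }

        // Wrap the selected fields in a new message type, as the Flink method does.
        MessageType clipped = Types.buildMessage().addFields(types).named("flink-parquet");
        System.out.println(clipped);
    }
}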
use of org.apache.parquet.schema.Type in project flink by apache.
the class ParquetColumnarRowSplitReader method checkSchema.
private void checkSchema() throws IOException, UnsupportedOperationException {
    if (selectedTypes.length != requestedSchema.getFieldCount()) {
        throw new RuntimeException("The quality of field type is incompatible with the request schema!");
    }
    /*
     * Check that the requested schema is supported.
     */
    for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
        Type t = requestedSchema.getFields().get(i);
        if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
            throw new UnsupportedOperationException("Complex types not supported.");
        }
        String[] colPath = requestedSchema.getPaths().get(i);
        if (fileSchema.containsPath(colPath)) {
            ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
            if (!fd.equals(requestedSchema.getColumns().get(i))) {
                throw new UnsupportedOperationException("Schema evolution not supported.");
            }
        } else {
            if (requestedSchema.getColumns().get(i).getMaxDefinitionLevel() == 0) {
                // invalid.
                throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath));
            }
        }
    }
}
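The checks above boil down to a handful of org.apache.parquet.schema.Type and MessageType calls. The following standalone sketch runs the same kind of inspection over two hypothetical schemas; the column names are invented, and the reader-specific error handling is replaced by simple boolean flags.

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class SchemaCheckSketch {
    public static void main(String[] args) {
        MessageType fileSchema = Types.buildMessage()
                .addField(Types.required(PrimitiveTypeName.INT64).named("id"))
                .addField(Types.optional(PrimitiveTypeName.BINARY).named("name"))
                .named("file-schema");

        // Request a subset of the file columns.
        MessageType requestedSchema = Types.buildMessage()
                .addField(fileSchema.getType("id"))
                .named("requested");

        for (int i = 0; i < requestedSchema.getFieldCount(); i++) {
            Type t = requestedSchema.getFields().get(i);
            // Only flat, non-repeated primitive columns pass the reader's first check.
            boolean flatPrimitive = t.isPrimitive() && !t.isRepetition(Type.Repetition.REPEATED);

            // The column descriptor in the file must match the requested one,
            // otherwise the snippet above treats it as unsupported schema evolution.
            String[] colPath = requestedSchema.getPaths().get(i);
            ColumnDescriptor fileColumn = fileSchema.getColumnDescription(colPath);
            boolean sameDescriptor = fileColumn.equals(requestedSchema.getColumns().get(i));

            System.out.println(t.getName() + ": flatPrimitive=" + flatPrimitive + ", sameDescriptor=" + sameDescriptor);
        }
    }
}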
use of org.apache.parquet.schema.Type in project flink by apache.
the class ParquetColumnarRowSplitReader method clipParquetSchema.
/**
* Clips `parquetSchema` according to `fieldNames`.
*/
private static MessageType clipParquetSchema(GroupType parquetSchema, String[] fieldNames, boolean caseSensitive) {
    Type[] types = new Type[fieldNames.length];
    if (caseSensitive) {
        for (int i = 0; i < fieldNames.length; ++i) {
            String fieldName = fieldNames[i];
            if (parquetSchema.getFieldIndex(fieldName) < 0) {
                throw new IllegalArgumentException(fieldName + " does not exist");
            }
            types[i] = parquetSchema.getType(fieldName);
        }
    } else {
        Map<String, Type> caseInsensitiveFieldMap = new HashMap<>();
        for (Type type : parquetSchema.getFields()) {
            caseInsensitiveFieldMap.compute(type.getName().toLowerCase(Locale.ROOT), (key, previousType) -> {
                if (previousType != null) {
                    throw new FlinkRuntimeException("Parquet with case insensitive mode should have no duplicate key: " + key);
                }
                return type;
            });
        }
        for (int i = 0; i < fieldNames.length; ++i) {
            Type type = caseInsensitiveFieldMap.get(fieldNames[i].toLowerCase(Locale.ROOT));
            if (type == null) {
                throw new IllegalArgumentException(fieldNames[i] + " does not exist");
            }
            // TODO clip for array,map,row types.
            types[i] = type;
        }
    }
    return Types.buildMessage().addFields(types).named("flink-parquet");
}
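The case-insensitive branch hinges on indexing the file's fields by lower-cased name. A minimal sketch of that lookup, using an invented file schema and field name, might look like the following; the duplicate-key guard from the Flink code is reduced to a comment.

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class CaseInsensitiveLookupSketch {
    public static void main(String[] args) {
        // Hypothetical file schema whose field name uses mixed case.
        MessageType parquetSchema = Types.buildMessage()
                .addField(Types.optional(PrimitiveTypeName.BINARY).named("UserName"))
                .named("file-schema");

        // Index the fields by lower-cased name, as the reader does in case-insensitive mode.
        // (The Flink code additionally rejects schemas where two fields collapse to the same key.)
        Map<String, Type> caseInsensitiveFieldMap = new HashMap<>();
        for (Type type : parquetSchema.getFields()) {
            caseInsensitiveFieldMap.put(type.getName().toLowerCase(Locale.ROOT), type);
        }

        // A projection that asks for "username" still resolves to the "UserName" field.
        Type resolved = caseInsensitiveFieldMap.get("username");
        System.out.println(resolved);
    }
}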
use of org.apache.parquet.schema.Type in project druid by druid-io.
the class DruidParquetReadSupport method getPartialReadSchema.
/**
 * Select the columns from the parquet schema that are used in the schema of the ingestion job
 *
 * @param context The context of the file to be read
 *
 * @return the partial schema that only contains the columns that are being used in the schema
 */
private MessageType getPartialReadSchema(InitContext context) {
    MessageType fullSchema = context.getFileSchema();
    String name = fullSchema.getName();
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
    ParseSpec parseSpec = config.getParser().getParseSpec();
    // parse the flatten spec and determine it isn't auto discovering props?
    if (parseSpec instanceof ParquetParseSpec) {
        if (((ParquetParseSpec) parseSpec).getFlattenSpec() != null) {
            return fullSchema;
        }
    }
    String tsField = parseSpec.getTimestampSpec().getTimestampColumn();
    List<DimensionSchema> dimensionSchema = parseSpec.getDimensionsSpec().getDimensions();
    Set<String> dimensions = new HashSet<>();
    for (DimensionSchema dim : dimensionSchema) {
        dimensions.add(dim.getName());
    }
    Set<String> metricsFields = new HashSet<>();
    for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
        metricsFields.addAll(agg.requiredFields());
    }
    List<Type> partialFields = new ArrayList<>();
    for (Type type : fullSchema.getFields()) {
        if (tsField.equals(type.getName())
                || metricsFields.contains(type.getName())
                || dimensions.size() > 0 && dimensions.contains(type.getName())
                || dimensions.size() == 0) {
            partialFields.add(type);
        }
    }
    return new MessageType(name, partialFields);
}
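Stripped of the Druid configuration plumbing, the core of the method is: walk the full schema's fields, keep the ones the job actually references, and rebuild a MessageType from that subset. The self-contained sketch below shows that pattern with invented column names and a fixed set of used columns; it does not reproduce Druid's special case of keeping all columns when no dimensions are configured.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class PartialSchemaSketch {
    public static void main(String[] args) {
        MessageType fullSchema = Types.buildMessage()
                .addField(Types.required(PrimitiveTypeName.INT64).named("timestamp"))
                .addField(Types.optional(PrimitiveTypeName.BINARY).named("page"))
                .addField(Types.optional(PrimitiveTypeName.BINARY).named("user"))
                .addField(Types.optional(PrimitiveTypeName.INT64).named("added"))
                .named("wikipedia");

        // Columns a hypothetical ingestion spec actually uses.
        String tsField = "timestamp";
        Set<String> dimensions = new HashSet<>(Arrays.asList("page"));
        Set<String> metricsFields = new HashSet<>(Arrays.asList("added"));

        List<Type> partialFields = new ArrayList<>();
        for (Type type : fullSchema.getFields()) {
            if (tsField.equals(type.getName())
                    || dimensions.contains(type.getName())
                    || metricsFields.contains(type.getName())) {
                partialFields.add(type);
            }
        }

        // Same constructor the Druid method uses: keep the original message name,
        // but only the selected fields ("user" is dropped).
        MessageType partialSchema = new MessageType(fullSchema.getName(), partialFields);
        System.out.println(partialSchema);
    }
}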