Search in sources :

Example 1 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project drill by apache.

the class Metadata method getColTypeInfo.

/**
 * Resolves the type information of the column addressed by {@code path}.
 *
 * <p>Starting from {@code type}, each non-primitive type is cast to a group and the child named
 * {@code path[depth]}, {@code path[depth + 1]}, ... is looked up until a primitive type is reached.
 * The result combines that primitive's original type and decimal precision/scale (both 0 when the
 * primitive carries no decimal metadata) with the maximum repetition and definition levels that
 * {@code schema} reports for {@code path}.
 *
 * @param schema schema queried for the maximum repetition and definition levels of {@code path}
 * @param type   type at which the descent starts
 * @param path   column path; one segment is consumed per group level
 * @param depth  index into {@code path} of the segment to look up in {@code type}
 * @return the type information of the primitive column found at the end of the path
 */
private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, int depth) {
    Type current = type;
    int level = depth;
    // Descend through the nested groups, one path segment per level, until a leaf is found.
    while (!current.isPrimitive()) {
        current = ((GroupType) current).getType(path[level]);
        level++;
    }

    final PrimitiveType leaf = (PrimitiveType) current;
    // Precision and scale stay at 0 for columns without decimal metadata.
    final boolean hasDecimalMetadata = leaf.getDecimalMetadata() != null;
    final int precision = hasDecimalMetadata ? leaf.getDecimalMetadata().getPrecision() : 0;
    final int scale = hasDecimalMetadata ? leaf.getDecimalMetadata().getScale() : 0;

    final int maxRepetition = schema.getMaxRepetitionLevel(path);
    final int maxDefinition = schema.getMaxDefinitionLevel(path);
    return new ColTypeInfo(current.getOriginalType(), precision, scale, maxRepetition, maxDefinition);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType)

Example 2 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project drill by apache.

the class ParquetMetaStatCollector method collectColStat.

/**
 * Collects column statistics for the requested fields.
 *
 * <p>For a field that has an entry in {@code columnMetadataList}, the statistics are built by
 * {@code getStat} from the column's min/max values, null count, primitive type, original type and
 * repetition level. For a field without such an entry whose root segment names an implicit column,
 * a required VARCHAR statistic is produced with zero nulls and the implicit value as both min and
 * max. Fields matching neither case are left out of the result.
 *
 * @param fields the columns for which statistics are requested
 * @return a map from each resolvable field to its statistics; never {@code null}
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    // map from column to ColumnMetadata
    final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
    // map from column name to column statistics.
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
        SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
        columnMetadataMap.put(schemaPath, columnMetadata);
    }
    for (final SchemaPath schemaPath : fields) {
        final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(schemaPath);
        if (columnMetadata != null) {
            final Object min = columnMetadata.getMinValue();
            final Object max = columnMetadata.getMaxValue();
            final Long numNull = columnMetadata.getNulls();
            final PrimitiveType.PrimitiveTypeName primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
            final OriginalType originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
            final Integer repetitionLevel = this.parquetTableMetadata.getRepetitionLevel(columnMetadata.getName());
            statMap.put(schemaPath, getStat(min, max, numNull, primitiveType, originalType, repetitionLevel));
        } else {
            final String columnName = schemaPath.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                // Encode explicitly as UTF-8 so the min/max bytes do not depend on the
                // platform default charset of the JVM running this code.
                byte[] val = implicitColValues.get(columnName).getBytes(java.nio.charset.StandardCharsets.UTF_8);
                // An implicit column holds a single value, so it is both the min and the max.
                stat.setMinMaxFromBytes(val, val);
                statMap.put(schemaPath, new ColumnStatistics(stat, type));
            }
        }
    }
    // The guard avoids reading the timer when debug logging is disabled.
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to collect column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used : HashMap(java.util.HashMap) Stopwatch(com.google.common.base.Stopwatch) Metadata(org.apache.drill.exec.store.parquet.Metadata) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) OriginalType(org.apache.parquet.schema.OriginalType) SchemaPath(org.apache.drill.common.expression.SchemaPath) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Example 3 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

the class TestParquetFileWriter method testMergeMetadata.

/**
 * Merges the file metadata of two files with different schemas and checks the merged schema:
 * it is expected to be named after the first schema ("root1") and to contain the fields of
 * both inputs ("a", "b" from the first file, "c" from the second).
 */
@Test
public void testMergeMetadata() {
    FileMetaData md1 = new FileMetaData(new MessageType("root1", new PrimitiveType(REPEATED, BINARY, "a"), new PrimitiveType(OPTIONAL, BINARY, "b")), new HashMap<String, String>(), "test");
    FileMetaData md2 = new FileMetaData(new MessageType("root2", new PrimitiveType(REQUIRED, BINARY, "c")), new HashMap<String, String>(), "test2");
    GlobalMetaData merged = ParquetFileWriter.mergeInto(md2, ParquetFileWriter.mergeInto(md1, null));
    MessageType expectedSchema = new MessageType("root1", new PrimitiveType(REPEATED, BINARY, "a"), new PrimitiveType(OPTIONAL, BINARY, "b"), new PrimitiveType(REQUIRED, BINARY, "c"));
    // assertEquals takes (expected, actual); passing them in that order keeps the
    // failure message ("expected:<...> but was:<...>") truthful.
    assertEquals(expectedSchema, merged.getSchema());
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 4 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

the class TestParquetFileWriter method testMergeFooters.

/**
 * Merges the footers of two files and checks the result: the merged schema is expected to be
 * "root1" with the fields of both inputs, and the merged metadata is expected to list the blocks
 * of the first footer followed by the blocks of the second.
 */
@Test
public void testMergeFooters() {
    // Block lists: two blocks for the first file, one for the second.
    List<BlockMetaData> firstBlocks = new ArrayList<BlockMetaData>();
    firstBlocks.add(new BlockMetaData());
    firstBlocks.add(new BlockMetaData());
    List<BlockMetaData> secondBlocks = new ArrayList<BlockMetaData>();
    secondBlocks.add(new BlockMetaData());

    // Expected blocks are the very same instances, first file's blocks first.
    List<BlockMetaData> allBlocks = new ArrayList<BlockMetaData>();
    allBlocks.addAll(firstBlocks);
    allBlocks.addAll(secondBlocks);

    MessageType firstSchema = new MessageType("root1", new PrimitiveType(REPEATED, BINARY, "a"), new PrimitiveType(OPTIONAL, BINARY, "b"));
    FileMetaData firstFileMetaData = new FileMetaData(firstSchema, new HashMap<String, String>(), "test");
    Footer firstFooter = new Footer(new Path("file:/tmp/output/one.parquet"), new ParquetMetadata(firstFileMetaData, firstBlocks));

    MessageType secondSchema = new MessageType("root2", new PrimitiveType(REQUIRED, BINARY, "c"));
    FileMetaData secondFileMetaData = new FileMetaData(secondSchema, new HashMap<String, String>(), "test2");
    Footer secondFooter = new Footer(new Path("/tmp/output/two.parquet"), new ParquetMetadata(secondFileMetaData, secondBlocks));

    List<Footer> footers = new ArrayList<Footer>();
    footers.add(firstFooter);
    footers.add(secondFooter);

    ParquetMetadata merged = ParquetFileWriter.mergeFooters(new Path("/tmp"), footers);

    MessageType expectedSchema = new MessageType("root1", new PrimitiveType(REPEATED, BINARY, "a"), new PrimitiveType(OPTIONAL, BINARY, "b"), new PrimitiveType(REQUIRED, BINARY, "c"));
    assertEquals(expectedSchema, merged.getFileMetaData().getSchema());
    assertEquals("Should have all blocks", allBlocks, merged.getBlocks());
}
Also used : Path(org.apache.hadoop.fs.Path) PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 5 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

the class AvroSchemaConverter method convertField.

/**
 * Converts a single Avro field into the corresponding Parquet type.
 *
 * <p>Records, arrays, maps and unions are converted to nested types and returned directly.
 * All other supported Avro types are mapped to a primitive builder, which is then annotated
 * according to the schema's logical type (if any) and named {@code fieldName}.
 *
 * @param fieldName  name given to the resulting Parquet type
 * @param schema     Avro schema of the field
 * @param repetition repetition applied to the resulting Parquet type
 * @return the Parquet type equivalent to {@code schema}
 * @throws UnsupportedOperationException if the Avro type has no conversion
 */
@SuppressWarnings("deprecation")
private Type convertField(String fieldName, Schema schema, Type.Repetition repetition) {
    final Schema.Type avroType = schema.getType();
    Types.PrimitiveBuilder<PrimitiveType> builder;
    switch (avroType) {
        case BOOLEAN:
            builder = Types.primitive(BOOLEAN, repetition);
            break;
        case INT:
            builder = Types.primitive(INT32, repetition);
            break;
        case LONG:
            builder = Types.primitive(INT64, repetition);
            break;
        case FLOAT:
            builder = Types.primitive(FLOAT, repetition);
            break;
        case DOUBLE:
            builder = Types.primitive(DOUBLE, repetition);
            break;
        case BYTES:
            builder = Types.primitive(BINARY, repetition);
            break;
        case STRING:
            builder = Types.primitive(BINARY, repetition).as(UTF8);
            break;
        case ENUM:
            builder = Types.primitive(BINARY, repetition).as(ENUM);
            break;
        case FIXED:
            builder = Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(schema.getFixedSize());
            break;
        case RECORD:
            return new GroupType(repetition, fieldName, convertFields(schema.getFields()));
        case ARRAY:
            // Two list layouts exist; the writeOldListStructure flag selects which one is produced.
            if (writeOldListStructure) {
                return ConversionPatterns.listType(repetition, fieldName, convertField("array", schema.getElementType(), REPEATED));
            }
            return ConversionPatterns.listOfElements(repetition, fieldName, convertField(AvroWriteSupport.LIST_ELEMENT_NAME, schema.getElementType()));
        case MAP: {
            Type valueType = convertField("value", schema.getValueType());
            // avro map key type is always string
            return ConversionPatterns.stringKeyMapType(repetition, fieldName, valueType);
        }
        case UNION:
            return convertUnion(fieldName, schema, repetition);
        default:
            throw new UnsupportedOperationException("Cannot convert Avro type " + avroType);
    }

    // schema translation can only be done for known logical types because this
    // creates an equivalence
    final LogicalType logicalType = schema.getLogicalType();
    if (logicalType instanceof LogicalTypes.Decimal) {
        final LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType;
        builder = builder.as(DECIMAL).precision(decimal.getPrecision()).scale(decimal.getScale());
    } else if (logicalType != null) {
        final OriginalType annotation = convertLogicalType(logicalType);
        if (annotation != null) {
            // NOTE(review): the returned builder is discarded, so this relies on as()
            // configuring the builder in place - confirm against the Types builder API.
            builder.as(annotation);
        }
    }
    return builder.named(fieldName);
}
Also used : Types(org.apache.parquet.schema.Types) LogicalTypes(org.apache.avro.LogicalTypes) OriginalType(org.apache.parquet.schema.OriginalType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) LogicalType(org.apache.avro.LogicalType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) Schema(org.apache.avro.Schema)

Aggregations

PrimitiveType (org.apache.parquet.schema.PrimitiveType)102 Test (org.junit.Test)52 MessageType (org.apache.parquet.schema.MessageType)31 HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable)28 BooleanWritable (org.apache.hadoop.io.BooleanWritable)28 BytesWritable (org.apache.hadoop.io.BytesWritable)28 DoubleWritable (org.apache.hadoop.io.DoubleWritable)28 FloatWritable (org.apache.hadoop.io.FloatWritable)28 IntWritable (org.apache.hadoop.io.IntWritable)28 LongWritable (org.apache.hadoop.io.LongWritable)28 Writable (org.apache.hadoop.io.Writable)28 Test (org.testng.annotations.Test)20 GroupType (org.apache.parquet.schema.GroupType)19 OriginalType (org.apache.parquet.schema.OriginalType)15 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)13 List (java.util.List)11 ArrayList (java.util.ArrayList)10 ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex)10 ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder)10 Type (org.apache.parquet.schema.Type)10