Search in sources :

Example 1 with OriginalType

use of org.apache.parquet.schema.OriginalType in project hive by apache.

Source: class HiveParquetSchemaTestUtils, method testConversion.

/**
 * Converts the given Hive column names/types to a Parquet {@link MessageType} and
 * asserts it equals the schema parsed from {@code actualSchema}. The per-field
 * original types are compared explicitly because PrimitiveType.equals ignores them.
 *
 * @param columnNamesStr comma-separated Hive column names
 * @param columnsTypeStr comma-separated Hive type strings
 * @param actualSchema   the Parquet schema text the conversion is expected to produce
 */
public static void testConversion(final String columnNamesStr, final String columnsTypeStr, final String actualSchema) throws Exception {
    final List<String> names = createHiveColumnsFrom(columnNamesStr);
    final List<TypeInfo> types = createHiveTypeInfoFrom(columnsTypeStr);
    final MessageType converted = HiveSchemaConverter.convert(names, types);
    final MessageType expected = MessageTypeParser.parseMessageType(actualSchema);
    assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + actualSchema, expected, converted);
    // PrimitiveType.equals does not consider the original type, so check it field by field.
    final List<Type> expectedFields = expected.getFields();
    final List<Type> convertedFields = converted.getFields();
    for (int idx = 0; idx < expectedFields.size(); idx++) {
        assertEquals("Original types of the field do not match",
                expectedFields.get(idx).getOriginalType(),
                convertedFields.get(idx).getOriginalType());
    }
}
Also used : OriginalType(org.apache.parquet.schema.OriginalType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MessageType(org.apache.parquet.schema.MessageType)

Example 2 with OriginalType

use of org.apache.parquet.schema.OriginalType in project drill by apache.

Source: class ParquetMetaStatCollector, method collectColStat.

/**
 * Builds per-column statistics for the requested fields of this row group.
 * <p>
 * For columns present in the footer metadata, the min/max/null-count plus the
 * Parquet primitive/original type and repetition level are folded into a
 * {@link ColumnStatistics}. For columns that are not in the metadata but are
 * known implicit columns, a synthetic required-VARCHAR statistic is fabricated
 * whose min and max are both the implicit value (zero nulls).
 *
 * @param fields the columns statistics are requested for
 * @return map from column path to its statistics; columns that are neither in
 *         the metadata nor implicit are simply absent from the map
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    // map from column to ColumnMetadata
    final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
    // map from column name to column statistics.
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
        SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
        columnMetadataMap.put(schemaPath, columnMetadata);
    }
    for (final SchemaPath schemaPath : fields) {
        final PrimitiveType.PrimitiveTypeName primitiveType;
        final OriginalType originalType;
        final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(schemaPath);
        if (columnMetadata != null) {
            final Object min = columnMetadata.getMinValue();
            final Object max = columnMetadata.getMaxValue();
            final Long numNull = columnMetadata.getNulls();
            primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
            originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
            final Integer repetitionLevel = this.parquetTableMetadata.getRepetitionLevel(columnMetadata.getName());
            statMap.put(schemaPath, getStat(min, max, numNull, primitiveType, originalType, repetitionLevel));
        } else {
            final String columnName = schemaPath.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                // Use an explicit charset: the no-arg getBytes() depends on the
                // platform default charset and is not deterministic across JVMs.
                byte[] val = implicitColValues.get(columnName).getBytes(java.nio.charset.StandardCharsets.UTF_8);
                stat.setMinMaxFromBytes(val, val);
                statMap.put(schemaPath, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to collect column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used : HashMap(java.util.HashMap) Stopwatch(com.google.common.base.Stopwatch) Metadata(org.apache.drill.exec.store.parquet.Metadata) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) OriginalType(org.apache.parquet.schema.OriginalType) SchemaPath(org.apache.drill.common.expression.SchemaPath) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Example 3 with OriginalType

use of org.apache.parquet.schema.OriginalType in project drill by apache.

Source: class ParquetReaderUtility, method correctDatesInMetadataCache.

/**
 * Corrects corrupted DATE min/max statistics in a parquet metadata cache file.
 * <p>
 * v3 cache files are trusted (META_SHOWS_NO_CORRUPTION) and left untouched.
 * For older (v1/v2) cache files the DATE columns' single-value statistics are
 * auto-corrected in place when the stored value exceeds
 * {@link ParquetReaderUtility#DATE_CORRUPTION_THRESHOLD}.
 *
 * @param parquetTableMetadata the cache-file metadata to fix up in place
 */
public static void correctDatesInMetadataCache(Metadata.ParquetTableMetadataBase parquetTableMetadata) {
    DateCorruptionStatus cacheFileCanContainsCorruptDates = parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v3 ? DateCorruptionStatus.META_SHOWS_NO_CORRUPTION : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
    if (cacheFileCanContainsCorruptDates == DateCorruptionStatus.META_UNCLEAR_TEST_VALUES) {
        // Looking for the DATE data type of column names in the metadata cache file ("metadata_version" : "v2")
        // NOTE(review): this loop overwrites `names` on every DATE match, so if the
        // schema has more than one DATE column only the last one's name path is kept
        // — confirm whether multiple DATE columns are possible here.
        String[] names = new String[0];
        if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v2) {
            for (Metadata.ColumnTypeMetadata_v2 columnTypeMetadata : ((Metadata.ParquetTableMetadata_v2) parquetTableMetadata).columnTypeInfo.values()) {
                if (OriginalType.DATE.equals(columnTypeMetadata.originalType)) {
                    names = columnTypeMetadata.name;
                }
            }
        }
        for (Metadata.ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            // Drill has only ever written a single row group per file, only need to correct the statistics
            // on the first row group
            Metadata.RowGroupMetadata rowGroupMetadata = file.getRowGroups().get(0);
            for (Metadata.ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
                // Setting Min/Max values for ParquetTableMetadata_v1
                if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v1) {
                    OriginalType originalType = columnMetadata.getOriginalType();
                    // hasSingleValue() implies min == max, so both are set to the corrected max
                    if (OriginalType.DATE.equals(originalType) && columnMetadata.hasSingleValue() && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
                        int newMinMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
                        columnMetadata.setMax(newMinMax);
                        columnMetadata.setMin(newMinMax);
                    }
                } else // Setting Max values for ParquetTableMetadata_v2
                // v2 stores no per-column original type, so the name collected above is matched instead
                if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v2 && columnMetadata.getName() != null && Arrays.equals(columnMetadata.getName(), names) && columnMetadata.hasSingleValue() && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
                    int newMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
                    columnMetadata.setMax(newMax);
                }
            }
        }
    }
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) OriginalType(org.apache.parquet.schema.OriginalType)

Example 4 with OriginalType

use of org.apache.parquet.schema.OriginalType in project hive by apache.

Source: class DataWritableWriter, method createWriter.

/**
   * Creates a writer for the specific object inspector. The returned writer will be used
   * to call Parquet API for the specific data type.
   * @param inspector The object inspector used to get the correct value type.
   * @param type Type that contains information about the type schema.
   * @return A ParquetWriter object used to call the Parquet API for the specific data type.
   * @throws IllegalArgumentException if the primitive category has no writer.
   */
private DataWriter createWriter(ObjectInspector inspector, Type type) {
    if (type.isPrimitive()) {
        checkInspectorCategory(inspector, ObjectInspector.Category.PRIMITIVE);
        PrimitiveObjectInspector primitiveInspector = (PrimitiveObjectInspector) inspector;
        switch(primitiveInspector.getPrimitiveCategory()) {
            case BOOLEAN:
                return new BooleanDataWriter((BooleanObjectInspector) inspector);
            case BYTE:
                return new ByteDataWriter((ByteObjectInspector) inspector);
            case SHORT:
                return new ShortDataWriter((ShortObjectInspector) inspector);
            case INT:
                return new IntDataWriter((IntObjectInspector) inspector);
            case LONG:
                return new LongDataWriter((LongObjectInspector) inspector);
            case FLOAT:
                return new FloatDataWriter((FloatObjectInspector) inspector);
            case DOUBLE:
                return new DoubleDataWriter((DoubleObjectInspector) inspector);
            case STRING:
                return new StringDataWriter((StringObjectInspector) inspector);
            case CHAR:
                return new CharDataWriter((HiveCharObjectInspector) inspector);
            case VARCHAR:
                return new VarcharDataWriter((HiveVarcharObjectInspector) inspector);
            case BINARY:
                return new BinaryDataWriter((BinaryObjectInspector) inspector);
            case TIMESTAMP:
                return new TimestampDataWriter((TimestampObjectInspector) inspector);
            case DECIMAL:
                return new DecimalDataWriter((HiveDecimalObjectInspector) inspector);
            case DATE:
                return new DateDataWriter((DateObjectInspector) inspector);
            default:
                throw new IllegalArgumentException("Unsupported primitive data type: " + primitiveInspector.getPrimitiveCategory());
        }
    } else {
        GroupType groupType = type.asGroupType();
        OriginalType originalType = type.getOriginalType();
        // OriginalType is an enum, so identity comparison is both correct and
        // null-safe (a null originalType simply falls through to the STRUCT case).
        if (originalType == OriginalType.LIST) {
            checkInspectorCategory(inspector, ObjectInspector.Category.LIST);
            return new ListDataWriter((ListObjectInspector) inspector, groupType);
        } else if (originalType == OriginalType.MAP) {
            checkInspectorCategory(inspector, ObjectInspector.Category.MAP);
            return new MapDataWriter((MapObjectInspector) inspector, groupType);
        } else {
            // Any other group (including originalType == null) is treated as a struct.
            checkInspectorCategory(inspector, ObjectInspector.Category.STRUCT);
            return new StructDataWriter((StructObjectInspector) inspector, groupType);
        }
    }
}
Also used : OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)

Example 5 with OriginalType

use of org.apache.parquet.schema.OriginalType in project drill by apache.

Source: class ParquetGroupScan, method checkForPartitionColumn.

/**
   * When reading the very first footer, any column is a potential partition column. So for the first footer, we check
   * every column to see if it is single valued, and if so, add it to the list of potential partition columns. For the
   * remaining footers, we will not find any new partition columns, but we may discover that what was previously a
   * potential partition column now no longer qualifies, so it needs to be removed from the list.
   * @param columnMetadata the column being examined
   * @param first whether this is the very first footer being read
   * @return whether column is a potential partition column
   */
private boolean checkForPartitionColumn(ColumnMetadata columnMetadata, boolean first) {
    SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
    final PrimitiveTypeName primitiveType;
    final OriginalType originalType;
    // Table-level metadata, when present, is authoritative for the column's type.
    if (this.parquetTableMetadata.hasColumnMetadata()) {
        primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
        originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
    } else {
        primitiveType = columnMetadata.getPrimitiveType();
        originalType = columnMetadata.getOriginalType();
    }
    if (first) {
        // First footer: every single-valued column is a candidate.
        if (hasSingleValue(columnMetadata)) {
            partitionColTypeMap.put(schemaPath, getType(primitiveType, originalType));
            return true;
        } else {
            return false;
        }
    } else {
        // containsKey is the direct (and cheaper) form of keySet().contains().
        if (!partitionColTypeMap.containsKey(schemaPath)) {
            return false;
        } else {
            // Disqualify a previous candidate that is no longer single valued
            // or whose type disagrees with what earlier footers recorded.
            if (!hasSingleValue(columnMetadata)) {
                partitionColTypeMap.remove(schemaPath);
                return false;
            }
            if (!getType(primitiveType, originalType).equals(partitionColTypeMap.get(schemaPath))) {
                partitionColTypeMap.remove(schemaPath);
                return false;
            }
        }
    }
    return true;
}
Also used : OriginalType(org.apache.parquet.schema.OriginalType) SchemaPath(org.apache.drill.common.expression.SchemaPath) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)

Aggregations

OriginalType (org.apache.parquet.schema.OriginalType)6 SchemaPath (org.apache.drill.common.expression.SchemaPath)2 PrimitiveType (org.apache.parquet.schema.PrimitiveType)2 PrimitiveTypeName (org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)2 Stopwatch (com.google.common.base.Stopwatch)1 HashMap (java.util.HashMap)1 TypeProtos (org.apache.drill.common.types.TypeProtos)1 MinorType (org.apache.drill.common.types.TypeProtos.MinorType)1 Metadata (org.apache.drill.exec.store.parquet.Metadata)1 MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector)1 PrimitiveObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector)1 StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)1 TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)1 BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics)1 DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics)1 FloatStatistics (org.apache.parquet.column.statistics.FloatStatistics)1 IntStatistics (org.apache.parquet.column.statistics.IntStatistics)1 LongStatistics (org.apache.parquet.column.statistics.LongStatistics)1 Statistics (org.apache.parquet.column.statistics.Statistics)1 ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata)1