Search in sources :

Example 1 with ColumnTypeMetadata_v2

Use of org.apache.drill.exec.store.parquet.metadata.Metadata_V2.ColumnTypeMetadata_v2 in the Apache Drill project.

Method correctDatesInMetadataCache of the class ParquetReaderUtility.

/**
 * Rewrites possibly corrupted DATE statistics held in a Parquet metadata cache, in place.
 *
 * <p>Caches whose version is at least 3.0 are left untouched. For older caches, each file's first
 * row group is inspected and, for every DATE column that reports a single value and whose max
 * exceeds {@link ParquetReaderUtility#DATE_CORRUPTION_THRESHOLD}, the stored statistic is replaced
 * with the result of {@link ParquetReaderUtility#autoCorrectCorruptedDate(int)}:
 * <ul>
 *   <li>version 1.0: the column's own original type identifies DATE columns; both min and max
 *       are overwritten with the corrected max;</li>
 *   <li>version 2.0: DATE columns are identified by matching the column name path against the
 *       table-level {@code columnTypeInfo}; only max is overwritten.</li>
 * </ul>
 *
 * <p>Every DATE column listed in a v2 cache is handled, not just the last one found in
 * {@code columnTypeInfo}. Files without any row group are skipped.
 *
 * @param parquetTableMetadata metadata cache contents to correct; modified in place
 */
public static void correctDatesInMetadataCache(ParquetTableMetadataBase parquetTableMetadata) {
    MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
    DateCorruptionStatus cacheFileCanContainsCorruptDates = metadataVersion.isAtLeast(3, 0) ? DateCorruptionStatus.META_SHOWS_NO_CORRUPTION : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
    if (cacheFileCanContainsCorruptDates != DateCorruptionStatus.META_UNCLEAR_TEST_VALUES) {
        return;
    }

    // The version does not change while we iterate, so evaluate the checks once.
    boolean isV1 = metadataVersion.isEqualTo(1, 0);
    boolean isV2 = metadataVersion.isEqualTo(2, 0);

    // Name paths of all DATE columns in a v2 cache ("metadata_version" : "v2"). Paths are stored as
    // lists so that set membership compares them element by element, like Arrays.equals would.
    java.util.Set<java.util.List<String>> dateColumnNames = new java.util.HashSet<>();
    if (isV2) {
        for (ColumnTypeMetadata_v2 columnTypeMetadata : ((ParquetTableMetadata_v2) parquetTableMetadata).columnTypeInfo.values()) {
            if (OriginalType.DATE.equals(columnTypeMetadata.originalType) && columnTypeMetadata.name != null) {
                dateColumnNames.add(Arrays.asList(columnTypeMetadata.name));
            }
        }
    }

    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        // Nothing to correct for a file that carries no row group; also avoids get(0) throwing.
        if (file.getRowGroups() == null || file.getRowGroups().isEmpty()) {
            continue;
        }
        // Drill has only ever written a single row group per file, only need to correct the statistics
        // on the first row group
        RowGroupMetadata rowGroupMetadata = file.getRowGroups().get(0);
        Long rowCount = rowGroupMetadata.getRowCount();
        for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
            if (isV1) {
                // Setting Min/Max values for ParquetTableMetadata_v1.
                // The Integer cast is only reached for DATE columns thanks to short-circuiting.
                if (OriginalType.DATE.equals(columnMetadata.getOriginalType())
                        && columnMetadata.hasSingleValue(rowCount)
                        && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
                    int newMinMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
                    columnMetadata.setMax(newMinMax);
                    columnMetadata.setMin(newMinMax);
                }
            } else if (isV2
                    && columnMetadata.getName() != null
                    && dateColumnNames.contains(Arrays.asList(columnMetadata.getName()))
                    && columnMetadata.hasSingleValue(rowCount)
                    && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
                // Setting Max values for ParquetTableMetadata_v2
                int newMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
                columnMetadata.setMax(newMax);
            }
        }
    }
}
Also used : MetadataVersion(org.apache.drill.exec.store.parquet.metadata.MetadataVersion) ColumnTypeMetadata_v2(org.apache.drill.exec.store.parquet.metadata.Metadata_V2.ColumnTypeMetadata_v2) OriginalType(org.apache.parquet.schema.OriginalType) ColumnMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata) ParquetFileMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata) RowGroupMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata)

Aggregations

ColumnMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata)1 ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata)1 RowGroupMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata)1 MetadataVersion (org.apache.drill.exec.store.parquet.metadata.MetadataVersion)1 ColumnTypeMetadata_v2 (org.apache.drill.exec.store.parquet.metadata.Metadata_V2.ColumnTypeMetadata_v2)1 OriginalType (org.apache.parquet.schema.OriginalType)1