Search in sources :

Example 11 with MetadataVersion

use of org.apache.drill.exec.store.parquet.metadata.MetadataVersion in project drill by apache.

the class TestParquetMetadataVersion method testZeroMinorVersion.

@Test
public void testZeroMinorVersion() throws Exception {
    // A version built from numeric parts must equal the one parsed from the "major.minor" string
    // when the minor component is zero.
    MetadataVersion expected = new MetadataVersion(4, 0);
    MetadataVersion parsed = new MetadataVersion("4.0");
    assertEquals("Parquet metadata version is parsed incorrectly", expected, parsed);
}
Also used : MetadataVersion(org.apache.drill.exec.store.parquet.metadata.MetadataVersion) BaseTest(org.apache.drill.test.BaseTest) ParquetTest(org.apache.drill.categories.ParquetTest) UnlikelyTest(org.apache.drill.categories.UnlikelyTest) Test(org.junit.Test)

Example 12 with MetadataVersion

use of org.apache.drill.exec.store.parquet.metadata.MetadataVersion in project drill by apache.

the class ParquetReaderUtility method transformBinaryInMetadataCache.

/**
 * Transforms values for min / max binary statistics to byte array.
 * Transformation logic depends on metadata file version:
 * <ul>
 *   <li>version 1.0: every column whose primitive type is BINARY or FIXED_LEN_BYTE_ARRAY
 *       is processed, without decoding;</li>
 *   <li>any other version: only the columns reported by {@code getBinaryColumnsNames} are processed,
 *       and decoding is requested for versions 3.3 and above.</li>
 * </ul>
 * When debug logging is enabled, the elapsed time and some size statistics are logged.
 *
 * @param parquetTableMetadata table metadata that should be corrected
 * @param readerConfig parquet reader config
 */
public static void transformBinaryInMetadataCache(ParquetTableMetadataBase parquetTableMetadata, ParquetReaderConfig readerConfig) {
    // Looking for the names of the columns with BINARY data type
    // in the metadata cache file for V2 and all v3 versions
    Set<List<String>> columnsNames = getBinaryColumnsNames(parquetTableMetadata);
    boolean allowBinaryMetadata = allowBinaryMetadata(parquetTableMetadata.getDrillVersion(), readerConfig);
    // Setting Min / Max values for ParquetTableMetadata_v1
    MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
    if (metadataVersion.isEqualTo(1, 0)) {
        for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            for (RowGroupMetadata rowGroupMetadata : file.getRowGroups()) {
                Long rowCount = rowGroupMetadata.getRowCount();
                for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
                    // V1 is filtered by the primitive type of each column rather than by column name
                    if (columnMetadata.getPrimitiveType() == PrimitiveTypeName.BINARY || columnMetadata.getPrimitiveType() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
                        setMinMaxValues(columnMetadata, rowCount, allowBinaryMetadata, false);
                    }
                }
            }
        }
        return;
    }
    // Variables needed for debugging only; the timer doubles as the "debug enabled" flag
    Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
    int maxRowGroups = 0;
    int minRowGroups = Integer.MAX_VALUE;
    int maxNumColumns = 0;
    // Setting Min / Max values for V2, V3 and V4 versions; for versions V3_3 and above need to do decoding
    boolean needDecoding = metadataVersion.isAtLeast(3, 3);
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        if (timer != null) {
            // for debugging only
            maxRowGroups = Math.max(maxRowGroups, file.getRowGroups().size());
            minRowGroups = Math.min(minRowGroups, file.getRowGroups().size());
        }
        for (RowGroupMetadata rowGroupMetadata : file.getRowGroups()) {
            Long rowCount = rowGroupMetadata.getRowCount();
            if (timer != null) {
                // for debugging only
                maxNumColumns = Math.max(maxNumColumns, rowGroupMetadata.getColumns().size());
            }
            for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
                // The column name is a String[] path; wrap it as a List so it can be looked up in the set
                if (columnsNames.contains(Arrays.asList(columnMetadata.getName()))) {
                    setMinMaxValues(columnMetadata, rowCount, allowBinaryMetadata, needDecoding);
                }
            }
        }
    }
    if (timer != null) {
        // log a debug message and stop the timer
        // (a space is required before "rowgroups", otherwise the report reads e.g. "between 1-3rowgroups")
        String reportRG = 1 == maxRowGroups ? "1 rowgroup" : "between " + minRowGroups + "-" + maxRowGroups + " rowgroups";
        logger.debug("Transforming binary in metadata cache took {} ms ({} files, {} per file, max {} columns)", timer.elapsed(TimeUnit.MILLISECONDS), parquetTableMetadata.getFiles().size(), reportRG, maxNumColumns);
        timer.stop();
    }
}
Also used : MetadataVersion(org.apache.drill.exec.store.parquet.metadata.MetadataVersion) ColumnMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata) ParquetFileMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) List(java.util.List) ArrayList(java.util.ArrayList) RowGroupMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata)

Example 13 with MetadataVersion

use of org.apache.drill.exec.store.parquet.metadata.MetadataVersion in project drill by apache.

the class ParquetTableMetadataUtils method getRowGroupFields.

/**
 * Returns map of column names with their drill types for specified {@code rowGroup}.
 * <p>
 * For metadata version 4.0 and above whose table metadata reports that not all columns are interesting,
 * the non-interesting columns listed in the table-level column type info are added first;
 * the columns of the row group itself are added afterwards. The returned map preserves
 * that insertion order.
 *
 * @param parquetTableMetadata the source of primitive and original column types
 * @param rowGroup             row group whose columns should be discovered
 * @return map of column names with their drill types
 */
public static Map<SchemaPath, TypeProtos.MajorType> getRowGroupFields(MetadataBase.ParquetTableMetadataBase parquetTableMetadata, MetadataBase.RowGroupMetadata rowGroup) {
    Map<SchemaPath, TypeProtos.MajorType> columns = new LinkedHashMap<>();
    // Every V4 metadata version, including 4.0 itself, is handled through the Metadata_V4 classes,
    // so the gate must be inclusive: a strict "higher than 4.0" check would skip 4.0 tables.
    MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
    if (metadataVersion.isAtLeast(4, 0) && !((Metadata_V4.ParquetTableMetadata_v4) parquetTableMetadata).isAllColumnsInteresting()) {
        // adds non-interesting fields from table metadata
        for (MetadataBase.ColumnTypeMetadata columnTypeMetadata : parquetTableMetadata.getColumnTypeInfoList()) {
            Metadata_V4.ColumnTypeMetadata_v4 metadata = (Metadata_V4.ColumnTypeMetadata_v4) columnTypeMetadata;
            if (!metadata.isInteresting) {
                TypeProtos.MajorType columnType = getColumnType(metadata.name, metadata.primitiveType, metadata.originalType, parquetTableMetadata);
                SchemaPath columnPath = SchemaPath.getCompoundPath(metadata.name);
                putType(columns, columnPath, columnType);
            }
        }
    }
    // adds the columns stored in the row group itself
    for (MetadataBase.ColumnMetadata column : rowGroup.getColumns()) {
        TypeProtos.MajorType columnType = getColumnType(parquetTableMetadata, column);
        SchemaPath columnPath = SchemaPath.getCompoundPath(column.getName());
        putType(columns, columnPath, columnType);
    }
    return columns;
}
Also used : TypeProtos(org.apache.drill.common.types.TypeProtos) LinkedHashMap(java.util.LinkedHashMap) MetadataVersion(org.apache.drill.exec.store.parquet.metadata.MetadataVersion) Metadata_V4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4) SchemaPath(org.apache.drill.common.expression.SchemaPath) MetadataBase(org.apache.drill.exec.store.parquet.metadata.MetadataBase)

Example 14 with MetadataVersion

use of org.apache.drill.exec.store.parquet.metadata.MetadataVersion in project drill by apache.

the class ParquetTableMetadataUtils method getColumnType.

/**
 * Builds the drill major type of the column identified by {@code name}.
 * Scale and precision are read from the table metadata only for metadata version 3.0 and above;
 * for older versions both are left at zero.
 *
 * @param name                 column name parts
 * @param primitiveType        parquet primitive type of the column
 * @param originalType         parquet original type of the column
 * @param parquetTableMetadata table metadata used to look up scale, precision and data mode
 * @return major type of the column with its data mode set
 */
private static TypeProtos.MajorType getColumnType(String[] name, PrimitiveType.PrimitiveTypeName primitiveType, OriginalType originalType, MetadataBase.ParquetTableMetadataBase parquetTableMetadata) {
    MetadataVersion version = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
    // only ColumnTypeMetadata_v3 and ColumnTypeMetadata_v4 store information about scale, precision, repetition level and definition level
    boolean storesScaleAndPrecision = version.isAtLeast(3, 0);
    int scale = storesScaleAndPrecision ? parquetTableMetadata.getScale(name) : 0;
    int precision = storesScaleAndPrecision ? parquetTableMetadata.getPrecision(name) : 0;
    TypeProtos.DataMode dataMode = getDataMode(parquetTableMetadata, version, name);
    TypeProtos.MajorType baseType = ParquetReaderUtility.getType(primitiveType, originalType, precision, scale);
    return TypeProtos.MajorType.newBuilder(baseType)
        .setMode(dataMode)
        .build();
}
Also used : MetadataVersion(org.apache.drill.exec.store.parquet.metadata.MetadataVersion) TypeProtos(org.apache.drill.common.types.TypeProtos)

Example 15 with MetadataVersion

use of org.apache.drill.exec.store.parquet.metadata.MetadataVersion in project drill by apache.

the class ParquetReaderUtility method correctDatesInMetadataCache.

/**
 * Corrects DATE min / max statistics in the metadata cache when their maximum value exceeds
 * {@code ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD}.
 * <p>
 * Nothing is done for metadata version 3.0 and above. For version 1.0 both min and max of a
 * single-valued DATE column are replaced with the corrected value; for version 2.0 only the max
 * of the matching column is replaced. Only the first row group of every file is examined.
 *
 * @param parquetTableMetadata table metadata whose date statistics are corrected in place
 */
public static void correctDatesInMetadataCache(ParquetTableMetadataBase parquetTableMetadata) {
    MetadataVersion version = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
    if (version.isAtLeast(3, 0)) {
        // metadata of this version shows no corruption, so there is nothing to test or fix
        return;
    }
    boolean isV2 = version.isEqualTo(2, 0);
    // Looking for the DATE data type of column names in the metadata cache file ("metadata_version" : "v2");
    // when several DATE columns are present only the last one encountered is kept
    String[] dateColumnName = new String[0];
    if (isV2) {
        for (ColumnTypeMetadata_v2 typeInfo : ((ParquetTableMetadata_v2) parquetTableMetadata).columnTypeInfo.values()) {
            if (OriginalType.DATE.equals(typeInfo.originalType)) {
                dateColumnName = typeInfo.name;
            }
        }
    }
    boolean isV1 = version.isEqualTo(1, 0);
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        // Drill has only ever written a single row group per file, only need to correct the statistics
        // on the first row group (the row group list is therefore expected to be non-empty)
        RowGroupMetadata firstRowGroup = file.getRowGroups().get(0);
        Long rowCount = firstRowGroup.getRowCount();
        for (ColumnMetadata column : firstRowGroup.getColumns()) {
            if (isV1) {
                // Setting Min/Max values for ParquetTableMetadata_v1
                boolean corruptedDate = OriginalType.DATE.equals(column.getOriginalType())
                    && column.hasSingleValue(rowCount)
                    && (Integer) column.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD;
                if (corruptedDate) {
                    int corrected = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) column.getMaxValue());
                    column.setMax(corrected);
                    column.setMin(corrected);
                }
            } else if (isV2
                && column.getName() != null
                && Arrays.equals(column.getName(), dateColumnName)
                && column.hasSingleValue(rowCount)
                && (Integer) column.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
                // Setting Max values for ParquetTableMetadata_v2
                int corrected = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) column.getMaxValue());
                column.setMax(corrected);
            }
        }
    }
}
Also used : MetadataVersion(org.apache.drill.exec.store.parquet.metadata.MetadataVersion) ColumnTypeMetadata_v2(org.apache.drill.exec.store.parquet.metadata.Metadata_V2.ColumnTypeMetadata_v2) OriginalType(org.apache.parquet.schema.OriginalType) ColumnMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata) ParquetFileMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata) RowGroupMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata)

Aggregations

MetadataVersion (org.apache.drill.exec.store.parquet.metadata.MetadataVersion)15 UnlikelyTest (org.apache.drill.categories.UnlikelyTest)10 Test (org.junit.Test)10 ParquetTest (org.apache.drill.categories.ParquetTest)9 BaseTest (org.apache.drill.test.BaseTest)9 TypeProtos (org.apache.drill.common.types.TypeProtos)3 LinkedHashMap (java.util.LinkedHashMap)2 SchemaPath (org.apache.drill.common.expression.SchemaPath)2 MetadataBase (org.apache.drill.exec.store.parquet.metadata.MetadataBase)2 ColumnMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata)2 ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata)2 RowGroupMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata)2 Metadata_V4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4)2 OriginalType (org.apache.parquet.schema.OriginalType)2 File (java.io.File)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 ColumnTypeMetadata_v2 (org.apache.drill.exec.store.parquet.metadata.Metadata_V2.ColumnTypeMetadata_v2)1 Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch)1 Category (org.junit.experimental.categories.Category)1