Example 1 with ParquetFileMetadata

Use of org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata in project drill by apache.

From the class Metadata, method readBlockMeta:

/**
 * Reads the Parquet metadata from a file.
 *
 * @param path path to the metadata file
 * @param dirsOnly true when reading a {@link Metadata#METADATA_DIRECTORIES_FILENAME} file,
 *                 false when reading a {@link Metadata#OLD_METADATA_FILENAME} file
 * @param metaContext current metadata context
 * @param fs file system used to read the metadata file
 */
private void readBlockMeta(Path path, boolean dirsOnly, MetadataContext metaContext, FileSystem fs) {
    Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
    Path metadataParentDir = Path.getPathWithoutSchemeAndAuthority(path.getParent());
    String metadataParentDirPath = metadataParentDir.toUri().getPath();
    ObjectMapper mapper = new ObjectMapper();
    final SimpleModule serialModule = new SimpleModule();
    serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
    serialModule.addKeyDeserializer(Metadata_V2.ColumnTypeMetadata_v2.Key.class, new Metadata_V2.ColumnTypeMetadata_v2.Key.DeSerializer());
    serialModule.addKeyDeserializer(Metadata_V3.ColumnTypeMetadata_v3.Key.class, new Metadata_V3.ColumnTypeMetadata_v3.Key.DeSerializer());
    serialModule.addKeyDeserializer(ColumnTypeMetadata_v4.Key.class, new ColumnTypeMetadata_v4.Key.DeSerializer());
    AfterburnerModule module = new AfterburnerModule();
    module.setUseOptimizedBeanDeserializer(true);
    boolean isFileMetadata = path.toString().endsWith(METADATA_FILENAME);
    boolean isSummaryFile = path.toString().endsWith(METADATA_SUMMARY_FILENAME);
    mapper.registerModule(serialModule);
    mapper.registerModule(module);
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    try (InputStream is = fs.open(path)) {
        boolean alreadyCheckedModification = metaContext.getStatus(metadataParentDirPath);
        boolean newMetadata = false;
        if (dirsOnly) {
            parquetTableMetadataDirs = mapper.readValue(is, ParquetTableMetadataDirs.class);
            if (timer != null) {
                logger.debug("Took {} ms to read directories from directory cache file", timer.elapsed(TimeUnit.MILLISECONDS));
                timer.stop();
            }
            parquetTableMetadataDirs.updateRelativePaths(metadataParentDirPath);
            if (!alreadyCheckedModification && tableModified(parquetTableMetadataDirs.getDirectories(), path, metadataParentDir, metaContext, fs)) {
                parquetTableMetadataDirs = (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null, true)).getRight();
                newMetadata = true;
            }
        } else {
            if (isFileMetadata) {
                parquetTableMetadata.assignFiles((mapper.readValue(is, FileMetadata.class)).getFiles());
                if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).isAtLeast(4, 0)) {
                    ((ParquetTableMetadata_v4) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
                }
                if (!alreadyCheckedModification && tableModified(parquetTableMetadata.getDirectories(), path, metadataParentDir, metaContext, fs)) {
                    parquetTableMetadata = (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null, true)).getLeft();
                    newMetadata = true;
                }
            } else if (isSummaryFile) {
                MetadataSummary metadataSummary = mapper.readValue(is, Metadata_V4.MetadataSummary.class);
                parquetTableMetadata = new ParquetTableMetadata_v4(metadataSummary);
            } else {
                parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
                if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).isAtLeast(3, 0)) {
                    ((Metadata_V3.ParquetTableMetadata_v3) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
                }
                if (!alreadyCheckedModification && tableModified((parquetTableMetadata.getDirectories()), path, metadataParentDir, metaContext, fs)) {
                    parquetTableMetadata = (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null, true)).getLeft();
                    newMetadata = true;
                }
            }
            if (timer != null) {
                logger.debug("Took {} ms to read metadata from cache file", timer.elapsed(TimeUnit.MILLISECONDS));
                timer.stop();
            }
            if (!isSummaryFile) {
                List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
                if (files != null) {
                    for (ParquetFileMetadata file : files) {
                        // DRILL-5009: Remove empty row groups unless it is the only row group
                        List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
                        if (rowGroups.size() == 1) {
                            continue;
                        }
                        rowGroups.removeIf(r -> r.getRowCount() == 0);
                    }
                }
            }
            if (newMetadata) {
                // if new metadata files were created, invalidate the existing metadata context
                metaContext.clear();
            }
        }
    } catch (IOException e) {
        logger.error("Failed to read '{}' metadata file", path, e);
        metaContext.setMetadataCacheCorrupted(true);
    }
}
Also used:
ColumnTypeMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4)
Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch)
FileMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.FileMetadata)
ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata)
SchemaPath (org.apache.drill.common.expression.SchemaPath)
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)
Path (org.apache.hadoop.fs.Path)
InputStream (java.io.InputStream)
ParquetTableMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4)
IOException (java.io.IOException)
AfterburnerModule (com.fasterxml.jackson.module.afterburner.AfterburnerModule)
MetadataSummary (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary)
SimpleModule (com.fasterxml.jackson.databind.module.SimpleModule)
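
The Jackson wiring in readBlockMeta (a SimpleModule for custom deserializers plus the Afterburner module for bytecode-optimized bean deserialization, with unknown JSON properties ignored) is a general pattern. Below is a minimal, self-contained sketch of the same setup; the MapperSetup class name and the commented-out deserializer registration are illustrative, not part of Drill:

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.fasterxml.jackson.module.afterburner.AfterburnerModule;

public class MapperSetup {

    public static ObjectMapper buildMapper() {
        ObjectMapper mapper = new ObjectMapper();
        // Custom deserializers are grouped into a SimpleModule before registration,
        // e.g. serialModule.addDeserializer(MyType.class, new MyTypeDeserializer()); // hypothetical types
        SimpleModule serialModule = new SimpleModule();
        // Afterburner speeds up bean (de)serialization by generating bytecode accessors
        AfterburnerModule afterburner = new AfterburnerModule();
        afterburner.setUseOptimizedBeanDeserializer(true);
        mapper.registerModule(serialModule);
        mapper.registerModule(afterburner);
        // Ignore properties written by newer metadata versions instead of failing
        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
        return mapper;
    }
}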

Example 2 with ParquetFileMetadata

Use of org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata in project drill by apache.

From the class MetadataPathUtils, method convertToFilesWithAbsolutePaths:

/**
 * Convert a list of files with relative paths to files with absolute ones
 *
 * @param files list of files with relative paths
 * @param baseDir base parent directory
 * @return list of files with absolute paths
 */
public static List<? extends ParquetFileMetadata> convertToFilesWithAbsolutePaths(List<? extends ParquetFileMetadata> files, String baseDir) {
    if (!files.isEmpty()) {
        List<ParquetFileMetadata> filesWithAbsolutePaths = new ArrayList<>();
        for (ParquetFileMetadata file : files) {
            Path relativePath = file.getPath();
            ParquetFileMetadata fileWithAbsolutePath = null;
            // create a new file-metadata instance if the old one holds a relative path; otherwise reuse the old one
            if (file instanceof ParquetFileMetadata_v4) {
                fileWithAbsolutePath = (relativePath.isAbsolute()) ? file : new ParquetFileMetadata_v4(new Path(baseDir, relativePath), file.getLength(), (List<Metadata_V4.RowGroupMetadata_v4>) file.getRowGroups());
            } else if (file instanceof ParquetFileMetadata_v3) {
                fileWithAbsolutePath = (relativePath.isAbsolute()) ? file : new ParquetFileMetadata_v3(new Path(baseDir, relativePath), file.getLength(), (List<Metadata_V3.RowGroupMetadata_v3>) file.getRowGroups());
            }
            filesWithAbsolutePaths.add(fileWithAbsolutePath);
        }
        return filesWithAbsolutePaths;
    }
    return files;
}
Also used:
Path (org.apache.hadoop.fs.Path)
ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata)
ArrayList (java.util.ArrayList)
List (java.util.List)
ParquetFileMetadata_v3 (org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ParquetFileMetadata_v3)
ParquetFileMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)
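
The relative-to-absolute conversion above relies on Hadoop's two-argument Path constructor, which resolves a child path against a parent directory. A minimal sketch of that behavior, using made-up example paths:

import org.apache.hadoop.fs.Path;

public class PathResolutionDemo {

    public static void main(String[] args) {
        Path baseDir = new Path("/data/drill/table");        // hypothetical base directory
        Path relative = new Path("1999/Q1/part-0.parquet");  // hypothetical relative file path
        // new Path(parent, child) resolves the child against the parent directory;
        // an already-absolute path is left as-is, mirroring the check in the method above
        Path resolved = relative.isAbsolute() ? relative : new Path(baseDir, relative);
        System.out.println(resolved); // prints /data/drill/table/1999/Q1/part-0.parquet
    }
}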

Example 3 with ParquetFileMetadata

Use of org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata in project drill by apache.

From the class ParquetReaderUtility, method transformBinaryInMetadataCache:

/**
 * Transforms min / max binary statistics values to byte arrays.
 * The transformation logic depends on the metadata file version.
 *
 * @param parquetTableMetadata table metadata that should be corrected
 * @param readerConfig parquet reader config
 */
public static void transformBinaryInMetadataCache(ParquetTableMetadataBase parquetTableMetadata, ParquetReaderConfig readerConfig) {
    // Looking for the names of columns with the BINARY data type
    // in the metadata cache file, for V2 and all V3 versions
    Set<List<String>> columnsNames = getBinaryColumnsNames(parquetTableMetadata);
    boolean allowBinaryMetadata = allowBinaryMetadata(parquetTableMetadata.getDrillVersion(), readerConfig);
    // Setting Min / Max values for ParquetTableMetadata_v1
    MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
    if (metadataVersion.isEqualTo(1, 0)) {
        for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            for (RowGroupMetadata rowGroupMetadata : file.getRowGroups()) {
                Long rowCount = rowGroupMetadata.getRowCount();
                for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
                    if (columnMetadata.getPrimitiveType() == PrimitiveTypeName.BINARY || columnMetadata.getPrimitiveType() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
                        setMinMaxValues(columnMetadata, rowCount, allowBinaryMetadata, false);
                    }
                }
            }
        }
        return;
    }
    // Variables needed for debugging only
    Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
    int maxRowGroups = 0;
    int minRowGroups = Integer.MAX_VALUE;
    int maxNumColumns = 0;
    // Setting Min / Max values for V2, V3 and V4 versions; for versions V3_3 and above need to do decoding
    boolean needDecoding = metadataVersion.isAtLeast(3, 3);
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        if (timer != null) {
            // for debugging only
            maxRowGroups = Math.max(maxRowGroups, file.getRowGroups().size());
            minRowGroups = Math.min(minRowGroups, file.getRowGroups().size());
        }
        for (RowGroupMetadata rowGroupMetadata : file.getRowGroups()) {
            Long rowCount = rowGroupMetadata.getRowCount();
            if (timer != null) {
                // for debugging only
                maxNumColumns = Math.max(maxNumColumns, rowGroupMetadata.getColumns().size());
            }
            for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
                if (columnsNames.contains(Arrays.asList(columnMetadata.getName()))) {
                    setMinMaxValues(columnMetadata, rowCount, allowBinaryMetadata, needDecoding);
                }
            }
        }
    }
    if (timer != null) {
        // log a debug message and stop the timer
        String reportRG = 1 == maxRowGroups ? "1 rowgroup" : "between " + minRowGroups + "-" + maxRowGroups + " rowgroups";
        logger.debug("Transforming binary in metadata cache took {} ms ({} files, {} per file, max {} columns)", timer.elapsed(TimeUnit.MILLISECONDS), parquetTableMetadata.getFiles().size(), reportRG, maxNumColumns);
        timer.stop();
    }
}
Also used:
MetadataVersion (org.apache.drill.exec.store.parquet.metadata.MetadataVersion)
ColumnMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata)
ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata)
Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch)
List (java.util.List)
ArrayList (java.util.ArrayList)
RowGroupMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata)
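
The timing idiom used in both readBlockMeta and transformBinaryInMetadataCache (create the Stopwatch only when debug logging is enabled, log the elapsed time, then stop) avoids measurement overhead on non-debug runs. A standalone sketch of the pattern, assuming plain Guava rather than Drill's shaded copy and a DEBUG flag standing in for logger.isDebugEnabled():

import java.util.concurrent.TimeUnit;

import com.google.common.base.Stopwatch; // Drill uses a shaded copy of this Guava class

public class TimingDemo {

    // Stand-in for logger.isDebugEnabled()
    private static final boolean DEBUG = true;

    public static void main(String[] args) throws InterruptedException {
        // Create the stopwatch only when the elapsed time will actually be logged
        Stopwatch timer = DEBUG ? Stopwatch.createStarted() : null;
        Thread.sleep(5); // the work being measured
        if (timer != null) {
            System.out.printf("Took %d ms%n", timer.elapsed(TimeUnit.MILLISECONDS));
            timer.stop();
        }
    }
}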

Example 4 with ParquetFileMetadata

Use of org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata in project drill by apache.

From the class ParquetReaderUtility, method correctDatesInMetadataCache:

public static void correctDatesInMetadataCache(ParquetTableMetadataBase parquetTableMetadata) {
    MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
    DateCorruptionStatus cacheFileCanContainsCorruptDates = metadataVersion.isAtLeast(3, 0) ? DateCorruptionStatus.META_SHOWS_NO_CORRUPTION : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
    if (cacheFileCanContainsCorruptDates == DateCorruptionStatus.META_UNCLEAR_TEST_VALUES) {
        // Looking for the names of columns with the DATE data type in the metadata cache file ("metadata_version" : "v2")
        String[] names = new String[0];
        if (metadataVersion.isEqualTo(2, 0)) {
            for (ColumnTypeMetadata_v2 columnTypeMetadata : ((ParquetTableMetadata_v2) parquetTableMetadata).columnTypeInfo.values()) {
                if (OriginalType.DATE.equals(columnTypeMetadata.originalType)) {
                    names = columnTypeMetadata.name;
                }
            }
        }
        for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            // Drill has only ever written a single row group per file, so the statistics
            // only need to be corrected on the first row group
            RowGroupMetadata rowGroupMetadata = file.getRowGroups().get(0);
            Long rowCount = rowGroupMetadata.getRowCount();
            for (ColumnMetadata columnMetadata : rowGroupMetadata.getColumns()) {
                // Setting Min/Max values for ParquetTableMetadata_v1
                if (metadataVersion.isEqualTo(1, 0)) {
                    OriginalType originalType = columnMetadata.getOriginalType();
                    if (OriginalType.DATE.equals(originalType) && columnMetadata.hasSingleValue(rowCount) && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
                        int newMinMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
                        columnMetadata.setMax(newMinMax);
                        columnMetadata.setMin(newMinMax);
                    }
                } else if (metadataVersion.isEqualTo(2, 0) && columnMetadata.getName() != null && Arrays.equals(columnMetadata.getName(), names) && columnMetadata.hasSingleValue(rowCount) && (Integer) columnMetadata.getMaxValue() > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
                    // Setting Max values for ParquetTableMetadata_v2
                    int newMax = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) columnMetadata.getMaxValue());
                    columnMetadata.setMax(newMax);
                }
            }
        }
    }
}
Also used:
MetadataVersion (org.apache.drill.exec.store.parquet.metadata.MetadataVersion)
ColumnTypeMetadata_v2 (org.apache.drill.exec.store.parquet.metadata.Metadata_V2.ColumnTypeMetadata_v2)
OriginalType (org.apache.parquet.schema.OriginalType)
ColumnMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata)
ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata)
RowGroupMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata)
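
The corruption that autoCorrectCorruptedDate repairs comes from old Parquet writers that shifted stored DATE values by twice the Julian day number of the Unix epoch. A sketch of that correction, under the assumption that the shift constant is 2 * 2440588 days; the class name, constant names, and sample value below are illustrative, not copied from Drill:

public class DateCorrectionDemo {

    // Assumption: 2440588 is the Julian day number of 1970-01-01 (the Unix epoch);
    // affected writers added this offset twice when encoding DATE values
    private static final int JULIAN_DAY_NUMBER_FOR_UNIX_EPOCH = 2440588;
    private static final int CORRECT_CORRUPT_DATE_SHIFT = 2 * JULIAN_DAY_NUMBER_FOR_UNIX_EPOCH;

    static int autoCorrectCorruptedDate(int corruptedDate) {
        // Subtract the doubled epoch offset to recover days since 1970-01-01
        return corruptedDate - CORRECT_CORRUPT_DATE_SHIFT;
    }

    public static void main(String[] args) {
        int corrupted = CORRECT_CORRUPT_DATE_SHIFT + 18262; // hypothetical corrupted value for 2020-01-01
        System.out.println(autoCorrectCorruptedDate(corrupted)); // prints 18262 (days since the epoch)
    }
}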

Aggregations

ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata): 4
ArrayList (java.util.ArrayList): 2
List (java.util.List): 2
ColumnMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata): 2
RowGroupMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata): 2
MetadataVersion (org.apache.drill.exec.store.parquet.metadata.MetadataVersion): 2
Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch): 2
Path (org.apache.hadoop.fs.Path): 2
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 1
SimpleModule (com.fasterxml.jackson.databind.module.SimpleModule): 1
AfterburnerModule (com.fasterxml.jackson.module.afterburner.AfterburnerModule): 1
IOException (java.io.IOException): 1
InputStream (java.io.InputStream): 1
SchemaPath (org.apache.drill.common.expression.SchemaPath): 1
ColumnTypeMetadata_v2 (org.apache.drill.exec.store.parquet.metadata.Metadata_V2.ColumnTypeMetadata_v2): 1
ParquetFileMetadata_v3 (org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ParquetFileMetadata_v3): 1
ColumnTypeMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4): 1
FileMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.FileMetadata): 1
MetadataSummary (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary): 1
ParquetFileMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4): 1