Example 1 with ColumnTypeMetadata_v4

Use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4 in project drill by apache.

The class ParquetTableMetadataUtils, method getIntermediateFields:

/**
 * Returns a map of column names to their Drill types for every {@code NameSegment} of each {@code SchemaPath}
 * in the specified {@code rowGroup}. The type for a {@code SchemaPath} can be {@code null} when it is not
 * possible to determine it. Currently, this hierarchy is of interest solely because
 * {@link org.apache.drill.common.types.TypeProtos.MinorType#DICT} must be accounted for, to make sure that
 * filters used on a {@code DICT}'s values (obtained by key) are not pruned out before actual filtering
 * happens.
 *
 * @param parquetTableMetadata the source of column types
 * @param rowGroup row group whose columns should be discovered
 * @return map of column names to their Drill types
 */
public static Map<SchemaPath, TypeProtos.MajorType> getIntermediateFields(MetadataBase.ParquetTableMetadataBase parquetTableMetadata, MetadataBase.RowGroupMetadata rowGroup) {
    Map<SchemaPath, TypeProtos.MajorType> columns = new LinkedHashMap<>();
    MetadataVersion metadataVersion = new MetadataVersion(parquetTableMetadata.getMetadataVersion());
    boolean hasParentTypes = metadataVersion.isAtLeast(4, 1);
    if (!hasParentTypes) {
        return Collections.emptyMap();
    }
    for (MetadataBase.ColumnMetadata column : rowGroup.getColumns()) {
        Metadata_V4.ColumnTypeMetadata_v4 columnTypeMetadata = ((Metadata_V4.ParquetTableMetadata_v4) parquetTableMetadata).getColumnTypeInfo(column.getName());
        List<OriginalType> parentTypes = columnTypeMetadata.parentTypes;
        List<TypeProtos.MajorType> drillTypes = ParquetReaderUtility.getComplexTypes(parentTypes);
        for (int i = 0; i < drillTypes.size(); i++) {
            SchemaPath columnPath = SchemaPath.getCompoundPath(i + 1, column.getName());
            TypeProtos.MajorType drillType = drillTypes.get(i);
            putType(columns, columnPath, drillType);
        }
    }
    return columns;
}
Also used : TypeProtos (org.apache.drill.common.types.TypeProtos), LinkedHashMap (java.util.LinkedHashMap), MetadataVersion (org.apache.drill.exec.store.parquet.metadata.MetadataVersion), OriginalType (org.apache.parquet.schema.OriginalType), Metadata_V4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4), SchemaPath (org.apache.drill.common.expression.SchemaPath), MetadataBase (org.apache.drill.exec.store.parquet.metadata.MetadataBase)
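
A minimal caller-side sketch of how the returned map might be consumed for the DICT concern the Javadoc describes; parquetTableMetadata and rowGroup are assumed to already be in scope, and the loop body is illustrative only:

Map<SchemaPath, TypeProtos.MajorType> intermediateFields =
    ParquetTableMetadataUtils.getIntermediateFields(parquetTableMetadata, rowGroup);
for (Map.Entry<SchemaPath, TypeProtos.MajorType> entry : intermediateFields.entrySet()) {
    TypeProtos.MajorType type = entry.getValue();
    // a null type means it could not be determined (see the Javadoc above)
    if (type != null && type.getMinorType() == TypeProtos.MinorType.DICT) {
        // filters on this path's values (obtained by key) must survive until actual filtering
    }
}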

Example 2 with ColumnTypeMetadata_v4

Use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4 in project drill by apache.

The class ConvertCountToDirectScanRule, method collectCounts:

/**
 * Collects a count for each aggregation call using the metadata summary information.
 * Returns an empty result map if the count could not be determined for at least one aggregation call.
 *
 * For each aggregate call, determines whether the count can be calculated. Counts are collected only for the COUNT function:
 *   1. First, the total row count is obtained from the metadata summary.
 *   2. For COUNT(*), COUNT(<non-null column>) and COUNT(<implicit column>), count = total row count.
 *   3. For COUNT(<nullable column>), count = total row count - the column's null count.
 *   4. The count cannot be calculated for partition columns.
 *   5. For columns that are not present in the summary (non-existent columns), count = 0.
 *
 * @param settings planner options
 * @param metadataSummary metadata summary containing row counts and column counts
 * @param agg aggregate relational expression
 * @param scan scan relational expression
 * @param project project relational expression
 * @return result map where key is count column name, value is count value
 */
private Map<String, Long> collectCounts(PlannerSettings settings, Metadata_V4.MetadataSummary metadataSummary, Aggregate agg, TableScan scan, Project project) {
    final Set<String> implicitColumnsNames = ColumnExplorer.initImplicitFileColumns(settings.getOptions()).keySet();
    final long totalRecordCount = metadataSummary.getTotalRowCount();
    final LinkedHashMap<String, Long> result = new LinkedHashMap<>();
    for (int i = 0; i < agg.getAggCallList().size(); i++) {
        AggregateCall aggCall = agg.getAggCallList().get(i);
        long cnt;
        // the rule can be applied only to the count function; otherwise return empty counts
        if (!"count".equalsIgnoreCase(aggCall.getAggregation().getName())) {
            return ImmutableMap.of();
        }
        if (CountToDirectScanUtils.containsStarOrNotNullInput(aggCall, agg)) {
            cnt = totalRecordCount;
        } else if (aggCall.getArgList().size() == 1) {
            // count(columnName) ==> Agg(Scan) ==> columnValueCount
            int index = aggCall.getArgList().get(0);
            if (project != null) {
                // only a direct column reference (RexInputRef) in the Project can be mapped back to a Scan field
                if (!(project.getProjects().get(index) instanceof RexInputRef)) {
                    // do not apply for all other cases.
                    return ImmutableMap.of();
                }
                index = ((RexInputRef) project.getProjects().get(index)).getIndex();
            }
            String columnName = scan.getRowType().getFieldNames().get(index).toLowerCase();
            // for implicit column count will be the same as total record count
            if (implicitColumnsNames.contains(columnName)) {
                cnt = totalRecordCount;
            } else {
                SchemaPath simplePath = SchemaPath.getSimplePath(columnName);
                if (ColumnExplorer.isPartitionColumn(settings.getOptions(), simplePath)) {
                    return ImmutableMap.of();
                }
                Metadata_V4.ColumnTypeMetadata_v4 columnMetadata = metadataSummary.getColumnTypeInfo(new Metadata_V4.ColumnTypeMetadata_v4.Key(simplePath));
                if (columnMetadata == null) {
                    // If the column doesn't exist in the table, row count is set to 0
                    cnt = 0;
                } else if (columnMetadata.totalNullCount == Statistic.NO_COLUMN_STATS) {
                    // if column stats are not available, don't apply this rule; return empty counts
                    return ImmutableMap.of();
                } else {
                    // count of a nullable column = (total row count - column's null count)
                    cnt = totalRecordCount - columnMetadata.totalNullCount;
                }
            }
        } else {
            return ImmutableMap.of();
        }
        String name = "count" + i + "$" + (aggCall.getName() == null ? aggCall.toString() : aggCall.getName());
        result.put(name, cnt);
    }
    return ImmutableMap.copyOf(result);
}
Also used : LinkedHashMap (java.util.LinkedHashMap), AggregateCall (org.apache.calcite.rel.core.AggregateCall), Metadata_V4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4), SchemaPath (org.apache.drill.common.expression.SchemaPath), RexInputRef (org.apache.calcite.rex.RexInputRef)
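
Rules 1-5 from the Javadoc reduce to a small decision function. The following standalone sketch is an illustrative distillation, not Drill code: the helper name and parameters are ours, while Statistic.NO_COLUMN_STATS is the same Drill constant the method checks above.

// Returns the count for one aggregate call, or null where the method above bails out with an empty map.
static Long countFor(long totalRowCount, boolean starOrNonNullOrImplicit,
                     boolean partitionColumn, Long totalNullCount) {
    if (starOrNonNullOrImplicit) {
        return totalRowCount;                          // rule 2: count = total row count
    }
    if (partitionColumn) {
        return null;                                   // rule 4: cannot be calculated
    }
    if (totalNullCount == null) {
        return 0L;                                     // rule 5: column absent from the summary
    }
    if (totalNullCount == Statistic.NO_COLUMN_STATS) {
        return null;                                   // no column stats: rule does not apply
    }
    return totalRowCount - totalNullCount;             // rule 3: nullable column
}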

Example 3 with ColumnTypeMetadata_v4

Use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4 in project drill by apache.

The class Metadata, method readBlockMeta:

/**
 * Reads the parquet metadata from a file.
 *
 * @param path path to the metadata file
 * @param dirsOnly true for reading {@link Metadata#METADATA_DIRECTORIES_FILENAME} files,
 *                 false for reading {@link Metadata#OLD_METADATA_FILENAME} files
 * @param metaContext current metadata context
 * @param fs file system
 */
private void readBlockMeta(Path path, boolean dirsOnly, MetadataContext metaContext, FileSystem fs) {
    Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
    Path metadataParentDir = Path.getPathWithoutSchemeAndAuthority(path.getParent());
    String metadataParentDirPath = metadataParentDir.toUri().getPath();
    ObjectMapper mapper = new ObjectMapper();
    final SimpleModule serialModule = new SimpleModule();
    serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
    serialModule.addKeyDeserializer(Metadata_V2.ColumnTypeMetadata_v2.Key.class, new Metadata_V2.ColumnTypeMetadata_v2.Key.DeSerializer());
    serialModule.addKeyDeserializer(Metadata_V3.ColumnTypeMetadata_v3.Key.class, new Metadata_V3.ColumnTypeMetadata_v3.Key.DeSerializer());
    serialModule.addKeyDeserializer(ColumnTypeMetadata_v4.Key.class, new ColumnTypeMetadata_v4.Key.DeSerializer());
    AfterburnerModule module = new AfterburnerModule();
    module.setUseOptimizedBeanDeserializer(true);
    boolean isFileMetadata = path.toString().endsWith(METADATA_FILENAME);
    boolean isSummaryFile = path.toString().endsWith(METADATA_SUMMARY_FILENAME);
    mapper.registerModule(serialModule);
    mapper.registerModule(module);
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    try (InputStream is = fs.open(path)) {
        boolean alreadyCheckedModification;
        boolean newMetadata = false;
        alreadyCheckedModification = metaContext.getStatus(metadataParentDirPath);
        if (dirsOnly) {
            parquetTableMetadataDirs = mapper.readValue(is, ParquetTableMetadataDirs.class);
            if (timer != null) {
                logger.debug("Took {} ms to read directories from directory cache file", timer.elapsed(TimeUnit.MILLISECONDS));
                timer.stop();
            }
            parquetTableMetadataDirs.updateRelativePaths(metadataParentDirPath);
            if (!alreadyCheckedModification && tableModified(parquetTableMetadataDirs.getDirectories(), path, metadataParentDir, metaContext, fs)) {
                parquetTableMetadataDirs = (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null, true)).getRight();
                newMetadata = true;
            }
        } else {
            if (isFileMetadata) {
                parquetTableMetadata.assignFiles((mapper.readValue(is, FileMetadata.class)).getFiles());
                if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).isAtLeast(4, 0)) {
                    ((ParquetTableMetadata_v4) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
                }
                if (!alreadyCheckedModification && tableModified(parquetTableMetadata.getDirectories(), path, metadataParentDir, metaContext, fs)) {
                    parquetTableMetadata = (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null, true)).getLeft();
                    newMetadata = true;
                }
            } else if (isSummaryFile) {
                MetadataSummary metadataSummary = mapper.readValue(is, Metadata_V4.MetadataSummary.class);
                parquetTableMetadata = new ParquetTableMetadata_v4(metadataSummary);
            } else {
                parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
                if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).isAtLeast(3, 0)) {
                    ((Metadata_V3.ParquetTableMetadata_v3) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
                }
                if (!alreadyCheckedModification && tableModified((parquetTableMetadata.getDirectories()), path, metadataParentDir, metaContext, fs)) {
                    parquetTableMetadata = (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null, true)).getLeft();
                    newMetadata = true;
                }
            }
            if (timer != null) {
                logger.debug("Took {} ms to read metadata from cache file", timer.elapsed(TimeUnit.MILLISECONDS));
                timer.stop();
            }
            if (!isSummaryFile) {
                List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
                if (files != null) {
                    for (ParquetFileMetadata file : files) {
                        // DRILL-5009: Remove empty row groups unless it is the only row group
                        List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
                        if (rowGroups.size() == 1) {
                            continue;
                        }
                        rowGroups.removeIf(r -> r.getRowCount() == 0);
                    }
                }
            }
            if (newMetadata) {
                // if new metadata files were created, invalidate the existing metadata context
                metaContext.clear();
            }
        }
    } catch (IOException e) {
        logger.error("Failed to read '{}' metadata file", path, e);
        metaContext.setMetadataCacheCorrupted(true);
    }
}
Also used : ColumnTypeMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4), Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch), FileMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.FileMetadata), ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata), SchemaPath (org.apache.drill.common.expression.SchemaPath), ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper), Path (org.apache.hadoop.fs.Path), InputStream (java.io.InputStream), ParquetTableMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4), IOException (java.io.IOException), AfterburnerModule (com.fasterxml.jackson.module.afterburner.AfterburnerModule), MetadataSummary (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary), SimpleModule (com.fasterxml.jackson.databind.module.SimpleModule)
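
The Jackson wiring above is the recurring setup for reading any of the metadata cache files: custom key deserializers are what allow ColumnTypeMetadata_v4.Key (and its v2/v3 counterparts) to serve as map keys in the JSON cache. A condensed sketch of just that setup, using the same classes as the method body:

ObjectMapper mapper = new ObjectMapper();
SimpleModule serialModule = new SimpleModule();
serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
// map keys in the cache file are serialized ColumnTypeMetadata_v4.Key values
serialModule.addKeyDeserializer(ColumnTypeMetadata_v4.Key.class, new ColumnTypeMetadata_v4.Key.DeSerializer());
AfterburnerModule afterburner = new AfterburnerModule();
afterburner.setUseOptimizedBeanDeserializer(true);
mapper.registerModule(serialModule);
mapper.registerModule(afterburner);
// tolerate properties written by other Drill versions
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);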

Example 4 with ColumnTypeMetadata_v4

Use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4 in project drill by apache.

The class Metadata, method createMetaFilesRecursively:

/**
 * Create the parquet metadata files for the directory at the given path and for any subdirectories.
 * Metadata cache files written to the disk contain relative paths. Returned Pair of metadata contains absolute paths.
 *
 * @param path to the directory of the parquet table
 * @param fs file system
 * @param allColumnsInteresting if set, store column metadata for all the columns
 * @param columnSet Set of columns for which column metadata has to be stored
 * @return Pair of parquet metadata. The left element is the parquet metadata for the table. The right element is
 *         the metadata for all subdirectories (if they are present and there are no parquet files in the
 *         {@code path} directory).
 * @throws IOException if parquet metadata can't be serialized and written to the json file
 */
private Pair<ParquetTableMetadata_v4, ParquetTableMetadataDirs> createMetaFilesRecursively(Path path, FileSystem fs, boolean allColumnsInteresting, Set<SchemaPath> columnSet) throws IOException {
    Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
    List<ParquetFileMetadata_v4> metaDataList = Lists.newArrayList();
    List<Path> directoryList = Lists.newArrayList();
    ConcurrentHashMap<ColumnTypeMetadata_v4.Key, ColumnTypeMetadata_v4> columnTypeInfoSet = new ConcurrentHashMap<>();
    FileStatus fileStatus = fs.getFileStatus(path);
    long dirTotalRowCount = 0;
    assert fileStatus.isDirectory() : "Expected directory";
    final Map<FileStatus, FileSystem> childFiles = new LinkedHashMap<>();
    for (final FileStatus file : DrillFileSystemUtil.listAll(fs, path, false)) {
        if (file.isDirectory()) {
            ParquetTableMetadata_v4 subTableMetadata = (createMetaFilesRecursively(file.getPath(), fs, allColumnsInteresting, columnSet)).getLeft();
            ConcurrentHashMap<ColumnTypeMetadata_v4.Key, ColumnTypeMetadata_v4> subTableColumnTypeInfo = subTableMetadata.getColumnTypeInfoMap();
            metaDataList.addAll((List<ParquetFileMetadata_v4>) subTableMetadata.getFiles());
            directoryList.addAll(subTableMetadata.getDirectories());
            directoryList.add(file.getPath());
            // TODO: We need a merge method that merges two columns with the same name but different types
            if (columnTypeInfoSet.isEmpty()) {
                columnTypeInfoSet.putAll(subTableColumnTypeInfo);
            } else {
                for (ColumnTypeMetadata_v4.Key key : subTableColumnTypeInfo.keySet()) {
                    ColumnTypeMetadata_v4 columnTypeMetadata_v4 = columnTypeInfoSet.get(key);
                    if (columnTypeMetadata_v4 == null) {
                        columnTypeMetadata_v4 = subTableColumnTypeInfo.get(key);
                    } else {
                        // if either null count is unknown (negative), keep the merged total null count as unknown
                        if (subTableColumnTypeInfo.get(key).totalNullCount < 0 || columnTypeMetadata_v4.totalNullCount < 0) {
                            columnTypeMetadata_v4.totalNullCount = NULL_COUNT_NOT_EXISTS;
                        } else {
                            columnTypeMetadata_v4.totalNullCount = columnTypeMetadata_v4.totalNullCount + subTableColumnTypeInfo.get(key).totalNullCount;
                        }
                    }
                    columnTypeInfoSet.put(key, columnTypeMetadata_v4);
                }
            }
            dirTotalRowCount = dirTotalRowCount + subTableMetadata.getTotalRowCount();
        } else {
            childFiles.put(file, fs);
        }
    }
    Metadata_V4.MetadataSummary metadataSummary = new Metadata_V4.MetadataSummary(SUPPORTED_VERSIONS.last().toString(), DrillVersionInfo.getVersion(), allColumnsInteresting || columnSet == null);
    ParquetTableMetadata_v4 parquetTableMetadata = new ParquetTableMetadata_v4(metadataSummary);
    if (childFiles.size() > 0) {
        List<ParquetFileAndRowCountMetadata> childFileAndRowCountMetadata = getParquetFileMetadata_v4(parquetTableMetadata, childFiles, allColumnsInteresting, columnSet);
        // If the columnTypeInfoSet is empty, add the columnTypeInfo from the parquetTableMetadata
        if (columnTypeInfoSet.isEmpty()) {
            columnTypeInfoSet.putAll(parquetTableMetadata.getColumnTypeInfoMap());
        }
        for (ParquetFileAndRowCountMetadata parquetFileAndRowCountMetadata : childFileAndRowCountMetadata) {
            metaDataList.add(parquetFileAndRowCountMetadata.getFileMetadata());
            dirTotalRowCount = dirTotalRowCount + parquetFileAndRowCountMetadata.getFileRowCount();
            Map<ColumnTypeMetadata_v4.Key, Long> totalNullCountMap = parquetFileAndRowCountMetadata.getTotalNullCountMap();
            for (ColumnTypeMetadata_v4.Key column : totalNullCountMap.keySet()) {
                ColumnTypeMetadata_v4 columnTypeMetadata_v4 = columnTypeInfoSet.get(column);
                // If the column is not present in columnTypeInfoSet, get it from parquetTableMetadata
                if (columnTypeMetadata_v4 == null) {
                    columnTypeMetadata_v4 = parquetTableMetadata.getColumnTypeInfoMap().get(column);
                }
                // if either null count is unknown (negative), keep the total null count as unknown
                if (columnTypeMetadata_v4.totalNullCount < 0 || totalNullCountMap.get(column) < 0) {
                    columnTypeMetadata_v4.totalNullCount = NULL_COUNT_NOT_EXISTS;
                } else {
                    columnTypeMetadata_v4.totalNullCount += totalNullCountMap.get(column);
                }
                columnTypeInfoSet.put(column, columnTypeMetadata_v4);
            }
        }
    }
    metadataSummary.directories = directoryList;
    parquetTableMetadata.assignFiles(metaDataList);
    // TODO: We need a merge method that merges two columns with the same name but different types
    if (metadataSummary.columnTypeInfo == null) {
        metadataSummary.columnTypeInfo = new ConcurrentHashMap<>();
    }
    metadataSummary.columnTypeInfo.putAll(columnTypeInfoSet);
    metadataSummary.allColumnsInteresting = allColumnsInteresting;
    metadataSummary.totalRowCount = dirTotalRowCount;
    parquetTableMetadata.metadataSummary = metadataSummary;
    for (String oldName : OLD_METADATA_FILENAMES) {
        fs.delete(new Path(path, oldName), false);
    }
    // relative paths in the metadata are only necessary for meta cache files.
    ParquetTableMetadata_v4 metadataTableWithRelativePaths = MetadataPathUtils.createMetadataWithRelativePaths(parquetTableMetadata, path);
    writeFile(metadataTableWithRelativePaths.fileMetadata, new Path(path, METADATA_FILENAME), fs);
    writeFile(metadataTableWithRelativePaths.getSummary(), new Path(path, METADATA_SUMMARY_FILENAME), fs);
    Metadata_V4.MetadataSummary metadataSummaryWithRelativePaths = metadataTableWithRelativePaths.getSummary();
    // Directories list will be empty at the leaf level directories. For sub-directories with both files and directories,
    // only the directories will be included in the list.
    writeFile(new ParquetTableMetadataDirs(metadataSummaryWithRelativePaths.directories), new Path(path, METADATA_DIRECTORIES_FILENAME), fs);
    if (timer != null) {
        logger.debug("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
    }
    return Pair.of(parquetTableMetadata, new ParquetTableMetadataDirs(directoryList));
}
Also used : ColumnTypeMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4), FileStatus (org.apache.hadoop.fs.FileStatus), Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch), LinkedHashMap (java.util.LinkedHashMap), MetadataSummary (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary), FileSystem (org.apache.hadoop.fs.FileSystem), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), Path (org.apache.hadoop.fs.Path), SchemaPath (org.apache.drill.common.expression.SchemaPath), ParquetFileAndRowCountMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata), ParquetTableMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4), ParquetFileMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)
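
The null-count merge that appears twice above follows a single rule: an unknown count (any negative value) poisons the sum. Below is a hypothetical helper expressing that rule in isolation; the method name is ours, while NULL_COUNT_NOT_EXISTS is the Drill sentinel used above.

// Merges two per-column total null counts; a negative input means "unknown".
static long mergeTotalNullCount(long existing, long incoming) {
    if (existing < 0 || incoming < 0) {
        return NULL_COUNT_NOT_EXISTS;   // unknown plus anything stays unknown
    }
    return existing + incoming;
}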

Example 5 with ColumnTypeMetadata_v4

Use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4 in project drill by apache.

The class Metadata, method getSummary:

/**
 * Reads the summary from the metadata cache file; if the cache file is stale, recreates the metadata.
 * @param fs file system
 * @param metadataParentDir parent directory that holds metadata files
 * @param autoRefreshTriggered true if the auto-refresh is already triggered
 * @param readerConfig Parquet reader config
 * @return the metadata summary, or null if it cannot be obtained
 */
public static Metadata_V4.MetadataSummary getSummary(FileSystem fs, Path metadataParentDir, boolean autoRefreshTriggered, ParquetReaderConfig readerConfig) {
    Path summaryFile = getSummaryFileName(metadataParentDir);
    Path metadataDirFile = getDirFileName(metadataParentDir);
    MetadataContext metaContext = new MetadataContext();
    try {
        // If autoRefresh is not triggered and none of the metadata files exist
        if (!autoRefreshTriggered && !metadataExists(fs, metadataParentDir)) {
            logger.debug("Metadata doesn't exist in {}", metadataParentDir);
            return null;
        } else if (autoRefreshTriggered && !fs.exists(summaryFile)) {
            logger.debug("Metadata Summary file {} does not exist", summaryFile);
            return null;
        } else {
            // If the auto-refresh is not triggered, check if the cache file is stale and trigger auto-refresh
            if (!autoRefreshTriggered) {
                Metadata metadata = new Metadata(readerConfig);
                if (!fs.exists(metadataDirFile)) {
                    return null;
                }
                ParquetTableMetadataDirs metadataDirs = readMetadataDirs(fs, metadataDirFile, metaContext, readerConfig);
                if (metadata.tableModified(metadataDirs.getDirectories(), summaryFile, metadataParentDir, metaContext, fs)) {
                    ParquetTableMetadata_v4 parquetTableMetadata = (metadata.createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(summaryFile.getParent()), fs, true, null, true)).getLeft();
                    return parquetTableMetadata.getSummary();
                }
            }
            // Read the existing metadataSummary cache file to get the metadataSummary
            ObjectMapper mapper = new ObjectMapper();
            final SimpleModule serialModule = new SimpleModule();
            serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
            serialModule.addKeyDeserializer(ColumnTypeMetadata_v4.Key.class, new ColumnTypeMetadata_v4.Key.DeSerializer());
            AfterburnerModule module = new AfterburnerModule();
            module.setUseOptimizedBeanDeserializer(true);
            mapper.registerModule(serialModule);
            mapper.registerModule(module);
            mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
            try (InputStream is = fs.open(summaryFile)) {
                return mapper.readValue(is, Metadata_V4.MetadataSummary.class);
            }
        }
    } catch (IOException e) {
        logger.debug("Failed to read '{}' summary metadata file", summaryFile, e);
        return null;
    }
}
Also used : Path (org.apache.hadoop.fs.Path), SchemaPath (org.apache.drill.common.expression.SchemaPath), PathSerDe (org.apache.drill.exec.serialization.PathSerDe), ColumnTypeMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4), InputStream (java.io.InputStream), RowGroupMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata), FileMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.FileMetadata), ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata), ParquetFileAndRowCountMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ParquetTableMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4), IOException (java.io.IOException), AfterburnerModule (com.fasterxml.jackson.module.afterburner.AfterburnerModule), MetadataContext (org.apache.drill.exec.store.dfs.MetadataContext), MetadataSummary (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary), ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper), SimpleModule (com.fasterxml.jackson.databind.module.SimpleModule)
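
A minimal caller-side sketch, assuming an already-configured FileSystem, a table directory Path, and a ParquetReaderConfig are in scope; a null result mirrors the contract above (no usable summary):

Metadata_V4.MetadataSummary summary =
    Metadata.getSummary(fs, tableDir, false, readerConfig);
if (summary != null) {
    // safe to plan against the cached summary, e.g. its total row count
    long totalRows = summary.getTotalRowCount();
}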

Aggregations

SchemaPath (org.apache.drill.common.expression.SchemaPath): 7
LinkedHashMap (java.util.LinkedHashMap): 5
Metadata_V4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4): 4
TypeProtos (org.apache.drill.common.types.TypeProtos): 3
ColumnTypeMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4): 3
MetadataSummary (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary): 3
ParquetTableMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4): 3
Path (org.apache.hadoop.fs.Path): 3
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 2
SimpleModule (com.fasterxml.jackson.databind.module.SimpleModule): 2
AfterburnerModule (com.fasterxml.jackson.module.afterburner.AfterburnerModule): 2
IOException (java.io.IOException): 2
InputStream (java.io.InputStream): 2
MetadataBase (org.apache.drill.exec.store.parquet.metadata.MetadataBase): 2
ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata): 2
MetadataVersion (org.apache.drill.exec.store.parquet.metadata.MetadataVersion): 2
FileMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.FileMetadata): 2
ParquetFileAndRowCountMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata): 2
Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch): 2
OriginalType (org.apache.parquet.schema.OriginalType): 2