Search in sources:

Example 6 with Metadata_V4

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.

From the class Metadata, method getSummary.

/**
 * Reads the summary from the metadata cache file; if the cache file is stale, recreates
 * the metadata before returning its summary.
 *
 * @param fs file system
 * @param metadataParentDir parent directory that holds metadata files
 * @param autoRefreshTriggered true if the auto-refresh is already triggered
 * @param readerConfig Parquet reader config
 * @return the metadata summary, or {@code null} if it does not exist or cannot be read
 */
public static Metadata_V4.MetadataSummary getSummary(FileSystem fs, Path metadataParentDir, boolean autoRefreshTriggered, ParquetReaderConfig readerConfig) {
    Path summaryFile = getSummaryFileName(metadataParentDir);
    Path metadataDirFile = getDirFileName(metadataParentDir);
    MetadataContext metaContext = new MetadataContext();
    try {
        // If autoRefresh is not triggered and none of the metadata files exist
        if (!autoRefreshTriggered && !metadataExists(fs, metadataParentDir)) {
            logger.debug("Metadata doesn't exist in {}", metadataParentDir);
            return null;
        } else if (autoRefreshTriggered && !fs.exists(summaryFile)) {
            logger.debug("Metadata Summary file {} does not exist", summaryFile);
            return null;
        } else {
            // If the autorefresh is not triggered, check if the cache file is stale and trigger auto-refresh
            if (!autoRefreshTriggered) {
                Metadata metadata = new Metadata(readerConfig);
                if (!fs.exists(metadataDirFile)) {
                    return null;
                }
                ParquetTableMetadataDirs metadataDirs = readMetadataDirs(fs, metadataDirFile, metaContext, readerConfig);
                // FIX: removed redundant "&& true" from the staleness condition
                if (metadata.tableModified(metadataDirs.getDirectories(), summaryFile, metadataParentDir, metaContext, fs)) {
                    ParquetTableMetadata_v4 parquetTableMetadata = (metadata.createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(summaryFile.getParent()), fs, true, null, true)).getLeft();
                    return parquetTableMetadata.getSummary();
                }
            }
            // Read the existing metadataSummary cache file to get the metadataSummary
            ObjectMapper mapper = new ObjectMapper();
            final SimpleModule serialModule = new SimpleModule();
            serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
            serialModule.addKeyDeserializer(ColumnTypeMetadata_v4.Key.class, new ColumnTypeMetadata_v4.Key.DeSerializer());
            AfterburnerModule module = new AfterburnerModule();
            module.setUseOptimizedBeanDeserializer(true);
            mapper.registerModule(serialModule);
            mapper.registerModule(module);
            // Tolerate fields added by newer writers when reading an older cache file.
            mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
            // FIX: close the input stream (it was previously leaked)
            try (InputStream is = fs.open(summaryFile)) {
                return mapper.readValue(is, Metadata_V4.MetadataSummary.class);
            }
        }
    } catch (IOException e) {
        logger.debug("Failed to read '{}' summary metadata file", summaryFile, e);
        return null;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) PathSerDe(org.apache.drill.exec.serialization.PathSerDe) ColumnTypeMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4) InputStream(java.io.InputStream) RowGroupMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata) FileMetadata(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.FileMetadata) ParquetFileMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata) ParquetFileAndRowCountMetadata(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetTableMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4) IOException(java.io.IOException) AfterburnerModule(com.fasterxml.jackson.module.afterburner.AfterburnerModule) SchemaPath(org.apache.drill.common.expression.SchemaPath) MetadataContext(org.apache.drill.exec.store.dfs.MetadataContext) MetadataSummary(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) SimpleModule(com.fasterxml.jackson.databind.module.SimpleModule)

Example 7 with Metadata_V4

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.

From the class Metadata, method getParquetTableMetadata.

/**
 * Builds the parquet table metadata for a collection of parquet files.
 *
 * @param fileStatusMap file statuses and corresponding file systems
 * @return parquet table metadata object
 * @throws IOException if parquet file metadata can't be obtained
 */
private ParquetTableMetadata_v4 getParquetTableMetadata(Map<FileStatus, FileSystem> fileStatusMap) throws IOException {
    Metadata_V4.MetadataSummary summary = new Metadata_V4.MetadataSummary(SUPPORTED_VERSIONS.last().toString(), DrillVersionInfo.getVersion(), new ArrayList<>(), true);
    ParquetTableMetadata_v4 tableMetadata = new ParquetTableMetadata_v4(summary);
    // Gather the per-file metadata (paired with row counts) for every file in the map.
    List<ParquetFileAndRowCountMetadata> fileAndRowCountMetadata = getParquetFileMetadata_v4(tableMetadata, fileStatusMap, true, null);
    // Keep only the per-file portion of each pair before attaching it to the table metadata.
    List<ParquetFileMetadata_v4> fileMetadata = new ArrayList<>(fileAndRowCountMetadata.size());
    fileAndRowCountMetadata.forEach(entry -> fileMetadata.add(entry.getFileMetadata()));
    tableMetadata.assignFiles(fileMetadata);
    return tableMetadata;
}
Also used : ParquetFileAndRowCountMetadata(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata) MetadataSummary(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary) ArrayList(java.util.ArrayList) ParquetTableMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4) MetadataSummary(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary) ParquetFileMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)

Example 8 with Metadata_V4

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.

From the class Metadata, method getParquetTableMetadata (directory variant).

/**
 * Get the parquet metadata for the parquet files in a directory.
 *
 * @param path the path of the directory (or of a single parquet file)
 * @param fs file system the path belongs to
 * @return metadata object for an entire parquet directory structure
 * @throws IOException in case of problems during accessing files
 */
private ParquetTableMetadata_v4 getParquetTableMetadata(Path path, FileSystem fs) throws IOException {
    FileStatus rootStatus = fs.getFileStatus(path);
    // Timing is collected only when debug logging is on.
    Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
    List<FileStatus> statuses = new ArrayList<>();
    if (rootStatus.isFile()) {
        // A single parquet file: no directory walk needed.
        statuses.add(rootStatus);
    } else {
        // A directory: recursively collect every file underneath it.
        statuses.addAll(DrillFileSystemUtil.listFiles(fs, path, true));
    }
    if (timer != null) {
        logger.debug("Took {} ms to get file statuses", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.reset();
        timer.start();
    }
    // All files share the same file system; LinkedHashMap preserves discovery order.
    Map<FileStatus, FileSystem> statusMap = statuses.stream().collect(java.util.stream.Collectors.toMap(Function.identity(), status -> fs, (first, second) -> second, LinkedHashMap::new));
    ParquetTableMetadata_v4 tableMetadata = getParquetTableMetadata(statusMap);
    if (timer != null) {
        logger.debug("Took {} ms to read file metadata", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
    }
    return tableMetadata;
}
Also used : TimedCallable(org.apache.drill.exec.store.TimedCallable) ParquetTableMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4) FileSystem(org.apache.hadoop.fs.FileSystem) LoggerFactory(org.slf4j.LoggerFactory) FileStatus(org.apache.hadoop.fs.FileStatus) DeserializationFeature(com.fasterxml.jackson.databind.DeserializationFeature) SimpleModule(com.fasterxml.jackson.databind.module.SimpleModule) Pair(org.apache.commons.lang3.tuple.Pair) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) SHORT_PREFIX_STYLE(org.apache.commons.lang3.builder.ToStringStyle.SHORT_PREFIX_STYLE) ParquetTableMetadataBase(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetTableMetadataBase) Collectors(org.apache.drill.common.collections.Collectors) SchemaPath(org.apache.drill.common.expression.SchemaPath) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) PrivilegedExceptionAction(java.security.PrivilegedExceptionAction) RowGroupMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.RowGroupMetadata) AfterburnerModule(com.fasterxml.jackson.module.afterburner.AfterburnerModule) DrillFileSystemUtil(org.apache.drill.exec.util.DrillFileSystemUtil) List(java.util.List) FileMetadata(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.FileMetadata) ToStringBuilder(org.apache.commons.lang3.builder.ToStringBuilder) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) ParquetFileMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4) ParquetReaderConfig(org.apache.drill.exec.store.parquet.ParquetReaderConfig) ParquetFileMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata) ParquetFileAndRowCountMetadata(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata) 
Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) ImpersonationUtil(org.apache.drill.exec.util.ImpersonationUtil) PathSerDe(org.apache.drill.exec.serialization.PathSerDe) ColumnMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnMetadata_v4) OutputStream(java.io.OutputStream) DrillVersionInfo(org.apache.drill.common.util.DrillVersionInfo) Logger(org.slf4j.Logger) JsonParser(com.fasterxml.jackson.core.JsonParser) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) SUPPORTED_VERSIONS(org.apache.drill.exec.store.parquet.metadata.MetadataVersion.Constants.SUPPORTED_VERSIONS) IOException(java.io.IOException) MetadataContext(org.apache.drill.exec.store.dfs.MetadataContext) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) TimeUnit(java.util.concurrent.TimeUnit) JsonFactory(com.fasterxml.jackson.core.JsonFactory) Feature(com.fasterxml.jackson.core.JsonGenerator.Feature) Lists(org.apache.drill.shaded.guava.com.google.common.collect.Lists) ColumnTypeMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) MetadataSummary(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary) InputStream(java.io.InputStream) FileStatus(org.apache.hadoop.fs.FileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) ArrayList(java.util.ArrayList) ParquetTableMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4)

Example 9 with Metadata_V4

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.

From the class MetadataPathUtils, method convertToFilesWithAbsolutePaths.

/**
 * Convert a list of files with relative paths to files with absolute ones.
 *
 * @param files list of files with relative paths
 * @param baseDir base parent directory
 * @return list of files with absolute paths
 */
public static List<? extends ParquetFileMetadata> convertToFilesWithAbsolutePaths(List<? extends ParquetFileMetadata> files, String baseDir) {
    // Nothing to convert: hand back the original list unchanged.
    if (files.isEmpty()) {
        return files;
    }
    List<ParquetFileMetadata> converted = new ArrayList<>();
    for (ParquetFileMetadata file : files) {
        Path filePath = file.getPath();
        ParquetFileMetadata absoluteFile = null;
        // Rebuild the entry against baseDir only when the stored path is relative;
        // entries that already hold absolute paths are reused as-is.
        if (file instanceof ParquetFileMetadata_v4) {
            absoluteFile = filePath.isAbsolute() ? file : new ParquetFileMetadata_v4(new Path(baseDir, filePath), file.getLength(), (List<Metadata_V4.RowGroupMetadata_v4>) file.getRowGroups());
        } else if (file instanceof ParquetFileMetadata_v3) {
            absoluteFile = filePath.isAbsolute() ? file : new ParquetFileMetadata_v3(new Path(baseDir, filePath), file.getLength(), (List<Metadata_V3.RowGroupMetadata_v3>) file.getRowGroups());
        }
        // NOTE(review): an unrecognized metadata version yields a null list entry,
        // matching the original behavior — confirm whether that is intended upstream.
        converted.add(absoluteFile);
    }
    return converted;
}
Also used : Path(org.apache.hadoop.fs.Path) ParquetFileMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) ParquetFileMetadata_v3(org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ParquetFileMetadata_v3) ParquetFileMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)

Example 10 with Metadata_V4

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.

From the class ConvertCountToDirectScanRule, method onMatch.

/**
 * Rewrites an eligible COUNT aggregate over a Parquet scan into a direct scan that
 * reads the pre-computed counts from the metadata summary, avoiding a full table scan.
 * Bails out (no transform) whenever any eligibility check fails.
 */
@Override
public void onMatch(RelOptRuleCall call) {
    final Aggregate agg = call.rel(0);
    final TableScan scan = call.rel(call.rels.length - 1);
    // A Project is present only in the 3-rel match pattern (Aggregate-Project-Scan).
    final Project project = call.rels.length == 3 ? (Project) call.rel(1) : null;
    // Rule applies only to plain COUNTs: no GROUP BY and no DISTINCT.
    if (agg.getGroupCount() > 0 || agg.containsDistinctCall()) {
        return;
    }
    DrillTable drillTable = DrillRelOptUtil.getDrillTable(scan);
    if (drillTable == null) {
        logger.debug("Rule does not apply since an eligible drill table instance was not found.");
        return;
    }
    Object selection = drillTable.getSelection();
    if (!(selection instanceof FormatSelection)) {
        logger.debug("Rule does not apply since only Parquet file format is eligible.");
        return;
    }
    PlannerSettings settings = call.getPlanner().getContext().unwrap(PlannerSettings.class);
    // Rule is applicable only if the statistics for row count and null count are available from the metadata,
    FormatSelection formatSelection = (FormatSelection) selection;
    // Rule cannot be applied if the selection had wildcard since the totalrowcount cannot be read from the parent directory
    if (formatSelection.getSelection().hadWildcard()) {
        logger.debug("Rule does not apply when there is a wild card since the COUNT could not be determined from metadata.");
        return;
    }
    // Left of the pair says whether usable summary metadata was found; right is the summary itself.
    Pair<Boolean, Metadata_V4.MetadataSummary> status = checkMetadataForScanStats(settings, drillTable, formatSelection);
    if (!status.getLeft()) {
        logger.debug("Rule does not apply since MetadataSummary metadata was not found.");
        return;
    }
    Metadata_V4.MetadataSummary metadataSummary = status.getRight();
    // Map of output column name -> count value derived from the summary.
    Map<String, Long> result = collectCounts(settings, metadataSummary, agg, scan, project);
    logger.trace("Calculated the following aggregate counts: {}", result);
    // if counts could not be determined, rule won't be applied
    if (result.isEmpty()) {
        logger.debug("Rule does not apply since one or more COUNTs could not be determined from metadata.");
        return;
    }
    Path summaryFileName = Metadata.getSummaryFileName(formatSelection.getSelection().getSelectionRoot());
    // Build a single-row reader whose columns are the computed counts.
    final RelDataType scanRowType = CountToDirectScanUtils.constructDataType(agg, result.keySet());
    final DynamicPojoRecordReader<Long> reader = new DynamicPojoRecordReader<>(CountToDirectScanUtils.buildSchema(scanRowType.getFieldNames()), Collections.singletonList(new ArrayList<>(result.values())));
    final ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, 1, 1, scanRowType.getFieldCount());
    final MetadataDirectGroupScan directScan = new MetadataDirectGroupScan(reader, summaryFileName, 1, scanStats, true, false);
    final DrillDirectScanRel newScan = new DrillDirectScanRel(scan.getCluster(), scan.getTraitSet().plus(DrillRel.DRILL_LOGICAL), directScan, scanRowType);
    // Project on top of the direct scan reproduces the aggregate's original row type.
    final DrillProjectRel newProject = new DrillProjectRel(agg.getCluster(), agg.getTraitSet().plus(DrillRel.DRILL_LOGICAL), newScan, CountToDirectScanUtils.prepareFieldExpressions(scanRowType), agg.getRowType());
    call.transformTo(newProject);
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) TableScan(org.apache.calcite.rel.core.TableScan) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) DynamicPojoRecordReader(org.apache.drill.exec.store.pojo.DynamicPojoRecordReader) ArrayList(java.util.ArrayList) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) RelDataType(org.apache.calcite.rel.type.RelDataType) Project(org.apache.calcite.rel.core.Project) MetadataDirectGroupScan(org.apache.drill.exec.store.direct.MetadataDirectGroupScan) Metadata_V4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4) Aggregate(org.apache.calcite.rel.core.Aggregate) ScanStats(org.apache.drill.exec.physical.base.ScanStats)

Aggregations

SchemaPath (org.apache.drill.common.expression.SchemaPath)11 Path (org.apache.hadoop.fs.Path)8 LinkedHashMap (java.util.LinkedHashMap)7 Metadata_V4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4)7 ArrayList (java.util.ArrayList)6 IOException (java.io.IOException)5 ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata)5 MetadataSummary (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary)5 ParquetTableMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4)5 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)4 SimpleModule (com.fasterxml.jackson.databind.module.SimpleModule)4 ColumnTypeMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4)4 FileMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.FileMetadata)4 ParquetFileAndRowCountMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata)4 ParquetFileMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)4 Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch)4 AfterburnerModule (com.fasterxml.jackson.module.afterburner.AfterburnerModule)3 InputStream (java.io.InputStream)3 TypeProtos (org.apache.drill.common.types.TypeProtos)3 PathSerDe (org.apache.drill.exec.serialization.PathSerDe)3