Search in sources :

Example 6 with FileSelection

Use of org.apache.drill.exec.store.dfs.FileSelection in the Apache Drill project.

This example shows the initFromMetadataCache method of the ParquetGroupScan class.

/**
 * Create and return a new file selection based on reading the metadata cache file.
 *
 * This function also initializes a few of ParquetGroupScan's fields as appropriate:
 * {@code parquetTableMetadata}, {@code fileSet} and {@code selectionRoot}.
 *
 * @param selection initial file selection
 * @param metaFilePath metadata cache file path
 * @return file selection read from cache
 *
 * @throws IOException if the metadata cache file cannot be read
 * @throws UserException when the updated selection is empty; this happens if the user selects an empty folder
 */
private FileSelection initFromMetadataCache(FileSelection selection, Path metaFilePath) throws IOException {
    // get (and set internal field) the metadata for the root directory by reading the metadata file.
    // parquetTableMetadata contains the metadata for all files in the selection root folder, but we need to make sure
    // we only select the files that are part of selection (by setting fileSet appropriately)
    this.parquetTableMetadata = Metadata.readBlockMeta(fs, metaFilePath.toString(), selection.getMetaContext(), formatConfig);
    if (formatConfig.autoCorrectCorruptDates) {
        ParquetReaderUtility.correctDatesInMetadataCache(this.parquetTableMetadata);
    }
    List<FileStatus> fileStatuses = selection.getStatuses(fs);
    if (fileStatuses.isEmpty()) {
        // fail early with the documented validation error; without this guard, the get(0)
        // below would throw a raw IndexOutOfBoundsException for an empty selection
        throw UserException.validationError().message("The table you tried to query is empty").build(logger);
    }
    if (fileSet == null) {
        fileSet = Sets.newHashSet();
    }
    final Path first = fileStatuses.get(0).getPath();
    if (fileStatuses.size() == 1 && selection.getSelectionRoot().equals(first.toString())) {
        // we are selecting all files from selection root. Expand the file list from the cache
        for (Metadata.ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            fileSet.add(file.getPath());
        }
    } else if (selection.isExpandedPartial() && !selection.hadWildcard() && cacheFileRoot != null) {
        if (selection.wasAllPartitionsPruned()) {
            // if all partitions were previously pruned, we only need to read 1 file (for the schema)
            fileSet.add(this.parquetTableMetadata.getFiles().get(0).getPath());
        } else {
            // second phase of partition pruning will apply on the files and modify the file selection appropriately.
            for (Metadata.ParquetFileMetadata file : this.parquetTableMetadata.getFiles()) {
                fileSet.add(file.getPath());
            }
        }
    } else {
        // we need to expand the files from fileStatuses
        for (FileStatus status : fileStatuses) {
            if (status.isDirectory()) {
                //TODO [DRILL-4496] read the metadata cache files in parallel
                final Path metaPath = new Path(status.getPath(), Metadata.METADATA_FILENAME);
                final Metadata.ParquetTableMetadataBase metadata = Metadata.readBlockMeta(fs, metaPath.toString(), selection.getMetaContext(), formatConfig);
                for (Metadata.ParquetFileMetadata file : metadata.getFiles()) {
                    fileSet.add(file.getPath());
                }
            } else {
                final Path path = Path.getPathWithoutSchemeAndAuthority(status.getPath());
                fileSet.add(path.toString());
            }
        }
    }
    if (fileSet.isEmpty()) {
        // no files were found, most likely we tried to query some empty sub folders
        throw UserException.validationError().message("The table you tried to query is empty").build(logger);
    }
    List<String> fileNames = Lists.newArrayList(fileSet);
    // when creating the file selection, set the selection root without the URI prefix
    // The reason is that the file names above have been created in the form
    // /a/b/c.parquet and the format of the selection root must match that of the file names
    // otherwise downstream operations such as partition pruning can break.
    final Path metaRootPath = Path.getPathWithoutSchemeAndAuthority(new Path(selection.getSelectionRoot()));
    this.selectionRoot = metaRootPath.toString();
    // Use the FileSelection constructor directly here instead of the FileSelection.create() method
    // because create() changes the root to include the scheme and authority; In future, if create()
    // is the preferred way to instantiate a file selection, we may need to do something different...
    // WARNING: file statuses and file names are inconsistent
    FileSelection newSelection = new FileSelection(selection.getStatuses(fs), fileNames, metaRootPath.toString(), cacheFileRoot, selection.wasAllPartitionsPruned());
    newSelection.setExpandedFully();
    newSelection.setMetaContext(selection.getMetaContext());
    return newSelection;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) ParquetFileMetadata(org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata) FileStatus(org.apache.hadoop.fs.FileStatus) ParquetTableMetadataBase(org.apache.drill.exec.store.parquet.Metadata.ParquetTableMetadataBase) ParquetFileMetadata(org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata) ColumnMetadata(org.apache.drill.exec.store.parquet.Metadata.ColumnMetadata) ParquetFileMetadata(org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata) RowGroupMetadata(org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata)

Example 7 with FileSelection

Use of org.apache.drill.exec.store.dfs.FileSelection in the Apache Drill project.

This example shows the createNewTableScanFromSelection method of the FileSystemPartitionDescriptor class.

/**
 * Builds a replacement table scan whose file selection reflects the pruned file list.
 *
 * @param oldScan the scan node being replaced
 * @param newFiles the pruned list of file paths to scan
 * @param cacheFileRoot root directory of the metadata cache file, if any
 * @param wasAllPartitionsPruned true if partition pruning eliminated every partition
 * @param metaContext metadata context carried over to the new selection
 * @return a DirPrunedEnumerableTableScan over the new, pruned selection
 */
private TableScan createNewTableScanFromSelection(EnumerableTableScan oldScan, List<String> newFiles, String cacheFileRoot, boolean wasAllPartitionsPruned, MetadataContext metaContext) {
    final RelOptTableImpl relOptTable = (RelOptTableImpl) oldScan.getTable();
    final FormatSelection origFormatSelection = (FormatSelection) table.getSelection();
    // build a fresh selection over the pruned file list, preserving the original directory status
    final FileSelection prunedSelection = new FileSelection(null, newFiles, getBaseTableLocation(), cacheFileRoot, wasAllPartitionsPruned, origFormatSelection.getSelection().getDirStatus());
    prunedSelection.setMetaContext(metaContext);
    final FormatSelection prunedFormatSelection = new FormatSelection(origFormatSelection.getFormat(), prunedSelection);
    final DynamicDrillTable dynamicTable = new DynamicDrillTable(table.getPlugin(), table.getStorageEngineName(), table.getUserName(), prunedFormatSelection);
    final DrillTranslatableTable translatableTable = new DrillTranslatableTable(dynamicTable);
    final RelOptTableImpl prunedOptTable = RelOptTableImpl.create(relOptTable.getRelOptSchema(), relOptTable.getRowType(), translatableTable);
    // return an EnumerableTableScan with fileSelection being part of digest of TableScan node.
    return DirPrunedEnumerableTableScan.create(oldScan.getCluster(), prunedOptTable, prunedSelection.toString());
}
Also used : FileSelection(org.apache.drill.exec.store.dfs.FileSelection) DrillTranslatableTable(org.apache.drill.exec.planner.logical.DrillTranslatableTable) DynamicDrillTable(org.apache.drill.exec.planner.logical.DynamicDrillTable) RelOptTableImpl(org.apache.calcite.prepare.RelOptTableImpl) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection)

Example 8 with FileSelection

Use of org.apache.drill.exec.store.dfs.FileSelection in the Apache Drill project.

This example shows the getFileLocationsAndStatus method of the FileSystemPartitionDescriptor class.

/**
 * Resolves the file locations for this scan plus whether the underlying selection
 * is only partially expanded.
 *
 * A GroupScan that already knows its files is preferred over the DrillTable's
 * selection, since the GroupScan holds the most up-to-date version.
 *
 * @return pair of (file locations, isExpandedPartial); locations may be null if the
 *         scan node type is unrecognized
 */
protected Pair<Collection<String>, Boolean> getFileLocationsAndStatus() {
    Collection<String> locations = null;
    boolean expandedPartial = false;
    if (scanRel instanceof DrillScanRel && ((DrillScanRel) scanRel).getGroupScan().hasFiles()) {
        // the GroupScan carries the updated selection, so take its file list directly;
        // a fully provided file list is never "expanded partial"
        locations = ((DrillScanRel) scanRel).getGroupScan().getFiles();
    } else if (scanRel instanceof DrillScanRel || scanRel instanceof EnumerableTableScan) {
        // fall back to the selection stored on the DrillTable
        final FileSelection selection = ((FormatSelection) table.getSelection()).getSelection();
        locations = selection.getFiles();
        expandedPartial = selection.isExpandedPartial();
    }
    return Pair.of(locations, expandedPartial);
}
Also used : FileSelection(org.apache.drill.exec.store.dfs.FileSelection) DrillScanRel(org.apache.drill.exec.planner.logical.DrillScanRel) EnumerableTableScan(org.apache.calcite.adapter.enumerable.EnumerableTableScan) DirPrunedEnumerableTableScan(org.apache.drill.exec.planner.logical.DirPrunedEnumerableTableScan) Collection(java.util.Collection) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection)

Aggregations

FileSelection (org.apache.drill.exec.store.dfs.FileSelection)8 SchemaPath (org.apache.drill.common.expression.SchemaPath)3 FormatSelection (org.apache.drill.exec.store.dfs.FormatSelection)3 IOException (java.io.IOException)2 EnumerableTableScan (org.apache.calcite.adapter.enumerable.EnumerableTableScan)2 FileGroupScan (org.apache.drill.exec.physical.base.FileGroupScan)2 DirPrunedEnumerableTableScan (org.apache.drill.exec.planner.logical.DirPrunedEnumerableTableScan)2 DrillScanRel (org.apache.drill.exec.planner.logical.DrillScanRel)2 ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath)2 ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata)2 RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata)2 Path (org.apache.hadoop.fs.Path)2 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 RelOptTableImpl (org.apache.calcite.prepare.RelOptTableImpl)1 ErrorCollector (org.apache.drill.common.expression.ErrorCollector)1 ErrorCollectorImpl (org.apache.drill.common.expression.ErrorCollectorImpl)1 LogicalExpression (org.apache.drill.common.expression.LogicalExpression)1 ParquetFilterPredicate (org.apache.drill.exec.expr.stat.ParquetFilterPredicate)1 DrillTranslatableTable (org.apache.drill.exec.planner.logical.DrillTranslatableTable)1