
Example 1 with ColumnExplorer

use of org.apache.drill.exec.store.ColumnExplorer in project drill by axbaretto.

the class ParquetScanBatchCreator method getBatch.

@SuppressWarnings("resource")
@Override
public ScanBatch getBatch(ExecutorFragmentContext context, ParquetRowGroupScan rowGroupScan, List<RecordBatch> children) throws ExecutionSetupException {
    Preconditions.checkArgument(children.isEmpty());
    OperatorContext oContext = context.newOperatorContext(rowGroupScan);
    final ColumnExplorer columnExplorer = new ColumnExplorer(context.getOptions(), rowGroupScan.getColumns());
    if (!columnExplorer.isStarQuery()) {
        rowGroupScan = new ParquetRowGroupScan(rowGroupScan.getUserName(), rowGroupScan.getStorageEngine(), rowGroupScan.getRowGroupReadEntries(), columnExplorer.getTableColumns(), rowGroupScan.getSelectionRoot(), rowGroupScan.getFilter());
        rowGroupScan.setOperatorId(rowGroupScan.getOperatorId());
    }
    DrillFileSystem fs;
    try {
        boolean useAsyncPageReader = context.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_ASYNC).bool_val;
        if (useAsyncPageReader) {
            fs = oContext.newNonTrackingFileSystem(rowGroupScan.getStorageEngine().getFsConf());
        } else {
            fs = oContext.newFileSystem(rowGroupScan.getStorageEngine().getFsConf());
        }
    } catch (IOException e) {
        throw new ExecutionSetupException(String.format("Failed to create DrillFileSystem: %s", e.getMessage()), e);
    }
    Configuration conf = new Configuration(fs.getConf());
    conf.setBoolean(ENABLE_BYTES_READ_COUNTER, false);
    conf.setBoolean(ENABLE_BYTES_TOTAL_COUNTER, false);
    conf.setBoolean(ENABLE_TIME_READ_COUNTER, false);
    // keep footers in a map to avoid re-reading them
    Map<String, ParquetMetadata> footers = Maps.newHashMap();
    List<RecordReader> readers = new LinkedList<>();
    List<Map<String, String>> implicitColumns = Lists.newArrayList();
    Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
    for (RowGroupReadEntry e : rowGroupScan.getRowGroupReadEntries()) {
        /*
      Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file.
      TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine),
      we should add more information to the RowGroupInfo that will be populated upon the first read to
      provide the reader with all of the file metadata it needs.
      These fields will be added to the constructor below.
      */
        try {
            Stopwatch timer = Stopwatch.createUnstarted();
            if (!footers.containsKey(e.getPath())) {
                timer.start();
                ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(e.getPath()));
                long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
                logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", e.getPath(), "", 0, 0, 0, timeToRead);
                footers.put(e.getPath(), footer);
            }
            boolean autoCorrectCorruptDates = rowGroupScan.getFormatConfig().areCorruptDatesAutoCorrected();
            ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footers.get(e.getPath()), rowGroupScan.getColumns(), autoCorrectCorruptDates);
            if (logger.isDebugEnabled()) {
                logger.debug(containsCorruptDates.toString());
            }
            if (!context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER) && !isComplex(footers.get(e.getPath()))) {
                readers.add(new ParquetRecordReader(context, e.getPath(), e.getRowGroupIndex(), e.getNumRecordsToRead(), fs, CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0), footers.get(e.getPath()), rowGroupScan.getColumns(), containsCorruptDates));
            } else {
                ParquetMetadata footer = footers.get(e.getPath());
                readers.add(new DrillParquetReader(context, footer, e, columnExplorer.getTableColumns(), fs, containsCorruptDates));
            }
            Map<String, String> implicitValues = columnExplorer.populateImplicitColumns(e, rowGroupScan.getSelectionRoot());
            implicitColumns.add(implicitValues);
            if (implicitValues.size() > mapWithMaxColumns.size()) {
                mapWithMaxColumns = implicitValues;
            }
        } catch (IOException e1) {
            throw new ExecutionSetupException(e1);
        }
    }
    // all readers should have the same number of implicit columns, add missing ones with value null
    Map<String, String> diff = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
        map.putAll(Maps.difference(map, diff).entriesOnlyOnRight());
    }
    return new ScanBatch(context, oContext, readers, implicitColumns);
}
Also used: ExecutionSetupException (org.apache.drill.common.exceptions.ExecutionSetupException), Configuration (org.apache.hadoop.conf.Configuration), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ParquetRecordReader (org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader), RecordReader (org.apache.drill.exec.store.RecordReader), Stopwatch (com.google.common.base.Stopwatch), DrillFileSystem (org.apache.drill.exec.store.dfs.DrillFileSystem), OperatorContext (org.apache.drill.exec.ops.OperatorContext), ScanBatch (org.apache.drill.exec.physical.impl.ScanBatch), Path (org.apache.hadoop.fs.Path), DrillParquetReader (org.apache.drill.exec.store.parquet2.DrillParquetReader), IOException (java.io.IOException), LinkedList (java.util.LinkedList), ColumnExplorer (org.apache.drill.exec.store.ColumnExplorer), Map (java.util.Map)
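
The padding step at the end of getBatch (the Maps.transformValues / Maps.difference pair) is easy to miss. Below is a minimal, self-contained sketch of the same technique, assuming only Guava on the classpath; the reader maps and file names are made up for illustration.

import com.google.common.base.Functions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import java.util.List;
import java.util.Map;

public class ImplicitColumnPadding {
  public static void main(String[] args) {
    // One implicit-column map per reader; the second reader carries an extra partition column.
    Map<String, String> readerA = Maps.newLinkedHashMap();
    readerA.put("filename", "a.parquet");

    Map<String, String> readerB = Maps.newLinkedHashMap();
    readerB.put("filename", "b.parquet");
    readerB.put("dir0", "2018");

    List<Map<String, String>> implicitColumns = Lists.newArrayList(readerA, readerB);
    Map<String, String> mapWithMaxColumns = readerB;   // the widest map becomes the template

    // Template view with every key mapped to null, then copy over only the keys a map is missing.
    Map<String, String> diff = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
      map.putAll(Maps.difference(map, diff).entriesOnlyOnRight());
    }

    System.out.println(implicitColumns);   // readerA now contains dir0=null as well
  }
}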

Example 2 with ColumnExplorer

use of org.apache.drill.exec.store.ColumnExplorer in project drill by apache.

the class ParquetTableMetadataUtils method addImplicitColumnsStatistics.

/**
 * Creates a new map based on the specified {@code columnsStatistics} with added statistics
 * for implicit and partition (dir) columns.
 *
 * @param columnsStatistics           map of column statistics to expand
 * @param columns                     list of all columns including implicit or partition ones
 * @param partitionValues             list of partition values
 * @param optionManager               option manager
 * @param location                    location of metadata part
 * @param supportsFileImplicitColumns whether implicit columns are supported
 * @return map with added statistics for implicit and partition (dir) columns
 */
public static Map<SchemaPath, ColumnStatistics<?>> addImplicitColumnsStatistics(Map<SchemaPath, ColumnStatistics<?>> columnsStatistics, List<SchemaPath> columns, List<String> partitionValues, OptionManager optionManager, Path location, boolean supportsFileImplicitColumns) {
    ColumnExplorer columnExplorer = new ColumnExplorer(optionManager, columns);
    Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(location, partitionValues, supportsFileImplicitColumns);
    columnsStatistics = new HashMap<>(columnsStatistics);
    for (Map.Entry<String, String> partitionValue : implicitColValues.entrySet()) {
        columnsStatistics.put(SchemaPath.getCompoundPath(partitionValue.getKey()), StatisticsProvider.getConstantColumnStatistics(partitionValue.getValue(), TypeProtos.MinorType.VARCHAR));
    }
    return columnsStatistics;
}
Also used: ColumnExplorer (org.apache.drill.exec.store.ColumnExplorer), Map (java.util.Map), HashMap (java.util.HashMap), LinkedHashMap (java.util.LinkedHashMap)
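
The method above copies the incoming map and then adds one constant-valued entry per implicit or partition column (keyed via SchemaPath.getCompoundPath, valued via StatisticsProvider.getConstantColumnStatistics as VARCHAR). The following plain-Java analogue shows just that copy-then-augment flow with hypothetical column and partition names; it deliberately avoids the Drill statistics types.

import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

public class AugmentStatistics {
  /** Returns a copy of baseStats extended with one constant entry per implicit/partition column. */
  static Map<String, String> addImplicitColumns(Map<String, String> baseStats,
                                                Map<String, String> implicitColValues) {
    Map<String, String> result = new HashMap<>(baseStats);   // never mutate the caller's map
    for (Map.Entry<String, String> e : implicitColValues.entrySet()) {
      result.put(e.getKey(), e.getValue());                  // constant value per dir/implicit column
    }
    return result;
  }

  public static void main(String[] args) {
    Map<String, String> stats = new LinkedHashMap<>();
    stats.put("o_orderkey", "min=1, max=6000000");           // hypothetical column statistics
    Map<String, String> implicitCols = new LinkedHashMap<>();
    implicitCols.put("dir0", "2023");                        // hypothetical partition value
    implicitCols.put("fqn", "/warehouse/orders/2023/0_0_0.parquet");
    System.out.println(addImplicitColumns(stats, implicitCols));
  }
}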

Example 3 with ColumnExplorer

use of org.apache.drill.exec.store.ColumnExplorer in project drill by axbaretto.

the class ParquetGroupScan method applyFilter.

public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
    if (rowGroupInfos.size() == 1 || !(parquetTableMetadata.isRowGroupPrunable()) || rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
        // Stop pruning in three cases:
        //   - there is only a single row group,
        //   - the Parquet table metadata does not support row group level filter pruning,
        //   - # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
        return null;
    }
    final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
    final List<RowGroupInfo> qualifiedRGs = new ArrayList<>(rowGroupInfos.size());
    // A HashSet keeps file names unique.
    Set<String> qualifiedFileNames = Sets.newHashSet();
    ParquetFilterPredicate filterPredicate = null;
    for (RowGroupInfo rowGroup : rowGroupInfos) {
        final ColumnExplorer columnExplorer = new ColumnExplorer(optionManager, this.columns);
        Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(rowGroup.getPath(), selectionRoot);
        ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
        Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
        if (filterPredicate == null) {
            ErrorCollector errorCollector = new ErrorCollectorImpl();
            LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
            if (errorCollector.hasErrors()) {
                logger.error("{} error(s) encountered when materialize filter expression : {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
                return null;
            }
            // logger.debug("materializedFilter : {}", ExpressionStringBuilder.toString(materializedFilter));
            Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
            filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
            if (filterPredicate == null) {
                return null;
            }
        }
        if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
            continue;
        }
        qualifiedRGs.add(rowGroup);
        // TODO : optimize when 1 file contains m row groups.
        qualifiedFileNames.add(rowGroup.getPath());
    }
    if (qualifiedRGs.size() == rowGroupInfos.size()) {
        // There is no reduction of rowGroups. Return the original groupScan.
        logger.debug("applyFilter does not have any pruning!");
        return null;
    } else if (qualifiedFileNames.size() == 0) {
        logger.warn("All rowgroups have been filtered out. Add back one to get schema from scannner");
        RowGroupInfo rg = rowGroupInfos.iterator().next();
        qualifiedFileNames.add(rg.getPath());
        qualifiedRGs.add(rg);
    }
    try {
        FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
        logger.info("applyFilter {} reduce parquet rowgroup # from {} to {}", ExpressionStringBuilder.toString(filterExpr), rowGroupInfos.size(), qualifiedRGs.size());
        ParquetGroupScan clonegroupscan = this.clone(newSelection);
        clonegroupscan.rowGroupInfos = qualifiedRGs;
        clonegroupscan.updatePartitionColTypeMap();
        return clonegroupscan;
    } catch (IOException e) {
        logger.warn("Could not apply filter prune due to Exception : {}", e);
        return null;
    }
}
Also used: ColumnStatistics (org.apache.drill.exec.store.parquet.stat.ColumnStatistics), FileSelection (org.apache.drill.exec.store.dfs.FileSelection), ArrayList (java.util.ArrayList), ErrorCollector (org.apache.drill.common.expression.ErrorCollector), IOException (java.io.IOException), ErrorCollectorImpl (org.apache.drill.common.expression.ErrorCollectorImpl), LogicalExpression (org.apache.drill.common.expression.LogicalExpression), ColumnExplorer (org.apache.drill.exec.store.ColumnExplorer), SchemaPath (org.apache.drill.common.expression.SchemaPath), ParquetFilterPredicate (org.apache.drill.exec.expr.stat.ParquetFilterPredicate), ParquetMetaStatCollector (org.apache.drill.exec.store.parquet.stat.ParquetMetaStatCollector)
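
The fallback at the end of applyFilter (put one row group back when everything was pruned, so the scanner can still produce a schema) is a generic pattern worth isolating. Here is a small, self-contained sketch of it with a dummy droppability predicate; this is an illustration of the pattern, not the Drill implementation.

import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;

public class PruneWithFallback {
  /** Keeps the elements for which canDrop is false; never returns an empty list. */
  static <T> List<T> prune(List<T> rowGroups, Predicate<T> canDrop) {
    List<T> qualified = new ArrayList<>(rowGroups.size());
    for (T rg : rowGroups) {
      if (!canDrop.test(rg)) {
        qualified.add(rg);
      }
    }
    if (qualified.isEmpty() && !rowGroups.isEmpty()) {
      // Everything was filtered out: add one row group back so a downstream reader
      // can still derive the schema, mirroring the "Add back one" branch above.
      qualified.add(rowGroups.get(0));
    }
    return qualified;
  }

  public static void main(String[] args) {
    List<Integer> rowCounts = List.of(0, 0, 0);          // pretend every group is droppable
    System.out.println(prune(rowCounts, rc -> rc == 0)); // -> [0]
  }
}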

Example 4 with ColumnExplorer

use of org.apache.drill.exec.store.ColumnExplorer in project drill by axbaretto.

the class EasyFormatPlugin method getReaderBatch.

@SuppressWarnings("resource")
CloseableRecordBatch getReaderBatch(FragmentContext context, EasySubScan scan) throws ExecutionSetupException {
    final ColumnExplorer columnExplorer = new ColumnExplorer(context.getOptions(), scan.getColumns());
    if (!columnExplorer.isStarQuery()) {
        scan = new EasySubScan(scan.getUserName(), scan.getWorkUnits(), scan.getFormatPlugin(), columnExplorer.getTableColumns(), scan.getSelectionRoot());
        scan.setOperatorId(scan.getOperatorId());
    }
    OperatorContext oContext = context.newOperatorContext(scan);
    final DrillFileSystem dfs;
    try {
        dfs = oContext.newFileSystem(fsConf);
    } catch (IOException e) {
        throw new ExecutionSetupException(String.format("Failed to create FileSystem: %s", e.getMessage()), e);
    }
    List<RecordReader> readers = new LinkedList<>();
    List<Map<String, String>> implicitColumns = Lists.newArrayList();
    Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
    for (FileWork work : scan.getWorkUnits()) {
        RecordReader recordReader = getRecordReader(context, dfs, work, scan.getColumns(), scan.getUserName());
        readers.add(recordReader);
        Map<String, String> implicitValues = columnExplorer.populateImplicitColumns(work, scan.getSelectionRoot());
        implicitColumns.add(implicitValues);
        if (implicitValues.size() > mapWithMaxColumns.size()) {
            mapWithMaxColumns = implicitValues;
        }
    }
    // all readers should have the same number of implicit columns, add missing ones with value null
    Map<String, String> diff = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
        map.putAll(Maps.difference(map, diff).entriesOnlyOnRight());
    }
    return new ScanBatch(context, oContext, readers, implicitColumns);
}
Also used : ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) RecordReader(org.apache.drill.exec.store.RecordReader) CompleteFileWork(org.apache.drill.exec.store.schedule.CompleteFileWork) IOException(java.io.IOException) LinkedList(java.util.LinkedList) ColumnExplorer(org.apache.drill.exec.store.ColumnExplorer) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) OperatorContext(org.apache.drill.exec.ops.OperatorContext) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) Map(java.util.Map)
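
Both getReaderBatch above and getBatch in Example 1 open with the same projection check: if the query is not a star query, the scan is rebuilt with only the concrete table columns. A hedged helper that isolates that decision is sketched below; it uses only ColumnExplorer calls that appear in these examples (the constructor, isStarQuery, getTableColumns), while the helper itself and the OptionManager import are assumptions for illustration, not Drill API.

import java.util.List;

import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.exec.server.options.OptionManager;  // assumed import for illustration
import org.apache.drill.exec.store.ColumnExplorer;

public class ProjectionHelper {
  /**
   * Hypothetical helper: for a star query return the requested columns unchanged,
   * otherwise return the table columns derived by ColumnExplorer (which separates
   * implicit columns such as filename or dir0 from real table columns), as the
   * rebuilt scan constructors in the examples expect.
   */
  static List<SchemaPath> effectiveProjection(OptionManager options, List<SchemaPath> requested) {
    ColumnExplorer columnExplorer = new ColumnExplorer(options, requested);
    return columnExplorer.isStarQuery() ? requested : columnExplorer.getTableColumns();
  }
}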

Example 5 with ColumnExplorer

use of org.apache.drill.exec.store.ColumnExplorer in project drill by apache.

the class EasyFormatPlugin method buildScanBatch.

/**
 * Use the original scanner based on the {@link RecordReader} interface.
 * Requires that the storage plugin roll its own solutions for null columns.
 * Is not able to limit vector or batch sizes. Retained for backward
 * compatibility with "classic" format plugins which have not yet been
 * upgraded to use the new framework.
 */
private CloseableRecordBatch buildScanBatch(FragmentContext context, EasySubScan scan) throws ExecutionSetupException {
    final ColumnExplorer columnExplorer = new ColumnExplorer(context.getOptions(), scan.getColumns());
    if (!columnExplorer.isStarQuery()) {
        scan = new EasySubScan(scan.getUserName(), scan.getWorkUnits(), scan.getFormatPlugin(), columnExplorer.getTableColumns(), scan.getSelectionRoot(), scan.getPartitionDepth(), scan.getSchema(), scan.getMaxRecords());
        scan.setOperatorId(scan.getOperatorId());
    }
    final OperatorContext oContext = context.newOperatorContext(scan);
    final DrillFileSystem dfs;
    try {
        dfs = oContext.newFileSystem(easyConfig().fsConf);
    } catch (final IOException e) {
        throw new ExecutionSetupException(String.format("Failed to create FileSystem: %s", e.getMessage()), e);
    }
    final List<RecordReader> readers = new LinkedList<>();
    final List<Map<String, String>> implicitColumns = Lists.newArrayList();
    Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
    final boolean supportsFileImplicitColumns = scan.getSelectionRoot() != null;
    for (final FileWork work : scan.getWorkUnits()) {
        final RecordReader recordReader = getRecordReader(context, dfs, work, scan.getColumns(), scan.getUserName());
        readers.add(recordReader);
        final List<String> partitionValues = ColumnExplorer.listPartitionValues(work.getPath(), scan.getSelectionRoot(), false);
        final Map<String, String> implicitValues = columnExplorer.populateColumns(work.getPath(), partitionValues, supportsFileImplicitColumns, dfs);
        implicitColumns.add(implicitValues);
        if (implicitValues.size() > mapWithMaxColumns.size()) {
            mapWithMaxColumns = implicitValues;
        }
    }
    // all readers should have the same number of implicit columns, add missing ones with value null
    final Map<String, String> diff = Maps.transformValues(mapWithMaxColumns, Functions.constant(null));
    for (final Map<String, String> map : implicitColumns) {
        map.putAll(Maps.difference(map, diff).entriesOnlyOnRight());
    }
    return new ScanBatch(context, oContext, readers, implicitColumns);
}
Also used: ExecutionSetupException (org.apache.drill.common.exceptions.ExecutionSetupException), RecordReader (org.apache.drill.exec.store.RecordReader), CompleteFileWork (org.apache.drill.exec.store.schedule.CompleteFileWork), IOException (java.io.IOException), LinkedList (java.util.LinkedList), ColumnExplorer (org.apache.drill.exec.store.ColumnExplorer), DrillFileSystem (org.apache.drill.exec.store.dfs.DrillFileSystem), OperatorContext (org.apache.drill.exec.ops.OperatorContext), ScanBatch (org.apache.drill.exec.physical.impl.ScanBatch), Map (java.util.Map)
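
Example 5 differs from Example 4 in that it derives the partition values explicitly, by relativizing the file path against the selection root (ColumnExplorer.listPartitionValues) before handing them to populateColumns. The following simplified, self-contained illustration shows the idea with a hypothetical directory layout; it is not the listPartitionValues implementation.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class PartitionValues {
  /** Directory names between the selection root and the file, in order: these become dir0, dir1, ... */
  static List<String> listPartitionValues(String filePath, String selectionRoot) {
    String relative = filePath.startsWith(selectionRoot)
        ? filePath.substring(selectionRoot.length())
        : filePath;
    List<String> parts = new ArrayList<>(Arrays.asList(relative.split("/")));
    parts.removeIf(String::isEmpty);
    if (!parts.isEmpty()) {
      parts.remove(parts.size() - 1);   // drop the file name itself
    }
    return parts;
  }

  public static void main(String[] args) {
    // Hypothetical layout: /data/orders/2023/11/0_0_0.parquet under selection root /data/orders
    System.out.println(listPartitionValues("/data/orders/2023/11/0_0_0.parquet", "/data/orders"));
    // -> [2023, 11]
  }
}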

Aggregations

ColumnExplorer (org.apache.drill.exec.store.ColumnExplorer): 6 usages
IOException (java.io.IOException): 5 usages
Map (java.util.Map): 5 usages
LinkedList (java.util.LinkedList): 4 usages
ExecutionSetupException (org.apache.drill.common.exceptions.ExecutionSetupException): 4 usages
ScanBatch (org.apache.drill.exec.physical.impl.ScanBatch): 4 usages
DrillFileSystem (org.apache.drill.exec.store.dfs.DrillFileSystem): 4 usages
OperatorContext (org.apache.drill.exec.ops.OperatorContext): 3 usages
RecordReader (org.apache.drill.exec.store.RecordReader): 3 usages
ArrayList (java.util.ArrayList): 2 usages
HashMap (java.util.HashMap): 2 usages
LinkedHashMap (java.util.LinkedHashMap): 2 usages
LogicalExpression (org.apache.drill.common.expression.LogicalExpression): 2 usages
SchemaPath (org.apache.drill.common.expression.SchemaPath): 2 usages
CompleteFileWork (org.apache.drill.exec.store.schedule.CompleteFileWork): 2 usages
Path (org.apache.hadoop.fs.Path): 2 usages
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 2 usages
Stopwatch (com.google.common.base.Stopwatch): 1 usage
ErrorCollector (org.apache.drill.common.expression.ErrorCollector): 1 usage
ErrorCollectorImpl (org.apache.drill.common.expression.ErrorCollectorImpl): 1 usage