Search in sources :

Example 1 with OperatorContext

use of org.apache.drill.exec.ops.OperatorContext in project drill by apache.

the class HiveDrillNativeScanBatchCreator method getBatch.

@Override
public ScanBatch getBatch(FragmentContext context, HiveDrillNativeParquetSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
    final HiveTableWithColumnCache table = config.getTable();
    final List<InputSplit> splits = config.getInputSplits();
    final List<HivePartition> partitions = config.getPartitions();
    final List<SchemaPath> columns = config.getColumns();
    final String partitionDesignator = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val;
    List<Map<String, String>> implicitColumns = Lists.newLinkedList();
    boolean selectAllQuery = AbstractRecordReader.isStarQuery(columns);
    final boolean hasPartitions = (partitions != null && partitions.size() > 0);
    final List<String[]> partitionColumns = Lists.newArrayList();
    final List<Integer> selectedPartitionColumns = Lists.newArrayList();
    List<SchemaPath> newColumns = columns;
    if (!selectAllQuery) {
        // Separate out the partition and non-partition columns. Non-partition columns are passed directly to the
        // ParquetRecordReader. Partition columns are passed to ScanBatch.
        newColumns = Lists.newArrayList();
        Pattern pattern = Pattern.compile(String.format("%s[0-9]+", partitionDesignator));
        for (SchemaPath column : columns) {
            Matcher m = pattern.matcher(column.getAsUnescapedPath());
            if (m.matches()) {
                selectedPartitionColumns.add(Integer.parseInt(column.getAsUnescapedPath().substring(partitionDesignator.length())));
            } else {
                newColumns.add(column);
            }
        }
    }
    final OperatorContext oContext = context.newOperatorContext(config);
    int currentPartitionIndex = 0;
    final List<RecordReader> readers = Lists.newArrayList();
    final HiveConf conf = config.getHiveConf();
    // TODO: In future we can get this cache from Metadata cached on filesystem.
    final Map<String, ParquetMetadata> footerCache = Maps.newHashMap();
    Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
    try {
        for (InputSplit split : splits) {
            final FileSplit fileSplit = (FileSplit) split;
            final Path finalPath = fileSplit.getPath();
            final JobConf cloneJob = new ProjectionPusher().pushProjectionsAndFilters(new JobConf(conf), finalPath.getParent());
            final FileSystem fs = finalPath.getFileSystem(cloneJob);
            ParquetMetadata parquetMetadata = footerCache.get(finalPath.toString());
            if (parquetMetadata == null) {
                parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath);
                footerCache.put(finalPath.toString(), parquetMetadata);
            }
            final List<Integer> rowGroupNums = getRowGroupNumbersFromFileSplit(fileSplit, parquetMetadata);
            for (int rowGroupNum : rowGroupNums) {
                //DRILL-5009 : Skip the row group if the row count is zero
                if (parquetMetadata.getBlocks().get(rowGroupNum).getRowCount() == 0) {
                    continue;
                }
                // Drill has only ever written a single row group per file, only detect corruption
                // in the first row group
                ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(parquetMetadata, config.getColumns(), true);
                if (logger.isDebugEnabled()) {
                    logger.debug(containsCorruptDates.toString());
                }
                readers.add(new ParquetRecordReader(context, Path.getPathWithoutSchemeAndAuthority(finalPath).toString(), rowGroupNum, fs, CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0), parquetMetadata, newColumns, containsCorruptDates));
                Map<String, String> implicitValues = Maps.newLinkedHashMap();
                if (hasPartitions) {
                    List<String> values = partitions.get(currentPartitionIndex).getValues();
                    for (int i = 0; i < values.size(); i++) {
                        if (selectAllQuery || selectedPartitionColumns.contains(i)) {
                            implicitValues.put(partitionDesignator + i, values.get(i));
                        }
                    }
                }
                implicitColumns.add(implicitValues);
                if (implicitValues.size() > mapWithMaxColumns.size()) {
                    mapWithMaxColumns = implicitValues;
                }
            }
            currentPartitionIndex++;
        }
    } catch (final IOException | RuntimeException e) {
        AutoCloseables.close(e, readers);
        throw new ExecutionSetupException("Failed to create RecordReaders. " + e.getMessage(), e);
    }
    // all readers should have the same number of implicit columns, add missing ones with value null
    mapWithMaxColumns = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
        map.putAll(Maps.difference(map, mapWithMaxColumns).entriesOnlyOnRight());
    }
    // create an empty RecordReader to output the schema
    if (readers.size() == 0) {
        readers.add(new HiveDefaultReader(table, null, null, columns, context, conf, ImpersonationUtil.createProxyUgi(config.getUserName(), context.getQueryUserName())));
    }
    return new ScanBatch(config, context, oContext, readers.iterator(), implicitColumns);
}
Also used : ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) Matcher(java.util.regex.Matcher) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ProjectionPusher(org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) RecordReader(org.apache.drill.exec.store.RecordReader) AbstractRecordReader(org.apache.drill.exec.store.AbstractRecordReader) FileSplit(org.apache.hadoop.mapred.FileSplit) SchemaPath(org.apache.drill.common.expression.SchemaPath) OperatorContext(org.apache.drill.exec.ops.OperatorContext) FileSystem(org.apache.hadoop.fs.FileSystem) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) HiveConf(org.apache.hadoop.hive.conf.HiveConf) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) Pattern(java.util.regex.Pattern) ParquetDirectByteBufferAllocator(org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator) IOException(java.io.IOException) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) Map(java.util.Map)

Example 2 with OperatorContext

use of org.apache.drill.exec.ops.OperatorContext in project drill by axbaretto.

the class HiveDrillNativeScanBatchCreator method getBatch.

@Override
public ScanBatch getBatch(ExecutorFragmentContext context, HiveDrillNativeParquetSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
    final HiveTableWithColumnCache table = config.getTable();
    final List<List<InputSplit>> splits = config.getInputSplits();
    final List<HivePartition> partitions = config.getPartitions();
    final List<SchemaPath> columns = config.getColumns();
    final String partitionDesignator = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val;
    List<Map<String, String>> implicitColumns = Lists.newLinkedList();
    boolean selectAllQuery = Utilities.isStarQuery(columns);
    final boolean hasPartitions = (partitions != null && partitions.size() > 0);
    final List<String[]> partitionColumns = Lists.newArrayList();
    final List<Integer> selectedPartitionColumns = Lists.newArrayList();
    List<SchemaPath> tableColumns = columns;
    if (!selectAllQuery) {
        // Separate out the partition and non-partition columns. Non-partition columns are passed directly to the
        // ParquetRecordReader. Partition columns are passed to ScanBatch.
        tableColumns = Lists.newArrayList();
        Pattern pattern = Pattern.compile(String.format("%s[0-9]+", partitionDesignator));
        for (SchemaPath column : columns) {
            Matcher m = pattern.matcher(column.getRootSegmentPath());
            if (m.matches()) {
                selectedPartitionColumns.add(Integer.parseInt(column.getRootSegmentPath().substring(partitionDesignator.length())));
            } else {
                tableColumns.add(column);
            }
        }
    }
    final OperatorContext oContext = context.newOperatorContext(config);
    int currentPartitionIndex = 0;
    final List<RecordReader> readers = new LinkedList<>();
    final HiveConf conf = config.getHiveConf();
    // TODO: In future we can get this cache from Metadata cached on filesystem.
    final Map<String, ParquetMetadata> footerCache = Maps.newHashMap();
    Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
    try {
        for (List<InputSplit> splitGroups : splits) {
            for (InputSplit split : splitGroups) {
                final FileSplit fileSplit = (FileSplit) split;
                final Path finalPath = fileSplit.getPath();
                final JobConf cloneJob = new ProjectionPusher().pushProjectionsAndFilters(new JobConf(conf), finalPath.getParent());
                final FileSystem fs = finalPath.getFileSystem(cloneJob);
                ParquetMetadata parquetMetadata = footerCache.get(finalPath.toString());
                if (parquetMetadata == null) {
                    parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath);
                    footerCache.put(finalPath.toString(), parquetMetadata);
                }
                final List<Integer> rowGroupNums = getRowGroupNumbersFromFileSplit(fileSplit, parquetMetadata);
                for (int rowGroupNum : rowGroupNums) {
                    // DRILL-5009 : Skip the row group if the row count is zero
                    if (parquetMetadata.getBlocks().get(rowGroupNum).getRowCount() == 0) {
                        continue;
                    }
                    // Drill has only ever written a single row group per file, only detect corruption
                    // in the first row group
                    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(parquetMetadata, config.getColumns(), true);
                    if (logger.isDebugEnabled()) {
                        logger.debug(containsCorruptDates.toString());
                    }
                    readers.add(new ParquetRecordReader(context, Path.getPathWithoutSchemeAndAuthority(finalPath).toString(), rowGroupNum, fs, CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0), parquetMetadata, tableColumns, containsCorruptDates));
                    Map<String, String> implicitValues = Maps.newLinkedHashMap();
                    if (hasPartitions) {
                        List<String> values = partitions.get(currentPartitionIndex).getValues();
                        for (int i = 0; i < values.size(); i++) {
                            if (selectAllQuery || selectedPartitionColumns.contains(i)) {
                                implicitValues.put(partitionDesignator + i, values.get(i));
                            }
                        }
                    }
                    implicitColumns.add(implicitValues);
                    if (implicitValues.size() > mapWithMaxColumns.size()) {
                        mapWithMaxColumns = implicitValues;
                    }
                }
                currentPartitionIndex++;
            }
        }
    } catch (final IOException | RuntimeException e) {
        AutoCloseables.close(e, readers);
        throw new ExecutionSetupException("Failed to create RecordReaders. " + e.getMessage(), e);
    }
    // all readers should have the same number of implicit columns, add missing ones with value null
    mapWithMaxColumns = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
        map.putAll(Maps.difference(map, mapWithMaxColumns).entriesOnlyOnRight());
    }
    // create an empty RecordReader to output the schema
    if (readers.size() == 0) {
        readers.add(new HiveDefaultReader(table, null, null, tableColumns, context, conf, ImpersonationUtil.createProxyUgi(config.getUserName(), context.getQueryUserName())));
    }
    return new ScanBatch(context, oContext, readers, implicitColumns);
}
Also used : ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) Matcher(java.util.regex.Matcher) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ProjectionPusher(org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) RecordReader(org.apache.drill.exec.store.RecordReader) FileSplit(org.apache.hadoop.mapred.FileSplit) SchemaPath(org.apache.drill.common.expression.SchemaPath) OperatorContext(org.apache.drill.exec.ops.OperatorContext) FileSystem(org.apache.hadoop.fs.FileSystem) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) LinkedList(java.util.LinkedList) List(java.util.List) HiveConf(org.apache.hadoop.hive.conf.HiveConf) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) Pattern(java.util.regex.Pattern) ParquetDirectByteBufferAllocator(org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator) HiveDefaultReader(org.apache.drill.exec.store.hive.readers.HiveDefaultReader) IOException(java.io.IOException) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) LinkedList(java.util.LinkedList) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) Map(java.util.Map)

Example 3 with OperatorContext

use of org.apache.drill.exec.ops.OperatorContext in project drill by axbaretto.

the class ParquetScanBatchCreator method getBatch.

@SuppressWarnings("resource")
@Override
public ScanBatch getBatch(ExecutorFragmentContext context, ParquetRowGroupScan rowGroupScan, List<RecordBatch> children) throws ExecutionSetupException {
    Preconditions.checkArgument(children.isEmpty());
    OperatorContext oContext = context.newOperatorContext(rowGroupScan);
    final ColumnExplorer columnExplorer = new ColumnExplorer(context.getOptions(), rowGroupScan.getColumns());
    if (!columnExplorer.isStarQuery()) {
        rowGroupScan = new ParquetRowGroupScan(rowGroupScan.getUserName(), rowGroupScan.getStorageEngine(), rowGroupScan.getRowGroupReadEntries(), columnExplorer.getTableColumns(), rowGroupScan.getSelectionRoot(), rowGroupScan.getFilter());
        rowGroupScan.setOperatorId(rowGroupScan.getOperatorId());
    }
    DrillFileSystem fs;
    try {
        boolean useAsyncPageReader = context.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_ASYNC).bool_val;
        if (useAsyncPageReader) {
            fs = oContext.newNonTrackingFileSystem(rowGroupScan.getStorageEngine().getFsConf());
        } else {
            fs = oContext.newFileSystem(rowGroupScan.getStorageEngine().getFsConf());
        }
    } catch (IOException e) {
        throw new ExecutionSetupException(String.format("Failed to create DrillFileSystem: %s", e.getMessage()), e);
    }
    Configuration conf = new Configuration(fs.getConf());
    conf.setBoolean(ENABLE_BYTES_READ_COUNTER, false);
    conf.setBoolean(ENABLE_BYTES_TOTAL_COUNTER, false);
    conf.setBoolean(ENABLE_TIME_READ_COUNTER, false);
    // keep footers in a map to avoid re-reading them
    Map<String, ParquetMetadata> footers = Maps.newHashMap();
    List<RecordReader> readers = new LinkedList<>();
    List<Map<String, String>> implicitColumns = Lists.newArrayList();
    Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
    for (RowGroupReadEntry e : rowGroupScan.getRowGroupReadEntries()) {
        /*
      Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file
      TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine)
      we should add more information to the RowGroupInfo that will be populated upon the first read to
      provide the reader with all of th file meta-data it needs
      These fields will be added to the constructor below
      */
        try {
            Stopwatch timer = Stopwatch.createUnstarted();
            if (!footers.containsKey(e.getPath())) {
                timer.start();
                ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(e.getPath()));
                long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
                logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", e.getPath(), "", 0, 0, 0, timeToRead);
                footers.put(e.getPath(), footer);
            }
            boolean autoCorrectCorruptDates = rowGroupScan.getFormatConfig().areCorruptDatesAutoCorrected();
            ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footers.get(e.getPath()), rowGroupScan.getColumns(), autoCorrectCorruptDates);
            if (logger.isDebugEnabled()) {
                logger.debug(containsCorruptDates.toString());
            }
            if (!context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER) && !isComplex(footers.get(e.getPath()))) {
                readers.add(new ParquetRecordReader(context, e.getPath(), e.getRowGroupIndex(), e.getNumRecordsToRead(), fs, CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0), footers.get(e.getPath()), rowGroupScan.getColumns(), containsCorruptDates));
            } else {
                ParquetMetadata footer = footers.get(e.getPath());
                readers.add(new DrillParquetReader(context, footer, e, columnExplorer.getTableColumns(), fs, containsCorruptDates));
            }
            Map<String, String> implicitValues = columnExplorer.populateImplicitColumns(e, rowGroupScan.getSelectionRoot());
            implicitColumns.add(implicitValues);
            if (implicitValues.size() > mapWithMaxColumns.size()) {
                mapWithMaxColumns = implicitValues;
            }
        } catch (IOException e1) {
            throw new ExecutionSetupException(e1);
        }
    }
    // all readers should have the same number of implicit columns, add missing ones with value null
    Map<String, String> diff = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
        map.putAll(Maps.difference(map, diff).entriesOnlyOnRight());
    }
    return new ScanBatch(context, oContext, readers, implicitColumns);
}
Also used : ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) RecordReader(org.apache.drill.exec.store.RecordReader) Stopwatch(com.google.common.base.Stopwatch) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) OperatorContext(org.apache.drill.exec.ops.OperatorContext) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) Path(org.apache.hadoop.fs.Path) DrillParquetReader(org.apache.drill.exec.store.parquet2.DrillParquetReader) IOException(java.io.IOException) LinkedList(java.util.LinkedList) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) ColumnExplorer(org.apache.drill.exec.store.ColumnExplorer) Map(java.util.Map)

Example 4 with OperatorContext

use of org.apache.drill.exec.ops.OperatorContext in project drill by axbaretto.

the class TestCopier method testEmptyInput.

@Test
public void testEmptyInput() throws Exception {
    BatchSchema schema = SortTestUtilities.nonNullSchema();
    List<BatchGroup> batches = new ArrayList<>();
    Sort popConfig = SortTestUtilities.makeCopierConfig(Ordering.ORDER_ASC, Ordering.NULLS_UNSPECIFIED);
    OperatorContext opContext = fixture.newOperatorContext(popConfig);
    PriorityQueueCopierWrapper copier = new PriorityQueueCopierWrapper(opContext);
    VectorContainer dest = new VectorContainer();
    try {
        // TODO: Create a vector allocator to pass as last parameter so
        // that the test uses the same vector allocator as the production
        // code. Only nuisance is that we don't have the required metadata
        // readily at hand here...
        @SuppressWarnings({ "resource", "unused" }) BatchMerger merger = copier.startMerge(schema, batches, dest, 10, null);
        fail();
    } catch (AssertionError e) {
    // Expected
    } finally {
        opContext.close();
    }
}
Also used : BatchSchema(org.apache.drill.exec.record.BatchSchema) BatchMerger(org.apache.drill.exec.physical.impl.xsort.managed.PriorityQueueCopierWrapper.BatchMerger) OperatorContext(org.apache.drill.exec.ops.OperatorContext) ArrayList(java.util.ArrayList) Sort(org.apache.drill.exec.physical.config.Sort) VectorContainer(org.apache.drill.exec.record.VectorContainer) OperatorTest(org.apache.drill.categories.OperatorTest) SubOperatorTest(org.apache.drill.test.SubOperatorTest) Test(org.junit.Test)

Example 5 with OperatorContext

use of org.apache.drill.exec.ops.OperatorContext in project drill by axbaretto.

the class TestSorter method runSorterTest.

public void runSorterTest(Sort popConfig, SingleRowSet rowSet, SingleRowSet expected) throws Exception {
    OperatorContext opContext = fixture.newOperatorContext(popConfig);
    SorterWrapper sorter = new SorterWrapper(opContext);
    try {
        sorter.sortBatch(rowSet.container(), rowSet.getSv2());
        new RowSetComparison(expected).verifyAndClearAll(rowSet);
        sorter.close();
    } finally {
        opContext.close();
    }
}
Also used : RowSetComparison(org.apache.drill.test.rowSet.RowSetComparison) OperatorContext(org.apache.drill.exec.ops.OperatorContext)

Aggregations

OperatorContext (org.apache.drill.exec.ops.OperatorContext)23 IOException (java.io.IOException)9 Map (java.util.Map)8 ExecutionSetupException (org.apache.drill.common.exceptions.ExecutionSetupException)8 Test (org.junit.Test)8 ScanBatch (org.apache.drill.exec.physical.impl.ScanBatch)7 RecordReader (org.apache.drill.exec.store.RecordReader)7 DrillFileSystem (org.apache.drill.exec.store.dfs.DrillFileSystem)6 LinkedList (java.util.LinkedList)5 OperatorTest (org.apache.drill.categories.OperatorTest)4 PhysicalOperator (org.apache.drill.exec.physical.base.PhysicalOperator)4 RowSet (org.apache.drill.exec.physical.rowSet.RowSet)4 VectorContainer (org.apache.drill.exec.record.VectorContainer)4 ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata)4 DrillBuf (io.netty.buffer.DrillBuf)3 ArrayList (java.util.ArrayList)3 SchemaPath (org.apache.drill.common.expression.SchemaPath)3 OutOfMemoryException (org.apache.drill.exec.exception.OutOfMemoryException)3 Sort (org.apache.drill.exec.physical.config.Sort)3 MockRecordBatch (org.apache.drill.exec.physical.impl.MockRecordBatch)3