Examples with ScanBatch - org.apache.drill.exec.physical.impl.ScanBatch

Example 1 with ScanBatch

use of org.apache.drill.exec.physical.impl.ScanBatch in project drill by apache.

From the class HiveDrillNativeScanBatchCreator, method getBatch.

/**
 * Builds the {@link ScanBatch} for a Hive table scanned through the native Parquet reader.
 *
 * <p>One {@code ParquetRecordReader} is created per non-empty row group of every input split, and
 * a parallel list of implicit (partition) column values is built with one map per reader. If no
 * reader could be created, a single {@code HiveDefaultReader} is added so the schema can still be
 * output.
 *
 * @param context  fragment context; supplies the options, the operator context and the query user name
 * @param config   sub-scan providing the table, input splits, partitions, projected columns and Hive conf
 * @param children child batches; not referenced by this method
 * @return a scan batch iterating over the created readers together with their implicit column maps
 * @throws ExecutionSetupException if an {@code IOException} or {@code RuntimeException} occurs while
 *         the readers are being created (already-created readers are closed first)
 */
@Override
public ScanBatch getBatch(FragmentContext context, HiveDrillNativeParquetSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
    final HiveTableWithColumnCache table = config.getTable();
    final List<InputSplit> splits = config.getInputSplits();
    final List<HivePartition> partitions = config.getPartitions();
    final List<SchemaPath> columns = config.getColumns();
    // Prefix of the implicit partition column names (prefix + index).
    final String partitionDesignator = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val;
    // One entry per reader, in the same order as `readers`.
    List<Map<String, String>> implicitColumns = Lists.newLinkedList();
    boolean selectAllQuery = AbstractRecordReader.isStarQuery(columns);
    final boolean hasPartitions = (partitions != null && partitions.size() > 0);
    // NOTE(review): partitionColumns is never read or written below — looks unused; confirm before removing.
    final List<String[]> partitionColumns = Lists.newArrayList();
    // Indices of the partition columns explicitly requested by a non-star query.
    final List<Integer> selectedPartitionColumns = Lists.newArrayList();
    List<SchemaPath> newColumns = columns;
    if (!selectAllQuery) {
        // Separate out the partition and non-partition columns. Non-partition columns are passed directly to the
        // ParquetRecordReader. Partition columns are passed to ScanBatch.
        newColumns = Lists.newArrayList();
        // NOTE(review): the designator is interpolated into the regex unquoted, so any regex
        // metacharacters in the option value would be interpreted as pattern syntax — confirm
        // whether that is intended.
        Pattern pattern = Pattern.compile(String.format("%s[0-9]+", partitionDesignator));
        for (SchemaPath column : columns) {
            Matcher m = pattern.matcher(column.getAsUnescapedPath());
            if (m.matches()) {
                // The digits following the designator are the partition column index.
                selectedPartitionColumns.add(Integer.parseInt(column.getAsUnescapedPath().substring(partitionDesignator.length())));
            } else {
                newColumns.add(column);
            }
        }
    }
    final OperatorContext oContext = context.newOperatorContext(config);
    // Advanced once per split; used to index `partitions`.
    // NOTE(review): this presumes `partitions` is ordered in step with `splits` — verify against the planner.
    int currentPartitionIndex = 0;
    final List<RecordReader> readers = Lists.newArrayList();
    final HiveConf conf = config.getHiveConf();
    // TODO: In future we can get this cache from Metadata cached on filesystem.
    // Footers are cached by path string so a file referenced by several splits is read only once.
    final Map<String, ParquetMetadata> footerCache = Maps.newHashMap();
    // The implicit-values map with the most entries seen so far; used after the loop to pad the others.
    Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
    try {
        for (InputSplit split : splits) {
            final FileSplit fileSplit = (FileSplit) split;
            final Path finalPath = fileSplit.getPath();
            final JobConf cloneJob = new ProjectionPusher().pushProjectionsAndFilters(new JobConf(conf), finalPath.getParent());
            final FileSystem fs = finalPath.getFileSystem(cloneJob);
            ParquetMetadata parquetMetadata = footerCache.get(finalPath.toString());
            if (parquetMetadata == null) {
                parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath);
                footerCache.put(finalPath.toString(), parquetMetadata);
            }
            final List<Integer> rowGroupNums = getRowGroupNumbersFromFileSplit(fileSplit, parquetMetadata);
            for (int rowGroupNum : rowGroupNums) {
                //DRILL-5009 : Skip the row group if the row count is zero
                if (parquetMetadata.getBlocks().get(rowGroupNum).getRowCount() == 0) {
                    continue;
                }
                // Drill has only ever written a single row group per file, only detect corruption
                // in the first row group
                ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(parquetMetadata, config.getColumns(), true);
                if (logger.isDebugEnabled()) {
                    logger.debug(containsCorruptDates.toString());
                }
                // The reader receives only the non-partition columns (newColumns).
                readers.add(new ParquetRecordReader(context, Path.getPathWithoutSchemeAndAuthority(finalPath).toString(), rowGroupNum, fs, CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0), parquetMetadata, newColumns, containsCorruptDates));
                // Implicit column values for the reader just added: designator + index -> partition value.
                Map<String, String> implicitValues = Maps.newLinkedHashMap();
                if (hasPartitions) {
                    List<String> values = partitions.get(currentPartitionIndex).getValues();
                    for (int i = 0; i < values.size(); i++) {
                        // Star queries expose every partition value; otherwise only the requested indices.
                        if (selectAllQuery || selectedPartitionColumns.contains(i)) {
                            implicitValues.put(partitionDesignator + i, values.get(i));
                        }
                    }
                }
                implicitColumns.add(implicitValues);
                if (implicitValues.size() > mapWithMaxColumns.size()) {
                    mapWithMaxColumns = implicitValues;
                }
            }
            currentPartitionIndex++;
        }
    } catch (final IOException | RuntimeException e) {
        // Close whatever readers were created before the failure, then surface a setup error.
        AutoCloseables.close(e, readers);
        throw new ExecutionSetupException("Failed to create RecordReaders. " + e.getMessage(), e);
    }
    // all readers should have the same number of implicit columns, add missing ones with value null
    mapWithMaxColumns = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
        // Keys present only in the widest map are added to this reader's map with a null value.
        map.putAll(Maps.difference(map, mapWithMaxColumns).entriesOnlyOnRight());
    }
    // create an empty RecordReader to output the schema
    if (readers.size() == 0) {
        readers.add(new HiveDefaultReader(table, null, null, columns, context, conf, ImpersonationUtil.createProxyUgi(config.getUserName(), context.getQueryUserName())));
    }
    return new ScanBatch(config, context, oContext, readers.iterator(), implicitColumns);
}

Also used : ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) Matcher(java.util.regex.Matcher) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ProjectionPusher(org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) RecordReader(org.apache.drill.exec.store.RecordReader) AbstractRecordReader(org.apache.drill.exec.store.AbstractRecordReader) FileSplit(org.apache.hadoop.mapred.FileSplit) SchemaPath(org.apache.drill.common.expression.SchemaPath) OperatorContext(org.apache.drill.exec.ops.OperatorContext) FileSystem(org.apache.hadoop.fs.FileSystem) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) HiveConf(org.apache.hadoop.hive.conf.HiveConf) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) Pattern(java.util.regex.Pattern) ParquetDirectByteBufferAllocator(org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator) IOException(java.io.IOException) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) Map(java.util.Map)

Example 2 with ScanBatch

use of org.apache.drill.exec.physical.impl.ScanBatch in project drill by apache.

From the class MongoScanBatchCreator, method getBatch.

/**
 * Creates a {@link ScanBatch} with one {@code MongoRecordReader} per chunk scan spec of the sub-scan.
 *
 * <p>When the sub-scan reports no projected columns ({@code null}), {@code GroupScan.ALL_COLUMNS}
 * is used instead.
 *
 * @param context  fragment context handed to every reader and to the scan batch
 * @param subScan  Mongo sub-scan supplying the chunk specs, columns and storage plugin
 * @param children child batches; must be empty
 * @return a scan batch iterating over the created readers
 * @throws ExecutionSetupException wrapping any exception raised while a reader is being created
 */
@Override
public ScanBatch getBatch(FragmentContext context, MongoSubScan subScan, List<RecordBatch> children) throws ExecutionSetupException {
    Preconditions.checkArgument(children.isEmpty());
    final List<RecordReader> recordReaders = Lists.newArrayList();
    for (MongoSubScan.MongoSubScanSpec chunkSpec : subScan.getChunkScanSpecList()) {
        try {
            // Fall back to "all columns" when the sub-scan carries no explicit projection.
            List<SchemaPath> projection = subScan.getColumns();
            if (projection == null) {
                projection = GroupScan.ALL_COLUMNS;
            }
            final MongoRecordReader chunkReader =
                new MongoRecordReader(chunkSpec, projection, context, subScan.getMongoStoragePlugin());
            recordReaders.add(chunkReader);
        } catch (Exception failure) {
            // Log the failing sub-scan and the cause separately, then abort setup.
            logger.error("MongoRecordReader creation failed for subScan:  " + subScan + ".");
            logger.error(failure.getMessage(), failure);
            throw new ExecutionSetupException(failure);
        }
    }
    final int readerCount = recordReaders.size();
    logger.info("Number of record readers initialized : " + readerCount);
    return new ScanBatch(subScan, context, recordReaders.iterator());
}

Also used : ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) SchemaPath(org.apache.drill.common.expression.SchemaPath) RecordReader(org.apache.drill.exec.store.RecordReader) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException)

Example 3 with ScanBatch

use of org.apache.drill.exec.physical.impl.ScanBatch in project drill by apache.

From the class MapRDBScanBatchCreator, method getBatch.

/**
 * Creates a {@link ScanBatch} with one reader per region scan spec of the sub-scan.
 *
 * <p>When the sub-scan's table type equals {@code BinaryTableGroupScan.TABLE_BINARY} an
 * {@code HBaseRecordReader} is used; otherwise a {@code MaprDBJsonRecordReader} is used.
 *
 * @param context  fragment context handed to every reader and to the scan batch
 * @param subScan  MapR-DB sub-scan supplying the region specs, table type, columns and format plugin
 * @param children child batches; must be empty
 * @return a scan batch iterating over the created readers
 * @throws ExecutionSetupException wrapping any exception raised while a reader is being created
 */
@Override
public ScanBatch getBatch(FragmentContext context, MapRDBSubScan subScan, List<RecordBatch> children) throws ExecutionSetupException {
    Preconditions.checkArgument(children.isEmpty());
    final List<RecordReader> regionReaders = Lists.newArrayList();
    for (MapRDBSubScanSpec regionSpec : subScan.getRegionScanSpecList()) {
        try {
            final boolean binaryTable = BinaryTableGroupScan.TABLE_BINARY.equals(subScan.getTableType());
            final RecordReader regionReader;
            if (binaryTable) {
                // Binary tables are read through the HBase reader using the plugin's connection.
                regionReader = new HBaseRecordReader(subScan.getFormatPlugin().getConnection(), getHBaseSubScanSpec(regionSpec), subScan.getColumns(), context);
            } else {
                regionReader = new MaprDBJsonRecordReader(regionSpec, subScan.getFormatPluginConfig(), subScan.getColumns(), context);
            }
            regionReaders.add(regionReader);
        } catch (Exception cause) {
            throw new ExecutionSetupException(cause);
        }
    }
    return new ScanBatch(subScan, context, regionReaders.iterator());
}

Also used : ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) HBaseRecordReader(org.apache.drill.exec.store.hbase.HBaseRecordReader) MaprDBJsonRecordReader(org.apache.drill.exec.store.mapr.db.json.MaprDBJsonRecordReader) HBaseRecordReader(org.apache.drill.exec.store.hbase.HBaseRecordReader) RecordReader(org.apache.drill.exec.store.RecordReader) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) MaprDBJsonRecordReader(org.apache.drill.exec.store.mapr.db.json.MaprDBJsonRecordReader) ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException)

Example 4 with ScanBatch

use of org.apache.drill.exec.physical.impl.ScanBatch in project drill by apache.

From the class MockScanBatchCreator, method getBatch.

//private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(MockScanBatchCreator.class);
/**
 * Creates a {@link ScanBatch} with one mock reader per read entry of the sub-scan.
 *
 * <p>Entries reporting {@code isExtended()} get an {@code ExtendedMockRecordReader}; all others
 * get a plain {@code MockRecordReader}.
 *
 * @param context  fragment context handed to every reader and to the scan batch
 * @param config   mock sub-scan supplying the read entries
 * @param children child batches; must be empty
 * @return a scan batch iterating over the created readers
 * @throws ExecutionSetupException declared by the creator contract; not thrown directly here
 */
@Override
public ScanBatch getBatch(FragmentContext context, MockSubScanPOP config, List<RecordBatch> children) throws ExecutionSetupException {
    Preconditions.checkArgument(children.isEmpty());
    final List<MockScanEntry> readEntries = config.getReadEntries();
    final List<RecordReader> mockReaders = Lists.newArrayList();
    for (final MockTableDef.MockScanEntry entry : readEntries) {
        final RecordReader mockReader;
        if (!entry.isExtended()) {
            mockReader = new MockRecordReader(context, entry);
        } else {
            mockReader = new ExtendedMockRecordReader(context, entry);
        }
        mockReaders.add(mockReader);
    }
    return new ScanBatch(config, context, mockReaders.iterator());
}

Also used : MockScanEntry(org.apache.drill.exec.store.mock.MockTableDef.MockScanEntry) RecordReader(org.apache.drill.exec.store.RecordReader) MockScanEntry(org.apache.drill.exec.store.mock.MockTableDef.MockScanEntry) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch)

Example 5 with ScanBatch

use of org.apache.drill.exec.physical.impl.ScanBatch in project drill by axbaretto.

From the class HBaseScanBatchCreator, method getBatch.

/**
 * Creates a {@link ScanBatch} with one {@code HBaseRecordReader} per region scan spec of the sub-scan.
 *
 * <p>When the sub-scan reports no projected columns ({@code null}), {@code GroupScan.ALL_COLUMNS}
 * is used instead.
 *
 * @param context  executor fragment context handed to the scan batch
 * @param subScan  HBase sub-scan supplying the region specs, columns and storage engine
 * @param children child batches; must be empty
 * @return a scan batch over the created readers
 * @throws ExecutionSetupException wrapping any exception raised while a reader is being created
 */
@Override
public ScanBatch getBatch(ExecutorFragmentContext context, HBaseSubScan subScan, List<RecordBatch> children) throws ExecutionSetupException {
    Preconditions.checkArgument(children.isEmpty());
    final List<RecordReader> regionReaders = new LinkedList<>();
    for (HBaseSubScan.HBaseSubScanSpec regionSpec : subScan.getRegionScanSpecList()) {
        try {
            // Fall back to "all columns" when the sub-scan carries no explicit projection.
            List<SchemaPath> projection = subScan.getColumns();
            if (projection == null) {
                projection = GroupScan.ALL_COLUMNS;
            }
            final HBaseRecordReader regionReader =
                new HBaseRecordReader(subScan.getStorageEngine().getConnection(), regionSpec, projection);
            regionReaders.add(regionReader);
        } catch (Exception cause) {
            throw new ExecutionSetupException(cause);
        }
    }
    return new ScanBatch(subScan, context, regionReaders);
}

Also used : ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) SchemaPath(org.apache.drill.common.expression.SchemaPath) RecordReader(org.apache.drill.exec.store.RecordReader) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) LinkedList(java.util.LinkedList) ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException)

Aggregations

ScanBatch (org.apache.drill.exec.physical.impl.ScanBatch): 40; RecordReader (org.apache.drill.exec.store.RecordReader): 31; ExecutionSetupException (org.apache.drill.common.exceptions.ExecutionSetupException): 26; LinkedList (java.util.LinkedList): 16; SchemaPath (org.apache.drill.common.expression.SchemaPath): 15; IOException (java.io.IOException): 8; Map (java.util.Map): 8; OperatorContext (org.apache.drill.exec.ops.OperatorContext): 7; RecordBatch (org.apache.drill.exec.record.RecordBatch): 7; RecordBatchSizer (org.apache.drill.exec.record.RecordBatchSizer): 6; VectorAccessible (org.apache.drill.exec.record.VectorAccessible): 6; DrillFileSystem (org.apache.drill.exec.store.dfs.DrillFileSystem): 6; ValueVector (org.apache.drill.exec.vector.ValueVector): 5; Path (org.apache.hadoop.fs.Path): 5; ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 5; ColumnExplorer (org.apache.drill.exec.store.ColumnExplorer): 4; UInt4Vector (org.apache.drill.exec.vector.UInt4Vector): 4; RepeatedListVector (org.apache.drill.exec.vector.complex.RepeatedListVector): 4; RepeatedValueVector (org.apache.drill.exec.vector.complex.RepeatedValueVector): 4; Test (org.junit.Test): 4