Example 21 with ScanBatch

use of org.apache.drill.exec.physical.impl.ScanBatch in project drill by apache.

In class JdbcBatchCreator, method getBatch:

@Override
public ScanBatch getBatch(FragmentContext context, JdbcSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
    Preconditions.checkArgument(children.isEmpty());
    JdbcStoragePlugin plugin = config.getPlugin();
    RecordReader reader = new JdbcRecordReader(context, plugin.getSource(), config.getSql(), plugin.getName());
    return new ScanBatch(config, context, Collections.singletonList(reader).iterator());
}
Also used : RecordReader(org.apache.drill.exec.store.RecordReader) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch)
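
As a point of reference, the shape of this creator generalizes to any storage plugin: build one RecordReader per unit of work and hand the readers to a ScanBatch. Below is a minimal sketch of that pattern for a hypothetical plugin; MySubScan and MyRecordReader are placeholders rather than Drill classes, and only the BatchCreator/ScanBatch wiring is taken from the example above.

import java.util.Collections;
import java.util.List;

import com.google.common.base.Preconditions;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.physical.impl.BatchCreator;
import org.apache.drill.exec.physical.impl.ScanBatch;
import org.apache.drill.exec.record.RecordBatch;
import org.apache.drill.exec.store.RecordReader;

// Hypothetical creator: MySubScan is assumed to implement Drill's SubScan,
// and MyRecordReader to implement RecordReader.
public class MyBatchCreator implements BatchCreator<MySubScan> {

    @Override
    public ScanBatch getBatch(FragmentContext context, MySubScan config, List<RecordBatch> children) throws ExecutionSetupException {
        // A leaf scan has no incoming batches.
        Preconditions.checkArgument(children.isEmpty());
        // One reader per unit of work; a single reader here for brevity.
        RecordReader reader = new MyRecordReader(context, config);
        // Same iterator-based ScanBatch constructor as in the JDBC example above.
        return new ScanBatch(config, context, Collections.singletonList(reader).iterator());
    }
}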

Example 22 with ScanBatch

use of org.apache.drill.exec.physical.impl.ScanBatch in project drill by apache.

In class ParquetScanBatchCreator, method getBatch:

@Override
public ScanBatch getBatch(FragmentContext context, ParquetRowGroupScan rowGroupScan, List<RecordBatch> children) throws ExecutionSetupException {
    Preconditions.checkArgument(children.isEmpty());
    OperatorContext oContext = context.newOperatorContext(rowGroupScan);
    final ImplicitColumnExplorer columnExplorer = new ImplicitColumnExplorer(context, rowGroupScan.getColumns());
    if (!columnExplorer.isStarQuery()) {
        rowGroupScan = new ParquetRowGroupScan(rowGroupScan.getUserName(), rowGroupScan.getStorageEngine(), rowGroupScan.getRowGroupReadEntries(), columnExplorer.getTableColumns(), rowGroupScan.getSelectionRoot(), rowGroupScan.getFilter());
        rowGroupScan.setOperatorId(rowGroupScan.getOperatorId());
    }
    DrillFileSystem fs;
    try {
        boolean useAsyncPageReader = context.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_ASYNC).bool_val;
        if (useAsyncPageReader) {
            fs = oContext.newNonTrackingFileSystem(rowGroupScan.getStorageEngine().getFsConf());
        } else {
            fs = oContext.newFileSystem(rowGroupScan.getStorageEngine().getFsConf());
        }
    } catch (IOException e) {
        throw new ExecutionSetupException(String.format("Failed to create DrillFileSystem: %s", e.getMessage()), e);
    }
    Configuration conf = new Configuration(fs.getConf());
    conf.setBoolean(ENABLE_BYTES_READ_COUNTER, false);
    conf.setBoolean(ENABLE_BYTES_TOTAL_COUNTER, false);
    conf.setBoolean(ENABLE_TIME_READ_COUNTER, false);
    // keep footers in a map to avoid re-reading them
    Map<String, ParquetMetadata> footers = Maps.newHashMap();
    List<RecordReader> readers = Lists.newArrayList();
    List<Map<String, String>> implicitColumns = Lists.newArrayList();
    Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
    for (RowGroupReadEntry e : rowGroupScan.getRowGroupReadEntries()) {
        /*
          Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file.
          TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine),
          we should add more information to the RowGroupInfo that will be populated upon the first read to
          provide the reader with all of the file meta-data it needs.
          These fields will be added to the constructor below.
        */
        try {
            Stopwatch timer = Stopwatch.createUnstarted();
            if (!footers.containsKey(e.getPath())) {
                timer.start();
                ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(e.getPath()));
                long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
                logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", e.getPath(), "", 0, 0, 0, timeToRead);
                footers.put(e.getPath(), footer);
            }
            boolean autoCorrectCorruptDates = rowGroupScan.formatConfig.autoCorrectCorruptDates;
            ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footers.get(e.getPath()), rowGroupScan.getColumns(), autoCorrectCorruptDates);
            if (logger.isDebugEnabled()) {
                logger.debug(containsCorruptDates.toString());
            }
            if (!context.getOptions().getOption(ExecConstants.PARQUET_NEW_RECORD_READER).bool_val && !isComplex(footers.get(e.getPath()))) {
                readers.add(new ParquetRecordReader(context, e.getPath(), e.getRowGroupIndex(), e.getNumRecordsToRead(), fs, CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0), footers.get(e.getPath()), rowGroupScan.getColumns(), containsCorruptDates));
            } else {
                ParquetMetadata footer = footers.get(e.getPath());
                readers.add(new DrillParquetReader(context, footer, e, columnExplorer.getTableColumns(), fs, containsCorruptDates));
            }
            Map<String, String> implicitValues = columnExplorer.populateImplicitColumns(e, rowGroupScan.getSelectionRoot());
            implicitColumns.add(implicitValues);
            if (implicitValues.size() > mapWithMaxColumns.size()) {
                mapWithMaxColumns = implicitValues;
            }
        } catch (IOException e1) {
            throw new ExecutionSetupException(e1);
        }
    }
    // all readers should have the same number of implicit columns, add missing ones with value null
    Map<String, String> diff = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
        map.putAll(Maps.difference(map, diff).entriesOnlyOnRight());
    }
    return new ScanBatch(rowGroupScan, context, oContext, readers.iterator(), implicitColumns);
}
Also used : ImplicitColumnExplorer(org.apache.drill.exec.store.ImplicitColumnExplorer) ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) RecordReader(org.apache.drill.exec.store.RecordReader) Stopwatch(com.google.common.base.Stopwatch) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) OperatorContext(org.apache.drill.exec.ops.OperatorContext) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) Path(org.apache.hadoop.fs.Path) DrillParquetReader(org.apache.drill.exec.store.parquet2.DrillParquetReader) IOException(java.io.IOException) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) Map(java.util.Map)
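
The last loop before the ScanBatch is built pads every reader's implicit-column map so that all readers expose the same keys, with missing entries set to null. A standalone sketch of that Guava idiom follows; the file and partition values are made up, and only the transformValues/difference trick is taken from the code above.

import java.util.List;
import java.util.Map;

import com.google.common.base.Functions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

public class ImplicitColumnPadding {

    public static void main(String[] args) {
        // Two readers: the first saw a partition directory column, the second did not.
        Map<String, String> first = Maps.newLinkedHashMap();
        first.put("filename", "a.parquet");
        first.put("dir0", "2017");
        Map<String, String> second = Maps.newLinkedHashMap();
        second.put("filename", "b.parquet");

        List<Map<String, String>> implicitColumns = Lists.newArrayList(first, second);
        // The widest map seen so far plays the role of mapWithMaxColumns above.
        Map<String, String> mapWithMaxColumns = first;

        // Template with every key mapped to null; copy only the keys each map is missing.
        Map<String, String> diff = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
        for (Map<String, String> map : implicitColumns) {
            map.putAll(Maps.difference(map, diff).entriesOnlyOnRight());
        }

        // Prints [{filename=a.parquet, dir0=2017}, {filename=b.parquet, dir0=null}]
        System.out.println(implicitColumns);
    }
}

Padding with nulls keeps the set of implicit columns identical across readers, so the scan does not present a different implicit-column schema when it moves from one reader to the next.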

Example 23 with ScanBatch

use of org.apache.drill.exec.physical.impl.ScanBatch in project drill by apache.

In class SystemTableBatchCreator, method getBatch:

//  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(SystemTableBatchCreator.class);
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public ScanBatch getBatch(final FragmentContext context, final SystemTableScan scan, final List<RecordBatch> children) throws ExecutionSetupException {
    final SystemTable table = scan.getTable();
    final Iterator<Object> iterator = table.getIterator(context);
    final RecordReader reader = new PojoRecordReader(table.getPojoClass(), iterator);
    return new ScanBatch(scan, context, Collections.singleton(reader).iterator());
}
Also used : RecordReader(org.apache.drill.exec.store.RecordReader) PojoRecordReader(org.apache.drill.exec.store.pojo.PojoRecordReader) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) PojoRecordReader(org.apache.drill.exec.store.pojo.PojoRecordReader)
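
PojoRecordReader turns an iterator of plain Java objects into a record stream, which is why a system table only has to expose a POJO class and an iterator. A minimal sketch of the same wiring with a hypothetical row type follows; MyRow, its fields, and the pojoScan helper are illustrative, and the assumption that public fields map to columns is mine, not stated in the example.

import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;

import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.physical.base.SubScan;
import org.apache.drill.exec.physical.impl.ScanBatch;
import org.apache.drill.exec.store.RecordReader;
import org.apache.drill.exec.store.pojo.PojoRecordReader;

public class PojoScanSketch {

    // Hypothetical row type; its public fields are assumed to become the scan's columns.
    public static class MyRow {
        public String name;
        public long value;

        public MyRow(String name, long value) {
            this.name = name;
            this.value = value;
        }
    }

    // Builds a ScanBatch over two in-memory rows, given an existing context and sub-scan.
    public static ScanBatch pojoScan(FragmentContext context, SubScan scan) throws ExecutionSetupException {
        Iterator<MyRow> rows = Arrays.asList(new MyRow("a", 1L), new MyRow("b", 2L)).iterator();
        RecordReader reader = new PojoRecordReader<>(MyRow.class, rows);
        // Same iterator-based ScanBatch constructor as in the system-table example above.
        return new ScanBatch(scan, context, Collections.singleton(reader).iterator());
    }
}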

Example 24 with ScanBatch

use of org.apache.drill.exec.physical.impl.ScanBatch in project drill by apache.

In class HiveScanBatchCreator, method getBatch:

@Override
public ScanBatch getBatch(FragmentContext context, HiveSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
    List<RecordReader> readers = Lists.newArrayList();
    HiveTableWithColumnCache table = config.getTable();
    List<InputSplit> splits = config.getInputSplits();
    List<HivePartition> partitions = config.getPartitions();
    boolean hasPartitions = (partitions != null && partitions.size() > 0);
    int i = 0;
    final UserGroupInformation proxyUgi = ImpersonationUtil.createProxyUgi(config.getUserName(), context.getQueryUserName());
    final HiveConf hiveConf = config.getHiveConf();
    final String formatName = table.getSd().getInputFormat();
    Class<? extends HiveAbstractReader> readerClass = HiveDefaultReader.class;
    if (readerMap.containsKey(formatName)) {
        readerClass = readerMap.get(formatName);
    }
    Constructor<? extends HiveAbstractReader> readerConstructor = null;
    try {
        readerConstructor = readerClass.getConstructor(HiveTableWithColumnCache.class, HivePartition.class, InputSplit.class, List.class, FragmentContext.class, HiveConf.class, UserGroupInformation.class);
        for (InputSplit split : splits) {
            readers.add(readerConstructor.newInstance(table, (hasPartitions ? partitions.get(i++) : null), split, config.getColumns(), context, hiveConf, proxyUgi));
        }
        if (readers.size() == 0) {
            readers.add(readerConstructor.newInstance(table, null, null, config.getColumns(), context, hiveConf, proxyUgi));
        }
    } catch (Exception e) {
        logger.error("No constructor for {}, thrown {}", readerClass.getName(), e);
    }
    return new ScanBatch(config, context, readers.iterator());
}
Also used : FragmentContext(org.apache.drill.exec.ops.FragmentContext) RecordReader(org.apache.drill.exec.store.RecordReader) ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) HiveConf(org.apache.hadoop.hive.conf.HiveConf) List(java.util.List) InputSplit(org.apache.hadoop.mapred.InputSplit) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation)
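
The Hive creator picks a reader class by input-format name and then instantiates it reflectively, resolving the constructor once and reusing it for every split. A self-contained sketch of that dispatch pattern is below; the Reader interface, the two reader classes, and the map key are all illustrative, not Drill or Hive types.

import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ReflectiveReaderFactory {

    interface Reader {
        String describe();
    }

    static class DefaultReader implements Reader {
        private final String split;
        public DefaultReader(String split) { this.split = split; }
        public String describe() { return "default:" + split; }
    }

    static class TextReader implements Reader {
        private final String split;
        public TextReader(String split) { this.split = split; }
        public String describe() { return "text:" + split; }
    }

    // Format name -> reader class; anything not listed falls back to DefaultReader.
    private static final Map<String, Class<? extends Reader>> READER_MAP = new HashMap<>();
    static {
        READER_MAP.put("org.apache.hadoop.mapred.TextInputFormat", TextReader.class);
    }

    public static List<Reader> createReaders(String formatName, List<String> splits) throws Exception {
        Class<? extends Reader> readerClass = READER_MAP.getOrDefault(formatName, DefaultReader.class);
        // Resolve the shared constructor once, then instantiate one reader per split.
        Constructor<? extends Reader> ctor = readerClass.getConstructor(String.class);
        List<Reader> readers = new ArrayList<>();
        for (String split : splits) {
            readers.add(ctor.newInstance(split));
        }
        return readers;
    }
}

Looking up the Constructor outside the loop mirrors the example above and avoids a per-split getConstructor call.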

Example 25 with ScanBatch

use of org.apache.drill.exec.physical.impl.ScanBatch in project drill by axbaretto.

In class JdbcBatchCreator, method getBatch:

@Override
public ScanBatch getBatch(ExecutorFragmentContext context, JdbcSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
    Preconditions.checkArgument(children.isEmpty());
    JdbcStoragePlugin plugin = config.getPlugin();
    RecordReader reader = new JdbcRecordReader(plugin.getSource(), config.getSql(), plugin.getName());
    return new ScanBatch(config, context, Collections.singletonList(reader));
}
Also used : RecordReader(org.apache.drill.exec.store.RecordReader) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch)

Aggregations

ScanBatch (org.apache.drill.exec.physical.impl.ScanBatch): 40
RecordReader (org.apache.drill.exec.store.RecordReader): 31
ExecutionSetupException (org.apache.drill.common.exceptions.ExecutionSetupException): 26
LinkedList (java.util.LinkedList): 16
SchemaPath (org.apache.drill.common.expression.SchemaPath): 15
IOException (java.io.IOException): 8
Map (java.util.Map): 8
OperatorContext (org.apache.drill.exec.ops.OperatorContext): 7
RecordBatch (org.apache.drill.exec.record.RecordBatch): 7
RecordBatchSizer (org.apache.drill.exec.record.RecordBatchSizer): 6
VectorAccessible (org.apache.drill.exec.record.VectorAccessible): 6
DrillFileSystem (org.apache.drill.exec.store.dfs.DrillFileSystem): 6
ValueVector (org.apache.drill.exec.vector.ValueVector): 5
Path (org.apache.hadoop.fs.Path): 5
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 5
ColumnExplorer (org.apache.drill.exec.store.ColumnExplorer): 4
UInt4Vector (org.apache.drill.exec.vector.UInt4Vector): 4
RepeatedListVector (org.apache.drill.exec.vector.complex.RepeatedListVector): 4
RepeatedValueVector (org.apache.drill.exec.vector.complex.RepeatedValueVector): 4
Test (org.junit.Test): 4