
Example 16 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class DynamicPartitionPruner method applyFilterToPartitions.

@SuppressWarnings("rawtypes")
private void applyFilterToPartitions(Converter converter, ExprNodeEvaluator eval, String columnName, Set<Object> values) throws HiveException {
    Object[] row = new Object[1];
    Iterator<Path> it = work.getPathToPartitionInfo().keySet().iterator();
    while (it.hasNext()) {
        Path p = it.next();
        PartitionDesc desc = work.getPathToPartitionInfo().get(p);
        Map<String, String> spec = desc.getPartSpec();
        if (spec == null) {
            throw new IllegalStateException("No partition spec found in dynamic pruning");
        }
        String partValueString = spec.get(columnName);
        if (partValueString == null) {
            throw new IllegalStateException("Could not find partition value for column: " + columnName);
        }
        Object partValue = converter.convert(partValueString);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Converted partition value: " + partValue + " original (" + partValueString + ")");
        }
        row[0] = partValue;
        partValue = eval.evaluate(row);
        if (LOG.isDebugEnabled()) {
            LOG.debug("part key expr applied: " + partValue);
        }
        if (!values.contains(partValue)) {
            LOG.info("Pruning path: " + p);
            it.remove();
            // it.remove() above already removes this entry from pathToPartitionInfo,
            // so an explicit work.removePathToPartitionInfo(p) call is not needed.
            work.removePathToAlias(p);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)
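
The method prunes in place: it walks the keySet of pathToPartitionInfo and removes every entry whose evaluated partition value is not in the surviving value set. A minimal standalone sketch of this prune-in-place pattern, using a plain String-keyed map and hypothetical names instead of Hive's Path/PartitionDesc types:

import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

public class MapPruningSketch {
    public static void main(String[] args) {
        // Hypothetical stand-in for work.getPathToPartitionInfo(): path -> partition value.
        Map<String, String> pathToPartValue = new LinkedHashMap<>();
        pathToPartValue.put("/warehouse/t/ds=2024-01-01", "2024-01-01");
        pathToPartValue.put("/warehouse/t/ds=2024-01-02", "2024-01-02");
        pathToPartValue.put("/warehouse/t/ds=2024-01-03", "2024-01-03");

        // Values that survive pruning (the "values" set in the real code).
        Set<String> keep = Set.of("2024-01-02");

        Iterator<String> it = pathToPartValue.keySet().iterator();
        while (it.hasNext()) {
            String path = it.next();
            String partValue = pathToPartValue.get(path);
            if (!keep.contains(partValue)) {
                System.out.println("Pruning path: " + path);
                // Removing the key through the keySet iterator also removes the entry
                // from the backing map, mirroring it.remove() in applyFilterToPartitions.
                it.remove();
            }
        }
        System.out.println("Remaining: " + pathToPartValue.keySet());
    }
}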

Example 17 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class HiveContextAwareRecordReader method doNext.

public boolean doNext(K key, V value) throws IOException {
    if (this.isSorted) {
        if (this.getIOContext().shouldEndBinarySearch() || (!this.getIOContext().useSorted() && this.wasUsingSortedSearch)) {
            beginLinearSearch();
            this.wasUsingSortedSearch = false;
            this.getIOContext().setEndBinarySearch(false);
        }
        if (this.getIOContext().useSorted()) {
            if (this.genericUDFClassName == null && this.getIOContext().getGenericUDFClassName() != null) {
                setGenericUDFClassName(this.getIOContext().getGenericUDFClassName());
            }
            if (this.getIOContext().isBinarySearching()) {
                // Proceed with a binary search
                if (this.getIOContext().getComparison() != null) {
                    switch(this.getIOContext().getComparison()) {
                        case GREATER:
                        case EQUAL:
                            // Indexes have only one entry per value, so we could switch to a linear scan
                            // from here; to support any sorted table we continue narrowing the search instead.
                            rangeEnd = previousPosition;
                            break;
                        case LESS:
                            rangeStart = previousPosition;
                            break;
                        default:
                            break;
                    }
                }
                long position = (rangeStart + rangeEnd) / 2;
                sync(position);
                long newPosition = getSyncedPosition();
                // If syncing made no progress or moved past the split end, the matching rows
                // must be in the final block, so we can end the binary search.
                if (newPosition == previousPosition || newPosition >= splitEnd) {
                    this.getIOContext().setBinarySearching(false);
                    sync(rangeStart);
                }
                previousPosition = newPosition;
            } else if (foundAllTargets()) {
                // Found all possible rows which will not be filtered
                return false;
            }
        }
    }
    try {
        /**
         * When starting to read a new file, check for header and footer rows.
         * If the file contains a header, skip the header lines before reading the records.
         * If the file contains a footer, use a FooterBuffer to drop the footer lines
         * at the end of the table file.
         */
        if (this.ioCxtRef.getCurrentBlockStart() == 0) {
            // Check whether the table file has a header to skip.
            footerBuffer = null;
            Path filePath = this.ioCxtRef.getInputPath();
            PartitionDesc part = null;
            try {
                if (pathToPartitionInfo == null) {
                    pathToPartitionInfo = Utilities.getMapWork(jobConf).getPathToPartitionInfo();
                }
                part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, filePath, IOPrepareCache.get().getPartitionDescMap());
            } catch (AssertionError ae) {
                LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath() + "because " + ae.getMessage());
                part = null;
            } catch (Exception e) {
                LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath() + "because " + e.getMessage());
                part = null;
            }
            TableDesc table = (part == null) ? null : part.getTableDesc();
            if (table != null) {
                headerCount = Utilities.getHeaderCount(table);
                footerCount = Utilities.getFooterCount(table, jobConf);
            }
            // If input contains header, skip header.
            if (!Utilities.skipHeader(recordReader, headerCount, (WritableComparable) key, (Writable) value)) {
                return false;
            }
            if (footerCount > 0) {
                footerBuffer = new FooterBuffer();
                if (!footerBuffer.initializeBuffer(jobConf, recordReader, footerCount, (WritableComparable) key, (Writable) value)) {
                    return false;
                }
            }
        }
        if (footerBuffer == null) {
            // Table files don't have footer rows.
            return recordReader.next(key, value);
        } else {
            return footerBuffer.updateBuffer(jobConf, recordReader, (WritableComparable) key, (Writable) value);
        }
    } catch (Exception e) {
        return HiveIOExceptionHandlerUtil.handleRecordReaderNextException(e, jobConf);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) WritableComparable(org.apache.hadoop.io.WritableComparable) Writable(org.apache.hadoop.io.Writable) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) IOException(java.io.IOException) FooterBuffer(org.apache.hadoop.hive.ql.exec.FooterBuffer)
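
The sorted-search branch above keeps a [rangeStart, rangeEnd] window over sync positions, moves one endpoint per comparison, and falls back to a linear scan once syncing to the midpoint stops making progress. A rough standalone sketch of that narrowing loop over an in-memory sorted array, with hypothetical names and no Hive or Hadoop I/O:

import java.util.Arrays;

public class SortedSearchSketch {
    enum Comparison { LESS, EQUAL, GREATER }

    /** Returns the index from which a linear scan for the target should start. */
    static int narrowRange(long[] sortedKeys, long target) {
        int rangeStart = 0;
        int rangeEnd = sortedKeys.length;                 // analogous to splitEnd
        int previousPosition = -1;
        while (true) {
            int position = (rangeStart + rangeEnd) / 2;   // midpoint, like sync(position)
            if (position == previousPosition || position >= sortedKeys.length) {
                return rangeStart;                        // no progress: end binary search, go linear
            }
            Comparison cmp = compare(sortedKeys[position], target);
            switch (cmp) {
                case GREATER:
                case EQUAL:
                    rangeEnd = position;                  // keep searching the left half
                    break;
                case LESS:
                    rangeStart = position;                // keep searching the right half
                    break;
            }
            previousPosition = position;
        }
    }

    static Comparison compare(long key, long target) {
        return key < target ? Comparison.LESS : key == target ? Comparison.EQUAL : Comparison.GREATER;
    }

    public static void main(String[] args) {
        long[] keys = {2, 4, 4, 7, 9, 12, 15};
        int start = narrowRange(keys, 9);
        System.out.println("Linear scan starts at index " + start + " of " + Arrays.toString(keys));
    }
}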

Example 18 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class HiveInputFormat method getSplits.

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
    init(job);
    Path[] dirs = getInputPaths(job);
    JobConf newjob = new JobConf(job);
    List<InputSplit> result = new ArrayList<InputSplit>();
    List<Path> currentDirs = new ArrayList<Path>();
    Class<? extends InputFormat> currentInputFormatClass = null;
    TableDesc currentTable = null;
    TableScanOperator currentTableScan = null;
    boolean pushDownProjection = false;
    // Buffers to hold the column projection information that is pushed down to the readers.
    StringBuilder readColumnsBuffer = new StringBuilder(newjob.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""));
    StringBuilder readColumnNamesBuffer = new StringBuilder(newjob.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""));
    // for each dir, get the InputFormat, and do getSplits.
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
        TableDesc table = part.getTableDesc();
        TableScanOperator tableScan = null;
        List<String> aliases = mrwork.getPathToAliases().get(dir);
        // Make filter pushdown information available to getSplits.
        if ((aliases != null) && (aliases.size() == 1)) {
            Operator op = mrwork.getAliasToWork().get(aliases.get(0));
            if ((op != null) && (op instanceof TableScanOperator)) {
                tableScan = (TableScanOperator) op;
                // Reset the buffers before storing this table scan's pushed-down columns.
                readColumnsBuffer.setLength(0);
                readColumnNamesBuffer.setLength(0);
                // push down projections.
                ColumnProjectionUtils.appendReadColumns(readColumnsBuffer, readColumnNamesBuffer, tableScan.getNeededColumnIDs(), tableScan.getNeededColumns());
                pushDownProjection = true;
                // push down filters
                pushFilters(newjob, tableScan);
            }
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("aliases: {} pathToAliases: {} dir: {}", aliases, mrwork.getPathToAliases(), dir);
            }
        }
        if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass) && table.equals(currentTable) && tableScan == currentTableScan) {
            currentDirs.add(dir);
            continue;
        }
        if (!currentDirs.isEmpty()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Generating splits as currentDirs is not empty. currentDirs: {}", currentDirs);
            }
            // set columns to read in conf
            if (pushDownProjection) {
                pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
            }
            addSplitsForGroup(currentDirs, currentTableScan, newjob, getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass, currentDirs.size() * (numSplits / dirs.length), currentTable, result);
        }
        currentDirs.clear();
        currentDirs.add(dir);
        currentTableScan = tableScan;
        currentTable = table;
        currentInputFormatClass = inputFormatClass;
    }
    // set columns to read in conf
    if (pushDownProjection) {
        pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
    }
    if (dirs.length != 0) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Generating splits for dirs: {}", dirs);
        }
        addSplitsForGroup(currentDirs, currentTableScan, newjob, getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass, currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }
    Utilities.clearWorkMapForConf(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("number of splits " + result.size());
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
    return result.toArray(new HiveInputSplit[result.size()]);
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ArrayList(java.util.ArrayList) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) VectorPartitionDesc(org.apache.hadoop.hive.ql.plan.VectorPartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
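
getSplits batches consecutive input directories that share the same input format class, table descriptor, and table scan, flushing the accumulated batch through addSplitsForGroup whenever the combination changes and once more after the loop. A small standalone sketch of that batching pattern, with hypothetical types in place of Hive's:

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

public class GroupedSplitSketch {
    /** Hypothetical stand-in for a directory's (input format class, table desc) pair. */
    record FormatKey(String inputFormat, String table) {}

    static FormatKey keyOf(String dir) {
        // Toy rule: directories under /orc belong to ORC, everything else to text.
        return dir.startsWith("/orc") ? new FormatKey("OrcInputFormat", "t_orc")
                                      : new FormatKey("TextInputFormat", "t_text");
    }

    static void addSplitsForGroup(List<String> dirs, FormatKey key, List<String> result) {
        // Stand-in for the real addSplitsForGroup: one "split" per batch here.
        result.add(key.inputFormat() + dirs);
    }

    public static void main(String[] args) {
        List<String> dirs = List.of("/orc/p=1", "/orc/p=2", "/text/p=1", "/text/p=2", "/orc/p=3");
        List<String> result = new ArrayList<>();
        List<String> currentDirs = new ArrayList<>();
        FormatKey currentKey = null;

        for (String dir : dirs) {
            FormatKey key = keyOf(dir);
            if (!currentDirs.isEmpty() && Objects.equals(key, currentKey)) {
                currentDirs.add(dir);                               // same format/table: extend the batch
                continue;
            }
            if (!currentDirs.isEmpty()) {
                addSplitsForGroup(currentDirs, currentKey, result); // flush the previous batch
            }
            currentDirs.clear();
            currentDirs.add(dir);
            currentKey = key;
        }
        if (!currentDirs.isEmpty()) {
            addSplitsForGroup(currentDirs, currentKey, result);     // flush the final batch
        }
        System.out.println(result);
    }
}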

Example 19 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class HiveInputFormat method getRecordReader.

public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    HiveInputSplit hsplit = (HiveInputSplit) split;
    InputSplit inputSplit = hsplit.getInputSplit();
    String inputFormatClassName = null;
    Class inputFormatClass = null;
    try {
        inputFormatClassName = hsplit.inputFormatClassName();
        inputFormatClass = job.getClassByName(inputFormatClassName);
    } catch (Exception e) {
        throw new IOException("cannot find class " + inputFormatClassName, e);
    }
    if (this.mrwork == null || pathToPartitionInfo == null) {
        init(job);
    }
    boolean nonNative = false;
    PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, hsplit.getPath(), null);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Found spec for " + hsplit.getPath() + " " + part + " from " + pathToPartitionInfo);
    }
    if ((part != null) && (part.getTableDesc() != null)) {
        Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), job);
        nonNative = part.getTableDesc().isNonNative();
    }
    Path splitPath = hsplit.getPath();
    pushProjectionsAndFilters(job, inputFormatClass, splitPath, nonNative);
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    try {
        inputFormat = HiveInputFormat.wrapForLlap(inputFormat, job, part);
    } catch (HiveException e) {
        throw new IOException(e);
    }
    RecordReader innerReader = null;
    try {
        innerReader = inputFormat.getRecordReader(inputSplit, job, reporter);
    } catch (Exception e) {
        innerReader = HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(e, job);
    }
    HiveRecordReader<K, V> rr = new HiveRecordReader(innerReader, job);
    rr.initIOContext(hsplit, job, inputFormatClass, innerReader);
    return rr;
}
Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) RecordReader(org.apache.hadoop.mapred.RecordReader) IOException(java.io.IOException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) IOException(java.io.IOException) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) VectorPartitionDesc(org.apache.hadoop.hive.ql.plan.VectorPartitionDesc) InputSplit(org.apache.hadoop.mapred.InputSplit)
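
getRecordReader resolves the input format class from the class name stored in the split and reuses a shared instance via getInputFormatFromCache. A minimal sketch of that resolve-and-cache idea using plain reflection and a ConcurrentHashMap; the cache shown here is hypothetical, not Hive's implementation:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class InputFormatCacheSketch {
    // One shared instance per class, analogous to getInputFormatFromCache.
    private static final Map<Class<?>, Object> CACHE = new ConcurrentHashMap<>();

    @SuppressWarnings("unchecked")
    static <T> T getFromCache(Class<T> clazz) {
        return (T) CACHE.computeIfAbsent(clazz, c -> {
            try {
                // Input formats are expected to expose a public no-arg constructor.
                return c.getDeclaredConstructor().newInstance();
            } catch (ReflectiveOperationException e) {
                throw new IllegalStateException("cannot instantiate " + c.getName(), e);
            }
        });
    }

    public static void main(String[] args) throws Exception {
        // Resolve the class by name, as hsplit.inputFormatClassName() / getClassByName do.
        String inputFormatClassName = "java.util.ArrayList";
        Class<?> inputFormatClass = Class.forName(inputFormatClassName);

        Object first = getFromCache(inputFormatClass);
        Object second = getFromCache(inputFormatClass);
        System.out.println("same cached instance: " + (first == second));
    }
}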

Example 20 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class BucketizedHiveInputFormat method getSplits.

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    init(job);
    Path[] dirs = getInputPaths(job);
    JobConf newjob = new JobConf(job);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    int numOrigSplits = 0;
    // for each dir, run getSplits on every file under it individually,
    // and then create a BucketizedHiveInputSplit from each file's splits
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        // create a new InputFormat instance if this is the first time this class is seen
        Class inputFormatClass = part.getInputFileFormatClass();
        InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
        newjob.setInputFormat(inputFormat.getClass());
        FileStatus[] listStatus = listStatus(newjob, dir);
        for (FileStatus status : listStatus) {
            LOG.info("block size: " + status.getBlockSize());
            LOG.info("file length: " + status.getLen());
            FileInputFormat.setInputPaths(newjob, status.getPath());
            InputSplit[] iss = inputFormat.getSplits(newjob, 0);
            if (iss != null && iss.length > 0) {
                numOrigSplits += iss.length;
                result.add(new BucketizedHiveInputSplit(iss, inputFormatClass.getName()));
            }
        }
    }
    LOG.info(result.size() + " bucketized splits generated from " + numOrigSplits + " original splits.");
    return result.toArray(new BucketizedHiveInputSplit[result.size()]);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
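
Each file's splits are wrapped into a single BucketizedHiveInputSplit, so downstream processing sees one split per bucket file. A tiny standalone sketch of such a composite-split container, with hypothetical record types rather than the actual Hadoop InputSplit API:

import java.util.Arrays;
import java.util.List;

public class CompositeSplitSketch {
    /** Hypothetical per-file split: path plus byte range. */
    record Split(String path, long start, long length) {}

    /** Hypothetical composite split: the per-file splits plus the producing format's name. */
    record CompositeSplit(List<Split> splits, String inputFormatClassName) {
        long getLength() {
            return splits.stream().mapToLong(Split::length).sum();  // total bytes across wrapped splits
        }
    }

    public static void main(String[] args) {
        List<Split> perFileSplits = Arrays.asList(
            new Split("/warehouse/t/bucket_0", 0, 128L << 20),
            new Split("/warehouse/t/bucket_0", 128L << 20, 64L << 20));
        CompositeSplit bucketized =
            new CompositeSplit(perFileSplits, "org.apache.hadoop.mapred.TextInputFormat");
        System.out.println(perFileSplits.size() + " original splits wrapped, total length " + bucketized.getLength());
    }
}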

Aggregations

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 58
Path (org.apache.hadoop.fs.Path): 47
ArrayList (java.util.ArrayList): 31
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 27
LinkedHashMap (java.util.LinkedHashMap): 19
HashMap (java.util.HashMap): 14
Map (java.util.Map): 13
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 13
JobConf (org.apache.hadoop.mapred.JobConf): 13
IOException (java.io.IOException): 11
Properties (java.util.Properties): 10
Operator (org.apache.hadoop.hive.ql.exec.Operator): 10
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 10
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 10
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 10
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 8
Configuration (org.apache.hadoop.conf.Configuration): 7
FileSystem (org.apache.hadoop.fs.FileSystem): 7
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 7
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 7