
Example 76 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class GenMapRedUtils method createMRWorkForMergingFiles.

/**
 * Create a MapWork for merging files, based on the input path, the top
 * operator and the input file sink descriptor.
 *
 * @param conf
 *          the Hive configuration used to build the default plan.
 * @param topOp
 *          the table scan operator that is the root of the MapReduce task.
 * @param fsDesc
 *          the file sink descriptor that serves as the input to this merge task.
 * @return the MapWork of the merge task.
 */
private static MapWork createMRWorkForMergingFiles(HiveConf conf, TableScanOperator topOp, FileSinkDesc fsDesc) {
    ArrayList<String> aliases = new ArrayList<String>();
    Path inputDir = StringInternUtils.internUriStringsInPath(fsDesc.getMergeInputDirName());
    String inputDirStr = inputDir.toString().intern();
    TableDesc tblDesc = fsDesc.getTableInfo();
    // dummy alias: just use the input path
    aliases.add(inputDirStr);
    // constructing the default MapredWork
    MapredWork cMrPlan = GenMapRedUtils.getMapRedWorkFromConf(conf);
    MapWork cplan = cMrPlan.getMapWork();
    cplan.addPathToAlias(inputDir, aliases);
    cplan.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
    cplan.getAliasToWork().put(inputDirStr, topOp);
    cplan.setMapperCannotSpanPartns(true);
    return cplan;
}
Also used : Path(org.apache.hadoop.fs.Path) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ArrayList(java.util.ArrayList) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
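
The new PartitionDesc(tblDesc, null) call above is the idiom for an unpartitioned input: a null partition spec makes the descriptor stand in for the whole table, which is exactly what the FetchWork construction in Example 79 later tests for. A minimal sketch of that check (the helper name isUnpartitioned is illustrative, not a Hive API):

import org.apache.hadoop.hive.ql.plan.PartitionDesc;

// Hypothetical helper: a PartitionDesc built with a null partition spec
// describes the table as a whole rather than a single partition.
static boolean isUnpartitioned(PartitionDesc desc) {
    return desc.getPartSpec() == null || desc.getPartSpec().isEmpty();
}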

Example 77 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class GenMapRedUtils method setTaskPlan.

/**
 * Set the current task in the MapWork.
 *
 * @param path
 *          the input path for the alias
 * @param alias
 *          current alias
 * @param topOp
 *          the top operator of the stack
 * @param plan
 *          current plan
 * @param local
 *          whether to add the work to map-reduce work or local work
 * @param ttDesc
 *          table descriptor
 * @throws SemanticException
 */
public static void setTaskPlan(Path path, String alias, Operator<? extends OperatorDesc> topOp, MapWork plan, boolean local, TableDesc ttDesc) throws SemanticException {
    if (path == null || alias == null) {
        return;
    }
    if (topOp instanceof TableScanOperator) {
        try {
            Utilities.addSchemaEvolutionToTableScanOperator((StructObjectInspector) ttDesc.getSerDe().getObjectInspector(), (TableScanOperator) topOp);
        } catch (Exception e) {
            throw new SemanticException(e);
        }
    }
    if (!local) {
        plan.addPathToAlias(path, alias);
        plan.addPathToPartitionInfo(path, new PartitionDesc(ttDesc, null));
        plan.getAliasToWork().put(alias, topOp);
    } else {
        // populate local work if needed
        MapredLocalWork localPlan = plan.getMapRedLocalWork();
        if (localPlan == null) {
            localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
        }
        assert localPlan.getAliasToWork().get(alias) == null;
        assert localPlan.getAliasToFetchWork().get(alias) == null;
        localPlan.getAliasToWork().put(alias, topOp);
        localPlan.getAliasToFetchWork().put(alias, new FetchWork(new Path(alias), ttDesc));
        plan.setMapRedLocalWork(localPlan);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) LinkedHashMap(java.util.LinkedHashMap)
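
Since setTaskPlan is public and static, callers can wire an alias into a MapWork directly. A hedged usage sketch, assuming a pre-built table scan operator, plan, and table descriptor (the path and alias here are made up):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;

// Register alias "t1" for its input path; local == false keeps the work in
// the map-reduce plan rather than in MapredLocalWork.
static void wireAlias(TableScanOperator tsOp, MapWork mapWork, TableDesc tableDesc) throws SemanticException {
    GenMapRedUtils.setTaskPlan(new Path("/warehouse/t1"), "t1", tsOp, mapWork, false, tableDesc);
}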

Example 78 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class SplitGrouper method generateGroupedSplits.

/**
 * Generate groups of splits, separated by schema evolution boundaries;
 * alternatively, when used from the compactor, group splits based on the
 * bucket number of the input files (in that case, splits for the same
 * logical bucket but different schemas end up in the same group).
 */
public Multimap<Integer, InputSplit> generateGroupedSplits(JobConf jobConf, Configuration conf, InputSplit[] splits, float waves, int availableSlots, String inputName, boolean groupAcrossFiles, SplitLocationProvider locationProvider) throws Exception {
    boolean isMinorCompaction = true;
    MapWork mapWork = populateMapWork(jobConf, inputName);
    // ArrayListMultimap is important here to retain the ordering for the splits.
    Multimap<Integer, InputSplit> schemaGroupedSplitMultiMap = ArrayListMultimap.<Integer, InputSplit>create();
    if (HiveConf.getVar(jobConf, HiveConf.ConfVars.SPLIT_GROUPING_MODE).equalsIgnoreCase("compactor")) {
        List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
        for (Path path : paths) {
            List<String> aliases = mapWork.getPathToAliases().get(path);
            if ((aliases != null) && (aliases.size() == 1)) {
                Operator<? extends OperatorDesc> op = mapWork.getAliasToWork().get(aliases.get(0));
                if ((op != null) && (op instanceof TableScanOperator)) {
                    TableScanOperator tableScan = (TableScanOperator) op;
                    PartitionDesc partitionDesc = mapWork.getAliasToPartnInfo().get(aliases.get(0));
                    isMinorCompaction &= AcidUtils.isCompactionTable(partitionDesc.getTableDesc().getProperties());
                    if (!tableScan.getConf().isTranscationalTable() && !isMinorCompaction) {
                        String splitPath = getFirstSplitPath(splits);
                        String errorMessage = "Compactor split grouping is enabled only for transactional tables. Please check the path: " + splitPath;
                        LOG.error(errorMessage);
                        throw new RuntimeException(errorMessage);
                    }
                }
            }
        }
        /**
         * The expectation is that each InputSplit is a {@link org.apache.hadoop.hive.ql.io.HiveInputFormat.HiveInputSplit}
         * wrapping an OrcSplit. So group these splits by bucketId and within each bucketId, sort by writeId, stmtId,
         * rowIdOffset or splitStart. For 'original' splits (w/o acid meta cols in the file) SyntheticBucketProperties
         * should always be there and so rowIdOffset is there. For 'native' acid files, OrcSplit doesn't have
         * the 1st rowid in the split, so splitStart is used to sort. This should achieve the required sorting invariance
         * (sort by: writeId, stmtId, rowIdOffset within each bucket) needed for Acid tables.
         * See: {@link org.apache.hadoop.hive.ql.io.AcidInputFormat}
         * Create a TezGroupedSplit for each bucketId and return.
         * TODO: Are there any other config values (split size etc) that can override this per writer split grouping?
         */
        return getCompactorSplitGroups(splits, conf, isMinorCompaction);
    }
    int i = 0;
    InputSplit prevSplit = null;
    for (InputSplit s : splits) {
        // this is the bit where we make sure we don't group across partition schema boundaries
        if (schemaEvolved(s, prevSplit, groupAcrossFiles, mapWork)) {
            ++i;
            prevSplit = s;
        }
        schemaGroupedSplitMultiMap.put(i, s);
    }
    LOG.info("# Src groups for split generation: " + (i + 1));
    // group them into the chunks we want
    Multimap<Integer, InputSplit> groupedSplits = this.group(jobConf, schemaGroupedSplitMultiMap, availableSlots, waves, locationProvider);
    return groupedSplits;
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) InputSplit(org.apache.hadoop.mapred.InputSplit)
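
The compactor branch above is gated on HiveConf.ConfVars.SPLIT_GROUPING_MODE, which the method reads from the JobConf. A minimal sketch of flipping that switch before calling generateGroupedSplits (the standalone JobConf setup is illustrative):

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.mapred.JobConf;

// "compactor" routes grouping through getCompactorSplitGroups; any other
// value falls through to schema-evolution-based grouping.
static JobConf compactorJobConf() {
    JobConf jobConf = new JobConf();
    jobConf.set(HiveConf.ConfVars.SPLIT_GROUPING_MODE.varname, "compactor");
    return jobConf;
}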

Example 79 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class MapJoinProcessor method genMapJoinLocalWork.

/**
 * Generate the MapRed local work for the given map-join operator.
 *
 * @param newWork
 *          the MapredWork that contains the map-join operator.
 * @param mapJoinOp
 *          map-join operator for which local work needs to be generated.
 * @param bigTablePos
 *          position of the big table among the join parents; its alias is skipped.
 * @throws SemanticException
 */
private static void genMapJoinLocalWork(MapredWork newWork, MapJoinOperator mapJoinOp, int bigTablePos) throws SemanticException {
    // keep the small table alias to avoid concurrent modification exception
    ArrayList<String> smallTableAliasList = new ArrayList<String>();
    // create a new MapredLocalWork
    MapredLocalWork newLocalWork = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : newWork.getMapWork().getAliasToWork().entrySet()) {
        String alias = entry.getKey();
        Operator<? extends OperatorDesc> op = entry.getValue();
        // if the table scan is for the big table, skip it;
        // trace down the operator tree from the table scan operator
        Operator<? extends OperatorDesc> parentOp = op;
        Operator<? extends OperatorDesc> childOp = op.getChildOperators().get(0);
        while ((childOp != null) && (!childOp.equals(mapJoinOp))) {
            parentOp = childOp;
            assert parentOp.getChildOperators().size() == 1;
            childOp = parentOp.getChildOperators().get(0);
        }
        if (childOp == null) {
            throw new SemanticException("Cannot find join op by tracing down the table scan operator tree");
        }
        // skip the big table pos
        int i = childOp.getParentOperators().indexOf(parentOp);
        if (i == bigTablePos) {
            continue;
        }
        // set alias to work and put into smallTableAliasList
        newLocalWork.getAliasToWork().put(alias, op);
        smallTableAliasList.add(alias);
        // get input path and remove this alias from pathToAlias
        // because this file will be fetched by fetch operator
        Map<Path, List<String>> pathToAliases = newWork.getMapWork().getPathToAliases();
        // keep record all the input path for this alias
        HashSet<Path> pathSet = new HashSet<>();
        HashSet<Path> emptyPath = new HashSet<>();
        for (Map.Entry<Path, List<String>> entry2 : pathToAliases.entrySet()) {
            Path path = entry2.getKey();
            List<String> list = entry2.getValue();
            if (list.contains(alias)) {
                // add to path set
                pathSet.add(path);
                // remove this alias from the alias list
                list.remove(alias);
                if (list.size() == 0) {
                    emptyPath.add(path);
                }
            }
        }
        // remove any path that no longer has an associated alias
        for (Path path : emptyPath) {
            newWork.getMapWork().removePathToAlias(path);
        }
        // create fetch work
        FetchWork fetchWork = null;
        List<Path> partDir = new ArrayList<Path>();
        List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
        for (Path tablePath : pathSet) {
            PartitionDesc partitionDesc = newWork.getMapWork().getPathToPartitionInfo().get(tablePath);
            // create FetchWork for a non-partitioned table
            if (partitionDesc.getPartSpec() == null || partitionDesc.getPartSpec().size() == 0) {
                fetchWork = new FetchWork(tablePath, partitionDesc.getTableDesc());
                break;
            }
            // if the table is partitioned, add partDir and partitionDesc
            partDir.add(tablePath);
            partDesc.add(partitionDesc);
        }
        // create FetchWork for a partitioned table
        if (fetchWork == null) {
            TableDesc table = newWork.getMapWork().getAliasToPartnInfo().get(alias).getTableDesc();
            fetchWork = new FetchWork(partDir, partDesc, table);
        }
        // set alias to fetch work
        newLocalWork.getAliasToFetchWork().put(alias, fetchWork);
    }
    // remove small table aliases from aliasToWork; avoids concurrent modification
    for (String alias : smallTableAliasList) {
        newWork.getMapWork().getAliasToWork().remove(alias);
    }
    // set up local work
    newWork.getMapWork().setMapRedLocalWork(newLocalWork);
    // remove reducer
    newWork.setReduceWork(null);
}
Also used : LateralViewJoinOperator(org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ScriptOperator(org.apache.hadoop.hive.ql.exec.ScriptOperator) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap)
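
The FetchWork construction in the loop above branches on whether the PartitionDesc carries a partition spec. A condensed sketch of just that decision, using the two FetchWork constructors seen in the example (the helper name fetchWorkFor is illustrative):

import java.util.Collections;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

// An empty partition spec means the path is the table directory itself;
// otherwise the path is one partition directory and its PartitionDesc
// travels with it.
static FetchWork fetchWorkFor(Path path, PartitionDesc partDesc, TableDesc table) {
    if (partDesc.getPartSpec() == null || partDesc.getPartSpec().isEmpty()) {
        return new FetchWork(path, partDesc.getTableDesc());
    }
    return new FetchWork(Collections.singletonList(path),
        Collections.singletonList(partDesc), table);
}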

Example 80 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class SparkDynamicPartitionPruner method applyFilterToPartitions.

private void applyFilterToPartitions(MapWork work, ObjectInspectorConverters.Converter converter, ExprNodeEvaluator eval, String columnName, Set<Object> values) throws HiveException {
    Object[] row = new Object[1];
    Iterator<Path> it = work.getPathToPartitionInfo().keySet().iterator();
    while (it.hasNext()) {
        Path p = it.next();
        PartitionDesc desc = work.getPathToPartitionInfo().get(p);
        Map<String, String> spec = desc.getPartSpec();
        Preconditions.checkNotNull(spec, "No partition spec found in dynamic pruning");
        String partValueString = spec.get(columnName);
        Preconditions.checkNotNull(partValueString, "Could not find partition value for column: " + columnName);
        Object partValue = converter.convert(partValueString);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Converted partition value: " + partValue + " original (" + partValueString + ")");
        }
        row[0] = partValue;
        partValue = eval.evaluate(row);
        if (LOG.isDebugEnabled()) {
            LOG.debug("part key expr applied: " + partValue);
        }
        if (!values.contains(partValue)) {
            LOG.info("Pruning path: " + p);
            it.remove();
            work.removePathToAlias(p);
            work.removePathToPartitionInfo(p);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)
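
The removal pattern above keeps the MapWork's path-keyed maps consistent: the iterator removal drops the entry from pathToPartitionInfo, while removePathToAlias drops the matching alias entry. A hedged sketch of the same pattern in isolation (shouldPrune is a hypothetical predicate):

import java.util.Iterator;
import java.util.function.Predicate;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.MapWork;

// Drop every input path the predicate rejects; Iterator.remove() mutates
// the underlying map safely while iterating over its key set.
static void prunePaths(MapWork work, Predicate<Path> shouldPrune) {
    Iterator<Path> it = work.getPathToPartitionInfo().keySet().iterator();
    while (it.hasNext()) {
        Path p = it.next();
        if (shouldPrune.test(p)) {
            it.remove();
            work.removePathToAlias(p);
        }
    }
}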

Aggregations

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 90
Path (org.apache.hadoop.fs.Path): 67
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 41
ArrayList (java.util.ArrayList): 39
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 27
LinkedHashMap (java.util.LinkedHashMap): 24
List (java.util.List): 23
JobConf (org.apache.hadoop.mapred.JobConf): 21
Map (java.util.Map): 18
Properties (java.util.Properties): 18
HashMap (java.util.HashMap): 17
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 17
IOException (java.io.IOException): 15
Operator (org.apache.hadoop.hive.ql.exec.Operator): 15
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 14
Configuration (org.apache.hadoop.conf.Configuration): 13
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 13
FileSystem (org.apache.hadoop.fs.FileSystem): 11
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 9
HiveInputFormat (org.apache.hadoop.hive.ql.io.HiveInputFormat): 9