Search in sources :

Example 76 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class GenMapRedUtils method createMRWorkForMergingFiles.

/**
 * Create a MapWork based on the input path, the top operator and the input
 * table descriptor.
 *
 * @param conf
 *          the Hive configuration used to construct the default MapredWork
 * @param topOp
 *          the table scan operator that is the root of the MapReduce task.
 * @param fsDesc
 *          the file sink descriptor that serves as the input to this merge task.
 * @return the MapWork of the newly constructed MapredWork
 */
private static MapWork createMRWorkForMergingFiles(HiveConf conf, TableScanOperator topOp, FileSinkDesc fsDesc) {
    ArrayList<String> aliases = new ArrayList<String>();
    Path inputDir = StringInternUtils.internUriStringsInPath(fsDesc.getMergeInputDirName());
    String inputDirStr = inputDir.toString().intern();
    TableDesc tblDesc = fsDesc.getTableInfo();
    // dummy alias: just use the input path
    // constructing the default MapredWork
    MapredWork cMrPlan = GenMapRedUtils.getMapRedWorkFromConf(conf);
    MapWork cplan = cMrPlan.getMapWork();
    cplan.addPathToAlias(inputDir, aliases);
    cplan.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
    cplan.getAliasToWork().put(inputDirStr, topOp);
    return cplan;
Also used : Path(org.apache.hadoop.fs.Path) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ArrayList(java.util.ArrayList) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)

Example 77 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class GenMapRedUtils method setTaskPlan.

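/**
 * Set the current task in the map work plan.
 *
 * @param path
 *          the input path to associate with the alias
 * @param alias
 *          current alias
 * @param topOp
 *          the top operator of the stack
 * @param plan
 *          current plan
 * @param local
 *          whether you need to add to map-reduce or local work
 * @param ttDesc
 *          table descriptor
 * @throws SemanticException
 *           if schema evolution information cannot be added to the table scan operator
 */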
public static void setTaskPlan(Path path, String alias, Operator<? extends OperatorDesc> topOp, MapWork plan, boolean local, TableDesc ttDesc) throws SemanticException {
    if (path == null || alias == null) {
    if (topOp instanceof TableScanOperator) {
        try {
            Utilities.addSchemaEvolutionToTableScanOperator((StructObjectInspector) ttDesc.getSerDe().getObjectInspector(), (TableScanOperator) topOp);
        } catch (Exception e) {
            throw new SemanticException(e);
    if (!local) {
        plan.addPathToAlias(path, alias);
        plan.addPathToPartitionInfo(path, new PartitionDesc(ttDesc, null));
        plan.getAliasToWork().put(alias, topOp);
    } else {
        // populate local work if needed
        MapredLocalWork localPlan = plan.getMapRedLocalWork();
        if (localPlan == null) {
            localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
        assert localPlan.getAliasToWork().get(alias) == null;
        assert localPlan.getAliasToFetchWork().get(alias) == null;
        localPlan.getAliasToWork().put(alias, topOp);
        localPlan.getAliasToFetchWork().put(alias, new FetchWork(new Path(alias), ttDesc));
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) LinkedHashMap(java.util.LinkedHashMap)

Example 78 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class SplitGrouper method generateGroupedSplits.

/**
 * Generate groups of splits, separated by schema evolution boundaries,
 * OR,
 * when used from the compactor, group splits based on the bucket number of the input files
 * (in this case, splits for the same logical bucket but with different schemas end up in the same group).
 */
public Multimap<Integer, InputSplit> generateGroupedSplits(JobConf jobConf, Configuration conf, InputSplit[] splits, float waves, int availableSlots, String inputName, boolean groupAcrossFiles, SplitLocationProvider locationProvider) throws Exception {
    boolean isMinorCompaction = true;
    MapWork mapWork = populateMapWork(jobConf, inputName);
    // ArrayListMultimap is important here to retain the ordering for the splits.
    Multimap<Integer, InputSplit> schemaGroupedSplitMultiMap = ArrayListMultimap.<Integer, InputSplit>create();
    if (HiveConf.getVar(jobConf, HiveConf.ConfVars.SPLIT_GROUPING_MODE).equalsIgnoreCase("compactor")) {
        List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
        for (Path path : paths) {
            List<String> aliases = mapWork.getPathToAliases().get(path);
            if ((aliases != null) && (aliases.size() == 1)) {
                Operator<? extends OperatorDesc> op = mapWork.getAliasToWork().get(aliases.get(0));
                if ((op != null) && (op instanceof TableScanOperator)) {
                    TableScanOperator tableScan = (TableScanOperator) op;
                    PartitionDesc partitionDesc = mapWork.getAliasToPartnInfo().get(aliases.get(0));
                    isMinorCompaction &= AcidUtils.isCompactionTable(partitionDesc.getTableDesc().getProperties());
                    if (!tableScan.getConf().isTranscationalTable() && !isMinorCompaction) {
                        String splitPath = getFirstSplitPath(splits);
                        String errorMessage = "Compactor split grouping is enabled only for transactional tables. Please check the path: " + splitPath;
                        throw new RuntimeException(errorMessage);
         * The expectation is that each InputSplit is a {@link}
         * wrapping an OrcSplit. So group these splits by bucketId and within each bucketId, sort by writeId, stmtId,
         * rowIdOffset or splitStart. For 'original' splits (w/o acid meta cols in the file) SyntheticBucketProperties
         * should always be there and so rowIdOffset is there. For 'native' acid files, OrcSplit doesn't have
         * the 1st rowid in the split, so splitStart is used to sort. This should achieve the required sorting invariance
         * (sort by: writeId, stmtId, rowIdOffset within each bucket) needed for Acid tables.
         * See: {@link}
         * Create a TezGroupedSplit for each bucketId and return.
         * TODO: Are there any other config values (split size etc) that can override this per writer split grouping?
        return getCompactorSplitGroups(splits, conf, isMinorCompaction);
    int i = 0;
    InputSplit prevSplit = null;
    for (InputSplit s : splits) {
        // this is the bit where we make sure we don't group across partition schema boundaries
        if (schemaEvolved(s, prevSplit, groupAcrossFiles, mapWork)) {
            prevSplit = s;
        schemaGroupedSplitMultiMap.put(i, s);
    }"# Src groups for split generation: " + (i + 1));
    // group them into the chunks we want
    Multimap<Integer, InputSplit> groupedSplits =, schemaGroupedSplitMultiMap, availableSlots, waves, locationProvider);
    return groupedSplits;
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 79 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class MapJoinProcessor method genMapJoinLocalWork.

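/**
 * Generate the MapRed Local Work for the given map-join operator.
 *
 * @param newWork
 *          the map-reduce work whose small-table aliases are handed over to the local work
 * @param mapJoinOp
 *          map-join operator for which local work needs to be generated.
 * @param bigTablePos
 *          position of the big table among the parent operators of the join
 * @throws SemanticException
 *           if the join operator cannot be found by tracing down from a table scan operator
 */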
private static void genMapJoinLocalWork(MapredWork newWork, MapJoinOperator mapJoinOp, int bigTablePos) throws SemanticException {
    // keep the small table alias to avoid concurrent modification exception
    ArrayList<String> smallTableAliasList = new ArrayList<String>();
    // create a new  MapredLocalWork
    MapredLocalWork newLocalWork = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : newWork.getMapWork().getAliasToWork().entrySet()) {
        String alias = entry.getKey();
        Operator<? extends OperatorDesc> op = entry.getValue();
        // if the table scan is for big table; then skip it
        // tracing down the operator tree from the table scan operator
        Operator<? extends OperatorDesc> parentOp = op;
        Operator<? extends OperatorDesc> childOp = op.getChildOperators().get(0);
        while ((childOp != null) && (!childOp.equals(mapJoinOp))) {
            parentOp = childOp;
            assert parentOp.getChildOperators().size() == 1;
            childOp = parentOp.getChildOperators().get(0);
        if (childOp == null) {
            throw new SemanticException("Cannot find join op by tracing down the table scan operator tree");
        // skip the big table pos
        int i = childOp.getParentOperators().indexOf(parentOp);
        if (i == bigTablePos) {
        // set alias to work and put into smallTableAliasList
        newLocalWork.getAliasToWork().put(alias, op);
        // get input path and remove this alias from pathToAlias
        // because this file will be fetched by fetch operator
        Map<Path, List<String>> pathToAliases = newWork.getMapWork().getPathToAliases();
        // keep record all the input path for this alias
        HashSet<Path> pathSet = new HashSet<>();
        HashSet<Path> emptyPath = new HashSet<>();
        for (Map.Entry<Path, List<String>> entry2 : pathToAliases.entrySet()) {
            Path path = entry2.getKey();
            List<String> list = entry2.getValue();
            if (list.contains(alias)) {
                // add to path set
                // remove this alias from the alias list
                if (list.size() == 0) {
        // remove the path, with which no alias associates
        for (Path path : emptyPath) {
        // create fetch work
        FetchWork fetchWork = null;
        List<Path> partDir = new ArrayList<Path>();
        List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
        for (Path tablePath : pathSet) {
            PartitionDesc partitionDesc = newWork.getMapWork().getPathToPartitionInfo().get(tablePath);
            // create fetchwork for non partitioned table
            if (partitionDesc.getPartSpec() == null || partitionDesc.getPartSpec().size() == 0) {
                fetchWork = new FetchWork(tablePath, partitionDesc.getTableDesc());
            // if table is partitioned,add partDir and partitionDesc
        // create fetchwork for partitioned table
        if (fetchWork == null) {
            TableDesc table = newWork.getMapWork().getAliasToPartnInfo().get(alias).getTableDesc();
            fetchWork = new FetchWork(partDir, partDesc, table);
        // set alias to fetch work
        newLocalWork.getAliasToFetchWork().put(alias, fetchWork);
    // remove small table alias from aliasToWork;Avoid concurrent modification
    for (String alias : smallTableAliasList) {
    // set up local work
    // remove reducer
Also used : LateralViewJoinOperator(org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ScriptOperator(org.apache.hadoop.hive.ql.exec.ScriptOperator) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap)

Example 80 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class SparkDynamicPartitionPruner method applyFilterToPartitions.

private void applyFilterToPartitions(MapWork work, ObjectInspectorConverters.Converter converter, ExprNodeEvaluator eval, String columnName, Set<Object> values) throws HiveException {
    Object[] row = new Object[1];
    Iterator<Path> it = work.getPathToPartitionInfo().keySet().iterator();
    while (it.hasNext()) {
        Path p =;
        PartitionDesc desc = work.getPathToPartitionInfo().get(p);
        Map<String, String> spec = desc.getPartSpec();
        Preconditions.checkNotNull(spec, "No partition spec found in dynamic pruning");
        String partValueString = spec.get(columnName);
        Preconditions.checkNotNull(partValueString, "Could not find partition value for column: " + columnName);
        Object partValue = converter.convert(partValueString);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Converted partition value: " + partValue + " original (" + partValueString + ")");
        row[0] = partValue;
        partValue = eval.evaluate(row);
        if (LOG.isDebugEnabled()) {
            LOG.debug("part key expr applied: " + partValue);
        if (!values.contains(partValue)) {
  "Pruning path: " + p);
Also used : Path(org.apache.hadoop.fs.Path) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)


PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)90 Path (org.apache.hadoop.fs.Path)67 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)41 ArrayList (java.util.ArrayList)39 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)27 LinkedHashMap (java.util.LinkedHashMap)24 List (java.util.List)23 JobConf (org.apache.hadoop.mapred.JobConf)21 Map (java.util.Map)18 Properties (java.util.Properties)18 HashMap (java.util.HashMap)17 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)17 IOException ( Operator (org.apache.hadoop.hive.ql.exec.Operator)15 MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)14 Configuration (org.apache.hadoop.conf.Configuration)13 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)13 FileSystem (org.apache.hadoop.fs.FileSystem)11 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)9 HiveInputFormat (