Search in sources :

Example 6 with RowSchema

use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.

the class MapJoinProcessor method convertSMBJoinToMapJoin.

/**
 * Converts a sort-merge join operator into a map-side join operator.
 *
 * <p>A {@link MapJoinOperator} is created from the sort-merge join's descriptor, statistics and
 * row schema. It then takes the sort-merge join's place in the operator graph: every child and
 * every parent of {@code smbJoinOp} is re-pointed at the new operator, and {@code smbJoinOp}
 * is left with {@code null} child and parent lists.
 *
 * @param hconf
 *          configuration passed to {@code PlanUtils} when building the key table descriptor
 * @param smbJoinOp
 *          sort-merge join operator to be replaced
 * @param bigTablePos
 *          position of the source to be read as part of map-reduce framework. All other sources
 *          are cached in memory
 * @param noCheckOuterJoin
 *          not read by this method
 * @return the map join operator that replaced {@code smbJoinOp}
 * @throws SemanticException
 *           declared by the signature; presumably propagated from the helpers called here
 */
public static MapJoinOperator convertSMBJoinToMapJoin(HiveConf hconf, SMBMapJoinOperator smbJoinOp, int bigTablePos, boolean noCheckOuterJoin) throws SemanticException {
    SMBJoinDesc sourceDesc = smbJoinOp.getConf();

    // The key table descriptor is derived from the join keys stored under tag 0.
    List<ExprNodeDesc> tagZeroKeys = sourceDesc.getKeys().get(Byte.valueOf((byte) 0));
    TableDesc keyTblDesc = PlanUtils.getMapJoinKeyTableDesc(
        hconf, PlanUtils.getFieldSchemasFromColumnList(tagZeroKeys, MAPJOINKEY_FIELDPREFIX));

    // Build the map join descriptor from the pieces of the sort-merge join descriptor.
    // The value table descriptors are deliberately passed for both descriptor arguments,
    // exactly as the descriptor is populated today.
    MapJoinDesc targetDesc = new MapJoinDesc(
        sourceDesc.getKeys(),
        keyTblDesc,
        sourceDesc.getExprs(),
        sourceDesc.getValueTblDescs(),
        sourceDesc.getValueTblDescs(),
        sourceDesc.getOutputColumnNames(),
        bigTablePos,
        sourceDesc.getConds(),
        sourceDesc.getFilters(),
        sourceDesc.isNoOuterJoin(),
        sourceDesc.getDumpFilePrefix());
    targetDesc.setStatistics(sourceDesc.getStatistics());

    // The map join exposes the same row schema as the join it replaces; it starts out with
    // no parents and is wired into the graph below.
    RowSchema sharedSchema = smbJoinOp.getSchema();
    List<Operator<? extends OperatorDesc>> noParents = new ArrayList<Operator<? extends OperatorDesc>>();
    MapJoinOperator replacement = (MapJoinOperator) OperatorFactory.getAndMakeChild(
        smbJoinOp.getCompilationOpContext(), targetDesc, sharedSchema, noParents);

    // Downstream side: each child now sees the map join as its parent.
    List<Operator<? extends OperatorDesc>> children = smbJoinOp.getChildOperators();
    for (Operator<? extends OperatorDesc> child : children) {
        child.replaceParent(smbJoinOp, replacement);
    }
    replacement.setChildOperators(children);
    smbJoinOp.setChildOperators(null);

    // Upstream side: each parent now feeds the map join instead of the sort-merge join.
    List<Operator<? extends OperatorDesc>> parents = smbJoinOp.getParentOperators();
    for (Operator<? extends OperatorDesc> upstream : parents) {
        upstream.replaceChild(smbJoinOp, replacement);
    }
    replacement.setParentOperators(parents);
    smbJoinOp.setParentOperators(null);

    return replacement;
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LateralViewJoinOperator(org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ScriptOperator(org.apache.hadoop.hive.ql.exec.ScriptOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) SMBJoinDesc(org.apache.hadoop.hive.ql.plan.SMBJoinDesc) ArrayList(java.util.ArrayList) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 7 with RowSchema

use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.

the class CorrelationUtilities method removeReduceSinkForGroupBy.

/**
 * Removes the reduce sink {@code cRS} that feeds the group-by {@code cGBYr} and switches
 * {@code cGBYr} to {@code GroupByDesc.Mode.COMPLETE}.
 *
 * <p>Two plan shapes are handled:
 * <ul>
 *   <li>{@code pRS-cGBYm-cRS-cGBYr} (single parent of {@code cRS} is a group-by and map
 *       aggregation is on): {@code cGBYr} takes over the aggregators, column expression map
 *       and schema of {@code cGBYm}; both {@code cRS} and {@code cGBYm} are removed.</li>
 *   <li>{@code pRS-cRS-cGBYr} (otherwise): the keys, aggregation parameters and column
 *       expression map of {@code cGBYr} are backtracked through {@code cRS}; only
 *       {@code cRS} is removed.</li>
 * </ul>
 *
 * @param cRS reduce sink to remove
 * @param cGBYr group-by operator below {@code cRS}
 * @param context parse context handed to {@code removeOperator}
 * @param procCtx supplies the map-aggregation flag and records removed operators
 * @throws SemanticException declared by the signature; presumably raised by the helpers called here
 */
protected static void removeReduceSinkForGroupBy(ReduceSinkOperator cRS, GroupByOperator cGBYr, ParseContext context, AbstractCorrelationProcCtx procCtx) throws SemanticException {
    Operator<?> parent = getSingleParent(cRS);
    if ((parent instanceof GroupByOperator) && procCtx.isMapAggr()) {
        // pRS-cGBYm-cRS-cGBYr (map aggregation) --> pRS-cGBYr(COMPLETE)
        // cGBYr inherits the description of cGBYm; cGBYm and cRS go away.
        GroupByOperator cGBYm = (GroupByOperator) parent;

        // Keys are traced back twice: first across cRS, then across cGBYm.
        cGBYr.getConf().setKeys(
            ExprNodeDescUtils.backtrack(
                ExprNodeDescUtils.backtrack(cGBYr.getConf().getKeys(), cGBYr, cRS), cRS, cGBYm));

        cGBYr.getConf().setAggregators(cGBYm.getConf().getAggregators());
        for (AggregationDesc mapSideAggr : cGBYm.getConf().getAggregators()) {
            mapSideAggr.setMode(GenericUDAFEvaluator.Mode.COMPLETE);
        }

        cGBYr.setColumnExprMap(cGBYm.getColumnExprMap());
        cGBYr.setSchema(cGBYm.getSchema());
    } else {
        // pRS-cRS-cGBYr (no map aggregation) --> pRS-cGBYr(COMPLETE)
        // Expressions of cGBYr are rewritten in terms of what cRS receives.
        cGBYr.getConf().setKeys(ExprNodeDescUtils.backtrack(cGBYr.getConf().getKeys(), cGBYr, cRS));
        for (AggregationDesc reduceSideAggr : cGBYr.getConf().getAggregators()) {
            reduceSideAggr.setParameters(ExprNodeDescUtils.backtrack(reduceSideAggr.getParameters(), cGBYr, cRS));
        }

        Map<String, ExprNodeDesc> previousExprs = cGBYr.getColumnExprMap();
        RowSchema previousSchema = cGBYr.getSchema();
        Map<String, ExprNodeDesc> rebuiltExprs = new HashMap<String, ExprNodeDesc>();
        ArrayList<ColumnInfo> rebuiltColumns = new ArrayList<ColumnInfo>();

        // Rebuild schema and expression map in output-column order.
        for (String outputCol : cGBYr.getConf().getOutputColumnNames()) {
            ColumnInfo info = previousSchema.getColumnInfo(outputCol);
            rebuiltColumns.add(info);
            ExprNodeDesc tracedExpr = ExprNodeDescUtils.backtrack(previousExprs.get(outputCol), cGBYr, cRS);
            // Columns whose expression cannot be traced back get no entry in the new map.
            if (tracedExpr != null) {
                rebuiltExprs.put(info.getInternalName(), tracedExpr);
            }
        }
        cGBYr.setColumnExprMap(rebuiltExprs);
        cGBYr.setSchema(new RowSchema(rebuiltColumns));
    }

    cGBYr.getConf().setMode(GroupByDesc.Mode.COMPLETE);
    removeOperator(cRS, cGBYr, parent, context);
    procCtx.addRemovedOperator(cRS);

    if ((parent instanceof GroupByOperator) && procCtx.isMapAggr()) {
        // Map-aggregation shape: the map-side group-by is detached as well.
        removeOperator(parent, cGBYr, getSingleParent(parent), context);
        // NOTE(review): this registers cGBYr, while the operator just detached is `parent`
        // (cGBYm) -- confirm this is intended.
        procCtx.addRemovedOperator(cGBYr);
    }
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) AggregationDesc(org.apache.hadoop.hive.ql.plan.AggregationDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)

Example 8 with RowSchema

use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.

the class HiveOpConverter method genReduceSinkAndBacktrackSelect.

/**
 * Generates a reduce sink on top of {@code input}, followed by a select operator that
 * backtracks to the columns named in {@code keepColNames}.
 *
 * <p>The table alias handed to the reduce sink is picked from the table names of the input
 * schema: {@code null} names are skipped, an empty name is only used while no other name has
 * been seen, and two different non-empty names are rejected.
 *
 * @return the select operator placed on top of the generated reduce sink
 * @throws SemanticException if the input schema yields no table alias or more than one
 *         distinct non-empty table alias
 */
private static SelectOperator genReduceSinkAndBacktrackSelect(Operator<?> input, ExprNodeDesc[] keys, int tag, ArrayList<ExprNodeDesc> partitionCols, String order, String nullOrder, int numReducers, Operation acidOperation, HiveConf hiveConf, List<String> keepColNames) throws SemanticException {
    // 1. Generate RS operator
    // 1.1 Choose the single table alias. Empty strings are tolerated (they are only allowed
    //     for virtual columns) but lose against any non-empty name.
    String tableAlias = null;
    for (String candidate : input.getSchema().getTableNames()) {
        if (candidate == null) {
            continue;
        }
        if (candidate.isEmpty()) {
            // An empty name is only a fallback when nothing has been picked so far.
            if (tableAlias == null) {
                tableAlias = candidate;
            }
        } else if (tableAlias == null || tableAlias.isEmpty()) {
            tableAlias = candidate;
        } else if (!candidate.equals(tableAlias)) {
            throw new SemanticException("In CBO return path, genReduceSinkAndBacktrackSelect is expecting only one tableAlias but there is more than one");
        }
    }
    if (tableAlias == null) {
        throw new SemanticException("In CBO return path, genReduceSinkAndBacktrackSelect is expecting only one tableAlias but there is none");
    }

    // 1.2 Now generate RS operator
    ReduceSinkOperator rsOp = genReduceSink(input, tableAlias, keys, tag, partitionCols, order, nullOrder, numReducers, acidOperation, hiveConf);

    // 2. Generate backtrack Select operator
    Map<String, ExprNodeDesc> backtracked = buildBacktrackFromReduceSink(
        keepColNames,
        rsOp.getConf().getOutputKeyColumnNames(),
        rsOp.getConf().getOutputValueColumnNames(),
        rsOp.getValueIndex(),
        input);
    ArrayList<ExprNodeDesc> selectExprs = new ArrayList<ExprNodeDesc>(backtracked.values());
    ArrayList<String> selectNames = new ArrayList<String>(backtracked.keySet());
    SelectDesc selectDesc = new SelectDesc(selectExprs, selectNames);

    ArrayList<ColumnInfo> keptColumns = createColInfosSubset(input, keepColNames);
    SelectOperator selectOp = (SelectOperator) OperatorFactory.getAndMakeChild(selectDesc, new RowSchema(keptColumns), rsOp);
    selectOp.setColumnExprMap(backtracked);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Generated " + selectOp + " with row schema: [" + selectOp.getSchema() + "]");
    }
    return selectOp;
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SelectDesc(org.apache.hadoop.hive.ql.plan.SelectDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)

Example 9 with RowSchema

use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.

the class HiveOpConverter method genReduceSink.

/**
 * Creates a reduce sink operator as a child of {@code input}.
 *
 * <p>Every column of the input schema is classified, after backtracking through a temporary
 * dummy operator, as one of:
 * <ul>
 *   <li>a reduce key: its index entry is the non-negative key position;</li>
 *   <li>a duplicate of an already emitted value: its index entry is {@code -position - 1}
 *       and no new output column is added;</li>
 *   <li>a new value: appended to the reduce values, index entry {@code -position - 1}.</li>
 * </ul>
 * The resulting index array is stored on the operator via {@code setValueIndex}.
 *
 * @return the generated reduce sink operator
 * @throws SemanticException if there are no reduce keys and
 *         {@code StrictChecks.checkCartesian} reports an error
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private static ReduceSinkOperator genReduceSink(Operator<?> input, String tableAlias, ExprNodeDesc[] keys, int tag, ArrayList<ExprNodeDesc> partitionCols, String order, String nullOrder, int numReducers, Operation acidOperation, HiveConf hiveConf) throws SemanticException {
    // Temporary operator attached below the input purely for backtracking; detached again
    // once all columns have been classified.
    Operator dummy = Operator.createDummy();
    dummy.setParentOperators(Arrays.asList(input));

    // Reduce keys, alongside their backtracked form used for matching input columns.
    ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>();
    ArrayList<ExprNodeDesc> reduceKeysBack = new ArrayList<ExprNodeDesc>();
    for (ExprNodeDesc keyExpr : keys) {
        reduceKeys.add(keyExpr);
        reduceKeysBack.add(ExprNodeDescUtils.backtrack(keyExpr, dummy, input));
    }

    // Walk over the input schema and copy it into the output.
    ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
    ArrayList<ExprNodeDesc> reduceValuesBack = new ArrayList<ExprNodeDesc>();
    List<ColumnInfo> inputColumns = input.getSchema().getSignature();
    ArrayList<ColumnInfo> outputColumns = new ArrayList<ColumnInfo>();
    List<String> outputColumnNames = new ArrayList<String>();
    int[] index = new int[inputColumns.size()];

    for (int pos = 0; pos < inputColumns.size(); pos++) {
        ColumnInfo sourceCol = inputColumns.get(pos);
        String internalName = sourceCol.getInternalName();
        ExprNodeColumnDesc columnExpr = new ExprNodeColumnDesc(sourceCol);
        // backtrack can be null when input is script operator
        ExprNodeDesc tracedExpr = ExprNodeDescUtils.backtrack(columnExpr, dummy, input);

        int keyPos = -1;
        if (tracedExpr != null) {
            keyPos = ExprNodeDescUtils.indexOf(tracedExpr, reduceKeysBack);
        }

        if (keyPos >= 0) {
            // Column is one of the reduce keys: expose it under the KEY namespace.
            ColumnInfo keyCol = new ColumnInfo(sourceCol);
            keyCol.setInternalName(Utilities.ReduceField.KEY + ".reducesinkkey" + keyPos);
            keyCol.setAlias(internalName);
            keyCol.setTabAlias(tableAlias);
            outputColumns.add(keyCol);
            index[pos] = keyPos;
        } else {
            int valuePos = -1;
            if (tracedExpr != null) {
                valuePos = ExprNodeDescUtils.indexOf(tracedExpr, reduceValuesBack);
            }

            if (valuePos >= 0) {
                // Same expression already emitted as a value: only record where it lives.
                index[pos] = -valuePos - 1;
            } else {
                // New value column; its slot is the current size of the value list.
                index[pos] = -reduceValues.size() - 1;
                reduceValues.add(columnExpr);
                reduceValuesBack.add(tracedExpr);
                ColumnInfo valueCol = new ColumnInfo(sourceCol);
                valueCol.setInternalName(Utilities.ReduceField.VALUE + "." + internalName);
                valueCol.setAlias(internalName);
                valueCol.setTabAlias(tableAlias);
                outputColumns.add(valueCol);
                outputColumnNames.add(internalName);
            }
        }
    }
    dummy.setParentOperators(null);

    // Use only 1 reducer if no reduce keys
    if (reduceKeys.isEmpty()) {
        numReducers = 1;
        // Cartesian product is not supported in strict mode
        String error = StrictChecks.checkCartesian(hiveConf);
        if (error != null) {
            throw new SemanticException(error);
        }
    }

    // An explicit order selects the overload taking partition columns and ordering;
    // otherwise the overload taking the number of key columns is used.
    ReduceSinkDesc rsDesc;
    if (!order.isEmpty()) {
        rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, false, tag, partitionCols, order, nullOrder, numReducers, acidOperation);
    } else {
        rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, false, tag, reduceKeys.size(), numReducers, acidOperation);
    }
    ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDesc, new RowSchema(outputColumns), input);

    // Map each output column name (KEY.* / VALUE.*) to the expression that produces it.
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
    for (int k = 0; k < keyColNames.size(); k++) {
        colExprMap.put(Utilities.ReduceField.KEY + "." + keyColNames.get(k), reduceKeys.get(k));
    }
    List<String> valColNames = rsDesc.getOutputValueColumnNames();
    for (int v = 0; v < valColNames.size(); v++) {
        colExprMap.put(Utilities.ReduceField.VALUE + "." + valColNames.get(v), reduceValues.get(v));
    }

    rsOp.setValueIndex(index);
    rsOp.setColumnExprMap(colExprMap);
    rsOp.setInputAliases(input.getSchema().getTableNames().toArray(new String[input.getSchema().getTableNames().size()]));

    if (LOG.isDebugEnabled()) {
        LOG.debug("Generated " + rsOp + " with row schema: [" + rsOp.getSchema() + "]");
    }
    return rsOp;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)

Example 10 with RowSchema

use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.

the class HiveOpConverter method visit.

/**
 * Translates a {@code HiveTableScan} into a {@code TableScanOperator} and registers it in
 * {@code topOps}.
 *
 * <p>TODO: 1. PPD needs to get pushed in to TS
 *
 * @param scanRel
 *          the table scan relational node to translate
 * @return an {@code OpAttr} built from the alias under which the scan was registered, the
 *         positions of its virtual and partition columns, and the generated operator
 */
OpAttr visit(HiveTableScan scanRel) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("Translating operator rel#" + scanRel.getId() + ":" + scanRel.getRelTypeName() + " with row type: [" + scanRel.getRowType() + "]");
    }
    RelOptHiveTable ht = (RelOptHiveTable) scanRel.getTable();

    // 1. Setup TableScan Desc
    // 1.1 Build col details used by scan
    ArrayList<ColumnInfo> schemaColumns = new ArrayList<ColumnInfo>();
    List<VirtualColumn> virtualCols = new ArrayList<VirtualColumn>();
    List<Integer> neededColumnIDs = new ArrayList<Integer>();
    List<String> neededColumnNames = new ArrayList<String>();
    Set<Integer> vcolsInCalcite = new HashSet<Integer>();
    List<String> partColNames = new ArrayList<String>();

    Map<Integer, VirtualColumn> virtualByPos = HiveCalciteUtil.getVColsMap(ht.getVirtualCols(), ht.getNoOfNonVirtualCols());
    Map<Integer, ColumnInfo> partitionByPos = ht.getPartColInfoMap();
    Map<Integer, ColumnInfo> regularByPos = ht.getNonPartColInfoMap();
    List<Integer> neededPositions = scanRel.getNeededColIndxsFrmReloptHT();
    List<String> fieldNames = scanRel.getRowType().getFieldNames();
    String tableAlias = scanRel.getConcatQbIDAlias();

    for (int pos = 0; pos < scanRel.getRowType().getFieldList().size(); pos++) {
        String fieldName = fieldNames.get(pos);
        ColumnInfo column;
        if (virtualByPos.containsKey(pos)) {
            // Virtual column: gets a freshly built ColumnInfo bound to the scan's alias.
            VirtualColumn virtualCol = virtualByPos.get(pos);
            virtualCols.add(virtualCol);
            column = new ColumnInfo(virtualCol.getName(), virtualCol.getTypeInfo(), tableAlias, true, virtualCol.getIsHidden());
            vcolsInCalcite.add(pos);
        } else if (partitionByPos.containsKey(pos)) {
            // Partition column: reuses the table's ColumnInfo; its position is recorded
            // together with the virtual columns.
            partColNames.add(fieldName);
            column = partitionByPos.get(pos);
            vcolsInCalcite.add(pos);
        } else {
            column = regularByPos.get(pos);
        }
        schemaColumns.add(column);

        if (neededPositions.contains(pos)) {
            neededColumnIDs.add(pos);
            neededColumnNames.add(fieldName);
        }
    }

    // 1.2 Create TableScanDesc
    TableScanDesc tsd = new TableScanDesc(tableAlias, virtualCols, ht.getHiveTableMD());
    // 1.3. Set Partition cols in TSDesc
    tsd.setPartColumns(partColNames);
    // 1.4. Set needed cols in TSDesc
    tsd.setNeededColumnIDs(neededColumnIDs);
    tsd.setNeededColumns(neededColumnNames);

    // 2. Setup TableScan
    TableScanOperator ts = (TableScanOperator) OperatorFactory.get(semanticAnalyzer.getOpContext(), tsd, new RowSchema(schemaColumns));

    // More than one table scan may carry the same alias. If this alias is already registered
    // in topOps, append uniqueCounter so the new scan is stored under a different key.
    if (topOps.get(tableAlias) != null) {
        tableAlias = tableAlias + this.uniqueCounter;
    }
    topOps.put(tableAlias, ts);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Generated " + ts + " with row schema: [" + ts.getSchema() + "]");
    }
    return new OpAttr(tableAlias, vcolsInCalcite, ts);
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) ArrayList(java.util.ArrayList) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) RelOptHiveTable(org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable) VirtualColumn(org.apache.hadoop.hive.ql.metadata.VirtualColumn) HashSet(java.util.HashSet)

Aggregations

RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema)76 ArrayList (java.util.ArrayList)59 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)57 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)56 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)42 HashMap (java.util.HashMap)39 Operator (org.apache.hadoop.hive.ql.exec.Operator)36 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)32 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)31 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)31 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)30 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)28 ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc)28 LinkedHashMap (java.util.LinkedHashMap)26 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)25 AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator)23 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)23 SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)22 List (java.util.List)14 SelectDesc (org.apache.hadoop.hive.ql.plan.SelectDesc)14