Search in sources :

Example 1 with Dependency

use of org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency in project hive by apache.

the class PostExecutePrinter method run.

/**
 * Post-execution hook: echoes the query, its inputs and outputs, and any lineage
 * information to the session console.
 *
 * <p>Nothing is printed when the session has no console. Lineage entries are sorted
 * with {@code DependencyKeyComp} before printing, and entries whose dependency is
 * {@code null} are skipped.
 *
 * @param queryState state of the executed query; query text and command type are
 *          printed only when this is non-null
 * @param inputs entities read by the query
 * @param outputs entities written by the query
 * @param linfo lineage information; may be {@code null}, in which case no lineage
 *          lines are printed
 * @param ugi not used by this method
 * @throws Exception declared by the signature; this body adds no explicit throws
 */
public void run(QueryState queryState, Set<ReadEntity> inputs, Set<WriteEntity> outputs, LineageInfo linfo, UserGroupInformation ugi) throws Exception {
    LogHelper console = SessionState.getConsole();
    if (console == null) {
        return;
    }
    if (queryState != null) {
        console.printError("POSTHOOK: query: " + queryState.getQueryString().trim());
        console.printError("POSTHOOK: type: " + queryState.getCommandType());
    }
    PreExecutePrinter.printEntities(console, inputs, "POSTHOOK: Input: ");
    PreExecutePrinter.printEntities(console, outputs, "POSTHOOK: Output: ");
    // Generic lineage information is optional; stop here when there is none.
    if (linfo == null) {
        return;
    }
    LinkedList<Map.Entry<DependencyKey, Dependency>> sortedEntries = new LinkedList<Map.Entry<DependencyKey, Dependency>>(linfo.entrySet());
    Collections.sort(sortedEntries, new DependencyKeyComp());
    for (Map.Entry<DependencyKey, Dependency> entry : sortedEntries) {
        Dependency dependency = entry.getValue();
        if (dependency == null) {
            continue;
        }
        DependencyKey key = entry.getKey();
        StringBuilder line = new StringBuilder("POSTHOOK: Lineage: ");
        if (!key.getDataContainer().isPartition()) {
            line.append(key.getDataContainer().getTable().getTableName());
        } else {
            // Partitioned target: table name followed by key=value pairs, where the
            // n-th partition key is paired with the n-th partition value.
            Partition partition = key.getDataContainer().getPartition();
            line.append(partition.getTableName()).append(" PARTITION(");
            int valueIndex = 0;
            for (FieldSchema partitionKey : key.getDataContainer().getTable().getPartitionKeys()) {
                if (valueIndex > 0) {
                    line.append(",");
                }
                line.append(partitionKey.getName()).append("=").append(partition.getValues().get(valueIndex));
                valueIndex++;
            }
            line.append(")");
        }
        line.append(".").append(key.getFieldSchema().getName()).append(" ").append(dependency.getType()).append(" ");
        line.append("[");
        // Each base column is followed by ", " (including the last one).
        for (BaseColumnInfo baseCol : dependency.getBaseCols()) {
            line.append("(").append(baseCol.getTabAlias().getTable().getTableName()).append(")")
                .append(baseCol.getTabAlias().getAlias()).append(".").append(baseCol.getColumn()).append(", ");
        }
        line.append("]");
        console.printError(line.toString());
    }
}
Also used : Partition(org.apache.hadoop.hive.metastore.api.Partition) LogHelper(org.apache.hadoop.hive.ql.session.SessionState.LogHelper) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) DependencyKey(org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyKey) Dependency(org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency) LinkedList(java.util.LinkedList) BaseColumnInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo) Map(java.util.Map)

Example 2 with Dependency

use of org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency in project hive by apache.

the class ExprProcFactory method getExprString.

/**
 * Get the expression string of an expression node.
 *
 * <p>For a column expression, the alias and table alias are taken from the row schema
 * when available. If the table alias is missing or starts with "_" or "$" and the
 * column has a SIMPLE dependency, the first base column of that dependency supplies
 * the table alias and column name instead. A usable table alias yields
 * {@code tabAlias.alias}; otherwise the dependency's expression is returned if there
 * is one, and failing that the alias (looked up in the input operator's schema when
 * it starts with "_").
 *
 * <p>For a generic function expression, the children are rendered recursively and
 * passed to the UDF's display-string method. Any other expression falls back to
 * {@code expr.getExprString()}.
 *
 * @param rs row schema used to resolve the column's alias and table alias
 * @param expr the expression to render
 * @param lctx lineage context whose index provides column dependencies
 * @param inpOp the input operator the column dependency is looked up against
 * @param cond optional predicate; when non-null, base columns of the encountered
 *          dependencies are added to its base-column set
 * @return the display string for the expression
 */
public static String getExprString(RowSchema rs, ExprNodeDesc expr, LineageCtx lctx, Operator<? extends OperatorDesc> inpOp, Predicate cond) {
    if (expr instanceof ExprNodeColumnDesc) {
        ExprNodeColumnDesc col = (ExprNodeColumnDesc) expr;
        String internalName = col.getColumn();
        String alias = internalName;
        String tabAlias = col.getTabAlias();
        ColumnInfo ci = rs.getColumnInfo(internalName);
        if (ci != null) {
            if (ci.getAlias() != null) {
                alias = ci.getAlias();
            }
            if (ci.getTabAlias() != null) {
                tabAlias = ci.getTabAlias();
            }
        }
        Dependency dep = lctx.getIndex().getDependency(inpOp, internalName);
        if ((tabAlias == null || tabAlias.startsWith("_") || tabAlias.startsWith("$")) && (dep != null && dep.getType() == DependencyType.SIMPLE)) {
            Set<BaseColumnInfo> baseCols = dep.getBaseCols();
            if (baseCols != null && !baseCols.isEmpty()) {
                BaseColumnInfo baseCol = baseCols.iterator().next();
                tabAlias = baseCol.getTabAlias().getAlias();
                alias = baseCol.getColumn().getName();
            }
        }
        if (tabAlias != null && tabAlias.length() > 0 && !tabAlias.startsWith("_") && !tabAlias.startsWith("$")) {
            if (cond != null && !findSourceColumn(lctx, cond, tabAlias, alias) && dep != null) {
                // The dependency's base columns may be null (see the null check above);
                // addAll(null) would throw, so guard it the same way here.
                Set<BaseColumnInfo> depBaseCols = dep.getBaseCols();
                if (depBaseCols != null) {
                    cond.getBaseCols().addAll(depBaseCols);
                }
            }
            return tabAlias + "." + alias;
        }
        if (dep != null) {
            if (cond != null) {
                // Same null guard as above before merging into the predicate.
                Set<BaseColumnInfo> depBaseCols = dep.getBaseCols();
                if (depBaseCols != null) {
                    cond.getBaseCols().addAll(depBaseCols);
                }
            }
            if (dep.getExpr() != null) {
                return dep.getExpr();
            }
        }
        if (alias.startsWith("_")) {
            // Alias starts with "_": try the input operator's schema for another alias.
            ci = inpOp.getSchema().getColumnInfo(internalName);
            if (ci != null && ci.getAlias() != null) {
                alias = ci.getAlias();
            }
        }
        return alias;
    } else if (expr instanceof ExprNodeGenericFuncDesc) {
        ExprNodeGenericFuncDesc func = (ExprNodeGenericFuncDesc) expr;
        List<ExprNodeDesc> children = func.getChildren();
        String[] childrenExprStrings = new String[children.size()];
        for (int i = 0; i < childrenExprStrings.length; i++) {
            childrenExprStrings[i] = getExprString(rs, children.get(i), lctx, inpOp, cond);
        }
        return func.getGenericUDF().getDisplayString(childrenExprStrings);
    }
    return expr.getExprString();
}
Also used : ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) BaseColumnInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) ArrayList(java.util.ArrayList) List(java.util.List) Dependency(org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency)

Example 3 with Dependency

use of org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency in project hive by apache.

the class LineageLogger method getEdges.

/**
 * Based on the final select operator, find out all the target columns.
 * For each target column, find out its sources based on the dependency index.
 *
 * <p>For every final select operator in the index, one PROJECTION edge is added per
 * target column, plus one PREDICATE edge per predicate of that operator pointing at all
 * of its target columns. When the number of dependencies does not match the number of
 * target fields, a message is logged and no edges are produced for that operator.
 *
 * @param plan the query plan; supplies the result schema and the outputs used to find
 *          the destination table when the index does not provide one
 * @param index the dependency index holding final select operators, column
 *          dependencies and predicates
 * @return the lineage edges collected across all final select operators
 */
private List<Edge> getEdges(QueryPlan plan, Index index) {
    LinkedHashMap<String, ObjectPair<SelectOperator, org.apache.hadoop.hive.ql.metadata.Table>> finalSelOps = index.getFinalSelectOps();
    Map<String, Vertex> vertexCache = new LinkedHashMap<String, Vertex>();
    List<Edge> edges = new ArrayList<Edge>();
    for (ObjectPair<SelectOperator, org.apache.hadoop.hive.ql.metadata.Table> pair : finalSelOps.values()) {
        List<FieldSchema> fieldSchemas = plan.getResultSchema().getFieldSchemas();
        SelectOperator finalSelOp = pair.getFirst();
        org.apache.hadoop.hive.ql.metadata.Table t = pair.getSecond();
        String destTableName = null;
        List<String> colNames = null;
        if (t != null) {
            destTableName = t.getDbName() + "." + t.getTableName();
            fieldSchemas = t.getCols();
        } else {
            // Based on the plan outputs, find out the target table name and column names.
            for (WriteEntity output : plan.getOutputs()) {
                Entity.Type entityType = output.getType();
                if (entityType == Entity.Type.TABLE || entityType == Entity.Type.PARTITION) {
                    t = output.getTable();
                    destTableName = t.getDbName() + "." + t.getTableName();
                    List<FieldSchema> cols = t.getCols();
                    if (cols != null && !cols.isEmpty()) {
                        colNames = Utilities.getColumnNamesFromFieldSchema(cols);
                    }
                    // Only the first table/partition output is used.
                    break;
                }
            }
        }
        Map<ColumnInfo, Dependency> colMap = index.getDependencies(finalSelOp);
        List<Dependency> dependencies = colMap != null ? Lists.newArrayList(colMap.values()) : null;
        int fields = fieldSchemas.size();
        if (t != null && colMap != null && fields < colMap.size()) {
            // Dynamic partition keys should be added to field schemas.
            List<FieldSchema> partitionKeys = t.getPartitionKeys();
            int dynamicKeyCount = colMap.size() - fields;
            // The dynamic keys are taken from the tail of the partition key list.
            int keyOffset = partitionKeys.size() - dynamicKeyCount;
            if (keyOffset >= 0) {
                // Append to a private copy: fieldSchemas is the list object handed out by
                // the table or the result schema, and adding to it in place would leak
                // the partition keys into that object.
                fieldSchemas = new ArrayList<FieldSchema>(fieldSchemas);
                fields += dynamicKeyCount;
                for (int i = 0; i < dynamicKeyCount; i++) {
                    FieldSchema field = partitionKeys.get(keyOffset + i);
                    fieldSchemas.add(field);
                    if (colNames != null) {
                        colNames.add(field.getName());
                    }
                }
            }
        }
        if (dependencies == null || dependencies.size() != fields) {
            log("Result schema has " + fields + " fields, but we don't get as many dependencies");
        } else {
            // Go through each target column, generate the lineage edges.
            Set<Vertex> targets = new LinkedHashSet<Vertex>();
            for (int i = 0; i < fields; i++) {
                Vertex target = getOrCreateVertex(vertexCache, getTargetFieldName(i, destTableName, colNames, fieldSchemas), Vertex.Type.COLUMN);
                targets.add(target);
                Dependency dep = dependencies.get(i);
                addEdge(vertexCache, edges, dep.getBaseCols(), target, dep.getExpr(), Edge.Type.PROJECTION);
            }
            Set<Predicate> conds = index.getPredicates(finalSelOp);
            if (conds != null && !conds.isEmpty()) {
                for (Predicate cond : conds) {
                    // Each predicate edge gets its own copy of the target set.
                    addEdge(vertexCache, edges, cond.getBaseCols(), new LinkedHashSet<Vertex>(targets), cond.getExpr(), Edge.Type.PREDICATE);
                }
            }
        }
    }
    return edges;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) BaseColumnInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo) LinkedHashMap(java.util.LinkedHashMap) Predicate(org.apache.hadoop.hive.ql.hooks.LineageInfo.Predicate) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) Table(org.apache.hadoop.hive.metastore.api.Table) Dependency(org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency) ObjectPair(org.apache.hadoop.hive.common.ObjectPair)

Example 4 with Dependency

use of org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency in project hive by apache.

the class ExprProcFactory method getExprDependency.

/**
 * Computes the dependency of an expression by walking its node tree with a
 * rule-driven dispatcher.
 *
 * <p>Column, field and generic-function nodes each get their own processor; the
 * default expression processor is handed to the dispatcher alongside those rules.
 *
 * @param lctx
 *          The lineage context containing the input operators dependencies.
 * @param inpOp
 *          The input operator to the current operator.
 * @param expr
 *          The expression that is being processed.
 * @return the value the walk recorded for {@code expr}, cast to {@code Dependency}
 * @throws SemanticException
 *           declared by the signature; presumably raised by the graph walk (verify
 *           against the walker)
 */
public static Dependency getExprDependency(LineageCtx lctx, Operator<? extends OperatorDesc> inpOp, ExprNodeDesc expr) throws SemanticException {
    // Context shared by all processors during this walk.
    ExprProcCtx procCtx = new ExprProcCtx(lctx, inpOp);
    // Rules are kept in insertion order: R1 columns, R2 fields, R3 generic functions.
    Map<Rule, NodeProcessor> rules = new LinkedHashMap<Rule, NodeProcessor>();
    rules.put(new RuleRegExp("R1", ExprNodeColumnDesc.class.getName() + "%"), getColumnProcessor());
    rules.put(new RuleRegExp("R2", ExprNodeFieldDesc.class.getName() + "%"), getFieldProcessor());
    rules.put(new RuleRegExp("R3", ExprNodeGenericFuncDesc.class.getName() + "%"), getGenericFuncProcessor());
    // The dispatcher fires the processor of the closest matching rule and passes
    // the context along; the walker drives it over the expression tree.
    GraphWalker walker = new DefaultGraphWalker(new DefaultRuleDispatcher(getDefaultExprProcessor(), rules, procCtx));
    HashMap<Node, Object> walkResults = new HashMap<Node, Object>();
    List<Node> roots = new ArrayList<Node>();
    roots.add(expr);
    walker.startWalking(roots, walkResults);
    return (Dependency) walkResults.get(expr);
}
Also used : NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) DefaultGraphWalker(org.apache.hadoop.hive.ql.lib.DefaultGraphWalker) Node(org.apache.hadoop.hive.ql.lib.Node) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) ArrayList(java.util.ArrayList) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) Dependency(org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency) Dispatcher(org.apache.hadoop.hive.ql.lib.Dispatcher) ExprNodeFieldDesc(org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) Rule(org.apache.hadoop.hive.ql.lib.Rule) GraphWalker(org.apache.hadoop.hive.ql.lib.GraphWalker)

Aggregations

Dependency (org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency)4 ArrayList (java.util.ArrayList)3 BaseColumnInfo (org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo)3 LinkedHashMap (java.util.LinkedHashMap)2 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)2 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)2 ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc)2 ExprNodeGenericFuncDesc (org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc)2 HashMap (java.util.HashMap)1 LinkedHashSet (java.util.LinkedHashSet)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 Map (java.util.Map)1 ObjectPair (org.apache.hadoop.hive.common.ObjectPair)1 Partition (org.apache.hadoop.hive.metastore.api.Partition)1 Table (org.apache.hadoop.hive.metastore.api.Table)1 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)1 DependencyKey (org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyKey)1 Predicate (org.apache.hadoop.hive.ql.hooks.LineageInfo.Predicate)1 DefaultGraphWalker (org.apache.hadoop.hive.ql.lib.DefaultGraphWalker)1