Use of org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency in project hive by apache.
The class PostExecutePrinter, method run:
public void run(QueryState queryState, Set<ReadEntity> inputs,
    Set<WriteEntity> outputs, LineageInfo linfo,
    UserGroupInformation ugi) throws Exception {
  LogHelper console = SessionState.getConsole();
  if (console == null) {
    return;
  }
  if (queryState != null) {
    console.printError("POSTHOOK: query: " + queryState.getQueryString().trim());
    console.printError("POSTHOOK: type: " + queryState.getCommandType());
  }
  PreExecutePrinter.printEntities(console, inputs, "POSTHOOK: Input: ");
  PreExecutePrinter.printEntities(console, outputs, "POSTHOOK: Output: ");
  // Also print out the generic lineage information if there is any
  if (linfo != null) {
    LinkedList<Map.Entry<DependencyKey, Dependency>> entry_list =
        new LinkedList<Map.Entry<DependencyKey, Dependency>>(linfo.entrySet());
    Collections.sort(entry_list, new DependencyKeyComp());
    Iterator<Map.Entry<DependencyKey, Dependency>> iter = entry_list.iterator();
    while (iter.hasNext()) {
      Map.Entry<DependencyKey, Dependency> it = iter.next();
      Dependency dep = it.getValue();
      DependencyKey depK = it.getKey();
      if (dep == null) {
        continue;
      }
      StringBuilder sb = new StringBuilder();
      sb.append("POSTHOOK: Lineage: ");
      if (depK.getDataContainer().isPartition()) {
        Partition part = depK.getDataContainer().getPartition();
        sb.append(part.getTableName());
        sb.append(" PARTITION(");
        int i = 0;
        for (FieldSchema fs : depK.getDataContainer().getTable().getPartitionKeys()) {
          if (i != 0) {
            sb.append(",");
          }
          sb.append(fs.getName() + "=" + part.getValues().get(i++));
        }
        sb.append(")");
      } else {
        sb.append(depK.getDataContainer().getTable().getTableName());
      }
      sb.append("." + depK.getFieldSchema().getName() + " " + dep.getType() + " ");
      sb.append("[");
      for (BaseColumnInfo col : dep.getBaseCols()) {
        sb.append("(" + col.getTabAlias().getTable().getTableName() + ")"
            + col.getTabAlias().getAlias() + "." + col.getColumn() + ", ");
      }
      sb.append("]");
      console.printError(sb.toString());
    }
  }
}
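PostExecutePrinter only fires if it is registered as a post-execution hook. A minimal sketch of registering it through the standard hive.exec.post.hooks property (the property and class names are real; the setup around them is illustrative, not a complete client):

import org.apache.hadoop.hive.conf.HiveConf;

public class RegisterPostHook {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    // hive.exec.post.hooks takes a comma-separated list of hook class names;
    // each listed class is invoked after query execution.
    conf.set("hive.exec.post.hooks",
        "org.apache.hadoop.hive.ql.hooks.PostExecutePrinter");
    // A session started with this conf will call run(...) above after each
    // query, printing POSTHOOK: Input/Output/Lineage lines to the console.
  }
}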
Use of org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency in project hive by apache.
The class ExprProcFactory, method getExprString:
/**
 * Get the expression string of an expression node.
 */
public static String getExprString(RowSchema rs, ExprNodeDesc expr,
    LineageCtx lctx, Operator<? extends OperatorDesc> inpOp, Predicate cond) {
  if (expr instanceof ExprNodeColumnDesc) {
    ExprNodeColumnDesc col = (ExprNodeColumnDesc) expr;
    String internalName = col.getColumn();
    String alias = internalName;
    String tabAlias = col.getTabAlias();
    ColumnInfo ci = rs.getColumnInfo(internalName);
    if (ci != null) {
      if (ci.getAlias() != null) {
        alias = ci.getAlias();
      }
      if (ci.getTabAlias() != null) {
        tabAlias = ci.getTabAlias();
      }
    }
    Dependency dep = lctx.getIndex().getDependency(inpOp, internalName);
    // Mask compiler-generated aliases ("_..." or "$...") with the base column
    // of a SIMPLE dependency, so lineage reads in user-visible terms.
    if ((tabAlias == null || tabAlias.startsWith("_") || tabAlias.startsWith("$"))
        && (dep != null && dep.getType() == DependencyType.SIMPLE)) {
      Set<BaseColumnInfo> baseCols = dep.getBaseCols();
      if (baseCols != null && !baseCols.isEmpty()) {
        BaseColumnInfo baseCol = baseCols.iterator().next();
        tabAlias = baseCol.getTabAlias().getAlias();
        alias = baseCol.getColumn().getName();
      }
    }
    if (tabAlias != null && tabAlias.length() > 0
        && !tabAlias.startsWith("_") && !tabAlias.startsWith("$")) {
      if (cond != null && !findSourceColumn(lctx, cond, tabAlias, alias) && dep != null) {
        cond.getBaseCols().addAll(dep.getBaseCols());
      }
      return tabAlias + "." + alias;
    }
    if (dep != null) {
      if (cond != null) {
        cond.getBaseCols().addAll(dep.getBaseCols());
      }
      if (dep.getExpr() != null) {
        return dep.getExpr();
      }
    }
    // Fall back to the input operator's schema for a better alias.
    if (alias.startsWith("_")) {
      ci = inpOp.getSchema().getColumnInfo(internalName);
      if (ci != null && ci.getAlias() != null) {
        alias = ci.getAlias();
      }
    }
    return alias;
  } else if (expr instanceof ExprNodeGenericFuncDesc) {
    // For a function, render its display string from recursively-resolved children.
    ExprNodeGenericFuncDesc func = (ExprNodeGenericFuncDesc) expr;
    List<ExprNodeDesc> children = func.getChildren();
    String[] childrenExprStrings = new String[children.size()];
    for (int i = 0; i < childrenExprStrings.length; i++) {
      childrenExprStrings[i] = getExprString(rs, children.get(i), lctx, inpOp, cond);
    }
    return func.getGenericUDF().getDisplayString(childrenExprStrings);
  }
  return expr.getExprString();
}
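The branch worth calling out is the alias masking: internal names generated by the compiler start with "_" or "$", and for a SIMPLE dependency the method swaps them for the first base column so the output reads in user terms. A minimal standalone sketch of that rule, with hypothetical names (SourceColumn, resolveAlias) distilled from the method above rather than taken from Hive:

import java.util.List;

public class AliasResolution {
  // Hypothetical stand-in for BaseColumnInfo: one source column of a dependency.
  record SourceColumn(String tabAlias, String column) {}

  // If the table alias is missing or compiler-generated ("_..." or "$..."),
  // and the dependency is a simple copy with known base columns, report the
  // first base column instead of the internal name.
  static String resolveAlias(String tabAlias, String alias, List<SourceColumn> baseCols) {
    boolean generated = tabAlias == null
        || tabAlias.startsWith("_") || tabAlias.startsWith("$");
    if (generated && baseCols != null && !baseCols.isEmpty()) {
      SourceColumn base = baseCols.get(0);
      return base.tabAlias() + "." + base.column();
    }
    return (tabAlias != null && !tabAlias.isEmpty()) ? tabAlias + "." + alias : alias;
  }

  public static void main(String[] args) {
    // "_col0" under a generated alias traces back to src.key.
    System.out.println(resolveAlias("$hdt$_0", "_col0",
        List.of(new SourceColumn("src", "key"))));  // prints src.key
  }
}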
Use of org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency in project hive by apache.
The class LineageLogger, method getEdges:
/**
 * Based on the final select operator, find out all the target columns.
 * For each target column, find out its sources based on the dependency index.
 */
private List<Edge> getEdges(QueryPlan plan, Index index) {
  LinkedHashMap<String, ObjectPair<SelectOperator, org.apache.hadoop.hive.ql.metadata.Table>> finalSelOps =
      index.getFinalSelectOps();
  Map<String, Vertex> vertexCache = new LinkedHashMap<String, Vertex>();
  List<Edge> edges = new ArrayList<Edge>();
  for (ObjectPair<SelectOperator, org.apache.hadoop.hive.ql.metadata.Table> pair : finalSelOps.values()) {
    List<FieldSchema> fieldSchemas = plan.getResultSchema().getFieldSchemas();
    SelectOperator finalSelOp = pair.getFirst();
    org.apache.hadoop.hive.ql.metadata.Table t = pair.getSecond();
    String destTableName = null;
    List<String> colNames = null;
    if (t != null) {
      destTableName = t.getDbName() + "." + t.getTableName();
      fieldSchemas = t.getCols();
    } else {
      // Based on the plan outputs, find out the target table name and column names.
      for (WriteEntity output : plan.getOutputs()) {
        Entity.Type entityType = output.getType();
        if (entityType == Entity.Type.TABLE || entityType == Entity.Type.PARTITION) {
          t = output.getTable();
          destTableName = t.getDbName() + "." + t.getTableName();
          List<FieldSchema> cols = t.getCols();
          if (cols != null && !cols.isEmpty()) {
            colNames = Utilities.getColumnNamesFromFieldSchema(cols);
          }
          break;
        }
      }
    }
    Map<ColumnInfo, Dependency> colMap = index.getDependencies(finalSelOp);
    List<Dependency> dependencies = colMap != null ? Lists.newArrayList(colMap.values()) : null;
    int fields = fieldSchemas.size();
    if (t != null && colMap != null && fields < colMap.size()) {
      // Dynamic partition keys should be added to field schemas.
      List<FieldSchema> partitionKeys = t.getPartitionKeys();
      int dynamicKeyCount = colMap.size() - fields;
      int keyOffset = partitionKeys.size() - dynamicKeyCount;
      if (keyOffset >= 0) {
        fields += dynamicKeyCount;
        for (int i = 0; i < dynamicKeyCount; i++) {
          FieldSchema field = partitionKeys.get(keyOffset + i);
          fieldSchemas.add(field);
          if (colNames != null) {
            colNames.add(field.getName());
          }
        }
      }
    }
    if (dependencies == null || dependencies.size() != fields) {
      log("Result schema has " + fields + " fields, but we don't get as many dependencies");
    } else {
      // Go through each target column, generate the lineage edges.
      Set<Vertex> targets = new LinkedHashSet<Vertex>();
      for (int i = 0; i < fields; i++) {
        Vertex target = getOrCreateVertex(vertexCache,
            getTargetFieldName(i, destTableName, colNames, fieldSchemas),
            Vertex.Type.COLUMN);
        targets.add(target);
        Dependency dep = dependencies.get(i);
        addEdge(vertexCache, edges, dep.getBaseCols(), target, dep.getExpr(), Edge.Type.PROJECTION);
      }
      Set<Predicate> conds = index.getPredicates(finalSelOp);
      if (conds != null && !conds.isEmpty()) {
        for (Predicate cond : conds) {
          addEdge(vertexCache, edges, cond.getBaseCols(), new LinkedHashSet<Vertex>(targets),
              cond.getExpr(), Edge.Type.PREDICATE);
        }
      }
    }
  }
  return edges;
}
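The subtle step above is the dynamic-partition bookkeeping: with dynamic partitioning, the dependency map covers the table columns plus the dynamically computed partition keys, so it can be larger than the result schema. Since static partition keys precede dynamic ones in the partition spec, the extras are the trailing keys. A self-contained sketch of just that offset arithmetic (the helper name appendDynamicKeys is hypothetical, not Hive API):

import java.util.ArrayList;
import java.util.List;

public class DynamicPartitionKeys {
  // dependencyCount - cols.size() extra dependencies correspond to the
  // *last* dynamicKeyCount partition keys, hence the keyOffset computation.
  static List<String> appendDynamicKeys(List<String> cols, List<String> partitionKeys,
      int dependencyCount) {
    List<String> result = new ArrayList<>(cols);
    int dynamicKeyCount = dependencyCount - cols.size();
    int keyOffset = partitionKeys.size() - dynamicKeyCount;
    if (dynamicKeyCount > 0 && keyOffset >= 0) {
      result.addAll(partitionKeys.subList(keyOffset, partitionKeys.size()));
    }
    return result;
  }

  public static void main(String[] args) {
    // Table (a, b) partitioned by (ds, hr); hr is dynamic, so 3 dependencies.
    System.out.println(appendDynamicKeys(List.of("a", "b"), List.of("ds", "hr"), 3));
    // prints [a, b, hr]
  }
}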
Use of org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency in project hive by apache.
The class ExprProcFactory, method getExprDependency:
/**
 * Gets the expression dependencies for the expression.
 *
 * @param lctx
 *          The lineage context containing the input operator's dependencies.
 * @param inpOp
 *          The input operator to the current operator.
 * @param expr
 *          The expression that is being processed.
 * @return the dependency computed for the expression.
 * @throws SemanticException
 */
public static Dependency getExprDependency(LineageCtx lctx,
    Operator<? extends OperatorDesc> inpOp, ExprNodeDesc expr) throws SemanticException {
  // Create the walker, the rules dispatcher and the context.
  ExprProcCtx exprCtx = new ExprProcCtx(lctx, inpOp);
  // Create a walker which walks the tree in a DFS manner while maintaining
  // the operator stack.
  Map<Rule, NodeProcessor> exprRules = new LinkedHashMap<Rule, NodeProcessor>();
  exprRules.put(new RuleRegExp("R1", ExprNodeColumnDesc.class.getName() + "%"), getColumnProcessor());
  exprRules.put(new RuleRegExp("R2", ExprNodeFieldDesc.class.getName() + "%"), getFieldProcessor());
  exprRules.put(new RuleRegExp("R3", ExprNodeGenericFuncDesc.class.getName() + "%"), getGenericFuncProcessor());
  // The dispatcher fires the processor corresponding to the closest matching
  // rule and passes the context along.
  Dispatcher disp = new DefaultRuleDispatcher(getDefaultExprProcessor(), exprRules, exprCtx);
  GraphWalker egw = new DefaultGraphWalker(disp);
  List<Node> startNodes = new ArrayList<Node>();
  startNodes.add(expr);
  HashMap<Node, Object> outputMap = new HashMap<Node, Object>();
  egw.startWalking(startNodes, outputMap);
  return (Dependency) outputMap.get(expr);
}
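The method is an instance of Hive's generic rule/dispatcher walk: regex rules keyed on node class names map to processors, with a default processor as the fallback. A much-simplified standalone sketch of the dispatch idea, assuming plain Java maps and functions in place of Hive's Rule, Dispatcher, and NodeProcessor types (the real DefaultRuleDispatcher scores rules against the walked path and picks the closest match, which this sketch does not reproduce):

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.function.Function;

public class RuleDispatchSketch {
  public static void main(String[] args) {
    // Pattern key -> processor, in registration order; unmatched nodes fall
    // through to the default processor, as with getDefaultExprProcessor().
    Map<String, Function<Object, String>> rules = new LinkedHashMap<>();
    rules.put("ExprNodeColumnDesc", n -> "column dependency");
    rules.put("ExprNodeGenericFuncDesc", n -> "union of child dependencies");
    Function<Object, String> defaultProc = n -> "no dependency";

    Object node = new Object() {};      // stand-in for an expression node
    String key = "ExprNodeColumnDesc";  // in Hive, derived from the node's class
    String result = rules.getOrDefault(key, defaultProc).apply(node);
    System.out.println(result);         // prints "column dependency"
  }
}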