Search in sources :

Example 1 with Predicate

Use of org.apache.hadoop.hive.ql.hooks.LineageInfo.Predicate in the Apache Hive project.

Class LineageLogger, method getEdges.

/**
 * Builds the lineage edges for every final select operator recorded in the index.
 *
 * <p>For each final select operator, the target columns are resolved and each one
 * is linked to its source columns through the dependency index. When the index
 * supplies a destination table, its columns are the targets; otherwise the plan's
 * result schema is used, and the first TABLE/PARTITION output of the plan (if any)
 * provides the destination table name and column names. Predicates registered for
 * the select operator are passed to {@code addEdge} against all target columns.
 *
 * <p>The field schema list is copied before dynamic partition keys are appended,
 * so neither the table's column list nor the plan's result schema is modified by
 * this method.
 *
 * @param plan the query plan supplying the result schema and the outputs
 * @param index the lineage index supplying final select operators, dependencies
 *     and predicates
 * @return the edges collected for all final select operators; possibly empty
 */
private List<Edge> getEdges(QueryPlan plan, Index index) {
    LinkedHashMap<String, ObjectPair<SelectOperator, org.apache.hadoop.hive.ql.metadata.Table>> finalSelOps = index.getFinalSelectOps();
    Map<String, Vertex> vertexCache = new LinkedHashMap<String, Vertex>();
    List<Edge> edges = new ArrayList<Edge>();
    for (ObjectPair<SelectOperator, org.apache.hadoop.hive.ql.metadata.Table> pair : finalSelOps.values()) {
        SelectOperator finalSelOp = pair.getFirst();
        org.apache.hadoop.hive.ql.metadata.Table t = pair.getSecond();
        String destTableName = null;
        List<String> colNames = null;
        // Always work on a private copy: dynamic partition keys may be appended
        // below, and the lists returned by the table / result schema are owned by
        // those objects and must not be altered as a side effect of lineage logging.
        List<FieldSchema> fieldSchemas;
        if (t != null) {
            destTableName = t.getDbName() + "." + t.getTableName();
            fieldSchemas = new ArrayList<FieldSchema>(t.getCols());
        } else {
            fieldSchemas = new ArrayList<FieldSchema>(plan.getResultSchema().getFieldSchemas());
            // Based on the plan outputs, find out the target table name and column names.
            for (WriteEntity output : plan.getOutputs()) {
                Entity.Type entityType = output.getType();
                if (entityType == Entity.Type.TABLE || entityType == Entity.Type.PARTITION) {
                    t = output.getTable();
                    destTableName = t.getDbName() + "." + t.getTableName();
                    List<FieldSchema> cols = t.getCols();
                    if (cols != null && !cols.isEmpty()) {
                        colNames = Utilities.getColumnNamesFromFieldSchema(cols);
                    }
                    // Only the first table/partition output is considered.
                    break;
                }
            }
        }
        Map<ColumnInfo, Dependency> colMap = index.getDependencies(finalSelOp);
        List<Dependency> dependencies = colMap != null ? new ArrayList<Dependency>(colMap.values()) : null;
        int fields = fieldSchemas.size();
        if (t != null && colMap != null && fields < colMap.size()) {
            // Dynamic partition keys should be added to field schemas.
            // The surplus dependencies are matched against the trailing partition keys.
            List<FieldSchema> partitionKeys = t.getPartitionKeys();
            int dynamicKeyCount = colMap.size() - fields;
            int keyOffset = partitionKeys.size() - dynamicKeyCount;
            if (keyOffset >= 0) {
                fields += dynamicKeyCount;
                for (int i = 0; i < dynamicKeyCount; i++) {
                    FieldSchema field = partitionKeys.get(keyOffset + i);
                    fieldSchemas.add(field);
                    if (colNames != null) {
                        colNames.add(field.getName());
                    }
                }
            }
        }
        if (dependencies == null || dependencies.size() != fields) {
            log("Result schema has " + fields + " fields, but we don't get as many dependencies");
        } else {
            // Go through each target column, generate the lineage edges.
            Set<Vertex> targets = new LinkedHashSet<Vertex>();
            for (int i = 0; i < fields; i++) {
                Vertex target = getOrCreateVertex(vertexCache, getTargetFieldName(i, destTableName, colNames, fieldSchemas), Vertex.Type.COLUMN);
                targets.add(target);
                Dependency dep = dependencies.get(i);
                addEdge(vertexCache, edges, dep.getBaseCols(), target, dep.getExpr(), Edge.Type.PROJECTION);
            }
            Set<Predicate> conds = index.getPredicates(finalSelOp);
            if (conds != null && !conds.isEmpty()) {
                for (Predicate cond : conds) {
                    // Each predicate edge gets its own copy of the target set.
                    addEdge(vertexCache, edges, cond.getBaseCols(), new LinkedHashSet<Vertex>(targets), cond.getExpr(), Edge.Type.PREDICATE);
                }
            }
        }
    }
    return edges;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) BaseColumnInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo) LinkedHashMap(java.util.LinkedHashMap) Predicate(org.apache.hadoop.hive.ql.hooks.LineageInfo.Predicate) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) Table(org.apache.hadoop.hive.metastore.api.Table) Dependency(org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency) ObjectPair(org.apache.hadoop.hive.common.ObjectPair)

Aggregations

ArrayList (java.util.ArrayList)1 LinkedHashMap (java.util.LinkedHashMap)1 LinkedHashSet (java.util.LinkedHashSet)1 ObjectPair (org.apache.hadoop.hive.common.ObjectPair)1 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)1 Table (org.apache.hadoop.hive.metastore.api.Table)1 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)1 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)1 BaseColumnInfo (org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo)1 Dependency (org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency)1 Predicate (org.apache.hadoop.hive.ql.hooks.LineageInfo.Predicate)1