Search in sources :

Example 1 with BaseColumnInfo

use of org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo in project atlas by apache.

the class CreateHiveProcess method processColumnLineage.

/**
 * Adds one column-lineage process entity to {@code entities} for each output column in the
 * Hive lineage info that resolves to a known entity and has at least one resolvable input column.
 *
 * <p>Output or input columns that cannot be resolved through {@code context.getEntity(...)}
 * are logged at WARN level and skipped.
 *
 * @param hiveProcess the process entity whose name and qualified name prefix the lineage entity's
 * @param entities    the collection the new column-lineage entities are added to
 */
private void processColumnLineage(AtlasEntity hiveProcess, AtlasEntitiesWithExtInfo entities) {
    LineageInfo lineage = getHiveContext().getLinfo();
    if (lineage == null || CollectionUtils.isEmpty(lineage.entrySet())) {
        return;
    }
    for (Map.Entry<DependencyKey, Dependency> lineageEntry : lineage.entrySet()) {
        String targetQualifiedName = getQualifiedName(lineageEntry.getKey());
        AtlasEntity target = context.getEntity(targetQualifiedName);
        if (target == null) {
            LOG.warn("column-lineage: non-existing output-column {}", targetQualifiedName);
            continue;
        }
        Dependency dependency = lineageEntry.getValue();

        // Resolve every base column; unknown ones are reported and left out.
        List<AtlasEntity> sources = new ArrayList<>();
        for (BaseColumnInfo baseCol : dependency.getBaseCols()) {
            String sourceQualifiedName = getQualifiedName(baseCol);
            AtlasEntity source = context.getEntity(sourceQualifiedName);
            if (source != null) {
                sources.add(source);
            } else {
                LOG.warn("column-lineage: non-existing input-column {} for output-column={}", sourceQualifiedName, targetQualifiedName);
            }
        }

        // No lineage entity is produced when none of the inputs could be resolved.
        if (!sources.isEmpty()) {
            Object targetName = target.getAttribute(ATTRIBUTE_NAME);
            AtlasEntity lineageProcess = new AtlasEntity(HIVE_TYPE_COLUMN_LINEAGE);
            lineageProcess.setAttribute(ATTRIBUTE_NAME, hiveProcess.getAttribute(ATTRIBUTE_NAME) + ":" + targetName);
            lineageProcess.setAttribute(ATTRIBUTE_QUALIFIED_NAME, hiveProcess.getAttribute(ATTRIBUTE_QUALIFIED_NAME) + ":" + targetName);
            lineageProcess.setAttribute(ATTRIBUTE_INPUTS, getObjectIds(sources));
            lineageProcess.setAttribute(ATTRIBUTE_OUTPUTS, Collections.singletonList(getObjectId(target)));
            lineageProcess.setAttribute(ATTRIBUTE_QUERY, getObjectId(hiveProcess));
            lineageProcess.setAttribute(ATTRIBUTE_DEPENDENCY_TYPE, dependency.getType());
            lineageProcess.setAttribute(ATTRIBUTE_EXPRESSION, dependency.getExpr());
            entities.addEntity(lineageProcess);
        }
    }
}
Also used : AtlasEntity(org.apache.atlas.model.instance.AtlasEntity) ArrayList(java.util.ArrayList) DependencyKey(org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyKey) Dependency(org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency) BaseColumnInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo) LineageInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo) Map(java.util.Map)

Example 2 with BaseColumnInfo

use of org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo in project hive by apache.

the class PostExecutePrinter method run.

/**
 * Prints the post-execution hook report to the session console: the query string and command
 * type (when a query state is given), the input and output entities, and one
 * "POSTHOOK: Lineage: " line per lineage entry that has a non-null dependency.
 *
 * <p>Nothing is printed when the session has no console. Lineage entries are printed in the
 * order imposed by {@code DependencyKeyComp}.
 *
 * @param queryState state of the executed query; may be null, in which case the query/type lines are omitted
 * @param inputs     entities read by the query
 * @param outputs    entities written by the query
 * @param linfo      lineage information; may be null, in which case no lineage lines are printed
 * @param ugi        not used by this method
 * @throws Exception declared by the signature; this body adds no checked throws of its own
 */
public void run(QueryState queryState, Set<ReadEntity> inputs, Set<WriteEntity> outputs, LineageInfo linfo, UserGroupInformation ugi) throws Exception {
    LogHelper out = SessionState.getConsole();
    if (out == null) {
        return;
    }
    if (queryState != null) {
        out.printInfo("POSTHOOK: query: " + queryState.getQueryString().trim(), false);
        out.printInfo("POSTHOOK: type: " + queryState.getCommandType(), false);
    }
    PreExecutePrinter.printEntities(out, inputs, "POSTHOOK: Input: ");
    PreExecutePrinter.printEntities(out, outputs, "POSTHOOK: Output: ");

    // The generic lineage information is optional.
    if (linfo == null) {
        return;
    }
    LinkedList<Map.Entry<DependencyKey, Dependency>> sortedEntries = new LinkedList<Map.Entry<DependencyKey, Dependency>>(linfo.entrySet());
    Collections.sort(sortedEntries, new DependencyKeyComp());
    for (Map.Entry<DependencyKey, Dependency> lineageEntry : sortedEntries) {
        Dependency dependency = lineageEntry.getValue();
        if (dependency == null) {
            continue;
        }
        DependencyKey key = lineageEntry.getKey();
        StringBuilder line = new StringBuilder("POSTHOOK: Lineage: ");

        // Target: either "table PARTITION(k1=v1,k2=v2)" or just the table name.
        if (key.getDataContainer().isPartition()) {
            Partition partition = key.getDataContainer().getPartition();
            line.append(partition.getTableName()).append(" PARTITION(");
            int keyIndex = 0;
            for (FieldSchema partitionKey : key.getDataContainer().getTable().getPartitionKeys()) {
                if (keyIndex > 0) {
                    line.append(",");
                }
                // Partition values are matched to partition keys by position.
                line.append(partitionKey.getName()).append("=").append(partition.getValues().get(keyIndex));
                keyIndex++;
            }
            line.append(")");
        } else {
            line.append(key.getDataContainer().getTable().getTableName());
        }

        line.append(".").append(key.getFieldSchema().getName()).append(" ").append(dependency.getType()).append(" ");

        // Every base column is followed by ", ", including the last one.
        line.append("[");
        for (BaseColumnInfo baseCol : dependency.getBaseCols()) {
            line.append("(").append(baseCol.getTabAlias().getTable().getTableName()).append(")")
                .append(baseCol.getTabAlias().getAlias()).append(".").append(baseCol.getColumn()).append(", ");
        }
        line.append("]");
        out.printInfo(line.toString(), false);
    }
}
Also used : Partition(org.apache.hadoop.hive.metastore.api.Partition) LogHelper(org.apache.hadoop.hive.ql.session.SessionState.LogHelper) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) DependencyKey(org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyKey) Dependency(org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency) LinkedList(java.util.LinkedList) BaseColumnInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo) Map(java.util.Map)

Example 3 with BaseColumnInfo

use of org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo in project hive by apache.

the class LineageLogger method createSourceVertices.

/**
 * Turns a collection of base columns into an insertion-ordered set of vertices.
 * Each vertex is obtained through {@code getOrCreateVertex} with the supplied cache.
 *
 * <p>Columns whose table is temporary are skipped. A base column without a field schema
 * yields a TABLE vertex labelled with the qualified table name; otherwise a COLUMN vertex
 * labelled {@code qualifiedTable.columnName} is produced.
 *
 * @param vertexCache cache handed to {@code getOrCreateVertex}
 * @param baseCols    base columns to convert; may be null or empty
 * @return the vertices for the given columns; empty when {@code baseCols} is null or empty
 */
private static Set<Vertex> createSourceVertices(Map<String, Vertex> vertexCache, Collection<BaseColumnInfo> baseCols) {
    Set<Vertex> vertices = new LinkedHashSet<Vertex>();
    if (baseCols == null || baseCols.isEmpty()) {
        return vertices;
    }
    for (BaseColumnInfo baseCol : baseCols) {
        Table sourceTable = baseCol.getTabAlias().getTable();
        if (!sourceTable.isTemporary()) {
            String qualifiedTable = Warehouse.getQualifiedName(sourceTable);
            FieldSchema column = baseCol.getColumn();
            // Without a field schema the dependency is on the table as a whole.
            boolean tableLevel = (column == null);
            Vertex.Type vertexType = tableLevel ? Vertex.Type.TABLE : Vertex.Type.COLUMN;
            String vertexLabel = tableLevel ? qualifiedTable : qualifiedTable + "." + column.getName();
            vertices.add(getOrCreateVertex(vertexCache, vertexLabel, vertexType));
        }
    }
    return vertices;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) Table(org.apache.hadoop.hive.metastore.api.Table) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) BaseColumnInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo)

Example 4 with BaseColumnInfo

use of org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo in project hive by apache.

the class ExprProcFactory method getExprString.

/**
 * Get the expression string of an expression node.
 *
 * <p>Column references are rendered as {@code tabAlias.alias} when a usable table alias is
 * known (non-empty and not starting with "_" or "$"). Otherwise the dependency's expression
 * string is returned if it has one, falling back to the column alias. Generic function calls
 * are rendered by recursively rendering their children and passing the results to the UDF's
 * {@code getDisplayString}. Any other node returns its own {@code getExprString()}.
 *
 * <p>When {@code cond} is non-null, base columns found while rendering a column reference are
 * added to {@code cond.getBaseCols()}. A dependency whose base-column set is {@code null}
 * contributes nothing instead of failing inside {@code addAll}.
 *
 * @param rs    row schema used to look up the alias and table alias for a column's internal name
 * @param expr  the expression node to render
 * @param lctx  lineage context supplying the dependency index
 * @param inpOp operator whose dependency is looked up and whose schema is the fallback alias source
 * @param cond  predicate collecting base columns; may be null
 * @return the display string for {@code expr}
 */
public static String getExprString(RowSchema rs, ExprNodeDesc expr, LineageCtx lctx, Operator<? extends OperatorDesc> inpOp, Predicate cond) {
    if (expr instanceof ExprNodeColumnDesc) {
        ExprNodeColumnDesc col = (ExprNodeColumnDesc) expr;
        String internalName = col.getColumn();
        String alias = internalName;
        String tabAlias = col.getTabAlias();
        ColumnInfo ci = rs.getColumnInfo(internalName);
        if (ci != null) {
            if (ci.getAlias() != null) {
                alias = ci.getAlias();
            }
            if (ci.getTabAlias() != null) {
                tabAlias = ci.getTabAlias();
            }
        }
        Dependency dep = lctx.getIndex().getDependency(inpOp, internalName);
        // Read the base columns once; both the dependency and its base-column set may be absent,
        // and every use below must tolerate that.
        Set<BaseColumnInfo> depBaseCols = (dep == null) ? null : dep.getBaseCols();
        if ((tabAlias == null || tabAlias.startsWith("_") || tabAlias.startsWith("$")) && (dep != null && dep.getType() == DependencyType.SIMPLE)) {
            // No usable table alias: for a SIMPLE dependency take names from its first base column.
            if (depBaseCols != null && !depBaseCols.isEmpty()) {
                BaseColumnInfo baseCol = depBaseCols.iterator().next();
                tabAlias = baseCol.getTabAlias().getAlias();
                alias = baseCol.getColumn().getName();
            }
        }
        if (tabAlias != null && tabAlias.length() > 0 && !tabAlias.startsWith("_") && !tabAlias.startsWith("$")) {
            // findSourceColumn records the column on cond itself when it finds a match;
            // only fall back to the dependency's base columns when it does not.
            if (cond != null && !findSourceColumn(lctx, cond, tabAlias, alias) && depBaseCols != null) {
                cond.getBaseCols().addAll(depBaseCols);
            }
            return tabAlias + "." + alias;
        }
        if (dep != null) {
            if (cond != null && depBaseCols != null) {
                cond.getBaseCols().addAll(depBaseCols);
            }
            if (dep.getExpr() != null) {
                return dep.getExpr();
            }
        }
        if (alias.startsWith("_")) {
            // Alias still starts with "_": try the input operator's own schema.
            ci = inpOp.getSchema().getColumnInfo(internalName);
            if (ci != null && ci.getAlias() != null) {
                alias = ci.getAlias();
            }
        }
        return alias;
    } else if (expr instanceof ExprNodeGenericFuncDesc) {
        ExprNodeGenericFuncDesc func = (ExprNodeGenericFuncDesc) expr;
        List<ExprNodeDesc> children = func.getChildren();
        String[] childrenExprStrings = new String[children.size()];
        for (int i = 0; i < childrenExprStrings.length; i++) {
            childrenExprStrings[i] = getExprString(rs, children.get(i), lctx, inpOp, cond);
        }
        return func.getGenericUDF().getDisplayString(childrenExprStrings);
    }
    return expr.getExprString();
}
Also used : ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) BaseColumnInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) List(java.util.List) Dependency(org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency)

Example 5 with BaseColumnInfo

use of org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo in project hive by apache.

the class ExprProcFactory method findSourceColumn.

/**
 * Searches the top-level table scans for a table matching {@code tabAlias} (by table name or
 * by scan alias) that has a column named {@code alias}. On the first match, a new base-column
 * entry for that column is added to {@code cond.getBaseCols()}.
 *
 * @param lctx     lineage context whose parse context supplies the top operators
 * @param cond     predicate that receives the matching base column
 * @param tabAlias table name or table alias to look for
 * @param alias    column name to look for
 * @return {@code true} if a matching column was found and recorded, {@code false} otherwise
 */
private static boolean findSourceColumn(LineageCtx lctx, Predicate cond, String tabAlias, String alias) {
    for (TableScanOperator scan : lctx.getParseCtx().getTopOps().values()) {
        Table scannedTable = scan.getConf().getTableMetadata();
        boolean tableMatches = scannedTable.getTableName().equals(tabAlias) || tabAlias.equals(scan.getConf().getAlias());
        if (!tableMatches) {
            continue;
        }
        for (FieldSchema field : scannedTable.getCols()) {
            if (!field.getName().equals(alias)) {
                continue;
            }
            // First matching column wins: record it on the predicate and stop searching.
            TableAliasInfo aliasInfo = new TableAliasInfo();
            aliasInfo.setTable(scannedTable.getTTable());
            aliasInfo.setAlias(tabAlias);
            BaseColumnInfo sourceColumn = new BaseColumnInfo();
            sourceColumn.setColumn(field);
            sourceColumn.setTabAlias(aliasInfo);
            cond.getBaseCols().add(sourceColumn);
            return true;
        }
    }
    return false;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) TableAliasInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo.TableAliasInfo) BaseColumnInfo(org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Aggregations

BaseColumnInfo (org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo)5 Map (java.util.Map)3 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)3 Dependency (org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency)3 DependencyKey (org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyKey)2 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 LinkedHashMap (java.util.LinkedHashMap)1 LinkedHashSet (java.util.LinkedHashSet)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 AtlasEntity (org.apache.atlas.model.instance.AtlasEntity)1 Partition (org.apache.hadoop.hive.metastore.api.Partition)1 Table (org.apache.hadoop.hive.metastore.api.Table)1 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)1 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)1 LineageInfo (org.apache.hadoop.hive.ql.hooks.LineageInfo)1 TableAliasInfo (org.apache.hadoop.hive.ql.hooks.LineageInfo.TableAliasInfo)1 Table (org.apache.hadoop.hive.ql.metadata.Table)1 ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc)1