
Example 96 with ReadEntity

Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project atlas by apache, in class HiveHookIT, method testUpdateProcess.

@Test
public void testUpdateProcess() throws Exception {
    String tableName = createTable();
    String pFile1 = createTestDFSPath("somedfspath1");
    String query = "insert overwrite DIRECTORY '" + pFile1 + "' select id, name from " + tableName;
    runCommand(query);
    Set<ReadEntity> inputs = getInputs(tableName, Entity.Type.TABLE);
    Set<WriteEntity> outputs = getOutputs(pFile1, Entity.Type.DFS_DIR);
    outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);
    HiveEventContext hiveEventContext = constructEvent(query, HiveOperation.QUERY, inputs, outputs);
    AtlasEntity processEntity = validateProcess(hiveEventContext);
    validateHDFSPaths(processEntity, OUTPUTS, pFile1);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    validateInputTables(processEntity, inputs);
    // Rerun same query with same HDFS path
    runCommandWithDelay(query, 1000);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    AtlasEntity process2Entity = validateProcess(hiveEventContext);
    validateHDFSPaths(process2Entity, OUTPUTS, pFile1);
    Assert.assertEquals(process2Entity.getGuid(), processEntity.getGuid());
    // Rerun the same query with a new HDFS path. This results in the same process, since HDFS paths are not part of the qualified name for QUERY operations
    String pFile2 = createTestDFSPath("somedfspath2");
    query = "insert overwrite DIRECTORY '" + pFile2 + "' select id, name from " + tableName;
    runCommandWithDelay(query, 1000);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    Set<WriteEntity> p3Outputs = new LinkedHashSet<WriteEntity>() {

        {
            addAll(getOutputs(pFile2, Entity.Type.DFS_DIR));
            addAll(outputs);
        }
    };
    AtlasEntity process3Entity = validateProcess(constructEvent(query, HiveOperation.QUERY, inputs, p3Outputs));
    validateHDFSPaths(process3Entity, OUTPUTS, pFile2);
    Assert.assertEquals(process3Entity.getGuid(), processEntity.getGuid());
}
Also used: ReadEntity (org.apache.hadoop.hive.ql.hooks.ReadEntity), AtlasEntity (org.apache.atlas.model.instance.AtlasEntity), WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity), Test (org.testng.annotations.Test)

Example 97 with ReadEntity

Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project atlas by apache, in class HiveHookIT, method getInputs.

private Set<ReadEntity> getInputs(String inputName, Entity.Type entityType) throws HiveException {
    final ReadEntity entity = new ReadEntity();
    if (Entity.Type.DFS_DIR.equals(entityType)) {
        entity.setName(lower(new Path(inputName).toString()));
        entity.setTyp(Entity.Type.DFS_DIR);
    } else {
        entity.setName(getQualifiedTblName(inputName));
        entity.setTyp(entityType);
    }
    if (entityType == Entity.Type.TABLE) {
        entity.setT(hiveMetaStoreBridge.getHiveClient().getTable(DEFAULT_DB, inputName));
    }
    return new LinkedHashSet<ReadEntity>() {

        {
            add(entity);
        }
    };
}
Also used: ReadEntity (org.apache.hadoop.hive.ql.hooks.ReadEntity), Path (org.apache.hadoop.fs.Path)
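
The getInputs helper above is paired in these tests with a getOutputs helper that is not shown in this listing (it is called in Examples 96, 98, and 99). A minimal sketch of what such a helper could look like, assuming it simply mirrors getInputs with WriteEntity; the actual method in HiveHookIT may differ:

private Set<WriteEntity> getOutputs(String outputName, Entity.Type entityType) throws HiveException {
    // Hypothetical mirror of getInputs: build a single WriteEntity and wrap it in a LinkedHashSet.
    final WriteEntity entity = new WriteEntity();
    if (Entity.Type.DFS_DIR.equals(entityType)) {
        entity.setName(lower(new Path(outputName).toString()));
        entity.setTyp(Entity.Type.DFS_DIR);
    } else {
        entity.setName(getQualifiedTblName(outputName));
        entity.setTyp(entityType);
    }
    if (entityType == Entity.Type.TABLE) {
        entity.setT(hiveMetaStoreBridge.getHiveClient().getTable(DEFAULT_DB, outputName));
    }
    return new LinkedHashSet<WriteEntity>() {
        {
            add(entity);
        }
    };
}

Callers then adjust the write type explicitly, as in the tests above (for example, outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE)).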

Example 98 with ReadEntity

Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project atlas by apache, in class HiveHookIT, method testColumnLevelLineage.

/*
       This test is disabled by default.
       Reason: Atlas uses Hive version 1.2.x, and the Hive patch HIVE-13112, which enables column-level lineage, is
       not committed in Hive 1.2.x. This test will fail if the lineage information is not available from Hive.
       Once the patch for HIVE-13112 is committed to the Hive 1.2.x branch, the test can be enabled.
       Please track HIVE-14706 for the status of column lineage availability in later Hive versions, i.e. 2.1.x.
        */
@Test(enabled = false)
public void testColumnLevelLineage() throws Exception {
    String sourceTable = "table" + random();
    runCommand("create table " + sourceTable + "(a int, b int)");
    String sourceTableGUID = assertTableIsRegistered(DEFAULT_DB, sourceTable);
    String a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "a"));
    String b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "b"));
    String ctasTableName = "table" + random();
    String query = "create table " + ctasTableName + " as " + "select sum(a+b) as a, count(*) as b from " + sourceTable;
    runCommand(query);
    String dest_a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "a"));
    String dest_b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "b"));
    Set<ReadEntity> inputs = getInputs(sourceTable, Entity.Type.TABLE);
    Set<WriteEntity> outputs = getOutputs(ctasTableName, Entity.Type.TABLE);
    HiveEventContext event = constructEvent(query, HiveOperation.CREATETABLE_AS_SELECT, inputs, outputs);
    assertProcessIsRegistered(event);
    assertTableIsRegistered(DEFAULT_DB, ctasTableName);
    String processQName = sortEventsAndGetProcessQualifiedName(event);
    List<String> aLineageInputs = Arrays.asList(a_guid, b_guid);
    String aLineageProcessName = processQName + ":" + "a";
    LOG.debug("Searching for column lineage process {} ", aLineageProcessName);
    String guid = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), ATTRIBUTE_QUALIFIED_NAME, aLineageProcessName, null);
    AtlasEntity colLineageEntity = atlasClientV2.getEntityByGuid(guid).getEntity();
    List<AtlasObjectId> processInputs = toAtlasObjectIdList(colLineageEntity.getAttribute("inputs"));
    List<String> processInputsAsString = new ArrayList<>();
    for (AtlasObjectId input : processInputs) {
        processInputsAsString.add(input.getGuid());
    }
    Collections.sort(processInputsAsString);
    Collections.sort(aLineageInputs);
    Assert.assertEquals(processInputsAsString, aLineageInputs);
    List<String> bLineageInputs = Arrays.asList(sourceTableGUID);
    String bLineageProcessName = processQName + ":" + "b";
    LOG.debug("Searching for column lineage process {} ", bLineageProcessName);
    String guid1 = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), ATTRIBUTE_QUALIFIED_NAME, bLineageProcessName, null);
    AtlasEntity colLineageEntity1 = atlasClientV2.getEntityByGuid(guid1).getEntity();
    List<AtlasObjectId> bProcessInputs = toAtlasObjectIdList(colLineageEntity1.getAttribute("inputs"));
    List<String> bProcessInputsAsString = new ArrayList<>();
    for (AtlasObjectId input : bProcessInputs) {
        bProcessInputsAsString.add(input.getGuid());
    }
    Collections.sort(bProcessInputsAsString);
    Collections.sort(bLineageInputs);
    Assert.assertEquals(bProcessInputsAsString, bLineageInputs);
    // Test lineage API response
    AtlasLineageInfo atlasLineageInfoInput = atlasClientV2.getLineageInfo(dest_a_guid, AtlasLineageInfo.LineageDirection.INPUT, 0);
    Map<String, AtlasEntityHeader> entityMap = atlasLineageInfoInput.getGuidEntityMap();
    ObjectNode response = atlasClient.getInputGraphForEntity(dest_a_guid);
    JsonNode vertices = response.get("values").get("vertices");
    JsonNode dest_a_val = vertices.get(dest_a_guid);
    JsonNode src_a_val = vertices.get(a_guid);
    JsonNode src_b_val = vertices.get(b_guid);
    Assert.assertNotNull(dest_a_val);
    Assert.assertNotNull(src_a_val);
    Assert.assertNotNull(src_b_val);
    ObjectNode b_response = atlasClient.getInputGraphForEntity(dest_b_guid);
    JsonNode b_vertices = b_response.get("values").get("vertices");
    JsonNode b_val = b_vertices.get(dest_b_guid);
    JsonNode src_tbl_val = b_vertices.get(sourceTableGUID);
    Assert.assertNotNull(b_val);
    Assert.assertNotNull(src_tbl_val);
}
Also used: AtlasLineageInfo (org.apache.atlas.model.lineage.AtlasLineageInfo), ObjectNode (com.fasterxml.jackson.databind.node.ObjectNode), AtlasObjectId (org.apache.atlas.model.instance.AtlasObjectId), JsonNode (com.fasterxml.jackson.databind.JsonNode), ReadEntity (org.apache.hadoop.hive.ql.hooks.ReadEntity), AtlasEntity (org.apache.atlas.model.instance.AtlasEntity), AtlasEntityHeader (org.apache.atlas.model.instance.AtlasEntityHeader), WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity), Test (org.testng.annotations.Test)
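
The loop that extracts and sorts GUIDs from the column-lineage inputs appears twice in the test above (once for column "a", once for column "b"). A small helper, hypothetical and not part of the original test, could factor it out:

private static List<String> toSortedGuids(List<AtlasObjectId> objectIds) {
    // Collect the GUID of each referenced entity and sort for a stable comparison.
    List<String> guids = new ArrayList<>();
    for (AtlasObjectId objectId : objectIds) {
        guids.add(objectId.getGuid());
    }
    Collections.sort(guids);
    return guids;
}

With such a helper, each assertion reduces to comparing toSortedGuids(processInputs) against the sorted list of expected GUIDs.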

Example 99 with ReadEntity

Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project atlas by apache, in class HiveHookIT, method testInsertIntoTable.

@Test
public void testInsertIntoTable() throws Exception {
    String inputTable1Name = createTable();
    String inputTable2Name = createTable();
    String insertTableName = createTable();
    assertTableIsRegistered(DEFAULT_DB, inputTable1Name);
    assertTableIsRegistered(DEFAULT_DB, insertTableName);
    String query = "insert into " + insertTableName + " select t1.id, t1.name from " + inputTable2Name + " as t2, " + inputTable1Name + " as t1 where t1.id=t2.id";
    runCommand(query);
    Set<ReadEntity> inputs = getInputs(inputTable1Name, Entity.Type.TABLE);
    inputs.addAll(getInputs(inputTable2Name, Entity.Type.TABLE));
    Set<WriteEntity> outputs = getOutputs(insertTableName, Entity.Type.TABLE);
    outputs.iterator().next().setWriteType(WriteEntity.WriteType.INSERT);
    HiveEventContext event = constructEvent(query, HiveOperation.QUERY, inputs, outputs);
    Set<ReadEntity> expectedInputs = new TreeSet<ReadEntity>(entityComparator) {

        {
            addAll(inputs);
        }
    };
    assertTableIsRegistered(DEFAULT_DB, insertTableName);
    AtlasEntity processEntity1 = validateProcess(event, expectedInputs, outputs);
    // Test sorting of tbl names
    SortedSet<String> sortedTblNames = new TreeSet<>();
    sortedTblNames.add(inputTable1Name.toLowerCase());
    sortedTblNames.add(inputTable2Name.toLowerCase());
    // Verify sorted order of inputs in qualified name
    Assert.assertEquals(processEntity1.getAttribute(ATTRIBUTE_QUALIFIED_NAME), Joiner.on(SEP).join("QUERY", getQualifiedTblName(sortedTblNames.first()), HiveMetaStoreBridge.getTableCreatedTime(hiveMetaStoreBridge.getHiveClient().getTable(DEFAULT_DB, sortedTblNames.first())), getQualifiedTblName(sortedTblNames.last()), HiveMetaStoreBridge.getTableCreatedTime(hiveMetaStoreBridge.getHiveClient().getTable(DEFAULT_DB, sortedTblNames.last()))) + IO_SEP + SEP + Joiner.on(SEP).join(WriteEntity.WriteType.INSERT.name(), getQualifiedTblName(insertTableName), HiveMetaStoreBridge.getTableCreatedTime(hiveMetaStoreBridge.getHiveClient().getTable(DEFAULT_DB, insertTableName))));
    // Rerun same query. Should result in same process
    runCommandWithDelay(query, 1000);
    AtlasEntity processEntity2 = validateProcess(event, expectedInputs, outputs);
    Assert.assertEquals(processEntity1.getGuid(), processEntity2.getGuid());
}
Also used: ReadEntity (org.apache.hadoop.hive.ql.hooks.ReadEntity), AtlasEntity (org.apache.atlas.model.instance.AtlasEntity), WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity), Test (org.testng.annotations.Test)
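
testInsertIntoTable builds its expected inputs with a TreeSet ordered by an entityComparator field that is not shown in this listing. A minimal sketch, assuming the comparator simply orders entities by name; the real field in HiveHookIT may be defined differently:

private static final Comparator<Entity> entityComparator = new Comparator<Entity>() {
    // Order entities by name (case-insensitive) so TreeSet iteration is deterministic; null names sort first.
    @Override
    public int compare(Entity o1, Entity o2) {
        String name1 = o1.getName();
        String name2 = o2.getName();
        if (name1 == null) {
            return name2 == null ? 0 : -1;
        }
        if (name2 == null) {
            return 1;
        }
        return name1.toLowerCase().compareTo(name2.toLowerCase());
    }
};

A deterministic ordering of inputs matters here because the process qualified name asserted at the end of the test concatenates the input tables in sorted order.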

Example 100 with ReadEntity

Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project hive by apache, in class PlanUtils, method addPartitionInputs.

public static void addPartitionInputs(Collection<Partition> parts, Collection<ReadEntity> inputs, ReadEntity parentViewInfo, boolean isDirectRead) {
    // Store the inputs in a map, since we can't retrieve an existing ReadEntity from 'inputs' directly
    // (it is implemented as a Set). ReadEntity is used as both key and value, so lookups follow
    // ReadEntity's equals and hashCode and return the stored instance.
    Map<ReadEntity, ReadEntity> readEntityMap = new LinkedHashMap<ReadEntity, ReadEntity>(inputs.size());
    for (ReadEntity input : inputs) {
        readEntityMap.put(input, input);
    }
    for (Partition part : parts) {
        // Don't add the partition or table created during the execution as the input source
        if (isValuesTempTable(part.getTable().getTableName())) {
            continue;
        }
        ReadEntity newInput = null;
        if (part.getTable().isPartitioned()) {
            newInput = new ReadEntity(part, parentViewInfo, isDirectRead);
        } else {
            newInput = new ReadEntity(part.getTable(), parentViewInfo, isDirectRead);
        }
        if (readEntityMap.containsKey(newInput)) {
            ReadEntity input = readEntityMap.get(newInput);
            if ((newInput.getParents() != null) && (!newInput.getParents().isEmpty())) {
                input.getParents().addAll(newInput.getParents());
                input.setDirect(input.isDirect() || newInput.isDirect());
            }
        } else {
            readEntityMap.put(newInput, newInput);
        }
    }
    // Add the new ReadEntity instances that were added to readEntityMap above
    if (inputs.size() != readEntityMap.size()) {
        inputs.addAll(readEntityMap.keySet());
    }
}
Also used: ReadEntity (org.apache.hadoop.hive.ql.hooks.ReadEntity), Partition (org.apache.hadoop.hive.ql.metadata.Partition), LinkedHashMap (java.util.LinkedHashMap)
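
The map-keyed-by-itself pattern in addPartitionInputs is what lets the method merge parent and direct-read information into the ReadEntity already present in the plan's inputs instead of adding a duplicate: a Set can only tell you that an equal element exists, while the map hands back the canonical instance. A standalone sketch of the same pattern, illustrative only and using plain strings instead of ReadEntity:

import java.util.LinkedHashMap;
import java.util.Map;

public class CanonicalLookupDemo {
    public static void main(String[] args) {
        // Key the map by the element itself so we can retrieve and mutate the stored (canonical) copy.
        Map<String, String> canonical = new LinkedHashMap<>();
        String first = new String("part=2020-01-01");
        canonical.put(first, first);

        String duplicate = new String("part=2020-01-01");
        if (canonical.containsKey(duplicate)) {
            // Equal by equals/hashCode, but we get back the instance that was stored first.
            System.out.println(canonical.get(duplicate) == first); // prints true
        } else {
            canonical.put(duplicate, duplicate);
        }
    }
}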

Aggregations

ReadEntity (org.apache.hadoop.hive.ql.hooks.ReadEntity): 139 usages
WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity): 70 usages
Table (org.apache.hadoop.hive.ql.metadata.Table): 69 usages
DDLWork (org.apache.hadoop.hive.ql.ddl.DDLWork): 31 usages
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 29 usages
ArrayList (java.util.ArrayList): 27 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 27 usages
DDLWork (org.apache.hadoop.hive.ql.plan.DDLWork): 24 usages
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 22 usages
HashMap (java.util.HashMap): 16 usages
Test (org.testng.annotations.Test): 16 usages
Map (java.util.Map): 13 usages
LinkedHashMap (java.util.LinkedHashMap): 12 usages
Path (org.apache.hadoop.fs.Path): 12 usages
List (java.util.List): 11 usages
Database (org.apache.hadoop.hive.metastore.api.Database): 11 usages
AtlasEntity (org.apache.atlas.model.instance.AtlasEntity): 10 usages
Referenceable (org.apache.atlas.typesystem.Referenceable): 10 usages
HashSet (java.util.HashSet): 9 usages
FileNotFoundException (java.io.FileNotFoundException): 7 usages