Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
The class HiveHookIT, method getPartitionInput.
private ReadEntity getPartitionInput() {
    ReadEntity partEntity = new ReadEntity();
    partEntity.setName(PART_FILE);
    partEntity.setTyp(Entity.Type.PARTITION);
    return partEntity;
}
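A minimal usage sketch, not part of the original test: the single-element set built from this helper is meant to be passed to the constructEvent helper used by the other HiveHookIT tests, assuming PART_FILE names an existing partition.

    // Sketch only: constructEvent is the helper defined elsewhere in HiveHookIT;
    // query and outputs are assumed to be prepared by the scenario under test.
    Set<ReadEntity> partitionInputs = new LinkedHashSet<>();
    partitionInputs.add(getPartitionInput());
    HiveHook.HiveEventContext event = constructEvent(query, HiveOperation.QUERY, partitionInputs, outputs);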
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
The class HiveHookIT, method getInputs.
private Set<ReadEntity> getInputs(String inputName, Entity.Type entityType) throws HiveException {
    final ReadEntity entity = new ReadEntity();
    if (Entity.Type.DFS_DIR.equals(entityType)) {
        entity.setName(lower(new Path(inputName).toString()));
        entity.setTyp(Entity.Type.DFS_DIR);
    } else {
        entity.setName(getQualifiedTblName(inputName));
        entity.setTyp(entityType);
    }
    if (entityType == Entity.Type.TABLE) {
        entity.setT(hiveMetaStoreBridge.hiveClient.getTable(DEFAULT_DB, inputName));
    }
    return new LinkedHashSet<ReadEntity>() {
        {
            add(entity);
        }
    };
}
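Illustrative calls, shown only as a sketch: the table and directory names below are hypothetical, and a TABLE input must already exist in DEFAULT_DB because the helper looks it up through hiveMetaStoreBridge.hiveClient.

    // Sketch only: "source_table" and "/tmp/source_dir" are made-up names.
    Set<ReadEntity> tableInputs = getInputs("source_table", Entity.Type.TABLE);    // qualified table name, Table object attached
    Set<ReadEntity> dirInputs = getInputs("/tmp/source_dir", Entity.Type.DFS_DIR); // lower-cased DFS path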
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
The class HiveHook, method getProcessQualifiedName.
@VisibleForTesting
static String getProcessQualifiedName(HiveMetaStoreBridge dgiBridge, HiveEventContext eventContext,
                                      final SortedSet<ReadEntity> sortedHiveInputs,
                                      final SortedSet<WriteEntity> sortedHiveOutputs,
                                      SortedMap<ReadEntity, Referenceable> hiveInputsMap,
                                      SortedMap<WriteEntity, Referenceable> hiveOutputsMap) throws HiveException {
    HiveOperation op = eventContext.getOperation();
    if (isCreateOp(eventContext)) {
        Entity entity = getEntityByType(sortedHiveOutputs, Type.TABLE);
        if (entity != null) {
            Table outTable = entity.getTable();
            // refresh table
            outTable = dgiBridge.hiveClient.getTable(outTable.getDbName(), outTable.getTableName());
            return HiveMetaStoreBridge.getTableProcessQualifiedName(dgiBridge.getClusterName(), outTable);
        }
    }
    StringBuilder buffer = new StringBuilder(op.getOperationName());
    boolean ignoreHDFSPathsinQFName = ignoreHDFSPathsinQFName(op, sortedHiveInputs, sortedHiveOutputs);
    if (ignoreHDFSPathsinQFName && LOG.isDebugEnabled()) {
        LOG.debug("Ignoring HDFS paths in qualifiedName for {} {} ", op, eventContext.getQueryStr());
    }
    addInputs(dgiBridge, op, sortedHiveInputs, buffer, hiveInputsMap, ignoreHDFSPathsinQFName);
    buffer.append(IO_SEP);
    addOutputs(dgiBridge, op, sortedHiveOutputs, buffer, hiveOutputsMap, ignoreHDFSPathsinQFName);
    LOG.info("Setting process qualified name to {}", buffer);
    return buffer.toString();
}
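A hedged sketch of how a test might call this method; the sorted sets and Referenceable maps are assumed to have been prepared the way HiveHookIT's sortEventsAndGetProcessQualifiedName helper prepares them before delegating here.

    // Sketch only: event, sortedInputs, sortedOutputs, inputMap and outputMap are
    // assumed to be built from the HiveEventContext under test.
    String processQName = HiveHook.getProcessQualifiedName(
            hiveMetaStoreBridge, event, sortedInputs, sortedOutputs, inputMap, outputMap);
    LOG.debug("process qualified name: {}", processQName);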
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
The class HiveHookIT, method testColumnLevelLineage.
/*
   This test is disabled by default.
   Reason: Atlas uses Hive 1.2.x, and the Hive patch HIVE-13112, which enables column-level lineage, is not
   committed to the 1.2.x branch.
   The test will fail if the lineage information is not available from Hive.
   Once the patch for HIVE-13112 is committed to the Hive 1.2.x branch, the test can be enabled.
   Please track HIVE-14706 for the status of column lineage availability in later Hive versions, i.e. 2.1.x.
*/
@Test(enabled = false)
public void testColumnLevelLineage() throws Exception {
    String sourceTable = "table" + random();
    runCommand("create table " + sourceTable + "(a int, b int)");
    String sourceTableGUID = assertTableIsRegistered(DEFAULT_DB, sourceTable);
    String a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "a"));
    String b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "b"));
    String ctasTableName = "table" + random();
    String query = "create table " + ctasTableName + " as " + "select sum(a+b) as a, count(*) as b from " + sourceTable;
    runCommand(query);
    String dest_a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "a"));
    String dest_b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "b"));
    final Set<ReadEntity> inputs = getInputs(sourceTable, Entity.Type.TABLE);
    final Set<WriteEntity> outputs = getOutputs(ctasTableName, Entity.Type.TABLE);
    HiveHook.HiveEventContext event = constructEvent(query, HiveOperation.CREATETABLE_AS_SELECT, inputs, outputs);
    assertProcessIsRegistered(event);
    assertTableIsRegistered(DEFAULT_DB, ctasTableName);
    String processQName = sortEventsAndGetProcessQualifiedName(event);
    List<String> aLineageInputs = Arrays.asList(a_guid, b_guid);
    String aLineageProcessName = processQName + ":" + "a";
    LOG.debug("Searching for column lineage process {} ", aLineageProcessName);
    String guid = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, aLineageProcessName, null);
    List<Id> processInputs = (List<Id>) atlasClient.getEntity(guid).get("inputs");
    List<String> processInputsAsString = new ArrayList<>();
    for (Id input : processInputs) {
        processInputsAsString.add(input._getId());
    }
    Collections.sort(processInputsAsString);
    Collections.sort(aLineageInputs);
    Assert.assertEquals(processInputsAsString, aLineageInputs);
    List<String> bLineageInputs = Arrays.asList(sourceTableGUID);
    String bLineageProcessName = processQName + ":" + "b";
    LOG.debug("Searching for column lineage process {} ", bLineageProcessName);
    String guid1 = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, bLineageProcessName, null);
    List<Id> bProcessInputs = (List<Id>) atlasClient.getEntity(guid1).get("inputs");
    List<String> bProcessInputsAsString = new ArrayList<>();
    for (Id input : bProcessInputs) {
        bProcessInputsAsString.add(input._getId());
    }
    Collections.sort(bProcessInputsAsString);
    Collections.sort(bLineageInputs);
    Assert.assertEquals(bProcessInputsAsString, bLineageInputs);
    // Test lineage API response
    JSONObject response = atlasClient.getInputGraphForEntity(dest_a_guid);
    JSONObject vertices = response.getJSONObject("values").getJSONObject("vertices");
    JSONObject dest_a_val = (JSONObject) vertices.get(dest_a_guid);
    JSONObject src_a_val = (JSONObject) vertices.get(a_guid);
    JSONObject src_b_val = (JSONObject) vertices.get(b_guid);
    Assert.assertNotNull(dest_a_val);
    Assert.assertNotNull(src_a_val);
    Assert.assertNotNull(src_b_val);
    JSONObject b_response = atlasClient.getInputGraphForEntity(dest_b_guid);
    JSONObject b_vertices = b_response.getJSONObject("values").getJSONObject("vertices");
    JSONObject b_val = (JSONObject) b_vertices.get(dest_b_guid);
    JSONObject src_tbl_val = (JSONObject) b_vertices.get(sourceTableGUID);
    Assert.assertNotNull(b_val);
    Assert.assertNotNull(src_tbl_val);
}
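The two lineage assertions above (for columns "a" and "b") differ only in the column name and the expected input GUIDs; a possible extraction into a helper, shown here purely as a sketch based on the code above and not part of the original test, would look like this:

    // Sketch only: a hypothetical helper consolidating the repeated assertion logic above.
    private void assertColumnLineage(String processQName, String column, List<String> expectedInputGuids) throws Exception {
        String lineageProcessName = processQName + ":" + column;
        String guid = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(),
                AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, lineageProcessName, null);
        List<Id> processInputs = (List<Id>) atlasClient.getEntity(guid).get("inputs");
        List<String> actualInputGuids = new ArrayList<>();
        for (Id input : processInputs) {
            actualInputGuids.add(input._getId());
        }
        Collections.sort(actualInputGuids);
        List<String> expected = new ArrayList<>(expectedInputGuids);
        Collections.sort(expected);
        Assert.assertEquals(actualInputGuids, expected);
    }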
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
The class HiveHookIT, method testUpdateProcess.
@Test
public void testUpdateProcess() throws Exception {
    String tableName = createTable();
    String pFile1 = createTestDFSPath("somedfspath1");
    String query = "insert overwrite DIRECTORY '" + pFile1 + "' select id, name from " + tableName;
    runCommand(query);
    Set<ReadEntity> inputs = getInputs(tableName, Entity.Type.TABLE);
    final Set<WriteEntity> outputs = getOutputs(pFile1, Entity.Type.DFS_DIR);
    outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);
    final HiveHook.HiveEventContext hiveEventContext = constructEvent(query, HiveOperation.QUERY, inputs, outputs);
    Referenceable processReference = validateProcess(hiveEventContext);
    validateHDFSPaths(processReference, OUTPUTS, pFile1);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    validateInputTables(processReference, inputs);
    // Rerun the same query with the same HDFS path
    runCommandWithDelay(query, 1000);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    Referenceable process2Reference = validateProcess(hiveEventContext);
    validateHDFSPaths(process2Reference, OUTPUTS, pFile1);
    Assert.assertEquals(process2Reference.getId()._getId(), processReference.getId()._getId());
    // Rerun the same query with a new HDFS path. This results in the same process, since HDFS paths are not part of the qualified name for QUERY operations
    final String pFile2 = createTestDFSPath("somedfspath2");
    query = "insert overwrite DIRECTORY '" + pFile2 + "' select id, name from " + tableName;
    runCommandWithDelay(query, 1000);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    Set<WriteEntity> p3Outputs = new LinkedHashSet<WriteEntity>() {
        {
            addAll(getOutputs(pFile2, Entity.Type.DFS_DIR));
            addAll(outputs);
        }
    };
    Referenceable process3Reference = validateProcess(constructEvent(query, HiveOperation.QUERY, inputs, p3Outputs));
    validateHDFSPaths(process3Reference, OUTPUTS, pFile2);
    Assert.assertEquals(process3Reference.getId()._getId(), processReference.getId()._getId());
}
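The p3Outputs set above is built with the double-brace (anonymous subclass) idiom; an equivalent, more conventional construction, shown only as a sketch, is:

    // Sketch only: equivalent to the double-brace initializer above, without the
    // anonymous LinkedHashSet subclass.
    Set<WriteEntity> p3Outputs = new LinkedHashSet<>();
    p3Outputs.addAll(getOutputs(pFile2, Entity.Type.DFS_DIR));
    p3Outputs.addAll(outputs);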