Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project atlas by apache: class HiveHookIT, method testUpdateProcess.
@Test
public void testUpdateProcess() throws Exception {
    String tableName = createTable();
    String pFile1 = createTestDFSPath("somedfspath1");
    String query = "insert overwrite DIRECTORY '" + pFile1 + "' select id, name from " + tableName;

    runCommand(query);

    Set<ReadEntity> inputs = getInputs(tableName, Entity.Type.TABLE);
    Set<WriteEntity> outputs = getOutputs(pFile1, Entity.Type.DFS_DIR);

    outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);

    HiveEventContext hiveEventContext = constructEvent(query, HiveOperation.QUERY, inputs, outputs);
    AtlasEntity processEntity = validateProcess(hiveEventContext);

    validateHDFSPaths(processEntity, OUTPUTS, pFile1);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    validateInputTables(processEntity, inputs);

    // Rerun the same query with the same HDFS path
    runCommandWithDelay(query, 1000);
    assertTableIsRegistered(DEFAULT_DB, tableName);

    AtlasEntity process2Entity = validateProcess(hiveEventContext);

    validateHDFSPaths(process2Entity, OUTPUTS, pFile1);
    Assert.assertEquals(process2Entity.getGuid(), processEntity.getGuid());

    // Rerun the same query with a new HDFS path. This results in the same process, since the HDFS path is not part of the qualified name for QUERY operations
    String pFile2 = createTestDFSPath("somedfspath2");

    query = "insert overwrite DIRECTORY '" + pFile2 + "' select id, name from " + tableName;

    runCommandWithDelay(query, 1000);
    assertTableIsRegistered(DEFAULT_DB, tableName);

    Set<WriteEntity> p3Outputs = new LinkedHashSet<WriteEntity>() {
        {
            addAll(getOutputs(pFile2, Entity.Type.DFS_DIR));
            addAll(outputs);
        }
    };

    AtlasEntity process3Entity = validateProcess(constructEvent(query, HiveOperation.QUERY, inputs, p3Outputs));

    validateHDFSPaths(process3Entity, OUTPUTS, pFile2);
    Assert.assertEquals(process3Entity.getGuid(), processEntity.getGuid());
}
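The constructEvent(...) helper used in this and the following tests is not reproduced on this page. A minimal, hypothetical reconstruction is shown below; the setter names (setQueryStr, setOperation, setInputs, setOutputs) are assumptions about HiveEventContext, not copied from HiveHookIT, and the real helper likely populates more fields (user, query id, timestamps).

private HiveEventContext constructEvent(String query, HiveOperation op,
                                        Set<ReadEntity> inputs, Set<WriteEntity> outputs) {
    HiveEventContext event = new HiveEventContext();

    // Assumed setters: carry the query text, operation type, and the Hive hook's
    // read/write entity sets into the event that the Atlas hook would process.
    event.setQueryStr(query);
    event.setOperation(op);
    event.setInputs(inputs);
    event.setOutputs(outputs);

    return event;
}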
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project atlas by apache: class HiveHookIT, method getInputs.
private Set<ReadEntity> getInputs(String inputName, Entity.Type entityType) throws HiveException {
    final ReadEntity entity = new ReadEntity();

    if (Entity.Type.DFS_DIR.equals(entityType)) {
        entity.setName(lower(new Path(inputName).toString()));
        entity.setTyp(Entity.Type.DFS_DIR);
    } else {
        entity.setName(getQualifiedTblName(inputName));
        entity.setTyp(entityType);
    }

    if (entityType == Entity.Type.TABLE) {
        entity.setT(hiveMetaStoreBridge.getHiveClient().getTable(DEFAULT_DB, inputName));
    }

    return new LinkedHashSet<ReadEntity>() {
        {
            add(entity);
        }
    };
}
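The getOutputs helper called throughout these tests is not shown on this page. A plausible sketch follows, assuming it simply mirrors getInputs but builds a WriteEntity (whose write type the tests then override via setWriteType); the actual body in HiveHookIT may differ.

private Set<WriteEntity> getOutputs(String outputName, Entity.Type entityType) throws HiveException {
    final WriteEntity entity = new WriteEntity();

    // Same naming convention as getInputs: lowercase path for directories,
    // qualified table name otherwise.
    if (Entity.Type.DFS_DIR.equals(entityType)) {
        entity.setName(lower(new Path(outputName).toString()));
        entity.setTyp(Entity.Type.DFS_DIR);
    } else {
        entity.setName(getQualifiedTblName(outputName));
        entity.setTyp(entityType);
    }

    if (entityType == Entity.Type.TABLE) {
        entity.setT(hiveMetaStoreBridge.getHiveClient().getTable(DEFAULT_DB, outputName));
    }

    return new LinkedHashSet<WriteEntity>() {
        {
            add(entity);
        }
    };
}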
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project atlas by apache: class HiveHookIT, method testColumnLevelLineage.
/*
 The test is disabled by default.
 Reason: Atlas uses Hive version 1.2.x, and the Hive patch HIVE-13112, which enables column-level lineage, is not
 committed to Hive version 1.2.x. This test will fail if the lineage information is not available from Hive.
 Once the patch for HIVE-13112 is committed to the Hive 1.2.x branch, the test can be enabled.
 Please track HIVE-14706 for the status of column-lineage availability in the latest Hive versions, i.e. 2.1.x.
*/
@Test(enabled = false)
public void testColumnLevelLineage() throws Exception {
    String sourceTable = "table" + random();

    runCommand("create table " + sourceTable + "(a int, b int)");

    String sourceTableGUID = assertTableIsRegistered(DEFAULT_DB, sourceTable);
    String a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "a"));
    String b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "b"));

    String ctasTableName = "table" + random();
    String query = "create table " + ctasTableName + " as " + "select sum(a+b) as a, count(*) as b from " + sourceTable;

    runCommand(query);

    String dest_a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "a"));
    String dest_b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "b"));

    Set<ReadEntity> inputs = getInputs(sourceTable, Entity.Type.TABLE);
    Set<WriteEntity> outputs = getOutputs(ctasTableName, Entity.Type.TABLE);
    HiveEventContext event = constructEvent(query, HiveOperation.CREATETABLE_AS_SELECT, inputs, outputs);

    assertProcessIsRegistered(event);
    assertTableIsRegistered(DEFAULT_DB, ctasTableName);

    String processQName = sortEventsAndGetProcessQualifiedName(event);

    // Column "a" of the CTAS table (sum(a+b)) should trace back to both source columns
    List<String> aLineageInputs = Arrays.asList(a_guid, b_guid);
    String aLineageProcessName = processQName + ":" + "a";

    LOG.debug("Searching for column lineage process {} ", aLineageProcessName);

    String guid = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), ATTRIBUTE_QUALIFIED_NAME, aLineageProcessName, null);

    AtlasEntity colLineageEntity = atlasClientV2.getEntityByGuid(guid).getEntity();
    List<AtlasObjectId> processInputs = toAtlasObjectIdList(colLineageEntity.getAttribute("inputs"));
    List<String> processInputsAsString = new ArrayList<>();

    for (AtlasObjectId input : processInputs) {
        processInputsAsString.add(input.getGuid());
    }

    Collections.sort(processInputsAsString);
    Collections.sort(aLineageInputs);

    Assert.assertEquals(processInputsAsString, aLineageInputs);

    // Column "b" (count(*)) should trace back to the source table itself
    List<String> bLineageInputs = Arrays.asList(sourceTableGUID);
    String bLineageProcessName = processQName + ":" + "b";

    LOG.debug("Searching for column lineage process {} ", bLineageProcessName);

    String guid1 = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), ATTRIBUTE_QUALIFIED_NAME, bLineageProcessName, null);

    AtlasEntity colLineageEntity1 = atlasClientV2.getEntityByGuid(guid1).getEntity();
    List<AtlasObjectId> bProcessInputs = toAtlasObjectIdList(colLineageEntity1.getAttribute("inputs"));
    List<String> bProcessInputsAsString = new ArrayList<>();

    for (AtlasObjectId input : bProcessInputs) {
        bProcessInputsAsString.add(input.getGuid());
    }

    Collections.sort(bProcessInputsAsString);
    Collections.sort(bLineageInputs);

    Assert.assertEquals(bProcessInputsAsString, bLineageInputs);

    // Test lineage API response
    AtlasLineageInfo atlasLineageInfoInput = atlasClientV2.getLineageInfo(dest_a_guid, AtlasLineageInfo.LineageDirection.INPUT, 0);
    Map<String, AtlasEntityHeader> entityMap = atlasLineageInfoInput.getGuidEntityMap();

    ObjectNode response = atlasClient.getInputGraphForEntity(dest_a_guid);
    JsonNode vertices = response.get("values").get("vertices");
    JsonNode dest_a_val = vertices.get(dest_a_guid);
    JsonNode src_a_val = vertices.get(a_guid);
    JsonNode src_b_val = vertices.get(b_guid);

    Assert.assertNotNull(dest_a_val);
    Assert.assertNotNull(src_a_val);
    Assert.assertNotNull(src_b_val);

    ObjectNode b_response = atlasClient.getInputGraphForEntity(dest_b_guid);
    JsonNode b_vertices = b_response.get("values").get("vertices");
    JsonNode b_val = b_vertices.get(dest_b_guid);
    JsonNode src_tbl_val = b_vertices.get(sourceTableGUID);

    Assert.assertNotNull(b_val);
    Assert.assertNotNull(src_tbl_val);
}
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project atlas by apache: class HiveHookIT, method testInsertIntoTable.
@Test
public void testInsertIntoTable() throws Exception {
    String inputTable1Name = createTable();
    String inputTable2Name = createTable();
    String insertTableName = createTable();

    assertTableIsRegistered(DEFAULT_DB, inputTable1Name);
    assertTableIsRegistered(DEFAULT_DB, insertTableName);

    String query = "insert into " + insertTableName + " select t1.id, t1.name from " + inputTable2Name + " as t2, " + inputTable1Name + " as t1 where t1.id=t2.id";

    runCommand(query);

    Set<ReadEntity> inputs = getInputs(inputTable1Name, Entity.Type.TABLE);

    inputs.addAll(getInputs(inputTable2Name, Entity.Type.TABLE));

    Set<WriteEntity> outputs = getOutputs(insertTableName, Entity.Type.TABLE);

    (outputs.iterator().next()).setWriteType(WriteEntity.WriteType.INSERT);

    HiveEventContext event = constructEvent(query, HiveOperation.QUERY, inputs, outputs);

    Set<ReadEntity> expectedInputs = new TreeSet<ReadEntity>(entityComparator) {
        {
            addAll(inputs);
        }
    };

    assertTableIsRegistered(DEFAULT_DB, insertTableName);

    AtlasEntity processEntity1 = validateProcess(event, expectedInputs, outputs);

    // Test sorting of tbl names
    SortedSet<String> sortedTblNames = new TreeSet<>();

    sortedTblNames.add(inputTable1Name.toLowerCase());
    sortedTblNames.add(inputTable2Name.toLowerCase());

    // Verify sorted order of inputs in qualified name
    Assert.assertEquals(processEntity1.getAttribute(ATTRIBUTE_QUALIFIED_NAME),
            Joiner.on(SEP).join("QUERY",
                    getQualifiedTblName(sortedTblNames.first()),
                    HiveMetaStoreBridge.getTableCreatedTime(hiveMetaStoreBridge.getHiveClient().getTable(DEFAULT_DB, sortedTblNames.first())),
                    getQualifiedTblName(sortedTblNames.last()),
                    HiveMetaStoreBridge.getTableCreatedTime(hiveMetaStoreBridge.getHiveClient().getTable(DEFAULT_DB, sortedTblNames.last())))
            + IO_SEP + SEP
            + Joiner.on(SEP).join(WriteEntity.WriteType.INSERT.name(),
                    getQualifiedTblName(insertTableName),
                    HiveMetaStoreBridge.getTableCreatedTime(hiveMetaStoreBridge.getHiveClient().getTable(DEFAULT_DB, insertTableName))));

    // Rerun same query. Should result in same process
    runCommandWithDelay(query, 1000);

    AtlasEntity processEntity2 = validateProcess(event, expectedInputs, outputs);

    Assert.assertEquals(processEntity1.getGuid(), processEntity2.getGuid());
}
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project hive by apache: class PlanUtils, method addPartitionInputs.
public static void addPartitionInputs(Collection<Partition> parts, Collection<ReadEntity> inputs,
    ReadEntity parentViewInfo, boolean isDirectRead) {
  // Store the inputs in a map, since we cannot look an entity up in "inputs" directly because it is
  // implemented as a Set. ReadEntity is used as both key and value so that the map has the same
  // equals/hashCode behavior while still letting us retrieve the stored instance.
  Map<ReadEntity, ReadEntity> readEntityMap = new LinkedHashMap<ReadEntity, ReadEntity>(inputs.size());
  for (ReadEntity input : inputs) {
    readEntityMap.put(input, input);
  }

  for (Partition part : parts) {
    // Don't add the partition or table created during the execution as the input source
    if (isValuesTempTable(part.getTable().getTableName())) {
      continue;
    }

    ReadEntity newInput = null;
    if (part.getTable().isPartitioned()) {
      newInput = new ReadEntity(part, parentViewInfo, isDirectRead);
    } else {
      newInput = new ReadEntity(part.getTable(), parentViewInfo, isDirectRead);
    }

    if (readEntityMap.containsKey(newInput)) {
      ReadEntity input = readEntityMap.get(newInput);
      if ((newInput.getParents() != null) && (!newInput.getParents().isEmpty())) {
        input.getParents().addAll(newInput.getParents());
        input.setDirect(input.isDirect() || newInput.isDirect());
      }
    } else {
      readEntityMap.put(newInput, newInput);
    }
  }

  // Add the new ReadEntity instances that were added to readEntityMap in PlanUtils.addInput
  if (inputs.size() != readEntityMap.size()) {
    inputs.addAll(readEntityMap.keySet());
  }
}
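The map-as-lookup trick in addPartitionInputs is easy to miss: java.util.Set has no get(), so the method keys a LinkedHashMap by the entity itself in order to retrieve the already-registered instance and merge state into it. The following self-contained sketch demonstrates the same idea with a stand-in Item class; the class and values are illustrative only, not from the Hive source.

import java.util.LinkedHashMap;
import java.util.Map;

public class SelfKeyedMapDemo {
    // Illustrative stand-in for ReadEntity: equality is by name only, but each
    // instance carries extra state ("direct") that may need to be merged.
    static final class Item {
        final String name;
        boolean direct;

        Item(String name, boolean direct) {
            this.name = name;
            this.direct = direct;
        }

        @Override public boolean equals(Object o) {
            return o instanceof Item && ((Item) o).name.equals(name);
        }

        @Override public int hashCode() {
            return name.hashCode();
        }
    }

    public static void main(String[] args) {
        // Key and value are the same object, so containsKey()/get() use Item's
        // equals()/hashCode() while still handing back the stored instance.
        Map<Item, Item> registered = new LinkedHashMap<>();

        Item first = new Item("tbl1", false);
        registered.put(first, first);

        Item duplicate = new Item("tbl1", true);
        if (registered.containsKey(duplicate)) {
            // Merge state into the instance already registered, mirroring how
            // addPartitionInputs ORs the "direct" flags of duplicate inputs.
            Item existing = registered.get(duplicate);
            existing.direct = existing.direct || duplicate.direct;
        } else {
            registered.put(duplicate, duplicate);
        }

        System.out.println(first.direct); // true: the merged flag landed on the original instance
    }
}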