Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
The class HiveHookIT, method getPartitionInput.
private ReadEntity getPartitionInput() {
    ReadEntity partEntity = new ReadEntity();
    partEntity.setName(PART_FILE);
    partEntity.setTyp(Entity.Type.PARTITION);
    return partEntity;
}
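A minimal usage sketch, not part of the original test: the single-element set built from this helper is meant to be passed to the constructEvent helper used by the other HiveHookIT tests, assuming PART_FILE names an existing partition.

    // Sketch only: constructEvent is the helper defined elsewhere in HiveHookIT;
    // query and outputs are assumed to be prepared by the scenario under test.
    Set<ReadEntity> partitionInputs = new LinkedHashSet<>();
    partitionInputs.add(getPartitionInput());
    HiveHook.HiveEventContext event = constructEvent(query, HiveOperation.QUERY, partitionInputs, outputs);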
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
The class HiveHookIT, method getInputs.
private Set<ReadEntity> getInputs(String inputName, Entity.Type entityType) throws HiveException {
    final ReadEntity entity = new ReadEntity();
    if (Entity.Type.DFS_DIR.equals(entityType)) {
        entity.setName(lower(new Path(inputName).toString()));
        entity.setTyp(Entity.Type.DFS_DIR);
    } else {
        entity.setName(getQualifiedTblName(inputName));
        entity.setTyp(entityType);
    }
    if (entityType == Entity.Type.TABLE) {
        entity.setT(hiveMetaStoreBridge.hiveClient.getTable(DEFAULT_DB, inputName));
    }
    return new LinkedHashSet<ReadEntity>() {
        {
            add(entity);
        }
    };
}
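Illustrative calls, shown only as a sketch: the table and directory names below are hypothetical, and a TABLE input must already exist in DEFAULT_DB because the helper looks it up through hiveMetaStoreBridge.hiveClient.

    // Sketch only: "source_table" and "/tmp/source_dir" are made-up names.
    Set<ReadEntity> tableInputs = getInputs("source_table", Entity.Type.TABLE);    // qualified table name, Table object attached
    Set<ReadEntity> dirInputs = getInputs("/tmp/source_dir", Entity.Type.DFS_DIR); // lower-cased DFS path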
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
The class HiveHook, method getProcessQualifiedName.
@VisibleForTesting
static String getProcessQualifiedName(HiveMetaStoreBridge dgiBridge, HiveEventContext eventContext,
                                      final SortedSet<ReadEntity> sortedHiveInputs,
                                      final SortedSet<WriteEntity> sortedHiveOutputs,
                                      SortedMap<ReadEntity, Referenceable> hiveInputsMap,
                                      SortedMap<WriteEntity, Referenceable> hiveOutputsMap) throws HiveException {
    HiveOperation op = eventContext.getOperation();
    if (isCreateOp(eventContext)) {
        Entity entity = getEntityByType(sortedHiveOutputs, Type.TABLE);
        if (entity != null) {
            Table outTable = entity.getTable();
            // refresh table
            outTable = dgiBridge.hiveClient.getTable(outTable.getDbName(), outTable.getTableName());
            return HiveMetaStoreBridge.getTableProcessQualifiedName(dgiBridge.getClusterName(), outTable);
        }
    }
    StringBuilder buffer = new StringBuilder(op.getOperationName());
    boolean ignoreHDFSPathsinQFName = ignoreHDFSPathsinQFName(op, sortedHiveInputs, sortedHiveOutputs);
    if (ignoreHDFSPathsinQFName && LOG.isDebugEnabled()) {
        LOG.debug("Ignoring HDFS paths in qualifiedName for {} {} ", op, eventContext.getQueryStr());
    }
    addInputs(dgiBridge, op, sortedHiveInputs, buffer, hiveInputsMap, ignoreHDFSPathsinQFName);
    buffer.append(IO_SEP);
    addOutputs(dgiBridge, op, sortedHiveOutputs, buffer, hiveOutputsMap, ignoreHDFSPathsinQFName);
    LOG.info("Setting process qualified name to {}", buffer);
    return buffer.toString();
}
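A hedged sketch of how a test might call this method; the sorted sets and Referenceable maps are assumed to have been prepared the way HiveHookIT's sortEventsAndGetProcessQualifiedName helper prepares them before delegating here.

    // Sketch only: event, sortedInputs, sortedOutputs, inputMap and outputMap are
    // assumed to be built from the HiveEventContext under test.
    String processQName = HiveHook.getProcessQualifiedName(
            hiveMetaStoreBridge, event, sortedInputs, sortedOutputs, inputMap, outputMap);
    LOG.debug("process qualified name: {}", processQName);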
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
The class HiveHookIT, method testColumnLevelLineage.
/*
   This test is disabled by default.
   Reason: Atlas uses Hive 1.2.x, and the Hive patch HIVE-13112, which enables column-level lineage, is not
   committed to the 1.2.x branch.
   The test will fail if the lineage information is not available from Hive.
   Once the patch for HIVE-13112 is committed to the Hive 1.2.x branch, the test can be enabled.
   Please track HIVE-14706 for the status of column lineage availability in later Hive versions, i.e. 2.1.x.
*/
@Test(enabled = false)
public void testColumnLevelLineage() throws Exception {
    String sourceTable = "table" + random();
    runCommand("create table " + sourceTable + "(a int, b int)");
    String sourceTableGUID = assertTableIsRegistered(DEFAULT_DB, sourceTable);
    String a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "a"));
    String b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "b"));
    String ctasTableName = "table" + random();
    String query = "create table " + ctasTableName + " as " + "select sum(a+b) as a, count(*) as b from " + sourceTable;
    runCommand(query);
    String dest_a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "a"));
    String dest_b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "b"));
    final Set<ReadEntity> inputs = getInputs(sourceTable, Entity.Type.TABLE);
    final Set<WriteEntity> outputs = getOutputs(ctasTableName, Entity.Type.TABLE);
    HiveHook.HiveEventContext event = constructEvent(query, HiveOperation.CREATETABLE_AS_SELECT, inputs, outputs);
    assertProcessIsRegistered(event);
    assertTableIsRegistered(DEFAULT_DB, ctasTableName);
    String processQName = sortEventsAndGetProcessQualifiedName(event);
    List<String> aLineageInputs = Arrays.asList(a_guid, b_guid);
    String aLineageProcessName = processQName + ":" + "a";
    LOG.debug("Searching for column lineage process {} ", aLineageProcessName);
    String guid = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, aLineageProcessName, null);
    List<Id> processInputs = (List<Id>) atlasClient.getEntity(guid).get("inputs");
    List<String> processInputsAsString = new ArrayList<>();
    for (Id input : processInputs) {
        processInputsAsString.add(input._getId());
    }
    Collections.sort(processInputsAsString);
    Collections.sort(aLineageInputs);
    Assert.assertEquals(processInputsAsString, aLineageInputs);
    List<String> bLineageInputs = Arrays.asList(sourceTableGUID);
    String bLineageProcessName = processQName + ":" + "b";
    LOG.debug("Searching for column lineage process {} ", bLineageProcessName);
    String guid1 = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, bLineageProcessName, null);
    List<Id> bProcessInputs = (List<Id>) atlasClient.getEntity(guid1).get("inputs");
    List<String> bProcessInputsAsString = new ArrayList<>();
    for (Id input : bProcessInputs) {
        bProcessInputsAsString.add(input._getId());
    }
    Collections.sort(bProcessInputsAsString);
    Collections.sort(bLineageInputs);
    Assert.assertEquals(bProcessInputsAsString, bLineageInputs);
    // Test lineage API response
    JSONObject response = atlasClient.getInputGraphForEntity(dest_a_guid);
    JSONObject vertices = response.getJSONObject("values").getJSONObject("vertices");
    JSONObject dest_a_val = (JSONObject) vertices.get(dest_a_guid);
    JSONObject src_a_val = (JSONObject) vertices.get(a_guid);
    JSONObject src_b_val = (JSONObject) vertices.get(b_guid);
    Assert.assertNotNull(dest_a_val);
    Assert.assertNotNull(src_a_val);
    Assert.assertNotNull(src_b_val);
    JSONObject b_response = atlasClient.getInputGraphForEntity(dest_b_guid);
    JSONObject b_vertices = b_response.getJSONObject("values").getJSONObject("vertices");
    JSONObject b_val = (JSONObject) b_vertices.get(dest_b_guid);
    JSONObject src_tbl_val = (JSONObject) b_vertices.get(sourceTableGUID);
    Assert.assertNotNull(b_val);
    Assert.assertNotNull(src_tbl_val);
}
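The two lineage assertions above (for columns "a" and "b") differ only in the column name and the expected input GUIDs; a possible extraction into a helper, shown here purely as a sketch based on the code above and not part of the original test, would look like this:

    // Sketch only: a hypothetical helper consolidating the repeated assertion logic above.
    private void assertColumnLineage(String processQName, String column, List<String> expectedInputGuids) throws Exception {
        String lineageProcessName = processQName + ":" + column;
        String guid = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(),
                AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, lineageProcessName, null);
        List<Id> processInputs = (List<Id>) atlasClient.getEntity(guid).get("inputs");
        List<String> actualInputGuids = new ArrayList<>();
        for (Id input : processInputs) {
            actualInputGuids.add(input._getId());
        }
        Collections.sort(actualInputGuids);
        List<String> expected = new ArrayList<>(expectedInputGuids);
        Collections.sort(expected);
        Assert.assertEquals(actualInputGuids, expected);
    }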
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
The class HiveHookIT, method testUpdateProcess.
@Test
public void testUpdateProcess() throws Exception {
    String tableName = createTable();
    String pFile1 = createTestDFSPath("somedfspath1");
    String query = "insert overwrite DIRECTORY '" + pFile1 + "' select id, name from " + tableName;
    runCommand(query);
    Set<ReadEntity> inputs = getInputs(tableName, Entity.Type.TABLE);
    final Set<WriteEntity> outputs = getOutputs(pFile1, Entity.Type.DFS_DIR);
    outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);
    final HiveHook.HiveEventContext hiveEventContext = constructEvent(query, HiveOperation.QUERY, inputs, outputs);
    Referenceable processReference = validateProcess(hiveEventContext);
    validateHDFSPaths(processReference, OUTPUTS, pFile1);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    validateInputTables(processReference, inputs);
    // Rerun the same query with the same HDFS path
    runCommandWithDelay(query, 1000);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    Referenceable process2Reference = validateProcess(hiveEventContext);
    validateHDFSPaths(process2Reference, OUTPUTS, pFile1);
    Assert.assertEquals(process2Reference.getId()._getId(), processReference.getId()._getId());
    // Rerun the same query with a new HDFS path. This results in the same process, since HDFS paths are not part of the qualified name for QUERY operations
    final String pFile2 = createTestDFSPath("somedfspath2");
    query = "insert overwrite DIRECTORY '" + pFile2 + "' select id, name from " + tableName;
    runCommandWithDelay(query, 1000);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    Set<WriteEntity> p3Outputs = new LinkedHashSet<WriteEntity>() {
        {
            addAll(getOutputs(pFile2, Entity.Type.DFS_DIR));
            addAll(outputs);
        }
    };
    Referenceable process3Reference = validateProcess(constructEvent(query, HiveOperation.QUERY, inputs, p3Outputs));
    validateHDFSPaths(process3Reference, OUTPUTS, pFile2);
    Assert.assertEquals(process3Reference.getId()._getId(), processReference.getId()._getId());
}
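The p3Outputs set above is built with the double-brace (anonymous subclass) idiom; an equivalent, more conventional construction, shown only as a sketch, is:

    // Sketch only: equivalent to the double-brace initializer above, without the
    // anonymous LinkedHashSet subclass.
    Set<WriteEntity> p3Outputs = new LinkedHashSet<>();
    p3Outputs.addAll(getOutputs(pFile2, Entity.Type.DFS_DIR));
    p3Outputs.addAll(outputs);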