Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
Class HiveHook, method handleExternalTables:
private void handleExternalTables(final HiveMetaStoreBridge dgiBridge, final HiveEventContext event, final LinkedHashMap<Type, Referenceable> tables) throws HiveException, MalformedURLException {
    List<Referenceable> entities = new ArrayList<>();
    final WriteEntity hiveEntity = (WriteEntity) getEntityByType(event.getOutputs(), Type.TABLE);
    Table hiveTable = hiveEntity == null ? null : hiveEntity.getTable();
    // Refresh to get the correct location
    if (hiveTable != null) {
        hiveTable = dgiBridge.hiveClient.getTable(hiveTable.getDbName(), hiveTable.getTableName());
    }
    if (hiveTable != null && TableType.EXTERNAL_TABLE.equals(hiveTable.getTableType())) {
        LOG.info("Registering external table process {} ", event.getQueryStr());
        final String location = lower(hiveTable.getDataLocation().toString());
        final ReadEntity dfsEntity = new ReadEntity();
        dfsEntity.setTyp(Type.DFS_DIR);
        dfsEntity.setD(new Path(location));
        SortedMap<ReadEntity, Referenceable> hiveInputsMap = new TreeMap<ReadEntity, Referenceable>(entityComparator) {
            {
                put(dfsEntity, dgiBridge.fillHDFSDataSet(location));
            }
        };
        SortedMap<WriteEntity, Referenceable> hiveOutputsMap = new TreeMap<WriteEntity, Referenceable>(entityComparator) {
            {
                put(hiveEntity, tables.get(Type.TABLE));
            }
        };
        SortedSet<ReadEntity> sortedIps = new TreeSet<>(entityComparator);
        sortedIps.addAll(hiveInputsMap.keySet());
        SortedSet<WriteEntity> sortedOps = new TreeSet<>(entityComparator);
        sortedOps.addAll(hiveOutputsMap.keySet());
        Referenceable processReferenceable = getProcessReferenceable(dgiBridge, event, sortedIps, sortedOps, hiveInputsMap, hiveOutputsMap);
        entities.addAll(tables.values());
        entities.add(processReferenceable);
        event.addMessage(new HookNotification.EntityUpdateRequest(event.getUser(), entities));
    }
}
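The entityComparator field referenced above is defined elsewhere in HiveHook and is not part of this snippet. As a rough illustration only (the name-based ordering below is an assumption, not the verbatim field), a comparator over Hive entities that keeps the TreeMap/TreeSet instances in a stable order could look like:

// Hypothetical sketch, not the actual HiveHook field: orders Hive entities by
// name so the sorted maps and sets above iterate deterministically.
// Assumes: import java.util.Comparator; import org.apache.hadoop.hive.ql.hooks.Entity;
private static final Comparator<Entity> entityComparator = new Comparator<Entity>() {
    @Override
    public int compare(Entity o1, Entity o2) {
        String lhs = o1.getName() == null ? "" : o1.getName().toLowerCase();
        String rhs = o2.getName() == null ? "" : o2.getName().toLowerCase();
        return lhs.compareTo(rhs);
    }
};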
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
Class HiveHook, method registerProcess:
private void registerProcess(HiveMetaStoreBridge dgiBridge, HiveEventContext event) throws AtlasHookException {
    try {
        Set<ReadEntity> inputs = event.getInputs();
        Set<WriteEntity> outputs = event.getOutputs();
        // Even explain CTAS has operation name as CREATETABLE_AS_SELECT
        if (inputs.isEmpty() && outputs.isEmpty()) {
            LOG.info("Explain statement. Skipping...");
            return;
        }
        if (event.getQueryId() == null) {
            LOG.info("Query id/plan is missing for {}", event.getQueryStr());
        }
        final SortedMap<ReadEntity, Referenceable> source = new TreeMap<>(entityComparator);
        final SortedMap<WriteEntity, Referenceable> target = new TreeMap<>(entityComparator);
        final Set<String> dataSets = new HashSet<>();
        final Set<Referenceable> entities = new LinkedHashSet<>();
        boolean isSelectQuery = isSelectQuery(event);
        // filter out select queries which do not modify data
        if (!isSelectQuery) {
            SortedSet<ReadEntity> sortedHiveInputs = new TreeSet<>(entityComparator);
            if (event.getInputs() != null) {
                sortedHiveInputs.addAll(event.getInputs());
            }
            SortedSet<WriteEntity> sortedHiveOutputs = new TreeSet<>(entityComparator);
            if (event.getOutputs() != null) {
                sortedHiveOutputs.addAll(event.getOutputs());
            }
            for (ReadEntity readEntity : sortedHiveInputs) {
                processHiveEntity(dgiBridge, event, readEntity, dataSets, source, entities);
            }
            for (WriteEntity writeEntity : sortedHiveOutputs) {
                processHiveEntity(dgiBridge, event, writeEntity, dataSets, target, entities);
            }
            if (source.size() > 0 || target.size() > 0) {
                Referenceable processReferenceable = getProcessReferenceable(dgiBridge, event, sortedHiveInputs, sortedHiveOutputs, source, target);
                // setup Column Lineage
                List<Referenceable> sourceList = new ArrayList<>(source.values());
                List<Referenceable> targetList = new ArrayList<>(target.values());
                List<Referenceable> colLineageProcessInstances = new ArrayList<>();
                try {
                    Map<String, Referenceable> columnQNameToRef = ColumnLineageUtils.buildColumnReferenceableMap(sourceList, targetList);
                    colLineageProcessInstances = createColumnLineageProcessInstances(processReferenceable, event.lineageInfo, columnQNameToRef);
                } catch (Exception e) {
                    LOG.warn("Column lineage process setup failed with exception {}", e);
                }
                colLineageProcessInstances.add(0, processReferenceable);
                entities.addAll(colLineageProcessInstances);
                event.addMessage(new HookNotification.EntityUpdateRequest(event.getUser(), new ArrayList<>(entities)));
            } else {
                LOG.info("Skipped query {} since it has no getInputs() or resulting getOutputs()", event.getQueryStr());
            }
        } else {
            LOG.info("Skipped query {} for processing since it is a select query ", event.getQueryStr());
        }
    } catch (Exception e) {
        throw new AtlasHookException("HiveHook.registerProcess() failed.", e);
    }
}
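A point worth noting in registerProcess is that the raw input/output sets from Hive are copied into TreeSet/TreeMap instances keyed by entityComparator before any lineage message is built. The following stand-alone snippet (plain JDK code, not Atlas code) illustrates why: hash-based sets have no defined iteration order, while the sorted copies give a deterministic order for assembling the process entity and its qualified name.

// Self-contained illustration (not Atlas code) of the sorted-copy pattern used
// in registerProcess: sort the raw set once, then build a map in that order.
import java.util.*;

public class SortedEntitiesExample {
    public static void main(String[] args) {
        Set<String> rawInputs = new HashSet<>(Arrays.asList("db.table2", "db.table1"));

        SortedSet<String> sortedInputs = new TreeSet<>(Comparator.naturalOrder());
        sortedInputs.addAll(rawInputs);

        SortedMap<String, String> inputRefs = new TreeMap<>(Comparator.naturalOrder());
        for (String input : sortedInputs) {
            inputRefs.put(input, "ref:" + input); // stand-in for a Referenceable
        }

        System.out.println(sortedInputs);         // [db.table1, db.table2] - deterministic
        System.out.println(inputRefs.keySet());   // same order
    }
}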
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
Class HiveHookIT, method testDropAndRecreateCTASOutput:
@Test
public void testDropAndRecreateCTASOutput() throws Exception {
    String tableName = createTable();
    String ctasTableName = "table" + random();
    String query = "create table " + ctasTableName + " as select * from " + tableName;
    runCommand(query);
    assertTableIsRegistered(DEFAULT_DB, ctasTableName);
    Set<ReadEntity> inputs = getInputs(tableName, Entity.Type.TABLE);
    Set<WriteEntity> outputs = getOutputs(ctasTableName, Entity.Type.TABLE);
    final HiveHook.HiveEventContext hiveEventContext = constructEvent(query, HiveOperation.CREATETABLE_AS_SELECT, inputs, outputs);
    String processId = assertProcessIsRegistered(hiveEventContext);
    final String drpquery = String.format("drop table %s ", ctasTableName);
    runCommandWithDelay(drpquery, 100);
    assertTableIsNotRegistered(DEFAULT_DB, ctasTableName);
    runCommand(query);
    assertTableIsRegistered(DEFAULT_DB, ctasTableName);
    outputs = getOutputs(ctasTableName, Entity.Type.TABLE);
    String process2Id = assertProcessIsRegistered(hiveEventContext, inputs, outputs);
    assertNotEquals(process2Id, processId);
    Referenceable processRef = atlasClient.getEntity(processId);
    validateOutputTables(processRef, outputs);
}
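The getInputs and getOutputs helpers used by this and the following tests are defined elsewhere in HiveHookIT. A minimal sketch of the idea is shown below; it is only an approximation, since the real helper also qualifies table names and attaches the Hive Table object for TABLE entities.

// Hypothetical sketch of a getInputs-style helper: wraps a single ReadEntity of
// the given type and name in a set. Simplified relative to the real HiveHookIT code.
private Set<ReadEntity> getInputs(String name, Entity.Type entityType) {
    ReadEntity entity = new ReadEntity();
    entity.setName(name);
    entity.setTyp(entityType);
    Set<ReadEntity> inputs = new LinkedHashSet<>();
    inputs.add(entity);
    return inputs;
}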
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
Class HiveHookIT, method testExportImportPartitionedTable:
@Test
public void testExportImportPartitionedTable() throws Exception {
    boolean isPartitionedTable = true;
    final String tableName = createTable(isPartitionedTable);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    // Add a partition
    String partFile = "pfile://" + mkdir("partition");
    String query = "alter table " + tableName + " add partition (dt='" + PART_FILE + "') location '" + partFile + "'";
    runCommand(query);
    String filename = "pfile://" + mkdir("export");
    query = "export table " + tableName + " to \"" + filename + "\"";
    runCommand(query);
    final Set<ReadEntity> expectedExportInputs = getInputs(tableName, Entity.Type.TABLE);
    final Set<WriteEntity> outputs = getOutputs(filename, Entity.Type.DFS_DIR);
    // Note that export has only partition as input in this case
    final Set<ReadEntity> partitionIps = getInputs(DEFAULT_DB + "@" + tableName + "@dt=" + PART_FILE, Entity.Type.PARTITION);
    partitionIps.addAll(expectedExportInputs);
    Referenceable processReference = validateProcess(constructEvent(query, HiveOperation.EXPORT, partitionIps, outputs), expectedExportInputs, outputs);
    validateHDFSPaths(processReference, OUTPUTS, filename);
    // Import
    String importTableName = createTable(true);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    query = "import table " + importTableName + " from '" + filename + "'";
    runCommand(query);
    final Set<ReadEntity> expectedImportInputs = getInputs(filename, Entity.Type.DFS_DIR);
    final Set<WriteEntity> importOutputs = getOutputs(importTableName, Entity.Type.TABLE);
    final Set<WriteEntity> partitionOps = getOutputs(DEFAULT_DB + "@" + importTableName + "@dt=" + PART_FILE, Entity.Type.PARTITION);
    partitionOps.addAll(importOutputs);
    validateProcess(constructEvent(query, HiveOperation.IMPORT, expectedImportInputs, partitionOps), expectedImportInputs, importOutputs);
    // Export should update same process
    filename = "pfile://" + mkdir("export2");
    query = "export table " + tableName + " to \"" + filename + "\"";
    runCommand(query);
    final Set<WriteEntity> outputs2 = getOutputs(filename, Entity.Type.DFS_DIR);
    Set<WriteEntity> p3Outputs = new LinkedHashSet<WriteEntity>() {
        {
            addAll(outputs2);
            addAll(outputs);
        }
    };
    validateProcess(constructEvent(query, HiveOperation.EXPORT, partitionIps, outputs2), expectedExportInputs, p3Outputs);
    query = "alter table " + importTableName + " drop partition (dt='" + PART_FILE + "')";
    runCommand(query);
    // Import should update same process
    query = "import table " + importTableName + " from '" + filename + "'";
    runCommandWithDelay(query, 1000);
    final Set<ReadEntity> importInputs = getInputs(filename, Entity.Type.DFS_DIR);
    final Set<ReadEntity> expectedImport2Inputs = new LinkedHashSet<ReadEntity>() {
        {
            addAll(importInputs);
            addAll(expectedImportInputs);
        }
    };
    validateProcess(constructEvent(query, HiveOperation.IMPORT, importInputs, partitionOps), expectedImport2Inputs, importOutputs);
}
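constructEvent, used throughout these tests, builds the HiveEventContext that the hook would normally populate at runtime. A plausible sketch is shown below; the setter names are assumptions inferred from the getters used in HiveHook (getQueryStr, getInputs, getOutputs), not a verbatim copy of the HiveHookIT helper.

// Hypothetical sketch of a constructEvent-style helper. Setter names are
// assumptions; the real helper may also set the user, query id, and hook type.
private HiveHook.HiveEventContext constructEvent(String query, HiveOperation op,
        Set<ReadEntity> inputs, Set<WriteEntity> outputs) {
    HiveHook.HiveEventContext event = new HiveHook.HiveEventContext();
    event.setQueryStr(query);
    event.setOperation(op);
    event.setInputs(inputs);
    event.setOutputs(outputs);
    return event;
}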
Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.
Class HiveHookIT, method testLoadDFSPathPartitioned:
@Test
public void testLoadDFSPathPartitioned() throws Exception {
    String tableName = createTable(true, true, false);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    final String loadFile = createTestDFSFile("loadDFSFile");
    String query = "load data inpath '" + loadFile + "' into table " + tableName + " partition(dt = '" + PART_FILE + "')";
    runCommand(query);
    final Set<WriteEntity> outputs = getOutputs(tableName, Entity.Type.TABLE);
    final Set<ReadEntity> inputs = getInputs(loadFile, Entity.Type.DFS_DIR);
    final Set<WriteEntity> partitionOps = new LinkedHashSet<>(outputs);
    partitionOps.addAll(getOutputs(DEFAULT_DB + "@" + tableName + "@dt=" + PART_FILE, Entity.Type.PARTITION));
    Referenceable processReference = validateProcess(constructEvent(query, HiveOperation.LOAD, inputs, partitionOps), inputs, outputs);
    validateHDFSPaths(processReference, INPUTS, loadFile);
    validateOutputTables(processReference, outputs);
    final String loadFile2 = createTestDFSFile("loadDFSFile1");
    query = "load data inpath '" + loadFile2 + "' into table " + tableName + " partition(dt = '" + PART_FILE + "')";
    runCommand(query);
    Set<ReadEntity> process2Inputs = getInputs(loadFile2, Entity.Type.DFS_DIR);
    Set<ReadEntity> expectedInputs = new LinkedHashSet<>();
    expectedInputs.addAll(process2Inputs);
    expectedInputs.addAll(inputs);
    validateProcess(constructEvent(query, HiveOperation.LOAD, expectedInputs, partitionOps), expectedInputs, outputs);
}
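createTestDFSFile is another helper not shown in this snippet; it is expected to create a file on the test file system and return its fully qualified path, which the load statement then consumes. A rough sketch under those assumptions follows (the Configuration/FileSystem wiring and the /tmp base directory are illustrative only; the real helper likely targets the test cluster's file system).

// Hypothetical sketch of a createTestDFSFile-style helper: creates an empty
// file under a test directory and returns its qualified path string.
private String createTestDFSFile(String fileName) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    Path file = new Path("/tmp/" + fileName + "/data.txt");
    fs.create(file).close();                  // create an empty file
    return fs.makeQualified(file).toString(); // e.g. a scheme-qualified path
}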