use of org.apache.hadoop.hive.ql.hooks.WriteEntity in project hive by apache.
the class DDLTask method createIndex.
private int createIndex(Hive db, CreateIndexDesc crtIndex) throws HiveException {
  if (HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
    throw new UnsupportedOperationException("Indexes unsupported for Tez execution engine");
  }
  if (crtIndex.getSerde() != null) {
    validateSerDe(crtIndex.getSerde());
  }
  String indexTableName = crtIndex.getIndexTableName();
  if (!Utilities.isDefaultNameNode(conf)) {
    // If a location is specified, ensure that it is a fully qualified name
    makeLocationQualified(crtIndex, indexTableName);
  }
  db.createIndex(crtIndex.getTableName(), crtIndex.getIndexName(), crtIndex.getIndexTypeHandlerClass(),
      crtIndex.getIndexedCols(), crtIndex.getIndexTableName(), crtIndex.getDeferredRebuild(),
      crtIndex.getInputFormat(), crtIndex.getOutputFormat(), crtIndex.getSerde(), crtIndex.getStorageHandler(),
      crtIndex.getLocation(), crtIndex.getIdxProps(), crtIndex.getTblProps(), crtIndex.getSerdeProps(),
      crtIndex.getCollItemDelim(), crtIndex.getFieldDelim(), crtIndex.getFieldEscape(), crtIndex.getLineDelim(),
      crtIndex.getMapKeyDelim(), crtIndex.getIndexComment());
  if (HiveUtils.getIndexHandler(conf, crtIndex.getIndexTypeHandlerClass()).usesIndexTable()) {
    Table indexTable = db.getTable(indexTableName);
    addIfAbsentByName(new WriteEntity(indexTable, WriteEntity.WriteType.DDL_NO_LOCK));
  }
  return 0;
}
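addIfAbsentByName is a DDLTask helper that is not shown in this snippet; its job is to register the index table as a DDL_NO_LOCK output for downstream hooks without adding a duplicate entry. A minimal sketch of such a helper, assuming the task tracks its outputs in a set and matches entities by name (the real implementation may differ):

// Hypothetical sketch of a name-based dedup helper like addIfAbsentByName.
// The "outputs" set and the case-insensitive matching rule are assumptions;
// the real DDLTask helper may differ.
private final Set<WriteEntity> outputs = new LinkedHashSet<>();

private void addIfAbsentByName(WriteEntity newEntity) {
  for (WriteEntity existing : outputs) {
    if (existing.getName().equalsIgnoreCase(newEntity.getName())) {
      return; // an output with this name is already registered
    }
  }
  outputs.add(newEntity);
}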
use of org.apache.hadoop.hive.ql.hooks.WriteEntity in project incubator-atlas by apache.
the class HiveHookIT method getPartitionOutput.
private WriteEntity getPartitionOutput() {
  WriteEntity partEntity = new WriteEntity();
  partEntity.setName(PART_FILE);
  partEntity.setTyp(Entity.Type.PARTITION);
  return partEntity;
}
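The tests below also call getInputs and getOutputs helpers that build entity sets for tables and DFS directories; they are not part of this listing. A minimal sketch of what they might look like, reusing the default-constructor/setName/setTyp pattern from getPartitionOutput above (the helper signatures and name formats are assumptions):

// Hypothetical sketches of the helpers used by the tests below; the real
// HiveHookIT helpers may qualify entity names differently.
private Set<ReadEntity> getInputs(String name, Entity.Type type) {
  ReadEntity entity = new ReadEntity();
  entity.setName(name);
  entity.setTyp(type);
  return new LinkedHashSet<>(Collections.singleton(entity));
}

private Set<WriteEntity> getOutputs(String name, Entity.Type type) {
  WriteEntity entity = new WriteEntity();
  entity.setName(name);
  entity.setTyp(type);
  return new LinkedHashSet<>(Collections.singleton(entity));
}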
use of org.apache.hadoop.hive.ql.hooks.WriteEntity in project incubator-atlas by apache.
the class HiveHookIT method testColumnLevelLineage.
/*
This test is disabled by default.
Reason: Atlas builds against Hive 1.2.x, and HIVE-13112, the Hive patch that enables column-level
lineage, has not been committed to the 1.2.x branch.
This test will fail if the lineage information is not available from Hive.
Once the patch for HIVE-13112 is committed to the Hive 1.2.x branch, the test can be enabled.
Please track HIVE-14706 for the status of column-level lineage availability in later Hive versions, i.e. 2.1.x.
*/
@Test(enabled = false)
public void testColumnLevelLineage() throws Exception {
  String sourceTable = "table" + random();
  runCommand("create table " + sourceTable + "(a int, b int)");
  String sourceTableGUID = assertTableIsRegistered(DEFAULT_DB, sourceTable);
  String a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "a"));
  String b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "b"));

  String ctasTableName = "table" + random();
  String query = "create table " + ctasTableName + " as " + "select sum(a+b) as a, count(*) as b from " + sourceTable;
  runCommand(query);
  String dest_a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "a"));
  String dest_b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "b"));

  final Set<ReadEntity> inputs = getInputs(sourceTable, Entity.Type.TABLE);
  final Set<WriteEntity> outputs = getOutputs(ctasTableName, Entity.Type.TABLE);
  HiveHook.HiveEventContext event = constructEvent(query, HiveOperation.CREATETABLE_AS_SELECT, inputs, outputs);
  assertProcessIsRegistered(event);
  assertTableIsRegistered(DEFAULT_DB, ctasTableName);
  String processQName = sortEventsAndGetProcessQualifiedName(event);

  // Column "a" of the CTAS table is derived from both source columns a and b
  List<String> aLineageInputs = Arrays.asList(a_guid, b_guid);
  String aLineageProcessName = processQName + ":" + "a";
  LOG.debug("Searching for column lineage process {} ", aLineageProcessName);
  String guid = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, aLineageProcessName, null);
  List<Id> processInputs = (List<Id>) atlasClient.getEntity(guid).get("inputs");
  List<String> processInputsAsString = new ArrayList<>();
  for (Id input : processInputs) {
    processInputsAsString.add(input._getId());
  }
  Collections.sort(processInputsAsString);
  Collections.sort(aLineageInputs);
  Assert.assertEquals(processInputsAsString, aLineageInputs);

  // Column "b" comes from count(*), so its lineage input is the source table itself
  List<String> bLineageInputs = Arrays.asList(sourceTableGUID);
  String bLineageProcessName = processQName + ":" + "b";
  LOG.debug("Searching for column lineage process {} ", bLineageProcessName);
  String guid1 = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, bLineageProcessName, null);
  List<Id> bProcessInputs = (List<Id>) atlasClient.getEntity(guid1).get("inputs");
  List<String> bProcessInputsAsString = new ArrayList<>();
  for (Id input : bProcessInputs) {
    bProcessInputsAsString.add(input._getId());
  }
  Collections.sort(bProcessInputsAsString);
  Collections.sort(bLineageInputs);
  Assert.assertEquals(bProcessInputsAsString, bLineageInputs);

  // Test the lineage API response
  JSONObject response = atlasClient.getInputGraphForEntity(dest_a_guid);
  JSONObject vertices = response.getJSONObject("values").getJSONObject("vertices");
  JSONObject dest_a_val = (JSONObject) vertices.get(dest_a_guid);
  JSONObject src_a_val = (JSONObject) vertices.get(a_guid);
  JSONObject src_b_val = (JSONObject) vertices.get(b_guid);
  Assert.assertNotNull(dest_a_val);
  Assert.assertNotNull(src_a_val);
  Assert.assertNotNull(src_b_val);

  JSONObject b_response = atlasClient.getInputGraphForEntity(dest_b_guid);
  JSONObject b_vertices = b_response.getJSONObject("values").getJSONObject("vertices");
  JSONObject b_val = (JSONObject) b_vertices.get(dest_b_guid);
  JSONObject src_tbl_val = (JSONObject) b_vertices.get(sourceTableGUID);
  Assert.assertNotNull(b_val);
  Assert.assertNotNull(src_tbl_val);
}
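constructEvent is another test helper that packages the query string, operation, and entity sets into a HiveHook.HiveEventContext for the assertion methods. A rough sketch, assuming HiveEventContext exposes plain setters for these fields (the real helper likely also sets the user, query id, and timestamps):

// Hypothetical sketch of constructEvent; the setter names on
// HiveHook.HiveEventContext are assumptions and may not match exactly.
private HiveHook.HiveEventContext constructEvent(String query, HiveOperation op,
    Set<ReadEntity> inputs, Set<WriteEntity> outputs) {
  HiveHook.HiveEventContext event = new HiveHook.HiveEventContext();
  event.setQueryStr(query);
  event.setOperation(op);
  event.setInputs(inputs);
  event.setOutputs(outputs);
  return event;
}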
use of org.apache.hadoop.hive.ql.hooks.WriteEntity in project incubator-atlas by apache.
the class HiveHookIT method testUpdateProcess.
@Test
public void testUpdateProcess() throws Exception {
  String tableName = createTable();
  String pFile1 = createTestDFSPath("somedfspath1");
  String query = "insert overwrite DIRECTORY '" + pFile1 + "' select id, name from " + tableName;
  runCommand(query);
  Set<ReadEntity> inputs = getInputs(tableName, Entity.Type.TABLE);
  final Set<WriteEntity> outputs = getOutputs(pFile1, Entity.Type.DFS_DIR);
  outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);
  final HiveHook.HiveEventContext hiveEventContext = constructEvent(query, HiveOperation.QUERY, inputs, outputs);
  Referenceable processReference = validateProcess(hiveEventContext);
  validateHDFSPaths(processReference, OUTPUTS, pFile1);
  assertTableIsRegistered(DEFAULT_DB, tableName);
  validateInputTables(processReference, inputs);

  // Rerun the same query with the same HDFS path
  runCommandWithDelay(query, 1000);
  assertTableIsRegistered(DEFAULT_DB, tableName);
  Referenceable process2Reference = validateProcess(hiveEventContext);
  validateHDFSPaths(process2Reference, OUTPUTS, pFile1);
  Assert.assertEquals(process2Reference.getId()._getId(), processReference.getId()._getId());

  // Rerun the same query with a new HDFS path. This results in the same process,
  // since HDFS paths are not part of the qualified name for QUERY operations.
  final String pFile2 = createTestDFSPath("somedfspath2");
  query = "insert overwrite DIRECTORY '" + pFile2 + "' select id, name from " + tableName;
  runCommandWithDelay(query, 1000);
  assertTableIsRegistered(DEFAULT_DB, tableName);
  Set<WriteEntity> p3Outputs = new LinkedHashSet<WriteEntity>() {
    {
      addAll(getOutputs(pFile2, Entity.Type.DFS_DIR));
      addAll(outputs);
    }
  };
  Referenceable process3Reference = validateProcess(constructEvent(query, HiveOperation.QUERY, inputs, p3Outputs));
  validateHDFSPaths(process3Reference, OUTPUTS, pFile2);
  Assert.assertEquals(process3Reference.getId()._getId(), processReference.getId()._getId());
}
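The p3Outputs set above is built with double-brace initialization, which creates an anonymous LinkedHashSet subclass just to call addAll. An equivalent and arguably clearer way to combine the new DFS path outputs with those from the earlier run, reusing the same variables from the test:

// Equivalent to the double-brace initialization above: merge the outputs for
// the new DFS path with the outputs captured from the earlier run.
Set<WriteEntity> p3Outputs = new LinkedHashSet<>(getOutputs(pFile2, Entity.Type.DFS_DIR));
p3Outputs.addAll(outputs);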
use of org.apache.hadoop.hive.ql.hooks.WriteEntity in project incubator-atlas by apache.
the class HiveHookIT method testInsertIntoDFSDirPartitioned.
@Test
public void testInsertIntoDFSDirPartitioned() throws Exception {
  // Test with a partitioned table
  String tableName = createTable(true);
  String pFile1 = createTestDFSPath("somedfspath1");
  String query = "insert overwrite DIRECTORY '" + pFile1 + "' select id, name from " + tableName + " where dt = '" + PART_FILE + "'";
  runCommand(query);
  Set<ReadEntity> inputs = getInputs(tableName, Entity.Type.TABLE);
  final Set<WriteEntity> outputs = getOutputs(pFile1, Entity.Type.DFS_DIR);
  outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);
  final Set<ReadEntity> partitionIps = new LinkedHashSet<>(inputs);
  partitionIps.addAll(getInputs(DEFAULT_DB + "@" + tableName + "@dt='" + PART_FILE + "'", Entity.Type.PARTITION));
  Referenceable processReference = validateProcess(constructEvent(query, HiveOperation.QUERY, partitionIps, outputs), inputs, outputs);

  // Rerun the same query with a different HDFS path. This should not create another process; it should update the existing one.
  final String pFile2 = createTestDFSPath("somedfspath2");
  query = "insert overwrite DIRECTORY '" + pFile2 + "' select id, name from " + tableName + " where dt = '" + PART_FILE + "'";
  runCommand(query);
  final Set<WriteEntity> pFile2Outputs = getOutputs(pFile2, Entity.Type.DFS_DIR);
  pFile2Outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);
  // The process now has two paths: the older one, whose partition reference was deleted, and the one with the latest partition.
  Set<WriteEntity> p2Outputs = new LinkedHashSet<WriteEntity>() {
    {
      addAll(pFile2Outputs);
      addAll(outputs);
    }
  };
  Referenceable process2Reference = validateProcess(constructEvent(query, HiveOperation.QUERY, partitionIps, pFile2Outputs), inputs, p2Outputs);
  validateHDFSPaths(process2Reference, OUTPUTS, pFile2);
  Assert.assertEquals(process2Reference.getId()._getId(), processReference.getId()._getId());
}
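The partition-level inputs here are built by appending a db@table@partition-spec entity to the table-level inputs. For illustration, the same partition ReadEntity could be constructed directly with the pattern from getPartitionOutput; the name format below simply mirrors the getInputs call above, and whether it matches what Hive itself emits for partition entities is an assumption:

// Hypothetical sketch: build the partition-level input alongside the
// table-level inputs from the test above. The "db@table@partSpec" name
// format mirrors the getInputs(...) call and may differ from what Hive
// produces for real partition entities.
ReadEntity partitionInput = new ReadEntity();
partitionInput.setName(DEFAULT_DB + "@" + tableName + "@dt='" + PART_FILE + "'");
partitionInput.setTyp(Entity.Type.PARTITION);

Set<ReadEntity> partitionIps = new LinkedHashSet<>(inputs);
partitionIps.add(partitionInput);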