
Example 6 with WriteEntity

use of org.apache.hadoop.hive.ql.hooks.WriteEntity in project hive by apache.

the class DDLTask method createIndex.

private int createIndex(Hive db, CreateIndexDesc crtIndex) throws HiveException {
    if (HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
        throw new UnsupportedOperationException("Indexes unsupported for Tez execution engine");
    }
    if (crtIndex.getSerde() != null) {
        validateSerDe(crtIndex.getSerde());
    }
    String indexTableName = crtIndex.getIndexTableName();
    if (!Utilities.isDefaultNameNode(conf)) {
        // If a location is specified, ensure that it is a fully qualified name
        makeLocationQualified(crtIndex, indexTableName);
    }
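    // Register the index definition (handler class, indexed columns, index table, storage/serde and delimiter settings) in the metastore.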
    db.createIndex(crtIndex.getTableName(), crtIndex.getIndexName(), crtIndex.getIndexTypeHandlerClass(), crtIndex.getIndexedCols(), crtIndex.getIndexTableName(), crtIndex.getDeferredRebuild(), crtIndex.getInputFormat(), crtIndex.getOutputFormat(), crtIndex.getSerde(), crtIndex.getStorageHandler(), crtIndex.getLocation(), crtIndex.getIdxProps(), crtIndex.getTblProps(), crtIndex.getSerdeProps(), crtIndex.getCollItemDelim(), crtIndex.getFieldDelim(), crtIndex.getFieldEscape(), crtIndex.getLineDelim(), crtIndex.getMapKeyDelim(), crtIndex.getIndexComment());
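    // If the index handler materializes its data in a physical index table, report that table as a DDL_NO_LOCK write entity on the task's outputs.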
    if (HiveUtils.getIndexHandler(conf, crtIndex.getIndexTypeHandlerClass()).usesIndexTable()) {
        Table indexTable = db.getTable(indexTableName);
        addIfAbsentByName(new WriteEntity(indexTable, WriteEntity.WriteType.DDL_NO_LOCK));
    }
    return 0;
}
Also used : Table(org.apache.hadoop.hive.ql.metadata.Table) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity)
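
For orientation, a minimal sketch (not the actual DDLTask source) of what addIfAbsentByName typically does: deduplicate by entity name before registering the new WriteEntity. The outputs field is an assumption standing in for the Set<WriteEntity> the task reports to hooks.

private void addIfAbsentByName(WriteEntity newWriteEntity) {
    // 'outputs' is assumed to be the task's Set<WriteEntity>; skip the add if an
    // entity with the same name has already been registered.
    for (WriteEntity writeEntity : outputs) {
        if (writeEntity.getName().equalsIgnoreCase(newWriteEntity.getName())) {
            return;
        }
    }
    outputs.add(newWriteEntity);
}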

Example 7 with WriteEntity

use of org.apache.hadoop.hive.ql.hooks.WriteEntity in project incubator-atlas by apache.

the class HiveHookIT method getPartitionOutput.

private WriteEntity getPartitionOutput() {
    WriteEntity partEntity = new WriteEntity();
    partEntity.setName(PART_FILE);
    partEntity.setTyp(Entity.Type.PARTITION);
    return partEntity;
}
Also used : WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity)
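
The read-side counterpart used elsewhere in these tests would look almost identical; a hedged sketch, assuming the same PART_FILE constant and the Entity setName/setTyp setters shown above:

private ReadEntity getPartitionInput() {
    // Mirror of getPartitionOutput: describe the same partition as a query input.
    ReadEntity partEntity = new ReadEntity();
    partEntity.setName(PART_FILE);
    partEntity.setTyp(Entity.Type.PARTITION);
    return partEntity;
}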

Example 8 with WriteEntity

use of org.apache.hadoop.hive.ql.hooks.WriteEntity in project incubator-atlas by apache.

the class HiveHookIT method testColumnLevelLineage.

/*
    This test is disabled by default.
    Reason: Atlas uses Hive version 1.2.x, and the Hive patch HIVE-13112, which enables column-level lineage, is not
    committed to Hive version 1.2.x. The test will fail if lineage information is not available from Hive.
    Once the patch for HIVE-13112 is committed to the Hive 1.2.x branch, the test can be enabled.
    Please track HIVE-14706 for the status of column lineage availability in later Hive versions (i.e. 2.1.x).
     */
@Test(enabled = false)
public void testColumnLevelLineage() throws Exception {
    String sourceTable = "table" + random();
    runCommand("create table " + sourceTable + "(a int, b int)");
    String sourceTableGUID = assertTableIsRegistered(DEFAULT_DB, sourceTable);
    String a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "a"));
    String b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, sourceTable), "b"));
    String ctasTableName = "table" + random();
    String query = "create table " + ctasTableName + " as " + "select sum(a+b) as a, count(*) as b from " + sourceTable;
    runCommand(query);
    String dest_a_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "a"));
    String dest_b_guid = assertColumnIsRegistered(HiveMetaStoreBridge.getColumnQualifiedName(HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, ctasTableName), "b"));
    final Set<ReadEntity> inputs = getInputs(sourceTable, Entity.Type.TABLE);
    final Set<WriteEntity> outputs = getOutputs(ctasTableName, Entity.Type.TABLE);
    HiveHook.HiveEventContext event = constructEvent(query, HiveOperation.CREATETABLE_AS_SELECT, inputs, outputs);
    assertProcessIsRegistered(event);
    assertTableIsRegistered(DEFAULT_DB, ctasTableName);
    String processQName = sortEventsAndGetProcessQualifiedName(event);
    List<String> aLineageInputs = Arrays.asList(a_guid, b_guid);
    String aLineageProcessName = processQName + ":" + "a";
    LOG.debug("Searching for column lineage process {} ", aLineageProcessName);
    String guid = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, aLineageProcessName, null);
    List<Id> processInputs = (List<Id>) atlasClient.getEntity(guid).get("inputs");
    List<String> processInputsAsString = new ArrayList<>();
    for (Id input : processInputs) {
        processInputsAsString.add(input._getId());
    }
    Collections.sort(processInputsAsString);
    Collections.sort(aLineageInputs);
    Assert.assertEquals(processInputsAsString, aLineageInputs);
    List<String> bLineageInputs = Arrays.asList(sourceTableGUID);
    String bLineageProcessName = processQName + ":" + "b";
    LOG.debug("Searching for column lineage process {} ", bLineageProcessName);
    String guid1 = assertEntityIsRegistered(HiveDataTypes.HIVE_COLUMN_LINEAGE.getName(), AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, bLineageProcessName, null);
    List<Id> bProcessInputs = (List<Id>) atlasClient.getEntity(guid1).get("inputs");
    List<String> bProcessInputsAsString = new ArrayList<>();
    for (Id input : bProcessInputs) {
        bProcessInputsAsString.add(input._getId());
    }
    Collections.sort(bProcessInputsAsString);
    Collections.sort(bLineageInputs);
    Assert.assertEquals(bProcessInputsAsString, bLineageInputs);
    //Test lineage API response
    JSONObject response = atlasClient.getInputGraphForEntity(dest_a_guid);
    JSONObject vertices = response.getJSONObject("values").getJSONObject("vertices");
    JSONObject dest_a_val = (JSONObject) vertices.get(dest_a_guid);
    JSONObject src_a_val = (JSONObject) vertices.get(a_guid);
    JSONObject src_b_val = (JSONObject) vertices.get(b_guid);
    Assert.assertNotNull(dest_a_val);
    Assert.assertNotNull(src_a_val);
    Assert.assertNotNull(src_b_val);
    JSONObject b_response = atlasClient.getInputGraphForEntity(dest_b_guid);
    JSONObject b_vertices = b_response.getJSONObject("values").getJSONObject("vertices");
    JSONObject b_val = (JSONObject) b_vertices.get(dest_b_guid);
    JSONObject src_tbl_val = (JSONObject) b_vertices.get(sourceTableGUID);
    Assert.assertNotNull(b_val);
    Assert.assertNotNull(src_tbl_val);
}
Also used : ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) JSONObject(org.codehaus.jettison.json.JSONObject) ImmutableList(com.google.common.collect.ImmutableList) Id(org.apache.atlas.typesystem.persistence.Id) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Test(org.testng.annotations.Test)
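
The getInputs/getOutputs helpers invoked above are not shown in this example; a plausible sketch of getOutputs, assuming it simply wraps a single entity named by the caller (the real helper may qualify names differently):

private Set<WriteEntity> getOutputs(String name, Entity.Type entityType) {
    // Hypothetical: build one WriteEntity of the requested type and return it in
    // an order-preserving set, matching how the tests iterate outputs.
    WriteEntity entity = new WriteEntity();
    entity.setName(name);
    entity.setTyp(entityType);
    return new LinkedHashSet<>(Collections.singletonList(entity));
}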

Example 9 with WriteEntity

use of org.apache.hadoop.hive.ql.hooks.WriteEntity in project incubator-atlas by apache.

the class HiveHookIT method testUpdateProcess.

@Test
public void testUpdateProcess() throws Exception {
    String tableName = createTable();
    String pFile1 = createTestDFSPath("somedfspath1");
    String query = "insert overwrite DIRECTORY '" + pFile1 + "' select id, name from " + tableName;
    runCommand(query);
    Set<ReadEntity> inputs = getInputs(tableName, Entity.Type.TABLE);
    final Set<WriteEntity> outputs = getOutputs(pFile1, Entity.Type.DFS_DIR);
    outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);
    final HiveHook.HiveEventContext hiveEventContext = constructEvent(query, HiveOperation.QUERY, inputs, outputs);
    Referenceable processReference = validateProcess(hiveEventContext);
    validateHDFSPaths(processReference, OUTPUTS, pFile1);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    validateInputTables(processReference, inputs);
    //Rerun same query with same HDFS path
    runCommandWithDelay(query, 1000);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    Referenceable process2Reference = validateProcess(hiveEventContext);
    validateHDFSPaths(process2Reference, OUTPUTS, pFile1);
    Assert.assertEquals(process2Reference.getId()._getId(), processReference.getId()._getId());
    //Rerun the same query with a new HDFS path. This resolves to the same process, since the HDFS path is not part of the qualified name for QUERY operations
    final String pFile2 = createTestDFSPath("somedfspath2");
    query = "insert overwrite DIRECTORY '" + pFile2 + "' select id, name from " + tableName;
    runCommandWithDelay(query, 1000);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    Set<WriteEntity> p3Outputs = new LinkedHashSet<WriteEntity>() {

        {
            addAll(getOutputs(pFile2, Entity.Type.DFS_DIR));
            addAll(outputs);
        }
    };
    Referenceable process3Reference = validateProcess(constructEvent(query, HiveOperation.QUERY, inputs, p3Outputs));
    validateHDFSPaths(process3Reference, OUTPUTS, pFile2);
    Assert.assertEquals(process3Reference.getId()._getId(), processReference.getId()._getId());
}
Also used : ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) Referenceable(org.apache.atlas.typesystem.Referenceable) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Test(org.testng.annotations.Test)
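
runCommandWithDelay is presumably there to give the asynchronous Atlas hook time to deliver its notification before the assertions run; a hedged sketch of such a helper (the name reuse of runCommand and the fixed sleep are assumptions):

private void runCommandWithDelay(String cmd, long sleepMs) throws Exception {
    runCommand(cmd);
    // Crude wait for the async hook notification to reach Atlas; a real helper
    // might poll for the expected entity instead of sleeping a fixed interval.
    Thread.sleep(sleepMs);
}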

Example 10 with WriteEntity

use of org.apache.hadoop.hive.ql.hooks.WriteEntity in project incubator-atlas by apache.

the class HiveHookIT method testInsertIntoDFSDirPartitioned.

@Test
public void testInsertIntoDFSDirPartitioned() throws Exception {
    //Test with partitioned table
    String tableName = createTable(true);
    String pFile1 = createTestDFSPath("somedfspath1");
    String query = "insert overwrite DIRECTORY '" + pFile1 + "' select id, name from " + tableName + " where dt = '" + PART_FILE + "'";
    runCommand(query);
    Set<ReadEntity> inputs = getInputs(tableName, Entity.Type.TABLE);
    final Set<WriteEntity> outputs = getOutputs(pFile1, Entity.Type.DFS_DIR);
    outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);
    final Set<ReadEntity> partitionIps = new LinkedHashSet<>(inputs);
    partitionIps.addAll(getInputs(DEFAULT_DB + "@" + tableName + "@dt='" + PART_FILE + "'", Entity.Type.PARTITION));
    Referenceable processReference = validateProcess(constructEvent(query, HiveOperation.QUERY, partitionIps, outputs), inputs, outputs);
    //Rerun the same query with a different HDFS path. This should not create another process; it should update the existing one.
    final String pFile2 = createTestDFSPath("somedfspath2");
    query = "insert overwrite DIRECTORY '" + pFile2 + "' select id, name from " + tableName + " where dt = '" + PART_FILE + "'";
    runCommand(query);
    final Set<WriteEntity> pFile2Outputs = getOutputs(pFile2, Entity.Type.DFS_DIR);
    pFile2Outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);
    //Now the process has 2 paths - an older one with a deleted reference to the partition, and another with the latest partition
    Set<WriteEntity> p2Outputs = new LinkedHashSet<WriteEntity>() {

        {
            addAll(pFile2Outputs);
            addAll(outputs);
        }
    };
    Referenceable process2Reference = validateProcess(constructEvent(query, HiveOperation.QUERY, partitionIps, pFile2Outputs), inputs, p2Outputs);
    validateHDFSPaths(process2Reference, OUTPUTS, pFile2);
    Assert.assertEquals(process2Reference.getId()._getId(), processReference.getId()._getId());
}
Also used : ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) Referenceable(org.apache.atlas.typesystem.Referenceable) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Test(org.testng.annotations.Test)
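
The DFS_DIR output construction appears twice in this test; a small hedged helper consolidating that pattern (getDirOutputs is an illustrative name, not part of the original class):

private Set<WriteEntity> getDirOutputs(String dfsPath) {
    // Build a single DFS_DIR output and mark it as a filesystem path write,
    // exactly as the test does inline for pFile1 and pFile2.
    Set<WriteEntity> outputs = getOutputs(dfsPath, Entity.Type.DFS_DIR);
    outputs.iterator().next().setWriteType(WriteEntity.WriteType.PATH_WRITE);
    return outputs;
}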

Aggregations

WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity): 88
Table (org.apache.hadoop.hive.ql.metadata.Table): 39
ReadEntity (org.apache.hadoop.hive.ql.hooks.ReadEntity): 35
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 24
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 24
ArrayList (java.util.ArrayList): 18
DDLWork (org.apache.hadoop.hive.ql.plan.DDLWork): 14
Path (org.apache.hadoop.fs.Path): 13
AlterTableExchangePartition (org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition): 13
Referenceable (org.apache.atlas.typesystem.Referenceable): 11
Database (org.apache.hadoop.hive.metastore.api.Database): 11
Test (org.junit.Test): 11
QueryPlan (org.apache.hadoop.hive.ql.QueryPlan): 10
HashMap (java.util.HashMap): 9
LinkedHashMap (java.util.LinkedHashMap): 9
Test (org.testng.annotations.Test): 9
SQLCheckConstraint (org.apache.hadoop.hive.metastore.api.SQLCheckConstraint): 8
SQLDefaultConstraint (org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint): 8
SQLNotNullConstraint (org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint): 8
SQLUniqueConstraint (org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint): 8