
Example 46 with ReadEntity

Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.

The class HiveHook, method handleExternalTables.

private void handleExternalTables(final HiveMetaStoreBridge dgiBridge, final HiveEventContext event, final LinkedHashMap<Type, Referenceable> tables) throws HiveException, MalformedURLException {
    List<Referenceable> entities = new ArrayList<>();
    final WriteEntity hiveEntity = (WriteEntity) getEntityByType(event.getOutputs(), Type.TABLE);
    Table hiveTable = hiveEntity == null ? null : hiveEntity.getTable();
    // Refresh to get the correct location
    if (hiveTable != null) {
        hiveTable = dgiBridge.hiveClient.getTable(hiveTable.getDbName(), hiveTable.getTableName());
    }
    if (hiveTable != null && TableType.EXTERNAL_TABLE.equals(hiveTable.getTableType())) {
        LOG.info("Registering external table process {} ", event.getQueryStr());
        final String location = lower(hiveTable.getDataLocation().toString());
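        // Build a synthetic ReadEntity pointing at the table's HDFS location.
        // Note: Entity's setters really are named setTyp and setD in Hive's hooks API.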
        final ReadEntity dfsEntity = new ReadEntity();
        dfsEntity.setTyp(Type.DFS_DIR);
        dfsEntity.setD(new Path(location));
        SortedMap<ReadEntity, Referenceable> hiveInputsMap = new TreeMap<ReadEntity, Referenceable>(entityComparator) {

            {
                put(dfsEntity, dgiBridge.fillHDFSDataSet(location));
            }
        };
        SortedMap<WriteEntity, Referenceable> hiveOutputsMap = new TreeMap<WriteEntity, Referenceable>(entityComparator) {

            {
                put(hiveEntity, tables.get(Type.TABLE));
            }
        };
        SortedSet<ReadEntity> sortedIps = new TreeSet<>(entityComparator);
        sortedIps.addAll(hiveInputsMap.keySet());
        SortedSet<WriteEntity> sortedOps = new TreeSet<>(entityComparator);
        sortedOps.addAll(hiveOutputsMap.keySet());
        Referenceable processReferenceable = getProcessReferenceable(dgiBridge, event, sortedIps, sortedOps, hiveInputsMap, hiveOutputsMap);
        entities.addAll(tables.values());
        entities.add(processReferenceable);
        event.addMessage(new HookNotification.EntityUpdateRequest(event.getUser(), entities));
    }
}
Also used: Path(org.apache.hadoop.fs.Path) Table(org.apache.hadoop.hive.ql.metadata.Table) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) HookNotification(org.apache.atlas.notification.hook.HookNotification) Referenceable(org.apache.atlas.typesystem.Referenceable) TreeSet(java.util.TreeSet) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity)
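
The single-entry maps above use double-brace initialization: the outer braces declare an anonymous TreeMap subclass and the inner braces are an instance initializer that runs at construction. A minimal standalone sketch of the idiom, with String keys standing in for Hive entities and a natural-order comparator standing in for the hook's entityComparator:

import java.util.Comparator;
import java.util.SortedMap;
import java.util.TreeMap;

public class DoubleBraceSketch {
    public static void main(String[] args) {
        // Stand-in for entityComparator: order keys lexicographically.
        Comparator<String> byName = Comparator.naturalOrder();
        // Outer braces: anonymous TreeMap subclass. Inner braces: instance
        // initializer that seeds the map before the reference is published.
        SortedMap<String, String> inputs = new TreeMap<String, String>(byName) {
            {
                put("hdfs://namenode:8020/warehouse/ext_table", "hdfs_path referenceable");
            }
        };
        System.out.println(inputs);
    }
}

The idiom is compact but creates an extra anonymous class per use site (and, in instance contexts, captures the enclosing instance); a plain put() after construction does the same job without either cost.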

Example 47 with ReadEntity

Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.

The class HiveHook, method registerProcess.

private void registerProcess(HiveMetaStoreBridge dgiBridge, HiveEventContext event) throws AtlasHookException {
    try {
        Set<ReadEntity> inputs = event.getInputs();
        Set<WriteEntity> outputs = event.getOutputs();
        // Even an EXPLAIN of a CTAS has operation name CREATETABLE_AS_SELECT
        if (inputs.isEmpty() && outputs.isEmpty()) {
            LOG.info("Explain statement. Skipping...");
            return;
        }
        if (event.getQueryId() == null) {
            LOG.info("Query id/plan is missing for {}", event.getQueryStr());
        }
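        // source/target map each Hive read/write entity to its Atlas referenceable
        // in comparator order; entities accumulates everything to be published.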
        final SortedMap<ReadEntity, Referenceable> source = new TreeMap<>(entityComparator);
        final SortedMap<WriteEntity, Referenceable> target = new TreeMap<>(entityComparator);
        final Set<String> dataSets = new HashSet<>();
        final Set<Referenceable> entities = new LinkedHashSet<>();
        boolean isSelectQuery = isSelectQuery(event);
        // filter out select queries which do not modify data
        if (!isSelectQuery) {
            SortedSet<ReadEntity> sortedHiveInputs = new TreeSet<>(entityComparator);
            if (event.getInputs() != null) {
                sortedHiveInputs.addAll(event.getInputs());
            }
            SortedSet<WriteEntity> sortedHiveOutputs = new TreeSet<>(entityComparator);
            if (event.getOutputs() != null) {
                sortedHiveOutputs.addAll(event.getOutputs());
            }
            for (ReadEntity readEntity : sortedHiveInputs) {
                processHiveEntity(dgiBridge, event, readEntity, dataSets, source, entities);
            }
            for (WriteEntity writeEntity : sortedHiveOutputs) {
                processHiveEntity(dgiBridge, event, writeEntity, dataSets, target, entities);
            }
            if (source.size() > 0 || target.size() > 0) {
                Referenceable processReferenceable = getProcessReferenceable(dgiBridge, event, sortedHiveInputs, sortedHiveOutputs, source, target);
                // setup Column Lineage
                List<Referenceable> sourceList = new ArrayList<>(source.values());
                List<Referenceable> targetList = new ArrayList<>(target.values());
                List<Referenceable> colLineageProcessInstances = new ArrayList<>();
                try {
                    Map<String, Referenceable> columnQNameToRef = ColumnLineageUtils.buildColumnReferenceableMap(sourceList, targetList);
                    colLineageProcessInstances = createColumnLineageProcessInstances(processReferenceable, event.lineageInfo, columnQNameToRef);
                } catch (Exception e) {
                    LOG.warn("Column lineage process setup failed with exception {}", e);
                }
                colLineageProcessInstances.add(0, processReferenceable);
                entities.addAll(colLineageProcessInstances);
                event.addMessage(new HookNotification.EntityUpdateRequest(event.getUser(), new ArrayList<>(entities)));
            } else {
                LOG.info("Skipped query {} since it has no getInputs() or resulting getOutputs()", event.getQueryStr());
            }
        } else {
            LOG.info("Skipped query {} for processing since it is a select query ", event.getQueryStr());
        }
    } catch (Exception e) {
        throw new AtlasHookException("HiveHook.registerProcess() failed.", e);
    }
}
Also used: LinkedHashSet(java.util.LinkedHashSet) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) AtlasHookException(org.apache.atlas.hook.AtlasHookException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) MalformedURLException(java.net.MalformedURLException) ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) HookNotification(org.apache.atlas.notification.hook.HookNotification) Referenceable(org.apache.atlas.typesystem.Referenceable) TreeSet(java.util.TreeSet) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) HashSet(java.util.HashSet)
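
registerProcess copies the hook's input and output sets into TreeSets ordered by entityComparator before deriving the process referenceable, so the result does not depend on the unspecified iteration order of the underlying sets. A small sketch of that pattern, with Strings and a natural-order comparator standing in for Hive entities and entityComparator:

import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

public class SortedLineageSketch {
    public static void main(String[] args) {
        // HashSet iteration order is unspecified and can vary between runs...
        Set<String> inputs = new HashSet<>();
        inputs.add("db.t2");
        inputs.add("db.t1");
        // ...so copy into a TreeSet with an explicit comparator first.
        SortedSet<String> sortedInputs = new TreeSet<>(Comparator.naturalOrder());
        sortedInputs.addAll(inputs);
        // Joining the sorted names yields a stable identifier, analogous to a
        // process qualified name that must match across re-runs of the same query.
        String qualifiedName = String.join(",", sortedInputs);
        System.out.println(qualifiedName); // always "db.t1,db.t2"
    }
}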

Example 48 with ReadEntity

Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.

The class HiveHookIT, method testDropAndRecreateCTASOutput.

@Test
public void testDropAndRecreateCTASOutput() throws Exception {
    String tableName = createTable();
    String ctasTableName = "table" + random();
    String query = "create table " + ctasTableName + " as select * from " + tableName;
    runCommand(query);
    assertTableIsRegistered(DEFAULT_DB, ctasTableName);
    Set<ReadEntity> inputs = getInputs(tableName, Entity.Type.TABLE);
    Set<WriteEntity> outputs = getOutputs(ctasTableName, Entity.Type.TABLE);
    final HiveHook.HiveEventContext hiveEventContext = constructEvent(query, HiveOperation.CREATETABLE_AS_SELECT, inputs, outputs);
    String processId = assertProcessIsRegistered(hiveEventContext);
    final String dropQuery = String.format("drop table %s", ctasTableName);
    runCommandWithDelay(dropQuery, 100);
    assertTableIsNotRegistered(DEFAULT_DB, ctasTableName);
    runCommand(query);
    assertTableIsRegistered(DEFAULT_DB, ctasTableName);
    outputs = getOutputs(ctasTableName, Entity.Type.TABLE);
    String process2Id = assertProcessIsRegistered(hiveEventContext, inputs, outputs);
    assertNotEquals(process2Id, processId);
    Referenceable processRef = atlasClient.getEntity(processId);
    validateOutputTables(processRef, outputs);
}
Also used: ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) Referenceable(org.apache.atlas.typesystem.Referenceable) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Test(org.testng.annotations.Test)

Example 49 with ReadEntity

Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.

The class HiveHookIT, method testExportImportPartitionedTable.

@Test
public void testExportImportPartitionedTable() throws Exception {
    boolean isPartitionedTable = true;
    final String tableName = createTable(isPartitionedTable);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    // Add a partition
    String partFile = "pfile://" + mkdir("partition");
    String query = "alter table " + tableName + " add partition (dt='" + PART_FILE + "') location '" + partFile + "'";
    runCommand(query);
    String filename = "pfile://" + mkdir("export");
    query = "export table " + tableName + " to \"" + filename + "\"";
    runCommand(query);
    final Set<ReadEntity> expectedExportInputs = getInputs(tableName, Entity.Type.TABLE);
    final Set<WriteEntity> outputs = getOutputs(filename, Entity.Type.DFS_DIR);
    // Note that export has only the partition as input in this case
    final Set<ReadEntity> partitionIps = getInputs(DEFAULT_DB + "@" + tableName + "@dt=" + PART_FILE, Entity.Type.PARTITION);
    partitionIps.addAll(expectedExportInputs);
    Referenceable processReference = validateProcess(constructEvent(query, HiveOperation.EXPORT, partitionIps, outputs), expectedExportInputs, outputs);
    validateHDFSPaths(processReference, OUTPUTS, filename);
    // Import
    String importTableName = createTable(true);
    assertTableIsRegistered(DEFAULT_DB, importTableName);
    query = "import table " + importTableName + " from '" + filename + "'";
    runCommand(query);
    final Set<ReadEntity> expectedImportInputs = getInputs(filename, Entity.Type.DFS_DIR);
    final Set<WriteEntity> importOutputs = getOutputs(importTableName, Entity.Type.TABLE);
    final Set<WriteEntity> partitionOps = getOutputs(DEFAULT_DB + "@" + importTableName + "@dt=" + PART_FILE, Entity.Type.PARTITION);
    partitionOps.addAll(importOutputs);
    validateProcess(constructEvent(query, HiveOperation.IMPORT, expectedImportInputs, partitionOps), expectedImportInputs, importOutputs);
    // Export should update the same process
    filename = "pfile://" + mkdir("export2");
    query = "export table " + tableName + " to \"" + filename + "\"";
    runCommand(query);
    final Set<WriteEntity> outputs2 = getOutputs(filename, Entity.Type.DFS_DIR);
    Set<WriteEntity> p3Outputs = new LinkedHashSet<WriteEntity>() {

        {
            addAll(outputs2);
            addAll(outputs);
        }
    };
    validateProcess(constructEvent(query, HiveOperation.EXPORT, partitionIps, outputs2), expectedExportInputs, p3Outputs);
    query = "alter table " + importTableName + " drop partition (dt='" + PART_FILE + "')";
    runCommand(query);
    // Import should update the same process
    query = "import table " + importTableName + " from '" + filename + "'";
    runCommandWithDelay(query, 1000);
    final Set<ReadEntity> importInputs = getInputs(filename, Entity.Type.DFS_DIR);
    final Set<ReadEntity> expectedImport2Inputs = new LinkedHashSet<ReadEntity>() {

        {
            addAll(importInputs);
            addAll(expectedImportInputs);
        }
    };
    validateProcess(constructEvent(query, HiveOperation.IMPORT, importInputs, partitionOps), expectedImport2Inputs, importOutputs);
}
Also used: ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) Referenceable(org.apache.atlas.typesystem.Referenceable) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Test(org.testng.annotations.Test)

Example 50 with ReadEntity

Use of org.apache.hadoop.hive.ql.hooks.ReadEntity in project incubator-atlas by Apache.

The class HiveHookIT, method testLoadDFSPathPartitioned.

@Test
public void testLoadDFSPathPartitioned() throws Exception {
    String tableName = createTable(true, true, false);
    assertTableIsRegistered(DEFAULT_DB, tableName);
    final String loadFile = createTestDFSFile("loadDFSFile");
    String query = "load data inpath '" + loadFile + "' into table " + tableName + " partition(dt = '" + PART_FILE + "')";
    runCommand(query);
    final Set<WriteEntity> outputs = getOutputs(tableName, Entity.Type.TABLE);
    final Set<ReadEntity> inputs = getInputs(loadFile, Entity.Type.DFS_DIR);
    final Set<WriteEntity> partitionOps = new LinkedHashSet<>(outputs);
    partitionOps.addAll(getOutputs(DEFAULT_DB + "@" + tableName + "@dt=" + PART_FILE, Entity.Type.PARTITION));
    Referenceable processReference = validateProcess(constructEvent(query, HiveOperation.LOAD, inputs, partitionOps), inputs, outputs);
    validateHDFSPaths(processReference, INPUTS, loadFile);
    validateOutputTables(processReference, outputs);
    final String loadFile2 = createTestDFSFile("loadDFSFile1");
    query = "load data inpath '" + loadFile2 + "' into table " + tableName + " partition(dt = '" + PART_FILE + "')";
    runCommand(query);
    Set<ReadEntity> process2Inputs = getInputs(loadFile2, Entity.Type.DFS_DIR);
    Set<ReadEntity> expectedInputs = new LinkedHashSet<>();
    expectedInputs.addAll(process2Inputs);
    expectedInputs.addAll(inputs);
    validateProcess(constructEvent(query, HiveOperation.LOAD, expectedInputs, partitionOps), expectedInputs, outputs);
}
Also used: ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) Referenceable(org.apache.atlas.typesystem.Referenceable) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Test(org.testng.annotations.Test)
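
The second validateProcess call passes the union of both load files as expected inputs: the test expects repeated LOADs into the same table to update one process entity, accumulating inputs rather than creating a new process. A trivial illustration of how a LinkedHashSet builds that union in insertion order (the paths are illustrative placeholders):

import java.util.LinkedHashSet;
import java.util.Set;

public class UnionOrderSketch {
    public static void main(String[] args) {
        // LinkedHashSet keeps insertion order and drops duplicates, so the
        // inputs of the newest load come first, mirroring expectedInputs above.
        Set<String> expectedInputs = new LinkedHashSet<>();
        expectedInputs.add("hdfs://tmp/loadDFSFile1"); // from the second load
        expectedInputs.add("hdfs://tmp/loadDFSFile");  // from the first load
        System.out.println(expectedInputs); // [hdfs://tmp/loadDFSFile1, hdfs://tmp/loadDFSFile]
    }
}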

Aggregations

ReadEntity (org.apache.hadoop.hive.ql.hooks.ReadEntity): 75
Table (org.apache.hadoop.hive.ql.metadata.Table): 35
WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity): 34
DDLWork (org.apache.hadoop.hive.ql.plan.DDLWork): 24
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 18
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 18
ArrayList (java.util.ArrayList): 15
Referenceable (org.apache.atlas.typesystem.Referenceable): 10
LinkedHashMap (java.util.LinkedHashMap): 9
AlterTableExchangePartition (org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition): 9
HashMap (java.util.HashMap): 8
Test (org.testng.annotations.Test): 8
Path (org.apache.hadoop.fs.Path): 7
FileNotFoundException (java.io.FileNotFoundException): 6
SQLCheckConstraint (org.apache.hadoop.hive.metastore.api.SQLCheckConstraint): 5
SQLDefaultConstraint (org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint): 5
SQLNotNullConstraint (org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint): 5
SQLUniqueConstraint (org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint): 5
DefaultConstraint (org.apache.hadoop.hive.ql.metadata.DefaultConstraint): 5
InvalidTableException (org.apache.hadoop.hive.ql.metadata.InvalidTableException): 5