Example 96 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class KafkaSimpleStreamingTest method testSource.

/**
 * Tests that the source creates work units appropriately. Sets up a topic with a single partition and checks that a
 * single WorkUnit is returned with the right parameters set.
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testSource() throws IOException, InterruptedException {
    String topic = "testSimpleStreamingSource";
    _kafkaTestHelper.provisionTopic(topic);
    List<WorkUnit> lWu = getWorkUnits(topic);
    // Check we have a single WorkUnit with the right properties setup.
    Assert.assertEquals(lWu.size(), 1);
    WorkUnit wU = lWu.get(0);
    Assert.assertEquals(KafkaSimpleStreamingSource.getTopicNameFromState(wU), topic);
    Assert.assertEquals(KafkaSimpleStreamingSource.getPartitionIdFromState(wU), 0);
}
Also used: WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)
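
The getWorkUnits(topic) helper is not shown in the example. A minimal sketch of what it might look like, assuming the source can be instantiated directly and configured through a SourceState; the broker property name, topic key, and getKafkaServerPort() accessor are assumptions, not the test's actual names:

private List<WorkUnit> getWorkUnits(String topic) {
    SourceState state = new SourceState();
    // Hypothetical property names; the real test wires these through its own config keys.
    state.setProp("kafka.brokers", "localhost:" + _kafkaTestHelper.getKafkaServerPort());
    state.setProp("gobblin.streaming.kafka.topic.singleton", topic);
    // Source#getWorkunits(SourceState) is Gobblin's standard entry point for work-unit creation.
    return new KafkaSimpleStreamingSource().getWorkunits(state);
}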

Example 97 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class HivePurgerPublisher method submitEvent.

private void submitEvent(WorkUnitState state, String name) {
    WorkUnit workUnit = state.getWorkunit();
    Map<String, String> metadata = new HashMap<>();
    String recordsRead = state.getProp(ComplianceConfigurationKeys.NUM_ROWS);
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSREAD, recordsRead);
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESREAD, getDataSize(workUnit.getProp(ComplianceConfigurationKeys.RAW_DATA_SIZE), workUnit.getProp(ComplianceConfigurationKeys.TOTAL_SIZE)));
    String partitionNameProp = workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME);
    // Partition names are expected to be of the form db@table@partition.
    Splitter atSplitter = Splitter.on("@").omitEmptyStrings().trimResults();
    List<String> namesList = atSplitter.splitToList(partitionNameProp);
    if (namesList.size() != 3) {
        log.warn("Not submitting event. Invalid partition name: " + partitionNameProp);
        return;
    }
    String dbName = namesList.get(0), tableName = namesList.get(1), partitionName = namesList.get(2);
    org.apache.hadoop.hive.metastore.api.Partition apiPartition = null;
    Partition qlPartition = null;
    try {
        Table table = new Table(this.client.getTable(dbName, tableName));
        apiPartition = this.client.getPartition(dbName, tableName, partitionName);
        qlPartition = new Partition(table, apiPartition);
    } catch (Exception e) {
        // Log the exception through the logger instead of printing the stack trace to stderr.
        log.warn("Not submitting event. Failed to resolve partition '" + partitionName + "'", e);
        return;
    }
    HivePartitionDataset hivePartitionDataset = new HivePartitionDataset(qlPartition);
    String recordsWritten = DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.NUM_ROWS, ComplianceConfigurationKeys.DEFAULT_NUM_ROWS);
    String recordsPurged = Long.toString((Long.parseLong(recordsRead) - Long.parseLong(recordsWritten)));
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSWRITTEN, recordsWritten);
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESWRITTEN, getDataSize(DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.RAW_DATA_SIZE, ComplianceConfigurationKeys.DEFAULT_RAW_DATA_SIZE), DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.TOTAL_SIZE, ComplianceConfigurationKeys.DEFAULT_TOTAL_SIZE)));
    metadata.put(DatasetMetrics.DATABASE_NAME, hivePartitionDataset.getDbName());
    metadata.put(DatasetMetrics.TABLE_NAME, hivePartitionDataset.getTableName());
    metadata.put(DatasetMetrics.PARTITION_NAME, hivePartitionDataset.getName());
    metadata.put(DatasetMetrics.RECORDS_PURGED, recordsPurged);
    this.eventSubmitter.submit(name, metadata);
}
Also used: Partition(org.apache.hadoop.hive.ql.metadata.Partition) Splitter(com.google.common.base.Splitter) Table(org.apache.hadoop.hive.ql.metadata.Table) HashMap(java.util.HashMap) TException(org.apache.thrift.TException) HivePartitionDataset(org.apache.gobblin.compliance.HivePartitionDataset) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
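
The method relies on partition names following the db@table@partition convention enforced by the three-way split. A self-contained illustration of the Splitter behavior it depends on (the partition name below is made up for the demo):

import com.google.common.base.Splitter;
import java.util.List;

public class PartitionNameSplitDemo {
    public static void main(String[] args) {
        Splitter atSplitter = Splitter.on("@").omitEmptyStrings().trimResults();
        // Splits into exactly three parts, so submitEvent would proceed for this name.
        List<String> parts = atSplitter.splitToList("tracking@events@datepartition=2018-01-01");
        // Prints [tracking, events, datepartition=2018-01-01]
        System.out.println(parts);
    }
}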

Example 98 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class HivePurgerSource method createWorkunitsFromPreviousState.

/**
 * Adds failed work units to the workUnitMap, keyed by partition name.
 * New work units are created using the required configuration from the old work unit.
 */
protected void createWorkunitsFromPreviousState(SourceState state) {
    if (this.lowWatermark.equalsIgnoreCase(ComplianceConfigurationKeys.NO_PREVIOUS_WATERMARK)) {
        return;
    }
    if (Iterables.isEmpty(state.getPreviousWorkUnitStates())) {
        return;
    }
    for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
        if (workUnitState.getWorkingState() == WorkUnitState.WorkingState.COMMITTED) {
            continue;
        }
        WorkUnit workUnit = workUnitState.getWorkunit();
        Preconditions.checkArgument(workUnit.contains(ComplianceConfigurationKeys.PARTITION_NAME), "Older WorkUnit doesn't contain property partition name.");
        int executionAttempts = workUnit.getPropAsInt(ComplianceConfigurationKeys.EXECUTION_ATTEMPTS, ComplianceConfigurationKeys.DEFAULT_EXECUTION_ATTEMPTS);
        if (executionAttempts < this.maxWorkUnitExecutionAttempts) {
            Optional<WorkUnit> workUnitOptional = createNewWorkUnit(workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME), ++executionAttempts);
            if (!workUnitOptional.isPresent()) {
                continue;
            }
            workUnit = workUnitOptional.get();
            log.info("Revived old Work Unit for partiton " + workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME) + " having execution attempt " + workUnit.getProp(ComplianceConfigurationKeys.EXECUTION_ATTEMPTS));
            workUnitMap.put(workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME), workUnit);
        }
    }
}
Also used: WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
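
The Optional-returning createNewWorkUnit(String, int) overload called above is not shown. A plausible sketch, assuming it resolves the partition name back to a dataset (the datasetMap field is hypothetical) and delegates to the two-argument overload in Example 99; Guava's Optional is used here, matching the isPresent() call above:

protected Optional<WorkUnit> createNewWorkUnit(String partitionName, int executionAttempts) {
    // Hypothetical lookup; the real class may resolve datasets differently.
    HivePartitionDataset dataset = this.datasetMap.get(partitionName);
    if (dataset == null) {
        // Partition no longer exists (for example, already purged), so skip revival.
        return Optional.absent();
    }
    return Optional.of(createNewWorkUnit(dataset, executionAttempts));
}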

Example 99 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class HivePurgerSource method createNewWorkUnit.

protected WorkUnit createNewWorkUnit(HivePartitionDataset dataset, int executionAttempts) {
    WorkUnit workUnit = WorkUnit.createEmpty();
    workUnit.setProp(ComplianceConfigurationKeys.PARTITION_NAME, dataset.datasetURN());
    workUnit.setProp(ComplianceConfigurationKeys.EXECUTION_ATTEMPTS, executionAttempts);
    workUnit.setProp(ComplianceConfigurationKeys.TIMESTAMP, this.timeStamp);
    workUnit.setProp(ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_SHOULD_PROXY, this.shouldProxy);
    workUnit.setProp(ComplianceConfigurationKeys.EXECUTION_COUNT, this.executionCount);
    workUnit.setProp(ComplianceConfigurationKeys.NUM_ROWS, DatasetUtils.getProperty(dataset, ComplianceConfigurationKeys.NUM_ROWS, ComplianceConfigurationKeys.DEFAULT_NUM_ROWS));
    workUnit.setProp(ComplianceConfigurationKeys.RAW_DATA_SIZE, DatasetUtils.getProperty(dataset, ComplianceConfigurationKeys.RAW_DATA_SIZE, ComplianceConfigurationKeys.DEFAULT_RAW_DATA_SIZE));
    workUnit.setProp(ComplianceConfigurationKeys.TOTAL_SIZE, DatasetUtils.getProperty(dataset, ComplianceConfigurationKeys.TOTAL_SIZE, ComplianceConfigurationKeys.DEFAULT_TOTAL_SIZE));
    submitWorkUnitGeneratedEvent(dataset.datasetURN(), executionAttempts);
    return workUnit;
}
Also used: WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
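
The properties stamped onto the WorkUnit here are what HivePurgerPublisher.submitEvent (Example 97) later reads back. An illustrative round trip, assuming WorkUnitState exposes a constructor taking a WorkUnit (present in Gobblin, though the exact signature may vary by version):

// Illustrative only: shows the property round trip between source and publisher.
WorkUnit workUnit = createNewWorkUnit(dataset, 1);
WorkUnitState workUnitState = new WorkUnitState(workUnit);
// WorkUnitState#getProp falls back to the underlying WorkUnit's properties.
String partitionName = workUnitState.getProp(ComplianceConfigurationKeys.PARTITION_NAME);
String numRows = workUnitState.getProp(ComplianceConfigurationKeys.NUM_ROWS);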

Example 100 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class HivePurgerSource method createWorkUnits.

/**
 * Creates the list of all work units needed for the current execution. A fresh work unit is
 * created for each partition starting from the watermark, and failed work units from the
 * previous run are added to the list.
 */
protected void createWorkUnits(SourceState state) throws IOException {
    createWorkunitsFromPreviousState(state);
    if (this.datasets.isEmpty()) {
        return;
    }
    for (HivePartitionDataset dataset : this.datasets) {
        Optional<String> owner = dataset.getOwner();
        if (workUnitsExceeded()) {
            log.info("Workunits exceeded");
            setJobWatermark(state, dataset.datasetURN());
            return;
        }
        if (!this.policy.shouldPurge(dataset)) {
            continue;
        }
        WorkUnit workUnit = createNewWorkUnit(dataset);
        log.info("Created new work unit with partition " + workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME));
        this.workUnitMap.put(workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME), workUnit);
        this.workUnitsCreatedCount++;
    }
    if (!state.contains(ComplianceConfigurationKeys.HIVE_PURGER_WATERMARK)) {
        this.setJobWatermark(state, ComplianceConfigurationKeys.NO_PREVIOUS_WATERMARK);
    }
}
Also used: HivePartitionDataset(org.apache.gobblin.compliance.HivePartitionDataset) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
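
The workUnitsExceeded() guard is not shown in the example. A minimal sketch, assuming the class caps work-unit creation against a configured maximum (the maxWorkUnits field name is an assumption):

private boolean workUnitsExceeded() {
    // Compares the running count incremented in createWorkUnits against the configured cap.
    return this.workUnitsCreatedCount >= this.maxWorkUnits;
}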

Aggregations

WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 133
Test (org.testng.annotations.Test): 59
SourceState (org.apache.gobblin.configuration.SourceState): 40
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 40
MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 35
Extract (org.apache.gobblin.source.workunit.Extract): 24
Path (org.apache.hadoop.fs.Path): 19
State (org.apache.gobblin.configuration.State): 13
IOException (java.io.IOException): 11
ArrayList (java.util.ArrayList): 10
Closer (com.google.common.io.Closer): 9
Properties (java.util.Properties): 9
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 8
List (java.util.List): 7
Table (org.apache.hadoop.hive.ql.metadata.Table): 7
ImmutableMap (com.google.common.collect.ImmutableMap): 6
Config (com.typesafe.config.Config): 6
File (java.io.File): 6
IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 6
WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6