Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
From the class KafkaSimpleStreamingTest, method testSource.
/**
 * Tests that the source creates work units appropriately. Sets up a topic with a single partition and checks that a
 * single WorkUnit is returned with the right parameters set.
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testSource() throws IOException, InterruptedException {
  String topic = "testSimpleStreamingSource";
  _kafkaTestHelper.provisionTopic(topic);
  List<WorkUnit> lWu = getWorkUnits(topic);
  // Check that we get a single WorkUnit with the right properties set.
  Assert.assertEquals(lWu.size(), 1);
  WorkUnit wU = lWu.get(0);
  Assert.assertEquals(KafkaSimpleStreamingSource.getTopicNameFromState(wU), topic);
  Assert.assertEquals(KafkaSimpleStreamingSource.getPartitionIdFromState(wU), 0);
}
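The getWorkUnits(topic) helper is not shown in this snippet. A minimal sketch of what it could look like, assuming the test configures the source through SourceState properties; the property keys and the _kafkaTestHelper port accessor below are assumptions, not the exact constants used by KafkaSimpleStreamingSource:

// Hypothetical reconstruction of the test helper; key names are assumed.
private List<WorkUnit> getWorkUnits(String topic) {
  SourceState state = new SourceState();
  // Point the source at the embedded Kafka broker and the provisioned topic.
  state.setProp("kafka.brokers", "localhost:" + _kafkaTestHelper.getKafkaServerPort()); // accessor assumed
  state.setProp("topic.name", topic); // key name assumed
  KafkaSimpleStreamingSource source = new KafkaSimpleStreamingSource();
  // Source#getWorkunits(SourceState) is the standard Gobblin entry point for work unit creation.
  return source.getWorkunits(state);
}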
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
From the class HivePurgerPublisher, method submitEvent.
private void submitEvent(WorkUnitState state, String name) {
  WorkUnit workUnit = state.getWorkunit();
  Map<String, String> metadata = new HashMap<>();
  String recordsRead = state.getProp(ComplianceConfigurationKeys.NUM_ROWS);
  metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSREAD, recordsRead);
  metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESREAD,
      getDataSize(workUnit.getProp(ComplianceConfigurationKeys.RAW_DATA_SIZE),
          workUnit.getProp(ComplianceConfigurationKeys.TOTAL_SIZE)));
  String partitionNameProp = workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME);
  Splitter atSplitter = Splitter.on("@").omitEmptyStrings().trimResults();
  List<String> namesList = atSplitter.splitToList(partitionNameProp);
  if (namesList.size() != 3) {
    log.warn("Not submitting event. Invalid partition name: " + partitionNameProp);
    return;
  }
  String dbName = namesList.get(0), tableName = namesList.get(1), partitionName = namesList.get(2);
  org.apache.hadoop.hive.metastore.api.Partition apiPartition = null;
  Partition qlPartition = null;
  try {
    Table table = new Table(this.client.getTable(dbName, tableName));
    apiPartition = this.client.getPartition(dbName, tableName, partitionName);
    qlPartition = new Partition(table, apiPartition);
  } catch (Exception e) {
    log.warn("Not submitting event. Failed to resolve partition '" + partitionName + "'", e);
    return;
  }
  HivePartitionDataset hivePartitionDataset = new HivePartitionDataset(qlPartition);
  String recordsWritten = DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.NUM_ROWS,
      ComplianceConfigurationKeys.DEFAULT_NUM_ROWS);
  String recordsPurged = Long.toString(Long.parseLong(recordsRead) - Long.parseLong(recordsWritten));
  metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSWRITTEN, recordsWritten);
  metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESWRITTEN,
      getDataSize(DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.RAW_DATA_SIZE,
              ComplianceConfigurationKeys.DEFAULT_RAW_DATA_SIZE),
          DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.TOTAL_SIZE,
              ComplianceConfigurationKeys.DEFAULT_TOTAL_SIZE)));
  metadata.put(DatasetMetrics.DATABASE_NAME, hivePartitionDataset.getDbName());
  metadata.put(DatasetMetrics.TABLE_NAME, hivePartitionDataset.getTableName());
  metadata.put(DatasetMetrics.PARTITION_NAME, hivePartitionDataset.getName());
  metadata.put(DatasetMetrics.RECORDS_PURGED, recordsPurged);
  this.eventSubmitter.submit(name, metadata);
}
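The event metadata hinges on the "db@table@partition" naming convention checked above. A standalone sketch of just that parsing step; the sample partition name is made up:

import java.util.List;

import com.google.common.base.Splitter;

public class PartitionNameDemo {
  public static void main(String[] args) {
    // HivePurgerPublisher expects names of the form "db@table@partition".
    String partitionNameProp = "tracking@pageViews@datepartition=2018-01-01"; // sample value, made up
    List<String> parts = Splitter.on("@").omitEmptyStrings().trimResults().splitToList(partitionNameProp);
    if (parts.size() != 3) {
      System.err.println("Invalid partition name: " + partitionNameProp);
      return;
    }
    System.out.println("db=" + parts.get(0) + ", table=" + parts.get(1) + ", partition=" + parts.get(2));
  }
}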
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
From the class HivePurgerSource, method createWorkunitsFromPreviousState.
/**
 * Adds failed work units from the previous run to the workUnitMap, keyed by partition name.
 * New work units are created using the required configuration from the old work units.
 */
protected void createWorkunitsFromPreviousState(SourceState state) {
  if (this.lowWatermark.equalsIgnoreCase(ComplianceConfigurationKeys.NO_PREVIOUS_WATERMARK)) {
    return;
  }
  if (Iterables.isEmpty(state.getPreviousWorkUnitStates())) {
    return;
  }
  for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
    if (workUnitState.getWorkingState() == WorkUnitState.WorkingState.COMMITTED) {
      continue;
    }
    WorkUnit workUnit = workUnitState.getWorkunit();
    Preconditions.checkArgument(workUnit.contains(ComplianceConfigurationKeys.PARTITION_NAME),
        "Older WorkUnit doesn't contain property partition name.");
    int executionAttempts = workUnit.getPropAsInt(ComplianceConfigurationKeys.EXECUTION_ATTEMPTS,
        ComplianceConfigurationKeys.DEFAULT_EXECUTION_ATTEMPTS);
    if (executionAttempts < this.maxWorkUnitExecutionAttempts) {
      Optional<WorkUnit> workUnitOptional =
          createNewWorkUnit(workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME), ++executionAttempts);
      if (!workUnitOptional.isPresent()) {
        continue;
      }
      workUnit = workUnitOptional.get();
      log.info("Revived old work unit for partition " + workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME)
          + " with execution attempt " + workUnit.getProp(ComplianceConfigurationKeys.EXECUTION_ATTEMPTS));
      workUnitMap.put(workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME), workUnit);
    }
  }
}
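The revive logic caps retries through the EXECUTION_ATTEMPTS property stored on each work unit. A self-contained sketch of that round trip, using plain string keys as stand-ins for the ComplianceConfigurationKeys constants:

import org.apache.gobblin.source.workunit.WorkUnit;

public class RetryGateDemo {
  public static void main(String[] args) {
    final String EXECUTION_ATTEMPTS = "execution.attempts"; // stand-in key
    final int maxWorkUnitExecutionAttempts = 3; // stand-in for the configured maximum

    // A work unit left over from a previous run, with its attempt count recorded as a property.
    WorkUnit workUnit = WorkUnit.createEmpty();
    workUnit.setProp(EXECUTION_ATTEMPTS, 2);

    int executionAttempts = workUnit.getPropAsInt(EXECUTION_ATTEMPTS, 1);
    if (executionAttempts < maxWorkUnitExecutionAttempts) {
      // The source would build a replacement work unit carrying executionAttempts + 1.
      System.out.println("Reviving work unit, attempt " + (executionAttempts + 1));
    } else {
      System.out.println("Dropping work unit after " + executionAttempts + " attempts");
    }
  }
}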
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
From the class HivePurgerSource, method createNewWorkUnit.
protected WorkUnit createNewWorkUnit(HivePartitionDataset dataset, int executionAttempts) {
  WorkUnit workUnit = WorkUnit.createEmpty();
  workUnit.setProp(ComplianceConfigurationKeys.PARTITION_NAME, dataset.datasetURN());
  workUnit.setProp(ComplianceConfigurationKeys.EXECUTION_ATTEMPTS, executionAttempts);
  workUnit.setProp(ComplianceConfigurationKeys.TIMESTAMP, this.timeStamp);
  workUnit.setProp(ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_SHOULD_PROXY, this.shouldProxy);
  workUnit.setProp(ComplianceConfigurationKeys.EXECUTION_COUNT, this.executionCount);
  workUnit.setProp(ComplianceConfigurationKeys.NUM_ROWS,
      DatasetUtils.getProperty(dataset, ComplianceConfigurationKeys.NUM_ROWS, ComplianceConfigurationKeys.DEFAULT_NUM_ROWS));
  workUnit.setProp(ComplianceConfigurationKeys.RAW_DATA_SIZE,
      DatasetUtils.getProperty(dataset, ComplianceConfigurationKeys.RAW_DATA_SIZE, ComplianceConfigurationKeys.DEFAULT_RAW_DATA_SIZE));
  workUnit.setProp(ComplianceConfigurationKeys.TOTAL_SIZE,
      DatasetUtils.getProperty(dataset, ComplianceConfigurationKeys.TOTAL_SIZE, ComplianceConfigurationKeys.DEFAULT_TOTAL_SIZE));
  submitWorkUnitGeneratedEvent(dataset.datasetURN(), executionAttempts);
  return workUnit;
}
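Everything createNewWorkUnit sets here travels to the task through the WorkUnit's properties. A small sketch of that handoff with stand-in key names; Gobblin wraps each WorkUnit in a WorkUnitState before the extractor and publisher see it:

import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.workunit.WorkUnit;

public class WorkUnitHandoffDemo {
  public static void main(String[] args) {
    final String PARTITION_NAME = "hive.purger.partition.name"; // stand-in key
    final String EXECUTION_ATTEMPTS = "hive.purger.execution.attempts"; // stand-in key

    WorkUnit workUnit = WorkUnit.createEmpty();
    workUnit.setProp(PARTITION_NAME, "db@table@datepartition=2018-01-01"); // sample URN, made up
    workUnit.setProp(EXECUTION_ATTEMPTS, 1);

    // Properties set by the source remain readable from the WorkUnitState.
    WorkUnitState state = new WorkUnitState(workUnit);
    System.out.println(state.getProp(PARTITION_NAME));
    System.out.println(state.getPropAsInt(EXECUTION_ATTEMPTS, 1));
  }
}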
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
From the class HivePurgerSource, method createWorkUnits.
/**
 * This method creates the list of all work units needed for the current execution.
 * Fresh work units are created for each partition starting from the watermark, and failed work units from the
 * previous run are added to the list.
 */
protected void createWorkUnits(SourceState state) throws IOException {
  createWorkunitsFromPreviousState(state);
  if (this.datasets.isEmpty()) {
    return;
  }
  for (HivePartitionDataset dataset : this.datasets) {
    Optional<String> owner = dataset.getOwner();
    if (workUnitsExceeded()) {
      log.info("Work unit limit exceeded");
      setJobWatermark(state, dataset.datasetURN());
      return;
    }
    if (!this.policy.shouldPurge(dataset)) {
      continue;
    }
    WorkUnit workUnit = createNewWorkUnit(dataset);
    log.info("Created new work unit for partition " + workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME));
    this.workUnitMap.put(workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME), workUnit);
    this.workUnitsCreatedCount++;
  }
  if (!state.contains(ComplianceConfigurationKeys.HIVE_PURGER_WATERMARK)) {
    this.setJobWatermark(state, ComplianceConfigurationKeys.NO_PREVIOUS_WATERMARK);
  }
}
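workUnitsExceeded() is referenced but not shown. A plausible sketch of the cap check, assuming a maxWorkUnitsPerJob field (name assumed); once the cap is hit, the loop above records the current dataset URN as the job watermark and stops, so the next run resumes from that dataset:

// Hypothetical reconstruction; the field name maxWorkUnitsPerJob is assumed.
private boolean workUnitsExceeded() {
  return this.workUnitsCreatedCount >= this.maxWorkUnitsPerJob;
}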