use of org.apache.gobblin.source.workunit.WorkUnitStream in project incubator-gobblin by apache.
the class AbstractJobLauncher method launchJob.
@Override
public void launchJob(JobListener jobListener) throws JobException {
  String jobId = this.jobContext.getJobId();
  final JobState jobState = this.jobContext.getJobState();
  try {
    MDC.put(ConfigurationKeys.JOB_NAME_KEY, this.jobContext.getJobName());
    MDC.put(ConfigurationKeys.JOB_KEY_KEY, this.jobContext.getJobKey());
    TimingEvent launchJobTimer = this.eventSubmitter.getTimingEvent(TimingEvent.LauncherTimings.FULL_JOB_EXECUTION);
    try (Closer closer = Closer.create()) {
      closer.register(this.jobContext);
      notifyListeners(this.jobContext, jobListener, TimingEvent.LauncherTimings.JOB_PREPARE, new JobListenerAction() {
        @Override
        public void apply(JobListener jobListener, JobContext jobContext) throws Exception {
          jobListener.onJobPrepare(jobContext);
        }
      });
      if (this.jobContext.getSemantics() == DeliverySemantics.EXACTLY_ONCE) {
        // If exactly-once delivery is used, commit sequences of the previous run must be
        // successfully completed before this run can make progress.
        executeUnfinishedCommitSequences(jobState.getJobName());
      }
      TimingEvent workUnitsCreationTimer = this.eventSubmitter.getTimingEvent(TimingEvent.LauncherTimings.WORK_UNITS_CREATION);
      Source<?, ?> source = this.jobContext.getSource();
      WorkUnitStream workUnitStream;
      if (source instanceof WorkUnitStreamSource) {
        workUnitStream = ((WorkUnitStreamSource) source).getWorkunitStream(jobState);
      } else {
        workUnitStream = new BasicWorkUnitStream.Builder(source.getWorkunits(jobState)).build();
      }
      workUnitsCreationTimer.stop(this.eventMetadataGenerator.getMetadata(this.jobContext, EventName.WORK_UNITS_CREATION));
      // The absence of work units means something went wrong while getting them
      if (workUnitStream == null || workUnitStream.getWorkUnits() == null) {
        this.eventSubmitter.submit(JobEvent.WORK_UNITS_MISSING);
        jobState.setState(JobState.RunningState.FAILED);
        throw new JobException("Failed to get work units for job " + jobId);
      }
      // No work units to run
      if (!workUnitStream.getWorkUnits().hasNext()) {
        this.eventSubmitter.submit(JobEvent.WORK_UNITS_EMPTY);
        LOG.warn("No work units have been created for job " + jobId);
        jobState.setState(JobState.RunningState.COMMITTED);
        notifyListeners(this.jobContext, jobListener, TimingEvent.LauncherTimings.JOB_COMPLETE, new JobListenerAction() {
          @Override
          public void apply(JobListener jobListener, JobContext jobContext) throws Exception {
            jobListener.onJobCompletion(jobContext);
          }
        });
        return;
      }
      // Initialize the writer and converter(s)
      closer.register(WriterInitializerFactory.newInstace(jobState, workUnitStream)).initialize();
      closer.register(ConverterInitializerFactory.newInstance(jobState, workUnitStream)).initialize();
      TimingEvent stagingDataCleanTimer = this.eventSubmitter.getTimingEvent(TimingEvent.RunJobTimings.MR_STAGING_DATA_CLEAN);
      // Clean up left-over staging data possibly from the previous run. This is particularly
      // important if the current batch of WorkUnits includes failed WorkUnits from the previous
      // run which may still have left-over staging data not yet cleaned up.
      cleanLeftoverStagingData(workUnitStream, jobState);
      stagingDataCleanTimer.stop(this.eventMetadataGenerator.getMetadata(this.jobContext, EventName.MR_STAGING_DATA_CLEAN));
      long startTime = System.currentTimeMillis();
      jobState.setStartTime(startTime);
      jobState.setState(JobState.RunningState.RUNNING);
      try {
        LOG.info("Starting job " + jobId);
        notifyListeners(this.jobContext, jobListener, TimingEvent.LauncherTimings.JOB_START, new JobListenerAction() {
          @Override
          public void apply(JobListener jobListener, JobContext jobContext) throws Exception {
            jobListener.onJobStart(jobContext);
          }
        });
        TimingEvent workUnitsPreparationTimer = this.eventSubmitter.getTimingEvent(TimingEvent.LauncherTimings.WORK_UNITS_PREPARATION);
        // Add task ids
        workUnitStream = prepareWorkUnits(workUnitStream, jobState);
        // Remove skipped work units from the set of work units to execute
        workUnitStream = workUnitStream.filter(new SkippedWorkUnitsFilter(jobState));
        // Add surviving tasks to the job state
        workUnitStream = workUnitStream.transform(new MultiWorkUnitForEach() {
          @Override
          public void forWorkUnit(WorkUnit workUnit) {
            jobState.incrementTaskCount();
            jobState.addTaskState(new TaskState(new WorkUnitState(workUnit, jobState)));
          }
        });
        workUnitsPreparationTimer.stop(this.eventMetadataGenerator.getMetadata(this.jobContext, EventName.WORK_UNITS_PREPARATION));
        // Write job execution info to the job history store before the job starts to run
        this.jobContext.storeJobExecutionInfo();
        TimingEvent jobRunTimer = this.eventSubmitter.getTimingEvent(TimingEvent.LauncherTimings.JOB_RUN);
        // Start the job and wait for it to finish
        runWorkUnitStream(workUnitStream);
        jobRunTimer.stop(this.eventMetadataGenerator.getMetadata(this.jobContext, EventName.JOB_RUN));
        this.eventSubmitter.submit(CaseFormat.UPPER_UNDERSCORE.to(CaseFormat.UPPER_CAMEL, "JOB_" + jobState.getState()));
        // Check and set the final job state upon job completion
        if (jobState.getState() == JobState.RunningState.CANCELLED) {
          LOG.info(String.format("Job %s has been cancelled, aborting now", jobId));
          return;
        }
        TimingEvent jobCommitTimer = this.eventSubmitter.getTimingEvent(TimingEvent.LauncherTimings.JOB_COMMIT);
        this.jobContext.finalizeJobStateBeforeCommit();
        this.jobContext.commit();
        postProcessJobState(jobState);
        jobCommitTimer.stop(this.eventMetadataGenerator.getMetadata(this.jobContext, EventName.JOB_COMMIT));
      } finally {
        long endTime = System.currentTimeMillis();
        jobState.setEndTime(endTime);
        jobState.setDuration(endTime - jobState.getStartTime());
      }
    } catch (Throwable t) {
      jobState.setState(JobState.RunningState.FAILED);
      String errMsg = "Failed to launch and run job " + jobId;
      LOG.error(errMsg + ": " + t, t);
    } finally {
      try {
        TimingEvent jobCleanupTimer = this.eventSubmitter.getTimingEvent(TimingEvent.LauncherTimings.JOB_CLEANUP);
        cleanupStagingData(jobState);
        jobCleanupTimer.stop(this.eventMetadataGenerator.getMetadata(this.jobContext, EventName.JOB_CLEANUP));
        // Write job execution info to the job history store upon job termination
        this.jobContext.storeJobExecutionInfo();
      } finally {
        launchJobTimer.stop(this.eventMetadataGenerator.getMetadata(this.jobContext, EventName.FULL_JOB_EXECUTION));
      }
    }
    for (JobState.DatasetState datasetState : this.jobContext.getDatasetStatesByUrns().values()) {
      // Set the overall job state to FAILED if the job failed to process any dataset
      if (datasetState.getState() == JobState.RunningState.FAILED) {
        jobState.setState(JobState.RunningState.FAILED);
        LOG.warn("At least one dataset state is FAILED. Setting job state to FAILED.");
        break;
      }
    }
    notifyListeners(this.jobContext, jobListener, TimingEvent.LauncherTimings.JOB_COMPLETE, new JobListenerAction() {
      @Override
      public void apply(JobListener jobListener, JobContext jobContext) throws Exception {
        jobListener.onJobCompletion(jobContext);
      }
    });
    if (jobState.getState() == JobState.RunningState.FAILED) {
      notifyListeners(this.jobContext, jobListener, TimingEvent.LauncherTimings.JOB_FAILED, new JobListenerAction() {
        @Override
        public void apply(JobListener jobListener, JobContext jobContext) throws Exception {
          jobListener.onJobFailure(jobContext);
        }
      });
      throw new JobException(String.format("Job %s failed", jobId));
    }
  } finally {
    // Stop metrics reporting
    if (this.jobContext.getJobMetricsOptional().isPresent()) {
      JobMetrics.remove(jobState);
    }
    MDC.remove(ConfigurationKeys.JOB_NAME_KEY);
    MDC.remove(ConfigurationKeys.JOB_KEY_KEY);
  }
}
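The launcher accepts either kind of source: a WorkUnitStreamSource hands back a WorkUnitStream directly, while a plain Source's work-unit list is wrapped in a BasicWorkUnitStream, after which the stream can be filtered and transformed lazily. A minimal standalone sketch of that wrap-and-filter pattern, assuming (as the SkippedWorkUnitsFilter usage above suggests) that WorkUnitStream.filter accepts a Guava Predicate; the predicate body is an illustrative placeholder:

import java.util.List;

import com.google.common.base.Predicate;
import com.google.common.collect.Lists;

import org.apache.gobblin.source.workunit.BasicWorkUnitStream;
import org.apache.gobblin.source.workunit.WorkUnit;
import org.apache.gobblin.source.workunit.WorkUnitStream;

public class WorkUnitStreamSketch {
  public static void main(String[] args) {
    // Wrap an eagerly materialized list of work units in a stream,
    // mirroring the launcher's else-branch for plain Source implementations.
    List<WorkUnit> units = Lists.newArrayList(WorkUnit.createEmpty(), WorkUnit.createEmpty());
    WorkUnitStream stream = new BasicWorkUnitStream.Builder(units).build();

    // Filter the stream the same way the launcher drops skipped work units;
    // this placeholder predicate simply keeps everything.
    stream = stream.filter(new Predicate<WorkUnit>() {
      @Override
      public boolean apply(WorkUnit workUnit) {
        return workUnit != null;
      }
    });

    // Consume the (possibly lazy) iterator of work units.
    stream.getWorkUnits().forEachRemaining(workUnit -> System.out.println(workUnit));
  }
}

The stream abstraction lets a source hand work units to the launcher lazily instead of materializing them all up front, which matters for jobs with very large numbers of work units.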
use of org.apache.gobblin.source.workunit.WorkUnitStream in project incubator-gobblin by apache.
the class DatasetFinderSourceTest method testDrilledDown.
@Test
public void testDrilledDown() {
  Dataset dataset1 = new SimpleDatasetForTesting("dataset1");
  Dataset dataset2 = new SimplePartitionableDatasetForTesting("dataset2",
      Lists.newArrayList(new SimpleDatasetPartitionForTesting("p1"), new SimpleDatasetPartitionForTesting("p2")));
  Dataset dataset3 = new SimpleDatasetForTesting("dataset3");
  IterableDatasetFinder finder = new StaticDatasetsFinderForTesting(Lists.newArrayList(dataset1, dataset2, dataset3));
  MySource mySource = new MySource(true, finder);

  List<WorkUnit> workUnits = mySource.getWorkunits(new SourceState());
  Assert.assertEquals(workUnits.size(), 4);
  Assert.assertEquals(workUnits.get(0).getProp(DATASET_URN), "dataset1");
  Assert.assertNull(workUnits.get(0).getProp(PARTITION_URN));
  Assert.assertEquals(workUnits.get(1).getProp(DATASET_URN), "dataset2");
  Assert.assertEquals(workUnits.get(1).getProp(PARTITION_URN), "p1");
  Assert.assertEquals(workUnits.get(2).getProp(DATASET_URN), "dataset2");
  Assert.assertEquals(workUnits.get(2).getProp(PARTITION_URN), "p2");
  Assert.assertEquals(workUnits.get(3).getProp(DATASET_URN), "dataset3");
  Assert.assertNull(workUnits.get(3).getProp(PARTITION_URN));

  WorkUnitStream workUnitStream = mySource.getWorkunitStream(new SourceState());
  Assert.assertEquals(Lists.newArrayList(workUnitStream.getWorkUnits()), workUnits);
}
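With drill-down enabled (the true constructor flag), the partitionable dataset2 expands into one work unit per partition while dataset1 and dataset3 each yield a single unit, hence four work units from three datasets. A hypothetical sketch of how such work units might be stamped, using the test's DATASET_URN and PARTITION_URN property keys:

// Hypothetical sketch of the per-partition expansion drill-down implies;
// not the actual source internals.
WorkUnit partitionWorkUnit = WorkUnit.createEmpty();
partitionWorkUnit.setProp(DATASET_URN, "dataset2");
partitionWorkUnit.setProp(PARTITION_URN, "p1");

// A non-partitionable dataset gets a single work unit with no partition URN.
WorkUnit datasetWorkUnit = WorkUnit.createEmpty();
datasetWorkUnit.setProp(DATASET_URN, "dataset1");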
use of org.apache.gobblin.source.workunit.WorkUnitStream in project incubator-gobblin by apache.
the class LoopingDatasetFinderSourceTest method testNonDrilldown.
@Test
public void testNonDrilldown() {
  Dataset dataset1 = new SimpleDatasetForTesting("dataset1");
  Dataset dataset2 = new SimplePartitionableDatasetForTesting("dataset2",
      Lists.newArrayList(new SimpleDatasetPartitionForTesting("p1"), new SimpleDatasetPartitionForTesting("p2")));
  Dataset dataset3 = new SimpleDatasetForTesting("dataset3");
  Dataset dataset4 = new SimpleDatasetForTesting("dataset4");
  Dataset dataset5 = new SimpleDatasetForTesting("dataset5");
  IterableDatasetFinder finder =
      new StaticDatasetsFinderForTesting(Lists.newArrayList(dataset5, dataset4, dataset3, dataset2, dataset1));
  MySource mySource = new MySource(false, finder);

  SourceState sourceState = new SourceState();
  sourceState.setProp(LoopingDatasetFinderSource.MAX_WORK_UNITS_PER_RUN_KEY, 3);

  WorkUnitStream workUnitStream = mySource.getWorkunitStream(sourceState);
  List<WorkUnit> workUnits = Lists.newArrayList(workUnitStream.getWorkUnits());
  Assert.assertEquals(workUnits.size(), 3);
  Assert.assertEquals(workUnits.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset1");
  Assert.assertNull(workUnits.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN));
  Assert.assertEquals(workUnits.get(1).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
  Assert.assertNull(workUnits.get(1).getProp(DatasetFinderSourceTest.PARTITION_URN));
  Assert.assertEquals(workUnits.get(2).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset3");
  Assert.assertNull(workUnits.get(2).getProp(DatasetFinderSourceTest.PARTITION_URN));

  // The second run should continue where the first left off
  List<WorkUnitState> workUnitStates = workUnits.stream().map(WorkUnitState::new).collect(Collectors.toList());
  SourceState sourceStateSpy = Mockito.spy(sourceState);
  Mockito.doReturn(workUnitStates).when(sourceStateSpy).getPreviousWorkUnitStates();
  workUnitStream = mySource.getWorkunitStream(sourceStateSpy);
  workUnits = Lists.newArrayList(workUnitStream.getWorkUnits());
  Assert.assertEquals(workUnits.size(), 3);
  Assert.assertEquals(workUnits.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset4");
  Assert.assertNull(workUnits.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN));
  Assert.assertEquals(workUnits.get(1).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset5");
  Assert.assertNull(workUnits.get(1).getProp(DatasetFinderSourceTest.PARTITION_URN));
  Assert.assertTrue(workUnits.get(2).getPropAsBoolean(LoopingDatasetFinderSource.END_OF_DATASETS_KEY));

  // Loop around
  workUnitStates = workUnits.stream().map(WorkUnitState::new).collect(Collectors.toList());
  Mockito.doReturn(workUnitStates).when(sourceStateSpy).getPreviousWorkUnitStates();
  workUnitStream = mySource.getWorkunitStream(sourceStateSpy);
  workUnits = Lists.newArrayList(workUnitStream.getWorkUnits());
  Assert.assertEquals(workUnits.size(), 3);
  Assert.assertEquals(workUnits.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset1");
  Assert.assertNull(workUnits.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN));
  Assert.assertEquals(workUnits.get(1).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
  Assert.assertNull(workUnits.get(1).getProp(DatasetFinderSourceTest.PARTITION_URN));
  Assert.assertEquals(workUnits.get(2).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset3");
  Assert.assertNull(workUnits.get(2).getProp(DatasetFinderSourceTest.PARTITION_URN));
}
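Each run the looping source emits at most MAX_WORK_UNITS_PER_RUN_KEY work units, resumes from the previous run's work-unit states, and flags the last unit of a complete pass with END_OF_DATASETS_KEY. A short consumer-side sketch, reusing mySource as constructed in the test above:

SourceState state = new SourceState();
state.setProp(LoopingDatasetFinderSource.MAX_WORK_UNITS_PER_RUN_KEY, 3);

// Drain one run's worth of work units.
List<WorkUnit> units = Lists.newArrayList(mySource.getWorkunitStream(state).getWorkUnits());
boolean endOfDatasets = false;
for (WorkUnit unit : units) {
  // The two-argument overload defaults to false on units without the marker.
  endOfDatasets |= unit.getPropAsBoolean(LoopingDatasetFinderSource.END_OF_DATASETS_KEY, false);
}
// When endOfDatasets is true, every dataset has been visited and the
// next run loops back around to the first dataset.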
use of org.apache.gobblin.source.workunit.WorkUnitStream in project incubator-gobblin by apache.
the class LoopingDatasetFinderSourceTest method testDrilldown.
@Test
public void testDrilldown() {
  // Create three datasets, two of them partitioned
  Dataset dataset1 = new SimpleDatasetForTesting("dataset1");
  Dataset dataset2 = new SimplePartitionableDatasetForTesting("dataset2",
      Lists.newArrayList(new SimpleDatasetPartitionForTesting("p1"), new SimpleDatasetPartitionForTesting("p2"),
          new SimpleDatasetPartitionForTesting("p3")));
  Dataset dataset3 = new SimplePartitionableDatasetForTesting("dataset3",
      Lists.newArrayList(new SimpleDatasetPartitionForTesting("p1"), new SimpleDatasetPartitionForTesting("p2"),
          new SimpleDatasetPartitionForTesting("p3")));
  IterableDatasetFinder finder = new StaticDatasetsFinderForTesting(Lists.newArrayList(dataset3, dataset2, dataset1));
  MySource mySource = new MySource(true, finder);

  // Limit each run to 3 work units
  SourceState sourceState = new SourceState();
  sourceState.setProp(LoopingDatasetFinderSource.MAX_WORK_UNITS_PER_RUN_KEY, 3);

  // First run: get the first three work units
  WorkUnitStream workUnitStream = mySource.getWorkunitStream(sourceState);
  List<WorkUnit> workUnits = Lists.newArrayList(workUnitStream.getWorkUnits());
  Assert.assertEquals(workUnits.size(), 3);
  Assert.assertEquals(workUnits.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset1");
  Assert.assertNull(workUnits.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN));
  Assert.assertEquals(workUnits.get(1).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
  Assert.assertEquals(workUnits.get(1).getProp(DatasetFinderSourceTest.PARTITION_URN), "p1");
  Assert.assertEquals(workUnits.get(2).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
  Assert.assertEquals(workUnits.get(2).getProp(DatasetFinderSourceTest.PARTITION_URN), "p2");

  // Second run: continue where the first left off
  List<WorkUnitState> workUnitStates = workUnits.stream().map(WorkUnitState::new).collect(Collectors.toList());
  SourceState sourceStateSpy = Mockito.spy(sourceState);
  Mockito.doReturn(workUnitStates).when(sourceStateSpy).getPreviousWorkUnitStates();
  workUnitStream = mySource.getWorkunitStream(sourceStateSpy);
  workUnits = Lists.newArrayList(workUnitStream.getWorkUnits());
  Assert.assertEquals(workUnits.size(), 3);
  Assert.assertEquals(workUnits.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
  Assert.assertEquals(workUnits.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN), "p3");
  Assert.assertEquals(workUnits.get(1).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset3");
  Assert.assertEquals(workUnits.get(1).getProp(DatasetFinderSourceTest.PARTITION_URN), "p1");
  Assert.assertEquals(workUnits.get(2).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset3");
  Assert.assertEquals(workUnits.get(2).getProp(DatasetFinderSourceTest.PARTITION_URN), "p2");

  // Third run: continue from where the second left off
  workUnitStates = workUnits.stream().map(WorkUnitState::new).collect(Collectors.toList());
  Mockito.doReturn(workUnitStates).when(sourceStateSpy).getPreviousWorkUnitStates();
  workUnitStream = mySource.getWorkunitStream(sourceStateSpy);
  workUnits = Lists.newArrayList(workUnitStream.getWorkUnits());
  Assert.assertEquals(workUnits.size(), 2);
  Assert.assertEquals(workUnits.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset3");
  Assert.assertEquals(workUnits.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN), "p3");
  Assert.assertTrue(workUnits.get(1).getPropAsBoolean(LoopingDatasetFinderSource.END_OF_DATASETS_KEY));

  // Fourth run: all work units finished, loop around
  workUnitStates = workUnits.stream().map(WorkUnitState::new).collect(Collectors.toList());
  Mockito.doReturn(workUnitStates).when(sourceStateSpy).getPreviousWorkUnitStates();
  workUnitStream = mySource.getWorkunitStream(sourceStateSpy);
  workUnits = Lists.newArrayList(workUnitStream.getWorkUnits());
  Assert.assertEquals(workUnits.size(), 3);
  Assert.assertEquals(workUnits.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset1");
  Assert.assertNull(workUnits.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN));
  Assert.assertEquals(workUnits.get(1).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
  Assert.assertEquals(workUnits.get(1).getProp(DatasetFinderSourceTest.PARTITION_URN), "p1");
  Assert.assertEquals(workUnits.get(2).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
  Assert.assertEquals(workUnits.get(2).getProp(DatasetFinderSourceTest.PARTITION_URN), "p2");
}
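The resume behavior hinges on SourceState.getPreviousWorkUnitStates(): each run the source inspects the URNs it handed out last time and continues past the largest one. A simplified, hypothetical sketch of that recovery (the actual LoopingDatasetFinderSource also tracks the partition within the last dataset and the end-of-datasets marker):

// Hypothetical helper, not the real source internals: find the
// lexicographically largest dataset URN handed out in the previous run.
private String findLastProcessedDatasetUrn(SourceState sourceState) {
  String lastDatasetUrn = null;
  for (WorkUnitState workUnitState : sourceState.getPreviousWorkUnitStates()) {
    String datasetUrn = workUnitState.getProp(DatasetFinderSourceTest.DATASET_URN);
    if (datasetUrn != null && (lastDatasetUrn == null || datasetUrn.compareTo(lastDatasetUrn) > 0)) {
      lastDatasetUrn = datasetUrn;
    }
  }
  return lastDatasetUrn; // null means start from the first dataset
}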
use of org.apache.gobblin.source.workunit.WorkUnitStream in project incubator-gobblin by apache.
the class DatasetFinderSourceTest method testNonDrilledDown.
@Test
public void testNonDrilledDown() {
  Dataset dataset1 = new SimpleDatasetForTesting("dataset1");
  Dataset dataset2 = new SimplePartitionableDatasetForTesting("dataset2",
      Lists.newArrayList(new SimpleDatasetPartitionForTesting("p1"), new SimpleDatasetPartitionForTesting("p2")));
  Dataset dataset3 = new SimpleDatasetForTesting("dataset3");
  IterableDatasetFinder finder = new StaticDatasetsFinderForTesting(Lists.newArrayList(dataset1, dataset2, dataset3));
  MySource mySource = new MySource(false, finder);

  List<WorkUnit> workUnits = mySource.getWorkunits(new SourceState());
  Assert.assertEquals(workUnits.size(), 3);
  Assert.assertEquals(workUnits.get(0).getProp(DATASET_URN), "dataset1");
  Assert.assertNull(workUnits.get(0).getProp(PARTITION_URN));
  Assert.assertEquals(workUnits.get(1).getProp(DATASET_URN), "dataset2");
  Assert.assertNull(workUnits.get(1).getProp(PARTITION_URN));
  Assert.assertEquals(workUnits.get(2).getProp(DATASET_URN), "dataset3");
  Assert.assertNull(workUnits.get(2).getProp(PARTITION_URN));

  WorkUnitStream workUnitStream = mySource.getWorkunitStream(new SourceState());
  Assert.assertEquals(Lists.newArrayList(workUnitStream.getWorkUnits()), workUnits);
}
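Compared with testDrilledDown above, the only change is the boolean drill-down flag passed to the test's MySource helper. A condensed contrast of the two modes on the same fixtures:

// Same fixtures as the two DatasetFinderSourceTest cases above.
Dataset dataset1 = new SimpleDatasetForTesting("dataset1");
Dataset dataset2 = new SimplePartitionableDatasetForTesting("dataset2",
    Lists.newArrayList(new SimpleDatasetPartitionForTesting("p1"), new SimpleDatasetPartitionForTesting("p2")));
Dataset dataset3 = new SimpleDatasetForTesting("dataset3");
IterableDatasetFinder finder = new StaticDatasetsFinderForTesting(Lists.newArrayList(dataset1, dataset2, dataset3));

// false: one work unit per dataset, partitions untouched.
List<WorkUnit> flat = new MySource(false, finder).getWorkunits(new SourceState());
Assert.assertEquals(flat.size(), 3);

// true: dataset2 is drilled down into p1 and p2, yielding four units.
List<WorkUnit> drilled = new MySource(true, finder).getWorkunits(new SourceState());
Assert.assertEquals(drilled.size(), 4);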