Search in sources :

Example 16 with MultiWorkUnit

use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

the class MultiWorkUnitUnpackingIterator method next.

/**
 * Returns the next {@link WorkUnit} of the iteration.
 *
 * <p>{@code seekNext()} is always invoked first, so the method does not depend on the caller having
 * called {@code hasNext()} beforehand. When the current element is a {@link MultiWorkUnit}, the value
 * returned is taken from the iterator over that multi-work-unit; otherwise the current element itself
 * is returned. A fresh seek is requested before returning.
 *
 * @return the next work unit
 */
@Override
public WorkUnit next() {
    // Guard against callers that skipped hasNext(): position the iterator before reading.
    seekNext();
    final WorkUnit result = (nextWu instanceof MultiWorkUnit) ? this.currentIterator.next() : nextWu;
    // The element has been consumed, so the following call has to seek again.
    needSeek = true;
    return result;
}
Also used : MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 17 with MultiWorkUnit

use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

the class MultiWorkUnitTest method testSerDe.

/**
 * Round-trips the fixture {@link MultiWorkUnit} through {@code write} and {@code readFields} and checks
 * that both contained work units come back with the same watermarks and properties.
 *
 * @throws IOException if writing to, reading from, or closing the in-memory streams fails
 */
@Test
public void testSerDe() throws IOException {
    Closer resources = Closer.create();
    try {
        // Serialize the fixture into an in-memory buffer.
        ByteArrayOutputStream buffer = resources.register(new ByteArrayOutputStream());
        DataOutputStream out = resources.register(new DataOutputStream(buffer));
        this.multiWorkUnit.write(out);

        // Deserialize the written bytes into a fresh instance.
        ByteArrayInputStream serialized = resources.register(new ByteArrayInputStream(buffer.toByteArray()));
        DataInputStream in = resources.register(new DataInputStream(serialized));
        MultiWorkUnit restored = new MultiWorkUnit();
        restored.readFields(in);

        List<WorkUnit> restoredUnits = restored.getWorkUnits();
        Assert.assertEquals(restoredUnits.size(), 2);

        WorkUnit first = restoredUnits.get(0);
        Assert.assertEquals(first.getHighWaterMark(), 1000);
        Assert.assertEquals(first.getLowWaterMark(), 0);
        Assert.assertEquals(first.getProp("k1"), "v1");

        WorkUnit second = restoredUnits.get(1);
        Assert.assertEquals(second.getHighWaterMark(), 2000);
        Assert.assertEquals(second.getLowWaterMark(), 1001);
        Assert.assertEquals(second.getProp("k2"), "v2");
    } catch (Throwable failure) {
        // Hand the failure to the Closer so it is rethrown after the registered streams are closed.
        throw resources.rethrow(failure);
    } finally {
        resources.close();
    }
}
Also used : Closer(com.google.common.io.Closer) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) ByteArrayInputStream(java.io.ByteArrayInputStream) DataOutputStream(java.io.DataOutputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) DataInputStream(java.io.DataInputStream) Test(org.testng.annotations.Test)

Example 18 with MultiWorkUnit

use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

the class MultiWorkUnitTest method setUp.

/**
 * Builds the shared fixture: a {@link MultiWorkUnit} holding two work units, the first with
 * watermarks 0..1000 and property {@code k1=v1}, the second with watermarks 1001..2000 and
 * property {@code k2=v2}.
 */
@BeforeClass
public void setUp() {
    this.multiWorkUnit = new MultiWorkUnit();

    WorkUnit firstUnit = WorkUnit.createEmpty();
    firstUnit.setHighWaterMark(1000);
    firstUnit.setLowWaterMark(0);
    firstUnit.setProp("k1", "v1");

    WorkUnit secondUnit = WorkUnit.createEmpty();
    secondUnit.setHighWaterMark(2000);
    secondUnit.setLowWaterMark(1001);
    secondUnit.setProp("k2", "v2");

    // Insertion order matters: the serialization test reads the units back by index.
    this.multiWorkUnit.addWorkUnit(firstUnit);
    this.multiWorkUnit.addWorkUnit(secondUnit);
}
Also used : MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) BeforeClass(org.testng.annotations.BeforeClass)

Example 19 with MultiWorkUnit

use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

the class JobLauncherUtilsTest method testFlattenWorkUnits.

/**
 * Verifies that {@code JobLauncherUtils.flattenWorkUnits} returns every plain {@link WorkUnit} and
 * unpacks the contents of each {@link MultiWorkUnit}.
 *
 * <p>The mixed input holds three plain work units plus two multi-work-units of three work units each,
 * so nine flattened work units are expected.
 */
@Test
public void testFlattenWorkUnits() {
    // A list of plain work units must come back with the same number of elements.
    List<WorkUnit> workUnitsOnly = Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty());
    Assert.assertEquals(JobLauncherUtils.flattenWorkUnits(workUnitsOnly).size(), 3);

    MultiWorkUnit multiWorkUnit1 = MultiWorkUnit.createEmpty();
    multiWorkUnit1.addWorkUnits(Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty()));

    // Populate the second multi-work-unit itself so that both of them are non-empty when flattened.
    MultiWorkUnit multiWorkUnit2 = MultiWorkUnit.createEmpty();
    multiWorkUnit2.addWorkUnits(Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty()));

    // 3 plain + 3 from multiWorkUnit1 + 3 from multiWorkUnit2 = 9.
    List<WorkUnit> workUnitsAndMultiWorkUnits = Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty(), multiWorkUnit1, multiWorkUnit2);
    Assert.assertEquals(JobLauncherUtils.flattenWorkUnits(workUnitsAndMultiWorkUnits).size(), 9);
}
Also used : MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 20 with MultiWorkUnit

use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

the class GobblinHelixJobLauncher method createJob.

/**
 * Create a job from a given batch of {@link WorkUnit}s.
 *
 * <p>Every {@link MultiWorkUnit} in the batch is first given a multi-task id derived from the job id and
 * a running sequence number; each work unit is then passed to {@code addWorkUnit} together with the
 * shared {@link ParallelRunner} and the task-config map. Afterwards the job state is persisted, through
 * the job state store when one is available and otherwise by serializing it directly to the file system.
 * Finally a {@link JobConfig.Builder} is populated from the collected task configs and the job settings.
 *
 * @param workUnits the batch of work units to turn into tasks of the job
 * @return the populated job config builder
 * @throws IOException propagated from adding the work units, writing the job state, or closing the runner
 */
private JobConfig.Builder createJob(List<WorkUnit> workUnits) throws IOException {
    Map<String, TaskConfig> taskConfigMap = Maps.newHashMap();
    // The runner is closed by try-with-resources before the job config is built.
    try (ParallelRunner stateSerDeRunner = new ParallelRunner(this.stateSerDeRunnerThreads, this.fs)) {
        int multiTaskIdSequence = 0;
        for (WorkUnit workUnit : workUnits) {
            // Only multi-work-units get a generated multi-task id; plain work units keep their own id.
            if (workUnit instanceof MultiWorkUnit) {
                workUnit.setId(JobLauncherUtils.newMultiTaskId(this.jobContext.getJobId(), multiTaskIdSequence++));
            }
            addWorkUnit(workUnit, stateSerDeRunner, taskConfigMap);
        }
        Path jobStateFilePath;
        // write the job.state using the state store if present, otherwise serialize directly to the file
        if (this.stateStores.haveJobStateStore()) {
            jobStateFilePath = GobblinClusterUtils.getJobStateFilePath(true, this.appWorkDir, this.jobContext.getJobId());
            this.stateStores.getJobStateStore().put(jobStateFilePath.getParent().getName(), jobStateFilePath.getName(), this.jobContext.getJobState());
        } else {
            jobStateFilePath = GobblinClusterUtils.getJobStateFilePath(false, this.appWorkDir, this.jobContext.getJobState() == null ? this.jobContext.getJobId() : this.jobContext.getJobId());
            SerializationUtils.serializeState(this.fs, jobStateFilePath, this.jobContext.getJobState());
        }
        // Pass the job state object instead of its toString() so the (potentially large) string is only
        // rendered by the logger when debug output is actually enabled.
        LOGGER.debug("GobblinHelixJobLauncher.createJob: jobStateFilePath {}, jobState {} jobProperties {}", jobStateFilePath, this.jobContext.getJobState(), this.jobContext.getJobState().getProperties());
    }
    JobConfig.Builder jobConfigBuilder = new JobConfig.Builder();
    jobConfigBuilder.setMaxAttemptsPerTask(this.jobContext.getJobState().getPropAsInt(ConfigurationKeys.MAX_TASK_RETRIES_KEY, ConfigurationKeys.DEFAULT_MAX_TASK_RETRIES));
    // The failure threshold is set to the number of work units in this batch.
    jobConfigBuilder.setFailureThreshold(workUnits.size());
    jobConfigBuilder.addTaskConfigMap(taskConfigMap).setCommand(GobblinTaskRunner.GOBBLIN_TASK_FACTORY_NAME);
    jobConfigBuilder.setNumConcurrentTasksPerInstance(ConfigUtils.getInt(jobConfig, GobblinClusterConfigurationKeys.HELIX_CLUSTER_TASK_CONCURRENCY, GobblinClusterConfigurationKeys.HELIX_CLUSTER_TASK_CONCURRENCY_DEFAULT));
    // Rebalancing of running tasks is switched on only for the streaming execution model.
    if (Task.getExecutionModel(ConfigUtils.configToState(jobConfig)).equals(ExecutionModel.STREAMING)) {
        jobConfigBuilder.setRebalanceRunningTask(true);
    }
    return jobConfigBuilder;
}
Also used : Path(org.apache.hadoop.fs.Path) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) TaskConfig(org.apache.helix.task.TaskConfig) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) ParallelRunner(org.apache.gobblin.util.ParallelRunner) JobConfig(org.apache.helix.task.JobConfig)

Aggregations

MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 23, WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 20, Path (org.apache.hadoop.fs.Path): 5, Closer (com.google.common.io.Closer): 4, WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 3, Test (org.testng.annotations.Test): 3, DataInputStream (java.io.DataInputStream): 2, SourceState (org.apache.gobblin.configuration.SourceState): 2, WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 2, MultiLongWatermark (org.apache.gobblin.source.extractor.extract.kafka.MultiLongWatermark): 2, ParallelRunner (org.apache.gobblin.util.ParallelRunner): 2, Configuration (org.apache.hadoop.conf.Configuration): 2, BeforeClass (org.testng.annotations.BeforeClass): 2, Predicate (com.google.common.base.Predicate): 1, ByteArrayInputStream (java.io.ByteArrayInputStream): 1, ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1, DataOutputStream (java.io.DataOutputStream): 1, IOException (java.io.IOException): 1, URI (java.net.URI): 1, PriorityQueue (java.util.PriorityQueue): 1