Examples with MultiWorkUnit - org.apache.gobblin.source.workunit.MultiWorkUnit

Example 21 with MultiWorkUnit

use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

the class GobblinHelixJobLauncher method persistWorkUnit.

/**
 * Persist a single {@link WorkUnit} (flattened) to a file.
 */
private String persistWorkUnit(final Path workUnitFileDir, final WorkUnit workUnit, ParallelRunner stateSerDeRunner) throws IOException {
    final StateStore stateStore;
    String workUnitFileName = workUnit.getId();
    if (workUnit instanceof MultiWorkUnit) {
        workUnitFileName += MULTI_WORK_UNIT_FILE_EXTENSION;
        stateStore = stateStores.getMwuStateStore();
    } else {
        workUnitFileName += WORK_UNIT_FILE_EXTENSION;
        stateStore = stateStores.getWuStateStore();
    }
    Path workUnitFile = new Path(workUnitFileDir, workUnitFileName);
    final String fileName = workUnitFile.getName();
    final String storeName = workUnitFile.getParent().getName();
    stateSerDeRunner.submitCallable(new Callable<Void>() {

        @Override
        public Void call() throws Exception {
            stateStore.put(storeName, fileName, workUnit);
            return null;
        }
    }, "Serialize state to store " + storeName + " file " + fileName);
    return workUnitFile.toString();
}

Also used : Path(org.apache.hadoop.fs.Path) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) StateStore(org.apache.gobblin.metastore.StateStore) JobException(org.apache.gobblin.runtime.JobException) IOException(java.io.IOException)

Example 22 with MultiWorkUnit

use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

the class GobblinOutputCommitter method abortJob.

@Override
public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
    LOG.info("Aborting Job: " + jobContext.getJobID() + " with state: " + state);
    Configuration conf = jobContext.getConfiguration();
    URI fsUri = URI.create(conf.get(ConfigurationKeys.FS_URI_KEY, ConfigurationKeys.LOCAL_FS_URI));
    FileSystem fs = FileSystem.get(fsUri, conf);
    Path mrJobDir = new Path(conf.get(ConfigurationKeys.MR_JOB_ROOT_DIR_KEY), conf.get(ConfigurationKeys.JOB_NAME_KEY));
    Path jobInputDir = new Path(mrJobDir, MRJobLauncher.INPUT_DIR_NAME);
    if (!fs.exists(jobInputDir) || !fs.isDirectory(jobInputDir)) {
        LOG.warn(String.format("%s either does not exist or is not a directory. No data to cleanup.", jobInputDir));
        return;
    }
    // Iterate through all files in the jobInputDir, each file should correspond to a serialized wu or mwu
    try {
        for (FileStatus status : fs.listStatus(jobInputDir, new WorkUnitFilter())) {
            Closer workUnitFileCloser = Closer.create();
            // If the file ends with ".wu" de-serialize it into a WorkUnit
            if (status.getPath().getName().endsWith(AbstractJobLauncher.WORK_UNIT_FILE_EXTENSION)) {
                WorkUnit wu = WorkUnit.createEmpty();
                try {
                    wu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath()))));
                } finally {
                    workUnitFileCloser.close();
                }
                JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(wu), LOG);
            }
            // If the file ends with ".mwu" de-serialize it into a MultiWorkUnit
            if (status.getPath().getName().endsWith(AbstractJobLauncher.MULTI_WORK_UNIT_FILE_EXTENSION)) {
                MultiWorkUnit mwu = MultiWorkUnit.createEmpty();
                try {
                    mwu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath()))));
                } finally {
                    workUnitFileCloser.close();
                }
                for (WorkUnit wu : mwu.getWorkUnits()) {
                    JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(wu), LOG);
                }
            }
        }
    } finally {
        try {
            cleanUpWorkingDirectory(mrJobDir, fs);
        } finally {
            super.abortJob(jobContext, state);
        }
    }
}

Also used : Path(org.apache.hadoop.fs.Path) Closer(com.google.common.io.Closer) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) FileSystem(org.apache.hadoop.fs.FileSystem) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) DataInputStream(java.io.DataInputStream) URI(java.net.URI)

Example 23 with MultiWorkUnit

use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

the class MRJobLauncher method prepareJobInput.

/**
 * Prepare the job input.
 * @throws IOException
 */
private void prepareJobInput(List<WorkUnit> workUnits) throws IOException {
    Closer closer = Closer.create();
    try {
        ParallelRunner parallelRunner = closer.register(new ParallelRunner(this.parallelRunnerThreads, this.fs));
        int multiTaskIdSequence = 0;
        // Serialize each work unit into a file named after the task ID
        for (WorkUnit workUnit : workUnits) {
            String workUnitFileName;
            if (workUnit instanceof MultiWorkUnit) {
                workUnitFileName = JobLauncherUtils.newMultiTaskId(this.jobContext.getJobId(), multiTaskIdSequence++) + MULTI_WORK_UNIT_FILE_EXTENSION;
            } else {
                workUnitFileName = workUnit.getProp(ConfigurationKeys.TASK_ID_KEY) + WORK_UNIT_FILE_EXTENSION;
            }
            Path workUnitFile = new Path(this.jobInputPath, workUnitFileName);
            LOG.debug("Writing work unit file " + workUnitFileName);
            parallelRunner.serializeToFile(workUnit, workUnitFile);
        // Append the work unit file path to the job input file
        }
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }
}

Also used : Closer(com.google.common.io.Closer) Path(org.apache.hadoop.fs.Path) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) ParallelRunner(org.apache.gobblin.util.ParallelRunner)

Aggregations

MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)23 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)20 Path (org.apache.hadoop.fs.Path)5 Closer (com.google.common.io.Closer)4 WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval)3 Test (org.testng.annotations.Test)3 DataInputStream (java.io.DataInputStream)2 SourceState (org.apache.gobblin.configuration.SourceState)2 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)2 MultiLongWatermark (org.apache.gobblin.source.extractor.extract.kafka.MultiLongWatermark)2 ParallelRunner (org.apache.gobblin.util.ParallelRunner)2 Configuration (org.apache.hadoop.conf.Configuration)2 BeforeClass (org.testng.annotations.BeforeClass)2 Predicate (com.google.common.base.Predicate)1 ByteArrayInputStream (java.io.ByteArrayInputStream)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 DataOutputStream (java.io.DataOutputStream)1 IOException (java.io.IOException)1 URI (java.net.URI)1 PriorityQueue (java.util.PriorityQueue)1