Search in sources :

Example 1 with Dataset

Use of org.apache.gobblin.compaction.dataset.Dataset in project incubator-gobblin by apache.

From the class MRCompactor, method verifyDataCompleteness.

/**
 * Hands every {@code UNVERIFIED} dataset to the completeness verifier in batches.
 *
 * <p>Datasets in any other state are skipped. A dataset for which
 * {@code shouldVerifyCompletenessForDataset} returns {@code false} is marked
 * {@code VERIFIED} directly. The remaining datasets are collected into batches of
 * {@code getNumDatasetsVerifiedTogether()} entries; each batch is passed to the
 * verifier and a callback is registered on the returned future. A final, smaller
 * batch is submitted after the loop if any datasets are left over.
 */
private void verifyDataCompleteness() {
    List<Pattern> excludePatterns = DatasetFilterUtils.getPatternList(this.state, COMPACTION_COMPLETENESS_VERIFICATION_BLACKLIST);
    List<Pattern> includePatterns = DatasetFilterUtils.getPatternList(this.state, COMPACTION_COMPLETENESS_VERIFICATION_WHITELIST);
    int batchSize = getNumDatasetsVerifiedTogether();
    List<Dataset> batch = Lists.newArrayList();
    for (Dataset candidate : this.datasets) {
        if (candidate.state() != UNVERIFIED) {
            continue;
        }
        if (!shouldVerifyCompletenessForDataset(candidate, excludePatterns, includePatterns)) {
            // Filtered out of verification: treat as verified right away.
            candidate.setState(VERIFIED);
            continue;
        }
        batch.add(candidate);
        if (batch.size() < batchSize) {
            continue;
        }
        // Batch is full: submit it and start a fresh list, since the submitted
        // list instance has been handed to the verifier and the callback.
        ListenableFuture<Results> pending = this.verifier.get().verify(batch);
        addCallback(batch, pending);
        batch = Lists.newArrayList();
    }
    if (batch.isEmpty()) {
        return;
    }
    // Submit whatever is left over from the last, partially filled batch.
    ListenableFuture<Results> pending = this.verifier.get().verify(batch);
    addCallback(batch, pending);
}
Also used : Pattern(java.util.regex.Pattern) Results(org.apache.gobblin.compaction.verify.DataCompletenessVerifier.Results) Dataset(org.apache.gobblin.compaction.dataset.Dataset)

Example 2 with Dataset

Use of org.apache.gobblin.compaction.dataset.Dataset in project incubator-gobblin by apache.

From the class MRCompactor, method createJobPropsForDatasets.

/**
 * Builds compaction job properties for all {@link Dataset}s.
 *
 * <p>Each entry of {@code this.datasets} is passed to
 * {@code createJobPropsForDataset}, and the datasets it returns are gathered
 * into a set. Once every entry has been processed, {@code this.datasets} is
 * emptied and refilled with the gathered datasets.
 */
private void createJobPropsForDatasets() {
    final Set<Dataset> expanded = Sets.newHashSet();
    for (Dataset current : this.datasets) {
        expanded.addAll(createJobPropsForDataset(current));
    }

    // Replace the original collection's contents only after the whole pass,
    // so the collection is not modified while it is being iterated.
    this.datasets.clear();
    this.datasets.addAll(expanded);
}
Also used : Dataset(org.apache.gobblin.compaction.dataset.Dataset)

Example 3 with Dataset

Use of org.apache.gobblin.compaction.dataset.Dataset in project incubator-gobblin by apache.

From the class MRCompactor, method submitCompactionJobsAndWaitForCompletion.

/**
 * Polls all datasets, launching compaction runs as needed, until a full pass finds
 * nothing left to do or the compaction timeout is reached.
 *
 * <p>On each pass, for every dataset:
 * <ul>
 *   <li>{@code VERIFIED} or {@code UNVERIFIED}: the pass is marked incomplete, and a run is
 *       started if no runner is registered for the dataset or its runner is {@code ABORTED}.</li>
 *   <li>{@code GIVEN_UP} with {@code shouldPublishDataIfCannotVerifyCompl} set: the pass is
 *       marked incomplete, and a run is started or the existing runner is told to proceed.</li>
 *   <li>{@code GIVEN_UP} without that flag: any existing runner is aborted.</li>
 *   <li>Any other state: the dataset is ignored.</li>
 * </ul>
 *
 * <p>After each pass, if the elapsed time has reached {@code compactionTimeoutMinutes},
 * every registered runner is aborted and the loop ends. Otherwise the thread sleeps for
 * {@code COMPACTION_JOB_WAIT_INTERVAL_SECONDS} before the next pass.
 *
 * @throws RuntimeException if the thread is interrupted while sleeping between passes
 *         (the interrupt flag is restored first)
 */
private void submitCompactionJobsAndWaitForCompletion() {
    LOG.info("Submitting compaction jobs. Number of datasets: " + this.datasets.size());
    boolean allDatasetsCompleted = false;
    while (!allDatasetsCompleted) {
        // Assume this pass is the last one; any dataset that still needs work resets the flag.
        allDatasetsCompleted = true;
        for (Dataset dataset : this.datasets) {
            // May be null when no run has been registered for this dataset yet.
            MRCompactorJobRunner jobRunner = MRCompactor.this.jobRunnables.get(dataset);
            // NOTE(review): dataset.state() is read several times within one iteration. If the
            // state can be changed by another thread between reads, the branches below may see
            // different values — confirm whether that is possible and intended.
            if (dataset.state() == VERIFIED || dataset.state() == UNVERIFIED) {
                allDatasetsCompleted = false;
                // Run compaction for a dataset, if it is not already running or completed
                if (jobRunner == null || jobRunner.status() == ABORTED) {
                    // Second argument is true only for VERIFIED datasets; presumably it tells
                    // the run whether it may proceed without waiting — TODO confirm.
                    runCompactionForDataset(dataset, dataset.state() == VERIFIED);
                }
            } else if (dataset.state() == GIVEN_UP) {
                if (this.shouldPublishDataIfCannotVerifyCompl) {
                    // Keeps the loop alive for as long as the dataset remains GIVEN_UP.
                    allDatasetsCompleted = false;
                    if (jobRunner == null || jobRunner.status() == ABORTED) {
                        runCompactionForDataset(dataset, true);
                    } else {
                        jobRunner.proceed();
                    }
                } else {
                    // Not publishing unverified data: stop any run registered for this dataset.
                    if (jobRunner != null) {
                        jobRunner.abort();
                    }
                }
            }
        }
        if (this.stopwatch.elapsed(TimeUnit.MINUTES) >= this.compactionTimeoutMinutes) {
            // Compaction timed out. Killing all compaction jobs running
            LOG.error("Compaction timed-out. Killing all running jobs");
            for (MRCompactorJobRunner jobRunner : MRCompactor.this.jobRunnables.values()) {
                jobRunner.abort();
            }
            break;
        }
        // Sleep for a few seconds before another round
        try {
            Thread.sleep(TimeUnit.SECONDS.toMillis(COMPACTION_JOB_WAIT_INTERVAL_SECONDS));
        } catch (InterruptedException e) {
            // Restore the interrupt flag before surfacing the interruption as unchecked.
            Thread.currentThread().interrupt();
            throw new RuntimeException("Interrupted while waiting", e);
        }
    }
}
Also used : Dataset(org.apache.gobblin.compaction.dataset.Dataset)

Example 4 with Dataset

Use of org.apache.gobblin.compaction.dataset.Dataset in project incubator-gobblin by apache.

From the class RenameSourceDirectoryTest, method testRenamingProcedure.

/**
 * Checks that {@code MRCompactor.renameSourceDirAsCompactionComplete} renames the
 * unrenamed source directories by appending {@code _COMPLETE}.
 *
 * <p>Five files are created and their five directories are offered as input paths.
 * {@code getDeepestLevelUnrenamedDirsWithFileExistence} is expected to report exactly
 * three of them; after the rename, a {@code dummy} file must exist under the
 * {@code _COMPLETE} variant of the RUN1, RUN2 and RUN3 directories.
 * NOTE(review): presumably the RUN4/RUN5 {@code *_COMPLETE} inputs are the two that are
 * excluded — the constants' values are not visible here.
 *
 * <p>The fixture tree is removed both before the test and, via {@code finally}, after it,
 * so a failed assertion does not leave files behind.
 */
@Test
public void testRenamingProcedure() throws Exception {
    fs.delete(new Path(RENAME_SRC_DIR), true);
    try {
        createFile(RENAME_SRC_DIR_RUN1_FILE);
        createFile(RENAME_SRC_DIR_RUN2_FILE);
        createFile(RENAME_SRC_DIR_RUN3_FILE);
        createFile(RENAME_SRC_DIR_RUN4_COMPLETE_FILE);
        createFile(RENAME_SRC_DIR_RUN5_COMPLETE_FILE);
        Set<Path> inputPaths = new HashSet<>();
        inputPaths.add(new Path(RENAME_SRC_DIR_RUN1_DIR));
        inputPaths.add(new Path(RENAME_SRC_DIR_RUN2_DIR));
        inputPaths.add(new Path(RENAME_SRC_DIR_RUN3_DIR));
        inputPaths.add(new Path(RENAME_SRC_DIR_RUN4_DIR_COMPLETE));
        inputPaths.add(new Path(RENAME_SRC_DIR_RUN5_DIR_COMPLETE));
        Dataset dataset = mock(Dataset.class);
        Set<Path> unrenamed = MRCompactor.getDeepestLevelUnrenamedDirsWithFileExistence(fs, inputPaths);
        Assert.assertEquals(unrenamed.size(), 3);
        // The mocked dataset only needs to supply the paths that should be renamed.
        when(dataset.getRenamePaths()).thenReturn(unrenamed);
        MRCompactor.renameSourceDirAsCompactionComplete(fs, dataset);
        Assert.assertTrue(fs.exists(new Path(RENAME_SRC_DIR_RUN1_DIR + "_COMPLETE/dummy")));
        Assert.assertTrue(fs.exists(new Path(RENAME_SRC_DIR_RUN2_DIR + "_COMPLETE/dummy")));
        Assert.assertTrue(fs.exists(new Path(RENAME_SRC_DIR_RUN3_DIR + "_COMPLETE/dummy")));
    } finally {
        // Always clean up, even when an assertion above fails.
        fs.delete(new Path(RENAME_SRC_DIR), true);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Dataset(org.apache.gobblin.compaction.dataset.Dataset) HashSet(java.util.HashSet) Test(org.testng.annotations.Test)

Example 5 with Dataset

Use of org.apache.gobblin.compaction.dataset.Dataset in project incubator-gobblin by apache.

From the class MRCompactorAvroKeyDedupJobRunnerTest, method setUp.

/**
 * Prepares the shared fixtures for this test class.
 *
 * <p>Builds a {@link Dataset} with input path {@code /tmp}, attaches job properties that
 * set the job name and enable deduplication, and creates the runner under test with a
 * file system obtained from a default-constructed {@code Configuration}. A fresh
 * {@code Job} instance is stored in {@code this.job}.
 *
 * @throws IOException declared by the method; may be raised while obtaining the file
 *         system or the job instance
 */
@BeforeClass
public void setUp() throws IOException {
    State jobProps = new State();
    jobProps.setProp(ConfigurationKeys.JOB_NAME_KEY, "MRCompactorAvroKeyDedupJobRunnerTest");
    jobProps.setProp(MRCompactor.COMPACTION_SHOULD_DEDUPLICATE, "true");

    Dataset dataset = new Dataset.Builder().withInputPath(new Path("/tmp")).build();
    dataset.setJobProps(jobProps);

    FileSystem defaultFs = FileSystem.get(new Configuration());
    this.runner = new MRCompactorAvroKeyDedupJobRunner(dataset, defaultFs);
    this.job = Job.getInstance();
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) State(org.apache.gobblin.configuration.State) Dataset(org.apache.gobblin.compaction.dataset.Dataset) BeforeClass(org.testng.annotations.BeforeClass)

Aggregations

Dataset (org.apache.gobblin.compaction.dataset.Dataset): 8, Path (org.apache.hadoop.fs.Path): 2, HashSet (java.util.HashSet): 1, Map (java.util.Map): 1, Pattern (java.util.regex.Pattern): 1, Results (org.apache.gobblin.compaction.verify.DataCompletenessVerifier.Results): 1, State (org.apache.gobblin.configuration.State): 1, Configuration (org.apache.hadoop.conf.Configuration): 1, Job (org.apache.hadoop.mapreduce.Job): 1, BeforeClass (org.testng.annotations.BeforeClass): 1, Test (org.testng.annotations.Test): 1