Usage of org.apache.gobblin.compaction.dataset.Dataset in the Apache project incubator-gobblin:
class MRCompactor, method verifyDataCompleteness.
/**
 * Kicks off completeness verification for every dataset still in the {@code UNVERIFIED}
 * state. Datasets excluded by {@code shouldVerifyCompletenessForDataset} (driven by the
 * configured blacklist/whitelist patterns) skip verification and are marked
 * {@code VERIFIED} immediately. The remaining datasets are handed to the verifier in
 * batches of {@code getNumDatasetsVerifiedTogether()}, each batch with its own callback.
 */
private void verifyDataCompleteness() {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(this.state, COMPACTION_COMPLETENESS_VERIFICATION_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(this.state, COMPACTION_COMPLETENESS_VERIFICATION_WHITELIST);
  int batchSize = getNumDatasetsVerifiedTogether();
  List<Dataset> pendingBatch = Lists.newArrayList();
  for (Dataset candidate : this.datasets) {
    if (candidate.state() != UNVERIFIED) {
      continue;
    }
    if (!shouldVerifyCompletenessForDataset(candidate, blacklist, whitelist)) {
      // Excluded from verification: treat as verified so compaction can proceed.
      candidate.setState(VERIFIED);
      continue;
    }
    pendingBatch.add(candidate);
    if (pendingBatch.size() >= batchSize) {
      submitVerificationBatch(pendingBatch);
      pendingBatch = Lists.newArrayList();
    }
  }
  // Flush the final, possibly partial, batch.
  if (!pendingBatch.isEmpty()) {
    submitVerificationBatch(pendingBatch);
  }
}

/** Submits one batch of datasets to the completeness verifier and registers its callback. */
private void submitVerificationBatch(List<Dataset> batch) {
  ListenableFuture<Results> future = this.verifier.get().verify(batch);
  addCallback(batch, future);
}
Usage of org.apache.gobblin.compaction.dataset.Dataset in the Apache project incubator-gobblin:
class MRCompactor, method createJobPropsForDatasets.
/**
 * Creates compaction job properties for each {@link Dataset}, then replaces
 * {@code this.datasets} with the resulting set of property-bearing datasets.
 * (A single input dataset may expand into multiple output datasets.)
 */
private void createJobPropsForDatasets() {
  final Set<Dataset> expanded = Sets.newHashSet();
  for (Dataset original : this.datasets) {
    // createJobPropsForDataset may fan one dataset out into several.
    expanded.addAll(createJobPropsForDataset(original));
  }
  this.datasets.clear();
  this.datasets.addAll(expanded);
}
Usage of org.apache.gobblin.compaction.dataset.Dataset in the Apache project incubator-gobblin:
class MRCompactor, method submitCompactionJobsAndWaitForCompletion.
/**
 * Drives all compaction jobs to completion. Repeatedly scans the datasets,
 * launching (or re-launching) a job runner for every dataset that still needs
 * work, until all datasets reach a terminal state, the overall compaction
 * timeout expires, or the thread is interrupted.
 */
private void submitCompactionJobsAndWaitForCompletion() {
LOG.info("Submitting compaction jobs. Number of datasets: " + this.datasets.size());
boolean allDatasetsCompleted = false;
while (!allDatasetsCompleted) {
// Optimistically assume we are done; any dataset still in progress flips this back.
allDatasetsCompleted = true;
for (Dataset dataset : this.datasets) {
MRCompactorJobRunner jobRunner = MRCompactor.this.jobRunnables.get(dataset);
if (dataset.state() == VERIFIED || dataset.state() == UNVERIFIED) {
allDatasetsCompleted = false;
// Run compaction for a dataset, if it is not already running or completed
if (jobRunner == null || jobRunner.status() == ABORTED) {
// NOTE(review): the boolean appears to mean "verification already passed,
// runner may publish without waiting" — confirm against runCompactionForDataset.
runCompactionForDataset(dataset, dataset.state() == VERIFIED);
}
} else if (dataset.state() == GIVEN_UP) {
// Completeness verification gave up; what happens next is configuration-driven.
if (this.shouldPublishDataIfCannotVerifyCompl) {
allDatasetsCompleted = false;
if (jobRunner == null || jobRunner.status() == ABORTED) {
runCompactionForDataset(dataset, true);
} else {
// A job is already running: let it go ahead despite failed verification.
jobRunner.proceed();
}
} else {
// Publishing unverified data is disallowed: abort any running job.
if (jobRunner != null) {
jobRunner.abort();
}
}
}
}
if (this.stopwatch.elapsed(TimeUnit.MINUTES) >= this.compactionTimeoutMinutes) {
// Compaction timed out. Kill all running compaction jobs and stop waiting.
LOG.error("Compaction timed-out. Killing all running jobs");
for (MRCompactorJobRunner jobRunner : MRCompactor.this.jobRunnables.values()) {
jobRunner.abort();
}
break;
}
// Sleep for a few seconds before another round
try {
Thread.sleep(TimeUnit.SECONDS.toMillis(COMPACTION_JOB_WAIT_INTERVAL_SECONDS));
} catch (InterruptedException e) {
// Restore the interrupt flag before surfacing the failure to the caller.
Thread.currentThread().interrupt();
throw new RuntimeException("Interrupted while waiting", e);
}
}
}
Usage of org.apache.gobblin.compaction.dataset.Dataset in the Apache project incubator-gobblin:
class RenameSourceDirectoryTest, method testRenamingProcedure.
/**
 * Verifies the compaction source-directory renaming flow: directories not yet
 * marked complete are detected by
 * {@code MRCompactor.getDeepestLevelUnrenamedDirsWithFileExistence}, and
 * {@code MRCompactor.renameSourceDirAsCompactionComplete} renames them with the
 * "_COMPLETE" suffix while preserving their contents.
 */
@Test
public void testRenamingProcedure() throws Exception {
// Start from a clean slate in case a previous run left directories behind.
fs.delete(new Path(RENAME_SRC_DIR), true);
// Three run directories without the complete marker, two already marked complete.
createFile(RENAME_SRC_DIR_RUN1_FILE);
createFile(RENAME_SRC_DIR_RUN2_FILE);
createFile(RENAME_SRC_DIR_RUN3_FILE);
createFile(RENAME_SRC_DIR_RUN4_COMPLETE_FILE);
createFile(RENAME_SRC_DIR_RUN5_COMPLETE_FILE);
Set<Path> inputPaths = new HashSet<>();
inputPaths.add(new Path(RENAME_SRC_DIR_RUN1_DIR));
inputPaths.add(new Path(RENAME_SRC_DIR_RUN2_DIR));
inputPaths.add(new Path(RENAME_SRC_DIR_RUN3_DIR));
inputPaths.add(new Path(RENAME_SRC_DIR_RUN4_DIR_COMPLETE));
inputPaths.add(new Path(RENAME_SRC_DIR_RUN5_DIR_COMPLETE));
Dataset dataset = mock(Dataset.class);
Set<Path> unrenamed = MRCompactor.getDeepestLevelUnrenamedDirsWithFileExistence(fs, inputPaths);
// Only the three directories lacking the complete marker should be reported.
Assert.assertEquals(unrenamed.size(), 3);
when(dataset.getRenamePaths()).thenReturn(unrenamed);
MRCompactor.renameSourceDirAsCompactionComplete(fs, dataset);
// Each renamed directory must still contain its file under the _COMPLETE path.
// Use assertTrue rather than assertEquals(..., true) for boolean conditions.
Assert.assertTrue(fs.exists(new Path(RENAME_SRC_DIR_RUN1_DIR + "_COMPLETE/dummy")));
Assert.assertTrue(fs.exists(new Path(RENAME_SRC_DIR_RUN2_DIR + "_COMPLETE/dummy")));
Assert.assertTrue(fs.exists(new Path(RENAME_SRC_DIR_RUN3_DIR + "_COMPLETE/dummy")));
fs.delete(new Path(RENAME_SRC_DIR), true);
}
Usage of org.apache.gobblin.compaction.dataset.Dataset in the Apache project incubator-gobblin:
class MRCompactorAvroKeyDedupJobRunnerTest, method setUp.
/**
 * Builds the fixtures shared by the tests in this class: a deduplicating
 * Avro compaction job runner over a dataset rooted at /tmp, and a fresh
 * Hadoop {@code Job} instance.
 */
@BeforeClass
public void setUp() throws IOException {
  State jobProps = new State();
  jobProps.setProp(ConfigurationKeys.JOB_NAME_KEY, "MRCompactorAvroKeyDedupJobRunnerTest");
  // Deduplication must be on for the key-dedup runner under test.
  jobProps.setProp(MRCompactor.COMPACTION_SHOULD_DEDUPLICATE, "true");
  Dataset dataset = new Dataset.Builder().withInputPath(new Path("/tmp")).build();
  dataset.setJobProps(jobProps);
  this.runner = new MRCompactorAvroKeyDedupJobRunner(dataset, FileSystem.get(new Configuration()));
  this.job = Job.getInstance();
}
Aggregations