Use of org.apache.gobblin.compaction.dataset.DatasetHelper in the Apache incubator-gobblin project.
Class MRCompactorJobPropCreator, method obtainDatasetWithJobProps.
/**
 * Decides whether a compaction job should be created for {@code dataset} and, if so,
 * attaches {@code jobProps} to it.
 *
 * <p>When {@code recompactFromInputPaths} is set, the dataset is always returned after the late
 * input files have been registered. Otherwise new data files are collected (either from the
 * not-yet-renamed input directories or from the input / late-input folders):
 * <ul>
 *   <li>if new files exist, the job is flagged as a late-data-movement task and the files are
 *       recorded in {@code jobProps};</li>
 *   <li>if none exist but late output data remains, the dataset is asked whether it needs to be
 *       recompacted; if not, nothing is returned;</li>
 *   <li>if none exist and there is no late output data, nothing is returned.</li>
 * </ul>
 *
 * @param jobProps job properties to populate and attach to the dataset
 * @param dataset  dataset under consideration; may be mutated (rename paths, extra input paths,
 *                 job props, recompaction state)
 * @return the dataset with its job properties set, or {@code Optional.absent()} when no job is needed
 * @throws IOException declared for the helper calls made here
 */
private Optional<Dataset> obtainDatasetWithJobProps(State jobProps, Dataset dataset) throws IOException {
  if (this.recompactFromInputPaths) {
    LOG.info(String.format("Will recompact for %s.", dataset.outputPath()));
    addInputLateFilesForFirstTimeCompaction(jobProps, dataset);
    dataset.setJobProps(jobProps);
    return Optional.of(dataset);
  }

  // Step 1: collect the data files that have not been compacted yet.
  Set<Path> newDataFiles = new HashSet<>();
  if (renameSourceDirEnabled) {
    Set<Path> unrenamedDirs =
        MRCompactor.getDeepestLevelUnrenamedDirsWithFileExistence(this.fs, dataset.inputPaths());
    if (unrenamedDirs.isEmpty()) {
      LOG.info("[{}] doesn't have unprocessed directories", dataset.getDatasetName());
    } else {
      Set<Path> filesInUnrenamedDirs = getAllFilePathsRecursively(unrenamedDirs);
      if (filesInUnrenamedDirs.isEmpty()) {
        LOG.info("[{}] has unprocessed directories but all empty: {}", dataset.getDatasetName(), unrenamedDirs);
      } else {
        dataset.setRenamePaths(unrenamedDirs);
        newDataFiles.addAll(filesInUnrenamedDirs);
        LOG.info("[{}] has unprocessed directories: {}", dataset.getDatasetName(), unrenamedDirs);
      }
    }
  } else {
    newDataFiles = getNewDataInFolder(dataset.inputPaths(), dataset.outputPath());
    Set<Path> lateDataFiles = getNewDataInFolder(dataset.inputLatePaths(), dataset.outputPath());
    newDataFiles.addAll(lateDataFiles);
    // The late input paths only become additional inputs when they actually contributed files.
    if (!newDataFiles.isEmpty() && !lateDataFiles.isEmpty()) {
      dataset.addAdditionalInputPaths(dataset.inputLatePaths());
    }
  }

  // Step 2: decide what kind of job (if any) to run.
  if (!newDataFiles.isEmpty()) {
    LOG.info(String.format("Will copy %d new data files for %s", newDataFiles.size(), dataset.outputPath()));
    jobProps.setProp(MRCompactor.COMPACTION_JOB_LATE_DATA_MOVEMENT_TASK, true);
    jobProps.setProp(MRCompactor.COMPACTION_JOB_LATE_DATA_FILES, Joiner.on(",").join(newDataFiles));
  } else {
    // No new data: only the re-compaction flow can still apply.
    if (!isOutputLateDataExists(dataset)) {
      return Optional.absent();
    }
    LOG.info("{} don't have new data, but previous late data still remains, check if it requires to move", dataset.getDatasetName());
    // Job props are set before the recompaction check, exactly as the check is invoked below.
    dataset.setJobProps(jobProps);
    dataset.checkIfNeedToRecompact(new DatasetHelper(dataset, this.fs, Lists.newArrayList("avro")));
    if (!dataset.needToRecompact()) {
      return Optional.absent();
    }
    MRCompactor.modifyDatasetStateToRecompact(dataset);
  }

  dataset.setJobProps(jobProps);
  return Optional.of(dataset);
}
Use of org.apache.gobblin.compaction.dataset.DatasetHelper in the Apache incubator-gobblin project.
Class RecompactionConditionTest, method testRecompactionConditionBasedOnFileCount.
/**
 * Verifies the file-count based recompaction condition against real files created under
 * {@code outputLatePath}: two late files must not trigger recompaction, a third one must.
 *
 * <p>Any file-system or setup failure now propagates and fails the test instead of being
 * printed and ignored, and the late output directory is removed even when an assertion fails.
 *
 * @throws Exception if the file system cannot be obtained or a file operation fails
 */
@Test
public void testRecompactionConditionBasedOnFileCount() throws Exception {
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  // Start from a clean, empty late-output directory.
  fs.delete(outputLatePath, true);
  fs.mkdirs(outputLatePath);
  try {
    RecompactionConditionFactory factory = new RecompactionConditionBasedOnFileCount.Factory();
    RecompactionCondition conditionBasedOnFileCount = factory.createRecompactionCondition(dataset);
    DatasetHelper helper = new DatasetHelper(dataset, fs, Lists.newArrayList("avro"));

    // NOTE(review): presumably the file-count threshold of the dataset fixture is 3 - confirm.
    fs.createNewFile(new Path(outputLatePath, new Path("1.avro")));
    fs.createNewFile(new Path(outputLatePath, new Path("2.avro")));
    Assert.assertFalse(conditionBasedOnFileCount.isRecompactionNeeded(helper));

    fs.createNewFile(new Path(outputLatePath, new Path("3.avro")));
    Assert.assertTrue(conditionBasedOnFileCount.isRecompactionNeeded(helper));
  } finally {
    // Always clean up so a failed assertion does not leave files behind for other tests.
    fs.delete(outputLatePath, true);
  }
}
Use of org.apache.gobblin.compaction.dataset.DatasetHelper in the Apache incubator-gobblin project.
Class RecompactionConditionTest, method testRecompactionConditionBasedOnRatio.
/**
 * Checks the ratio based recompaction condition using a mocked {@link DatasetHelper}:
 * 6 late records against 94 output records must not trigger recompaction, while 21 late
 * records against 79 output records must.
 */
@Test
public void testRecompactionConditionBasedOnRatio() {
  RecompactionConditionFactory ratioFactory = new RecompactionConditionBasedOnRatio.Factory();
  RecompactionCondition ratioCondition = ratioFactory.createRecompactionCondition(dataset);
  DatasetHelper mockedHelper = mock(DatasetHelper.class);

  // Small share of late records: recompaction is not expected.
  when(mockedHelper.getLateOutputRecordCount()).thenReturn(6L);
  when(mockedHelper.getOutputRecordCount()).thenReturn(94L);
  Assert.assertFalse(ratioCondition.isRecompactionNeeded(mockedHelper));

  // Larger share of late records: recompaction is expected.
  when(mockedHelper.getLateOutputRecordCount()).thenReturn(21L);
  when(mockedHelper.getOutputRecordCount()).thenReturn(79L);
  Assert.assertTrue(ratioCondition.isRecompactionNeeded(mockedHelper));
}
Use of org.apache.gobblin.compaction.dataset.DatasetHelper in the Apache incubator-gobblin project.
Class RecompactionConditionTest, method testRecompactionCombineCondition.
/**
 * Exercises {@link RecompactionCombineCondition} over three mocked conditions, first combined
 * with {@code OR} and then with {@code AND}, checking the combined verdict for several
 * combinations of individual verdicts.
 */
@Test
public void testRecompactionCombineCondition() {
  DatasetHelper mockedHelper = mock(DatasetHelper.class);
  RecompactionCondition ratioCondition = mock(RecompactionConditionBasedOnRatio.class);
  RecompactionCondition fileCountCondition = mock(RecompactionConditionBasedOnFileCount.class);
  RecompactionCondition durationCondition = mock(RecompactionConditionBasedOnDuration.class);

  RecompactionCombineCondition anyOf = new RecompactionCombineCondition(
      Arrays.asList(ratioCondition, fileCountCondition, durationCondition),
      RecompactionCombineCondition.CombineOperation.OR);

  // OR: all false -> false.
  when(ratioCondition.isRecompactionNeeded(mockedHelper)).thenReturn(false);
  when(fileCountCondition.isRecompactionNeeded(mockedHelper)).thenReturn(false);
  when(durationCondition.isRecompactionNeeded(mockedHelper)).thenReturn(false);
  Assert.assertFalse(anyOf.isRecompactionNeeded(mockedHelper));

  // OR: a single true -> true.
  when(ratioCondition.isRecompactionNeeded(mockedHelper)).thenReturn(false);
  when(fileCountCondition.isRecompactionNeeded(mockedHelper)).thenReturn(true);
  when(durationCondition.isRecompactionNeeded(mockedHelper)).thenReturn(false);
  Assert.assertTrue(anyOf.isRecompactionNeeded(mockedHelper));

  RecompactionCombineCondition allOf = new RecompactionCombineCondition(
      Arrays.asList(ratioCondition, fileCountCondition, durationCondition),
      RecompactionCombineCondition.CombineOperation.AND);

  // AND: a single false -> false.
  when(ratioCondition.isRecompactionNeeded(mockedHelper)).thenReturn(true);
  when(fileCountCondition.isRecompactionNeeded(mockedHelper)).thenReturn(true);
  when(durationCondition.isRecompactionNeeded(mockedHelper)).thenReturn(false);
  Assert.assertFalse(allOf.isRecompactionNeeded(mockedHelper));

  // AND: all true -> true.
  when(ratioCondition.isRecompactionNeeded(mockedHelper)).thenReturn(true);
  when(fileCountCondition.isRecompactionNeeded(mockedHelper)).thenReturn(true);
  when(durationCondition.isRecompactionNeeded(mockedHelper)).thenReturn(true);
  Assert.assertTrue(allOf.isRecompactionNeeded(mockedHelper));
}
Use of org.apache.gobblin.compaction.dataset.DatasetHelper in the Apache incubator-gobblin project.
Class RecompactionConditionTest, method testRecompactionConditionBasedOnDuration.
/**
 * Checks the duration based recompaction condition using a mocked {@link DatasetHelper}:
 * an earliest late file that is 11h59min old must not trigger recompaction, one that is
 * 12h01min old must.
 */
@Test
public void testRecompactionConditionBasedOnDuration() {
  RecompactionConditionFactory durationFactory = new RecompactionConditionBasedOnDuration.Factory();
  RecompactionCondition durationCondition = durationFactory.createRecompactionCondition(dataset);
  DatasetHelper mockedHelper = mock(DatasetHelper.class);
  when(mockedHelper.getDataset()).thenReturn(dataset);

  // Parses strings such as "11h59min" (months, days, hours, minutes with their suffixes).
  PeriodFormatter formatter = new PeriodFormatterBuilder()
      .appendMonths().appendSuffix("m")
      .appendDays().appendSuffix("d")
      .appendHours().appendSuffix("h")
      .appendMinutes().appendSuffix("min")
      .toFormatter();
  DateTime now = getCurrentTime();

  // NOTE(review): presumably the dataset fixture configures a 12h duration threshold - confirm.
  DateTime slightlyTooRecent = now.minus(formatter.parsePeriod("11h59min"));
  when(mockedHelper.getEarliestLateFileModificationTime()).thenReturn(Optional.of(slightlyTooRecent));
  when(mockedHelper.getCurrentTime()).thenReturn(now);
  Assert.assertFalse(durationCondition.isRecompactionNeeded(mockedHelper));

  DateTime slightlyTooOld = now.minus(formatter.parsePeriod("12h01min"));
  when(mockedHelper.getEarliestLateFileModificationTime()).thenReturn(Optional.of(slightlyTooOld));
  when(mockedHelper.getCurrentTime()).thenReturn(now);
  Assert.assertTrue(durationCondition.isRecompactionNeeded(mockedHelper));
}
Aggregations