use of org.apache.gobblin.data.management.version.FileSystemDatasetVersion in project incubator-gobblin by apache.
the class MultiVersionCleanableDatasetBase method clean.
/**
 * Runs retention for this dataset, one {@link VersionFinderAndPolicy} at a time.
 *
 * <p>Nothing is done when the dataset is blacklisted. Otherwise, for every entry returned by
 * {@link MultiVersionCleanableDatasetBase#getVersionFindersAndPolicies()}:
 * <ol>
 * <li>The {@link VersionFinder} and the {@link VersionSelectionPolicy} of the entry must agree on the version class,
 * otherwise an {@link IOException} is thrown.
 * <li>All versions of this dataset are looked up through the finder. An entry for which no version is found is
 * skipped with a warning.
 * <li>The versions are sorted in reverse natural order and handed to the policy; whatever the policy selects is
 * passed on to {@code cleanImpl}.
 * <li>Every {@link RetentionAction} of the entry is executed against the complete sorted list of versions (not only
 * the selected ones). A failing action is logged and remembered, but it does not stop the remaining actions or
 * entries.
 * </ol>
 *
 * @throws IOException if the version finder and the selection policy of an entry use incompatible version classes
 * @throws RuntimeException once all entries have been processed, if at least one {@link RetentionAction} failed
 */
@Override
public void clean() throws IOException {
  if (this.isDatasetBlacklisted) {
    this.log.info("Dataset blacklisted. Cleanup skipped for " + datasetRoot());
    return;
  }

  boolean retentionActionFailed = false;

  for (VersionFinderAndPolicy<T> finderAndPolicy : getVersionFindersAndPolicies()) {
    VersionSelectionPolicy<T> policy = finderAndPolicy.getVersionSelectionPolicy();
    VersionFinder<? extends T> finder = finderAndPolicy.getVersionFinder();

    // The policy has to be able to consume whatever version type the finder produces.
    boolean compatible = policy.versionClass().isAssignableFrom(finder.versionClass());
    if (!compatible) {
      throw new IOException("Incompatible dataset version classes.");
    }

    this.log.info(String.format("Cleaning dataset %s. Using version finder %s and policy %s", this, finder.getClass().getName(), policy));

    List<T> sortedVersions = Lists.newArrayList(finder.findDatasetVersions(this));
    if (sortedVersions.isEmpty()) {
      this.log.warn("No dataset version can be found. Ignoring.");
      continue;
    }
    Collections.sort(sortedVersions, Collections.reverseOrder());

    // Delete what the policy selects.
    Collection<T> selected = policy.listSelectedVersions(sortedVersions);
    cleanImpl(selected);

    // Retention actions receive every version found, widened to the base DatasetVersion type.
    List<DatasetVersion> everyVersion = Lists.<DatasetVersion>newArrayList(sortedVersions);
    for (RetentionAction action : finderAndPolicy.getRetentionActions()) {
      try {
        action.execute(everyVersion);
      } catch (Throwable failure) {
        // Keep going: one broken action must not prevent the others from running.
        retentionActionFailed = true;
        log.error(String.format("RetentionAction %s failed for dataset %s", action.getClass().getName(), this.datasetRoot()), failure);
      }
    }
  }

  if (retentionActionFailed) {
    throw new RuntimeException(String.format("At least one failure happened while processing %s. Look for previous logs for failures", datasetRoot()));
  }
}
use of org.apache.gobblin.data.management.version.FileSystemDatasetVersion in project incubator-gobblin by apache.
the class ReplicationDataValidPathPicker method getValidPaths.
/**
 * Returns one path for every dataset version that the end point's configured selection policy picks.
 *
 * <p>The version finder class is read from {@code FINDER_CLASS} and constructed reflectively with the end point's
 * {@link FileSystem} and the selection config; the selection policy class is read from {@code POLICY_CLASS} and
 * constructed with the selection config only. The versions found are sorted in reverse natural order before they are
 * handed to the policy.
 *
 * @param hadoopFsEndPoint end point supplying the selection config and the file system URI
 * @return the first path of each selected version, in the iteration order of the policy's result
 * @throws IOException declared by this method; not thrown directly by the code shown here
 * @throws IllegalArgumentException if the finder or the policy class cannot be loaded or constructed
 */
@SuppressWarnings("unchecked")
public static Collection<Path> getValidPaths(HadoopFsEndPoint hadoopFsEndPoint) throws IOException {
  Config config = hadoopFsEndPoint.getSelectionConfig();
  FileSystemDataset endPointDataset = new HadoopFsEndPointDataset(hadoopFsEndPoint);
  FileSystem fs = FileSystem.get(hadoopFsEndPoint.getFsURI(), new Configuration());

  // FileSystemDatasetVersion is used as the common type because
  // DateTimeDatasetVersionFinder / GlobModTimeDatasetVersionFinder use TimestampedDatasetVersion,
  // while SingleVersionFinder uses FileStatusDatasetVersion.
  VersionFinder<FileSystemDatasetVersion> versionFinder;
  try {
    Class<?> finderClass = Class.forName(config.getString(FINDER_CLASS));
    versionFinder = (VersionFinder<FileSystemDatasetVersion>) ConstructorUtils.invokeConstructor(finderClass, fs, config);
  } catch (ReflectiveOperationException e) {
    throw new IllegalArgumentException(e);
  }

  // Versions are listed before the policy is instantiated, so a listing failure surfaces first.
  List<FileSystemDatasetVersion> sortedVersions = Ordering.natural().reverse().sortedCopy(versionFinder.findDatasetVersions(endPointDataset));

  VersionSelectionPolicy<FileSystemDatasetVersion> policy;
  try {
    Class<?> policyClass = Class.forName(config.getString(POLICY_CLASS));
    policy = (VersionSelectionPolicy<FileSystemDatasetVersion>) ConstructorUtils.invokeConstructor(policyClass, config);
  } catch (ReflectiveOperationException e) {
    throw new IllegalArgumentException(e);
  }

  List<Path> validPaths = new ArrayList<>();
  for (FileSystemDatasetVersion selected : policy.listSelectedVersions(sortedVersions)) {
    // Only the first path of each version is reported.
    validPaths.add(selected.getPaths().iterator().next());
  }
  return validPaths;
}
use of org.apache.gobblin.data.management.version.FileSystemDatasetVersion in project incubator-gobblin by apache.
the class HiddenFilterSelectionPolicyTest method testListSelectedVersions.
/**
 * Checks that {@link HiddenFilterSelectionPolicy} selects exactly the two versions under the {@code versions}
 * directory out of four candidates, the other two living under {@code .temp} and {@code _temp}. The hidden-file
 * prefixes are supplied once as a list and once as a comma separated string.
 */
@Test
public void testListSelectedVersions() throws Exception {
  Path visibleOne = new Path("/data/dataset/versions/version1");
  Path visibleTwo = new Path("/data/dataset/versions/version2");
  Path dotPrefixed = new Path("/data/dataset/.temp/tmpPath");
  Path underscorePrefixed = new Path("/data/dataset/_temp/tmpPath");

  // Only these two paths may show up in the selection.
  Set<String> expectedPaths = new HashSet<>(Arrays.asList(visibleOne.toString(), visibleTwo.toString()));

  List<FileSystemDatasetVersion> candidates = new ArrayList<>();
  for (Path candidatePath : Arrays.asList(visibleOne, visibleTwo, dotPrefixed, underscorePrefixed)) {
    candidates.add(new TimestampedDatasetVersion(new DateTime(), candidatePath));
  }

  // Same prefixes, two different config representations.
  List<String> prefixList = Arrays.asList("_", ".");
  Config prefixesAsList = ConfigFactory.parseMap(ImmutableMap.of(HiddenFilterSelectionPolicy.HIDDEN_FILTER_HIDDEN_FILE_PREFIX_KEY, prefixList));
  Config prefixesAsString = ConfigFactory.parseMap(ImmutableMap.of(HiddenFilterSelectionPolicy.HIDDEN_FILTER_HIDDEN_FILE_PREFIX_KEY, "_,."));

  for (Config config : Arrays.asList(prefixesAsList, prefixesAsString)) {
    Collection<FileSystemDatasetVersion> kept = new HiddenFilterSelectionPolicy(config).listSelectedVersions(candidates);
    Assert.assertEquals(kept.size(), 2);
    for (FileSystemDatasetVersion version : kept) {
      for (Path keptPath : version.getPaths()) {
        Assert.assertTrue(expectedPaths.contains(keptPath.toString()));
      }
    }
  }
}
use of org.apache.gobblin.data.management.version.FileSystemDatasetVersion in project incubator-gobblin by apache.
the class FsCleanableHelperTest method testDeleteEmptyDirs.
/**
 * Creates four hourly directories under a dataset root, asks {@link FsCleanableHelper} to clean three of them, and
 * verifies that those three are gone, that the fourth is still there, that the emptied parent {@code 2016/01/01}
 * was removed, and that the still populated parent {@code 2016/01/02} was kept.
 */
@Test
public void testDeleteEmptyDirs() throws Exception {
  Properties props = new Properties();
  props.setProperty(FsCleanableHelper.SKIP_TRASH_KEY, Boolean.TRUE.toString());
  FsCleanableHelper helper = new FsCleanableHelper(this.fs, props, ConfigFactory.empty(), log);

  FileSystemDataset dataset = mock(FileSystemDataset.class);
  Path root = new Path(testTempPath, "dataset1");
  when(dataset.datasetRoot()).thenReturn(root);

  // Versions handed to the helper for deletion.
  List<Path> doomed = ImmutableList.of(new Path(root, "2016/01/01/13"), new Path(root, "2016/01/01/14"), new Path(root, "2016/01/02/15"));
  // Version that is never handed to the helper.
  Path survivor = new Path(root, "2016/01/02/16");

  for (Path dir : doomed) {
    this.fs.mkdirs(dir);
  }
  this.fs.mkdirs(survivor);

  // Sanity check: everything exists before cleaning.
  for (Path dir : doomed) {
    Assert.assertTrue(this.fs.exists(dir));
  }
  Assert.assertTrue(this.fs.exists(survivor));

  ImmutableList.Builder<FileSystemDatasetVersion> versions = ImmutableList.builder();
  for (Path dir : doomed) {
    versions.add(new MockFileSystemDatasetVersion(dir));
  }
  List<FileSystemDatasetVersion> deletableVersions = versions.build();
  helper.clean(deletableVersions, dataset);

  // The cleaned versions are gone ...
  for (Path dir : doomed) {
    Assert.assertFalse(this.fs.exists(dir));
  }
  // ... the untouched one is not.
  Assert.assertTrue(this.fs.exists(survivor));
  // Parent "2016/01/01" lost all of its children and must have been removed.
  Assert.assertFalse(this.fs.exists(doomed.get(0).getParent()));
  // Parent "2016/01/02" still holds the surviving version and must remain.
  Assert.assertTrue(this.fs.exists(survivor.getParent()));
}
use of org.apache.gobblin.data.management.version.FileSystemDatasetVersion in project incubator-gobblin by apache.
the class ConfigurableCleanableDatasetTest method testDatasetIsBlacklisted.
/**
 * Builds a {@link ConfigurableCleanableDataset} from a config whose {@code gobblin.retention.dataset.is.blacklisted}
 * key is {@code "true"} and verifies that the dataset reports itself as blacklisted.
 */
@Test
public void testDatasetIsBlacklisted() throws Exception {
  Config conf = ConfigFactory.parseMap(ImmutableMap.<String, String>builder()
      .put("gobblin.retention.version.finder.class", "org.apache.gobblin.data.management.version.finder.WatermarkDatasetVersionFinder")
      .put("gobblin.retention.selection.policy.class", "org.apache.gobblin.data.management.policy.NewestKSelectionPolicy")
      .put("gobblin.retention.selection.newestK.versionsSelected", "2")
      .put("gobblin.retention.dataset.is.blacklisted", "true")
      .build());

  FileSystem localFs = FileSystem.get(new URI(ConfigurationKeys.LOCAL_FS_URI), new Configuration());
  ConfigurableCleanableDataset<FileSystemDatasetVersion> dataset =
      new ConfigurableCleanableDataset<FileSystemDatasetVersion>(localFs, new Properties(), new Path("/someroot"), conf,
          LoggerFactory.getLogger(ConfigurableCleanableDatasetTest.class));

  Assert.assertEquals(dataset.isDatasetBlacklisted(), true);
}
Aggregations