use of org.apache.gobblin.data.management.version.DatasetVersion in project incubator-gobblin by apache.
the class MultiVersionCleanableDatasetBase method clean.
/**
 * Applies retention to this dataset, one {@link VersionFinderAndPolicy} at a time.
 *
 * <p>If the dataset is blacklisted, the method logs that fact and returns without doing anything.
 * Otherwise, for every entry returned by
 * {@link MultiVersionCleanableDatasetBase#getVersionFindersAndPolicies()}:
 * <ol>
 * <li>The {@link VersionSelectionPolicy} and {@link VersionFinder} are checked for compatible version
 * classes; an incompatible pair aborts the whole run with an {@link IOException}.
 * <li>All versions reported by the finder are collected. If there are none, a warning is logged and the
 * entry is skipped.
 * <li>The versions are sorted in reverse natural order and handed to the selection policy; the versions it
 * selects are passed to {@code cleanImpl}.
 * <li>Every {@link RetentionAction} attached to the entry is executed with the full (sorted) list of
 * versions, not just the selected ones. A failing action is logged and remembered, but does not stop the
 * remaining actions or entries from running.
 * </ol>
 *
 * @throws IOException if a policy and finder operate on incompatible version classes
 * @throws RuntimeException after all entries were processed, if at least one {@link RetentionAction} failed
 */
@Override
public void clean() throws IOException {
  if (this.isDatasetBlacklisted) {
    this.log.info("Dataset blacklisted. Cleanup skipped for " + datasetRoot());
    return;
  }

  boolean retentionActionFailed = false;

  for (VersionFinderAndPolicy<T> finderAndPolicy : getVersionFindersAndPolicies()) {
    VersionSelectionPolicy<T> policy = finderAndPolicy.getVersionSelectionPolicy();
    VersionFinder<? extends T> finder = finderAndPolicy.getVersionFinder();

    // The policy must be able to consume whatever version type the finder produces.
    boolean compatible = policy.versionClass().isAssignableFrom(finder.versionClass());
    if (!compatible) {
      throw new IOException("Incompatible dataset version classes.");
    }

    String finderName = finder.getClass().getName();
    this.log.info(
        String.format("Cleaning dataset %s. Using version finder %s and policy %s", this, finderName, policy));

    List<T> sortedVersions = Lists.newArrayList(finder.findDatasetVersions(this));
    if (sortedVersions.isEmpty()) {
      this.log.warn("No dataset version can be found. Ignoring.");
      continue;
    }
    Collections.sort(sortedVersions, Collections.reverseOrder());

    Collection<T> selectedForDeletion = policy.listSelectedVersions(sortedVersions);
    cleanImpl(selectedForDeletion);

    // Retention actions receive every version found, typed as the base DatasetVersion.
    List<DatasetVersion> everyVersion = Lists.<DatasetVersion>newArrayList(sortedVersions);
    for (RetentionAction action : finderAndPolicy.getRetentionActions()) {
      try {
        action.execute(everyVersion);
      } catch (Throwable t) {
        // Keep going so one bad action does not block the others; report at the end.
        retentionActionFailed = true;
        log.error(String.format("RetentionAction %s failed for dataset %s", action.getClass().getName(),
            this.datasetRoot()), t);
      }
    }
  }

  if (retentionActionFailed) {
    throw new RuntimeException(String.format(
        "At least one failure happened while processing %s. Look for previous logs for failures", datasetRoot()));
  }
}
use of org.apache.gobblin.data.management.version.DatasetVersion in project incubator-gobblin by apache.
the class CombineRetentionPolicyTest method testIntersect.
/**
 * Configures a {@link CombineRetentionPolicy} with three sub-policies combined via INTERSECT and checks
 * that, out of the versions "a", "abc", "abcd", "bc" and "d", exactly "abc" and "abcd" are reported as
 * deletable. Also checks that the combined policy reports {@link StringDatasetVersion} as its version class.
 */
@Test
public void testIntersect() throws IOException {
  Class<?>[] subPolicies = {
      ContainsARetentionPolicy.class, ContainsBRetentionPolicy.class, ContainsCRetentionPolicy.class};

  Properties config = new Properties();
  // Sub-policies are registered under the keys <prefix>1, <prefix>2, <prefix>3.
  for (int i = 0; i < subPolicies.length; i++) {
    config.setProperty(CombineRetentionPolicy.RETENTION_POLICIES_PREFIX + (i + 1),
        subPolicies[i].getCanonicalName());
  }
  config.setProperty(CombineRetentionPolicy.DELETE_SETS_COMBINE_OPERATION,
      CombineRetentionPolicy.DeletableCombineOperation.INTERSECT.name());

  CombineRetentionPolicy policy = new CombineRetentionPolicy(config);

  Collection<DatasetVersion> deletableVersions = policy.listDeletableVersions(
      Lists.<DatasetVersion>newArrayList(
          new StringDatasetVersion("a", new Path("/")),
          new StringDatasetVersion("abc", new Path("/")),
          new StringDatasetVersion("abcd", new Path("/")),
          new StringDatasetVersion("bc", new Path("/")),
          new StringDatasetVersion("d", new Path("/"))));

  // Reduce the result to its version strings so it can be compared order-independently.
  Set<String> actualDeletableVersions = Sets.newHashSet();
  for (DatasetVersion deletable : deletableVersions) {
    actualDeletableVersions.add(((StringDatasetVersion) deletable).getVersion());
  }

  Assert.assertEquals(policy.versionClass(), StringDatasetVersion.class);
  Assert.assertEquals(deletableVersions.size(), 2);
  Assert.assertEquals(actualDeletableVersions, Sets.newHashSet("abcd", "abc"));
}
use of org.apache.gobblin.data.management.version.DatasetVersion in project incubator-gobblin by apache.
the class NewestKSelectionPolicyTest method testSelect.
/**
 * Exercises {@link NewestKSelectionPolicy#listSelectedVersions} against ten versions ("v000".."v009")
 * under four configurations:
 * <ul>
 * <li>"selected" key set to 5: the result has {@code getVersionsSelected()} entries matching the input
 * from index 0 onward.
 * <li>"selected" key set to 15 (more than available): the result matches the whole input.
 * <li>"not selected" key set to 4: the result matches the input starting at index
 * {@code getVersionsSelected()}.
 * <li>"not selected" key set to 14 (more than available): the result is empty.
 * </ul>
 */
@Test
public void testSelect() {
  ArrayList<DatasetVersion> versions = new ArrayList<>();
  for (int i = 0; i < 10; ++i) {
    versions.add(new TestStringDatasetVersion(String.format("v%03d", i)));
  }

  // selectedVersions 5 < 10
  Config conf = ConfigFactory.empty().withValue(NewestKSelectionPolicy.NEWEST_K_VERSIONS_SELECTED_KEY,
      ConfigValueFactory.fromAnyRef(5));
  NewestKSelectionPolicy policy = new NewestKSelectionPolicy(conf);
  Collection<DatasetVersion> res = policy.listSelectedVersions(versions);
  int idx = 0;
  Assert.assertEquals(res.size(), policy.getVersionsSelected());
  for (DatasetVersion v : res) {
    // Build the message from the index being compared; increment only afterwards so a failure
    // reports the position that actually mismatched.
    Assert.assertEquals(v, versions.get(idx), "Mismatch for index " + idx);
    idx++;
  }

  // selectedVersions 15 > 10
  conf = ConfigFactory.empty().withValue(NewestKSelectionPolicy.NEWEST_K_VERSIONS_SELECTED_KEY,
      ConfigValueFactory.fromAnyRef(15));
  policy = new NewestKSelectionPolicy(conf);
  res = policy.listSelectedVersions(versions);
  idx = 0;
  Assert.assertEquals(res.size(), versions.size());
  for (DatasetVersion v : res) {
    Assert.assertEquals(v, versions.get(idx), "Mismatch for index " + idx);
    idx++;
  }

  // notSelectedVersions 4 < 10
  conf = ConfigFactory.empty().withValue(NewestKSelectionPolicy.NEWEST_K_VERSIONS_NOTSELECTED_KEY,
      ConfigValueFactory.fromAnyRef(4));
  policy = new NewestKSelectionPolicy(conf);
  res = policy.listSelectedVersions(versions);
  // The expected result starts right after the versions that were not selected.
  idx = policy.getVersionsSelected();
  Assert.assertEquals(res.size(), versions.size() - policy.getVersionsSelected());
  for (DatasetVersion v : res) {
    Assert.assertEquals(v, versions.get(idx), "Mismatch for index " + idx);
    idx++;
  }

  // notSelectedVersions 14 > 10
  conf = ConfigFactory.empty().withValue(NewestKSelectionPolicy.NEWEST_K_VERSIONS_NOTSELECTED_KEY,
      ConfigValueFactory.fromAnyRef(14));
  policy = new NewestKSelectionPolicy(conf);
  res = policy.listSelectedVersions(versions);
  Assert.assertEquals(res.size(), 0);
}
use of org.apache.gobblin.data.management.version.DatasetVersion in project incubator-gobblin by apache.
the class CombineRetentionPolicyTest method testUnion.
/**
 * Configures a {@link CombineRetentionPolicy} with three sub-policies combined via UNION and checks
 * that, out of the versions "a", "abc", "abcd", "bc" and "d", exactly "a", "abc", "abcd" and "bc" are
 * reported as deletable.
 */
@Test
public void testUnion() throws IOException {
  Class<?>[] subPolicies = {
      ContainsARetentionPolicy.class, ContainsBRetentionPolicy.class, ContainsCRetentionPolicy.class};

  Properties config = new Properties();
  // Sub-policies are registered under the keys <prefix>1, <prefix>2, <prefix>3.
  for (int i = 0; i < subPolicies.length; i++) {
    config.setProperty(CombineRetentionPolicy.RETENTION_POLICIES_PREFIX + (i + 1),
        subPolicies[i].getCanonicalName());
  }
  config.setProperty(CombineRetentionPolicy.DELETE_SETS_COMBINE_OPERATION,
      CombineRetentionPolicy.DeletableCombineOperation.UNION.name());

  CombineRetentionPolicy policy = new CombineRetentionPolicy(config);

  Collection<DatasetVersion> deletableVersions = policy.listDeletableVersions(
      Lists.<DatasetVersion>newArrayList(
          new StringDatasetVersion("a", new Path("/")),
          new StringDatasetVersion("abc", new Path("/")),
          new StringDatasetVersion("abcd", new Path("/")),
          new StringDatasetVersion("bc", new Path("/")),
          new StringDatasetVersion("d", new Path("/"))));

  // Reduce the result to its version strings so it can be compared order-independently.
  Set<String> actualDeletableVersions = Sets.newHashSet();
  for (DatasetVersion deletable : deletableVersions) {
    actualDeletableVersions.add(((StringDatasetVersion) deletable).getVersion());
  }

  Assert.assertEquals(deletableVersions.size(), 4);
  Assert.assertEquals(actualDeletableVersions, Sets.newHashSet("abcd", "abc", "a", "bc"));
}
Aggregations