Search in sources :

Example 1 with FileSystemDatasetVersion

use of org.apache.gobblin.data.management.version.FileSystemDatasetVersion in project incubator-gobblin by apache.

the class MultiVersionCleanableDatasetBase method clean.

/**
 * Runs every configured retention step for this dataset.
 *
 * <ul>
 * <li>{@link MultiVersionCleanableDatasetBase#getVersionFindersAndPolicies()} supplies the
 * {@link VersionFinderAndPolicy}s, which are processed one after another.
 * <li>Each {@link VersionFinderAndPolicy} pairs a {@link VersionFinder} with a {@link VersionSelectionPolicy} and may
 * additionally carry {@link RetentionAction}s.
 * <li>The {@link FileSystemDatasetVersion}s found by {@link VersionFinderAndPolicy#versionFinder} are sorted in
 * reverse natural order and handed to {@link VersionFinderAndPolicy#versionSelectionPolicy}. A pair for which no
 * version is found is skipped with a warning.
 * <li>The versions selected by the policy are the deletable ones; they are passed on for deletion (together with
 * parent directories that become empty).
 * <li>Every {@link RetentionAction} from {@link VersionFinderAndPolicy#getRetentionActions()} is then executed via
 * {@link RetentionAction#execute(List)} with all versions found, not only the deletable ones. A failing action is
 * logged and does not prevent the remaining actions or pairs from running.
 * </ul>
 *
 * <p>Nothing happens for a blacklisted dataset apart from an info log entry.
 *
 * @throws IOException if the version class of a finder is not assignable to the version class of its policy
 * @throws RuntimeException once all pairs have been processed, if at least one {@link RetentionAction} failed
 */
@Override
public void clean() throws IOException {
    if (this.isDatasetBlacklisted) {
        this.log.info("Dataset blacklisted. Cleanup skipped for " + datasetRoot());
        return;
    }

    // Remembered across all finder/policy pairs so one bad action cannot hide the others' work.
    boolean retentionActionFailed = false;

    for (VersionFinderAndPolicy<T> finderAndPolicy : getVersionFindersAndPolicies()) {
        VersionSelectionPolicy<T> policy = finderAndPolicy.getVersionSelectionPolicy();
        VersionFinder<? extends T> finder = finderAndPolicy.getVersionFinder();

        boolean compatible = policy.versionClass().isAssignableFrom(finder.versionClass());
        if (!compatible) {
            throw new IOException("Incompatible dataset version classes.");
        }

        String finderName = finder.getClass().getName();
        this.log.info(String.format("Cleaning dataset %s. Using version finder %s and policy %s", this, finderName, policy));

        List<T> foundVersions = Lists.newArrayList(finder.findDatasetVersions(this));
        if (foundVersions.isEmpty()) {
            this.log.warn("No dataset version can be found. Ignoring.");
            continue;
        }

        // The policy receives the versions in reverse natural order.
        Collections.sort(foundVersions, Collections.reverseOrder());
        Collection<T> toDelete = policy.listSelectedVersions(foundVersions);
        cleanImpl(toDelete);

        // Retention actions work on the general DatasetVersion type and see every version found.
        List<DatasetVersion> everyVersion = Lists.newArrayList();
        everyVersion.addAll(foundVersions);

        for (RetentionAction action : finderAndPolicy.getRetentionActions()) {
            try {
                action.execute(everyVersion);
            } catch (Throwable failure) {
                retentionActionFailed = true;
                String actionName = action.getClass().getName();
                this.log.error(String.format("RetentionAction %s failed for dataset %s", actionName, this.datasetRoot()), failure);
            }
        }
    }

    if (retentionActionFailed) {
        throw new RuntimeException(String.format("At least one failure happened while processing %s. Look for previous logs for failures", datasetRoot()));
    }
}
Also used : DatasetVersion(org.apache.gobblin.data.management.version.DatasetVersion) FileSystemDatasetVersion(org.apache.gobblin.data.management.version.FileSystemDatasetVersion) IOException(java.io.IOException) RetentionAction(org.apache.gobblin.data.management.retention.action.RetentionAction)

Example 2 with FileSystemDatasetVersion

use of org.apache.gobblin.data.management.version.FileSystemDatasetVersion in project incubator-gobblin by apache.

the class ReplicationDataValidPathPicker method getValidPaths.

/**
 * Picks one path per selected dataset version of the given end point.
 *
 * <p>The version finder class and the selection policy class are read from the end point's selection config
 * (keys {@code FINDER_CLASS} and {@code POLICY_CLASS}) and instantiated reflectively. The versions found are sorted
 * in reverse natural order before the policy is applied; for every version the policy selects, the first path
 * returned by its path iterator is added to the result.
 *
 * @param hadoopFsEndPoint end point providing the selection config and the file system URI
 * @return the first path of every selected version, in the iteration order of the policy's result
 * @throws IOException declared for the file system lookup and the version discovery
 * @throws IllegalArgumentException if the finder or the policy class cannot be loaded or instantiated
 */
@SuppressWarnings("unchecked")
public static Collection<Path> getValidPaths(HadoopFsEndPoint hadoopFsEndPoint) throws IOException {
    Config selectionConfig = hadoopFsEndPoint.getSelectionConfig();
    FileSystemDataset endPointDataset = new HadoopFsEndPointDataset(hadoopFsEndPoint);
    FileSystem fileSystem = FileSystem.get(hadoopFsEndPoint.getFsURI(), new Configuration());

    /*
     * Use {@link FileSystemDatasetVersion} as
     * {@link DateTimeDatasetVersionFinder} / {@link GlobModTimeDatasetVersionFinder} use {@link TimestampedDatasetVersion}
     * {@link SingleVersionFinder} uses {@link FileStatusDatasetVersion}
     */
    VersionFinder<FileSystemDatasetVersion> versionFinder;
    try {
        Class<?> finderClass = Class.forName(selectionConfig.getString(FINDER_CLASS));
        versionFinder = (VersionFinder<FileSystemDatasetVersion>) ConstructorUtils.invokeConstructor(finderClass, fileSystem, selectionConfig);
    } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException | ClassNotFoundException e) {
        throw new IllegalArgumentException(e);
    }

    // Versions are discovered before the policy is instantiated; keep this order so failures surface unchanged.
    List<FileSystemDatasetVersion> sortedVersions = Ordering.natural().reverse().sortedCopy(versionFinder.findDatasetVersions(endPointDataset));

    VersionSelectionPolicy<FileSystemDatasetVersion> selectionPolicy;
    try {
        Class<?> policyClass = Class.forName(selectionConfig.getString(POLICY_CLASS));
        selectionPolicy = (VersionSelectionPolicy<FileSystemDatasetVersion>) ConstructorUtils.invokeConstructor(policyClass, selectionConfig);
    } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException | ClassNotFoundException e) {
        throw new IllegalArgumentException(e);
    }

    List<Path> validPaths = new ArrayList<Path>();
    for (FileSystemDatasetVersion selected : selectionPolicy.listSelectedVersions(sortedVersions)) {
        // Only the first path of each version is taken.
        Path firstPath = selected.getPaths().iterator().next();
        validPaths.add(firstPath);
    }
    return validPaths;
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystemDataset(org.apache.gobblin.dataset.FileSystemDataset) Configuration(org.apache.hadoop.conf.Configuration) Config(com.typesafe.config.Config) FileSystemDatasetVersion(org.apache.gobblin.data.management.version.FileSystemDatasetVersion) ArrayList(java.util.ArrayList) InvocationTargetException(java.lang.reflect.InvocationTargetException) FileSystem(org.apache.hadoop.fs.FileSystem)

Example 3 with FileSystemDatasetVersion

use of org.apache.gobblin.data.management.version.FileSystemDatasetVersion in project incubator-gobblin by apache.

the class HiddenFilterSelectionPolicyTest method testListSelectedVersions.

/**
 * Checks that {@link HiddenFilterSelectionPolicy} selects exactly the two versions under
 * {@code /data/dataset/versions} out of four candidates, two of which live under a {@code .temp} or {@code _temp}
 * directory. The hidden-file prefixes are supplied once as a list and once as the string {@code "_,."}.
 */
@Test
public void testListSelectedVersions() throws Exception {
    Path visibleVersion1 = new Path("/data/dataset/versions/version1");
    Path visibleVersion2 = new Path("/data/dataset/versions/version2");
    Path dotPrefixed = new Path("/data/dataset/.temp/tmpPath");
    Path underscorePrefixed = new Path("/data/dataset/_temp/tmpPath");

    // Only these two paths may show up in the selection.
    Set<String> expectedPaths = new HashSet<>();
    expectedPaths.add(visibleVersion1.toString());
    expectedPaths.add(visibleVersion2.toString());

    List<FileSystemDatasetVersion> candidates = new ArrayList<>();
    for (Path candidatePath : Arrays.asList(visibleVersion1, visibleVersion2, dotPrefixed, underscorePrefixed)) {
        candidates.add(new TimestampedDatasetVersion(new DateTime(), candidatePath));
    }

    // Same prefixes, two config representations: a list value and a comma-separated string value.
    String prefixKey = HiddenFilterSelectionPolicy.HIDDEN_FILTER_HIDDEN_FILE_PREFIX_KEY;
    List<String> prefixList = Arrays.asList("_", ".");
    Config listValuedConfig = ConfigFactory.parseMap(ImmutableMap.of(prefixKey, prefixList));
    Config stringValuedConfig = ConfigFactory.parseMap(ImmutableMap.of(prefixKey, "_,."));

    for (Config config : Arrays.asList(listValuedConfig, stringValuedConfig)) {
        HiddenFilterSelectionPolicy policy = new HiddenFilterSelectionPolicy(config);
        Collection<FileSystemDatasetVersion> selected = policy.listSelectedVersions(candidates);
        Assert.assertEquals(selected.size(), 2);
        for (FileSystemDatasetVersion version : selected) {
            for (Path selectedPath : version.getPaths()) {
                Assert.assertTrue(expectedPaths.contains(selectedPath.toString()));
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TimestampedDatasetVersion(org.apache.gobblin.data.management.version.TimestampedDatasetVersion) Config(com.typesafe.config.Config) FileSystemDatasetVersion(org.apache.gobblin.data.management.version.FileSystemDatasetVersion) ArrayList(java.util.ArrayList) DateTime(org.joda.time.DateTime) HashSet(java.util.HashSet) Test(org.testng.annotations.Test)

Example 4 with FileSystemDatasetVersion

use of org.apache.gobblin.data.management.version.FileSystemDatasetVersion in project incubator-gobblin by apache.

the class FsCleanableHelperTest method testDeleteEmptyDirs.

/**
 * Creates four hour-level directories below a dataset root, cleans three of them through
 * {@link FsCleanableHelper#clean} and verifies that the cleaned directories are gone, that the parent directory
 * {@code 2016/01/01} (left without children) is gone as well, and that the untouched directory and its parent
 * {@code 2016/01/02} still exist.
 */
@Test
public void testDeleteEmptyDirs() throws Exception {
    Properties helperProps = new Properties();
    helperProps.setProperty(FsCleanableHelper.SKIP_TRASH_KEY, Boolean.toString(true));
    FsCleanableHelper helper = new FsCleanableHelper(this.fs, helperProps, ConfigFactory.empty(), log);

    Path datasetRoot = new Path(testTempPath, "dataset1");
    FileSystemDataset dataset = mock(FileSystemDataset.class);
    when(dataset.datasetRoot()).thenReturn(datasetRoot);

    // Directories that will be handed to the helper for deletion
    List<Path> toDelete = ImmutableList.of(
        new Path(datasetRoot, "2016/01/01/13"),
        new Path(datasetRoot, "2016/01/01/14"),
        new Path(datasetRoot, "2016/01/02/15"));
    // Directory that must survive the clean
    Path toKeep = new Path(datasetRoot, "2016/01/02/16");

    for (Path dir : toDelete) {
        this.fs.mkdirs(dir);
    }
    this.fs.mkdirs(toKeep);

    // Make sure all paths are created
    for (Path dir : toDelete) {
        Assert.assertTrue(this.fs.exists(dir));
    }
    Assert.assertTrue(this.fs.exists(toKeep));

    ImmutableList.Builder<FileSystemDatasetVersion> versionsBuilder = ImmutableList.builder();
    for (Path dir : toDelete) {
        versionsBuilder.add(new MockFileSystemDatasetVersion(dir));
    }
    List<FileSystemDatasetVersion> deletableVersions = versionsBuilder.build();

    helper.clean(deletableVersions, dataset);

    // Verify versions are deleted
    for (Path dir : toDelete) {
        Assert.assertFalse(this.fs.exists(dir));
    }
    // Verify versions are not deleted
    Assert.assertTrue(this.fs.exists(toKeep));
    // Verify empty parent dir "2016/01/01" is deleted
    Assert.assertFalse(this.fs.exists(toDelete.get(0).getParent()));
    // Verify non empty parent dir "2016/01/02" exists
    Assert.assertTrue(this.fs.exists(toKeep.getParent()));
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystemDataset(org.apache.gobblin.dataset.FileSystemDataset) FsCleanableHelper(org.apache.gobblin.data.management.retention.dataset.FsCleanableHelper) FileSystemDatasetVersion(org.apache.gobblin.data.management.version.FileSystemDatasetVersion) Properties(java.util.Properties) Test(org.testng.annotations.Test)

Example 5 with FileSystemDatasetVersion

use of org.apache.gobblin.data.management.version.FileSystemDatasetVersion in project incubator-gobblin by apache.

the class ConfigurableCleanableDatasetTest method testDatasetIsBlacklisted.

/**
 * Builds a {@link ConfigurableCleanableDataset} from a config in which
 * {@code gobblin.retention.dataset.is.blacklisted} is {@code "true"} and verifies that the dataset reports itself
 * as blacklisted.
 */
@Test
public void testDatasetIsBlacklisted() throws Exception {
    Config retentionConfig = ConfigFactory.parseMap(ImmutableMap.<String, String>of(
        "gobblin.retention.version.finder.class", "org.apache.gobblin.data.management.version.finder.WatermarkDatasetVersionFinder",
        "gobblin.retention.selection.policy.class", "org.apache.gobblin.data.management.policy.NewestKSelectionPolicy",
        "gobblin.retention.selection.newestK.versionsSelected", "2",
        "gobblin.retention.dataset.is.blacklisted", "true"));

    FileSystem localFs = FileSystem.get(new URI(ConfigurationKeys.LOCAL_FS_URI), new Configuration());
    Properties jobProps = new Properties();
    Path datasetRoot = new Path("/someroot");

    ConfigurableCleanableDataset<FileSystemDatasetVersion> dataset =
        new ConfigurableCleanableDataset<FileSystemDatasetVersion>(localFs, jobProps, datasetRoot, retentionConfig,
            LoggerFactory.getLogger(ConfigurableCleanableDatasetTest.class));

    Assert.assertTrue(dataset.isDatasetBlacklisted());
}
Also used : Path(org.apache.hadoop.fs.Path) ConfigurableCleanableDataset(org.apache.gobblin.data.management.retention.dataset.ConfigurableCleanableDataset) Configuration(org.apache.hadoop.conf.Configuration) Config(com.typesafe.config.Config) FileSystemDatasetVersion(org.apache.gobblin.data.management.version.FileSystemDatasetVersion) Properties(java.util.Properties) URI(java.net.URI) Test(org.testng.annotations.Test)

Aggregations

FileSystemDatasetVersion (org.apache.gobblin.data.management.version.FileSystemDatasetVersion)9 Path (org.apache.hadoop.fs.Path)8 Config (com.typesafe.config.Config)6 Test (org.testng.annotations.Test)6 Properties (java.util.Properties)5 Configuration (org.apache.hadoop.conf.Configuration)5 URI (java.net.URI)4 ConfigurableCleanableDataset (org.apache.gobblin.data.management.retention.dataset.ConfigurableCleanableDataset)4 ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 FileSystemDataset (org.apache.gobblin.dataset.FileSystemDataset)2 ImmutableMap (com.google.common.collect.ImmutableMap)1 IOException (java.io.IOException)1 InvocationTargetException (java.lang.reflect.InvocationTargetException)1 Map (java.util.Map)1 RetentionAction (org.apache.gobblin.data.management.retention.action.RetentionAction)1 FsCleanableHelper (org.apache.gobblin.data.management.retention.dataset.FsCleanableHelper)1 DatasetVersion (org.apache.gobblin.data.management.version.DatasetVersion)1 TimestampedDatasetVersion (org.apache.gobblin.data.management.version.TimestampedDatasetVersion)1 FileSystem (org.apache.hadoop.fs.FileSystem)1