Search in sources :

Example 1 with FileSystemDataset

use of org.apache.gobblin.dataset.FileSystemDataset in project incubator-gobblin by apache.

the class PinotAuditCountVerifierTest method testTier.

/**
 * Runs {@code CompactionAuditCountVerifier.verify} against one dataset under three different
 * sets of per-tier audit counts and checks the reported success flag for each.
 */
@Test
public void testTier() throws Exception {
    final String topic = "randomTopic";
    final String inputDir = "/base/input";
    final String outputDir = "/base/output";
    final String inputSubDir = "hourly";
    final String outputSubDir = "hourly";
    // NOTE(review): the segments are joined as-is, with no "/" between them — confirm this is intended.
    final String datasetLocation = inputDir + topic + inputSubDir + "/2017/04/03/10";

    TestAuditCountClient auditClient = new TestAuditCountClient();
    // Minimal dataset whose root and URN are both the location built above.
    FileSystemDataset hourlyDataset = new FileSystemDataset() {

        @Override
        public Path datasetRoot() {
            return new Path(datasetLocation);
        }

        @Override
        public String datasetURN() {
            return datasetLocation;
        }
    };

    State verifierState = new State();
    // Tier names used by the verifier.
    verifierState.setProp(CompactionAuditCountVerifier.PRODUCER_TIER, PRODUCER_TIER);
    verifierState.setProp(CompactionAuditCountVerifier.ORIGIN_TIER, ORIGIN_TIER);
    verifierState.setProp(CompactionAuditCountVerifier.GOBBLIN_TIER, GOBBLIN_TIER);
    // Compaction source / destination layout.
    verifierState.setProp(MRCompactor.COMPACTION_INPUT_DIR, inputDir);
    verifierState.setProp(MRCompactor.COMPACTION_INPUT_SUBDIR, inputSubDir);
    verifierState.setProp(MRCompactor.COMPACTION_DEST_DIR, outputDir);
    verifierState.setProp(MRCompactor.COMPACTION_DEST_SUBDIR, outputSubDir);
    verifierState.setProp(MRCompactor.COMPACTION_TMP_DEST_DIR, "/tmp/compaction/verifier");
    // Time window settings for the time-based dataset finder.
    verifierState.setProp(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MAX_TIME_AGO, "3000d");
    verifierState.setProp(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MIN_TIME_AGO, "1d");

    CompactionAuditCountVerifier auditVerifier = new CompactionAuditCountVerifier(verifierState, auditClient);

    // Case 1: every tier reports the same count, so verification succeeds.
    auditClient.setCounts(ImmutableMap.of(PRODUCER_TIER, 1000L, ORIGIN_TIER, 1000L, GOBBLIN_TIER, 1000L));
    Assert.assertTrue(auditVerifier.verify(hourlyDataset).isSuccessful);

    // Case 2: still succeeds because GOBBLIN_TIER / PRODUCER_TIER is above the threshold.
    auditClient.setCounts(ImmutableMap.of(PRODUCER_TIER, 1000L, ORIGIN_TIER, 1100L, GOBBLIN_TIER, 1000L));
    Assert.assertTrue(auditVerifier.verify(hourlyDataset).isSuccessful);

    // Case 3: fails because GOBBLIN_TIER / (PRODUCER_TIER || ORIGIN_TIER) is below the threshold.
    auditClient.setCounts(ImmutableMap.of(PRODUCER_TIER, 1100L, ORIGIN_TIER, 1100L, GOBBLIN_TIER, 1000L));
    Assert.assertFalse(auditVerifier.verify(hourlyDataset).isSuccessful);
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystemDataset(org.apache.gobblin.dataset.FileSystemDataset) State(org.apache.gobblin.configuration.State) Test(org.testng.annotations.Test)

Example 2 with FileSystemDataset

use of org.apache.gobblin.dataset.FileSystemDataset in project incubator-gobblin by apache.

the class ReplicationDataValidPathPicker method getValidPaths.

/**
 * Picks the valid paths of a Hadoop file-system end point.
 *
 * <p>A version finder and a version selection policy are both instantiated reflectively from the
 * class names stored in the end point's selection config ({@code FINDER_CLASS} and
 * {@code POLICY_CLASS}). The versions found are sorted in reverse natural order, handed to the
 * policy, and the first path of each selected version is collected.
 *
 * @param hadoopFsEndPoint end point supplying the selection config and the file-system URI
 * @return one path per selected version, in the iteration order of the policy's selection
 * @throws IOException declared for the file-system and version-finder calls made here
 * @throws IllegalArgumentException if the finder or policy class cannot be loaded or constructed
 */
@SuppressWarnings("unchecked")
public static Collection<Path> getValidPaths(HadoopFsEndPoint hadoopFsEndPoint) throws IOException {
    Config config = hadoopFsEndPoint.getSelectionConfig();
    FileSystemDataset endPointDataset = new HadoopFsEndPointDataset(hadoopFsEndPoint);
    FileSystem fileSystem = FileSystem.get(hadoopFsEndPoint.getFsURI(), new Configuration());

    // FileSystemDatasetVersion is used as the version type because
    // DateTimeDatasetVersionFinder / GlobModTimeDatasetVersionFinder use TimestampedDatasetVersion
    // while SingleVersionFinder uses FileStatusDatasetVersion.
    VersionFinder<FileSystemDatasetVersion> versionFinder;
    try {
        Class<?> finderClass = Class.forName(config.getString(FINDER_CLASS));
        versionFinder = (VersionFinder<FileSystemDatasetVersion>) ConstructorUtils.invokeConstructor(finderClass, fileSystem, config);
    } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException | ClassNotFoundException e) {
        throw new IllegalArgumentException(e);
    }

    // Sort in reverse natural order before the policy sees the versions.
    List<FileSystemDatasetVersion> sortedVersions = Ordering.natural().reverse().sortedCopy(versionFinder.findDatasetVersions(endPointDataset));

    VersionSelectionPolicy<FileSystemDatasetVersion> selectionPolicy;
    try {
        Class<?> policyClass = Class.forName(config.getString(POLICY_CLASS));
        selectionPolicy = (VersionSelectionPolicy<FileSystemDatasetVersion>) ConstructorUtils.invokeConstructor(policyClass, config);
    } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException | ClassNotFoundException e) {
        throw new IllegalArgumentException(e);
    }

    Collection<FileSystemDatasetVersion> chosenVersions = selectionPolicy.listSelectedVersions(sortedVersions);
    List<Path> validPaths = new ArrayList<>();
    for (FileSystemDatasetVersion chosen : chosenVersions) {
        // Only the first path of each selected version is kept.
        validPaths.add(chosen.getPaths().iterator().next());
    }
    return validPaths;
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystemDataset(org.apache.gobblin.dataset.FileSystemDataset) Configuration(org.apache.hadoop.conf.Configuration) Config(com.typesafe.config.Config) FileSystemDatasetVersion(org.apache.gobblin.data.management.version.FileSystemDatasetVersion) ArrayList(java.util.ArrayList) InvocationTargetException(java.lang.reflect.InvocationTargetException) FileSystem(org.apache.hadoop.fs.FileSystem)

Example 3 with FileSystemDataset

use of org.apache.gobblin.dataset.FileSystemDataset in project incubator-gobblin by apache.

the class FsCleanableHelperTest method testDeleteEmptyDirs.

/**
 * Creates four hourly directories under one dataset root, asks {@code FsCleanableHelper.clean}
 * to remove three of them, and checks which directories and parent directories remain.
 */
@Test
public void testDeleteEmptyDirs() throws Exception {
    Properties cleanerProps = new Properties();
    cleanerProps.setProperty(FsCleanableHelper.SKIP_TRASH_KEY, Boolean.toString(true));
    FsCleanableHelper cleaner = new FsCleanableHelper(this.fs, cleanerProps, ConfigFactory.empty(), log);

    FileSystemDataset fsDataset = mock(FileSystemDataset.class);
    Path datasetRoot = new Path(testTempPath, "dataset1");
    when(fsDataset.datasetRoot()).thenReturn(datasetRoot);

    // Directories handed to the cleaner for deletion.
    Path[] expiredDirs = {
        new Path(datasetRoot, "2016/01/01/13"),
        new Path(datasetRoot, "2016/01/01/14"),
        new Path(datasetRoot, "2016/01/02/15")
    };
    // Directory that is never handed to the cleaner.
    Path retainedDir = new Path(datasetRoot, "2016/01/02/16");

    for (Path expired : expiredDirs) {
        this.fs.mkdirs(expired);
    }
    this.fs.mkdirs(retainedDir);

    // Precondition: everything was actually created.
    for (Path expired : expiredDirs) {
        Assert.assertTrue(this.fs.exists(expired));
    }
    Assert.assertTrue(this.fs.exists(retainedDir));

    List<FileSystemDatasetVersion> deletableVersions = ImmutableList.<FileSystemDatasetVersion>of(
        new MockFileSystemDatasetVersion(expiredDirs[0]),
        new MockFileSystemDatasetVersion(expiredDirs[1]),
        new MockFileSystemDatasetVersion(expiredDirs[2]));
    cleaner.clean(deletableVersions, fsDataset);

    // The versions handed to the cleaner are gone.
    for (Path expired : expiredDirs) {
        Assert.assertFalse(this.fs.exists(expired));
    }
    // The version that was not handed over is still there.
    Assert.assertTrue(this.fs.exists(retainedDir));
    // Parent "2016/01/01" became empty and was removed as well.
    Assert.assertFalse(this.fs.exists(expiredDirs[0].getParent()));
    // Parent "2016/01/02" still holds the retained directory, so it remains.
    Assert.assertTrue(this.fs.exists(retainedDir.getParent()));
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystemDataset(org.apache.gobblin.dataset.FileSystemDataset) FsCleanableHelper(org.apache.gobblin.data.management.retention.dataset.FsCleanableHelper) FileSystemDatasetVersion(org.apache.gobblin.data.management.version.FileSystemDatasetVersion) Properties(java.util.Properties) Test(org.testng.annotations.Test)

Example 4 with FileSystemDataset

use of org.apache.gobblin.dataset.FileSystemDataset in project incubator-gobblin by apache.

the class ModDateTimeDatasetVersionFinder method findDatasetVersions.

/**
 * Returns a single version for the dataset, timestamped with the modification time of the
 * dataset's root path.
 *
 * @param dataset the dataset to inspect; it is cast to {@link FileSystemDataset}
 * @return a one-element list holding a {@link TimestampedDatasetVersion} for the dataset root
 * @throws IOException declared for the file-status lookup on the dataset root
 */
@Override
public Collection<TimestampedDatasetVersion> findDatasetVersions(Dataset dataset) throws IOException {
    FileSystemDataset fileSystemDataset = (FileSystemDataset) dataset;
    FileStatus rootStatus = this.fs.getFileStatus(fileSystemDataset.datasetRoot());
    // The root's modification time becomes the version timestamp.
    DateTime modifiedAt = new DateTime(rootStatus.getModificationTime());
    TimestampedDatasetVersion rootVersion = new TimestampedDatasetVersion(modifiedAt, fileSystemDataset.datasetRoot());
    return Lists.newArrayList(rootVersion);
}
Also used : FileSystemDataset(org.apache.gobblin.dataset.FileSystemDataset) FileStatus(org.apache.hadoop.fs.FileStatus) TimestampedDatasetVersion(org.apache.gobblin.data.management.version.TimestampedDatasetVersion) DateTime(org.joda.time.DateTime)

Example 5 with FileSystemDataset

use of org.apache.gobblin.dataset.FileSystemDataset in project incubator-gobblin by apache.

the class AbstractDatasetVersionFinder method findDatasetVersions.

/**
 * Find dataset versions under the root of the input dataset. Dataset versions are subdirectories of the
 * dataset root representing a single manageable unit in the dataset.
 * See {@link org.apache.gobblin.data.management.retention.DatasetCleaner} for more information.
 *
 * <p>The root is combined with {@code globVersionPattern()} and globbed on the file system. Each match is
 * passed to {@code getDatasetVersion} together with its path relative to the dataset root; matches for
 * which that call returns {@code null} are skipped.
 *
 * @param dataset the dataset whose versions are searched; it is cast to {@link FileSystemDataset}, so any
 *        other implementation fails with a {@link ClassCastException}.
 * @return Collection of the dataset versions found, in the order the glob returned them; empty if the glob
 *         matched nothing or the file system reported {@code null} for it.
 * @throws IOException declared for the glob lookup on the file system.
 */
@Override
public Collection<T> findDatasetVersions(Dataset dataset) throws IOException {
    FileSystemDataset fsDataset = (FileSystemDataset) dataset;
    // Resolved once: it is reused for every match in the loop below.
    Path datasetRoot = fsDataset.datasetRoot();
    Path versionGlobStatus = new Path(datasetRoot, globVersionPattern());
    FileStatus[] dataSetVersionPaths = this.fs.globStatus(versionGlobStatus);
    List<T> dataSetVersions = Lists.newArrayList();
    if (dataSetVersionPaths == null) {
        // globStatus may return null instead of an empty array (e.g. a non-glob pattern whose path does
        // not exist); treat that as "no versions" rather than failing with a NullPointerException.
        return dataSetVersions;
    }
    for (FileStatus dataSetVersionPath : dataSetVersionPaths) {
        T datasetVersion = getDatasetVersion(PathUtils.relativizePath(dataSetVersionPath.getPath(), datasetRoot), dataSetVersionPath);
        if (datasetVersion != null) {
            dataSetVersions.add(datasetVersion);
        }
    }
    return dataSetVersions;
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystemDataset(org.apache.gobblin.dataset.FileSystemDataset) FileStatus(org.apache.hadoop.fs.FileStatus)

Aggregations

FileSystemDataset (org.apache.gobblin.dataset.FileSystemDataset)5 Path (org.apache.hadoop.fs.Path)4 FileSystemDatasetVersion (org.apache.gobblin.data.management.version.FileSystemDatasetVersion)2 FileStatus (org.apache.hadoop.fs.FileStatus)2 Test (org.testng.annotations.Test)2 Config (com.typesafe.config.Config)1 InvocationTargetException (java.lang.reflect.InvocationTargetException)1 ArrayList (java.util.ArrayList)1 Properties (java.util.Properties)1 State (org.apache.gobblin.configuration.State)1 FsCleanableHelper (org.apache.gobblin.data.management.retention.dataset.FsCleanableHelper)1 TimestampedDatasetVersion (org.apache.gobblin.data.management.version.TimestampedDatasetVersion)1 Configuration (org.apache.hadoop.conf.Configuration)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 DateTime (org.joda.time.DateTime)1