Example 26 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

From class PartitionLevelWatermarkerTest, method testRecentlyModifiedPartitionWatermarks.

// No previous state. Five newly modified partitions; only the three modified after the least watermark to persist (time5DaysAgo) are retained in the expected high watermark
@Test
public void testRecentlyModifiedPartitionWatermarks() throws Exception {
    String dbName = "testRecentlyModifiedPartitionWatermarks";
    LocalHiveMetastoreTestUtils.getInstance().dropDatabaseIfExists(dbName);
    SourceState state = new SourceState();
    state.setProp(HiveSource.HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY, 3);
    long time5DaysAgo = new DateTime().minusDays(5).getMillis();
    PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);
    watermarker.setLeastWatermarkToPersistInState(time5DaysAgo);
    Table table = localTestTable(dbName, "testTable2", true);
    Partition part2010 = localTestPartition(table, ImmutableList.of("2010"));
    Partition part2011 = localTestPartition(table, ImmutableList.of("2011"));
    Partition part2012 = localTestPartition(table, ImmutableList.of("2012"));
    Partition part2013 = localTestPartition(table, ImmutableList.of("2013"));
    Partition part2014 = localTestPartition(table, ImmutableList.of("2014"));
    watermarker.onTableProcessBegin(table, 0L);
    // 2010 and 2011 were modified before time5DaysAgo (the least watermark to persist); 2012-2014 after it
    watermarker.onPartitionProcessBegin(part2010, 0L, time5DaysAgo - 100L);
    watermarker.onPartitionProcessBegin(part2011, 0L, time5DaysAgo - 101L);
    watermarker.onPartitionProcessBegin(part2012, 0L, time5DaysAgo + 102L);
    watermarker.onPartitionProcessBegin(part2013, 0L, time5DaysAgo + 103L);
    watermarker.onPartitionProcessBegin(part2014, 0L, time5DaysAgo + 104L);
    List<WorkUnit> workunits = Lists.newArrayList();
    watermarker.onGetWorkunitsEnd(workunits);
    Assert.assertEquals(workunits.size(), 1);
    WorkUnit watermarkWu = workunits.get(0);
    Map<String, Long> workunitWatermarks = watermarkWu.getExpectedHighWatermark(MultiKeyValueLongWatermark.class).getWatermarks();
    Assert.assertEquals(workunitWatermarks.size(), 3, "expectedHighWatermarks size");
    ImmutableMap<String, Long> expectedWatermarks = ImmutableMap.of("2014", time5DaysAgo + 104L, "2013", time5DaysAgo + 103L, "2012", time5DaysAgo + 102L);
    Assert.assertEquals(workunitWatermarks, expectedWatermarks);
}
Also used: Partition (org.apache.hadoop.hive.ql.metadata.Partition), SourceState (org.apache.gobblin.configuration.SourceState), Table (org.apache.hadoop.hive.ql.metadata.Table), DateTime (org.joda.time.DateTime), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Test (org.testng.annotations.Test)
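
For comparison, a work unit that tracks a single table or partition usually carries its watermarks directly through a WatermarkInterval rather than a dedicated multi-key watermark work unit. A minimal sketch, assuming the standard LongWatermark type; the values are illustrative and reuse time5DaysAgo from the test above:

    // Illustrative sketch (not part of the test above): a low/expected-high watermark pair on one WorkUnit.
    WorkUnit workUnit = WorkUnit.createEmpty();
    workUnit.setWatermarkInterval(
        new WatermarkInterval(new LongWatermark(time5DaysAgo), new LongWatermark(time5DaysAgo + 104L)));
    // The values can be read back later, e.g. by an extractor:
    long low = workUnit.getLowWatermark(LongWatermark.class).getValue();
    long expectedHigh = workUnit.getExpectedHighWatermark(LongWatermark.class).getValue();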

Example 27 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

From class ConcurrentBoundedWorkUnitListTest, method addFiles.

/** Builds {@code fileNumber} copyable files in a file set named {@code fileSetName}, wraps each in a WorkUnit, and returns whether the bounded list accepted the file set. */
public boolean addFiles(ConcurrentBoundedWorkUnitList list, String fileSetName, int fileNumber) throws IOException {
    FileSet.Builder<CopyEntity> partitionBuilder = new FileSet.Builder<>(fileSetName, new DummyDataset(new Path("/path")));
    List<WorkUnit> workUnits = Lists.newArrayList();
    for (int i = 0; i < fileNumber; i++) {
        CopyEntity cf = createCopyableFile(i);
        partitionBuilder.add(cf);
        WorkUnit workUnit = new WorkUnit();
        CopySource.serializeCopyEntity(workUnit, cf);
        workUnits.add(workUnit);
    }
    return list.addFileSet(partitionBuilder.build(), workUnits);
}
Also used: Path (org.apache.hadoop.fs.Path), FileSet (org.apache.gobblin.data.management.partition.FileSet), DummyDataset (org.apache.gobblin.data.management.dataset.DummyDataset), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)
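
The work units built here only carry the serialized CopyEntity; nothing reads it back inside this helper. As Example 30 below shows, CopySource.deserializeCopyEntity recovers the entity from the same work unit. A minimal sketch of that round trip, using only the two calls that appear in Examples 27 and 30:

    // Sketch of the serialize/deserialize round trip; the entity is passed in so the fragment stays self-contained.
    static CopyEntity roundTrip(CopyEntity copyEntity) throws IOException {
        WorkUnit workUnit = new WorkUnit();
        CopySource.serializeCopyEntity(workUnit, copyEntity);   // store the entity in the work unit's properties
        return CopySource.deserializeCopyEntity(workUnit);      // recover it, e.g. on the task side
    }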

Example 28 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

From class CopySourcePrioritizationTest, method testNoPrioritization.

@Test
public void testNoPrioritization() throws Exception {
    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "file:///");
    state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, "file:///");
    state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
    state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, MyFinder.class.getName());
    CopySource source = new CopySource();
    List<WorkUnit> workunits = source.getWorkunits(state);
    workunits = JobLauncherUtils.flattenWorkUnits(workunits);
    // With no prioritization or limits, one work unit is created per file across all datasets and file sets
    Assert.assertEquals(workunits.size(), MyFinder.DATASETS * MyDataset.FILE_SETS * MyFileSet.FILES);
}
Also used: SourceState (org.apache.gobblin.configuration.SourceState), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Test (org.testng.annotations.Test)
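
JobLauncherUtils.flattenWorkUnits is applied here (and in the other CopySource tests) because the source may pack several work units into MultiWorkUnit containers; flattening unwraps them so the assertion counts individual files. A minimal sketch of the behavior, assuming MultiWorkUnit.createEmpty() and addWorkUnit from org.apache.gobblin.source.workunit.MultiWorkUnit:

    // Illustrative only: flattening replaces each MultiWorkUnit with the work units it contains.
    MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
    multiWorkUnit.addWorkUnit(new WorkUnit());
    multiWorkUnit.addWorkUnit(new WorkUnit());
    List<WorkUnit> input = Lists.newArrayList();
    input.add(multiWorkUnit);
    input.add(new WorkUnit());
    // Expected to yield three plain work units: two unwrapped from the MultiWorkUnit plus the standalone one.
    List<WorkUnit> flattened = JobLauncherUtils.flattenWorkUnits(input);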

Example 29 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

From class CopySourcePrioritizationTest, method testUnprioritizedFileLimit.

@Test
public void testUnprioritizedFileLimit() throws Exception {
    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "file:///");
    state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, "file:///");
    state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
    state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, MyFinder.class.getName());
    // Disable parallel listing to make work unit selection deterministic
    state.setProp(CopySource.MAX_CONCURRENT_LISTING_SERVICES, 1);
    state.setProp(CopyConfiguration.MAX_COPY_PREFIX + "." + CopyResourcePool.ENTITIES_KEY, 10);
    state.setProp(CopyConfiguration.MAX_COPY_PREFIX + "." + CopyResourcePool.TOLERANCE_KEY, 1);
    CopySource source = new CopySource();
    List<WorkUnit> workunits = source.getWorkunits(state);
    workunits = JobLauncherUtils.flattenWorkUnits(workunits);
    // Check limited to 10 entities
    Assert.assertEquals(workunits.size(), 10);
    List<String> paths = extractPaths(workunits);
    Assert.assertTrue(paths.contains("d0.fs0.f1"));
    Assert.assertTrue(paths.contains("d0.fs0.f2"));
    Assert.assertTrue(paths.contains("d0.fs1.f1"));
    Assert.assertTrue(paths.contains("d0.fs1.f2"));
    Assert.assertTrue(paths.contains("d0.fs2.f1"));
    Assert.assertTrue(paths.contains("d0.fs2.f2"));
    Assert.assertTrue(paths.contains("d0.fs3.f1"));
    Assert.assertTrue(paths.contains("d0.fs3.f2"));
    Assert.assertTrue(paths.contains("d1.fs0.f1"));
    Assert.assertTrue(paths.contains("d1.fs0.f2"));
}
Also used: SourceState (org.apache.gobblin.configuration.SourceState), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Test (org.testng.annotations.Test)
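
extractPaths is a helper defined elsewhere in the test class. A hypothetical sketch of what such a helper could look like, using only calls that appear in Example 30 below; the real implementation may differ:

    // Hypothetical helper: deserialize each work unit and collect the origin file names (e.g. "d0.fs0.f1").
    private List<String> extractPaths(List<WorkUnit> workUnits) throws IOException {
        List<String> paths = Lists.newArrayList();
        for (WorkUnit workUnit : workUnits) {
            CopyableFile file = (CopyableFile) CopySource.deserializeCopyEntity(workUnit);
            paths.add(file.getOrigin().getPath().getName());
        }
        return paths;
    }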

Example 30 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

From class CopySourceTest, method testCopySource.

@Test
public void testCopySource() throws Exception {
    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "file:///");
    state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, "file:///");
    state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
    state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, TestCopyableDatasetFinder.class.getName());
    CopySource source = new CopySource();
    List<WorkUnit> workunits = source.getWorkunits(state);
    workunits = JobLauncherUtils.flattenWorkUnits(workunits);
    Assert.assertEquals(workunits.size(), TestCopyableDataset.FILE_COUNT);
    // All work units produced for the dataset should share the same Extract (table identity)
    Extract extract = workunits.get(0).getExtract();
    for (WorkUnit workUnit : workunits) {
        CopyableFile file = (CopyableFile) CopySource.deserializeCopyEntity(workUnit);
        Assert.assertTrue(file.getOrigin().getPath().toString().startsWith(TestCopyableDataset.ORIGIN_PREFIX));
        Assert.assertEquals(file.getDestinationOwnerAndPermission(), TestCopyableDataset.OWNER_AND_PERMISSION);
        Assert.assertEquals(workUnit.getExtract(), extract);
    }
}
Also used: SourceState (org.apache.gobblin.configuration.SourceState), Extract (org.apache.gobblin.source.workunit.Extract), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Test (org.testng.annotations.Test)
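
The final assertion relies on every work unit sharing one Extract, i.e. a single table identity for the whole copy. A minimal sketch of that relationship, assuming the Extract(TableType, namespace, table) constructor and the WorkUnit.create(Extract) factory (both assumed here, not taken from this test); the namespace and table names are illustrative:

    // Work units created from the same Extract report the same extract, which is what the loop above asserts.
    Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY, "testNamespace", "testTable");
    WorkUnit first = WorkUnit.create(extract);
    WorkUnit second = WorkUnit.create(extract);
    Assert.assertEquals(first.getExtract(), second.getExtract());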

Aggregations

WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 133
Test (org.testng.annotations.Test): 59
SourceState (org.apache.gobblin.configuration.SourceState): 40
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 40
MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 35
Extract (org.apache.gobblin.source.workunit.Extract): 24
Path (org.apache.hadoop.fs.Path): 19
State (org.apache.gobblin.configuration.State): 13
IOException (java.io.IOException): 11
ArrayList (java.util.ArrayList): 10
Closer (com.google.common.io.Closer): 9
Properties (java.util.Properties): 9
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 8
List (java.util.List): 7
Table (org.apache.hadoop.hive.ql.metadata.Table): 7
ImmutableMap (com.google.common.collect.ImmutableMap): 6
Config (com.typesafe.config.Config): 6
File (java.io.File): 6
IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 6
WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6