Search in sources :

Example 81 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class PartitionLevelWatermarkerTest method testExpectedHighWatermarkNoPreviousState.

/**
 * Checks the watermark workunits generated when the {@link SourceState} carries no previous
 * workunit states.
 *
 * <p>Two tables with one partition each are registered with the watermarker. The test then
 * asserts that:
 * <ul>
 *   <li>the previous high watermark of every table and partition is {@code 0};</li>
 *   <li>{@code onGetWorkunitsEnd} adds one workunit per table, each flagged with
 *       {@link PartitionLevelWatermarker#IS_WATERMARK_WORKUNIT_KEY};</li>
 *   <li>each workunit's dataset URN is the table's complete name and its expected high
 *       watermark maps the partition key to the value passed to
 *       {@code onPartitionProcessBegin}.</li>
 * </ul>
 */
@Test
public void testExpectedHighWatermarkNoPreviousState() throws Exception {
    String dbName = "testExpectedHighWatermarkNoPreviousState";
    LocalHiveMetastoreTestUtils.getInstance().dropDatabaseIfExists(dbName);
    long now = new DateTime().getMillis();
    // Named once so the values fed to the watermarker and the values asserted below cannot drift apart.
    long part1ExpectedWatermark = now + 2015L;
    long part2ExpectedWatermark = now + 16L;
    SourceState state = new SourceState();
    PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);
    Table table = localTestTable(dbName, "testTable1", true);
    Partition part1 = localTestPartition(table, Lists.newArrayList("2015"));
    watermarker.onTableProcessBegin(table, 0L);
    watermarker.onPartitionProcessBegin(part1, 0L, part1ExpectedWatermark);
    Table table2 = localTestTable(dbName, "testTable2", true);
    Partition part2 = localTestPartition(table2, Lists.newArrayList("2016"));
    watermarker.onTableProcessBegin(table2, 0L);
    watermarker.onPartitionProcessBegin(part2, 0L, part2ExpectedWatermark);
    List<WorkUnit> workunits = Lists.newArrayList();
    watermarker.onGetWorkunitsEnd(workunits);
    // With no previous state, every previous high watermark is expected to be 0.
    Assert.assertEquals(watermarker.getPreviousHighWatermark(part1).getValue(), 0L);
    Assert.assertEquals(watermarker.getPreviousHighWatermark(table).getValue(), 0L);
    Assert.assertEquals(watermarker.getPreviousHighWatermark(part2).getValue(), 0L);
    Assert.assertEquals(watermarker.getPreviousHighWatermark(table2).getValue(), 0L);
    Assert.assertEquals(workunits.size(), 2);
    for (WorkUnit workunit : workunits) {
        Assert.assertTrue(workunit.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY));
    }
    // Sort by dataset URN so the positional assertions below do not depend on generation order.
    Collections.sort(workunits, new Comparator<WorkUnit>() {

        @Override
        public int compare(WorkUnit o1, WorkUnit o2) {
            return o1.getProp(ConfigurationKeys.DATASET_URN_KEY).compareTo(o2.getProp(ConfigurationKeys.DATASET_URN_KEY));
        }
    });
    Assert.assertEquals(workunits.get(0).getProp(ConfigurationKeys.DATASET_URN_KEY), table.getCompleteName());
    Assert.assertEquals(workunits.get(1).getProp(ConfigurationKeys.DATASET_URN_KEY), table2.getCompleteName());
    Assert.assertEquals(workunits.get(0).getExpectedHighWatermark(MultiKeyValueLongWatermark.class).getWatermarks(), ImmutableMap.of(PartitionLevelWatermarker.partitionKey(part1), part1ExpectedWatermark));
    Assert.assertEquals(workunits.get(1).getExpectedHighWatermark(MultiKeyValueLongWatermark.class).getWatermarks(), ImmutableMap.of(PartitionLevelWatermarker.partitionKey(part2), part2ExpectedWatermark));
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) SourceState(org.apache.gobblin.configuration.SourceState) Table(org.apache.hadoop.hive.ql.metadata.Table) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) DateTime(org.joda.time.DateTime) Test(org.testng.annotations.Test)

Example 82 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class PartitionLevelWatermarkerTest method testRecentlyModifiedPartitionWatermarksWithPreviousState.

/**
 * Previous state 3. New partitions 3. 2 from new state retained.
 *
 * <p>The previous workunit state holds watermarks for partitions 2010, 2011 and 2012, and three
 * new partitions (2009, 2013, 2014) are processed. The least watermark to persist is set to
 * five days ago. The test expects the single generated watermark workunit to contain only the
 * entries at or above that threshold: 2012 from the previous state plus 2013 and 2014 from the
 * new partitions.
 */
@Test
public void testRecentlyModifiedPartitionWatermarksWithPreviousState() throws Exception {
    String dbName = "testRecentlyModifiedPartitionWatermarksWithPreviousState";
    LocalHiveMetastoreTestUtils.getInstance().dropDatabaseIfExists(dbName);
    long time5DaysAgo = new DateTime().minusDays(5).getMillis();
    WorkUnitState previousWus = new WorkUnitState();
    previousWus.setProp(ConfigurationKeys.DATASET_URN_KEY, dbName + "@testTable2");
    previousWus.setProp(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY, true);
    previousWus.setActualHighWatermark(new MultiKeyValueLongWatermark(ImmutableMap.of(
        // Do not retain
        "2010", time5DaysAgo - 100L,
        // Do not retain
        "2011", time5DaysAgo - 101L,
        // Do retain
        "2012", time5DaysAgo + 102L)));
    SourceState state = new SourceState(new State(), Lists.newArrayList(previousWus));
    state.setProp(HiveSource.HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY, 3);
    PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);
    watermarker.setLeastWatermarkToPersistInState(time5DaysAgo);
    Table table = localTestTable(dbName, "testTable2", true);
    // Watermark not retained
    Partition part2009 = localTestPartition(table, ImmutableList.of("2009"));
    // Watermark retained
    Partition part2013 = localTestPartition(table, ImmutableList.of("2013"));
    Partition part2014 = localTestPartition(table, ImmutableList.of("2014"));
    watermarker.onTableProcessBegin(table, 0L);
    // Watermark not retained
    watermarker.onPartitionProcessBegin(part2009, 0L, time5DaysAgo - 99L);
    // Watermark retained
    watermarker.onPartitionProcessBegin(part2013, 0L, time5DaysAgo + 103L);
    watermarker.onPartitionProcessBegin(part2014, 0L, time5DaysAgo + 104L);
    List<WorkUnit> workunits = Lists.newArrayList();
    watermarker.onGetWorkunitsEnd(workunits);
    Assert.assertEquals(workunits.size(), 1);
    WorkUnit watermarkWu = workunits.get(0);
    Map<String, Long> workunitWatermarks = watermarkWu.getExpectedHighWatermark(MultiKeyValueLongWatermark.class).getWatermarks();
    Assert.assertEquals(workunitWatermarks.size(), 3, "expectedHighWatermarks size");
    ImmutableMap<String, Long> expectedWatermarks = ImmutableMap.of("2014", time5DaysAgo + 104L, "2013", time5DaysAgo + 103L, "2012", time5DaysAgo + 102L);
    Assert.assertEquals(workunitWatermarks, expectedWatermarks);
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) SourceState(org.apache.gobblin.configuration.SourceState) Table(org.apache.hadoop.hive.ql.metadata.Table) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) DateTime(org.joda.time.DateTime) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) SourceState(org.apache.gobblin.configuration.SourceState) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 83 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class CopySourcePrioritizationTest method testPrioritization.

/**
 * This test uses a prioritizer that preferentially copies the lower file sets of each dataset.
 *
 * <p>The copy resource pool is configured with 8 entities and a tolerance of 1. After
 * flattening, exactly 8 workunits are expected, covering files {@code f1} and {@code f2} of
 * file sets {@code fs0} and {@code fs1} in datasets {@code d0} and {@code d1}.
 */
@Test
public void testPrioritization() throws Exception {
    String localFsUri = "file:///";
    String maxCopyKeyPrefix = CopyConfiguration.MAX_COPY_PREFIX + ".";
    String[] expectedPaths = {
        "d0.fs0.f1", "d0.fs0.f2",
        "d0.fs1.f1", "d0.fs1.f2",
        "d1.fs0.f1", "d1.fs0.f2",
        "d1.fs1.f1", "d1.fs1.f2"
    };

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, localFsUri);
    state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, localFsUri);
    state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
    state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, MyFinder.class.getName());
    state.setProp(CopyConfiguration.PRIORITIZER_ALIAS_KEY, MyPrioritizer.class.getName());
    state.setProp(maxCopyKeyPrefix + CopyResourcePool.ENTITIES_KEY, 8);
    state.setProp(maxCopyKeyPrefix + CopyResourcePool.TOLERANCE_KEY, 1);

    List<WorkUnit> flattened = JobLauncherUtils.flattenWorkUnits(new CopySource().getWorkunits(state));
    Assert.assertEquals(flattened.size(), 8);

    List<String> actualPaths = extractPaths(flattened);
    for (String expectedPath : expectedPaths) {
        Assert.assertTrue(actualPaths.contains(expectedPath));
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 84 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class CopySourcePrioritizationTest method extractPaths.

/**
 * Collects the origin path of the copy entity carried by each workunit.
 *
 * @param workUnits workunits whose serialized copy entity is a {@code CopyableFile}
 * @return the origin paths as strings, in the same order as {@code workUnits}
 */
private List<String> extractPaths(List<WorkUnit> workUnits) {
    List<String> originPaths = Lists.newArrayList();
    for (WorkUnit workUnit : workUnits) {
        // The cast fails with ClassCastException if a workunit carries any other kind of copy entity.
        CopyableFile copyableFile = (CopyableFile) CopySource.deserializeCopyEntity(workUnit);
        String originPath = copyableFile.getOrigin().getPath().toString();
        originPaths.add(originPath);
    }
    return originPaths;
}
Also used : WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 85 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class CopySourceTest method testPartitionableDataset.

/**
 * Runs {@code CopySource} against a partitionable test dataset and checks how the resulting
 * workunits are grouped into extracts.
 *
 * <p>Every flattened workunit must carry a {@code CopyableFile} whose origin path starts with
 * the expected prefix and whose destination owner/permission matches the test dataset. Files
 * whose numeric name is below {@code TestCopyablePartitionableDataset.THRESHOLD} must all share
 * one extract, the remaining files must all share another, and both groups must be non-empty.
 */
@Test
public void testPartitionableDataset() throws Exception {
    String localFsUri = "file:///";
    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, localFsUri);
    state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, localFsUri);
    state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
    state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, TestCopyablePartitionableDatasedFinder.class.getCanonicalName());

    List<WorkUnit> flattened = JobLauncherUtils.flattenWorkUnits(new CopySource().getWorkunits(state));
    Assert.assertEquals(flattened.size(), TestCopyableDataset.FILE_COUNT);

    // The first extract seen on each side of the threshold becomes the reference for that side.
    Extract belowThresholdExtract = null;
    Extract aboveThresholdExtract = null;
    for (WorkUnit current : flattened) {
        CopyableFile file = (CopyableFile) CopySource.deserializeCopyEntity(current);
        String originPath = file.getOrigin().getPath().toString();
        Assert.assertTrue(originPath.startsWith(TestCopyableDataset.ORIGIN_PREFIX));
        Assert.assertEquals(file.getDestinationOwnerAndPermission(), TestCopyableDataset.OWNER_AND_PERMISSION);

        // The file name is parsed as an integer to decide which side of the threshold it falls on.
        int fileNumber = Integer.parseInt(file.getOrigin().getPath().getName());
        boolean atOrAboveThreshold = !(fileNumber < TestCopyablePartitionableDataset.THRESHOLD);
        if (atOrAboveThreshold) {
            if (aboveThresholdExtract == null) {
                aboveThresholdExtract = current.getExtract();
            }
            Assert.assertEquals(current.getExtract(), aboveThresholdExtract);
        } else {
            if (belowThresholdExtract == null) {
                belowThresholdExtract = current.getExtract();
            }
            Assert.assertEquals(current.getExtract(), belowThresholdExtract);
        }
    }
    Assert.assertNotNull(aboveThresholdExtract);
    Assert.assertNotNull(belowThresholdExtract);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Aggregations

WorkUnit (org.apache.gobblin.source.workunit.WorkUnit) 133; Test (org.testng.annotations.Test) 59; SourceState (org.apache.gobblin.configuration.SourceState) 40; WorkUnitState (org.apache.gobblin.configuration.WorkUnitState) 40; MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit) 35; Extract (org.apache.gobblin.source.workunit.Extract) 24; Path (org.apache.hadoop.fs.Path) 19; State (org.apache.gobblin.configuration.State) 13; IOException (java.io.IOException) 11; ArrayList (java.util.ArrayList) 10; Closer (com.google.common.io.Closer) 9; Properties (java.util.Properties) 9; WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval) 8; List (java.util.List) 7; Table (org.apache.hadoop.hive.ql.metadata.Table) 7; ImmutableMap (com.google.common.collect.ImmutableMap) 6; Config (com.typesafe.config.Config) 6; File (java.io.File) 6; IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder) 6; WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream) 6