use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class PartitionLevelWatermarkerTest method testExpectedHighWatermarkNoPreviousState.
/**
 * Exercises the watermarker when the source state carries no previous workunit states.
 *
 * <p>Two tables, each with a single partition, are announced to the watermarker. The test then
 * expects:
 * <ul>
 *   <li>every previous high watermark (table and partition) to read 0,</li>
 *   <li>exactly two workunits, both flagged as watermark workunits,</li>
 *   <li>one workunit per table (matched by dataset URN), whose expected high watermark maps the
 *       table's partition key to the value handed to {@code onPartitionProcessBegin}.</li>
 * </ul>
 */
@Test
public void testExpectedHighWatermarkNoPreviousState() throws Exception {
  String dbName = "testExpectedHighWatermarkNoPreviousState";
  LocalHiveMetastoreTestUtils.getInstance().dropDatabaseIfExists(dbName);

  long now = new DateTime().getMillis();
  // Values passed as the last argument of onPartitionProcessBegin; asserted back at the end.
  long firstExpectedWatermark = now + 2015L;
  long secondExpectedWatermark = now + 16L;

  SourceState state = new SourceState();
  PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);

  Table firstTable = localTestTable(dbName, "testTable1", true);
  Partition firstPartition = localTestPartition(firstTable, Lists.newArrayList("2015"));
  watermarker.onTableProcessBegin(firstTable, 0L);
  watermarker.onPartitionProcessBegin(firstPartition, 0L, firstExpectedWatermark);

  Table secondTable = localTestTable(dbName, "testTable2", true);
  Partition secondPartition = localTestPartition(secondTable, Lists.newArrayList("2016"));
  watermarker.onTableProcessBegin(secondTable, 0L);
  watermarker.onPartitionProcessBegin(secondPartition, 0L, secondExpectedWatermark);

  List<WorkUnit> watermarkWorkunits = Lists.newArrayList();
  watermarker.onGetWorkunitsEnd(watermarkWorkunits);

  // Nothing was loaded from a previous run, so all previous watermarks are expected to be 0.
  Assert.assertEquals(watermarker.getPreviousHighWatermark(firstPartition).getValue(), 0L);
  Assert.assertEquals(watermarker.getPreviousHighWatermark(firstTable).getValue(), 0L);
  Assert.assertEquals(watermarker.getPreviousHighWatermark(secondPartition).getValue(), 0L);
  Assert.assertEquals(watermarker.getPreviousHighWatermark(secondTable).getValue(), 0L);

  Assert.assertEquals(watermarkWorkunits.size(), 2);
  for (WorkUnit workunit : watermarkWorkunits) {
    Assert.assertTrue(workunit.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY));
  }

  // Order the workunits by dataset URN so the index-based assertions below are deterministic.
  Comparator<WorkUnit> byDatasetUrn = new Comparator<WorkUnit>() {
    @Override
    public int compare(WorkUnit left, WorkUnit right) {
      String leftUrn = left.getProp(ConfigurationKeys.DATASET_URN_KEY);
      String rightUrn = right.getProp(ConfigurationKeys.DATASET_URN_KEY);
      return leftUrn.compareTo(rightUrn);
    }
  };
  Collections.sort(watermarkWorkunits, byDatasetUrn);

  WorkUnit firstWorkunit = watermarkWorkunits.get(0);
  WorkUnit secondWorkunit = watermarkWorkunits.get(1);

  Assert.assertEquals(firstWorkunit.getProp(ConfigurationKeys.DATASET_URN_KEY), firstTable.getCompleteName());
  Assert.assertEquals(secondWorkunit.getProp(ConfigurationKeys.DATASET_URN_KEY), secondTable.getCompleteName());

  Assert.assertEquals(
      firstWorkunit.getExpectedHighWatermark(MultiKeyValueLongWatermark.class).getWatermarks(),
      ImmutableMap.of(PartitionLevelWatermarker.partitionKey(firstPartition), firstExpectedWatermark));
  Assert.assertEquals(
      secondWorkunit.getExpectedHighWatermark(MultiKeyValueLongWatermark.class).getWatermarks(),
      ImmutableMap.of(PartitionLevelWatermarker.partitionKey(secondPartition), secondExpectedWatermark));
}
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class PartitionLevelWatermarkerTest method testRecentlyModifiedPartitionWatermarksWithPreviousState.
/**
 * Checks which partition watermarks end up in the watermark workunit when a previous state exists.
 *
 * <p>The previous state holds three partition watermarks for {@code testTable2} (2010, 2011, 2012)
 * and three further partitions (2009, 2013, 2014) are processed in this run. The least watermark
 * to persist is set to "five days ago". The test expects a single workunit whose expected high
 * watermark contains exactly the three entries whose values lie above that threshold: 2012 from
 * the previous state plus 2013 and 2014 from this run.
 */
@Test
public void testRecentlyModifiedPartitionWatermarksWithPreviousState() throws Exception {
  String dbName = "testRecentlyModifiedPartitionWatermarksWithPreviousState";
  LocalHiveMetastoreTestUtils.getInstance().dropDatabaseIfExists(dbName);

  long time5DaysAgo = new DateTime().minusDays(5).getMillis();

  // Watermarks carried over from the previous run.
  long previous2010 = time5DaysAgo - 100L; // below the threshold: expected to be dropped
  long previous2011 = time5DaysAgo - 101L; // below the threshold: expected to be dropped
  long previous2012 = time5DaysAgo + 102L; // above the threshold: expected to be retained

  // Values reported for the partitions processed in this run.
  long current2009 = time5DaysAgo - 99L;  // below the threshold: expected to be dropped
  long current2013 = time5DaysAgo + 103L; // above the threshold: expected to be retained
  long current2014 = time5DaysAgo + 104L; // above the threshold: expected to be retained

  WorkUnitState previousWus = new WorkUnitState();
  previousWus.setProp(ConfigurationKeys.DATASET_URN_KEY, dbName + "@testTable2");
  previousWus.setProp(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY, true);
  previousWus.setActualHighWatermark(new MultiKeyValueLongWatermark(
      ImmutableMap.of(
          "2010", previous2010,
          "2011", previous2011,
          "2012", previous2012)));

  SourceState state = new SourceState(new State(), Lists.newArrayList(previousWus));
  state.setProp(HiveSource.HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY, 3);

  PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);
  // NOTE(review): presumably replaces the threshold derived from the lookback setting above;
  // confirm against PartitionLevelWatermarker.
  watermarker.setLeastWatermarkToPersistInState(time5DaysAgo);

  Table table = localTestTable(dbName, "testTable2", true);
  Partition part2009 = localTestPartition(table, ImmutableList.of("2009"));
  Partition part2013 = localTestPartition(table, ImmutableList.of("2013"));
  Partition part2014 = localTestPartition(table, ImmutableList.of("2014"));

  watermarker.onTableProcessBegin(table, 0L);
  watermarker.onPartitionProcessBegin(part2009, 0L, current2009);
  watermarker.onPartitionProcessBegin(part2013, 0L, current2013);
  watermarker.onPartitionProcessBegin(part2014, 0L, current2014);

  List<WorkUnit> collected = Lists.newArrayList();
  watermarker.onGetWorkunitsEnd(collected);

  // Only one table was processed, so a single watermark workunit is expected.
  Assert.assertEquals(collected.size(), 1);

  WorkUnit watermarkWu = collected.get(0);
  Map<String, Long> actualWatermarks =
      watermarkWu.getExpectedHighWatermark(MultiKeyValueLongWatermark.class).getWatermarks();
  Assert.assertEquals(actualWatermarks.size(), 3, "expectedHighWatermarks size");

  Map<String, Long> expectedWatermarks = ImmutableMap.of(
      "2014", current2014,
      "2013", current2013,
      "2012", previous2012);
  Assert.assertEquals(actualWatermarks, expectedWatermarks);
}
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class CopySourcePrioritizationTest method testPrioritization.
/**
 * Runs {@code CopySource} with {@code MyFinder} as the dataset profile and {@code MyPrioritizer}
 * as the prioritizer, capping the copy at 8 entities with a tolerance of 1.
 *
 * <p>Per the original note on this test, the prioritizer preferentially copies the lower file sets
 * of each dataset; accordingly the test expects exactly eight flattened workunits covering files
 * f1 and f2 of file sets fs0 and fs1 in datasets d0 and d1.
 */
@Test
public void testPrioritization() throws Exception {
  String localFsUri = "file:///";
  String maxCopyPrefix = CopyConfiguration.MAX_COPY_PREFIX + ".";

  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, localFsUri);
  state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, localFsUri);
  state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
  state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, MyFinder.class.getName());
  state.setProp(CopyConfiguration.PRIORITIZER_ALIAS_KEY, MyPrioritizer.class.getName());
  state.setProp(maxCopyPrefix + CopyResourcePool.ENTITIES_KEY, 8);
  state.setProp(maxCopyPrefix + CopyResourcePool.TOLERANCE_KEY, 1);

  CopySource source = new CopySource();
  List<WorkUnit> flattened = JobLauncherUtils.flattenWorkUnits(source.getWorkunits(state));
  Assert.assertEquals(flattened.size(), 8);

  List<String> copiedPaths = extractPaths(flattened);
  String[] expectedPaths = {
      "d0.fs0.f1", "d0.fs0.f2",
      "d0.fs1.f1", "d0.fs1.f2",
      "d1.fs0.f1", "d1.fs0.f2",
      "d1.fs1.f1", "d1.fs1.f2"
  };
  for (String expectedPath : expectedPaths) {
    Assert.assertTrue(copiedPaths.contains(expectedPath));
  }
}
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class CopySourcePrioritizationTest method extractPaths.
/**
 * Collects the origin path of the copy entity serialized in each workunit.
 *
 * <p>Every workunit is deserialized via {@code CopySource.deserializeCopyEntity} and cast to
 * {@code CopyableFile}; a workunit carrying any other entity type makes the cast fail.
 *
 * @param workUnits workunits to inspect
 * @return the origin paths as strings, in the iteration order of {@code workUnits}
 */
private List<String> extractPaths(List<WorkUnit> workUnits) {
  List<String> originPaths = Lists.newArrayList();
  for (WorkUnit workUnit : workUnits) {
    CopyableFile copyableFile = (CopyableFile) CopySource.deserializeCopyEntity(workUnit);
    String originPath = copyableFile.getOrigin().getPath().toString();
    originPaths.add(originPath);
  }
  return originPaths;
}
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class CopySourceTest method testPartitionableDataset.
/**
 * Runs {@code CopySource} against the partitionable test dataset finder and checks how the
 * resulting workunits are grouped into extracts.
 *
 * <p>Each flattened workunit must hold a {@code CopyableFile} whose origin path starts with the
 * test origin prefix and whose destination owner/permission matches the test constant. Files whose
 * numeric name is below {@code TestCopyablePartitionableDataset.THRESHOLD} must all share one
 * extract, the remaining files must all share another, and both groups must be non-empty.
 */
@Test
public void testPartitionableDataset() throws Exception {
  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "file:///");
  state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, "file:///");
  state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
  state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY,
      TestCopyablePartitionableDatasedFinder.class.getCanonicalName());

  CopySource source = new CopySource();
  List<WorkUnit> flattened = JobLauncherUtils.flattenWorkUnits(source.getWorkunits(state));
  Assert.assertEquals(flattened.size(), TestCopyableDataset.FILE_COUNT);

  // The first extract seen on each side of the threshold becomes the reference for that side.
  Extract extractAbove = null;
  Extract extractBelow = null;

  for (WorkUnit current : flattened) {
    CopyableFile file = (CopyableFile) CopySource.deserializeCopyEntity(current);

    String originPath = file.getOrigin().getPath().toString();
    Assert.assertTrue(originPath.startsWith(TestCopyableDataset.ORIGIN_PREFIX));
    Assert.assertEquals(file.getDestinationOwnerAndPermission(), TestCopyableDataset.OWNER_AND_PERMISSION);

    // The file name is parsed as an integer to decide which side of the threshold it falls on.
    int fileNumber = Integer.parseInt(file.getOrigin().getPath().getName());
    if (fileNumber >= TestCopyablePartitionableDataset.THRESHOLD) {
      // At or above the threshold: must match the "above" reference extract.
      if (extractAbove == null) {
        extractAbove = current.getExtract();
      }
      Assert.assertEquals(current.getExtract(), extractAbove);
    } else {
      // Below the threshold: must match the "below" reference extract.
      if (extractBelow == null) {
        extractBelow = current.getExtract();
      }
      Assert.assertEquals(current.getExtract(), extractBelow);
    }
  }

  // Both partitions must have received at least one workunit.
  Assert.assertNotNull(extractAbove);
  Assert.assertNotNull(extractBelow);
}
Aggregations