use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.
the class CopyRouteGeneratorTest method testCopyRouteGenerator.
/**
 * Builds a data flow path with three candidate copy-from end points for the same target
 * (an unavailable replica without a watermark, an available replica, and the source) and
 * checks which one each route generator returns for a pull: the network-bandwidth generator
 * is expected to return {@code replica1}, the latency generator the source.
 */
@Test
public void testCopyRouteGenerator() throws Exception {
  // Oct 1, 2016
  final long replicaTimestamp = 1475304606000L;
  // Oct 4, 2016
  final long sourceTimestamp = 1475604606000L;

  // Replica whose file system is reported as unavailable and which exposes no watermark.
  Optional<ComparableWatermark> noWatermark = Optional.absent();
  ReplicaHadoopFsEndPoint notAvailableReplica = Mockito.mock(ReplicaHadoopFsEndPoint.class);
  Mockito.when(notAvailableReplica.isFileSystemAvailable()).thenReturn(false);
  Mockito.when(notAvailableReplica.getWatermark()).thenReturn(noWatermark);

  // Available replica carrying the older watermark.
  Optional<ComparableWatermark> replicaMark =
      Optional.<ComparableWatermark>of(new LongWatermark(replicaTimestamp));
  ReplicaHadoopFsEndPoint replica1 = Mockito.mock(ReplicaHadoopFsEndPoint.class);
  Mockito.when(replica1.isFileSystemAvailable()).thenReturn(true);
  Mockito.when(replica1.getWatermark()).thenReturn(replicaMark);

  // Available source carrying the newer watermark.
  Optional<ComparableWatermark> sourceMark =
      Optional.<ComparableWatermark>of(new LongWatermark(sourceTimestamp));
  SourceHadoopFsEndPoint source = Mockito.mock(SourceHadoopFsEndPoint.class);
  Mockito.when(source.isFileSystemAvailable()).thenReturn(true);
  Mockito.when(source.getWatermark()).thenReturn(sourceMark);

  // Target end point that every candidate route copies to.
  ReplicaHadoopFsEndPoint copyToEndPoint = Mockito.mock(ReplicaHadoopFsEndPoint.class);
  Mockito.when(copyToEndPoint.isFileSystemAvailable()).thenReturn(true);

  CopyRoute fromUnavailable = new CopyRoute(notAvailableReplica, copyToEndPoint);
  CopyRoute fromReplica = new CopyRoute(replica1, copyToEndPoint);
  CopyRoute fromSource = new CopyRoute(source, copyToEndPoint);

  DataFlowTopology topology = new DataFlowTopology();
  topology.addDataFlowPath(
      new DataFlowTopology.DataFlowPath(ImmutableList.<CopyRoute>of(fromUnavailable, fromReplica, fromSource)));

  ReplicationConfiguration rc = Mockito.mock(ReplicationConfiguration.class);
  Mockito.when(rc.getCopyMode()).thenReturn(ReplicationCopyMode.PULL);
  Mockito.when(rc.getSource()).thenReturn(source);
  Mockito.when(rc.getReplicas())
      .thenReturn(ImmutableList.<EndPoint>of(notAvailableReplica, replica1, copyToEndPoint));
  Mockito.when(rc.getDataFlowToplogy()).thenReturn(topology);

  // Network-bandwidth generator: expected to pull from replica1.
  CopyRouteGeneratorOptimizedNetworkBandwidthForTest bandwidthGenerator =
      new CopyRouteGeneratorOptimizedNetworkBandwidthForTest();
  Assert.assertTrue(
      bandwidthGenerator.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().equals(replica1));
  Assert.assertTrue(
      bandwidthGenerator.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().getWatermark().get()
          .compareTo(new LongWatermark(replicaTimestamp)) == 0);

  // Latency generator: expected to pull from the source.
  CopyRouteGeneratorOptimizedLatency latencyGenerator = new CopyRouteGeneratorOptimizedLatency();
  Assert.assertTrue(
      latencyGenerator.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().equals(source));
  Assert.assertTrue(
      latencyGenerator.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().getWatermark().get()
          .compareTo(new LongWatermark(sourceTimestamp)) == 0);
}
use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.
the class StateStoreWatermarkStorageTest method testPersistWatermarkStateToZk.
/**
 * Commits one watermark for the source named "source" through a
 * {@link StateStoreBasedWatermarkStorage} configured with storage type "zk", then reads the
 * committed watermarks back and checks that exactly one is returned with the original value.
 */
@Test
public void testPersistWatermarkStateToZk() throws IOException {
  final String sourceName = "source";
  CheckpointableWatermark committed =
      new DefaultCheckpointableWatermark(sourceName, new LongWatermark(startTime));

  TaskState state = new TaskState();
  state.setJobId(TEST_JOB_ID);
  state.setProp(ConfigurationKeys.JOB_NAME_KEY, "JobName-" + startTime);

  // watermark storage configuration
  String zkConnectKey = StateStoreBasedWatermarkStorage.WATERMARK_STORAGE_CONFIG_PREFIX
      + ZkStateStoreConfigurationKeys.STATE_STORE_ZK_CONNECT_STRING_KEY;
  state.setProp(StateStoreBasedWatermarkStorage.WATERMARK_STORAGE_TYPE_KEY, "zk");
  state.setProp(zkConnectKey, testingServer.getConnectString());

  StateStoreBasedWatermarkStorage storage = new StateStoreBasedWatermarkStorage(state);
  storage.commitWatermarks(ImmutableList.of(committed));

  Map<String, CheckpointableWatermark> readBack =
      storage.getCommittedWatermarks(DefaultCheckpointableWatermark.class, ImmutableList.of(sourceName));
  Assert.assertEquals(readBack.size(), 1);

  LongWatermark restored = (LongWatermark) readBack.get(sourceName).getWatermark();
  Assert.assertEquals(restored.getValue(), startTime);
}
use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.
the class GoogleWebmasterExtractor method close.
/**
 * Records the expected high watermark as the actual high watermark only when the current
 * iterator index has reached the number of iterators; otherwise logs an error and leaves the
 * work unit state untouched.
 */
@Override
public void close() throws IOException {
  // Guard: the iterator index has not reached the end, so do not advance the watermark.
  if (_current != _iterators.size()) {
    log.error(String.format("Had problems fetching data from Google Search Console from %s to %s.",
        dateFormatter.print(_startDate), dateFormatter.print(_expectedHighWaterMarkDate)));
    return;
  }

  log.info(String.format("Successfully finished fetching data from Google Search Console from %s to %s.",
      dateFormatter.print(_startDate), dateFormatter.print(_expectedHighWaterMarkDate)));
  _wuState.setActualHighWatermark(new LongWatermark(_expectedHighWaterMark));
}
use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.
the class GoogleAnalyticsUnsampledExtractor method close.
/**
 * Stores {@code nextWatermark} as the actual high watermark of the work unit state and then
 * closes {@code closer}.
 *
 * <p>The closer is closed in a {@code finally} block so that it is still closed if logging or
 * updating the watermark throws.
 *
 * @throws IOException if closing {@code closer} fails
 */
@Override
public void close() throws IOException {
  try {
    LOG.info("Updating the current state high water mark with " + nextWatermark);
    this.wuState.setActualHighWatermark(new LongWatermark(nextWatermark));
  } finally {
    // Always close, even when the watermark update above fails.
    closer.close();
  }
}
use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.
the class HiveSource method createWorkunitsForPartitionedTable.
/**
 * Creates work units for the partitions of a partitioned Hive table and adds them to
 * {@code workunits}.
 *
 * <p>For each partition returned by {@link HiveUtils#getPartitions}, a work unit is created only
 * when {@code isOlderThanLookback} is false, {@code shouldCreateWorkUnit(Path)} accepts the
 * partition location, and {@code shouldCreateWorkunit(partition, lowWatermark)} is true. Lineage
 * information is set on at most one work unit per call, and only for a
 * {@link ConvertibleHiveDataset}.
 *
 * <p>{@link UpdateNotFoundException}, {@link SchemaNotFoundException} and
 * {@link UncheckedExecutionException} raised while handling a partition are logged together with
 * the exception itself (so the stack trace and cause chain are kept), and processing continues
 * with the next partition.
 *
 * @param hiveDataset the dataset whose table partitions are examined
 * @param client holder of the metastore client used to list partitions
 * @throws IOException declared by this method; propagated from the calls made here
 */
protected void createWorkunitsForPartitionedTable(HiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
  boolean setLineageInfo = false;
  long tableProcessTime = new DateTime().getMillis();
  this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);

  Optional<String> partitionFilter = Optional.absent();
  // If the table is date partitioned, use the partition name to filter partitions older than lookback
  if (hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN)
      && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT)
      && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
    partitionFilter = Optional.of(new LookbackPartitionFilterGenerator(hiveDataset.getProperties()).getFilter(hiveDataset));
    log.info(String.format("Getting partitions for %s using partition filter %s", hiveDataset.getTable().getCompleteName(), partitionFilter.get()));
  }

  List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), partitionFilter);
  for (Partition sourcePartition : sourcePartitions) {
    if (isOlderThanLookback(sourcePartition)) {
      continue;
    }
    // The previous high watermark of the partition becomes the low watermark of the new work unit.
    LongWatermark lowWatermark = watermarker.getPreviousHighWatermark(sourcePartition);
    try {
      if (!shouldCreateWorkUnit(new Path(sourcePartition.getLocation()))) {
        log.info(String.format("Not creating workunit for partition %s as partition path %s contains data path tokens to ignore %s", sourcePartition.getCompleteName(), sourcePartition.getLocation(), this.ignoreDataPathIdentifierList));
        continue;
      }
      long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
      if (shouldCreateWorkunit(sourcePartition, lowWatermark)) {
        log.debug(String.format("Processing partition: %s", sourcePartition));
        long partitionProcessTime = new DateTime().getMillis();
        this.watermarker.onPartitionProcessBegin(sourcePartition, partitionProcessTime, updateTime);
        LongWatermark expectedPartitionHighWatermark = this.watermarker.getExpectedHighWatermark(sourcePartition, tableProcessTime, partitionProcessTime);
        HiveWorkUnit hiveWorkUnit = workUnitForPartition(hiveDataset, sourcePartition);
        hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
        EventWorkunitUtils.setPartitionSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), sourcePartition, updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
        if (hiveDataset instanceof ConvertibleHiveDataset && !setLineageInfo) {
          setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
          log.info("Added lineage event for dataset " + hiveDataset.getUrn());
          // Add lineage information only once per hive table
          setLineageInfo = true;
        }
        workunits.add(hiveWorkUnit);
        log.info(String.format("Creating workunit for partition %s as updateTime %s is greater than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
      } else {
        // If watermark tracking at a partition level is necessary, create a dummy workunit for this partition here.
        log.info(String.format("Not creating workunit for partition %s as updateTime %s is lesser than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
      }
    } catch (UpdateNotFoundException e) {
      // Pass the exception as the throwable argument so the stack trace is not lost.
      log.error(String.format("Not creating workunit for %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()), e);
    } catch (SchemaNotFoundException e) {
      log.error(String.format("Not creating workunit for %s as schema was not found. %s", sourcePartition.getCompleteName(), e.getMessage()), e);
    } catch (UncheckedExecutionException e) {
      // This exception wraps the real failure; logging it keeps the cause chain visible.
      log.error(String.format("Not creating workunit for %s because an unchecked exception occurred. %s", sourcePartition.getCompleteName(), e.getMessage()), e);
    }
  }
}
Aggregations