
Example 16 with LongWatermark

use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.

the class CopyRouteGeneratorTest method testCopyRouteGenerator.

@Test
public void testCopyRouteGenerator() throws Exception {
    // Oct 1, 2016
    long replica1Watermark = 1475304606000L;
    // Oct 4, 2016
    long sourceWatermark = 1475604606000L;
    ReplicaHadoopFsEndPoint notAvailableReplica = Mockito.mock(ReplicaHadoopFsEndPoint.class);
    Mockito.when(notAvailableReplica.isFileSystemAvailable()).thenReturn(false);
    Optional<ComparableWatermark> tmp = Optional.absent();
    Mockito.when(notAvailableReplica.getWatermark()).thenReturn(tmp);
    ReplicaHadoopFsEndPoint replica1 = Mockito.mock(ReplicaHadoopFsEndPoint.class);
    Mockito.when(replica1.isFileSystemAvailable()).thenReturn(true);
    ComparableWatermark cw = new LongWatermark(replica1Watermark);
    tmp = Optional.of(cw);
    Mockito.when(replica1.getWatermark()).thenReturn(tmp);
    SourceHadoopFsEndPoint source = Mockito.mock(SourceHadoopFsEndPoint.class);
    Mockito.when(source.isFileSystemAvailable()).thenReturn(true);
    cw = new LongWatermark(sourceWatermark);
    tmp = Optional.of(cw);
    Mockito.when(source.getWatermark()).thenReturn(tmp);
    ReplicaHadoopFsEndPoint copyToEndPoint = Mockito.mock(ReplicaHadoopFsEndPoint.class);
    Mockito.when(copyToEndPoint.isFileSystemAvailable()).thenReturn(true);
    CopyRoute cp1 = new CopyRoute(notAvailableReplica, copyToEndPoint);
    CopyRoute cp2 = new CopyRoute(replica1, copyToEndPoint);
    CopyRoute cp3 = new CopyRoute(source, copyToEndPoint);
    DataFlowTopology.DataFlowPath dataFlowPath = new DataFlowTopology.DataFlowPath(ImmutableList.<CopyRoute>of(cp1, cp2, cp3));
    DataFlowTopology dataFlowTopology = new DataFlowTopology();
    dataFlowTopology.addDataFlowPath(dataFlowPath);
    ReplicationConfiguration rc = Mockito.mock(ReplicationConfiguration.class);
    Mockito.when(rc.getCopyMode()).thenReturn(ReplicationCopyMode.PULL);
    Mockito.when(rc.getSource()).thenReturn(source);
    Mockito.when(rc.getReplicas()).thenReturn(ImmutableList.<EndPoint>of(notAvailableReplica, replica1, copyToEndPoint));
    Mockito.when(rc.getDataFlowToplogy()).thenReturn(dataFlowTopology);
    CopyRouteGeneratorOptimizedNetworkBandwidthForTest network = new CopyRouteGeneratorOptimizedNetworkBandwidthForTest();
    Assert.assertTrue(network.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().equals(replica1));
    Assert.assertTrue(network.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().getWatermark().get().compareTo(new LongWatermark(replica1Watermark)) == 0);
    CopyRouteGeneratorOptimizedLatency latency = new CopyRouteGeneratorOptimizedLatency();
    Assert.assertTrue(latency.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().equals(source));
    Assert.assertTrue(latency.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().getWatermark().get().compareTo(new LongWatermark(sourceWatermark)) == 0);
}
Also used : ComparableWatermark(org.apache.gobblin.source.extractor.ComparableWatermark) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)
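
LongWatermark itself is a thin wrapper around a single long that implements ComparableWatermark, which is what lets the route generators above compare replica and source watermarks directly. A minimal, standalone sketch of that comparison (the class name is made up; the values are the same epoch-millis timestamps used in the test):

import org.apache.gobblin.source.extractor.extract.LongWatermark;

public class LongWatermarkComparisonSketch {
    public static void main(String[] args) {
        // Oct 1, 2016 and Oct 4, 2016 in epoch millis, as in the test above
        LongWatermark replicaWatermark = new LongWatermark(1475304606000L);
        LongWatermark sourceWatermark = new LongWatermark(1475604606000L);
        // compareTo orders watermarks by their underlying long value,
        // which is what a route generator relies on when picking a copy-from endpoint
        System.out.println("replica is behind source: " + (replicaWatermark.compareTo(sourceWatermark) < 0));
        // getValue() exposes the raw long, e.g. for logging or SLA metadata
        System.out.println("replica=" + replicaWatermark.getValue() + ", source=" + sourceWatermark.getValue());
    }
}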

Example 17 with LongWatermark

use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.

the class StateStoreWatermarkStorageTest method testPersistWatermarkStateToZk.

@Test
public void testPersistWatermarkStateToZk() throws IOException {
    CheckpointableWatermark watermark = new DefaultCheckpointableWatermark("source", new LongWatermark(startTime));
    TaskState taskState = new TaskState();
    taskState.setJobId(TEST_JOB_ID);
    taskState.setProp(ConfigurationKeys.JOB_NAME_KEY, "JobName-" + startTime);
    // watermark storage configuration
    taskState.setProp(StateStoreBasedWatermarkStorage.WATERMARK_STORAGE_TYPE_KEY, "zk");
    taskState.setProp(StateStoreBasedWatermarkStorage.WATERMARK_STORAGE_CONFIG_PREFIX + ZkStateStoreConfigurationKeys.STATE_STORE_ZK_CONNECT_STRING_KEY, testingServer.getConnectString());
    StateStoreBasedWatermarkStorage watermarkStorage = new StateStoreBasedWatermarkStorage(taskState);
    watermarkStorage.commitWatermarks(ImmutableList.of(watermark));
    Map<String, CheckpointableWatermark> watermarkMap = watermarkStorage.getCommittedWatermarks(DefaultCheckpointableWatermark.class, ImmutableList.of("source"));
    Assert.assertEquals(watermarkMap.size(), 1);
    Assert.assertEquals(((LongWatermark) watermarkMap.get("source").getWatermark()).getValue(), startTime);
}
Also used : DefaultCheckpointableWatermark(org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark) CheckpointableWatermark(org.apache.gobblin.source.extractor.CheckpointableWatermark) DefaultCheckpointableWatermark(org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)
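
The round trip above hinges on a CheckpointableWatermark pairing a logical source name with a ComparableWatermark, and on the committed map being keyed by that name. A minimal sketch of that pairing, with no state store involved (the class name is illustrative):

import org.apache.gobblin.source.extractor.CheckpointableWatermark;
import org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark;
import org.apache.gobblin.source.extractor.extract.LongWatermark;

public class CheckpointableWatermarkSketch {
    public static void main(String[] args) {
        long startTime = System.currentTimeMillis();
        // A CheckpointableWatermark pairs a logical source name with a ComparableWatermark
        CheckpointableWatermark watermark =
                new DefaultCheckpointableWatermark("source", new LongWatermark(startTime));
        // The stored LongWatermark is recovered by casting, exactly as the test's final assertion does
        long committedValue = ((LongWatermark) watermark.getWatermark()).getValue();
        System.out.println("committed value for 'source': " + committedValue);
    }
}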

Example 18 with LongWatermark

use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.

the class GoogleWebmasterExtractor method close.

@Override
public void close() throws IOException {
    if (_current == _iterators.size()) {
        log.info(String.format("Successfully finished fetching data from Google Search Console from %s to %s.", dateFormatter.print(_startDate), dateFormatter.print(_expectedHighWaterMarkDate)));
        _wuState.setActualHighWatermark(new LongWatermark(_expectedHighWaterMark));
    } else {
        log.error(String.format("Had problems fetching data from Google Search Console from %s to %s.", dateFormatter.print(_startDate), dateFormatter.print(_expectedHighWaterMarkDate)));
    }
}
Also used : LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)
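
This close() method, and the one in the next example, follow the same pattern: the extractor records how far it actually got by writing a LongWatermark back to its WorkUnitState. A hedged sketch of that pattern, with a made-up extractor class and offset field:

import java.io.IOException;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.extractor.extract.LongWatermark;

// Illustrative only: a trimmed-down extractor that checkpoints its progress on close()
public class WatermarkingExtractorSketch {

    private final WorkUnitState workUnitState;
    // Hypothetical counter: the highest offset (or timestamp) successfully extracted so far
    private long highestExtractedOffset;

    public WatermarkingExtractorSketch(WorkUnitState workUnitState) {
        this.workUnitState = workUnitState;
    }

    public void close() throws IOException {
        // Persist the actual progress so the next run can use it as its low watermark
        this.workUnitState.setActualHighWatermark(new LongWatermark(this.highestExtractedOffset));
    }
}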

Example 19 with LongWatermark

use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.

the class GoogleAnalyticsUnsampledExtractor method close.

@Override
public void close() throws IOException {
    LOG.info("Updating the current state high water mark with " + nextWatermark);
    this.wuState.setActualHighWatermark(new LongWatermark(nextWatermark));
    closer.close();
}
Also used : LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)

Example 20 with LongWatermark

use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.

the class HiveSource method createWorkunitsForPartitionedTable.

protected void createWorkunitsForPartitionedTable(HiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
    boolean setLineageInfo = false;
    long tableProcessTime = new DateTime().getMillis();
    this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
    Optional<String> partitionFilter = Optional.absent();
    // If the table is date partitioned, use the partition name to filter partitions older than lookback
    if (hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
        partitionFilter = Optional.of(new LookbackPartitionFilterGenerator(hiveDataset.getProperties()).getFilter(hiveDataset));
        log.info(String.format("Getting partitions for %s using partition filter %s", hiveDataset.getTable().getCompleteName(), partitionFilter.get()));
    }
    List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), partitionFilter);
    for (Partition sourcePartition : sourcePartitions) {
        if (isOlderThanLookback(sourcePartition)) {
            continue;
        }
        LongWatermark lowWatermark = watermarker.getPreviousHighWatermark(sourcePartition);
        try {
            if (!shouldCreateWorkUnit(new Path(sourcePartition.getLocation()))) {
                log.info(String.format("Not creating workunit for partition %s as partition path %s contains data path tokens to ignore %s", sourcePartition.getCompleteName(), sourcePartition.getLocation(), this.ignoreDataPathIdentifierList));
                continue;
            }
            long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
            if (shouldCreateWorkunit(sourcePartition, lowWatermark)) {
                log.debug(String.format("Processing partition: %s", sourcePartition));
                long partitionProcessTime = new DateTime().getMillis();
                this.watermarker.onPartitionProcessBegin(sourcePartition, partitionProcessTime, updateTime);
                LongWatermark expectedPartitionHighWatermark = this.watermarker.getExpectedHighWatermark(sourcePartition, tableProcessTime, partitionProcessTime);
                HiveWorkUnit hiveWorkUnit = workUnitForPartition(hiveDataset, sourcePartition);
                hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
                EventWorkunitUtils.setPartitionSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), sourcePartition, updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
                if (hiveDataset instanceof ConvertibleHiveDataset && !setLineageInfo) {
                    setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                    log.info("Added lineage event for dataset " + hiveDataset.getUrn());
                    // Add lineage information only once per hive table
                    setLineageInfo = true;
                }
                workunits.add(hiveWorkUnit);
                log.info(String.format("Creating workunit for partition %s as updateTime %s is greater than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            } else {
                // If watermark tracking at a partition level is necessary, create a dummy workunit for this partition here.
                log.info(String.format("Not creating workunit for partition %s as updateTime %s is lesser than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            }
        } catch (UpdateNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (SchemaNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as schema was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (UncheckedExecutionException e) {
            log.error(String.format("Not creating workunit for %s because an unchecked exception occurred. %s", sourcePartition.getCompleteName(), e.getMessage()));
        }
    }
}
Also used : LookbackPartitionFilterGenerator(org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator) Path(org.apache.hadoop.fs.Path) UpdateNotFoundException(org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException) Partition(org.apache.hadoop.hive.ql.metadata.Partition) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ConvertibleHiveDataset(org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset) DateTime(org.joda.time.DateTime) WatermarkInterval(org.apache.gobblin.source.extractor.WatermarkInterval) SchemaNotFoundException(org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)
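
Each partition workunit above is bracketed by a WatermarkInterval, a (low, expected-high) pair of watermarks that tells the task where to resume and how far it is expected to get. A minimal sketch of building such an interval from two LongWatermarks (class name and values are illustrative):

import org.apache.gobblin.source.extractor.WatermarkInterval;
import org.apache.gobblin.source.extractor.extract.LongWatermark;

public class WatermarkIntervalSketch {
    public static void main(String[] args) {
        // Low watermark: where the previous run for this partition left off (arbitrary epoch millis)
        LongWatermark lowWatermark = new LongWatermark(1475304606000L);
        // Expected high watermark: how far the new workunit is expected to get
        LongWatermark expectedHighWatermark = new LongWatermark(1475604606000L);
        WatermarkInterval interval = new WatermarkInterval(lowWatermark, expectedHighWatermark);
        // In HiveSource this interval is attached to the workunit via
        // hiveWorkUnit.setWatermarkInterval(interval)
        System.out.println(interval);
    }
}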

Aggregations

LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark) 35
Test (org.testng.annotations.Test) 16
DefaultCheckpointableWatermark (org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark) 12
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState) 10
CheckpointableWatermark (org.apache.gobblin.source.extractor.CheckpointableWatermark) 9
SourceState (org.apache.gobblin.configuration.SourceState) 7
State (org.apache.gobblin.configuration.State) 7
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval) 6
IOException (java.io.IOException) 5
RecordEnvelope (org.apache.gobblin.stream.RecordEnvelope) 5
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit) 4
Partition (org.apache.hadoop.hive.ql.metadata.Partition) 4
Random (java.util.Random) 3
TreeSet (java.util.TreeSet) 3
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) 3
ComparableWatermark (org.apache.gobblin.source.extractor.ComparableWatermark) 3
Path (org.apache.hadoop.fs.Path) 3
Benchmark (org.openjdk.jmh.annotations.Benchmark) 3
Group (org.openjdk.jmh.annotations.Group) 3
Config (com.typesafe.config.Config) 2