
Example 6 with LongWatermark

Use of org.apache.gobblin.source.extractor.extract.LongWatermark in the incubator-gobblin project by Apache.

The class SequentialTestSource, method getExtractor:

@Override
public Extractor<String, Object> getExtractor(WorkUnitState state) throws IOException {
    Config config = ConfigFactory.parseProperties(state.getProperties());
    configureIfNeeded(config);
    final LongWatermark lowWatermark = state.getWorkunit().getLowWatermark(LongWatermark.class);
    final WorkUnitState workUnitState = state;
    final int index = state.getPropAsInt(WORK_UNIT_INDEX);
    final TestBatchExtractor extractor = new TestBatchExtractor(index, lowWatermark, numRecordsPerExtract, sleepTimePerRecord, workUnitState);
    if (!streaming) {
        return extractor;
    } else {
        return (Extractor) new TestStreamingExtractor(extractor);
    }
}
Also used: Config (com.typesafe.config.Config), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), StreamingExtractor (org.apache.gobblin.source.extractor.StreamingExtractor), Extractor (org.apache.gobblin.source.extractor.Extractor), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
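To see how the watermark round-trips, here is a minimal, hypothetical driver sketch (not from the Gobblin sources): the work unit carries the low watermark that getExtractor reads back through getLowWatermark(LongWatermark.class), and WORK_UNIT_INDEX stands for the same property key used above.

// Hypothetical sketch; the concrete watermark values are illustrative only.
WorkUnit workUnit = WorkUnit.createEmpty();
workUnit.setWatermarkInterval(new WatermarkInterval(new LongWatermark(0L), new LongWatermark(100L)));
// WORK_UNIT_INDEX is the source's own property key; 0 selects the first work unit.
workUnit.setProp(WORK_UNIT_INDEX, 0);
WorkUnitState workUnitState = new WorkUnitState(workUnit);
Extractor<String, Object> extractor = new SequentialTestSource().getExtractor(workUnitState);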

Example 7 with LongWatermark

Use of org.apache.gobblin.source.extractor.extract.LongWatermark in the incubator-gobblin project by Apache.

The class HiveSource, method createWorkunitForNonPartitionedTable:

protected void createWorkunitForNonPartitionedTable(HiveDataset hiveDataset) throws IOException {
    // Create workunits for tables
    try {
        long tableProcessTime = new DateTime().getMillis();
        long updateTime = this.updateProvider.getUpdateTime(hiveDataset.getTable());
        this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
        LongWatermark lowWatermark = this.watermarker.getPreviousHighWatermark(hiveDataset.getTable());
        if (!shouldCreateWorkUnit(hiveDataset.getTable().getPath())) {
            log.info(String.format("Not creating workunit for table %s as partition path %s contains data path tokens to ignore %s", hiveDataset.getTable().getCompleteName(), hiveDataset.getTable().getPath(), this.ignoreDataPathIdentifierList));
            return;
        }
        if (shouldCreateWorkunit(hiveDataset.getTable(), lowWatermark)) {
            log.info(String.format("Creating workunit for table %s as updateTime %s or createTime %s is greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
            HiveWorkUnit hiveWorkUnit = workUnitForTable(hiveDataset);
            LongWatermark expectedDatasetHighWatermark = this.watermarker.getExpectedHighWatermark(hiveDataset.getTable(), tableProcessTime);
            hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedDatasetHighWatermark));
            EventWorkunitUtils.setTableSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
            if (hiveDataset instanceof ConvertibleHiveDataset) {
                setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                log.info("Added lineage event for dataset " + hiveDataset.getUrn());
            }
            this.workunits.add(hiveWorkUnit);
            log.debug(String.format("Workunit added for table: %s", hiveWorkUnit));
        } else {
            log.info(String.format("Not creating workunit for table %s as updateTime %s and createTime %s is not greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
        }
    } catch (UpdateNotFoundException e) {
        log.error(String.format("Not Creating workunit for %s as update time was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
    } catch (SchemaNotFoundException e) {
        log.error(String.format("Not Creating workunit for %s as schema was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
    }
}
Also used: WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException), DateTime (org.joda.time.DateTime), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
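The heart of the method is the interval bookkeeping: the previous run's high watermark becomes the new run's low watermark, and the expected high watermark is derived from the processing timestamp. A minimal sketch of just that bookkeeping, with the watermarker calls of the real method replaced by hard-coded values:

// Sketch only; values that the real method obtains from the watermarker are assumed here.
long tableProcessTime = new DateTime().getMillis();
LongWatermark lowWatermark = new LongWatermark(0L); // previous run's high watermark (assumed)
LongWatermark expectedHighWatermark = new LongWatermark(tableProcessTime); // assumed: expected high = process time
WorkUnit workUnit = WorkUnit.createEmpty();
workUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedHighWatermark));
// Downstream, the low end of the interval can be recovered from the work unit:
LongWatermark recoveredLow = workUnit.getLowWatermark(LongWatermark.class);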

Example 8 with LongWatermark

Use of org.apache.gobblin.source.extractor.extract.LongWatermark in the incubator-gobblin project by Apache.

The class PartitionedWriterTest, method testWatermarkComputation:

public void testWatermarkComputation(Long committed, Long unacknowledged, Long expected) throws IOException {
    State state = new State();
    state.setProp(ConfigurationKeys.WRITER_PARTITIONER_CLASS, TestPartitioner.class.getCanonicalName());
    String defaultSource = "default";
    WatermarkAwareWriter mockDataWriter = mock(WatermarkAwareWriter.class);
    when(mockDataWriter.isWatermarkCapable()).thenReturn(true);
    when(mockDataWriter.getCommittableWatermark()).thenReturn(Collections.singletonMap(defaultSource, new DefaultCheckpointableWatermark(defaultSource, new LongWatermark(committed))));
    when(mockDataWriter.getUnacknowledgedWatermark()).thenReturn(Collections.singletonMap(defaultSource, new DefaultCheckpointableWatermark(defaultSource, new LongWatermark(unacknowledged))));
    PartitionAwareDataWriterBuilder builder = mock(PartitionAwareDataWriterBuilder.class);
    when(builder.validatePartitionSchema(any(Schema.class))).thenReturn(true);
    when(builder.forPartition(any(GenericRecord.class))).thenReturn(builder);
    when(builder.withWriterId(any(String.class))).thenReturn(builder);
    when(builder.build()).thenReturn(mockDataWriter);
    PartitionedDataWriter writer = new PartitionedDataWriter<String, String>(builder, state);
    RecordEnvelope<String> recordEnvelope = new RecordEnvelope<String>("0");
    recordEnvelope.addCallBack(new AcknowledgableWatermark(new DefaultCheckpointableWatermark(defaultSource, new LongWatermark(0))));
    writer.writeEnvelope(recordEnvelope);
    Map<String, CheckpointableWatermark> watermark = writer.getCommittableWatermark();
    System.out.println(watermark.toString());
    if (expected == null) {
        Assert.assertTrue(watermark.isEmpty(), "Expected watermark to be absent");
    } else {
        Assert.assertTrue(watermark.size() == 1);
        Assert.assertEquals((long) expected, ((LongWatermark) watermark.values().iterator().next().getWatermark()).getValue());
    }
}
Also used: RecordEnvelope (org.apache.gobblin.stream.RecordEnvelope), TestPartitioner (org.apache.gobblin.writer.test.TestPartitioner), Schema (org.apache.avro.Schema), DefaultCheckpointableWatermark (org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark), State (org.apache.gobblin.configuration.State), GenericRecord (org.apache.avro.generic.GenericRecord), CheckpointableWatermark (org.apache.gobblin.source.extractor.CheckpointableWatermark), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
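The assertion at the end unwraps the committable watermark through the same types the mock produces. A small standalone sketch of that chain, using only classes already listed above:

// A LongWatermark wraps a long offset; DefaultCheckpointableWatermark pairs it with its source name.
LongWatermark committedMark = new LongWatermark(42L);
CheckpointableWatermark checkpoint = new DefaultCheckpointableWatermark("default", committedMark);
// The same cast chain the test's assertion uses to get the numeric value back out:
long value = ((LongWatermark) checkpoint.getWatermark()).getValue(); // 42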

Example 9 with LongWatermark

Use of org.apache.gobblin.source.extractor.extract.LongWatermark in the incubator-gobblin project by Apache.

The class EventWorkunitUtils, method setIsFirstPublishMetadata:

/**
 * Sets metadata to indicate whether this is the first time this table or partition is being published.
 * @param wus the work unit state on which to record whether this is the first publish for the table or partition
 */
public static void setIsFirstPublishMetadata(WorkUnitState wus) {
    if (!wus.getPropAsBoolean(IS_WATERMARK_WORKUNIT_KEY)) {
        LongWatermark previousWatermark = wus.getWorkunit().getLowWatermark(LongWatermark.class);
        wus.setProp(SlaEventKeys.IS_FIRST_PUBLISH, (null == previousWatermark || previousWatermark.getValue() == 0));
    }
}
Also used: LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
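A short hypothetical walk-through of the branch above: when the work unit carries no low watermark, getLowWatermark returns null and the method records the publish as the first one.

// Hypothetical sketch; assumes an unset IS_WATERMARK_WORKUNIT_KEY property reads back
// as false, so the guard in setIsFirstPublishMetadata is entered.
WorkUnit workUnit = WorkUnit.createEmpty(); // no watermark interval set
WorkUnitState wus = new WorkUnitState(workUnit);
EventWorkunitUtils.setIsFirstPublishMetadata(wus);
// previousWatermark was null, so SlaEventKeys.IS_FIRST_PUBLISH should now read true.
boolean firstPublish = wus.getPropAsBoolean(SlaEventKeys.IS_FIRST_PUBLISH);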

Example 10 with LongWatermark

Use of org.apache.gobblin.source.extractor.extract.LongWatermark in the incubator-gobblin project by Apache.

The class WikipediaSource, method getWorkunits:

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
    List<String> titles = new LinkedList<>(Splitter.on(",").omitEmptyStrings().splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));
    Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
    for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
        Iterable<LongWatermark> watermarks = Iterables.transform(entry.getValue(), new Function<WorkUnitState, LongWatermark>() {

            @Override
            public LongWatermark apply(WorkUnitState wus) {
                return wus.getActualHighWatermark(LongWatermark.class);
            }
        });
        watermarks = Iterables.filter(watermarks, Predicates.notNull());
        List<LongWatermark> watermarkList = Lists.newArrayList(watermarks);
        if (watermarkList.size() > 0) {
            prevHighWatermarks.put(entry.getKey(), Collections.max(watermarkList));
        }
    }
    Extract extract = createExtract(TableType.SNAPSHOT_ONLY, state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");
    List<WorkUnit> workUnits = Lists.newArrayList();
    for (String title : titles) {
        LongWatermark prevWatermark = prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) : new LongWatermark(-1);
        prevHighWatermarks.remove(title);
        WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
        workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
        workUnits.add(workUnit);
    }
    for (Map.Entry<String, LongWatermark> nonProcessedDataset : prevHighWatermarks.entrySet()) {
        WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
        workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
        workUnits.add(workUnit);
    }
    return workUnits;
}
Also used: WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), Extract (org.apache.gobblin.source.workunit.Extract), LinkedList (java.util.LinkedList), WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Map (java.util.Map), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
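The roll-up of previous high watermarks works because LongWatermark is comparable, which is what the Collections.max call in the loop relies on. A minimal standalone sketch:

// Keep only the highest actual high watermark seen across previous run states.
List<LongWatermark> previousRuns = Lists.newArrayList(
    new LongWatermark(3L), new LongWatermark(7L), new LongWatermark(5L));
LongWatermark highest = Collections.max(previousRuns); // LongWatermark with value 7
// A dataset with no previous state falls back to a sentinel, as in the loop above:
LongWatermark fallback = new LongWatermark(-1L);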

Aggregations

LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 35 usages
Test (org.testng.annotations.Test): 16 usages
DefaultCheckpointableWatermark (org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark): 12 usages
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 10 usages
CheckpointableWatermark (org.apache.gobblin.source.extractor.CheckpointableWatermark): 9 usages
SourceState (org.apache.gobblin.configuration.SourceState): 7 usages
State (org.apache.gobblin.configuration.State): 7 usages
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 6 usages
IOException (java.io.IOException): 5 usages
RecordEnvelope (org.apache.gobblin.stream.RecordEnvelope): 5 usages
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 4 usages
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 4 usages
Random (java.util.Random): 3 usages
TreeSet (java.util.TreeSet): 3 usages
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 3 usages
ComparableWatermark (org.apache.gobblin.source.extractor.ComparableWatermark): 3 usages
Path (org.apache.hadoop.fs.Path): 3 usages
Benchmark (org.openjdk.jmh.annotations.Benchmark): 3 usages
Group (org.openjdk.jmh.annotations.Group): 3 usages
Config (com.typesafe.config.Config): 2 usages