Search in sources:

Example 6 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionerTest method testGetHighWatermarkOnAppendExtract.

/**
 * Test getHighWatermark. Extract type: Append.
 *
 * <p>Walks through the full-dump case first (the configured end value is expected back and flagged
 * as user specified), then the non-full-dump cases: no limit type, no limit delta, and each of the
 * {@code CURRENT* - 1} max watermark limits.
 */
@Test
public void testGetHighWatermarkOnAppendExtract() {
    String endValue = "20140101000000";
    SourceState sourceState = new SourceState();
    sourceState.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, true);
    sourceState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_END_VALUE, endValue);
    ExtractType extractType = ExtractType.APPEND_DAILY;
    TestPartitioner partitioner = new TestPartitioner(sourceState);
    Assert.assertEquals(partitioner.getHighWatermark(extractType, null), Long.parseLong(endValue), "High watermark should be " + endValue);
    Assert.assertEquals(partitioner.getUserSpecifiedHighWatermark(), true, "Should mark as user specified high watermark");
    partitioner.reset();
    // Test non-full-dump cases below
    sourceState.removeProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
    // No limit type
    Assert.assertEquals(partitioner.getHighWatermark(ExtractType.APPEND_BATCH, null), ConfigurationKeys.DEFAULT_WATERMARK_VALUE, "High watermark should be " + ConfigurationKeys.DEFAULT_WATERMARK_VALUE);
    Assert.assertEquals(partitioner.getUserSpecifiedHighWatermark(), false, "Should not mark as user specified high watermark");
    // No limit delta
    long expected = Long.parseLong(TestPartitioner.currentTimeString);
    Assert.assertEquals(partitioner.getHighWatermark(extractType, null), expected, "High watermark should be " + expected);
    Assert.assertEquals(partitioner.getUserSpecifiedHighWatermark(), false, "Should not mark as user specified high watermark");
    // CURRENTDATE - 1
    // From here on the test expects the flag to be true, so the failure message must say "should mark".
    String maxLimit = "CURRENTDATE-1";
    sourceState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_APPEND_MAX_WATERMARK_LIMIT, maxLimit);
    Assert.assertEquals(partitioner.getHighWatermark(extractType, null), 20161231235959L, "High watermark should be 20161231235959");
    Assert.assertEquals(partitioner.getUserSpecifiedHighWatermark(), true, "Should mark as user specified high watermark");
    partitioner.reset();
    // CURRENTHOUR - 1
    maxLimit = "CURRENTHOUR-1";
    sourceState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_APPEND_MAX_WATERMARK_LIMIT, maxLimit);
    Assert.assertEquals(partitioner.getHighWatermark(extractType, null), 20161231235959L, "High watermark should be 20161231235959");
    Assert.assertEquals(partitioner.getUserSpecifiedHighWatermark(), true, "Should mark as user specified high watermark");
    partitioner.reset();
    // CURRENTMINUTE - 1
    maxLimit = "CURRENTMINUTE-1";
    sourceState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_APPEND_MAX_WATERMARK_LIMIT, maxLimit);
    Assert.assertEquals(partitioner.getHighWatermark(extractType, null), 20161231235959L, "High watermark should be 20161231235959");
    Assert.assertEquals(partitioner.getUserSpecifiedHighWatermark(), true, "Should mark as user specified high watermark");
    partitioner.reset();
    // CURRENTSECOND - 1
    maxLimit = "CURRENTSECOND-1";
    sourceState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_APPEND_MAX_WATERMARK_LIMIT, maxLimit);
    Assert.assertEquals(partitioner.getHighWatermark(extractType, null), 20161231235959L, "High watermark should be 20161231235959");
    Assert.assertEquals(partitioner.getUserSpecifiedHighWatermark(), true, "Should mark as user specified high watermark");
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) ExtractType(org.apache.gobblin.source.extractor.extract.ExtractType) Test(org.testng.annotations.Test)

Example 7 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionerTest method testGetLowWatermarkOnUserOverride.

/**
 * Test getLowWatermark when the watermark is overridden by the user.
 *
 * <p>The configured start value is expected back both with the override flag set and with the
 * full-dump flag set; once the start value is removed, the default watermark value is expected.
 */
@Test
public void testGetLowWatermarkOnUserOverride() {
    final String configuredStart = "20140101000000";
    final long expectedStart = Long.parseLong(configuredStart);
    final String startMessage = "Low watermark should be " + configuredStart;

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_WATERMARK_OVERRIDE, true);
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_START_VALUE, configuredStart);
    TestPartitioner underTest = new TestPartitioner(state);

    // Watermark override enabled: the configured start value wins.
    Assert.assertEquals(underTest.getLowWatermark(null, null, -1, 0), expectedStart, startMessage);

    // Same outcome when the override flag is swapped for the full-dump flag.
    state.removeProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_WATERMARK_OVERRIDE);
    state.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, true);
    Assert.assertEquals(underTest.getLowWatermark(null, null, -1, 0), expectedStart, startMessage);

    // With no SOURCE_QUERYBASED_START_VALUE configured, ConfigurationKeys.DEFAULT_WATERMARK_VALUE is expected.
    state.removeProp(ConfigurationKeys.SOURCE_QUERYBASED_START_VALUE);
    Assert.assertEquals(
        underTest.getLowWatermark(null, null, -1, 0),
        ConfigurationKeys.DEFAULT_WATERMARK_VALUE,
        "Low watermark should be " + ConfigurationKeys.DEFAULT_WATERMARK_VALUE);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Test(org.testng.annotations.Test)

Example 8 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionerTest method testGetHighWatermarkOnSnapshotExtract.

/**
 * Test getHighWatermark for the Snapshot extract type.
 *
 * <p>Checks the value returned for the SIMPLE and TIMESTAMP watermark types and that neither call
 * marks the high watermark as user specified.
 */
@Test
public void testGetHighWatermarkOnSnapshotExtract() {
    final String configuredEnd = "20140101000000";
    final String notUserSpecified = "Should not mark as user specified high watermark";

    // SOURCE_QUERYBASED_END_VALUE is set, but it is not expected to be used when extract is full.
    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, true);
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_END_VALUE, configuredEnd);

    final ExtractType snapshot = ExtractType.SNAPSHOT;
    TestPartitioner underTest = new TestPartitioner(state);

    // SIMPLE watermark type: default watermark value expected.
    Assert.assertEquals(
        underTest.getHighWatermark(snapshot, WatermarkType.SIMPLE),
        ConfigurationKeys.DEFAULT_WATERMARK_VALUE,
        "High watermark should be " + ConfigurationKeys.DEFAULT_WATERMARK_VALUE);
    Assert.assertEquals(underTest.getUserSpecifiedHighWatermark(), false, notUserSpecified);

    // TIMESTAMP watermark type: the partitioner's current time string, parsed as a long, is expected.
    long currentTime = Long.parseLong(TestPartitioner.currentTimeString);
    Assert.assertEquals(
        underTest.getHighWatermark(snapshot, WatermarkType.TIMESTAMP),
        currentTime,
        "High watermark should be " + currentTime);
    Assert.assertEquals(underTest.getUserSpecifiedHighWatermark(), false, notUserSpecified);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) ExtractType(org.apache.gobblin.source.extractor.extract.ExtractType) Test(org.testng.annotations.Test)

Example 9 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class DatasetFinderSource method createWorkUnitStream.

/**
 * Builds a stream of work units from the datasets found for the given state.
 *
 * <p>When {@code drilldownIntoPartitions} is set, each {@link PartitionableDataset} is expanded into
 * its partitions and any other dataset is wrapped in a {@code DatasetWrapper}; every resulting
 * element is mapped through {@code workUnitForPartitionInternal}. Otherwise each dataset is mapped
 * through {@code workUnitForDataset}.
 *
 * <p>If listing the partitions of a dataset fails, the failure is logged (with its cause) and that
 * dataset contributes no work units; the rest of the stream is unaffected.
 *
 * @param state source state handed to {@code createDatasetsFinder}
 * @return the stream of work units
 * @throws IOException declared by this method; see {@code createDatasetsFinder} and
 *     {@code getDatasetsStream} for the actual failure conditions
 */
private Stream<WorkUnit> createWorkUnitStream(SourceState state) throws IOException {
    IterableDatasetFinder datasetsFinder = createDatasetsFinder(state);
    Stream<Dataset> datasetStream = datasetsFinder.getDatasetsStream(0, null);
    if (this.drilldownIntoPartitions) {
        return datasetStream.flatMap(dataset -> {
            if (dataset instanceof PartitionableDataset) {
                try {
                    return (Stream<PartitionableDataset.DatasetPartition>) ((PartitionableDataset) dataset).getPartitions(0, null);
                } catch (IOException ioe) {
                    // Best effort: skip this dataset, but keep the cause so the failure can be diagnosed.
                    log.error("Failed to get partitions for dataset " + dataset.getUrn(), ioe);
                    return Stream.empty();
                }
            } else {
                return Stream.of(new DatasetWrapper(dataset));
            }
        }).map(this::workUnitForPartitionInternal);
    } else {
        return datasetStream.map(this::workUnitForDataset);
    }
}
Also used : DatasetUtils(org.apache.gobblin.data.management.dataset.DatasetUtils) WorkUnitStream(org.apache.gobblin.source.workunit.WorkUnitStream) Getter(lombok.Getter) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) Stream(java.util.stream.Stream) BasicWorkUnitStream(org.apache.gobblin.source.workunit.BasicWorkUnitStream) SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitStreamSource(org.apache.gobblin.source.WorkUnitStreamSource) HadoopUtils(org.apache.gobblin.util.HadoopUtils) AllArgsConstructor(lombok.AllArgsConstructor) Dataset(org.apache.gobblin.dataset.Dataset) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) Dataset(org.apache.gobblin.dataset.Dataset) IOException(java.io.IOException)

Example 10 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class HivePurgerSource method initialize.

/**
 * Initializes this source from the given state.
 *
 * <p>Sets the timestamp, low watermark and execution count, creates the metric context and event
 * submitter, submits the cycle completion event, reads the work unit limits, instantiates the
 * configured dataset finder and purge policy, and populates the datasets. When proxying is enabled
 * in the state, {@code ProxyUtils.cancelTokens} is called at the end.
 *
 * @param state source state to read the configuration from
 * @throws IOException if cancelling tokens fails or is interrupted (the original exception is the
 *     cause); other calls made here may throw it as well
 */
@VisibleForTesting
protected void initialize(SourceState state) throws IOException {
    setTimeStamp();
    setLowWatermark(state);
    setExecutionCount(state);
    this.metricContext = Instrumented.getMetricContext(state, this.getClass());
    this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, ComplianceEvents.NAMESPACE).build();
    submitCycleCompletionEvent();
    this.maxWorkUnits = state.getPropAsInt(ComplianceConfigurationKeys.MAX_WORKUNITS_KEY, ComplianceConfigurationKeys.DEFAULT_MAX_WORKUNITS);
    this.maxWorkUnitExecutionAttempts = state.getPropAsInt(ComplianceConfigurationKeys.MAX_WORKUNIT_EXECUTION_ATTEMPTS_KEY, ComplianceConfigurationKeys.DEFAULT_MAX_WORKUNIT_EXECUTION_ATTEMPTS);
    // TODO: Event submitter and metrics will be added later
    String datasetFinderClass = state.getProp(ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS, HivePartitionFinder.class.getName());
    this.datasetFinder = GobblinConstructorUtils.invokeConstructor(DatasetsFinder.class, datasetFinderClass, state);
    populateDatasets();
    String policyClass = state.getProp(ComplianceConfigurationKeys.PURGE_POLICY_CLASS, HivePurgerPolicy.class.getName());
    this.policy = GobblinConstructorUtils.invokeConstructor(PurgePolicy.class, policyClass, this.lowWatermark);
    this.shouldProxy = state.getPropAsBoolean(ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_SHOULD_PROXY, ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_DEFAULT_SHOULD_PROXY);
    if (!this.shouldProxy) {
        return;
    }
    // cancel tokens
    try {
        ProxyUtils.cancelTokens(new State(state));
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up can still observe the interruption.
        Thread.currentThread().interrupt();
        throw new IOException(e);
    } catch (TException e) {
        throw new IOException(e);
    }
}
Also used : HivePartitionFinder(org.apache.gobblin.compliance.HivePartitionFinder) TException(org.apache.thrift.TException) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) SourceState(org.apache.gobblin.configuration.SourceState) DatasetsFinder(org.apache.gobblin.dataset.DatasetsFinder) IOException(java.io.IOException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

SourceState (org.apache.gobblin.configuration.SourceState)90 Test (org.testng.annotations.Test)76 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)44 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)38 State (org.apache.gobblin.configuration.State)30 WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState)11 Partition (org.apache.hadoop.hive.ql.metadata.Partition)8 Table (org.apache.hadoop.hive.ql.metadata.Table)8 IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder)7 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)7 Extract (org.apache.gobblin.source.workunit.Extract)7 DateTime (org.joda.time.DateTime)7 Dataset (org.apache.gobblin.dataset.Dataset)6 PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset)6 MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)6 WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream)6 IOException (java.io.IOException)5 Path (org.apache.hadoop.fs.Path)5 Gson (com.google.gson.Gson)4 JsonObject (com.google.gson.JsonObject)4