Search in sources :

Example 81 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class JsonRecordAvroSchemaToAvroConverterTest method setUp.

/**
 * Reads the Avro schema and JSON record fixtures from the classpath and builds the
 * {@link WorkUnitState} stored in {@code this.state}.
 *
 * <p>Each classpath stream is opened in a try-with-resources block so that it is closed
 * once its content has been read; {@code IOUtils.toString} is not relied upon to close it.
 *
 * @throws Exception if either fixture cannot be read or parsed
 */
@BeforeClass
public void setUp() throws Exception {
    String avroSchemaString;
    // Fully qualified type name: no import for InputStream is assumed to exist in this file.
    try (java.io.InputStream schemaStream = this.getClass().getResourceAsStream("/converter/jsonToAvroSchema.avsc")) {
        avroSchemaString = IOUtils.toString(schemaStream, StandardCharsets.UTF_8);
    }
    try (java.io.InputStream recordStream = this.getClass().getResourceAsStream("/converter/jsonToAvroRecord.json")) {
        this.jsonRecord = new JsonParser().parse(IOUtils.toString(recordStream, StandardCharsets.UTF_8)).getAsJsonObject();
    }
    SourceState source = new SourceState();
    this.state = new WorkUnitState(source.createWorkUnit(source.createExtract(TableType.SNAPSHOT_ONLY, "test_table", "test_namespace")));
    // The schema text and the name of the field to ignore are passed to the converter via the state.
    this.state.setProp(ConfigurationKeys.CONVERTER_AVRO_SCHEMA_KEY, avroSchemaString);
    this.state.setProp(ConfigurationKeys.CONVERTER_IGNORE_FIELDS, "fieldToIgnore");
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) JsonParser(com.google.gson.JsonParser) BeforeClass(org.testng.annotations.BeforeClass)

Example 82 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class LoopingDatasetFinderSourceTest method testNonDrilldown.

/**
 * Runs {@code MySource} with drilldown disabled and at most three work units per run,
 * feeding each run's work units back in as the previous work unit states of the next run.
 * Checks that the second run resumes after the first and that the third run starts over.
 */
@Test
public void testNonDrilldown() {
    Dataset ds1 = new SimpleDatasetForTesting("dataset1");
    Dataset ds2 = new SimplePartitionableDatasetForTesting("dataset2", Lists.newArrayList(new SimpleDatasetPartitionForTesting("p1"), new SimpleDatasetPartitionForTesting("p2")));
    Dataset ds3 = new SimpleDatasetForTesting("dataset3");
    Dataset ds4 = new SimpleDatasetForTesting("dataset4");
    Dataset ds5 = new SimpleDatasetForTesting("dataset5");
    // The finder receives the datasets in reverse order; the assertions below expect dataset1 first.
    IterableDatasetFinder datasetFinder = new StaticDatasetsFinderForTesting(Lists.newArrayList(ds5, ds4, ds3, ds2, ds1));
    MySource source = new MySource(false, datasetFinder);
    SourceState state = new SourceState();
    state.setProp(LoopingDatasetFinderSource.MAX_WORK_UNITS_PER_RUN_KEY, 3);

    // URNs expected both on the very first run and again after looping around.
    String[] leadingUrns = {"dataset1", "dataset2", "dataset3"};
    // URNs expected on the second run, before the end-of-datasets marker.
    String[] trailingUrns = {"dataset4", "dataset5"};

    // First run
    WorkUnitStream stream = source.getWorkunitStream(state);
    List<WorkUnit> produced = Lists.newArrayList(stream.getWorkUnits());
    Assert.assertEquals(produced.size(), 3);
    for (int i = 0; i < leadingUrns.length; i++) {
        WorkUnit unit = produced.get(i);
        Assert.assertEquals(unit.getProp(DatasetFinderSourceTest.DATASET_URN), leadingUrns[i]);
        Assert.assertNull(unit.getProp(DatasetFinderSourceTest.PARTITION_URN));
    }

    // Second run should continue where it left off
    List<WorkUnitState> previousStates = produced.stream().map(WorkUnitState::new).collect(Collectors.toList());
    SourceState spiedState = Mockito.spy(state);
    Mockito.doReturn(previousStates).when(spiedState).getPreviousWorkUnitStates();
    stream = source.getWorkunitStream(spiedState);
    produced = Lists.newArrayList(stream.getWorkUnits());
    Assert.assertEquals(produced.size(), 3);
    for (int i = 0; i < trailingUrns.length; i++) {
        WorkUnit unit = produced.get(i);
        Assert.assertEquals(unit.getProp(DatasetFinderSourceTest.DATASET_URN), trailingUrns[i]);
        Assert.assertNull(unit.getProp(DatasetFinderSourceTest.PARTITION_URN));
    }
    // The last work unit of this run carries the end-of-datasets flag.
    Assert.assertTrue(produced.get(2).getPropAsBoolean(LoopingDatasetFinderSource.END_OF_DATASETS_KEY));

    // Loop around
    previousStates = produced.stream().map(WorkUnitState::new).collect(Collectors.toList());
    Mockito.doReturn(previousStates).when(spiedState).getPreviousWorkUnitStates();
    stream = source.getWorkunitStream(spiedState);
    produced = Lists.newArrayList(stream.getWorkUnits());
    Assert.assertEquals(produced.size(), 3);
    for (int i = 0; i < leadingUrns.length; i++) {
        WorkUnit unit = produced.get(i);
        Assert.assertEquals(unit.getProp(DatasetFinderSourceTest.DATASET_URN), leadingUrns[i]);
        Assert.assertNull(unit.getProp(DatasetFinderSourceTest.PARTITION_URN));
    }
}
Also used : SimpleDatasetPartitionForTesting(org.apache.gobblin.dataset.test.SimpleDatasetPartitionForTesting) WorkUnitStream(org.apache.gobblin.source.workunit.WorkUnitStream) SimpleDatasetForTesting(org.apache.gobblin.dataset.test.SimpleDatasetForTesting) SourceState(org.apache.gobblin.configuration.SourceState) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) Dataset(org.apache.gobblin.dataset.Dataset) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) SimplePartitionableDatasetForTesting(org.apache.gobblin.dataset.test.SimplePartitionableDatasetForTesting) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) StaticDatasetsFinderForTesting(org.apache.gobblin.dataset.test.StaticDatasetsFinderForTesting) Test(org.testng.annotations.Test)

Example 83 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class LoopingDatasetFinderSourceTest method testDrilldown.

/**
 * Runs {@code MySource} with drilldown enabled and at most three work units per run,
 * feeding each run's work units back in as the previous work unit states of the next run.
 * Checks the (dataset, partition) pairs produced on four consecutive runs, the last of which
 * starts over from the beginning.
 */
@Test
public void testDrilldown() {
    // Three datasets: one plain, two with partitions p1..p3
    Dataset plain = new SimpleDatasetForTesting("dataset1");
    Dataset partitionedA = new SimplePartitionableDatasetForTesting("dataset2", Lists.newArrayList(new SimpleDatasetPartitionForTesting("p1"), new SimpleDatasetPartitionForTesting("p2"), new SimpleDatasetPartitionForTesting("p3")));
    Dataset partitionedB = new SimplePartitionableDatasetForTesting("dataset3", Lists.newArrayList(new SimpleDatasetPartitionForTesting("p1"), new SimpleDatasetPartitionForTesting("p2"), new SimpleDatasetPartitionForTesting("p3")));
    IterableDatasetFinder datasetFinder = new StaticDatasetsFinderForTesting(Lists.newArrayList(partitionedB, partitionedA, plain));
    MySource source = new MySource(true, datasetFinder);

    // Cap every run at three work units
    SourceState state = new SourceState();
    state.setProp(LoopingDatasetFinderSource.MAX_WORK_UNITS_PER_RUN_KEY, 3);

    // Expected {dataset URN, partition URN} pairs; a null partition means "no partition property".
    String[][] startBatch = {{"dataset1", null}, {"dataset2", "p1"}, {"dataset2", "p2"}};
    String[][] middleBatch = {{"dataset2", "p3"}, {"dataset3", "p1"}, {"dataset3", "p2"}};

    // Run 1: the first three work units
    WorkUnitStream stream = source.getWorkunitStream(state);
    List<WorkUnit> produced = Lists.newArrayList(stream.getWorkUnits());
    Assert.assertEquals(produced.size(), 3);
    for (int i = 0; i < startBatch.length; i++) {
        WorkUnit unit = produced.get(i);
        Assert.assertEquals(unit.getProp(DatasetFinderSourceTest.DATASET_URN), startBatch[i][0]);
        if (startBatch[i][1] == null) {
            Assert.assertNull(unit.getProp(DatasetFinderSourceTest.PARTITION_URN));
        } else {
            Assert.assertEquals(unit.getProp(DatasetFinderSourceTest.PARTITION_URN), startBatch[i][1]);
        }
    }

    // Run 2: resumes after the last work unit of run 1
    List<WorkUnitState> previousStates = produced.stream().map(WorkUnitState::new).collect(Collectors.toList());
    SourceState spiedState = Mockito.spy(state);
    Mockito.doReturn(previousStates).when(spiedState).getPreviousWorkUnitStates();
    stream = source.getWorkunitStream(spiedState);
    produced = Lists.newArrayList(stream.getWorkUnits());
    Assert.assertEquals(produced.size(), 3);
    for (int i = 0; i < middleBatch.length; i++) {
        WorkUnit unit = produced.get(i);
        Assert.assertEquals(unit.getProp(DatasetFinderSourceTest.DATASET_URN), middleBatch[i][0]);
        Assert.assertEquals(unit.getProp(DatasetFinderSourceTest.PARTITION_URN), middleBatch[i][1]);
    }

    // Run 3: one remaining partition, followed by the end-of-datasets marker
    previousStates = produced.stream().map(WorkUnitState::new).collect(Collectors.toList());
    Mockito.doReturn(previousStates).when(spiedState).getPreviousWorkUnitStates();
    stream = source.getWorkunitStream(spiedState);
    produced = Lists.newArrayList(stream.getWorkUnits());
    Assert.assertEquals(produced.size(), 2);
    WorkUnit lastPartition = produced.get(0);
    Assert.assertEquals(lastPartition.getProp(DatasetFinderSourceTest.DATASET_URN), "dataset3");
    Assert.assertEquals(lastPartition.getProp(DatasetFinderSourceTest.PARTITION_URN), "p3");
    Assert.assertTrue(produced.get(1).getPropAsBoolean(LoopingDatasetFinderSource.END_OF_DATASETS_KEY));

    // Run 4: everything was consumed, so the source loops around to the start
    previousStates = produced.stream().map(WorkUnitState::new).collect(Collectors.toList());
    Mockito.doReturn(previousStates).when(spiedState).getPreviousWorkUnitStates();
    stream = source.getWorkunitStream(spiedState);
    produced = Lists.newArrayList(stream.getWorkUnits());
    Assert.assertEquals(produced.size(), 3);
    for (int i = 0; i < startBatch.length; i++) {
        WorkUnit unit = produced.get(i);
        Assert.assertEquals(unit.getProp(DatasetFinderSourceTest.DATASET_URN), startBatch[i][0]);
        if (startBatch[i][1] == null) {
            Assert.assertNull(unit.getProp(DatasetFinderSourceTest.PARTITION_URN));
        } else {
            Assert.assertEquals(unit.getProp(DatasetFinderSourceTest.PARTITION_URN), startBatch[i][1]);
        }
    }
}
Also used : SimpleDatasetPartitionForTesting(org.apache.gobblin.dataset.test.SimpleDatasetPartitionForTesting) WorkUnitStream(org.apache.gobblin.source.workunit.WorkUnitStream) SimpleDatasetForTesting(org.apache.gobblin.dataset.test.SimpleDatasetForTesting) SourceState(org.apache.gobblin.configuration.SourceState) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) Dataset(org.apache.gobblin.dataset.Dataset) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) SimplePartitionableDatasetForTesting(org.apache.gobblin.dataset.test.SimplePartitionableDatasetForTesting) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) StaticDatasetsFinderForTesting(org.apache.gobblin.dataset.test.StaticDatasetsFinderForTesting) Test(org.testng.annotations.Test)

Example 84 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class DatasetFinderSourceTest method testNonDrilledDown.

/**
 * Checks that {@code MySource}, with drilldown disabled, emits one work unit per dataset
 * with no partition property, and that the streaming API yields the same work units as
 * the list API.
 */
@Test
public void testNonDrilledDown() {
    Dataset ds1 = new SimpleDatasetForTesting("dataset1");
    Dataset ds2 = new SimplePartitionableDatasetForTesting("dataset2", Lists.newArrayList(new SimpleDatasetPartitionForTesting("p1"), new SimpleDatasetPartitionForTesting("p2")));
    Dataset ds3 = new SimpleDatasetForTesting("dataset3");
    IterableDatasetFinder datasetFinder = new StaticDatasetsFinderForTesting(Lists.newArrayList(ds1, ds2, ds3));
    MySource source = new MySource(false, datasetFinder);

    // List-based API
    List<WorkUnit> listed = source.getWorkunits(new SourceState());
    Assert.assertEquals(listed.size(), 3);
    String[] expectedUrns = {"dataset1", "dataset2", "dataset3"};
    for (int i = 0; i < expectedUrns.length; i++) {
        WorkUnit unit = listed.get(i);
        Assert.assertEquals(unit.getProp(DATASET_URN), expectedUrns[i]);
        // Even the partitionable dataset2 is expected to carry no partition URN here.
        Assert.assertNull(unit.getProp(PARTITION_URN));
    }

    // Stream-based API must agree with the list-based one
    WorkUnitStream stream = source.getWorkunitStream(new SourceState());
    List<WorkUnit> streamed = Lists.newArrayList(stream.getWorkUnits());
    Assert.assertEquals(streamed, listed);
}
Also used : SimpleDatasetPartitionForTesting(org.apache.gobblin.dataset.test.SimpleDatasetPartitionForTesting) WorkUnitStream(org.apache.gobblin.source.workunit.WorkUnitStream) SimpleDatasetForTesting(org.apache.gobblin.dataset.test.SimpleDatasetForTesting) SourceState(org.apache.gobblin.configuration.SourceState) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) Dataset(org.apache.gobblin.dataset.Dataset) SimplePartitionableDatasetForTesting(org.apache.gobblin.dataset.test.SimplePartitionableDatasetForTesting) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) StaticDatasetsFinderForTesting(org.apache.gobblin.dataset.test.StaticDatasetsFinderForTesting) Test(org.testng.annotations.Test)

Example 85 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class QueryBasedSource method getCombinedState.

/**
 * Builds the state to use for a table by layering {@code tableSpecificState} on top of {@code state}.
 *
 * @param state the base source state
 * @param tableSpecificState additional state for one table; may be {@code null}
 * @return {@code state} itself when {@code tableSpecificState} is {@code null}; otherwise a new
 *         {@link SourceState} created from {@code state} (with its previous dataset states and
 *         previous work unit states) to which {@code tableSpecificState} has been added
 */
private static SourceState getCombinedState(SourceState state, State tableSpecificState) {
    if (tableSpecificState != null) {
        // Copy rather than mutate, so the caller's state is left untouched by the addAll below.
        SourceState merged = new SourceState(state, state.getPreviousDatasetStatesByUrns(), state.getPreviousWorkUnitStates());
        merged.addAll(tableSpecificState);
        return merged;
    }
    return state;
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState)

Aggregations

SourceState (org.apache.gobblin.configuration.SourceState): 90, Test (org.testng.annotations.Test): 76, WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 44, WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 38, State (org.apache.gobblin.configuration.State): 30, WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState): 11, Partition (org.apache.hadoop.hive.ql.metadata.Partition): 8, Table (org.apache.hadoop.hive.ql.metadata.Table): 8, IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 7, LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 7, Extract (org.apache.gobblin.source.workunit.Extract): 7, DateTime (org.joda.time.DateTime): 7, Dataset (org.apache.gobblin.dataset.Dataset): 6, PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset): 6, MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 6, WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6, IOException (java.io.IOException): 5, Path (org.apache.hadoop.fs.Path): 5, Gson (com.google.gson.Gson): 4, JsonObject (com.google.gson.JsonObject): 4