Search in sources :

Example 6 with IterableDatasetFinder

use of org.apache.gobblin.dataset.IterableDatasetFinder in project incubator-gobblin by apache.

the class LoopingDatasetFinderSourceTest method testDrilldown.

/**
 * Exercises {@code MySource} built with {@code true} as its first argument
 * (presumably the drill-down flag - confirm against MySource) over one plain
 * dataset and two partitioned datasets, with at most three work units per run.
 *
 * The test performs four consecutive runs, feeding the work units of each run
 * back as the "previous work unit states" of the next one, and verifies that
 * every run resumes where the prior one stopped and that the source loops back
 * to the beginning after emitting the end-of-datasets marker.
 */
@Test
public void testDrilldown() {
    // Fixture: "dataset1" has no partitions; "dataset2" and "dataset3" each carry p1..p3.
    Dataset plainDataset = new SimpleDatasetForTesting("dataset1");
    Dataset firstPartitioned = new SimplePartitionableDatasetForTesting("dataset2",
        Lists.newArrayList(
            new SimpleDatasetPartitionForTesting("p1"),
            new SimpleDatasetPartitionForTesting("p2"),
            new SimpleDatasetPartitionForTesting("p3")));
    Dataset secondPartitioned = new SimplePartitionableDatasetForTesting("dataset3",
        Lists.newArrayList(
            new SimpleDatasetPartitionForTesting("p1"),
            new SimpleDatasetPartitionForTesting("p2"),
            new SimpleDatasetPartitionForTesting("p3")));

    // The finder receives the datasets in reverse URN order; the assertions below
    // still expect dataset1 to come out first.
    IterableDatasetFinder finder =
        new StaticDatasetsFinderForTesting(Lists.newArrayList(secondPartitioned, firstPartitioned, plainDataset));
    MySource source = new MySource(true, finder);

    // Cap every run at three work units.
    SourceState state = new SourceState();
    state.setProp(LoopingDatasetFinderSource.MAX_WORK_UNITS_PER_RUN_KEY, 3);

    // Run 1: no previous state, so the first three work units are expected.
    List<WorkUnit> firstBatch = Lists.newArrayList(source.getWorkunitStream(state).getWorkUnits());
    Assert.assertEquals(firstBatch.size(), 3);
    Assert.assertEquals(firstBatch.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset1");
    Assert.assertNull(firstBatch.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN));
    Assert.assertEquals(firstBatch.get(1).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
    Assert.assertEquals(firstBatch.get(1).getProp(DatasetFinderSourceTest.PARTITION_URN), "p1");
    Assert.assertEquals(firstBatch.get(2).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
    Assert.assertEquals(firstBatch.get(2).getProp(DatasetFinderSourceTest.PARTITION_URN), "p2");

    // Run 2: replay run 1's work units as previous state through a spy; expect the next three.
    List<WorkUnitState> previousStates =
        firstBatch.stream().map(unit -> new WorkUnitState(unit)).collect(Collectors.toList());
    SourceState spiedState = Mockito.spy(state);
    Mockito.doReturn(previousStates).when(spiedState).getPreviousWorkUnitStates();
    List<WorkUnit> secondBatch = Lists.newArrayList(source.getWorkunitStream(spiedState).getWorkUnits());
    Assert.assertEquals(secondBatch.size(), 3);
    Assert.assertEquals(secondBatch.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
    Assert.assertEquals(secondBatch.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN), "p3");
    Assert.assertEquals(secondBatch.get(1).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset3");
    Assert.assertEquals(secondBatch.get(1).getProp(DatasetFinderSourceTest.PARTITION_URN), "p1");
    Assert.assertEquals(secondBatch.get(2).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset3");
    Assert.assertEquals(secondBatch.get(2).getProp(DatasetFinderSourceTest.PARTITION_URN), "p2");

    // Run 3: only one real work unit is left, followed by the end-of-datasets marker.
    previousStates = secondBatch.stream().map(unit -> new WorkUnitState(unit)).collect(Collectors.toList());
    Mockito.doReturn(previousStates).when(spiedState).getPreviousWorkUnitStates();
    List<WorkUnit> thirdBatch = Lists.newArrayList(source.getWorkunitStream(spiedState).getWorkUnits());
    Assert.assertEquals(thirdBatch.size(), 2);
    Assert.assertEquals(thirdBatch.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset3");
    Assert.assertEquals(thirdBatch.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN), "p3");
    Assert.assertTrue(thirdBatch.get(1).getPropAsBoolean(LoopingDatasetFinderSource.END_OF_DATASETS_KEY));

    // Run 4: everything was consumed, so the source starts over from the first work unit.
    previousStates = thirdBatch.stream().map(unit -> new WorkUnitState(unit)).collect(Collectors.toList());
    Mockito.doReturn(previousStates).when(spiedState).getPreviousWorkUnitStates();
    List<WorkUnit> fourthBatch = Lists.newArrayList(source.getWorkunitStream(spiedState).getWorkUnits());
    Assert.assertEquals(fourthBatch.size(), 3);
    Assert.assertEquals(fourthBatch.get(0).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset1");
    Assert.assertNull(fourthBatch.get(0).getProp(DatasetFinderSourceTest.PARTITION_URN));
    Assert.assertEquals(fourthBatch.get(1).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
    Assert.assertEquals(fourthBatch.get(1).getProp(DatasetFinderSourceTest.PARTITION_URN), "p1");
    Assert.assertEquals(fourthBatch.get(2).getProp(DatasetFinderSourceTest.DATASET_URN), "dataset2");
    Assert.assertEquals(fourthBatch.get(2).getProp(DatasetFinderSourceTest.PARTITION_URN), "p2");
}
Also used : SimpleDatasetPartitionForTesting(org.apache.gobblin.dataset.test.SimpleDatasetPartitionForTesting) WorkUnitStream(org.apache.gobblin.source.workunit.WorkUnitStream) SimpleDatasetForTesting(org.apache.gobblin.dataset.test.SimpleDatasetForTesting) SourceState(org.apache.gobblin.configuration.SourceState) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) Dataset(org.apache.gobblin.dataset.Dataset) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) SimplePartitionableDatasetForTesting(org.apache.gobblin.dataset.test.SimplePartitionableDatasetForTesting) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) StaticDatasetsFinderForTesting(org.apache.gobblin.dataset.test.StaticDatasetsFinderForTesting) Test(org.testng.annotations.Test)

Example 7 with IterableDatasetFinder

use of org.apache.gobblin.dataset.IterableDatasetFinder in project incubator-gobblin by apache.

the class DatasetFinderSourceTest method testNonDrilledDown.

/**
 * Exercises {@code MySource} built with {@code false} as its first argument
 * (presumably disabling partition drill-down - confirm against MySource).
 *
 * Expects exactly one work unit per dataset, in the order the datasets were
 * supplied, none of them carrying a partition URN - including the partitioned
 * "dataset2". Also checks that the streaming entry point yields a list equal
 * to the one returned by {@code getWorkunits}.
 */
@Test
public void testNonDrilledDown() {
    // Fixture: two plain datasets around one dataset that has partitions p1 and p2.
    Dataset first = new SimpleDatasetForTesting("dataset1");
    Dataset partitioned = new SimplePartitionableDatasetForTesting("dataset2",
        Lists.newArrayList(
            new SimpleDatasetPartitionForTesting("p1"),
            new SimpleDatasetPartitionForTesting("p2")));
    Dataset last = new SimpleDatasetForTesting("dataset3");

    IterableDatasetFinder finder =
        new StaticDatasetsFinderForTesting(Lists.newArrayList(first, partitioned, last));
    MySource source = new MySource(false, finder);

    // List-based entry point: one work unit per dataset, no partition information.
    List<WorkUnit> listed = source.getWorkunits(new SourceState());
    Assert.assertEquals(listed.size(), 3);
    Assert.assertEquals(listed.get(0).getProp(DATASET_URN), "dataset1");
    Assert.assertNull(listed.get(0).getProp(PARTITION_URN));
    Assert.assertEquals(listed.get(1).getProp(DATASET_URN), "dataset2");
    Assert.assertNull(listed.get(1).getProp(PARTITION_URN));
    Assert.assertEquals(listed.get(2).getProp(DATASET_URN), "dataset3");
    Assert.assertNull(listed.get(2).getProp(PARTITION_URN));

    // Stream-based entry point must produce an equal list.
    WorkUnitStream streamed = source.getWorkunitStream(new SourceState());
    Assert.assertEquals(Lists.newArrayList(streamed.getWorkUnits()), listed);
}
Also used : SimpleDatasetPartitionForTesting(org.apache.gobblin.dataset.test.SimpleDatasetPartitionForTesting) WorkUnitStream(org.apache.gobblin.source.workunit.WorkUnitStream) SimpleDatasetForTesting(org.apache.gobblin.dataset.test.SimpleDatasetForTesting) SourceState(org.apache.gobblin.configuration.SourceState) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) Dataset(org.apache.gobblin.dataset.Dataset) SimplePartitionableDatasetForTesting(org.apache.gobblin.dataset.test.SimplePartitionableDatasetForTesting) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) StaticDatasetsFinderForTesting(org.apache.gobblin.dataset.test.StaticDatasetsFinderForTesting) Test(org.testng.annotations.Test)

Example 8 with IterableDatasetFinder

use of org.apache.gobblin.dataset.IterableDatasetFinder in project incubator-gobblin by apache.

the class LoopingDatasetFinderSource method getWorkunitStream.

/**
 * Builds the work unit stream for the current run, resuming after the last
 * work unit recorded by the previous run.
 *
 * The previous work unit state with the highest {@code WORK_UNIT_ORDINAL} is
 * located; unless that state is flagged with {@code END_OF_DATASETS_KEY}, its
 * dataset URN and partition URN are passed to the {@code DeepIterator} as the
 * watermarks. When there is no previous state, or the previous run reached
 * the end of the datasets, both watermarks are {@code null}.
 *
 * @param state source state providing the per-run work unit limit
 *              ({@code MAX_WORK_UNITS_PER_RUN_KEY}, defaulting to
 *              {@code MAX_WORK_UNITS_PER_RUN}) and the previous work unit states
 * @return a finite {@link WorkUnitStream} backed by a {@code DeepIterator}
 *         over the sorted dataset stream
 * @throws RuntimeException if reading {@code WORK_UNIT_ORDINAL} from a previous
 *         work unit state raises a {@link NumberFormatException} (the original
 *         exception is attached as the cause), or if an {@link IOException}
 *         is raised while setting up the dataset stream
 */
@Override
public WorkUnitStream getWorkunitStream(SourceState state) {
    try {
        int maxWorkUnits = state.getPropAsInt(MAX_WORK_UNITS_PER_RUN_KEY, MAX_WORK_UNITS_PER_RUN);
        List<WorkUnitState> previousWorkUnitStates = state.getPreviousWorkUnitStates();
        Optional<WorkUnitState> maxWorkUnit;
        try {
            // Keep whichever state has the larger ordinal; on a tie the later element wins.
            maxWorkUnit = previousWorkUnitStates.stream().reduce((wu1, wu2) -> {
                int wu1Ordinal = wu1.getPropAsInt(WORK_UNIT_ORDINAL);
                int wu2Ordinal = wu2.getPropAsInt(WORK_UNIT_ORDINAL);
                return wu1Ordinal > wu2Ordinal ? wu1 : wu2;
            });
        } catch (NumberFormatException nfe) {
            // Chain the original exception so the offending value and stack trace are not lost.
            throw new RuntimeException("Work units in state store are corrupted! Missing or malformed " + WORK_UNIT_ORDINAL, nfe);
        }
        String previousDatasetUrnWatermark = null;
        String previousPartitionUrnWatermark = null;
        // Only resume from a watermark when the previous run did not finish the full pass.
        if (maxWorkUnit.isPresent() && !maxWorkUnit.get().getPropAsBoolean(END_OF_DATASETS_KEY, false)) {
            previousDatasetUrnWatermark = maxWorkUnit.get().getProp(DATASET_URN);
            previousPartitionUrnWatermark = maxWorkUnit.get().getProp(PARTITION_URN);
        }
        IterableDatasetFinder datasetsFinder = createDatasetsFinder(state);
        Stream<Dataset> datasetStream = datasetsFinder.getDatasetsStream(Spliterator.SORTED, this.lexicographicalComparator);
        datasetStream = sortStreamLexicographically(datasetStream);
        return new BasicWorkUnitStream.Builder(new DeepIterator(datasetStream.iterator(), previousDatasetUrnWatermark, previousPartitionUrnWatermark, maxWorkUnits)).setFiniteStream(true).build();
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
Also used : WorkUnitStream(org.apache.gobblin.source.workunit.WorkUnitStream) Iterator(java.util.Iterator) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) AbstractIterator(com.google.common.collect.AbstractIterator) IOException(java.io.IOException) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) PeekingIterator(com.google.common.collect.PeekingIterator) Iterators(com.google.common.collect.Iterators) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) NoopTask(org.apache.gobblin.runtime.task.NoopTask) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) Stream(java.util.stream.Stream) Lists(com.google.common.collect.Lists) BasicWorkUnitStream(org.apache.gobblin.source.workunit.BasicWorkUnitStream) SourceState(org.apache.gobblin.configuration.SourceState) Optional(java.util.Optional) URNIdentified(org.apache.gobblin.dataset.URNIdentified) StreamSupport(java.util.stream.StreamSupport) Spliterator(java.util.Spliterator) Dataset(org.apache.gobblin.dataset.Dataset) URNLexicographicalComparator(org.apache.gobblin.dataset.comparators.URNLexicographicalComparator) Nullable(javax.annotation.Nullable) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) Dataset(org.apache.gobblin.dataset.Dataset) IOException(java.io.IOException)

Aggregations

IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder)8 SourceState (org.apache.gobblin.configuration.SourceState)7 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)7 Dataset (org.apache.gobblin.dataset.Dataset)6 PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset)6 WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream)6 Test (org.testng.annotations.Test)5 SimpleDatasetForTesting (org.apache.gobblin.dataset.test.SimpleDatasetForTesting)4 SimpleDatasetPartitionForTesting (org.apache.gobblin.dataset.test.SimpleDatasetPartitionForTesting)4 SimplePartitionableDatasetForTesting (org.apache.gobblin.dataset.test.SimplePartitionableDatasetForTesting)4 StaticDatasetsFinderForTesting (org.apache.gobblin.dataset.test.StaticDatasetsFinderForTesting)4 IOException (java.io.IOException)3 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)3 List (java.util.List)2 Stream (java.util.stream.Stream)2 Nullable (javax.annotation.Nullable)2 Slf4j (lombok.extern.slf4j.Slf4j)2 CopyableDatasetRequestor (org.apache.gobblin.data.management.partition.CopyableDatasetRequestor)2 FileSet (org.apache.gobblin.data.management.partition.FileSet)2 BasicWorkUnitStream (org.apache.gobblin.source.workunit.BasicWorkUnitStream)2