Search in sources :

Example 86 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class QueryBasedSource method getWorkunits.

/**
 * Builds the work units for the current run.
 *
 * <p>Work units are generated for every filtered source entity and for every entity that
 * only has a watermark recorded from a previous run (a "carry-over" entity). Work units
 * returned by {@code getPreviousWorkUnitsForRetry} are appended, and the combined list is
 * handed to {@code pack} together with the configured maximum number of mappers.
 *
 * @param state the source state for this run
 * @return the packed list of work units
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    initLogger(state);
    lineageInfo = LineageInfo.getLineageInfo(state.getBroker());

    Set<SourceEntity> entities = getFilteredSourceEntities(state);

    // Table-specific properties come either from the config store or from the job state.
    Map<SourceEntity, State> tableSpecificPropsMap;
    if (shouldObtainTablePropsFromConfigStore(state)) {
        tableSpecificPropsMap = getTableSpecificPropsFromConfigStore(entities, state);
    } else {
        tableSpecificPropsMap = getTableSpecificPropsFromState(entities, state);
    }

    Map<SourceEntity, Long> prevWatermarksByTable = getPreviousWatermarksForAllTables(state);

    List<WorkUnit> workUnits = Lists.newArrayList();
    for (SourceEntity sourceEntity : Sets.union(entities, prevWatermarksByTable.keySet())) {
        // An entity that is known only from the previous run's watermarks is a carry-over.
        boolean carriedOver = !entities.contains(sourceEntity);
        log.info("Source entity to be processed: {}, carry-over from previous state: {} ", sourceEntity, carriedOver);

        // NOTE(review): the props lookup may return null for carry-over entities;
        // presumably getCombinedState tolerates that -- confirm.
        SourceState combinedState = getCombinedState(state, tableSpecificPropsMap.get(sourceEntity));

        long previousWatermark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
        if (prevWatermarksByTable.containsKey(sourceEntity)) {
            previousWatermark = prevWatermarksByTable.get(sourceEntity);
        }

        // For a carry-over entity, override the end value (high watermark) with the
        // previous watermark.
        if (carriedOver) {
            combinedState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_END_VALUE, previousWatermark);
        }

        workUnits.addAll(generateWorkUnits(sourceEntity, combinedState, previousWatermark));
    }
    log.info("Total number of workunits for the current run: " + workUnits.size());

    List<WorkUnit> retryWorkUnits = this.getPreviousWorkUnitsForRetry(state);
    log.info("Total number of incomplete tasks from the previous run: " + retryWorkUnits.size());
    workUnits.addAll(retryWorkUnits);

    int maxMappers = state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);
    return pack(workUnits, maxMappers);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkingState(org.apache.gobblin.configuration.WorkUnitState.WorkingState) SourceState(org.apache.gobblin.configuration.SourceState) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 87 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class TestStressTestingSource method testSourceExtractor.

/**
 * Checks that {@link StressTestingSource} produces the configured number of work units and
 * that each work unit's extractor reports the configured record count and returns records
 * of the configured size.
 */
@Test
public void testSourceExtractor() throws DataRecordException, IOException {
    final int MEM_ALLOC_BYTES = 100;
    final int NUM_WORK_UNITS = 10;
    final int COMPUTE_TIME_MICRO = 10;
    final int NUM_RECORDS = 10000;
    SourceState state = new SourceState();
    state.setProp(StressTestingSource.NUM_WORK_UNITS_KEY, NUM_WORK_UNITS);
    state.setProp(StressTestingSource.MEM_ALLOC_BYTES_KEY, MEM_ALLOC_BYTES);
    state.setProp(StressTestingSource.COMPUTE_TIME_MICRO_KEY, COMPUTE_TIME_MICRO);
    state.setProp(StressTestingSource.NUM_RECORDS_KEY, NUM_RECORDS);
    StressTestingSource source = new StressTestingSource();
    List<WorkUnit> wus = source.getWorkunits(state);
    Assert.assertEquals(wus.size(), NUM_WORK_UNITS);
    for (int i = 0; i < wus.size(); ++i) {
        WorkUnit wu = wus.get(i);
        WorkUnitState wuState = new WorkUnitState(wu, state);
        Extractor<String, byte[]> extractor = source.getExtractor(wuState);
        Assert.assertEquals(extractor.getExpectedRecordCount(), NUM_RECORDS);
        // Compare against the configured size rather than a duplicated literal, so the
        // assertion stays in sync with the setup above.
        Assert.assertEquals(extractor.readRecord(null).length, MEM_ALLOC_BYTES);
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 88 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class TestStressTestingSource method testComputeTime.

/**
 * Reads every record from a single {@link StressTestingSource} work unit and checks that the
 * total elapsed time is within five seconds of
 * {@code COMPUTE_TIME_MICRO * NUM_RECORDS} microseconds. The test is disabled.
 */
@Test(enabled = false)
public void testComputeTime() throws DataRecordException, IOException {
    final int MEM_ALLOC_BYTES = 100;
    final int NUM_WORK_UNITS = 1;
    final int COMPUTE_TIME_MICRO = 10000;
    final int NUM_RECORDS = 500;
    SourceState state = new SourceState();
    state.setProp(StressTestingSource.NUM_WORK_UNITS_KEY, NUM_WORK_UNITS);
    state.setProp(StressTestingSource.MEM_ALLOC_BYTES_KEY, MEM_ALLOC_BYTES);
    state.setProp(StressTestingSource.COMPUTE_TIME_MICRO_KEY, COMPUTE_TIME_MICRO);
    state.setProp(StressTestingSource.NUM_RECORDS_KEY, NUM_RECORDS);
    StressTestingSource source = new StressTestingSource();
    List<WorkUnit> wus = source.getWorkunits(state);
    Assert.assertEquals(wus.size(), NUM_WORK_UNITS);
    WorkUnit wu = wus.get(0);
    WorkUnitState wuState = new WorkUnitState(wu, state);
    Extractor<String, byte[]> extractor = source.getExtractor(wuState);
    byte[] record;
    long startTimeNano = System.nanoTime();
    while ((record = extractor.readRecord(null)) != null) {
        // Compare against the configured size rather than a duplicated literal.
        Assert.assertEquals(record.length, MEM_ALLOC_BYTES);
    }
    long endTimeNano = System.nanoTime();
    long timeSpentMicro = (endTimeNano - startTimeNano) / 1000;
    // Widen before multiplying so larger settings cannot overflow int arithmetic.
    long expectedTimeMicro = (long) COMPUTE_TIME_MICRO * NUM_RECORDS;
    // Allowed difference between expected and actual time spent: 5 seconds.
    final long toleranceMicro = 5000000L;
    Assert.assertTrue(Math.abs(timeSpentMicro - expectedTimeMicro) < toleranceMicro, "Time spent " + timeSpentMicro);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 89 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class TestHelloWorldSource method testSourceExtractor.

/**
 * Checks that {@link HelloWorldSource} creates one work unit per requested hello, that the
 * work units carry ids 1..N in list order, and that each extractor reports a single
 * record whose content is the matching greeting.
 */
@Test
public void testSourceExtractor() throws DataRecordException, IOException {
    SourceState sourceState = new SourceState();
    sourceState.setProp(HelloWorldSource.NUM_HELLOS_FULL_KEY, 10);

    HelloWorldSource helloSource = new HelloWorldSource();
    List<WorkUnit> workUnits = helloSource.getWorkunits(sourceState);
    Assert.assertEquals(workUnits.size(), 10);

    // Hello ids are expected to be 1-based and to follow the order of the returned list.
    int expectedHelloId = 0;
    for (WorkUnit workUnit : workUnits) {
        expectedHelloId++;
        Assert.assertEquals(workUnit.getPropAsInt(HelloWorldSource.HELLO_ID_FULL_KEY), expectedHelloId);

        WorkUnitState workUnitState = new WorkUnitState(workUnit, sourceState);
        Extractor<String, String> helloExtractor = helloSource.getExtractor(workUnitState);
        Assert.assertEquals(helloExtractor.getExpectedRecordCount(), 1);

        String expectedGreeting = "Hello world " + expectedHelloId + " !";
        Assert.assertEquals(helloExtractor.readRecord(null), expectedGreeting);
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 90 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class LoopingDatasetFinderSource method getWorkunitStream.

/**
 * Creates a finite work unit stream over the datasets found by
 * {@code createDatasetsFinder}.
 *
 * <p>The previous work unit state with the highest {@code WORK_UNIT_ORDINAL} is looked up.
 * Unless that state is flagged with {@code END_OF_DATASETS_KEY}, its dataset and partition
 * URNs are passed to the {@code DeepIterator} as watermarks (presumably the point from which
 * iteration resumes -- the iterator itself is not visible here). At most
 * {@code MAX_WORK_UNITS_PER_RUN_KEY} work units (default {@code MAX_WORK_UNITS_PER_RUN}) are
 * requested per run.
 *
 * @param state the source state, including the previous run's work unit states
 * @return a finite {@link WorkUnitStream} backed by a {@code DeepIterator}
 * @throws RuntimeException if a previous work unit's ordinal is missing or malformed, or if
 *     an {@link IOException} is thrown while building the stream
 */
@Override
public WorkUnitStream getWorkunitStream(SourceState state) {
    try {
        int maxWorkUnits = state.getPropAsInt(MAX_WORK_UNITS_PER_RUN_KEY, MAX_WORK_UNITS_PER_RUN);
        List<WorkUnitState> previousWorkUnitStates = state.getPreviousWorkUnitStates();
        Optional<WorkUnitState> maxWorkUnit;
        try {
            // Keep the work unit state with the larger ordinal of each pair.
            maxWorkUnit = previousWorkUnitStates.stream().reduce((wu1, wu2) -> {
                int wu1Ordinal = wu1.getPropAsInt(WORK_UNIT_ORDINAL);
                int wu2Ordinal = wu2.getPropAsInt(WORK_UNIT_ORDINAL);
                return wu1Ordinal > wu2Ordinal ? wu1 : wu2;
            });
        } catch (NumberFormatException nfe) {
            // Preserve the original exception so the offending value and stack trace are not lost.
            throw new RuntimeException("Work units in state store are corrupted! Missing or malformed " + WORK_UNIT_ORDINAL, nfe);
        }
        String previousDatasetUrnWatermark = null;
        String previousPartitionUrnWatermark = null;
        // Watermarks stay null when there is no previous state or the previous run was
        // flagged as having reached the end of the datasets.
        if (maxWorkUnit.isPresent() && !maxWorkUnit.get().getPropAsBoolean(END_OF_DATASETS_KEY, false)) {
            previousDatasetUrnWatermark = maxWorkUnit.get().getProp(DATASET_URN);
            previousPartitionUrnWatermark = maxWorkUnit.get().getProp(PARTITION_URN);
        }
        IterableDatasetFinder datasetsFinder = createDatasetsFinder(state);
        Stream<Dataset> datasetStream = datasetsFinder.getDatasetsStream(Spliterator.SORTED, this.lexicographicalComparator);
        datasetStream = sortStreamLexicographically(datasetStream);
        return new BasicWorkUnitStream.Builder(new DeepIterator(datasetStream.iterator(), previousDatasetUrnWatermark, previousPartitionUrnWatermark, maxWorkUnits)).setFiniteStream(true).build();
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
Also used : WorkUnitStream(org.apache.gobblin.source.workunit.WorkUnitStream) Iterator(java.util.Iterator) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) AbstractIterator(com.google.common.collect.AbstractIterator) IOException(java.io.IOException) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) PeekingIterator(com.google.common.collect.PeekingIterator) Iterators(com.google.common.collect.Iterators) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) NoopTask(org.apache.gobblin.runtime.task.NoopTask) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) Stream(java.util.stream.Stream) Lists(com.google.common.collect.Lists) BasicWorkUnitStream(org.apache.gobblin.source.workunit.BasicWorkUnitStream) SourceState(org.apache.gobblin.configuration.SourceState) Optional(java.util.Optional) URNIdentified(org.apache.gobblin.dataset.URNIdentified) StreamSupport(java.util.stream.StreamSupport) Spliterator(java.util.Spliterator) Dataset(org.apache.gobblin.dataset.Dataset) URNLexicographicalComparator(org.apache.gobblin.dataset.comparators.URNLexicographicalComparator) Nullable(javax.annotation.Nullable) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) Dataset(org.apache.gobblin.dataset.Dataset) IOException(java.io.IOException)

Aggregations

SourceState (org.apache.gobblin.configuration.SourceState): 90; Test (org.testng.annotations.Test): 76; WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 44; WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 38; State (org.apache.gobblin.configuration.State): 30; WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState): 11; Partition (org.apache.hadoop.hive.ql.metadata.Partition): 8; Table (org.apache.hadoop.hive.ql.metadata.Table): 8; IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 7; LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 7; Extract (org.apache.gobblin.source.workunit.Extract): 7; DateTime (org.joda.time.DateTime): 7; Dataset (org.apache.gobblin.dataset.Dataset): 6; PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset): 6; MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 6; WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6; IOException (java.io.IOException): 5; Path (org.apache.hadoop.fs.Path): 5; Gson (com.google.gson.Gson): 4; JsonObject (com.google.gson.JsonObject): 4