use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class QueryBasedSource method getWorkunits.
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  initLogger(state);
  lineageInfo = LineageInfo.getLineageInfo(state.getBroker());
  List<WorkUnit> workUnits = Lists.newArrayList();
  // Map<String, String> tableNameToEntityMap = Maps.newHashMap();
  Set<SourceEntity> entities = getFilteredSourceEntities(state);
  Map<SourceEntity, State> tableSpecificPropsMap = shouldObtainTablePropsFromConfigStore(state)
      ? getTableSpecificPropsFromConfigStore(entities, state)
      : getTableSpecificPropsFromState(entities, state);
  Map<SourceEntity, Long> prevWatermarksByTable = getPreviousWatermarksForAllTables(state);
  for (SourceEntity sourceEntity : Sets.union(entities, prevWatermarksByTable.keySet())) {
    log.info("Source entity to be processed: {}, carry-over from previous state: {} ", sourceEntity,
        !entities.contains(sourceEntity));
    SourceState combinedState = getCombinedState(state, tableSpecificPropsMap.get(sourceEntity));
    long previousWatermark = prevWatermarksByTable.containsKey(sourceEntity)
        ? prevWatermarksByTable.get(sourceEntity)
        : ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
    // For an entity that is only carried over from the previous state, preserve its previous watermark.
    // This is done by overriding the high watermark to be the same as the previous watermark.
    if (!entities.contains(sourceEntity)) {
      combinedState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_END_VALUE, previousWatermark);
    }
    workUnits.addAll(generateWorkUnits(sourceEntity, combinedState, previousWatermark));
  }
  log.info("Total number of workunits for the current run: " + workUnits.size());
  List<WorkUnit> previousWorkUnits = this.getPreviousWorkUnitsForRetry(state);
  log.info("Total number of incomplete tasks from the previous run: " + previousWorkUnits.size());
  workUnits.addAll(previousWorkUnits);
  int numOfMultiWorkunits =
      state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);
  return pack(workUnits, numOfMultiWorkunits);
}
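A minimal driver for getWorkunits(), shown as a sketch below, only needs a populated SourceState. MyJdbcSource is a hypothetical concrete QueryBasedSource subclass; its connection, entity and watermark properties are assumed to be configured elsewhere, and only ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY is taken from the method above.
// Hypothetical driver (sketch, not project code). MyJdbcSource is an assumed
// concrete QueryBasedSource subclass; connection and watermark settings are
// assumed to be present in the job configuration already.
SourceState state = new SourceState();
state.setProp(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, 4); // bound used by pack()
QueryBasedSource<?, ?> source = new MyJdbcSource();
List<WorkUnit> workUnits = source.getWorkunits(state); // packed work units for this run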
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class TestStressTestingSource method testSourceExtractor.
@Test
public void testSourceExtractor() throws DataRecordException, IOException {
  final int MEM_ALLOC_BYTES = 100;
  final int NUM_WORK_UNITS = 10;
  final int COMPUTE_TIME_MICRO = 10;
  final int NUM_RECORDS = 10000;
  SourceState state = new SourceState();
  state.setProp(StressTestingSource.NUM_WORK_UNITS_KEY, NUM_WORK_UNITS);
  state.setProp(StressTestingSource.MEM_ALLOC_BYTES_KEY, MEM_ALLOC_BYTES);
  state.setProp(StressTestingSource.COMPUTE_TIME_MICRO_KEY, COMPUTE_TIME_MICRO);
  state.setProp(StressTestingSource.NUM_RECORDS_KEY, NUM_RECORDS);
  StressTestingSource source = new StressTestingSource();
  List<WorkUnit> wus = source.getWorkunits(state);
  Assert.assertEquals(wus.size(), NUM_WORK_UNITS);
  for (int i = 0; i < wus.size(); ++i) {
    WorkUnit wu = wus.get(i);
    WorkUnitState wuState = new WorkUnitState(wu, state);
    Extractor<String, byte[]> extractor = source.getExtractor(wuState);
    Assert.assertEquals(extractor.getExpectedRecordCount(), NUM_RECORDS);
    // each record payload is MEM_ALLOC_BYTES (100) bytes long
    Assert.assertEquals(extractor.readRecord(null).length, 100);
  }
}
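The test reads a single record per work unit; a follow-up check, not part of the project's test, could drain one extractor and confirm it yields exactly NUM_RECORDS records before returning null (the compute-time test below relies on the same end-of-stream behaviour).
// Hypothetical addition: drain a fresh extractor for the first work unit and count its records.
Extractor<String, byte[]> drained = source.getExtractor(new WorkUnitState(wus.get(0), state));
int count = 0;
while (drained.readRecord(null) != null) {
  count++;
}
Assert.assertEquals(count, NUM_RECORDS);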
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class TestStressTestingSource method testComputeTime.
@Test(enabled = false)
public void testComputeTime() throws DataRecordException, IOException {
  final int MEM_ALLOC_BYTES = 100;
  final int NUM_WORK_UNITS = 1;
  final int COMPUTE_TIME_MICRO = 10000;
  final int NUM_RECORDS = 500;
  SourceState state = new SourceState();
  state.setProp(StressTestingSource.NUM_WORK_UNITS_KEY, NUM_WORK_UNITS);
  state.setProp(StressTestingSource.MEM_ALLOC_BYTES_KEY, MEM_ALLOC_BYTES);
  state.setProp(StressTestingSource.COMPUTE_TIME_MICRO_KEY, COMPUTE_TIME_MICRO);
  state.setProp(StressTestingSource.NUM_RECORDS_KEY, NUM_RECORDS);
  StressTestingSource source = new StressTestingSource();
  List<WorkUnit> wus = source.getWorkunits(state);
  Assert.assertEquals(wus.size(), NUM_WORK_UNITS);
  WorkUnit wu = wus.get(0);
  WorkUnitState wuState = new WorkUnitState(wu, state);
  Extractor<String, byte[]> extractor = source.getExtractor(wuState);
  byte[] record;
  long startTimeNano = System.nanoTime();
  while ((record = extractor.readRecord(null)) != null) {
    Assert.assertEquals(record.length, 100);
  }
  long endTimeNano = System.nanoTime();
  long timeSpentMicro = (endTimeNano - startTimeNano) / (1000);
  // check that there is less than 5 second difference between expected and actual time spent
  Assert.assertTrue(Math.abs(timeSpentMicro - (COMPUTE_TIME_MICRO * NUM_RECORDS)) < (5000000),
      "Time spent " + timeSpentMicro);
}
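With these settings the expected compute time is COMPUTE_TIME_MICRO × NUM_RECORDS = 10,000 µs × 500 records = 5,000,000 µs (5 s), so the assertion accepts a measured time anywhere between 0 and 10 s; the timing sensitivity is presumably why the test is marked enabled = false.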
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class TestHelloWorldSource method testSourceExtractor.
@Test
public void testSourceExtractor() throws DataRecordException, IOException {
  SourceState state = new SourceState();
  state.setProp(HelloWorldSource.NUM_HELLOS_FULL_KEY, 10);
  HelloWorldSource source = new HelloWorldSource();
  List<WorkUnit> wus = source.getWorkunits(state);
  Assert.assertEquals(wus.size(), 10);
  for (int i = 0; i < wus.size(); ++i) {
    WorkUnit wu = wus.get(i);
    Assert.assertEquals(wu.getPropAsInt(HelloWorldSource.HELLO_ID_FULL_KEY), i + 1);
    WorkUnitState wuState = new WorkUnitState(wu, state);
    Extractor<String, String> extr = source.getExtractor(wuState);
    Assert.assertEquals(extr.getExpectedRecordCount(), 1);
    Assert.assertEquals(extr.readRecord(null), "Hello world " + (i + 1) + " !");
  }
}
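A small variation, sketched below rather than taken from the project, collects every greeting before asserting on the sequence; it uses the same imports as the test above plus java.util.ArrayList.
// Hypothetical variation: gather all greetings first, then assert on the sequence.
List<String> greetings = new ArrayList<>();
for (WorkUnit wu : wus) {
  Extractor<String, String> extr = source.getExtractor(new WorkUnitState(wu, state));
  greetings.add(extr.readRecord(null));
}
Assert.assertEquals(greetings.size(), 10);
Assert.assertEquals(greetings.get(0), "Hello world 1 !");
Assert.assertEquals(greetings.get(9), "Hello world 10 !");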
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class LoopingDatasetFinderSource method getWorkunitStream.
@Override
public WorkUnitStream getWorkunitStream(SourceState state) {
  try {
    int maxWorkUnits = state.getPropAsInt(MAX_WORK_UNITS_PER_RUN_KEY, MAX_WORK_UNITS_PER_RUN);
    List<WorkUnitState> previousWorkUnitStates = state.getPreviousWorkUnitStates();
    Optional<WorkUnitState> maxWorkUnit;
    try {
      maxWorkUnit = previousWorkUnitStates.stream().reduce((wu1, wu2) -> {
        int wu1Ordinal = wu1.getPropAsInt(WORK_UNIT_ORDINAL);
        int wu2Ordinal = wu2.getPropAsInt(WORK_UNIT_ORDINAL);
        return wu1Ordinal > wu2Ordinal ? wu1 : wu2;
      });
    } catch (NumberFormatException nfe) {
      throw new RuntimeException("Work units in state store are corrupted! Missing or malformed " + WORK_UNIT_ORDINAL);
    }
    String previousDatasetUrnWatermark = null;
    String previousPartitionUrnWatermark = null;
    if (maxWorkUnit.isPresent() && !maxWorkUnit.get().getPropAsBoolean(END_OF_DATASETS_KEY, false)) {
      previousDatasetUrnWatermark = maxWorkUnit.get().getProp(DATASET_URN);
      previousPartitionUrnWatermark = maxWorkUnit.get().getProp(PARTITION_URN);
    }
    IterableDatasetFinder datasetsFinder = createDatasetsFinder(state);
    Stream<Dataset> datasetStream = datasetsFinder.getDatasetsStream(Spliterator.SORTED, this.lexicographicalComparator);
    datasetStream = sortStreamLexicographically(datasetStream);
    return new BasicWorkUnitStream.Builder(
        new DeepIterator(datasetStream.iterator(), previousDatasetUrnWatermark, previousPartitionUrnWatermark, maxWorkUnits))
        .setFiniteStream(true)
        .build();
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
}
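The reduce above simply selects the previous work unit with the largest WORK_UNIT_ORDINAL. An equivalent formulation, shown as a sketch rather than the project's code, uses Stream.max with java.util.Comparator; a missing or malformed ordinal still surfaces as a NumberFormatException.
// Equivalent selection of the highest-ordinal previous work unit (sketch).
Optional<WorkUnitState> maxWorkUnit = previousWorkUnitStates.stream()
    .max(Comparator.comparingInt(wu -> wu.getPropAsInt(WORK_UNIT_ORDINAL)));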