use of org.apache.gobblin.fork.ForkOperator in project incubator-gobblin by apache.
the class Task method runSynchronousModel.
/**
 * Runs the task record-at-a-time: sets up the forks, pulls records from the extractor, converts each
 * record, hands it to {@code processRecord}, and finally waits for the submitted forks to finish.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Initialize the fork operator and publish its branch count under
 *       {@code ConfigurationKeys.FORK_BRANCHES_KEY}.</li>
 *   <li>Convert the extractor schema and fork it; the number of forked schemas must equal the number
 *       of branches.</li>
 *   <li>Create and submit one {@code AsynchronousFork} per selected branch, or a single
 *       {@code SynchronousFork} when {@code areSingleBranchTasksSynchronous} is true and there is only
 *       one branch.</li>
 *   <li>Read records until the extractor returns {@code null} (streaming tasks also stop when shutdown
 *       is requested).</li>
 *   <li>Mark the parent task as done on every fork and block on each fork future.</li>
 * </ol>
 *
 * <p>For non-streaming tasks, a failure that is (or is directly caused by) a
 * {@code DataConversionException} is counted and skipped until the count exceeds
 * {@code TaskConfigurationKeys.TASK_SKIP_ERROR_RECORDS}; any other failure aborts the task.
 *
 * @throws ForkBranchMismatchException if the fork operator returns a forked-schema list whose size
 *         differs from the number of branches
 * @throws CopyNotSupportedException if {@code inMultipleBranches} is true for the forked schemas and the
 *         schema is not copyable
 * @throws RuntimeException if record processing fails with an unexpected exception, or the skipped-record
 *         limit is exceeded
 * @throws Exception for any other failure raised by the extractor, converter, forks or fork futures
 */
@Deprecated
private void runSynchronousModel() throws Exception {
  // Get the fork operator. By default IdentityForkOperator is used with a single branch.
  ForkOperator forkOperator = closer.register(this.taskContext.getForkOperator());
  forkOperator.init(this.taskState);
  int branches = forkOperator.getBranches(this.taskState);
  // Set fork.branches explicitly here so the rest of the task flow can pick it up
  this.taskState.setProp(ConfigurationKeys.FORK_BRANCHES_KEY, branches);

  // Extract, convert, and fork the source schema.
  Object schema = converter.convertSchema(extractor.getSchema(), this.taskState);
  List<Boolean> forkedSchemas = forkOperator.forkSchema(this.taskState, schema);
  if (forkedSchemas.size() != branches) {
    throw new ForkBranchMismatchException(String.format("Number of forked schemas [%d] is not equal to number of branches [%d]", forkedSchemas.size(), branches));
  }
  if (inMultipleBranches(forkedSchemas) && !(CopyHelper.isCopyable(schema))) {
    throw new CopyNotSupportedException(schema + " is not copyable");
  }

  RowLevelPolicyCheckResults rowResults = new RowLevelPolicyCheckResults();
  if (!areSingleBranchTasksSynchronous(this.taskContext) || branches > 1) {
    // Create one fork for each forked branch
    for (int i = 0; i < branches; i++) {
      if (forkedSchemas.get(i)) {
        // Each fork gets its own copy of the schema when the schema supports copying.
        AsynchronousFork fork = closer.register(new AsynchronousFork(this.taskContext, schema instanceof Copyable ? ((Copyable) schema).copy() : schema, branches, i, this.taskMode));
        configureStreamingFork(fork, watermarkingStrategy);
        // Run the Fork
        this.forks.put(Optional.<Fork>of(fork), Optional.<Future<?>>of(this.taskExecutor.submit(fork)));
      } else {
        // Branch not selected by the fork operator: record an absent fork/future pair.
        this.forks.put(Optional.<Fork>absent(), Optional.<Future<?>>absent());
      }
    }
  } else {
    SynchronousFork fork = closer.register(new SynchronousFork(this.taskContext, schema instanceof Copyable ? ((Copyable) schema).copy() : schema, branches, 0, this.taskMode));
    configureStreamingFork(fork, watermarkingStrategy);
    this.forks.put(Optional.<Fork>of(fork), Optional.<Future<?>>of(this.taskExecutor.submit(fork)));
  }

  if (isStreamingTask()) {
    // Start watermark manager and tracker
    if (this.watermarkTracker.isPresent()) {
      this.watermarkTracker.get().start();
    }
    this.watermarkManager.get().start();
    ((StreamingExtractor) this.taskContext.getRawSourceExtractor()).start(this.watermarkStorage.get());

    RecordEnvelope recordEnvelope;
    // Extract, convert, and fork one source record at a time.
    while (!shutdownRequested() && (recordEnvelope = extractor.readRecordEnvelope()) != null) {
      onRecordExtract();
      AcknowledgableWatermark ackableWatermark = new AcknowledgableWatermark(recordEnvelope.getWatermark());
      if (watermarkTracker.isPresent()) {
        watermarkTracker.get().track(ackableWatermark);
      }
      for (Object convertedRecord : converter.convertRecord(schema, recordEnvelope, this.taskState)) {
        processRecord(convertedRecord, forkOperator, rowChecker, rowResults, branches, ackableWatermark.incrementAck());
      }
      ackableWatermark.ack();
    }
  } else {
    RecordEnvelope record;
    // Extract, convert, and fork one source record at a time.
    long errRecords = 0;
    while ((record = extractor.readRecordEnvelope()) != null) {
      onRecordExtract();
      try {
        for (Object convertedRecord : converter.convertRecord(schema, record.getRecord(), this.taskState)) {
          processRecord(convertedRecord, forkOperator, rowChecker, rowResults, branches, null);
        }
      } catch (Exception e) {
        if (!(e instanceof DataConversionException) && !(e.getCause() instanceof DataConversionException)) {
          LOG.error("Processing record incurs an unexpected exception: ", e);
          // Wrap the cause when there is one (unchanged behavior). When there is none, wrap the
          // exception itself so the failure is not reduced to a RuntimeException with no cause.
          Throwable failure = e.getCause() != null ? e.getCause() : e;
          throw new RuntimeException(failure);
        }
        // Conversion failures are tolerated up to the configured number of error records.
        errRecords++;
        if (errRecords > this.taskState.getPropAsLong(TaskConfigurationKeys.TASK_SKIP_ERROR_RECORDS, TaskConfigurationKeys.DEFAULT_TASK_SKIP_ERROR_RECORDS)) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  LOG.info("Extracted " + this.recordsPulled + " data records");
  LOG.info("Row quality checker finished with results: " + rowResults.getResults());
  this.taskState.setProp(ConfigurationKeys.EXTRACTOR_ROWS_EXTRACTED, this.recordsPulled);
  this.taskState.setProp(ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED, extractor.getExpectedRecordCount());

  for (Optional<Fork> fork : this.forks.keySet()) {
    if (fork.isPresent()) {
      // Tell the fork that the main branch is completed and no new incoming data records should be expected
      fork.get().markParentTaskDone();
    }
  }

  for (Optional<Future<?>> forkFuture : this.forks.values()) {
    if (forkFuture.isPresent()) {
      try {
        long forkFutureStartTime = System.nanoTime();
        forkFuture.get().get();
        long forkDuration = System.nanoTime() - forkFutureStartTime;
        LOG.info("Task shutdown: Fork future reaped in {} millis", forkDuration / 1000000);
      } catch (InterruptedException ie) {
        // Restore the interrupt flag; the loop then moves on to the remaining futures.
        Thread.currentThread().interrupt();
      }
    }
  }
}
use of org.apache.gobblin.fork.ForkOperator in project incubator-gobblin by apache.
the class TaskTest method testForkCorrectnessIdentity.
/**
 * Verifies fork behaviour when the operator selects every outgoing fork: each of the forks is
 * expected to collect all {@code numRecords} records, with the record at position {@code j} equal to
 * the string form of {@code j}.
 */
@Test(dataProvider = "stateOverrides")
public void testForkCorrectnessIdentity(State overrides) throws Exception {
  final int numRecords = 100;
  final int numForks = 5;

  // Build the task state and layer the data-provider overrides on top of it.
  TaskState taskState = getEmptyTestTaskState("testForkTaskId");
  taskState.addAll(overrides);
  // Identity Fork Operator looks for number of forks in work unit state.
  taskState.setProp(ConfigurationKeys.FORK_BRANCHES_KEY, Integer.toString(numForks));

  ForkOperator mockForkOperator = new IdentityForkOperator();
  ArrayList<ArrayList<Object>> recordCollectors =
      runTaskAndGetResults(taskState, numRecords, numForks, mockForkOperator);

  // Every fork must hold the complete record sequence.
  for (int forkIdx = 0; forkIdx < numForks; forkIdx++) {
    ArrayList<Object> collected = recordCollectors.get(forkIdx);
    Assert.assertEquals(collected.size(), numRecords);
    for (int pos = 0; pos < numRecords; pos++) {
      String expected = String.valueOf(pos);
      Assert.assertEquals((String) collected.get(pos), expected);
    }
  }
}
use of org.apache.gobblin.fork.ForkOperator in project incubator-gobblin by apache.
the class TaskTest method testForkCorrectnessSubset.
/**
 * Verifies fork behaviour when the operator selects a subset of the outgoing forks: the total number
 * of collected records must be {@code numRecords * subset}, and every distinct record must have been
 * collected exactly {@code subset} times across the forks.
 */
@Test(dataProvider = "stateOverrides")
public void testForkCorrectnessSubset(State overrides) throws Exception {
  final int numRecords = 20;
  final int numForks = 5;
  final int subset = 2;

  // Build the task state and layer the data-provider overrides on top of it.
  TaskState taskState = getEmptyTestTaskState("testForkTaskId");
  taskState.addAll(overrides);

  ForkOperator mockForkOperator = new SubsetForkOperator(numForks, subset);
  ArrayList<ArrayList<Object>> recordCollectors =
      runTaskAndGetResults(taskState, numRecords, numForks, mockForkOperator);
  log.info("Records collected: {}", recordCollectors);

  // Index every collected record by its value, remembering which forks saw it.
  HashMap<String, ArrayList<Integer>> forksByRecord = new HashMap<>();
  int totalRecordsFound = 0;
  for (int forkIdx = 0; forkIdx < numForks; forkIdx++) {
    for (Object collected : recordCollectors.get(forkIdx)) {
      String recordValue = (String) collected;
      totalRecordsFound++;
      forksByRecord.computeIfAbsent(recordValue, unused -> new ArrayList<>()).add(forkIdx);
    }
  }

  int totalRecordsExpected = numRecords * subset;
  Assert.assertEquals(totalRecordsFound, totalRecordsExpected, "Total records");
  for (ArrayList<Integer> forksSeen : forksByRecord.values()) {
    Assert.assertEquals(forksSeen.size(), subset);
  }
}
use of org.apache.gobblin.fork.ForkOperator in project incubator-gobblin by apache.
the class TaskTest method testForkCorrectnessRoundRobin.
/**
 * Test that forks work correctly when the operator picks one outgoing fork per record.
 *
 * <p>Each fork is expected to collect {@code numRecords / numForks} records, so {@code numRecords}
 * must be an exact multiple of {@code numForks}.
 */
@Test(dataProvider = "stateOverrides")
public void testForkCorrectnessRoundRobin(State overrides) throws Exception {
  // Create a TaskState
  TaskState taskState = getEmptyTestTaskState("testForkTaskId");
  taskState.addAll(overrides);
  int numRecords = 9;
  int numForks = 3;
  ForkOperator mockForkOperator = new RoundRobinForkOperator(numForks);
  // The following code depends on exact multiples
  Assert.assertTrue(numRecords % numForks == 0);
  ArrayList<ArrayList<Object>> recordCollectors = runTaskAndGetResults(taskState, numRecords, numForks, mockForkOperator);
  // Check that we got the right records in the collectors
  int recordsPerFork = numRecords / numForks;
  for (int forkNumber = 0; forkNumber < numForks; ++forkNumber) {
    ArrayList<Object> forkRecords = recordCollectors.get(forkNumber);
    Assert.assertEquals(forkRecords.size(), recordsPerFork);
    for (int j = 0; j < recordsPerFork; ++j) {
      Object forkRecord = forkRecords.get(j);
      // NOTE(review): assumes RoundRobinForkOperator sends record i to fork (i % numForks) - confirm
      // against the operator. Under that dispatch the j-th record of a fork is j * numForks + forkNumber:
      // the stride is the number of forks, not the records per fork. The two only coincide while
      // numRecords == numForks * numForks, as with the current constants.
      Assert.assertEquals((String) forkRecord, "" + (j * numForks + forkNumber));
    }
  }
}
use of org.apache.gobblin.fork.ForkOperator in project incubator-gobblin by apache.
the class StreamModelTaskRunner method run.
/**
 * Runs the task in the stream model: builds a record stream from the extractor, attaches the
 * watermark, conversion and row-check stages, forks the stream, starts the flow and waits until every
 * fork reports that it is done.
 *
 * <p>The extractor stream is published as a connectable stream so that all stages and forks can be
 * subscribed before any record is emitted; records only start flowing at {@code connect()}.
 *
 * @throws TimeoutException if the forks have not all finished within the 60 minute wait limit
 * @throws Exception for any failure raised while wiring or running the stream
 */
protected void run() throws Exception {
  // Get the fork operator. By default IdentityForkOperator is used with a single branch.
  ForkOperator forkOperator = closer.register(this.taskContext.getForkOperator());

  RecordStreamWithMetadata<?, ?> stream = this.extractor.recordStream(this.shutdownRequested);
  ConnectableFlowable connectableStream = stream.getRecordStream().publish();
  stream = stream.withRecordStream(connectableStream);

  // Notify the task for every record that comes out of the extractor.
  stream = stream.mapRecords(r -> {
    this.task.onRecordExtract();
    return r;
  });

  if (this.task.isStreamingTask()) {
    // Start watermark manager and tracker
    if (this.watermarkTracker.isPresent()) {
      this.watermarkTracker.get().start();
    }
    this.watermarkManager.get().start();
    ((StreamingExtractor) this.taskContext.getRawSourceExtractor()).start(this.watermarkStorage.get());

    // Attach an acknowledgable watermark to each record as a callback, tracking it when a tracker exists.
    stream = stream.mapRecords(r -> {
      AcknowledgableWatermark ackableWatermark = new AcknowledgableWatermark(r.getWatermark());
      if (watermarkTracker.isPresent()) {
        watermarkTracker.get().track(ackableWatermark);
      }
      r.addCallBack(ackableWatermark);
      return r;
    });
  }

  // Use the recordStreamProcessors list when it is configured; otherwise fall back to the converter.
  if (!this.recordStreamProcessors.isEmpty()) {
    for (RecordStreamProcessor streamProcessor : this.recordStreamProcessors) {
      stream = streamProcessor.processStream(stream, this.taskState);
    }
  } else {
    if (this.converter instanceof MultiConverter) {
      // if multiconverter, unpack it
      for (Converter cverter : ((MultiConverter) this.converter).getConverters()) {
        stream = cverter.processStream(stream, this.taskState);
      }
    } else {
      stream = this.converter.processStream(stream, this.taskState);
    }
  }
  stream = this.rowChecker.processStream(stream, this.taskState);

  Forker.ForkedStream<?, ?> forkedStreams = new Forker().forkStream(stream, forkOperator, this.taskState);

  // Forks run asynchronously unless single-branch tasks are synchronous and there is only one forked stream.
  boolean isForkAsync = !this.task.areSingleBranchTasksSynchronous(this.taskContext) || forkedStreams.getForkedStreams().size() > 1;
  int bufferSize = this.taskState.getPropAsInt(ConfigurationKeys.FORK_RECORD_QUEUE_CAPACITY_KEY, ConfigurationKeys.DEFAULT_FORK_RECORD_QUEUE_CAPACITY);

  for (int fidx = 0; fidx < forkedStreams.getForkedStreams().size(); fidx++) {
    RecordStreamWithMetadata<?, ?> forkedStream = forkedStreams.getForkedStreams().get(fidx);
    // A null entry gets no Fork and no entry in the forks map.
    if (forkedStream != null) {
      if (isForkAsync) {
        // Deliver this fork's records on the fork executor, buffering up to bufferSize records.
        forkedStream = forkedStream.mapStream(f -> f.observeOn(Schedulers.from(this.taskExecutor.getForkExecutor()), false, bufferSize));
      }
      Fork fork = new Fork(this.taskContext, forkedStream.getGlobalMetadata().getSchema(), forkedStreams.getForkedStreams().size(), fidx, this.taskMode);
      fork.consumeRecordStream(forkedStream);
      // The fork is driven by the stream rather than a submitted task, so an already-completed future is recorded.
      this.forks.put(Optional.of(fork), Optional.of(Futures.immediateFuture(null)));
      this.task.configureStreamingFork(fork, this.watermarkingStrategy);
    }
  }

  // Everything is subscribed; start the flow of records.
  connectableStream.connect();

  // Poll until all forks report done, giving up after 60 minutes.
  if (!ExponentialBackoff.awaitCondition().callable(() -> this.forks.keySet().stream().map(Optional::get).allMatch(Fork::isDone)).initialDelay(1000L).maxDelay(1000L).maxWait(TimeUnit.MINUTES.toMillis(60)).await()) {
    throw new TimeoutException("Forks did not finish within specified timeout.");
  }
}
Aggregations