
Example 1 with ForkOperator

Use of org.apache.gobblin.fork.ForkOperator in project incubator-gobblin by apache.

From the class Task, method runSynchronousModel.

@Deprecated
private void runSynchronousModel() throws Exception {
    // Get the fork operator. By default IdentityForkOperator is used with a single branch.
    ForkOperator forkOperator = closer.register(this.taskContext.getForkOperator());
    forkOperator.init(this.taskState);
    int branches = forkOperator.getBranches(this.taskState);
    // Set fork.branches explicitly here so the rest task flow can pick it up
    this.taskState.setProp(ConfigurationKeys.FORK_BRANCHES_KEY, branches);
    // Extract, convert, and fork the source schema.
    Object schema = converter.convertSchema(extractor.getSchema(), this.taskState);
    List<Boolean> forkedSchemas = forkOperator.forkSchema(this.taskState, schema);
    if (forkedSchemas.size() != branches) {
        throw new ForkBranchMismatchException(String.format("Number of forked schemas [%d] is not equal to number of branches [%d]", forkedSchemas.size(), branches));
    }
    if (inMultipleBranches(forkedSchemas) && !(CopyHelper.isCopyable(schema))) {
        throw new CopyNotSupportedException(schema + " is not copyable");
    }
    RowLevelPolicyCheckResults rowResults = new RowLevelPolicyCheckResults();
    if (!areSingleBranchTasksSynchronous(this.taskContext) || branches > 1) {
        // Create one fork for each forked branch
        for (int i = 0; i < branches; i++) {
            if (forkedSchemas.get(i)) {
                AsynchronousFork fork = closer.register(new AsynchronousFork(this.taskContext, schema instanceof Copyable ? ((Copyable) schema).copy() : schema, branches, i, this.taskMode));
                configureStreamingFork(fork, watermarkingStrategy);
                // Run the Fork
                this.forks.put(Optional.<Fork>of(fork), Optional.<Future<?>>of(this.taskExecutor.submit(fork)));
            } else {
                this.forks.put(Optional.<Fork>absent(), Optional.<Future<?>>absent());
            }
        }
    } else {
        SynchronousFork fork = closer.register(new SynchronousFork(this.taskContext, schema instanceof Copyable ? ((Copyable) schema).copy() : schema, branches, 0, this.taskMode));
        configureStreamingFork(fork, watermarkingStrategy);
        this.forks.put(Optional.<Fork>of(fork), Optional.<Future<?>>of(this.taskExecutor.submit(fork)));
    }
    if (isStreamingTask()) {
        // Start watermark manager and tracker
        if (this.watermarkTracker.isPresent()) {
            this.watermarkTracker.get().start();
        }
        this.watermarkManager.get().start();
        ((StreamingExtractor) this.taskContext.getRawSourceExtractor()).start(this.watermarkStorage.get());
        RecordEnvelope recordEnvelope;
        // Extract, convert, and fork one source record at a time.
        while (!shutdownRequested() && (recordEnvelope = extractor.readRecordEnvelope()) != null) {
            onRecordExtract();
            AcknowledgableWatermark ackableWatermark = new AcknowledgableWatermark(recordEnvelope.getWatermark());
            if (watermarkTracker.isPresent()) {
                watermarkTracker.get().track(ackableWatermark);
            }
            for (Object convertedRecord : converter.convertRecord(schema, recordEnvelope, this.taskState)) {
                processRecord(convertedRecord, forkOperator, rowChecker, rowResults, branches, ackableWatermark.incrementAck());
            }
            ackableWatermark.ack();
        }
    } else {
        RecordEnvelope record;
        // Extract, convert, and fork one source record at a time.
        long errRecords = 0;
        while ((record = extractor.readRecordEnvelope()) != null) {
            onRecordExtract();
            try {
                for (Object convertedRecord : converter.convertRecord(schema, record.getRecord(), this.taskState)) {
                    processRecord(convertedRecord, forkOperator, rowChecker, rowResults, branches, null);
                }
            } catch (Exception e) {
                if (!(e instanceof DataConversionException) && !(e.getCause() instanceof DataConversionException)) {
                    LOG.error("Processing record incurs an unexpected exception: ", e);
                    throw new RuntimeException(e.getCause());
                }
                errRecords++;
                if (errRecords > this.taskState.getPropAsLong(TaskConfigurationKeys.TASK_SKIP_ERROR_RECORDS, TaskConfigurationKeys.DEFAULT_TASK_SKIP_ERROR_RECORDS)) {
                    throw new RuntimeException(e);
                }
            }
        }
    }
    LOG.info("Extracted " + this.recordsPulled + " data records");
    LOG.info("Row quality checker finished with results: " + rowResults.getResults());
    this.taskState.setProp(ConfigurationKeys.EXTRACTOR_ROWS_EXTRACTED, this.recordsPulled);
    this.taskState.setProp(ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED, extractor.getExpectedRecordCount());
    for (Optional<Fork> fork : this.forks.keySet()) {
        if (fork.isPresent()) {
            // Tell the fork that the main branch is completed and no new incoming data records should be expected
            fork.get().markParentTaskDone();
        }
    }
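    // Block on each fork's Future so the task only completes after all running forks have finished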
    for (Optional<Future<?>> forkFuture : this.forks.values()) {
        if (forkFuture.isPresent()) {
            try {
                long forkFutureStartTime = System.nanoTime();
                forkFuture.get().get();
                long forkDuration = System.nanoTime() - forkFutureStartTime;
                LOG.info("Task shutdown: Fork future reaped in {} millis", forkDuration / 1000000);
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
            }
        }
    }
}
Also used : ForkOperator (org.apache.gobblin.fork.ForkOperator), Copyable (org.apache.gobblin.fork.Copyable), CopyNotSupportedException (org.apache.gobblin.fork.CopyNotSupportedException), Fork (org.apache.gobblin.runtime.fork.Fork), AsynchronousFork (org.apache.gobblin.runtime.fork.AsynchronousFork), SynchronousFork (org.apache.gobblin.runtime.fork.SynchronousFork), RecordEnvelope (org.apache.gobblin.stream.RecordEnvelope), StreamingExtractor (org.apache.gobblin.source.extractor.StreamingExtractor), DataConversionException (org.apache.gobblin.converter.DataConversionException), RowLevelPolicyCheckResults (org.apache.gobblin.qualitychecker.row.RowLevelPolicyCheckResults), IOException (java.io.IOException), Future (java.util.concurrent.Future), AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)
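The synchronous model above needs only four things from the operator: initialization, the branch count, and a per-schema and a per-record fork decision, each returned as a List<Boolean> with one entry per branch. Below is a minimal sketch of a custom two-branch operator written against that contract. The interface shape (init, getBranches, forkSchema, forkDataRecord, plus Closeable, since the operator is registered with a Closer) is inferred from the calls above; the class is hypothetical and exact signatures should be verified against the gobblin-api version in use.

import java.util.Arrays;
import java.util.List;

import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.fork.ForkOperator;

// Hypothetical example operator: routes even records to branch 0 and odd records to branch 1.
public class EvenOddForkOperator implements ForkOperator<String, Integer> {

    private static final int NUM_BRANCHES = 2;

    @Override
    public void init(WorkUnitState workUnitState) {
        // No per-task state is needed for this example.
    }

    @Override
    public int getBranches(WorkUnitState workUnitState) {
        return NUM_BRANCHES;
    }

    @Override
    public List<Boolean> forkSchema(WorkUnitState workUnitState, String schema) {
        // Both branches use the same schema.
        return Arrays.asList(Boolean.TRUE, Boolean.TRUE);
    }

    @Override
    public List<Boolean> forkDataRecord(WorkUnitState workUnitState, Integer input) {
        // Exactly one branch is TRUE for each record.
        boolean isEven = input % 2 == 0;
        return Arrays.asList(isEven, !isEven);
    }

    @Override
    public void close() {
        // Nothing to clean up.
    }
}

Because forkSchema marks both branches TRUE, runSynchronousModel would require the schema to be Copyable whenever it actually flows into more than one branch, which is exactly what the CopyHelper.isCopyable check above enforces.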

Example 2 with ForkOperator

Use of org.apache.gobblin.fork.ForkOperator in project incubator-gobblin by apache.

From the class TaskTest, method testForkCorrectnessIdentity.

/**
 * Test that forks work correctly when the operator picks all outgoing forks
 */
@Test(dataProvider = "stateOverrides")
public void testForkCorrectnessIdentity(State overrides) throws Exception {
    // Create a TaskState
    TaskState taskState = getEmptyTestTaskState("testForkTaskId");
    taskState.addAll(overrides);
    int numRecords = 100;
    int numForks = 5;
    // The IdentityForkOperator reads the number of forks from the work unit state.
    taskState.setProp(ConfigurationKeys.FORK_BRANCHES_KEY, "" + numForks);
    ForkOperator mockForkOperator = new IdentityForkOperator();
    ArrayList<ArrayList<Object>> recordCollectors = runTaskAndGetResults(taskState, numRecords, numForks, mockForkOperator);
    // Check that we got the right records in the collectors
    int recordsPerFork = numRecords;
    for (int forkNumber = 0; forkNumber < numForks; ++forkNumber) {
        ArrayList<Object> forkRecords = recordCollectors.get(forkNumber);
        Assert.assertEquals(forkRecords.size(), recordsPerFork);
        for (int j = 0; j < recordsPerFork; ++j) {
            Object forkRecord = forkRecords.get(j);
            Assert.assertEquals((String) forkRecord, "" + j);
        }
    }
}
Also used : ForkOperator (org.apache.gobblin.fork.ForkOperator), IdentityForkOperator (org.apache.gobblin.fork.IdentityForkOperator), ArrayList (java.util.ArrayList), Test (org.testng.annotations.Test)
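The assertions above require every one of the 100 records to appear in each of the 5 collectors, i.e. the identity operator forks every record to every branch. The helper below is a rough sketch of that decision logic, not IdentityForkOperator's actual source; it assumes the branch count is read from fork.branches (ConfigurationKeys.FORK_BRANCHES_KEY), as the comment in the test states.

import java.util.Collections;
import java.util.List;

import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.WorkUnitState;

// Hypothetical helper illustrating the identity fork decision.
public class IdentityForkDecision {

    // Every branch gets the schema and every record: a list of `branches` TRUE values.
    public static List<Boolean> forkToAllBranches(WorkUnitState state) {
        int branches = state.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1);
        return Collections.nCopies(branches, Boolean.TRUE);
    }
}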

Example 3 with ForkOperator

Use of org.apache.gobblin.fork.ForkOperator in project incubator-gobblin by apache.

From the class TaskTest, method testForkCorrectnessSubset.

/**
 * Test that forks work correctly when the operator picks a subset of outgoing forks
 */
@Test(dataProvider = "stateOverrides")
public void testForkCorrectnessSubset(State overrides) throws Exception {
    // Create a TaskState
    TaskState taskState = getEmptyTestTaskState("testForkTaskId");
    taskState.addAll(overrides);
    int numRecords = 20;
    int numForks = 5;
    int subset = 2;
    ForkOperator mockForkOperator = new SubsetForkOperator(numForks, subset);
    ArrayList<ArrayList<Object>> recordCollectors = runTaskAndGetResults(taskState, numRecords, numForks, mockForkOperator);
    log.info("Records collected: {}", recordCollectors);
    // Check that we got the right records in the collectors
    int totalRecordsExpected = numRecords * subset;
    int totalRecordsFound = 0;
    HashMap<String, ArrayList<Integer>> recordsMap = new HashMap<>();
    for (int forkNumber = 0; forkNumber < numForks; ++forkNumber) {
        ArrayList<Object> forkRecords = recordCollectors.get(forkNumber);
        for (Object forkRecord : forkRecords) {
            String recordAsString = (String) forkRecord;
            totalRecordsFound++;
            if (recordsMap.containsKey(recordAsString)) {
                recordsMap.get(recordAsString).add(forkNumber);
            } else {
                ArrayList<Integer> forksFound = new ArrayList<>();
                forksFound.add(forkNumber);
                recordsMap.put(recordAsString, forksFound);
            }
        }
    }
    Assert.assertEquals(totalRecordsFound, totalRecordsExpected, "Total records");
    for (Map.Entry<String, ArrayList<Integer>> recordForks : recordsMap.entrySet()) {
        Assert.assertEquals(recordForks.getValue().size(), subset);
    }
}
Also used : ForkOperator (org.apache.gobblin.fork.ForkOperator), IdentityForkOperator (org.apache.gobblin.fork.IdentityForkOperator), HashMap (java.util.HashMap), Map (java.util.Map), ArrayList (java.util.ArrayList), Test (org.testng.annotations.Test)
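SubsetForkOperator is a test helper constructed with (numForks, subset); the assertions only require that each record land in exactly `subset` of the `numForks` branches, regardless of which ones. A hypothetical per-record decision with that property could simply pick the branches at random, as sketched below (an illustration of the contract the test verifies, not the helper's implementation).

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Hypothetical helper: marks exactly `subset` of `numForks` branches as TRUE, chosen at random.
public class RandomSubsetForkDecision {

    public static List<Boolean> pickRandomSubset(int numForks, int subset) {
        List<Integer> indices = new ArrayList<>();
        for (int i = 0; i < numForks; i++) {
            indices.add(i);
        }
        Collections.shuffle(indices);

        // Start with all branches FALSE, then enable the first `subset` shuffled indices.
        List<Boolean> decision = new ArrayList<>(Collections.nCopies(numForks, Boolean.FALSE));
        for (int i = 0; i < subset; i++) {
            decision.set(indices.get(i), Boolean.TRUE);
        }
        return decision;
    }
}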

Example 4 with ForkOperator

Use of org.apache.gobblin.fork.ForkOperator in project incubator-gobblin by apache.

From the class TaskTest, method testForkCorrectnessRoundRobin.

/**
 * Test that forks work correctly when the operator picks one outgoing fork
 */
@Test(dataProvider = "stateOverrides")
public void testForkCorrectnessRoundRobin(State overrides) throws Exception {
    // Create a TaskState
    TaskState taskState = getEmptyTestTaskState("testForkTaskId");
    taskState.addAll(overrides);
    int numRecords = 9;
    int numForks = 3;
    ForkOperator mockForkOperator = new RoundRobinForkOperator(numForks);
    // The checks below assume numRecords is an exact multiple of numForks
    Assert.assertTrue(numRecords % numForks == 0);
    ArrayList<ArrayList<Object>> recordCollectors = runTaskAndGetResults(taskState, numRecords, numForks, mockForkOperator);
    // Check that we got the right records in the collectors
    int recordsPerFork = numRecords / numForks;
    for (int forkNumber = 0; forkNumber < numForks; ++forkNumber) {
        ArrayList<Object> forkRecords = recordCollectors.get(forkNumber);
        Assert.assertEquals(forkRecords.size(), recordsPerFork);
        for (int j = 0; j < recordsPerFork; ++j) {
            Object forkRecord = forkRecords.get(j);
            // Round-robin order: branch k receives records k, k + numForks, k + 2 * numForks, ...
            Assert.assertEquals((String) forkRecord, "" + (j * numForks + forkNumber));
        }
    }
}
Also used : ForkOperator (org.apache.gobblin.fork.ForkOperator), IdentityForkOperator (org.apache.gobblin.fork.IdentityForkOperator), ArrayList (java.util.ArrayList), Test (org.testng.annotations.Test)
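RoundRobinForkOperator, another test helper, sends each record to exactly one branch and advances the branch index by one per record, which is why branch k ends up with records k, k + numForks, k + 2 * numForks, and so on. The only state such an operator needs is a counter, as in this sketch (an illustration of the behavior the assertions rely on, not the helper's source).

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Hypothetical helper illustrating a round-robin fork decision.
public class RoundRobinForkDecision {

    private final int numForks;
    private int nextBranch = 0;

    public RoundRobinForkDecision(int numForks) {
        this.numForks = numForks;
    }

    // Record i is routed to branch i % numForks.
    public List<Boolean> forkNextRecord() {
        List<Boolean> decision = new ArrayList<>(Collections.nCopies(numForks, Boolean.FALSE));
        decision.set(nextBranch, Boolean.TRUE);
        nextBranch = (nextBranch + 1) % numForks;
        return decision;
    }
}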

Example 5 with ForkOperator

Use of org.apache.gobblin.fork.ForkOperator in project incubator-gobblin by apache.

From the class StreamModelTaskRunner, method run.

protected void run() throws Exception {
    // Get the fork operator. By default IdentityForkOperator is used with a single branch.
    ForkOperator forkOperator = closer.register(this.taskContext.getForkOperator());
    RecordStreamWithMetadata<?, ?> stream = this.extractor.recordStream(this.shutdownRequested);
    ConnectableFlowable connectableStream = stream.getRecordStream().publish();
    stream = stream.withRecordStream(connectableStream);
    stream = stream.mapRecords(r -> {
        this.task.onRecordExtract();
        return r;
    });
    if (this.task.isStreamingTask()) {
        // Start watermark manager and tracker
        if (this.watermarkTracker.isPresent()) {
            this.watermarkTracker.get().start();
        }
        this.watermarkManager.get().start();
        ((StreamingExtractor) this.taskContext.getRawSourceExtractor()).start(this.watermarkStorage.get());
        stream = stream.mapRecords(r -> {
            AcknowledgableWatermark ackableWatermark = new AcknowledgableWatermark(r.getWatermark());
            if (watermarkTracker.isPresent()) {
                watermarkTracker.get().track(ackableWatermark);
            }
            r.addCallBack(ackableWatermark);
            return r;
        });
    }
    // Use the recordStreamProcessors list if it is configured. It can contain any type of RecordStreamProcessor.
    if (!this.recordStreamProcessors.isEmpty()) {
        for (RecordStreamProcessor streamProcessor : this.recordStreamProcessors) {
            stream = streamProcessor.processStream(stream, this.taskState);
        }
    } else {
        if (this.converter instanceof MultiConverter) {
            // If this is a MultiConverter, unpack it and apply each inner converter in order
            for (Converter converter : ((MultiConverter) this.converter).getConverters()) {
                stream = converter.processStream(stream, this.taskState);
            }
        } else {
            stream = this.converter.processStream(stream, this.taskState);
        }
    }
    stream = this.rowChecker.processStream(stream, this.taskState);
    Forker.ForkedStream<?, ?> forkedStreams = new Forker().forkStream(stream, forkOperator, this.taskState);
    boolean isForkAsync = !this.task.areSingleBranchTasksSynchronous(this.taskContext) || forkedStreams.getForkedStreams().size() > 1;
    int bufferSize = this.taskState.getPropAsInt(ConfigurationKeys.FORK_RECORD_QUEUE_CAPACITY_KEY, ConfigurationKeys.DEFAULT_FORK_RECORD_QUEUE_CAPACITY);
    for (int fidx = 0; fidx < forkedStreams.getForkedStreams().size(); fidx++) {
        RecordStreamWithMetadata<?, ?> forkedStream = forkedStreams.getForkedStreams().get(fidx);
        if (forkedStream != null) {
            if (isForkAsync) {
                forkedStream = forkedStream.mapStream(f -> f.observeOn(Schedulers.from(this.taskExecutor.getForkExecutor()), false, bufferSize));
            }
            Fork fork = new Fork(this.taskContext, forkedStream.getGlobalMetadata().getSchema(), forkedStreams.getForkedStreams().size(), fidx, this.taskMode);
            fork.consumeRecordStream(forkedStream);
            this.forks.put(Optional.of(fork), Optional.of(Futures.immediateFuture(null)));
            this.task.configureStreamingFork(fork, this.watermarkingStrategy);
        }
    }
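    // Start the shared upstream only after every fork has subscribed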
    connectableStream.connect();
    if (!ExponentialBackoff.awaitCondition().callable(() -> this.forks.keySet().stream().map(Optional::get).allMatch(Fork::isDone)).initialDelay(1000L).maxDelay(1000L).maxWait(TimeUnit.MINUTES.toMillis(60)).await()) {
        throw new TimeoutException("Forks did not finish within the specified timeout.");
    }
}
Also used : ForkOperator (org.apache.gobblin.fork.ForkOperator), Forker (org.apache.gobblin.fork.Forker), Fork (org.apache.gobblin.runtime.fork.Fork), Converter (org.apache.gobblin.converter.Converter), Extractor (org.apache.gobblin.source.extractor.Extractor), StreamingExtractor (org.apache.gobblin.source.extractor.StreamingExtractor), RecordStreamProcessor (org.apache.gobblin.records.RecordStreamProcessor), RecordStreamWithMetadata (org.apache.gobblin.records.RecordStreamWithMetadata), RowLevelPolicyChecker (org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker), WatermarkManager (org.apache.gobblin.writer.WatermarkManager), WatermarkStorage (org.apache.gobblin.writer.WatermarkStorage), FineGrainedWatermarkTracker (org.apache.gobblin.writer.FineGrainedWatermarkTracker), AcknowledgableWatermark (org.apache.gobblin.writer.AcknowledgableWatermark), ConfigurationKeys (org.apache.gobblin.configuration.ConfigurationKeys), ExponentialBackoff (org.apache.gobblin.util.ExponentialBackoff), ConnectableFlowable (io.reactivex.flowables.ConnectableFlowable), Schedulers (io.reactivex.schedulers.Schedulers), Optional (com.google.common.base.Optional), Closer (com.google.common.io.Closer), Futures (com.google.common.util.concurrent.Futures), Map (java.util.Map), List (java.util.List), Future (java.util.concurrent.Future), TimeUnit (java.util.concurrent.TimeUnit), TimeoutException (java.util.concurrent.TimeoutException), AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean), AllArgsConstructor (lombok.AllArgsConstructor)
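The streaming runner wires every fork onto a shared ConnectableFlowable and only calls connect() once all forks have subscribed, so no record is emitted before every branch is attached. The standalone snippet below illustrates that publish/connect pattern with plain RxJava 2 (assumed from the io.reactivex imports above); it is not Gobblin code, and the class name is made up for the example.

import io.reactivex.Flowable;
import io.reactivex.flowables.ConnectableFlowable;

public class PublishConnectExample {

    public static void main(String[] args) {
        // Share one upstream among several subscribers without re-running the source.
        ConnectableFlowable<Integer> shared = Flowable.range(0, 5).publish();

        // Two "forks" subscribe first; nothing flows yet.
        shared.filter(i -> i % 2 == 0).subscribe(i -> System.out.println("even fork: " + i));
        shared.filter(i -> i % 2 != 0).subscribe(i -> System.out.println("odd fork: " + i));

        // Only now does the source start emitting, and both forks see the full stream.
        shared.connect();
    }
}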

Aggregations

ForkOperator (org.apache.gobblin.fork.ForkOperator): 5 examples
ArrayList (java.util.ArrayList): 3 examples
IdentityForkOperator (org.apache.gobblin.fork.IdentityForkOperator): 3 examples
Test (org.testng.annotations.Test): 3 examples
Map (java.util.Map): 2 examples
Future (java.util.concurrent.Future): 2 examples
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 2 examples
Fork (org.apache.gobblin.runtime.fork.Fork): 2 examples
StreamingExtractor (org.apache.gobblin.source.extractor.StreamingExtractor): 2 examples
Optional (com.google.common.base.Optional): 1 example
Closer (com.google.common.io.Closer): 1 example
Futures (com.google.common.util.concurrent.Futures): 1 example
ConnectableFlowable (io.reactivex.flowables.ConnectableFlowable): 1 example
Schedulers (io.reactivex.schedulers.Schedulers): 1 example
IOException (java.io.IOException): 1 example
HashMap (java.util.HashMap): 1 example
List (java.util.List): 1 example
TimeUnit (java.util.concurrent.TimeUnit): 1 example
TimeoutException (java.util.concurrent.TimeoutException): 1 example
AllArgsConstructor (lombok.AllArgsConstructor): 1 example