Search in sources :

Example 1 with RecordStreamWithMetadata

use of org.apache.gobblin.records.RecordStreamWithMetadata in project incubator-gobblin by apache.

the class ForkerTest method test.

/**
 * Forks one input stream into three branches and checks, entity by entity, which branches
 * receive what was pushed into the source.
 *
 * The byte arrays pushed below appear to act as per-branch flags for MyForkOperator (a non-zero
 * value at index i routes the record to branch i) -- TODO confirm against MyForkOperator.
 */
@Test
public void test() throws Exception {
    Forker streamForker = new Forker();
    MyFlowable<StreamEntity<byte[]>> source = new MyFlowable<>();
    RecordStreamWithMetadata<byte[], String> inputStream =
        new RecordStreamWithMetadata<>(source, GlobalMetadata.<String>builder().schema("schema").build());

    // Ask for three fork branches.
    WorkUnitState state = new WorkUnitState();
    state.setProp(ConfigurationKeys.FORK_BRANCHES_KEY, "3");

    Forker.ForkedStream<byte[], String> forked =
        streamForker.forkStream(inputStream, new MyForkOperator(), state);
    Assert.assertEquals(forked.getForkedStreams().size(), 3);

    // Collect whatever each branch emits into its own queue.
    Queue<StreamEntity<byte[]>> branch0 = new LinkedList<>();
    forked.getForkedStreams().get(0).getRecordStream().subscribe(branch0::add);
    Queue<StreamEntity<byte[]>> branch1 = new LinkedList<>();
    forked.getForkedStreams().get(1).getRecordStream().subscribe(branch1::add);
    Queue<StreamEntity<byte[]>> branch2 = new LinkedList<>();
    forked.getForkedStreams().get(2).getRecordStream().subscribe(branch2::add);

    // {1, 1, 1}: expected on every branch.
    source._subscriber.onNext(new RecordEnvelope<>(new byte[] { 1, 1, 1 }));
    Assert.assertTrue(branch0.poll() instanceof RecordEnvelope);
    Assert.assertTrue(branch1.poll() instanceof RecordEnvelope);
    Assert.assertTrue(branch2.poll() instanceof RecordEnvelope);

    // {1, 0, 0}: expected on branch 0 only.
    source._subscriber.onNext(new RecordEnvelope<>(new byte[] { 1, 0, 0 }));
    Assert.assertTrue(branch0.poll() instanceof RecordEnvelope);
    Assert.assertNull(branch1.poll());
    Assert.assertNull(branch2.poll());

    // {0, 1, 1}: expected on branches 1 and 2 only.
    source._subscriber.onNext(new RecordEnvelope<>(new byte[] { 0, 1, 1 }));
    Assert.assertNull(branch0.poll());
    Assert.assertTrue(branch1.poll() instanceof RecordEnvelope);
    Assert.assertTrue(branch2.poll() instanceof RecordEnvelope);

    // A control message is expected on every branch.
    source._subscriber.onNext(new BasicTestControlMessage<byte[]>("control"));
    Assert.assertTrue(branch0.poll() instanceof BasicTestControlMessage);
    Assert.assertTrue(branch1.poll() instanceof BasicTestControlMessage);
    Assert.assertTrue(branch2.poll() instanceof BasicTestControlMessage);

    source._subscriber.onComplete();
}
Also used : RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) RecordStreamWithMetadata(org.apache.gobblin.records.RecordStreamWithMetadata) StreamEntity(org.apache.gobblin.stream.StreamEntity) LinkedList(java.util.LinkedList) BasicTestControlMessage(org.apache.gobblin.runtime.BasicTestControlMessage) Test(org.testng.annotations.Test)

Example 2 with RecordStreamWithMetadata

use of org.apache.gobblin.records.RecordStreamWithMetadata in project incubator-gobblin by apache.

the class AsyncConverter1to1Test method testFailedConversion.

/**
 * Sends three records through the converter, the middle one being
 * {@code MyAsyncConverter1to1.FAIL}, and expects the subscriber to receive exactly one error
 * whose cause has the message "injected failure".
 */
@Test
public void testFailedConversion() throws Exception {
    MyAsyncConverter1to1 asyncConverter = new MyAsyncConverter1to1();
    List<Throwable> failures = Lists.newArrayList();
    AtomicBoolean completed = new AtomicBoolean(false);

    WorkUnitState state = new WorkUnitState();
    state.setProp(AsyncConverter1to1.MAX_CONCURRENT_ASYNC_CONVERSIONS_KEY, 3);

    RecordStreamWithMetadata<String, String> input = new RecordStreamWithMetadata<>(
        Flowable.just("0", MyAsyncConverter1to1.FAIL, "1").map(RecordEnvelope::new),
        GlobalMetadata.<String>builder().schema("schema").build());
    Set<String> emitted = Sets.newConcurrentHashSet();

    // Subscribe on a separate thread; records, errors and completion are captured in the locals above.
    asyncConverter.processStream(input, state)
        .getRecordStream()
        .subscribeOn(Schedulers.newThread())
        .subscribe(
            entity -> emitted.add(((RecordEnvelope<String>) entity).getRecord()),
            failures::add,
            () -> completed.set(true));

    // Wait (up to the configured maxWait) until an error has been reported.
    Assert.assertTrue(ExponentialBackoff.awaitCondition()
        .maxWait(100L)
        .callable(() -> !failures.isEmpty())
        .await());
    Assert.assertEquals(failures.size(), 1);
    Assert.assertEquals(failures.get(0).getCause().getMessage(), "injected failure");
}
Also used : AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) RecordStreamWithMetadata(org.apache.gobblin.records.RecordStreamWithMetadata) Test(org.testng.annotations.Test)

Example 3 with RecordStreamWithMetadata

use of org.apache.gobblin.records.RecordStreamWithMetadata in project incubator-gobblin by apache.

the class AsyncConverter1to1Test method test1to1.

/**
 * Pushes records "0".."4" through the converter with at most three concurrent conversions and
 * releases their futures one at a time, out of order, checking after each release which records
 * have reached the output.
 */
@Test
public void test1to1() throws Exception {
    MyAsyncConverter1to1 asyncConverter = new MyAsyncConverter1to1();
    List<Throwable> failures = Lists.newArrayList();
    AtomicBoolean completed = new AtomicBoolean(false);

    WorkUnitState state = new WorkUnitState();
    state.setProp(AsyncConverter1to1.MAX_CONCURRENT_ASYNC_CONVERSIONS_KEY, 3);

    RecordStreamWithMetadata<String, String> input = new RecordStreamWithMetadata<>(
        Flowable.range(0, 5).map(n -> n.toString()).map(RecordEnvelope::new),
        GlobalMetadata.<String>builder().schema("schema").build());
    Set<String> emitted = Sets.newConcurrentHashSet();

    asyncConverter.processStream(input, state)
        .getRecordStream()
        .subscribeOn(Schedulers.newThread())
        .subscribe(
            entity -> emitted.add(((RecordEnvelope<String>) entity).getRecord()),
            failures::add,
            () -> completed.set(true));

    // Complete the future for record 0 and wait for it to come out.
    Assert.assertTrue(ExponentialBackoff.awaitCondition().maxWait(100L)
        .callable(() -> asyncConverter.completeFutureIfPresent("0")).await());
    Assert.assertTrue(ExponentialBackoff.awaitCondition().maxWait(100L)
        .callable(() -> emitted.contains("0")).await());
    Assert.assertEquals(emitted.size(), 1);

    // With a concurrency limit of 3, record 4 must not have been queued yet.
    Assert.assertFalse(ExponentialBackoff.awaitCondition().maxWait(200L)
        .callable(() -> asyncConverter.completeFutureIfPresent("4")).await());

    // Complete record 3 ahead of records 1 and 2.
    Assert.assertTrue(ExponentialBackoff.awaitCondition().maxWait(100L)
        .callable(() -> asyncConverter.completeFutureIfPresent("3")).await());
    Assert.assertTrue(ExponentialBackoff.awaitCondition().maxWait(100L)
        .callable(() -> emitted.contains("3")).await());
    // Exactly two records have been released so far.
    Assert.assertEquals(emitted.size(), 2);

    // Record 4 is in the queue by now and can be completed.
    Assert.assertTrue(ExponentialBackoff.awaitCondition().maxWait(100L)
        .callable(() -> asyncConverter.completeFutureIfPresent("4")).await());
    Assert.assertTrue(ExponentialBackoff.awaitCondition().maxWait(100L)
        .callable(() -> emitted.contains("4")).await());
    Assert.assertEquals(emitted.size(), 3);

    // Finally complete records 1 and 2 and wait for all five outputs.
    Assert.assertTrue(ExponentialBackoff.awaitCondition().maxWait(100L)
        .callable(() -> asyncConverter.completeFutureIfPresent("1")).await());
    Assert.assertTrue(ExponentialBackoff.awaitCondition().maxWait(100L)
        .callable(() -> asyncConverter.completeFutureIfPresent("2")).await());
    Assert.assertTrue(ExponentialBackoff.awaitCondition().maxWait(100L)
        .callable(() -> emitted.size() == 5).await());

    Assert.assertEquals(emitted, Sets.newHashSet("0", "1", "2", "3", "4"));
    Assert.assertTrue(failures.isEmpty());
    Assert.assertTrue(completed.get());
}
Also used : AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) RecordStreamWithMetadata(org.apache.gobblin.records.RecordStreamWithMetadata) Test(org.testng.annotations.Test)

Example 4 with RecordStreamWithMetadata

use of org.apache.gobblin.records.RecordStreamWithMetadata in project incubator-gobblin by apache.

the class StreamModelTaskRunner method run.

/**
 * Builds the record-stream pipeline for this task and runs it to completion.
 *
 * The extractor's record stream is published as a {@link ConnectableFlowable}, passed through the
 * configured stream processors (or converters) and the row checker, then forked. Each non-null
 * forked stream is handed to a {@link Fork}. The source is connected only after every fork has
 * been set up, and the method then waits for all forks to report done.
 *
 * @throws TimeoutException if the forks have not all finished within the 60 minute wait
 * @throws Exception if building or running the pipeline fails
 */
protected void run() throws Exception {
    // Get the fork operator. By default IdentityForkOperator is used with a single branch.
    ForkOperator forkOperator = closer.register(this.taskContext.getForkOperator());
    RecordStreamWithMetadata<?, ?> stream = this.extractor.recordStream(this.shutdownRequested);
    // Publish the source so that nothing flows until connect() is called at the end of the setup.
    ConnectableFlowable connectableStream = stream.getRecordStream().publish();
    stream = stream.withRecordStream(connectableStream);
    stream = stream.mapRecords(r -> {
        this.task.onRecordExtract();
        return r;
    });
    if (this.task.isStreamingTask()) {
        // Start watermark manager and tracker
        if (this.watermarkTracker.isPresent()) {
            this.watermarkTracker.get().start();
        }
        // NOTE(review): unlike watermarkTracker, watermarkManager and watermarkStorage are
        // dereferenced without an isPresent() check -- presumably always present for streaming
        // tasks; confirm against the code that initializes them.
        this.watermarkManager.get().start();
        ((StreamingExtractor) this.taskContext.getRawSourceExtractor()).start(this.watermarkStorage.get());
        // Attach an acknowledgable watermark to every record as a callback.
        stream = stream.mapRecords(r -> {
            AcknowledgableWatermark ackableWatermark = new AcknowledgableWatermark(r.getWatermark());
            if (watermarkTracker.isPresent()) {
                watermarkTracker.get().track(ackableWatermark);
            }
            r.addCallBack(ackableWatermark);
            return r;
        });
    }
    // Use the recordStreamProcessor list if it is configured. This list can contain any kind of RecordStreamProcessor.
    if (!this.recordStreamProcessors.isEmpty()) {
        for (RecordStreamProcessor streamProcessor : this.recordStreamProcessors) {
            stream = streamProcessor.processStream(stream, this.taskState);
        }
    } else {
        if (this.converter instanceof MultiConverter) {
            // if multiconverter, unpack it
            for (Converter cverter : ((MultiConverter) this.converter).getConverters()) {
                stream = cverter.processStream(stream, this.taskState);
            }
        } else {
            stream = this.converter.processStream(stream, this.taskState);
        }
    }
    stream = this.rowChecker.processStream(stream, this.taskState);
    Forker.ForkedStream<?, ?> forkedStreams = new Forker().forkStream(stream, forkOperator, this.taskState);
    // The number of branches does not change below, so read it once.
    int branchCount = forkedStreams.getForkedStreams().size();
    boolean isForkAsync = !this.task.areSingleBranchTasksSynchronous(this.taskContext) || branchCount > 1;
    int bufferSize = this.taskState.getPropAsInt(ConfigurationKeys.FORK_RECORD_QUEUE_CAPACITY_KEY, ConfigurationKeys.DEFAULT_FORK_RECORD_QUEUE_CAPACITY);
    for (int fidx = 0; fidx < branchCount; fidx++) {
        RecordStreamWithMetadata<?, ?> forkedStream = forkedStreams.getForkedStreams().get(fidx);
        if (forkedStream != null) {
            if (isForkAsync) {
                // Hand records over to the fork executor, buffering up to bufferSize entities.
                forkedStream = forkedStream.mapStream(f -> f.observeOn(Schedulers.from(this.taskExecutor.getForkExecutor()), false, bufferSize));
            }
            Fork fork = new Fork(this.taskContext, forkedStream.getGlobalMetadata().getSchema(), branchCount, fidx, this.taskMode);
            fork.consumeRecordStream(forkedStream);
            this.forks.put(Optional.of(fork), Optional.of(Futures.immediateFuture(null)));
            this.task.configureStreamingFork(fork, this.watermarkingStrategy);
        }
    }
    // Every fork is set up; start the flow of records from the source.
    connectableStream.connect();
    if (!ExponentialBackoff.awaitCondition().callable(() -> this.forks.keySet().stream().map(Optional::get).allMatch(Fork::isDone)).initialDelay(1000L).maxDelay(1000L).maxWait(TimeUnit.MINUTES.toMillis(60)).await()) {
        throw new TimeoutException("Forks did not finish within specified timeout.");
    }
}
Also used : StreamingExtractor(org.apache.gobblin.source.extractor.StreamingExtractor) WatermarkManager(org.apache.gobblin.writer.WatermarkManager) ForkOperator(org.apache.gobblin.fork.ForkOperator) ConnectableFlowable(io.reactivex.flowables.ConnectableFlowable) TimeoutException(java.util.concurrent.TimeoutException) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AcknowledgableWatermark(org.apache.gobblin.writer.AcknowledgableWatermark) Fork(org.apache.gobblin.runtime.fork.Fork) Future(java.util.concurrent.Future) Closer(com.google.common.io.Closer) Optional(com.google.common.base.Optional) Map(java.util.Map) Schedulers(io.reactivex.schedulers.Schedulers) RecordStreamProcessor(org.apache.gobblin.records.RecordStreamProcessor) Forker(org.apache.gobblin.fork.Forker) WatermarkStorage(org.apache.gobblin.writer.WatermarkStorage) ExponentialBackoff(org.apache.gobblin.util.ExponentialBackoff) Converter(org.apache.gobblin.converter.Converter) ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) TimeUnit(java.util.concurrent.TimeUnit) Extractor(org.apache.gobblin.source.extractor.Extractor) List(java.util.List) Futures(com.google.common.util.concurrent.Futures) FineGrainedWatermarkTracker(org.apache.gobblin.writer.FineGrainedWatermarkTracker) RowLevelPolicyChecker(org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker) RecordStreamWithMetadata(org.apache.gobblin.records.RecordStreamWithMetadata) AllArgsConstructor(lombok.AllArgsConstructor) Fork(org.apache.gobblin.runtime.fork.Fork) StreamingExtractor(org.apache.gobblin.source.extractor.StreamingExtractor) ConnectableFlowable(io.reactivex.flowables.ConnectableFlowable) Forker(org.apache.gobblin.fork.Forker) RecordStreamProcessor(org.apache.gobblin.records.RecordStreamProcessor) AcknowledgableWatermark(org.apache.gobblin.writer.AcknowledgableWatermark) ForkOperator(org.apache.gobblin.fork.ForkOperator) Converter(org.apache.gobblin.converter.Converter) 
TimeoutException(java.util.concurrent.TimeoutException)

Example 5 with RecordStreamWithMetadata

use of org.apache.gobblin.records.RecordStreamWithMetadata in project incubator-gobblin by apache.

the class Fork method consumeRecordStream.

/**
 * Wires the given stream through this fork's converter(s) and row-level policy checker, attaches
 * the fork's lifecycle hooks, and subscribes so that each entity is passed to the writer.
 *
 * @param stream the record stream this fork should consume
 * @throws RecordStreamProcessor.StreamProcessingException declared by the signature; presumably
 *         raised by a processStream call -- TODO confirm
 */
@SuppressWarnings(value = "RV_RETURN_VALUE_IGNORED", justification = "We actually don't care about the return value of subscribe.")
public void consumeRecordStream(RecordStreamWithMetadata<D, S> stream) throws RecordStreamProcessor.StreamProcessingException {
    if (!(this.converter instanceof MultiConverter)) {
        stream = this.converter.processStream(stream, this.taskState);
    } else {
        // A MultiConverter is unpacked so that each inner converter processes the stream in turn.
        for (Converter inner : ((MultiConverter) this.converter).getConverters()) {
            stream = inner.processStream(stream, this.taskState);
        }
    }
    stream = this.rowLevelPolicyChecker.processStream(stream, this.taskState);

    // Per-record hook.
    stream = stream.mapStream(flow -> flow.map(entity -> {
        onEachRecord();
        return entity;
    }));
    // Lifecycle hooks: start, normal completion, cancellation, failure, and final cleanup.
    stream = stream.mapStream(flow -> flow.doOnSubscribe(sub -> onStart()));
    stream = stream.mapStream(flow -> flow.doOnComplete(
        () -> verifyAndSetForkState(ForkState.RUNNING, ForkState.SUCCEEDED)));
    stream = stream.mapStream(flow -> flow.doOnCancel(
        () -> verifyAndSetForkState(ForkState.RUNNING, ForkState.SUCCEEDED)));
    stream = stream.mapStream(flow -> flow.doOnError(failure -> {
        verifyAndSetForkState(ForkState.RUNNING, ForkState.FAILED);
        this.logger.error(String.format("Fork %d of task %s failed to process data records", this.index, this.taskId), failure);
    }));
    stream = stream.mapStream(flow -> flow.doFinally(this::cleanup));

    stream.getRecordStream().subscribe(
        entity -> {
            if (entity instanceof RecordEnvelope) {
                // Records go straight to the writer.
                this.writer.get().writeEnvelope((RecordEnvelope) entity);
            } else if (entity instanceof ControlMessage) {
                // Control messages go to the writer's message handler and are acked here.
                this.writer.get().getMessageHandler().handleMessage((ControlMessage) entity);
                entity.ack();
            }
        },
        failure -> logger.error("Failed to process record.", failure),
        () -> {
            // On completion, close the writer if one exists.
            if (this.writer.isPresent()) {
                this.writer.get().close();
            }
        });
}
Also used : ForkOperatorUtils(org.apache.gobblin.util.ForkOperatorUtils) Tag(org.apache.gobblin.metrics.Tag) SpeculativeAttemptAwareConstruct(org.apache.gobblin.commit.SpeculativeAttemptAwareConstruct) GobblinMetrics(org.apache.gobblin.metrics.GobblinMetrics) ExecutionModel(org.apache.gobblin.runtime.ExecutionModel) LoggerFactory(org.slf4j.LoggerFactory) ControlMessage(org.apache.gobblin.stream.ControlMessage) BoundedBlockingRecordQueue(org.apache.gobblin.runtime.BoundedBlockingRecordQueue) TaskPublisher(org.apache.gobblin.publisher.TaskPublisher) PartitionedDataWriter(org.apache.gobblin.writer.PartitionedDataWriter) AtomicReference(java.util.concurrent.atomic.AtomicReference) Task(org.apache.gobblin.runtime.Task) TaskState(org.apache.gobblin.runtime.TaskState) ImmutableList(com.google.common.collect.ImmutableList) Closer(com.google.common.io.Closer) DataWriterBuilder(org.apache.gobblin.writer.DataWriterBuilder) Optional(com.google.common.base.Optional) SuppressWarnings(edu.umd.cs.findbugs.annotations.SuppressWarnings) RecordStreamProcessor(org.apache.gobblin.records.RecordStreamProcessor) TaskLevelPolicyCheckResults(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults) Logger(org.slf4j.Logger) TaskContext(org.apache.gobblin.runtime.TaskContext) Converter(org.apache.gobblin.converter.Converter) Instrumented(org.apache.gobblin.instrumented.Instrumented) State(org.apache.gobblin.configuration.State) RowLevelPolicyCheckResults(org.apache.gobblin.qualitychecker.row.RowLevelPolicyCheckResults) TaskExecutor(org.apache.gobblin.runtime.TaskExecutor) Throwables(com.google.common.base.Throwables) IOException(java.io.IOException) FinalState(org.apache.gobblin.util.FinalState) ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) DataWriter(org.apache.gobblin.writer.DataWriter) List(java.util.List) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) WatermarkAwareWriter(org.apache.gobblin.writer.WatermarkAwareWriter) 
DataWriterWrapperBuilder(org.apache.gobblin.writer.DataWriterWrapperBuilder) Destination(org.apache.gobblin.writer.Destination) Closeable(java.io.Closeable) RowLevelPolicyChecker(org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker) RecordStreamWithMetadata(org.apache.gobblin.records.RecordStreamWithMetadata) Preconditions(com.google.common.base.Preconditions) RecordStreamConsumer(org.apache.gobblin.records.RecordStreamConsumer) DataConversionException(org.apache.gobblin.converter.DataConversionException) TaskMetrics(org.apache.gobblin.runtime.util.TaskMetrics) Constructs(org.apache.gobblin.Constructs) MultiConverter(org.apache.gobblin.runtime.MultiConverter) ConstructState(org.apache.gobblin.state.ConstructState) MultiConverter(org.apache.gobblin.runtime.MultiConverter) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) Converter(org.apache.gobblin.converter.Converter) MultiConverter(org.apache.gobblin.runtime.MultiConverter) ControlMessage(org.apache.gobblin.stream.ControlMessage) SuppressWarnings(edu.umd.cs.findbugs.annotations.SuppressWarnings)

Aggregations

RecordStreamWithMetadata (org.apache.gobblin.records.RecordStreamWithMetadata)5 RecordEnvelope (org.apache.gobblin.stream.RecordEnvelope)4 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)3 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)3 Test (org.testng.annotations.Test)3 Optional (com.google.common.base.Optional)2 Closer (com.google.common.io.Closer)2 List (java.util.List)2 ConfigurationKeys (org.apache.gobblin.configuration.ConfigurationKeys)2 Converter (org.apache.gobblin.converter.Converter)2 RowLevelPolicyChecker (org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker)2 RecordStreamProcessor (org.apache.gobblin.records.RecordStreamProcessor)2 Preconditions (com.google.common.base.Preconditions)1 Throwables (com.google.common.base.Throwables)1 ImmutableList (com.google.common.collect.ImmutableList)1 Futures (com.google.common.util.concurrent.Futures)1 SuppressWarnings (edu.umd.cs.findbugs.annotations.SuppressWarnings)1 ConnectableFlowable (io.reactivex.flowables.ConnectableFlowable)1 Schedulers (io.reactivex.schedulers.Schedulers)1 Closeable (java.io.Closeable)1