use of org.apache.beam.sdk.io.UnboundedSource in project beam by apache.
the class StateSpecFunctions method mapSourceFunction.
/**
* A {@link org.apache.spark.streaming.StateSpec} function to support reading from
* an {@link UnboundedSource}.
*
* <p>This StateSpec function expects the following:
* <ul>
* <li>Key: The (partitioned) Source to read from.</li>
* <li>Value: An optional {@link UnboundedSource.CheckpointMark} to start from.</li>
* <li>State: A byte representation of the (previously) persisted CheckpointMark.</li>
* </ul>
* And returns an iterator over all read values (for the micro-batch).
*
* <p>This stateful operation could be described as a flatMap over a single-element stream, which
* outputs all the elements read from the {@link UnboundedSource} for this micro-batch.
* Since micro-batches are bounded, the provided UnboundedSource is wrapped by a
* {@link MicrobatchSource} that applies bounds in the form of duration and max records
* (per micro-batch).
*
* <p>In order to avoid using Spark's Guava classes, which pollute the
* classpath, we use the {@link StateSpec#function(scala.Function3)} signature, which employs
* Scala's native {@link scala.Option}, instead of the
* {@link StateSpec#function(org.apache.spark.api.java.function.Function3)} signature,
* which employs Guava's {@link com.google.common.base.Optional}.
*
* <p>See also <a href="https://issues.apache.org/jira/browse/SPARK-4819">SPARK-4819</a>.
*
* @param runtimeContext A serializable {@link SparkRuntimeContext}.
* @param stepName The name of the step reading from this source, used to scope its metrics.
* @param <T> The type of the input stream elements.
* @param <CheckpointMarkT> The type of the {@link UnboundedSource.CheckpointMark}.
* @return The appropriate {@link org.apache.spark.streaming.StateSpec} function.
*/
public static <T, CheckpointMarkT extends UnboundedSource.CheckpointMark>
    scala.Function3<Source<T>, scala.Option<CheckpointMarkT>, State<Tuple2<byte[], Instant>>,
        Tuple2<Iterable<byte[]>, Metadata>>
    mapSourceFunction(final SparkRuntimeContext runtimeContext, final String stepName) {
return new SerializableFunction3<Source<T>, Option<CheckpointMarkT>,
    State<Tuple2<byte[], Instant>>, Tuple2<Iterable<byte[]>, Metadata>>() {
@Override
public Tuple2<Iterable<byte[]>, Metadata> apply(
    Source<T> source,
    scala.Option<CheckpointMarkT> startCheckpointMark,
    State<Tuple2<byte[], Instant>> state) {
MetricsContainerStepMap metricsContainers = new MetricsContainerStepMap();
MetricsContainer metricsContainer = metricsContainers.getContainer(stepName);
// put a metrics container in scope for the duration of the read,
// since the reader methods may report metrics.
try (Closeable ignored = MetricsEnvironment.scopedMetricsContainer(metricsContainer)) {
// cast the source to the MicrobatchSource that wraps the UnboundedSource for this micro-batch.
MicrobatchSource<T, CheckpointMarkT> microbatchSource =
    (MicrobatchSource<T, CheckpointMarkT>) source;
// Initial high/low watermarks.
Instant lowWatermark = BoundedWindow.TIMESTAMP_MIN_VALUE;
final Instant highWatermark;
// if state exists, use it, otherwise it's first time so use the startCheckpointMark.
// startCheckpointMark may be EmptyCheckpointMark (the Spark Java API tries to apply
// Optional(null)), which is handled by the UnboundedSource implementation.
Coder<CheckpointMarkT> checkpointCoder = microbatchSource.getCheckpointMarkCoder();
CheckpointMarkT checkpointMark;
if (state.exists()) {
// previous (output) watermark is now the low watermark.
lowWatermark = state.get()._2();
checkpointMark = CoderHelpers.fromByteArray(state.get()._1(), checkpointCoder);
LOG.info("Continue reading from an existing CheckpointMark.");
} else if (startCheckpointMark.isDefined()
    && !startCheckpointMark.get().equals(EmptyCheckpointMark.get())) {
checkpointMark = startCheckpointMark.get();
LOG.info("Start reading from a provided CheckpointMark.");
} else {
checkpointMark = null;
LOG.info("No CheckpointMark provided, start reading from default.");
}
// create reader.
final MicrobatchSource.Reader /*<T>*/ microbatchReader;
final Stopwatch stopwatch = Stopwatch.createStarted();
long readDurationMillis = 0;
try {
microbatchReader = (MicrobatchSource.Reader) microbatchSource.getOrCreateReader(
    runtimeContext.getPipelineOptions(), checkpointMark);
} catch (IOException e) {
throw new RuntimeException(e);
}
// read microbatch as a serialized collection.
final List<byte[]> readValues = new ArrayList<>();
WindowedValue.FullWindowedValueCoder<T> coder = WindowedValue.FullWindowedValueCoder.of(
    source.getDefaultOutputCoder(), GlobalWindow.Coder.INSTANCE);
try {
// measure how long a read takes per-partition.
boolean finished = !microbatchReader.start();
while (!finished) {
final WindowedValue<T> wv = WindowedValue.of(
    (T) microbatchReader.getCurrent(),
    microbatchReader.getCurrentTimestamp(),
    GlobalWindow.INSTANCE,
    PaneInfo.NO_FIRING);
readValues.add(CoderHelpers.toByteArray(wv, coder));
finished = !microbatchReader.advance();
}
// end-of-read watermark is the high watermark, but don't allow decrease.
final Instant sourceWatermark = microbatchReader.getWatermark();
highWatermark = sourceWatermark.isAfter(lowWatermark) ? sourceWatermark : lowWatermark;
readDurationMillis = stopwatch.stop().elapsed(TimeUnit.MILLISECONDS);
LOG.info("Source id {} spent {} millis on reading.", microbatchSource.getId(), readDurationMillis);
// if the Source does not supply a CheckpointMark skip updating the state.
@SuppressWarnings("unchecked")
final CheckpointMarkT finishedReadCheckpointMark =
    (CheckpointMarkT) microbatchReader.getCheckpointMark();
byte[] codedCheckpoint = new byte[0];
if (finishedReadCheckpointMark != null) {
codedCheckpoint = CoderHelpers.toByteArray(finishedReadCheckpointMark, checkpointCoder);
} else {
LOG.info("Skipping checkpoint marking because the reader failed to supply one.");
}
// persist the end-of-read (high) watermark for following read, where it will become
// the next low watermark.
state.update(new Tuple2<>(codedCheckpoint, highWatermark));
} catch (IOException e) {
throw new RuntimeException("Failed to read from reader.", e);
}
final ArrayList<byte[]> payload =
    Lists.newArrayList(Iterators.unmodifiableIterator(readValues.iterator()));
return new Tuple2<>(
    (Iterable<byte[]>) payload,
    new Metadata(readValues.size(), lowWatermark, highWatermark, readDurationMillis,
        metricsContainers));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
};
}
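SerializableFunction3 is referenced above but not shown on this page. In the Beam Spark runner it is, roughly, a scala.Function3 that is also java.io.Serializable, so the state function can be shipped inside the StateSpec. A minimal sketch of what such a helper likely looks like (names approximate):
import java.io.Serializable;
import scala.runtime.AbstractFunction3;
/** A serializable scala.Function3; a sketch of what SerializableFunction3 likely is. */
abstract class SerializableFunction3<T1, T2, T3, T4>
    extends AbstractFunction3<T1, T2, T3, T4> implements Serializable {
}
Extending scala.runtime.AbstractFunction3 keeps the Java side small, since only apply needs to be provided.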
use of org.apache.beam.sdk.io.UnboundedSource in project beam by apache.
the class SparkUnboundedSource method read.
public static <T, CheckpointMarkT extends CheckpointMark> UnboundedDataset<T> read(
    JavaStreamingContext jssc,
    SparkRuntimeContext rc,
    UnboundedSource<T, CheckpointMarkT> source,
    String stepName) {
SparkPipelineOptions options = rc.getPipelineOptions().as(SparkPipelineOptions.class);
Long maxRecordsPerBatch = options.getMaxRecordsPerBatch();
SourceDStream<T, CheckpointMarkT> sourceDStream = new SourceDStream<>(jssc.ssc(), source, rc, maxRecordsPerBatch);
JavaPairInputDStream<Source<T>, CheckpointMarkT> inputDStream =
    JavaPairInputDStream$.MODULE$.fromInputDStream(
        sourceDStream,
        JavaSparkContext$.MODULE$.<Source<T>>fakeClassTag(),
        JavaSparkContext$.MODULE$.<CheckpointMarkT>fakeClassTag());
// call mapWithState to read from checkpointable sources.
JavaMapWithStateDStream<Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>,
    Tuple2<Iterable<byte[]>, Metadata>> mapWithStateDStream = inputDStream.mapWithState(
        StateSpec.function(StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(rc, stepName))
            .numPartitions(sourceDStream.getNumPartitions()));
// set checkpoint duration for read stream, if set.
checkpointStream(mapWithStateDStream, options);
// report the number of input elements for this InputDStream to the InputInfoTracker.
int id = inputDStream.inputDStream().id();
JavaDStream<Metadata> metadataDStream = mapWithStateDStream.map(new Tuple2MetadataFunction());
// register ReadReportDStream to report information related to this read.
new ReadReportDStream(metadataDStream.dstream(), id, getSourceName(source, id), stepName).register();
// output the actual (deserialized) stream.
WindowedValue.FullWindowedValueCoder<T> coder = WindowedValue.FullWindowedValueCoder.of(
    source.getDefaultOutputCoder(), GlobalWindow.Coder.INSTANCE);
JavaDStream<WindowedValue<T>> readUnboundedStream = mapWithStateDStream
    .flatMap(new Tuple2byteFlatMapFunction())
    .map(CoderHelpers.fromByteFunction(coder));
return new UnboundedDataset<>(readUnboundedStream, Collections.singletonList(id));
}
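The two helpers used in read() above, Tuple2MetadataFunction and Tuple2byteFlatMapFunction, are not shown on this page. They simply split the per-partition Tuple2<Iterable<byte[]>, Metadata> produced by mapWithState into its two halves. A plausible sketch, assuming the Spark 2.x FlatMapFunction signature (call returns an Iterator) and with Metadata standing for the runner's SparkUnboundedSource.Metadata (its import omitted):
import java.util.Iterator;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;
// Extracts the read Metadata (element count, watermarks, read duration, metrics).
class Tuple2MetadataFunction implements Function<Tuple2<Iterable<byte[]>, Metadata>, Metadata> {
  @Override
  public Metadata call(Tuple2<Iterable<byte[]>, Metadata> t) {
    return t._2();
  }
}
// Flattens the serialized elements read in the micro-batch.
class Tuple2byteFlatMapFunction
    implements FlatMapFunction<Tuple2<Iterable<byte[]>, Metadata>, byte[]> {
  @Override
  public Iterator<byte[]> call(Tuple2<Iterable<byte[]>, Metadata> t) {
    return t._1().iterator();
  }
}
On Spark 1.x the FlatMapFunction contract returns an Iterable rather than an Iterator, so the flatMap helper would return t._1() directly.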
use of org.apache.beam.sdk.io.UnboundedSource in project beam by apache.
the class UnboundedSourceWrapper method open.
/**
* Initialize and restore state before starting execution of the source.
*/
@Override
public void open(Configuration parameters) throws Exception {
FileSystems.setDefaultPipelineOptions(serializedOptions.get());
runtimeContext = (StreamingRuntimeContext) getRuntimeContext();
metricContainer = new FlinkMetricContainer(runtimeContext);
// figure out which split sources we're responsible for
int subtaskIndex = runtimeContext.getIndexOfThisSubtask();
int numSubtasks = runtimeContext.getNumberOfParallelSubtasks();
localSplitSources = new ArrayList<>();
localReaders = new ArrayList<>();
pendingCheckpoints = new LinkedHashMap<>();
if (isRestored) {
// restore the splitSources from the checkpoint to ensure consistent ordering
for (KV<? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT> restored
    : stateForCheckpoint.get()) {
localSplitSources.add(restored.getKey());
localReaders.add(restored.getKey().createReader(serializedOptions.get(), restored.getValue()));
}
} else {
// initialize localSplitSources and localReaders from scratch
for (int i = 0; i < splitSources.size(); i++) {
if (i % numSubtasks == subtaskIndex) {
UnboundedSource<OutputT, CheckpointMarkT> source = splitSources.get(i);
UnboundedSource.UnboundedReader<OutputT> reader = source.createReader(serializedOptions.get(), null);
localSplitSources.add(source);
localReaders.add(reader);
}
}
}
LOG.info(
    "Unbounded Flink Source {}/{} is reading from sources: {}",
    subtaskIndex + 1, numSubtasks, localSplitSources);
}
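splitSources is taken as given in open(); it is prepared elsewhere, typically when the wrapper is constructed. A minimal sketch of how the per-split sources might be produced, assuming the UnboundedSource.split(int, PipelineOptions) API of current Beam versions (older versions used generateInitialSplits), could look like this:
import java.util.List;
import org.apache.beam.sdk.io.UnboundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
// Hypothetical helper: ask the source for one sub-source per desired split; each parallel
// subtask then keeps the splits whose index matches its own, as in open() above.
static <T, C extends UnboundedSource.CheckpointMark> List<? extends UnboundedSource<T, C>>
    splitSource(UnboundedSource<T, C> source, int parallelism, PipelineOptions options)
        throws Exception {
  return source.split(parallelism, options);
}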
use of org.apache.beam.sdk.io.UnboundedSource in project beam by apache.
the class SparkUnboundedSource method read.
public static <T, CheckpointMarkT extends CheckpointMark> UnboundedDataset<T> read(
    JavaStreamingContext jssc,
    SerializablePipelineOptions rc,
    UnboundedSource<T, CheckpointMarkT> source,
    String stepName) {
SparkPipelineOptions options = rc.get().as(SparkPipelineOptions.class);
Long maxRecordsPerBatch = options.getMaxRecordsPerBatch();
SourceDStream<T, CheckpointMarkT> sourceDStream = new SourceDStream<>(jssc.ssc(), source, rc, maxRecordsPerBatch);
JavaPairInputDStream<Source<T>, CheckpointMarkT> inputDStream =
    JavaPairInputDStream$.MODULE$.fromInputDStream(
        sourceDStream,
        JavaSparkContext$.MODULE$.fakeClassTag(),
        JavaSparkContext$.MODULE$.fakeClassTag());
// call mapWithState to read from checkpointable sources.
JavaMapWithStateDStream<Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>,
    Tuple2<Iterable<byte[]>, Metadata>> mapWithStateDStream = inputDStream.mapWithState(
        StateSpec.function(StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(rc, stepName))
            .numPartitions(sourceDStream.getNumPartitions()));
// set checkpoint duration for read stream, if set.
checkpointStream(mapWithStateDStream, options);
// report the number of input elements for this InputDStream to the InputInfoTracker.
int id = inputDStream.inputDStream().id();
JavaDStream<Metadata> metadataDStream = mapWithStateDStream.map(new Tuple2MetadataFunction());
// register ReadReportDStream to report information related to this read.
new ReadReportDStream(metadataDStream.dstream(), id, getSourceName(source, id), stepName).register();
// output the actual (deserialized) stream.
WindowedValue.FullWindowedValueCoder<T> coder = WindowedValue.FullWindowedValueCoder.of(
    source.getOutputCoder(), GlobalWindow.Coder.INSTANCE);
JavaDStream<WindowedValue<T>> readUnboundedStream = mapWithStateDStream
    .flatMap(new Tuple2byteFlatMapFunction())
    .map(CoderHelpers.fromByteFunction(coder));
return new UnboundedDataset<>(readUnboundedStream, Collections.singletonList(id));
}
use of org.apache.beam.sdk.io.UnboundedSource in project component-runtime by Talend.
the class DIPipeline method wrapTransformIfNeeded.
private <PT extends POutput> PTransform<? super PBegin, PT> wrapTransformIfNeeded(
    final PTransform<? super PBegin, PT> root) {
if (Read.Bounded.class.isInstance(root)) {
final BoundedSource source = Read.Bounded.class.cast(root).getSource();
final DelegatingBoundedSource boundedSource = new DelegatingBoundedSource(source, null);
setState(boundedSource);
return Read.from(boundedSource);
}
if (Read.Unbounded.class.isInstance(root)) {
final UnboundedSource source = Read.Unbounded.class.cast(root).getSource();
if (InMemoryQueueIO.UnboundedQueuedInput.class.isInstance(source)) {
return root;
}
final DelegatingUnBoundedSource unBoundedSource = new DelegatingUnBoundedSource(source, null);
setState(unBoundedSource);
return Read.from(unBoundedSource);
}
return root;
}
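DelegatingBoundedSource and DelegatingUnBoundedSource are Talend classes not shown here. A hypothetical sketch of a delegating unbounded source (illustrative only, not Talend's actual implementation) simply forwards every UnboundedSource callback to the wrapped source, giving the wrapper a place to carry extra state around the delegate:
import java.io.IOException;
import java.util.List;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.UnboundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
// Hypothetical forwarding wrapper, not the actual DelegatingUnBoundedSource.
class ForwardingUnboundedSource<T, MarkT extends UnboundedSource.CheckpointMark>
    extends UnboundedSource<T, MarkT> {
  private final UnboundedSource<T, MarkT> delegate;
  ForwardingUnboundedSource(UnboundedSource<T, MarkT> delegate) {
    this.delegate = delegate;
  }
  @Override
  public List<? extends UnboundedSource<T, MarkT>> split(
      int desiredNumSplits, PipelineOptions options) throws Exception {
    // a real wrapper would likely re-wrap each returned split as well.
    return delegate.split(desiredNumSplits, options);
  }
  @Override
  public UnboundedReader<T> createReader(PipelineOptions options, MarkT checkpointMark)
      throws IOException {
    return delegate.createReader(options, checkpointMark);
  }
  @Override
  public Coder<MarkT> getCheckpointMarkCoder() {
    return delegate.getCheckpointMarkCoder();
  }
}
On older Beam versions the Source methods getDefaultOutputCoder() and validate() may also need to be overridden and forwarded to the delegate.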