Use of org.apache.beam.sdk.io.UnboundedSource in project beam by apache.
The class StateSpecFunctions, method mapSourceFunction.
/**
* A {@link org.apache.spark.streaming.StateSpec} function to support reading from an {@link
* UnboundedSource}.
*
* <p>This StateSpec function expects the following:
*
* <ul>
* <li>Key: The (partitioned) Source to read from.
* <li>Value: An optional {@link UnboundedSource.CheckpointMark} to start from.
* <li>State: A byte representation of the (previously) persisted CheckpointMark.
* </ul>
*
* And returns an iterator over all read values (for the micro-batch).
*
* <p>This stateful operation could be described as a flatMap over a single-element stream, which
* outputs all the elements read from the {@link UnboundedSource} for this micro-batch. Since
* micro-batches are bounded, the provided UnboundedSource is wrapped by a {@link
* MicrobatchSource} that applies bounds in the form of duration and max records (per
* micro-batch).
*
* <p>To avoid using Spark's Guava classes, which pollute the classpath, we use the {@link
* StateSpec#function(scala.Function3)} signature, which employs Scala's native {@link
* scala.Option}, instead of the {@link
* StateSpec#function(org.apache.spark.api.java.function.Function3)} signature, which employs
* Guava's {@link Optional}.
*
* <p>See also <a href="https://issues.apache.org/jira/browse/SPARK-4819">SPARK-4819</a>.
*
* @param options A serializable {@link SerializablePipelineOptions}.
* @param <T> The type of the input stream elements.
* @param <CheckpointMarkT> The type of the {@link UnboundedSource.CheckpointMark}.
* @return The appropriate {@link org.apache.spark.streaming.StateSpec} function.
*/
public static <T, CheckpointMarkT extends UnboundedSource.CheckpointMark>
    scala.Function3<
            Source<T>,
            Option<CheckpointMarkT>,
            State<Tuple2<byte[], Instant>>,
            Tuple2<Iterable<byte[]>, Metadata>>
        mapSourceFunction(final SerializablePipelineOptions options, final String stepName) {
return new SerializableFunction3<
    Source<T>,
    Option<CheckpointMarkT>,
    State<Tuple2<byte[], Instant>>,
    Tuple2<Iterable<byte[]>, Metadata>>() {
@Override
public Tuple2<Iterable<byte[]>, Metadata> apply(
    Source<T> source,
    Option<CheckpointMarkT> startCheckpointMark,
    State<Tuple2<byte[], Instant>> state) {
MetricsContainerStepMap metricsContainers = new MetricsContainerStepMap();
MetricsContainer metricsContainer = metricsContainers.getContainer(stepName);
// Add metrics container to the scope of org.apache.beam.sdk.io.Source.Reader methods,
// since they may report metrics.
try (Closeable ignored = MetricsEnvironment.scopedMetricsContainer(metricsContainer)) {
// source as MicrobatchSource
MicrobatchSource<T, CheckpointMarkT> microbatchSource =
    (MicrobatchSource<T, CheckpointMarkT>) source;
// Initial high/low watermarks.
Instant lowWatermark = BoundedWindow.TIMESTAMP_MIN_VALUE;
final Instant highWatermark;
// if state exists, use it, otherwise it's first time so use the startCheckpointMark.
// startCheckpointMark may be EmptyCheckpointMark (the Spark Java API tries to apply
// Optional(null)), which is handled by the UnboundedSource implementation.
Coder<CheckpointMarkT> checkpointCoder = microbatchSource.getCheckpointMarkCoder();
CheckpointMarkT checkpointMark;
if (state.exists()) {
// previous (output) watermark is now the low watermark.
lowWatermark = state.get()._2();
checkpointMark = CoderHelpers.fromByteArray(state.get()._1(), checkpointCoder);
LOG.info("Continue reading from an existing CheckpointMark.");
} else if (startCheckpointMark.isDefined()
    && !startCheckpointMark.get().equals(EmptyCheckpointMark.get())) {
checkpointMark = startCheckpointMark.get();
LOG.info("Start reading from a provided CheckpointMark.");
} else {
checkpointMark = null;
LOG.info("No CheckpointMark provided, start reading from default.");
}
// create reader.
final MicrobatchSource.Reader /*<T>*/ microbatchReader;
final Stopwatch stopwatch = Stopwatch.createStarted();
long readDurationMillis = 0;
try {
microbatchReader =
    (MicrobatchSource.Reader)
        microbatchSource.getOrCreateReader(options.get(), checkpointMark);
} catch (IOException e) {
throw new RuntimeException(e);
}
// read microbatch as a serialized collection.
final List<byte[]> readValues = new ArrayList<>();
WindowedValue.FullWindowedValueCoder<T> coder =
    WindowedValue.FullWindowedValueCoder.of(
        source.getOutputCoder(), GlobalWindow.Coder.INSTANCE);
try {
// measure how long a read takes per-partition.
boolean finished = !microbatchReader.start();
while (!finished) {
final WindowedValue<T> wv =
    WindowedValue.of(
        (T) microbatchReader.getCurrent(),
        microbatchReader.getCurrentTimestamp(),
        GlobalWindow.INSTANCE,
        PaneInfo.NO_FIRING);
readValues.add(CoderHelpers.toByteArray(wv, coder));
finished = !microbatchReader.advance();
}
// end-of-read watermark is the high watermark, but don't allow decrease.
final Instant sourceWatermark = microbatchReader.getWatermark();
highWatermark = sourceWatermark.isAfter(lowWatermark) ? sourceWatermark : lowWatermark;
readDurationMillis = stopwatch.stop().elapsed(TimeUnit.MILLISECONDS);
LOG.info("Source id {} spent {} millis on reading.", microbatchSource.getId(), readDurationMillis);
// if the Source does not supply a CheckpointMark skip updating the state.
@SuppressWarnings("unchecked") final CheckpointMarkT finishedReadCheckpointMark = (CheckpointMarkT) microbatchReader.getCheckpointMark();
byte[] codedCheckpoint = CoderHelpers.toByteArray(finishedReadCheckpointMark, checkpointCoder);
// persist the end-of-read (high) watermark for following read, where it will become
// the next low watermark.
state.update(new Tuple2<>(codedCheckpoint, highWatermark));
} catch (IOException e) {
throw new RuntimeException("Failed to read from reader.", e);
}
final ArrayList<byte[]> payload =
    Lists.newArrayList(Iterators.unmodifiableIterator(readValues.iterator()));
return new Tuple2<>(
    payload,
    new Metadata(
        readValues.size(), lowWatermark, highWatermark, readDurationMillis, metricsContainers));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
};
}
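For context, here is a minimal wiring sketch showing how this function could plug into Spark's mapWithState. The readWithState helper is hypothetical (in Beam the actual wiring lives in the Spark runner's SparkUnboundedSource); StateSpec.function(scala.Function3) and JavaPairDStream.mapWithState are real Spark Streaming APIs.

import org.apache.spark.streaming.StateSpec;
import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;

// Hypothetical helper: applies mapSourceFunction to a stream keyed by the
// (partitioned) Source, with an optional initial CheckpointMark as the value,
// exactly as the Javadoc above describes.
static <T, CheckpointMarkT extends UnboundedSource.CheckpointMark>
    JavaMapWithStateDStream<
            Source<T>,
            CheckpointMarkT,
            Tuple2<byte[], Instant>,
            Tuple2<Iterable<byte[]>, Metadata>>
        readWithState(
            JavaPairDStream<Source<T>, CheckpointMarkT> input,
            SerializablePipelineOptions options,
            String stepName) {
  // The scala.Function3 overload of StateSpec.function keeps Spark's Guava
  // Optional off the classpath, per the Javadoc above.
  return input.mapWithState(
      StateSpec.function(
          StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(options, stepName)));
}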
Use of org.apache.beam.sdk.io.UnboundedSource in project beam by apache.
The class CustomSources, method serializeToCloudSource.
public static com.google.api.services.dataflow.model.Source serializeToCloudSource(
    Source<?> source, PipelineOptions options) throws Exception {
com.google.api.services.dataflow.model.Source cloudSource =
    new com.google.api.services.dataflow.model.Source();
// We ourselves act as the SourceFormat.
cloudSource.setSpec(CloudObject.forClass(CustomSources.class));
addString(
    cloudSource.getSpec(), SERIALIZED_SOURCE, encodeBase64String(serializeToByteArray(source)));
SourceMetadata metadata = new SourceMetadata();
if (source instanceof BoundedSource) {
BoundedSource<?> boundedSource = (BoundedSource<?>) source;
// Size estimation is best effort so we continue even if it fails here.
try {
metadata.setEstimatedSizeBytes(boundedSource.getEstimatedSizeBytes(options));
} catch (Exception e) {
LOG.warn("Size estimation of the source failed: " + source, e);
}
} else if (source instanceof UnboundedSource) {
UnboundedSource<?, ?> unboundedSource = (UnboundedSource<?, ?>) source;
metadata.setInfinite(true);
List<String> encodedSplits = new ArrayList<>();
int desiredNumSplits =
    getDesiredNumUnboundedSourceSplits(options.as(DataflowPipelineOptions.class));
for (UnboundedSource<?, ?> split : unboundedSource.split(desiredNumSplits, options)) {
encodedSplits.add(encodeBase64String(serializeToByteArray(split)));
}
checkArgument(!encodedSplits.isEmpty(), "UnboundedSources must have at least one split");
addStringList(cloudSource.getSpec(), SERIALIZED_SOURCE_SPLITS, encodedSplits);
} else {
throw new IllegalArgumentException("Unexpected source kind: " + source.getClass());
}
cloudSource.setMetadata(metadata);
return cloudSource;
}
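A hedged usage sketch (hypothetical snippet, assuming a caller that can throw Exception; CountingSource.unbounded() stands in for any UnboundedSource):

import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// The options must be convertible to DataflowPipelineOptions, since the desired
// number of unbounded-source splits is derived from them.
DataflowPipelineOptions options =
    PipelineOptionsFactory.create().as(DataflowPipelineOptions.class);
com.google.api.services.dataflow.model.Source cloudSource =
    CustomSources.serializeToCloudSource(CountingSource.unbounded(), options);
// For an unbounded source, the spec now carries base64-encoded splits under
// SERIALIZED_SOURCE_SPLITS, and cloudSource.getMetadata().getInfinite() is true.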
Use of org.apache.beam.sdk.io.UnboundedSource in project beam by apache.
The class ReadTranslationTest, method testToFromProtoUnbounded.
@Test
public void testToFromProtoUnbounded() throws Exception {
assumeThat(source, instanceOf(UnboundedSource.class));
UnboundedSource<?, ?> unboundedSource = (UnboundedSource<?, ?>) this.source;
SplittableParDo.PrimitiveUnboundedRead<?> unboundedRead =
    new SplittableParDo.PrimitiveUnboundedRead<>(Read.from(unboundedSource));
// No environment set for unbounded sources
ReadPayload payload = ReadTranslation.toProto(unboundedRead);
assertThat(payload.getIsBounded(), equalTo(RunnerApi.IsBounded.Enum.UNBOUNDED));
UnboundedSource<?, ?> deserializedSource = ReadTranslation.unboundedSourceFromProto(payload);
assertThat(deserializedSource, equalTo(source));
}
Use of org.apache.beam.sdk.io.UnboundedSource in project beam by apache.
The class StreamingModeExecutionContext, method flushState.
public Map<Long, Runnable> flushState() {
Map<Long, Runnable> callbacks = new HashMap<>();
for (StepContext stepContext : getAllStepContexts()) {
stepContext.flushState();
}
if (activeReader != null) {
Windmill.SourceState.Builder sourceStateBuilder = outputBuilder.getSourceStateUpdatesBuilder();
final UnboundedSource.CheckpointMark checkpointMark = activeReader.getCheckpointMark();
final Instant watermark = activeReader.getWatermark();
long id = ThreadLocalRandom.current().nextLong();
sourceStateBuilder.addFinalizeIds(id);
callbacks.put(id, () -> {
try {
checkpointMark.finalizeCheckpoint();
} catch (IOException e) {
throw new RuntimeException("Exception while finalizing checkpoint", e);
}
});
@SuppressWarnings("unchecked") Coder<UnboundedSource.CheckpointMark> checkpointCoder = ((UnboundedSource<?, UnboundedSource.CheckpointMark>) activeReader.getCurrentSource()).getCheckpointMarkCoder();
if (checkpointCoder != null) {
ByteString.Output stream = ByteString.newOutput();
try {
checkpointCoder.encode(checkpointMark, stream, Coder.Context.OUTER);
} catch (IOException e) {
throw new RuntimeException("Exception while encoding checkpoint", e);
}
sourceStateBuilder.setState(stream.toByteString());
}
outputBuilder.setSourceWatermark(WindmillTimeUtils.harnessToWindmillTimestamp(watermark));
backlogBytes = activeReader.getSplitBacklogBytes();
if (backlogBytes == UnboundedSource.UnboundedReader.BACKLOG_UNKNOWN
    && WorkerCustomSources.isFirstUnboundedSourceSplit(getSerializedKey())) {
// Only call getTotalBacklogBytes() on the first split.
backlogBytes = activeReader.getTotalBacklogBytes();
}
outputBuilder.setSourceBacklogBytes(backlogBytes);
readerCache.cacheReader(
    getComputationKey(), getWork().getCacheToken(), getWork().getWorkToken(), activeReader);
activeReader = null;
}
return callbacks;
}
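As a hedged sketch of the other half of this contract, here is a hypothetical dispatcher that runs the returned callbacks once Windmill echoes back the finalize ids registered above (the real dispatch lives elsewhere in the streaming worker):

// Hypothetical: invoked with the ids Windmill reports as safe to finalize.
static void runFinalizeCallbacks(Map<Long, Runnable> callbacks, Iterable<Long> finalizedIds) {
  for (long id : finalizedIds) {
    Runnable callback = callbacks.remove(id);
    if (callback != null) {
      // Runs the closure registered in flushState(), which calls
      // CheckpointMark.finalizeCheckpoint() for this reader.
      callback.run();
    }
  }
}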
Use of org.apache.beam.sdk.io.UnboundedSource in project beam by apache.
The class UnboundedSourceWrapper, method initializeState.
@Override
public void initializeState(FunctionInitializationContext context) throws Exception {
if (checkpointCoder == null) {
// no checkpoint coder available in this source
return;
}
OperatorStateStore stateStore = context.getOperatorStateStore();
@SuppressWarnings("unchecked") CoderTypeInformation<KV<? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT>> typeInformation = (CoderTypeInformation) new CoderTypeInformation<>(checkpointCoder, serializedOptions.get());
stateForCheckpoint =
    stateStore.getListState(
        new ListStateDescriptor<>(
            DefaultOperatorStateBackend.DEFAULT_OPERATOR_STATE_NAME,
            typeInformation.createSerializer(new ExecutionConfig())));
if (context.isRestored()) {
isRestored = true;
LOG.info("Restoring state in the UnboundedSourceWrapper.");
} else {
LOG.info("No restore state for UnboundedSourceWrapper.");
}
}
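For symmetry, a hedged sketch of the checkpoint-writing side under the same Flink CheckpointedFunction contract. Field names such as localReaders are assumptions for illustration, not the exact Beam implementation:

@Override
public void snapshotState(FunctionSnapshotContext context) throws Exception {
  if (checkpointCoder == null) {
    // no checkpoint coder available in this source
    return;
  }
  stateForCheckpoint.clear();
  // Persist each local reader's source together with its current CheckpointMark,
  // so that initializeState() can resume from it after a restore.
  for (UnboundedSource.UnboundedReader<OutputT> reader : localReaders) {
    @SuppressWarnings("unchecked")
    CheckpointMarkT mark = (CheckpointMarkT) reader.getCheckpointMark();
    @SuppressWarnings("unchecked")
    UnboundedSource<OutputT, CheckpointMarkT> readerSource =
        (UnboundedSource<OutputT, CheckpointMarkT>) reader.getCurrentSource();
    stateForCheckpoint.add(KV.of(readerSource, mark));
  }
}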