Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.
The class ApexParDoOperator, method processElementInReadyWindows.
private Iterable<WindowedValue<InputT>> processElementInReadyWindows(WindowedValue<InputT> elem) {
  try {
    pushbackDoFnRunner.startBundle();
    if (currentKeyStateInternals != null) {
      InputT value = elem.getValue();
      final Object key;
      final Coder<Object> keyCoder;
      @SuppressWarnings({"rawtypes", "unchecked"})
      WindowedValueCoder<InputT> wvCoder = (WindowedValueCoder) inputCoder;
      if (value instanceof KeyedWorkItem) {
        key = ((KeyedWorkItem) value).key();
        @SuppressWarnings({"rawtypes", "unchecked"})
        KeyedWorkItemCoder<Object, ?> kwiCoder = (KeyedWorkItemCoder) wvCoder.getValueCoder();
        keyCoder = kwiCoder.getKeyCoder();
      } else {
        key = ((KV) value).getKey();
        @SuppressWarnings({"rawtypes", "unchecked"})
        KvCoder<Object, ?> kvCoder = (KvCoder) wvCoder.getValueCoder();
        keyCoder = kvCoder.getKeyCoder();
      }
      ((StateInternalsProxy) currentKeyStateInternals).setKey(key);
      currentKeyTimerInternals.setContext(
          key,
          keyCoder,
          new Instant(this.currentInputWatermark),
          new Instant(this.currentOutputWatermark));
    }
    Iterable<WindowedValue<InputT>> pushedBack = pushbackDoFnRunner.processElementInReadyWindows(elem);
    pushbackDoFnRunner.finishBundle();
    return pushedBack;
  } catch (UserCodeException ue) {
    if (ue.getCause() instanceof AssertionError) {
      ApexRunner.ASSERTION_ERROR.set((AssertionError) ue.getCause());
    }
    throw ue;
  }
}
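The contract worth noting here: the iterable returned by processElementInReadyWindows holds the element, in the windows that could not be processed because a required side input was not yet ready, and the operator must retain it for later retry. A minimal sketch of that buffering pattern follows; the field and method names (pushedBackBuffer, processOrBuffer, onSideInputReady) are illustrative, not part of the operator or the Beam API.

// Hypothetical buffering around the pushback contract (names are illustrative).
private final List<WindowedValue<InputT>> pushedBackBuffer = new ArrayList<>();

private void processOrBuffer(WindowedValue<InputT> elem) {
  // Whatever could not be processed yet comes back and must be kept for retry.
  for (WindowedValue<InputT> pushedBack : processElementInReadyWindows(elem)) {
    pushedBackBuffer.add(pushedBack);
  }
}

private void onSideInputReady() {
  // Re-offer buffered elements; some may be pushed back again if other windows
  // are still waiting for their side inputs.
  List<WindowedValue<InputT>> retry = new ArrayList<>(pushedBackBuffer);
  pushedBackBuffer.clear();
  for (WindowedValue<InputT> elem : retry) {
    processOrBuffer(elem);
  }
}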
Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.
The class ParDoTranslatorTest, method testSerialization.
@Test
public void testSerialization() throws Exception {
  ApexPipelineOptions options = PipelineOptionsFactory.create().as(ApexPipelineOptions.class);
  options.setRunner(TestApexRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  Coder<WindowedValue<Integer>> coder = WindowedValue.getValueOnlyCoder(VarIntCoder.of());
  PCollectionView<Integer> singletonView =
      pipeline.apply(Create.of(1)).apply(Sum.integersGlobally().asSingletonView());
  ApexParDoOperator<Integer, Integer> operator =
      new ApexParDoOperator<>(
          options,
          new Add(singletonView),
          new TupleTag<Integer>(),
          TupleTagList.empty().getAll(),
          WindowingStrategy.globalDefault(),
          Collections.<PCollectionView<?>>singletonList(singletonView),
          coder,
          new ApexStateInternals.ApexStateBackend());
  operator.setup(null);
  operator.beginWindow(0);
  WindowedValue<Integer> wv1 = WindowedValue.valueInGlobalWindow(1);
  WindowedValue<Iterable<?>> sideInput =
      WindowedValue.<Iterable<?>>valueInGlobalWindow(Lists.<Integer>newArrayList(22));
  // This input is pushed back, since the side input is not yet available.
  operator.input.process(ApexStreamTuple.DataTuple.of(wv1));
  final List<Object> results = Lists.newArrayList();
  Sink<Object> sink = new Sink<Object>() {
    @Override
    public void put(Object tuple) {
      results.add(tuple);
    }

    @Override
    public int getCount(boolean reset) {
      return 0;
    }
  };
  // Verify that the pushed-back input survives checkpointing (Kryo clone).
  Assert.assertNotNull("Serialization", operator = KryoCloneUtils.cloneObject(operator));
  operator.output.setSink(sink);
  operator.setup(null);
  operator.beginWindow(1);
  WindowedValue<Integer> wv2 = WindowedValue.valueInGlobalWindow(2);
  operator.sideInput1.process(ApexStreamTuple.DataTuple.of(sideInput));
  Assert.assertEquals("number outputs", 1, results.size());
  Assert.assertEquals(
      "result",
      WindowedValue.valueInGlobalWindow(23),
      ((ApexStreamTuple.DataTuple<?>) results.get(0)).getValue());
  // Verify that the side input state survives checkpointing as well.
  results.clear();
  Assert.assertNotNull("Serialization", operator = KryoCloneUtils.cloneObject(operator));
  operator.output.setSink(sink);
  operator.setup(null);
  operator.beginWindow(2);
  operator.input.process(ApexStreamTuple.DataTuple.of(wv2));
  Assert.assertEquals("number outputs", 1, results.size());
  Assert.assertEquals(
      "result",
      WindowedValue.valueInGlobalWindow(24),
      ((ApexStreamTuple.DataTuple<?>) results.get(0)).getValue());
}
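A companion sketch (a hypothetical test, not from the source): the value-only coder used above serializes only the value, so a round trip restores the element as a default value-in-global-window. That is why elements created with valueInGlobalWindow compare equal after being cloned through bytes. CoderUtils is the Beam SDK helper assumed here (org.apache.beam.sdk.util.CoderUtils).

@Test
public void testValueOnlyCoderRoundTrip() throws Exception {
  Coder<WindowedValue<Integer>> coder = WindowedValue.getValueOnlyCoder(VarIntCoder.of());
  WindowedValue<Integer> original = WindowedValue.valueInGlobalWindow(42);
  // Window and pane are not encoded; decoding reconstructs the global-window defaults.
  byte[] bytes = CoderUtils.encodeToByteArray(coder, original);
  WindowedValue<Integer> decoded = CoderUtils.decodeFromByteArray(coder, bytes);
  Assert.assertEquals(original, decoded);
}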
Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.
The class ApexGroupByKeyOperatorTest, method testGlobalWindowMinTimestamp.
@Test
public void testGlobalWindowMinTimestamp() throws Exception {
  ApexPipelineOptions options = PipelineOptionsFactory.create().as(ApexPipelineOptions.class);
  options.setRunner(TestApexRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  WindowingStrategy<?, ?> ws = WindowingStrategy.of(FixedWindows.of(Duration.standardSeconds(10)));
  PCollection<KV<String, Integer>> input =
      PCollection.createPrimitiveOutputInternal(pipeline, ws, IsBounded.BOUNDED);
  input.setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));
  ApexGroupByKeyOperator<String, Integer> operator =
      new ApexGroupByKeyOperator<>(options, input, new ApexStateInternals.ApexStateBackend());
  operator.setup(null);
  operator.beginWindow(1);
  Assert.assertNotNull("Serialization", operator = KryoCloneUtils.cloneObject(operator));
  final List<Object> results = Lists.newArrayList();
  Sink<Object> sink = new Sink<Object>() {
    @Override
    public void put(Object tuple) {
      results.add(tuple);
    }

    @Override
    public int getCount(boolean reset) {
      return 0;
    }
  };
  operator.output.setSink(sink);
  operator.setup(null);
  operator.beginWindow(1);
  Instant windowStart = BoundedWindow.TIMESTAMP_MIN_VALUE;
  BoundedWindow window = new IntervalWindow(windowStart, windowStart.plus(10000));
  PaneInfo paneInfo = PaneInfo.NO_FIRING;
  WindowedValue<KV<String, Integer>> wv1 = WindowedValue.of(KV.of("foo", 1), windowStart, window, paneInfo);
  operator.input.process(ApexStreamTuple.DataTuple.of(wv1));
  WindowedValue<KV<String, Integer>> wv2 = WindowedValue.of(KV.of("foo", 1), windowStart, window, paneInfo);
  operator.input.process(ApexStreamTuple.DataTuple.of(wv2));
  ApexStreamTuple<WindowedValue<KV<String, Integer>>> watermark =
      ApexStreamTuple.WatermarkTuple.of(BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis());
  Assert.assertEquals("number outputs", 0, results.size());
  operator.input.process(watermark);
  Assert.assertEquals("number outputs", 2, results.size());
  @SuppressWarnings({"unchecked", "rawtypes"})
  ApexStreamTuple.DataTuple<WindowedValue<KV<String, Iterable<Integer>>>> dataTuple =
      (ApexStreamTuple.DataTuple) results.get(0);
  List<Integer> counts = Lists.newArrayList(1, 1);
  Assert.assertEquals("iterable", KV.of("foo", counts), dataTuple.getValue().getValue());
  Assert.assertEquals("expected watermark", watermark, results.get(1));
}
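The firing behavior in this test follows from watermark semantics: GroupByKey buffers both elements in keyed state and emits nothing until a watermark at or past the window's end arrives, at which point it outputs the grouped KV("foo", [1, 1]) followed by the forwarded watermark. As a small illustration of the WindowedValue construction contract used above (illustrative values, not from the source):

// An element's timestamp must lie inside the window it is assigned to; the pane
// describes which firing produced it (NO_FIRING for fresh input).
Instant start = new Instant(0);
IntervalWindow window = new IntervalWindow(start, start.plus(Duration.standardSeconds(10)));
WindowedValue<KV<String, Integer>> element =
    WindowedValue.of(KV.of("foo", 1), start, window, PaneInfo.NO_FIRING);
// element.getWindows() contains exactly this one IntervalWindow.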
Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.
The class StateSpecFunctions, method mapSourceFunction.
/**
 * A {@link org.apache.spark.streaming.StateSpec} function to support reading from
 * an {@link UnboundedSource}.
 *
 * <p>This StateSpec function expects the following:
 * <ul>
 * <li>Key: The (partitioned) Source to read from.</li>
 * <li>Value: An optional {@link UnboundedSource.CheckpointMark} to start from.</li>
 * <li>State: A byte representation of the (previously) persisted CheckpointMark.</li>
 * </ul>
 * It returns an iterator over all the values read in the micro-batch.
 *
 * <p>This stateful operation can be described as a flatMap over a single-element stream, which
 * outputs all the elements read from the {@link UnboundedSource} for this micro-batch.
 * Since micro-batches are bounded, the provided UnboundedSource is wrapped in a
 * {@link MicrobatchSource} that applies bounds in the form of duration and max records
 * (per micro-batch).
 *
 * <p>In order to avoid using Spark's Guava classes, which pollute the
 * classpath, we use the {@link StateSpec#function(scala.Function3)} signature, which employs
 * Scala's native {@link scala.Option}, instead of the
 * {@link StateSpec#function(org.apache.spark.api.java.function.Function3)} signature,
 * which employs Guava's {@link com.google.common.base.Optional}.
 *
 * <p>See also <a href="https://issues.apache.org/jira/browse/SPARK-4819">SPARK-4819</a>.
 *
 * @param runtimeContext A serializable {@link SparkRuntimeContext}.
 * @param stepName The name of the step this function belongs to, used to scope metrics.
 * @param <T> The type of the input stream elements.
 * @param <CheckpointMarkT> The type of the {@link UnboundedSource.CheckpointMark}.
 * @return The appropriate {@link org.apache.spark.streaming.StateSpec} function.
 */
public static <T, CheckpointMarkT extends UnboundedSource.CheckpointMark>
    scala.Function3<Source<T>, scala.Option<CheckpointMarkT>, State<Tuple2<byte[], Instant>>,
        Tuple2<Iterable<byte[]>, Metadata>> mapSourceFunction(
            final SparkRuntimeContext runtimeContext, final String stepName) {
  return new SerializableFunction3<Source<T>, Option<CheckpointMarkT>,
      State<Tuple2<byte[], Instant>>, Tuple2<Iterable<byte[]>, Metadata>>() {

    @Override
    public Tuple2<Iterable<byte[]>, Metadata> apply(
        Source<T> source,
        scala.Option<CheckpointMarkT> startCheckpointMark,
        State<Tuple2<byte[], Instant>> state) {
      MetricsContainerStepMap metricsContainers = new MetricsContainerStepMap();
      MetricsContainer metricsContainer = metricsContainers.getContainer(stepName);
      // Add the metrics container to the scope of the reader methods, since they may
      // report metrics.
      try (Closeable ignored = MetricsEnvironment.scopedMetricsContainer(metricsContainer)) {
        // The source is expected to be a MicrobatchSource.
        MicrobatchSource<T, CheckpointMarkT> microbatchSource = (MicrobatchSource<T, CheckpointMarkT>) source;
        // Initial high/low watermarks.
        Instant lowWatermark = BoundedWindow.TIMESTAMP_MIN_VALUE;
        final Instant highWatermark;
        // If state exists, use it; otherwise it's the first time, so use the startCheckpointMark.
        // startCheckpointMark may be EmptyCheckpointMark (the Spark Java API tries to apply
        // Optional(null)), which is handled by the UnboundedSource implementation.
        Coder<CheckpointMarkT> checkpointCoder = microbatchSource.getCheckpointMarkCoder();
        CheckpointMarkT checkpointMark;
        if (state.exists()) {
          // The previous (output) watermark is now the low watermark.
          lowWatermark = state.get()._2();
          checkpointMark = CoderHelpers.fromByteArray(state.get()._1(), checkpointCoder);
          LOG.info("Continue reading from an existing CheckpointMark.");
        } else if (startCheckpointMark.isDefined()
            && !startCheckpointMark.get().equals(EmptyCheckpointMark.get())) {
          checkpointMark = startCheckpointMark.get();
          LOG.info("Start reading from a provided CheckpointMark.");
        } else {
          checkpointMark = null;
          LOG.info("No CheckpointMark provided, start reading from default.");
        }
        // Create the reader.
        final MicrobatchSource.Reader /*<T>*/ microbatchReader;
        final Stopwatch stopwatch = Stopwatch.createStarted();
        long readDurationMillis = 0;
        try {
          microbatchReader = (MicrobatchSource.Reader)
              microbatchSource.getOrCreateReader(runtimeContext.getPipelineOptions(), checkpointMark);
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
        // Read the micro-batch as a serialized collection.
        final List<byte[]> readValues = new ArrayList<>();
        WindowedValue.FullWindowedValueCoder<T> coder =
            WindowedValue.FullWindowedValueCoder.of(source.getDefaultOutputCoder(), GlobalWindow.Coder.INSTANCE);
        try {
          // Measure how long a read takes per-partition.
          boolean finished = !microbatchReader.start();
          while (!finished) {
            final WindowedValue<T> wv = WindowedValue.of(
                (T) microbatchReader.getCurrent(),
                microbatchReader.getCurrentTimestamp(),
                GlobalWindow.INSTANCE,
                PaneInfo.NO_FIRING);
            readValues.add(CoderHelpers.toByteArray(wv, coder));
            finished = !microbatchReader.advance();
          }
          // The end-of-read watermark is the high watermark, but don't allow it to decrease.
          final Instant sourceWatermark = microbatchReader.getWatermark();
          highWatermark = sourceWatermark.isAfter(lowWatermark) ? sourceWatermark : lowWatermark;
          readDurationMillis = stopwatch.stop().elapsed(TimeUnit.MILLISECONDS);
          LOG.info("Source id {} spent {} millis on reading.", microbatchSource.getId(), readDurationMillis);
          // If the Source does not supply a CheckpointMark, skip updating the state.
          @SuppressWarnings("unchecked")
          final CheckpointMarkT finishedReadCheckpointMark =
              (CheckpointMarkT) microbatchReader.getCheckpointMark();
          byte[] codedCheckpoint = new byte[0];
          if (finishedReadCheckpointMark != null) {
            codedCheckpoint = CoderHelpers.toByteArray(finishedReadCheckpointMark, checkpointCoder);
          } else {
            LOG.info("Skipping checkpoint marking because the reader failed to supply one.");
          }
          // Persist the end-of-read (high) watermark for the following read, where it will
          // become the next low watermark.
          state.update(new Tuple2<>(codedCheckpoint, highWatermark));
        } catch (IOException e) {
          throw new RuntimeException("Failed to read from reader.", e);
        }
        final ArrayList<byte[]> payload =
            Lists.newArrayList(Iterators.unmodifiableIterator(readValues.iterator()));
        return new Tuple2<>(
            (Iterable<byte[]>) payload,
            new Metadata(readValues.size(), lowWatermark, highWatermark, readDurationMillis, metricsContainers));
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  };
}
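For context, this is roughly how the returned function is plugged into Spark's mapWithState by the runner, using the scala.Function3 variant of StateSpec.function referenced in the Javadoc above. It is a sketch: sourceDStream and initialParallelism are assumed to exist in the surrounding code and are not shown in the source here.

// Illustrative wiring; mapWithState threads the Tuple2<byte[], Instant> state
// (checkpoint bytes plus watermark) across micro-batches per source partition.
JavaMapWithStateDStream<Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>,
    Tuple2<Iterable<byte[]>, Metadata>> mapWithStateDStream =
        sourceDStream.mapWithState(
            StateSpec.function(
                StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(runtimeContext, stepName))
            .numPartitions(initialParallelism));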
Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.
The class EvaluationContext, method putBoundedDatasetFromValues.
<T> void putBoundedDatasetFromValues(
    PTransform<?, ? extends PValue> transform, Iterable<T> values, Coder<T> coder) {
  PValue output = getOutput(transform);
  if (shouldCache(output)) {
    // Eagerly create the RDD, as it will be reused.
    Iterable<WindowedValue<T>> elems =
        Iterables.transform(values, WindowingHelpers.<T>windowValueFunction());
    WindowedValue.ValueOnlyWindowedValueCoder<T> windowCoder = WindowedValue.getValueOnlyCoder(coder);
    JavaRDD<WindowedValue<T>> rdd =
        getSparkContext()
            .parallelize(CoderHelpers.toByteArrays(elems, windowCoder))
            .map(CoderHelpers.fromByteFunction(windowCoder));
    putDataset(transform, new BoundedDataset<>(rdd));
  } else {
    // Create a BoundedDataset that will create an RDD on demand.
    datasets.put(getOutput(transform), new BoundedDataset<>(values, jsc, coder));
  }
}
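To make the eager branch concrete, here is the same encode/parallelize/decode round trip in isolation. Encoding each windowed element to coder bytes before parallelize() means the elements do not need to be Java-serializable themselves; the executors decode them back on demand. This is a sketch, with `jsc` an assumed JavaSparkContext and the string values chosen for illustration.

// Encode with the value-only coder, ship raw bytes, decode on the executors.
WindowedValue.ValueOnlyWindowedValueCoder<String> wvCoder =
    WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
Iterable<WindowedValue<String>> elems =
    java.util.Arrays.asList(
        WindowedValue.valueInGlobalWindow("a"), WindowedValue.valueInGlobalWindow("b"));
JavaRDD<WindowedValue<String>> rdd =
    jsc.parallelize(CoderHelpers.toByteArrays(elems, wvCoder))
        .map(CoderHelpers.fromByteFunction(wvCoder));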