Use of org.apache.beam.runners.core.metrics.MetricsContainerStepMap in project beam by apache.
Example from the class MetricsAccumulator, method init.
/**
 * Init metrics accumulator if it has not been initialized yet. This method is idempotent.
 */
public static void init(SparkPipelineOptions opts, JavaSparkContext jsc) {
  if (instance == null) {
    synchronized (MetricsAccumulator.class) {
      if (instance == null) {
        // Double-checked locking: only the first caller creates the shared accumulator.
        Optional<CheckpointDir> maybeCheckpointDir =
            opts.isStreaming()
                ? Optional.of(new CheckpointDir(opts.getCheckpointDir()))
                : Optional.<CheckpointDir>absent();
        Accumulator<MetricsContainerStepMap> accumulator =
            jsc.sc().accumulator(
                new MetricsContainerStepMap(), ACCUMULATOR_NAME, new MetricsAccumulatorParam());
        if (maybeCheckpointDir.isPresent()) {
          // In streaming mode, restore metrics accumulated before a previous failure, if any.
          Optional<MetricsContainerStepMap> maybeRecoveredValue =
              recoverValueFromCheckpoint(jsc, maybeCheckpointDir.get());
          if (maybeRecoveredValue.isPresent()) {
            accumulator.setValue(maybeRecoveredValue.get());
          }
        }
        instance = accumulator;
      }
    }
    LOG.info("Instantiated metrics accumulator: " + instance.value());
  }
}
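For orientation, here is a minimal driver-side sketch of how this accumulator is typically consumed once the pipeline has run. The init, getInstance and value calls appear in the snippets on this page; the options and jsc variables are hypothetical placeholders for the caller's SparkPipelineOptions and JavaSparkContext.

// Sketch only: `options` and `jsc` are assumed to exist in the calling context.
MetricsAccumulator.init(options, jsc);   // safe to call more than once (idempotent, see above)
// ... translate and run the pipeline ...
MetricsContainerStepMap metrics = MetricsAccumulator.getInstance().value();
LOG.info("Accumulated metrics: {}", metrics);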
Use of org.apache.beam.runners.core.metrics.MetricsContainerStepMap in project beam by apache.
Example from the class MetricsAccumulator, method recoverValueFromCheckpoint.
private static Optional<MetricsContainerStepMap> recoverValueFromCheckpoint(
    JavaSparkContext jsc, CheckpointDir checkpointDir) {
  try {
    Path beamCheckpointPath = checkpointDir.getBeamCheckpointDir();
    Path checkpointFilePath = new Path(beamCheckpointPath, ACCUMULATOR_CHECKPOINT_FILENAME);
    FileSystem fileSystem = checkpointFilePath.getFileSystem(jsc.hadoopConfiguration());
    MetricsContainerStepMap recoveredValue = Checkpoint.readObject(fileSystem, checkpointFilePath);
    if (recoveredValue != null) {
      LOG.info("Recovered metrics from checkpoint.");
      return Optional.of(recoveredValue);
    } else {
      LOG.info("No metrics checkpoint found.");
    }
  } catch (Exception e) {
    throw new RuntimeException("Failure while reading metrics checkpoint.", e);
  }
  return Optional.absent();
}
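The snippet above only covers recovery. For completeness, below is a hedged sketch of what the write side could look like: it serializes the accumulator value to the same checkpoint path using plain Hadoop FileSystem and Java serialization. The runner may use a dedicated Checkpoint helper for this instead, so treat the checkpointValue method purely as an illustration; it assumes java.io.ObjectOutputStream and the org.apache.hadoop.fs classes, and that MetricsContainerStepMap is Java-serializable.

// Illustrative sketch (not necessarily the runner's actual mechanism): persist the current
// accumulator value to the Beam checkpoint directory so it can be recovered above.
private static void checkpointValue(
    JavaSparkContext jsc, CheckpointDir checkpointDir, MetricsContainerStepMap value) {
  Path checkpointFilePath =
      new Path(checkpointDir.getBeamCheckpointDir(), ACCUMULATOR_CHECKPOINT_FILENAME);
  try {
    FileSystem fileSystem = checkpointFilePath.getFileSystem(jsc.hadoopConfiguration());
    // Overwrite any previous checkpoint; MetricsContainerStepMap is assumed Serializable here.
    try (ObjectOutputStream out =
        new ObjectOutputStream(fileSystem.create(checkpointFilePath, true))) {
      out.writeObject(value);
    }
  } catch (IOException e) {
    throw new RuntimeException("Failure while writing metrics checkpoint.", e);
  }
}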
Use of org.apache.beam.runners.core.metrics.MetricsContainerStepMap in project beam by apache.
Example from the class StreamingTransformTranslator, method parDo.
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
  return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

    @Override
    public void evaluate(
        final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
      final DoFn<InputT, OutputT> doFn = transform.getFn();
      rejectSplittable(doFn);
      rejectStateAndTimers(doFn);
      final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
      final SparkPCollectionView pviews = context.getPViews();
      final WindowingStrategy<?, ?> windowingStrategy =
          context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      UnboundedDataset<InputT> unboundedDataset =
          ((UnboundedDataset<InputT>) context.borrowDataset(transform));
      JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();
      final String stepName = context.getCurrentTransform().getFullName();
      JavaPairDStream<TupleTag<?>, WindowedValue<?>> all =
          dStream.transformToPair(
              new Function<JavaRDD<WindowedValue<InputT>>,
                  JavaPairRDD<TupleTag<?>, WindowedValue<?>>>() {
                @Override
                public JavaPairRDD<TupleTag<?>, WindowedValue<?>> call(
                    JavaRDD<WindowedValue<InputT>> rdd) throws Exception {
                  final Accumulator<NamedAggregators> aggAccum =
                      AggregatorsAccumulator.getInstance();
                  final Accumulator<MetricsContainerStepMap> metricsAccum =
                      MetricsAccumulator.getInstance();
                  final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs =
                      TranslationUtils.getSideInputs(
                          transform.getSideInputs(),
                          JavaSparkContext.fromSparkContext(rdd.context()),
                          pviews);
                  return rdd.mapPartitionsToPair(
                      new MultiDoFnFunction<>(
                          aggAccum,
                          metricsAccum,
                          stepName,
                          doFn,
                          runtimeContext,
                          transform.getMainOutputTag(),
                          transform.getAdditionalOutputTags().getAll(),
                          sideInputs,
                          windowingStrategy,
                          false));
                }
              });
      Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
      if (outputs.size() > 1) {
        // cache the DStream if we're going to filter it more than once.
        all.cache();
      }
      for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
        @SuppressWarnings("unchecked")
        JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered =
            all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
        // Object is the best we can do since different outputs can have different tags.
        @SuppressWarnings("unchecked")
        JavaDStream<WindowedValue<Object>> values =
            (JavaDStream<WindowedValue<Object>>)
                (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
        context.putDataset(
            output.getValue(),
            new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
      }
    }

    @Override
    public String toNativeString() {
      return "mapPartitions(new <fn>())";
    }
  };
}
Use of org.apache.beam.runners.core.metrics.MetricsContainerStepMap in project beam by apache.
Example from the class StateSpecFunctions, method mapSourceFunction.
/**
* A {@link org.apache.spark.streaming.StateSpec} function to support reading from
* an {@link UnboundedSource}.
*
* <p>This StateSpec function expects the following:
* <ul>
* <li>Key: The (partitioned) Source to read from.</li>
* <li>Value: An optional {@link UnboundedSource.CheckpointMark} to start from.</li>
* <li>State: A byte representation of the (previously) persisted CheckpointMark.</li>
* </ul>
* And returns an iterator over all read values (for the micro-batch).
*
* <p>This stateful operation could be described as a flatMap over a single-element stream, which
* outputs all the elements read from the {@link UnboundedSource} for this micro-batch.
* Since micro-batches are bounded, the provided UnboundedSource is wrapped by a
* {@link MicrobatchSource} that applies bounds in the form of duration and max records
* (per micro-batch).
*
* <p>In order to avoid using Spark's Guava classes, which pollute the
* classpath, we use the {@link StateSpec#function(scala.Function3)} signature, which employs
* Scala's native {@link scala.Option}, instead of the
* {@link StateSpec#function(org.apache.spark.api.java.function.Function3)} signature,
* which employs Guava's {@link com.google.common.base.Optional}.
*
* <p>See also <a href="https://issues.apache.org/jira/browse/SPARK-4819">SPARK-4819</a>.
*
* @param runtimeContext A serializable {@link SparkRuntimeContext}.
* @param <T> The type of the input stream elements.
* @param <CheckpointMarkT> The type of the {@link UnboundedSource.CheckpointMark}.
* @return The appropriate {@link org.apache.spark.streaming.StateSpec} function.
*/
public static <T, CheckpointMarkT extends UnboundedSource.CheckpointMark>
    scala.Function3<Source<T>, scala.Option<CheckpointMarkT>, State<Tuple2<byte[], Instant>>,
        Tuple2<Iterable<byte[]>, Metadata>>
    mapSourceFunction(final SparkRuntimeContext runtimeContext, final String stepName) {
  return new SerializableFunction3<Source<T>, Option<CheckpointMarkT>,
      State<Tuple2<byte[], Instant>>, Tuple2<Iterable<byte[]>, Metadata>>() {

    @Override
    public Tuple2<Iterable<byte[]>, Metadata> apply(
        Source<T> source,
        scala.Option<CheckpointMarkT> startCheckpointMark,
        State<Tuple2<byte[], Instant>> state) {
      MetricsContainerStepMap metricsContainers = new MetricsContainerStepMap();
      MetricsContainer metricsContainer = metricsContainers.getContainer(stepName);
      // Put the metrics container in scope for the reader's methods, since they may report metrics.
      try (Closeable ignored = MetricsEnvironment.scopedMetricsContainer(metricsContainer)) {
        // The source was wrapped by the runner as a MicrobatchSource (see javadoc above).
        MicrobatchSource<T, CheckpointMarkT> microbatchSource =
            (MicrobatchSource<T, CheckpointMarkT>) source;
        // Initial high/low watermarks.
        Instant lowWatermark = BoundedWindow.TIMESTAMP_MIN_VALUE;
        final Instant highWatermark;
        // If state exists, use it; otherwise it's the first time, so use the startCheckpointMark.
        // startCheckpointMark may be EmptyCheckpointMark (the Spark Java API tries to apply
        // Optional(null)), which is handled by the UnboundedSource implementation.
        Coder<CheckpointMarkT> checkpointCoder = microbatchSource.getCheckpointMarkCoder();
        CheckpointMarkT checkpointMark;
        if (state.exists()) {
          // The previous (output) watermark is now the low watermark.
          lowWatermark = state.get()._2();
          checkpointMark = CoderHelpers.fromByteArray(state.get()._1(), checkpointCoder);
          LOG.info("Continue reading from an existing CheckpointMark.");
        } else if (startCheckpointMark.isDefined()
            && !startCheckpointMark.get().equals(EmptyCheckpointMark.get())) {
          checkpointMark = startCheckpointMark.get();
          LOG.info("Start reading from a provided CheckpointMark.");
        } else {
          checkpointMark = null;
          LOG.info("No CheckpointMark provided, start reading from default.");
        }
        // Create the reader.
        final MicrobatchSource.Reader /*<T>*/ microbatchReader;
        final Stopwatch stopwatch = Stopwatch.createStarted();
        long readDurationMillis = 0;
        try {
          microbatchReader =
              (MicrobatchSource.Reader)
                  microbatchSource.getOrCreateReader(
                      runtimeContext.getPipelineOptions(), checkpointMark);
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
        // Read the micro-batch as a serialized collection.
        final List<byte[]> readValues = new ArrayList<>();
        WindowedValue.FullWindowedValueCoder<T> coder =
            WindowedValue.FullWindowedValueCoder.of(
                source.getDefaultOutputCoder(), GlobalWindow.Coder.INSTANCE);
        try {
          // Measure how long a read takes per-partition.
          boolean finished = !microbatchReader.start();
          while (!finished) {
            final WindowedValue<T> wv =
                WindowedValue.of(
                    (T) microbatchReader.getCurrent(),
                    microbatchReader.getCurrentTimestamp(),
                    GlobalWindow.INSTANCE,
                    PaneInfo.NO_FIRING);
            readValues.add(CoderHelpers.toByteArray(wv, coder));
            finished = !microbatchReader.advance();
          }
          // The end-of-read watermark is the high watermark, but don't allow it to decrease.
          final Instant sourceWatermark = microbatchReader.getWatermark();
          highWatermark = sourceWatermark.isAfter(lowWatermark) ? sourceWatermark : lowWatermark;
          readDurationMillis = stopwatch.stop().elapsed(TimeUnit.MILLISECONDS);
          LOG.info("Source id {} spent {} millis on reading.",
              microbatchSource.getId(), readDurationMillis);
          // If the Source does not supply a CheckpointMark, skip updating the state.
          @SuppressWarnings("unchecked")
          final CheckpointMarkT finishedReadCheckpointMark =
              (CheckpointMarkT) microbatchReader.getCheckpointMark();
          byte[] codedCheckpoint = new byte[0];
          if (finishedReadCheckpointMark != null) {
            codedCheckpoint = CoderHelpers.toByteArray(finishedReadCheckpointMark, checkpointCoder);
          } else {
            LOG.info("Skipping checkpoint marking because the reader failed to supply one.");
          }
          // Persist the end-of-read (high) watermark for the following read, where it will become
          // the next low watermark.
          state.update(new Tuple2<>(codedCheckpoint, highWatermark));
        } catch (IOException e) {
          throw new RuntimeException("Failed to read from reader.", e);
        }
        final ArrayList<byte[]> payload =
            Lists.newArrayList(Iterators.unmodifiableIterator(readValues.iterator()));
        return new Tuple2<>(
            (Iterable<byte[]>) payload,
            new Metadata(
                readValues.size(), lowWatermark, highWatermark, readDurationMillis, metricsContainers));
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  };
}
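As the javadoc above explains, the returned scala.Function3 is meant to be plugged into Spark's mapWithState via the StateSpec.function(scala.Function3) overload. A minimal wiring sketch is shown below; the readWithState helper and its inputDStream argument are assumptions about the surrounding translation code, under the contract above that each key is a (partitioned) Source and each value an optional CheckpointMark.

// Sketch: wiring the StateSpec function into Spark's mapWithState, as described in the javadoc.
// The helper name and the inputDStream argument are placeholders, not runner API.
static <T, CheckpointMarkT extends UnboundedSource.CheckpointMark>
    JavaMapWithStateDStream<Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>,
        Tuple2<Iterable<byte[]>, Metadata>>
    readWithState(
        JavaPairDStream<Source<T>, CheckpointMarkT> inputDStream,
        SparkRuntimeContext runtimeContext,
        String stepName) {
  return inputDStream.mapWithState(
      StateSpec.function(
          StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(runtimeContext, stepName)));
}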
Use of org.apache.beam.runners.core.metrics.MetricsContainerStepMap in project beam by apache.
Example from the class TransformTranslator, method parDo.
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
  return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

    @Override
    @SuppressWarnings("unchecked")
    public void evaluate(ParDo.MultiOutput<InputT, OutputT> transform, EvaluationContext context) {
      String stepName = context.getCurrentTransform().getFullName();
      DoFn<InputT, OutputT> doFn = transform.getFn();
      rejectSplittable(doFn);
      JavaRDD<WindowedValue<InputT>> inRDD =
          ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();
      WindowingStrategy<?, ?> windowingStrategy =
          context.getInput(transform).getWindowingStrategy();
      Accumulator<NamedAggregators> aggAccum = AggregatorsAccumulator.getInstance();
      Accumulator<MetricsContainerStepMap> metricsAccum = MetricsAccumulator.getInstance();
      JavaPairRDD<TupleTag<?>, WindowedValue<?>> all;
      DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
      boolean stateful =
          signature.stateDeclarations().size() > 0 || signature.timerDeclarations().size() > 0;
      MultiDoFnFunction<InputT, OutputT> multiDoFnFunction =
          new MultiDoFnFunction<>(
              aggAccum,
              metricsAccum,
              stepName,
              doFn,
              context.getRuntimeContext(),
              transform.getMainOutputTag(),
              transform.getAdditionalOutputTags().getAll(),
              TranslationUtils.getSideInputs(transform.getSideInputs(), context),
              windowingStrategy,
              stateful);
      if (stateful) {
        // Based on the fact that the signature is stateful, DoFnSignatures ensures
        // that it is also keyed.
        all =
            statefulParDoTransform(
                (KvCoder) context.getInput(transform).getCoder(),
                windowingStrategy.getWindowFn().windowCoder(),
                (JavaRDD) inRDD,
                (MultiDoFnFunction) multiDoFnFunction);
      } else {
        all = inRDD.mapPartitionsToPair(multiDoFnFunction);
      }
      Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
      if (outputs.size() > 1) {
        // cache the RDD if we're going to filter it more than once.
        all.cache();
      }
      for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
        JavaPairRDD<TupleTag<?>, WindowedValue<?>> filtered =
            all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
        // Object is the best we can do since different outputs can have different tags.
        JavaRDD<WindowedValue<Object>> values =
            (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) filtered.values();
        context.putDataset(output.getValue(), new BoundedDataset<>(values));
      }
    }

    @Override
    public String toNativeString() {
      return "mapPartitions(new <fn>())";
    }
  };
}
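Both parDo translations above hand the shared metrics accumulator to MultiDoFnFunction, which merges per-step metrics into the MetricsContainerStepMap. Below is a hedged sketch of how such a map can be turned into queryable Beam metrics on the driver after execution; it assumes the asAttemptedOnlyMetricResults helper exists in the Beam version at hand, and uses the 2.x-era MetricResult accessor names, so adjust to your version.

// Sketch: query the accumulated metrics after the pipeline has run.
// asAttemptedOnlyMetricResults and the MetricResult accessors are assumptions about the Beam version.
MetricsContainerStepMap stepMap = MetricsAccumulator.getInstance().value();
MetricResults results = MetricsContainerStepMap.asAttemptedOnlyMetricResults(stepMap);
MetricQueryResults query = results.queryMetrics(MetricsFilter.builder().build());
for (MetricResult<Long> counter : query.counters()) {
  LOG.info("Counter {} in step {}: attempted value {}",
      counter.name(), counter.step(), counter.attempted());
}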