Example 1 with MetricsContainerStepMap

Use of org.apache.beam.runners.core.metrics.MetricsContainerStepMap in project beam by Apache.

From the class MetricsAccumulator, the method init:

/**
   * Initialize the metrics accumulator if it has not been initialized yet. This method is
   * idempotent.
   */
public static void init(SparkPipelineOptions opts, JavaSparkContext jsc) {
    if (instance == null) {
        synchronized (MetricsAccumulator.class) {
            if (instance == null) {
                Optional<CheckpointDir> maybeCheckpointDir = opts.isStreaming() ? Optional.of(new CheckpointDir(opts.getCheckpointDir())) : Optional.<CheckpointDir>absent();
                Accumulator<MetricsContainerStepMap> accumulator = jsc.sc().accumulator(new MetricsContainerStepMap(), ACCUMULATOR_NAME, new MetricsAccumulatorParam());
                if (maybeCheckpointDir.isPresent()) {
                    Optional<MetricsContainerStepMap> maybeRecoveredValue = recoverValueFromCheckpoint(jsc, maybeCheckpointDir.get());
                    if (maybeRecoveredValue.isPresent()) {
                        accumulator.setValue(maybeRecoveredValue.get());
                    }
                }
                instance = accumulator;
            }
        }
        LOG.info("Instantiated metrics accumulator: " + instance.value());
    }
}
Also used : MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) CheckpointDir(org.apache.beam.runners.spark.translation.streaming.Checkpoint.CheckpointDir)
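For illustration, here is a minimal driver sketch, not taken from the Beam sources, showing the intended calling pattern: init is safe to call repeatedly, and getInstance (relied on by Examples 3 and 5 below) then hands out the shared accumulator. The local-mode Spark setup and class name are assumptions for the demo.

import org.apache.beam.runners.core.metrics.MetricsContainerStepMap;
import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.metrics.MetricsAccumulator;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class MetricsAccumulatorDemo {

    public static void main(String[] args) {
        SparkPipelineOptions opts = PipelineOptionsFactory.as(SparkPipelineOptions.class);
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setMaster("local[2]").setAppName("metrics-accumulator-demo"));
        // The first call instantiates the singleton; the second is a no-op
        // thanks to the double-checked locking in init.
        MetricsAccumulator.init(opts, jsc);
        MetricsAccumulator.init(opts, jsc);
        Accumulator<MetricsContainerStepMap> metrics = MetricsAccumulator.getInstance();
        System.out.println("Current metrics: " + metrics.value());
        jsc.stop();
    }
}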

Example 2 with MetricsContainerStepMap

Use of org.apache.beam.runners.core.metrics.MetricsContainerStepMap in project beam by Apache.

From the class MetricsAccumulator, the method recoverValueFromCheckpoint:

private static Optional<MetricsContainerStepMap> recoverValueFromCheckpoint(JavaSparkContext jsc, CheckpointDir checkpointDir) {
    try {
        Path beamCheckpointPath = checkpointDir.getBeamCheckpointDir();
        Path checkpointFilePath = new Path(beamCheckpointPath, ACCUMULATOR_CHECKPOINT_FILENAME);
        FileSystem fileSystem = checkpointFilePath.getFileSystem(jsc.hadoopConfiguration());
        MetricsContainerStepMap recoveredValue = Checkpoint.readObject(fileSystem, checkpointFilePath);
        if (recoveredValue != null) {
            LOG.info("Recovered metrics from checkpoint.");
            return Optional.of(recoveredValue);
        } else {
            LOG.info("No metrics checkpoint found.");
        }
    } catch (Exception e) {
        throw new RuntimeException("Failure while reading metrics checkpoint.", e);
    }
    return Optional.absent();
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) IOException(java.io.IOException)
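The method above delegates deserialization to Checkpoint.readObject. Below is a hedged sketch of what such a helper might look like; this is an assumption for illustration, not the actual Beam implementation, and it presumes the value was persisted with plain Java serialization.

import java.io.IOException;
import java.io.ObjectInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class CheckpointSketch {

    // Hypothetical stand-in for Checkpoint.readObject: returns null when no
    // checkpoint file exists yet, matching the null check in the caller above.
    @SuppressWarnings("unchecked")
    static <T> T readObject(FileSystem fileSystem, Path path) throws IOException {
        if (!fileSystem.exists(path)) {
            return null;
        }
        try (ObjectInputStream in = new ObjectInputStream(fileSystem.open(path))) {
            return (T) in.readObject();
        } catch (ClassNotFoundException e) {
            throw new IOException("Could not deserialize checkpoint at " + path, e);
        }
    }
}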

Example 3 with MetricsContainerStepMap

Use of org.apache.beam.runners.core.metrics.MetricsContainerStepMap in project beam by Apache.

From the class StreamingTransformTranslator, the method parDo:

private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        public void evaluate(final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
            final DoFn<InputT, OutputT> doFn = transform.getFn();
            rejectSplittable(doFn);
            rejectStateAndTimers(doFn);
            final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
            final SparkPCollectionView pviews = context.getPViews();
            final WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
            @SuppressWarnings("unchecked") UnboundedDataset<InputT> unboundedDataset = ((UnboundedDataset<InputT>) context.borrowDataset(transform));
            JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();
            final String stepName = context.getCurrentTransform().getFullName();
            JavaPairDStream<TupleTag<?>, WindowedValue<?>> all = dStream.transformToPair(new Function<JavaRDD<WindowedValue<InputT>>, JavaPairRDD<TupleTag<?>, WindowedValue<?>>>() {

                @Override
                public JavaPairRDD<TupleTag<?>, WindowedValue<?>> call(JavaRDD<WindowedValue<InputT>> rdd) throws Exception {
                    final Accumulator<NamedAggregators> aggAccum = AggregatorsAccumulator.getInstance();
                    final Accumulator<MetricsContainerStepMap> metricsAccum = MetricsAccumulator.getInstance();
                    final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs = TranslationUtils.getSideInputs(transform.getSideInputs(), JavaSparkContext.fromSparkContext(rdd.context()), pviews);
                    return rdd.mapPartitionsToPair(new MultiDoFnFunction<>(aggAccum, metricsAccum, stepName, doFn, runtimeContext, transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), sideInputs, windowingStrategy, false));
                }
            });
            Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                // cache the DStream if we're going to filter it more than once.
                all.cache();
            }
            for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
                @SuppressWarnings("unchecked") JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
                @SuppressWarnings("unchecked") JavaDStream<WindowedValue<Object>> // Object is the best we can do since different outputs can have different tags
                values = (JavaDStream<WindowedValue<Object>>) (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
                context.putDataset(output.getValue(), new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used : MetricsAccumulator(org.apache.beam.runners.spark.metrics.MetricsAccumulator) AggregatorsAccumulator(org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator) Accumulator(org.apache.spark.Accumulator) TupleTag(org.apache.beam.sdk.values.TupleTag) JavaDStream(org.apache.spark.streaming.api.java.JavaDStream) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SparkRuntimeContext(org.apache.beam.runners.spark.translation.SparkRuntimeContext) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SideInputBroadcast(org.apache.beam.runners.spark.util.SideInputBroadcast) MultiDoFnFunction(org.apache.beam.runners.spark.translation.MultiDoFnFunction) PValue(org.apache.beam.sdk.values.PValue) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) JavaRDD(org.apache.spark.api.java.JavaRDD) TranslationUtils(org.apache.beam.runners.spark.translation.TranslationUtils) ParDo(org.apache.beam.sdk.transforms.ParDo) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext) MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) SparkPCollectionView(org.apache.beam.runners.spark.translation.SparkPCollectionView)
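The heart of this translation is the tag-then-filter pattern: every output element travels through a single pair stream keyed by its TupleTag, and each logical output is recovered with a filter. Below is a stripped-down sketch of that pattern with hypothetical tags and data, using plain RDDs instead of DStreams for brevity.

import java.util.Arrays;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

final class TagFilterSketch {

    static void demo(JavaSparkContext jsc) {
        // Anonymous subclasses capture the element type, the usual Beam idiom.
        TupleTag<String> mainTag = new TupleTag<String>() {};
        TupleTag<String> errorTag = new TupleTag<String>() {};

        JavaPairRDD<TupleTag<?>, String> all = jsc.parallelizePairs(Arrays.asList(
                new Tuple2<TupleTag<?>, String>(mainTag, "ok-1"),
                new Tuple2<TupleTag<?>, String>(errorTag, "bad-1"),
                new Tuple2<TupleTag<?>, String>(mainTag, "ok-2")));

        // Cached because it is filtered once per output, as in evaluate() above.
        all.cache();

        JavaRDD<String> mainOutput = all.filter(kv -> kv._1().equals(mainTag)).values();
        JavaRDD<String> errors = all.filter(kv -> kv._1().equals(errorTag)).values();
        System.out.println(mainOutput.collect() + " / " + errors.collect());
    }
}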

Example 4 with MetricsContainerStepMap

Use of org.apache.beam.runners.core.metrics.MetricsContainerStepMap in project beam by Apache.

From the class StateSpecFunctions, the method mapSourceFunction:

/**
   * A {@link org.apache.spark.streaming.StateSpec} function to support reading from
   * an {@link UnboundedSource}.
   *
   * <p>This StateSpec function expects the following:
   * <ul>
   * <li>Key: The (partitioned) Source to read from.</li>
   * <li>Value: An optional {@link UnboundedSource.CheckpointMark} to start from.</li>
   * <li>State: A byte representation of the (previously) persisted CheckpointMark.</li>
   * </ul>
   * And returns an iterator over all read values (for the micro-batch).
   *
   * <p>This stateful operation could be described as a flatMap over a single-element stream, which
   * outputs all the elements read from the {@link UnboundedSource} for this micro-batch.
   * Since micro-batches are bounded, the provided UnboundedSource is wrapped by a
   * {@link MicrobatchSource} that applies bounds in the form of duration and max records
   * (per micro-batch).
   *
   * <p>In order to avoid using Spark Guava's classes which pollute the
   * classpath, we use the {@link StateSpec#function(scala.Function3)} signature which employs
   * scala's native {@link scala.Option}, instead of the
   * {@link StateSpec#function(org.apache.spark.api.java.function.Function3)} signature,
   * which employs Guava's {@link com.google.common.base.Optional}.
   *
   * <p>See also <a href="https://issues.apache.org/jira/browse/SPARK-4819">SPARK-4819</a>.</p>
   *
   * @param runtimeContext    A serializable {@link SparkRuntimeContext}.
   * @param <T>               The type of the input stream elements.
   * @param <CheckpointMarkT> The type of the {@link UnboundedSource.CheckpointMark}.
   * @return The appropriate {@link org.apache.spark.streaming.StateSpec} function.
   */
public static <T, CheckpointMarkT extends UnboundedSource.CheckpointMark> scala.Function3<Source<T>, scala.Option<CheckpointMarkT>, State<Tuple2<byte[], Instant>>, Tuple2<Iterable<byte[]>, Metadata>> mapSourceFunction(final SparkRuntimeContext runtimeContext, final String stepName) {
    return new SerializableFunction3<Source<T>, Option<CheckpointMarkT>, State<Tuple2<byte[], Instant>>, Tuple2<Iterable<byte[]>, Metadata>>() {

        @Override
        public Tuple2<Iterable<byte[]>, Metadata> apply(Source<T> source, scala.Option<CheckpointMarkT> startCheckpointMark, State<Tuple2<byte[], Instant>> state) {
            MetricsContainerStepMap metricsContainers = new MetricsContainerStepMap();
            MetricsContainer metricsContainer = metricsContainers.getContainer(stepName);
            // Add the metrics container to the scope of the reader's methods,
            // since they may report metrics.
            try (Closeable ignored = MetricsEnvironment.scopedMetricsContainer(metricsContainer)) {
                // source as MicrobatchSource
                MicrobatchSource<T, CheckpointMarkT> microbatchSource = (MicrobatchSource<T, CheckpointMarkT>) source;
                // Initial high/low watermarks.
                Instant lowWatermark = BoundedWindow.TIMESTAMP_MIN_VALUE;
                final Instant highWatermark;
                // if state exists, use it, otherwise it's first time so use the startCheckpointMark.
                // startCheckpointMark may be EmptyCheckpointMark (the Spark Java API tries to apply
                // Optional(null)), which is handled by the UnboundedSource implementation.
                Coder<CheckpointMarkT> checkpointCoder = microbatchSource.getCheckpointMarkCoder();
                CheckpointMarkT checkpointMark;
                if (state.exists()) {
                    // previous (output) watermark is now the low watermark.
                    lowWatermark = state.get()._2();
                    checkpointMark = CoderHelpers.fromByteArray(state.get()._1(), checkpointCoder);
                    LOG.info("Continue reading from an existing CheckpointMark.");
                } else if (startCheckpointMark.isDefined() && !startCheckpointMark.get().equals(EmptyCheckpointMark.get())) {
                    checkpointMark = startCheckpointMark.get();
                    LOG.info("Start reading from a provided CheckpointMark.");
                } else {
                    checkpointMark = null;
                    LOG.info("No CheckpointMark provided, start reading from default.");
                }
                // create reader.
                final MicrobatchSource.Reader /*<T>*/ microbatchReader;
                final Stopwatch stopwatch = Stopwatch.createStarted();
                long readDurationMillis = 0;
                try {
                    microbatchReader = (MicrobatchSource.Reader) microbatchSource.getOrCreateReader(runtimeContext.getPipelineOptions(), checkpointMark);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
                // read microbatch as a serialized collection.
                final List<byte[]> readValues = new ArrayList<>();
                WindowedValue.FullWindowedValueCoder<T> coder = WindowedValue.FullWindowedValueCoder.of(source.getDefaultOutputCoder(), GlobalWindow.Coder.INSTANCE);
                try {
                    // measure how long a read takes per-partition.
                    boolean finished = !microbatchReader.start();
                    while (!finished) {
                        final WindowedValue<T> wv = WindowedValue.of((T) microbatchReader.getCurrent(), microbatchReader.getCurrentTimestamp(), GlobalWindow.INSTANCE, PaneInfo.NO_FIRING);
                        readValues.add(CoderHelpers.toByteArray(wv, coder));
                        finished = !microbatchReader.advance();
                    }
                    // end-of-read watermark is the high watermark, but don't allow decrease.
                    final Instant sourceWatermark = microbatchReader.getWatermark();
                    highWatermark = sourceWatermark.isAfter(lowWatermark) ? sourceWatermark : lowWatermark;
                    readDurationMillis = stopwatch.stop().elapsed(TimeUnit.MILLISECONDS);
                    LOG.info("Source id {} spent {} millis on reading.", microbatchSource.getId(), readDurationMillis);
                    // if the Source does not supply a CheckpointMark skip updating the state.
                    @SuppressWarnings("unchecked") final CheckpointMarkT finishedReadCheckpointMark = (CheckpointMarkT) microbatchReader.getCheckpointMark();
                    byte[] codedCheckpoint = new byte[0];
                    if (finishedReadCheckpointMark != null) {
                        codedCheckpoint = CoderHelpers.toByteArray(finishedReadCheckpointMark, checkpointCoder);
                    } else {
                        LOG.info("Skipping checkpoint marking because the reader failed to supply one.");
                    }
                    // persist the end-of-read (high) watermark for following read, where it will become
                    // the next low watermark.
                    state.update(new Tuple2<>(codedCheckpoint, highWatermark));
                } catch (IOException e) {
                    throw new RuntimeException("Failed to read from reader.", e);
                }
                final ArrayList<byte[]> payload = Lists.newArrayList(Iterators.unmodifiableIterator(readValues.iterator()));
                return new Tuple2<>((Iterable<byte[]>) payload, new Metadata(readValues.size(), lowWatermark, highWatermark, readDurationMillis, metricsContainers));
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    };
}
Also used : MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) Closeable(java.io.Closeable) Metadata(org.apache.beam.runners.spark.io.SparkUnboundedSource.Metadata) Stopwatch(com.google.common.base.Stopwatch) ArrayList(java.util.ArrayList) UnboundedSource(org.apache.beam.sdk.io.UnboundedSource) Source(org.apache.beam.sdk.io.Source) MicrobatchSource(org.apache.beam.runners.spark.io.MicrobatchSource) MetricsContainer(org.apache.beam.sdk.metrics.MetricsContainer) WindowedValue(org.apache.beam.sdk.util.WindowedValue) MicrobatchSource(org.apache.beam.runners.spark.io.MicrobatchSource) Instant(org.joda.time.Instant) IOException(java.io.IOException) Tuple2(scala.Tuple2) State(org.apache.spark.streaming.State) Option(scala.Option)
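For context, here is a hedged sketch of how a function like this gets plugged into Spark streaming, simplified from what the Spark runner does in SparkUnboundedSource; the pair-DStream parameter and wrapper class are assumptions for illustration.

import org.apache.beam.runners.spark.io.SparkUnboundedSource.Metadata;
import org.apache.beam.runners.spark.stateful.StateSpecFunctions;
import org.apache.beam.runners.spark.translation.SparkRuntimeContext;
import org.apache.beam.sdk.io.Source;
import org.apache.beam.sdk.io.UnboundedSource;
import org.apache.spark.streaming.StateSpec;
import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.joda.time.Instant;
import scala.Tuple2;

final class MapSourceWiringSketch<T, CheckpointMarkT extends UnboundedSource.CheckpointMark> {

    // sourceDStream pairs each (partitioned) source with an optional starting
    // checkpoint mark, matching the Key/Value contract described in the javadoc above.
    JavaMapWithStateDStream<Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>, Tuple2<Iterable<byte[]>, Metadata>>
            wire(JavaPairDStream<Source<T>, CheckpointMarkT> sourceDStream,
                 SparkRuntimeContext runtimeContext, String stepName) {
        // The scala.Function3 overload keeps Guava's Optional off the classpath (see SPARK-4819).
        return sourceDStream.mapWithState(
                StateSpec.function(
                        StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(runtimeContext, stepName)));
    }
}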

Example 5 with MetricsContainerStepMap

Use of org.apache.beam.runners.core.metrics.MetricsContainerStepMap in project beam by Apache.

From the class TransformTranslator, the method parDo:

private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        @SuppressWarnings("unchecked")
        public void evaluate(ParDo.MultiOutput<InputT, OutputT> transform, EvaluationContext context) {
            String stepName = context.getCurrentTransform().getFullName();
            DoFn<InputT, OutputT> doFn = transform.getFn();
            rejectSplittable(doFn);
            JavaRDD<WindowedValue<InputT>> inRDD = ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();
            WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
            Accumulator<NamedAggregators> aggAccum = AggregatorsAccumulator.getInstance();
            Accumulator<MetricsContainerStepMap> metricsAccum = MetricsAccumulator.getInstance();
            JavaPairRDD<TupleTag<?>, WindowedValue<?>> all;
            DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
            boolean stateful = signature.stateDeclarations().size() > 0 || signature.timerDeclarations().size() > 0;
            MultiDoFnFunction<InputT, OutputT> multiDoFnFunction = new MultiDoFnFunction<>(aggAccum, metricsAccum, stepName, doFn, context.getRuntimeContext(), transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), TranslationUtils.getSideInputs(transform.getSideInputs(), context), windowingStrategy, stateful);
            if (stateful) {
                // Based on the fact that the signature is stateful, DoFnSignatures ensures
                // that it is also keyed
                all = statefulParDoTransform((KvCoder) context.getInput(transform).getCoder(), windowingStrategy.getWindowFn().windowCoder(), (JavaRDD) inRDD, (MultiDoFnFunction) multiDoFnFunction);
            } else {
                all = inRDD.mapPartitionsToPair(multiDoFnFunction);
            }
            Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                // cache the RDD if we're going to filter it more than once.
                all.cache();
            }
            for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
                JavaPairRDD<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
                // Object is the best we can do since different outputs can have different tags
                JavaRDD<WindowedValue<Object>> values = (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) filtered.values();
                context.putDataset(output.getValue(), new BoundedDataset<>(values));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used : MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) TupleTag(org.apache.beam.sdk.values.TupleTag) WindowedValue(org.apache.beam.sdk.util.WindowedValue) NamedAggregators(org.apache.beam.runners.spark.aggregators.NamedAggregators) KvCoder(org.apache.beam.sdk.coders.KvCoder) PValue(org.apache.beam.sdk.values.PValue) JavaRDD(org.apache.spark.api.java.JavaRDD) ParDo(org.apache.beam.sdk.transforms.ParDo) MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) Map(java.util.Map) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)
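The stateful branch is taken whenever the DoFn declares state or timers. A small self-contained sketch with a hypothetical stateful DoFn shows the DoFnSignatures check used above; package locations follow recent Beam releases and should be treated as assumptions.

import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.state.StateSpec;
import org.apache.beam.sdk.state.StateSpecs;
import org.apache.beam.sdk.state.ValueState;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.reflect.DoFnSignature;
import org.apache.beam.sdk.transforms.reflect.DoFnSignatures;
import org.apache.beam.sdk.values.KV;

final class StatefulCheckSketch {

    // A hypothetical stateful DoFn: one @StateId declaration is enough to
    // route the translation into statefulParDoTransform above.
    static class RunningSumFn extends DoFn<KV<String, Integer>, Integer> {

        @StateId("sum")
        private final StateSpec<ValueState<Integer>> sumSpec = StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void processElement(ProcessContext c, @StateId("sum") ValueState<Integer> sum) {
            Integer previous = sum.read();
            int current = (previous == null ? 0 : previous) + c.element().getValue();
            sum.write(current);
            c.output(current);
        }
    }

    public static void main(String[] args) {
        DoFnSignature signature = DoFnSignatures.getSignature(RunningSumFn.class);
        boolean stateful = !signature.stateDeclarations().isEmpty()
                || !signature.timerDeclarations().isEmpty();
        System.out.println("stateful = " + stateful); // prints: stateful = true
    }
}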

Aggregations

MetricsContainerStepMap (org.apache.beam.runners.core.metrics.MetricsContainerStepMap): 5 uses
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 3 uses
IOException (java.io.IOException): 2 uses
Map (java.util.Map): 2 uses
ParDo (org.apache.beam.sdk.transforms.ParDo): 2 uses
PValue (org.apache.beam.sdk.values.PValue): 2 uses
TupleTag (org.apache.beam.sdk.values.TupleTag): 2 uses
JavaRDD (org.apache.spark.api.java.JavaRDD): 2 uses
Stopwatch (com.google.common.base.Stopwatch): 1 use
ImmutableMap (com.google.common.collect.ImmutableMap): 1 use
Closeable (java.io.Closeable): 1 use
ArrayList (java.util.ArrayList): 1 use
AggregatorsAccumulator (org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator): 1 use
NamedAggregators (org.apache.beam.runners.spark.aggregators.NamedAggregators): 1 use
MicrobatchSource (org.apache.beam.runners.spark.io.MicrobatchSource): 1 use
Metadata (org.apache.beam.runners.spark.io.SparkUnboundedSource.Metadata): 1 use
MetricsAccumulator (org.apache.beam.runners.spark.metrics.MetricsAccumulator): 1 use
EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext): 1 use
MultiDoFnFunction (org.apache.beam.runners.spark.translation.MultiDoFnFunction): 1 use
SparkPCollectionView (org.apache.beam.runners.spark.translation.SparkPCollectionView): 1 use