
Example 11 with WindowedValueCoder

use of org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder in project beam by apache.

In the class StreamingDataflowWorker, the method process:

private void process(final SdkWorkerHarness worker, final ComputationState computationState, final Instant inputDataWatermark, @Nullable final Instant outputDataWatermark, @Nullable final Instant synchronizedProcessingTime, final Work work) {
    final Windmill.WorkItem workItem = work.getWorkItem();
    final String computationId = computationState.getComputationId();
    final ByteString key = workItem.getKey();
    work.setState(State.PROCESSING);
    {
        StringBuilder workIdBuilder = new StringBuilder(33);
        workIdBuilder.append(Long.toHexString(workItem.getShardingKey()));
        workIdBuilder.append('-');
        workIdBuilder.append(Long.toHexString(workItem.getWorkToken()));
        DataflowWorkerLoggingMDC.setWorkId(workIdBuilder.toString());
    }
    DataflowWorkerLoggingMDC.setStageName(computationId);
    LOG.debug("Starting processing for {}:\n{}", computationId, work);
    Windmill.WorkItemCommitRequest.Builder outputBuilder = initializeOutputBuilder(key, workItem);
    // Before any processing starts, call any pending OnCommit callbacks.  Nothing that requires
    // cleanup should be done before this, since we might exit early here.
    callFinalizeCallbacks(workItem);
    if (workItem.getSourceState().getOnlyFinalize()) {
        outputBuilder.setSourceStateUpdates(Windmill.SourceState.newBuilder().setOnlyFinalize(true));
        work.setState(State.COMMIT_QUEUED);
        commitQueue.put(new Commit(outputBuilder.build(), computationState, work));
        return;
    }
    long processingStartTimeNanos = System.nanoTime();
    final MapTask mapTask = computationState.getMapTask();
    StageInfo stageInfo = stageInfoMap.computeIfAbsent(mapTask.getStageName(), s -> new StageInfo(s, mapTask.getSystemName(), this));
    ExecutionState executionState = null;
    try {
        executionState = computationState.getExecutionStateQueue(worker).poll();
        if (executionState == null) {
            MutableNetwork<Node, Edge> mapTaskNetwork = mapTaskToNetwork.apply(mapTask);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Network as Graphviz .dot: {}", Networks.toDot(mapTaskNetwork));
            }
            ParallelInstructionNode readNode = (ParallelInstructionNode) Iterables.find(mapTaskNetwork.nodes(), node -> node instanceof ParallelInstructionNode && ((ParallelInstructionNode) node).getParallelInstruction().getRead() != null);
            InstructionOutputNode readOutputNode = (InstructionOutputNode) Iterables.getOnlyElement(mapTaskNetwork.successors(readNode));
            DataflowExecutionContext.DataflowExecutionStateTracker executionStateTracker = new DataflowExecutionContext.DataflowExecutionStateTracker(ExecutionStateSampler.instance(), stageInfo.executionStateRegistry.getState(NameContext.forStage(mapTask.getStageName()), "other", null, ScopedProfiler.INSTANCE.emptyScope()), stageInfo.deltaCounters, options, computationId);
            StreamingModeExecutionContext context = new StreamingModeExecutionContext(pendingDeltaCounters, computationId, readerCache, !computationState.getTransformUserNameToStateFamily().isEmpty() ? computationState.getTransformUserNameToStateFamily() : stateNameMap, stateCache.forComputation(computationId), stageInfo.metricsContainerRegistry, executionStateTracker, stageInfo.executionStateRegistry, maxSinkBytes);
            DataflowMapTaskExecutor mapTaskExecutor = mapTaskExecutorFactory.create(worker.getControlClientHandler(), worker.getGrpcDataFnServer(), sdkHarnessRegistry.beamFnDataApiServiceDescriptor(), worker.getGrpcStateFnServer(), mapTaskNetwork, options, mapTask.getStageName(), readerRegistry, sinkRegistry, context, pendingDeltaCounters, idGenerator);
            ReadOperation readOperation = mapTaskExecutor.getReadOperation();
            // Disable progress updates, since the results are unused for streaming
            // and enabling them would start an extra thread.
            readOperation.setProgressUpdatePeriodMs(ReadOperation.DONT_UPDATE_PERIODICALLY);
            Preconditions.checkState(mapTaskExecutor.supportsRestart(), "Streaming runner requires all operations support restart.");
            Coder<?> readCoder = CloudObjects.coderFromCloudObject(CloudObject.fromSpec(readOutputNode.getInstructionOutput().getCodec()));
            Coder<?> keyCoder = extractKeyCoder(readCoder);
            // If using a custom source, count bytes read for autoscaling.
            if (CustomSources.class.getName().equals(readNode.getParallelInstruction().getRead().getSource().getSpec().get("@type"))) {
                NameContext nameContext = NameContext.create(mapTask.getStageName(), readNode.getParallelInstruction().getOriginalName(), readNode.getParallelInstruction().getSystemName(), readNode.getParallelInstruction().getName());
                readOperation.receivers[0].addOutputCounter(new OutputObjectAndByteCounter(new IntrinsicMapTaskExecutorFactory.ElementByteSizeObservableCoder<>(readCoder), mapTaskExecutor.getOutputCounters(), nameContext).setSamplingPeriod(100).countBytes("dataflow_input_size-" + mapTask.getSystemName()));
            }
            executionState = new ExecutionState(mapTaskExecutor, context, keyCoder, executionStateTracker);
        }
        WindmillStateReader stateReader = new WindmillStateReader(metricTrackingWindmillServer, computationId, key, workItem.getShardingKey(), workItem.getWorkToken());
        StateFetcher localStateFetcher = stateFetcher.byteTrackingView();
        // If the read outputs KVs, then we can decode Windmill's byte key into a userland
        // key object and provide it to the execution context for use with per-key state.
        // Otherwise, we pass null.
        // 
        // The coder type that will be present is:
        // WindowedValueCoder(TimerOrElementCoder(KvCoder))
        @Nullable Coder<?> keyCoder = executionState.getKeyCoder();
        @Nullable Object executionKey = keyCoder == null ? null : keyCoder.decode(key.newInput(), Coder.Context.OUTER);
        if (workItem.hasHotKeyInfo()) {
            Windmill.HotKeyInfo hotKeyInfo = workItem.getHotKeyInfo();
            Duration hotKeyAge = Duration.millis(hotKeyInfo.getHotKeyAgeUsec() / 1000);
            // The MapTask instructions are ordered by dependencies, such that the first element
            // is always the shuffle task.
            String stepName = computationState.getMapTask().getInstructions().get(0).getName();
            if (options.isHotKeyLoggingEnabled() && keyCoder != null) {
                hotKeyLogger.logHotKeyDetection(stepName, hotKeyAge, executionKey);
            } else {
                hotKeyLogger.logHotKeyDetection(stepName, hotKeyAge);
            }
        }
        executionState.getContext().start(executionKey, workItem, inputDataWatermark, outputDataWatermark, synchronizedProcessingTime, stateReader, localStateFetcher, outputBuilder);
        // Blocks while executing work.
        executionState.getWorkExecutor().execute();
        Iterables.addAll(this.pendingMonitoringInfos, executionState.getWorkExecutor().extractMetricUpdates());
        commitCallbacks.putAll(executionState.getContext().flushState());
        // Release the execution state for another thread to use.
        computationState.getExecutionStateQueue(worker).offer(executionState);
        executionState = null;
        // Add the output to the commit queue.
        work.setState(State.COMMIT_QUEUED);
        WorkItemCommitRequest commitRequest = outputBuilder.build();
        int byteLimit = maxWorkItemCommitBytes;
        int commitSize = commitRequest.getSerializedSize();
        int estimatedCommitSize = commitSize < 0 ? Integer.MAX_VALUE : commitSize;
        windmillMaxObservedWorkItemCommitBytes.addValue(estimatedCommitSize);
        // Detect overflow of the integer serialized size, or a commit exceeding the byte limit.
        if (commitSize < 0 || commitSize > byteLimit) {
            KeyCommitTooLargeException e = KeyCommitTooLargeException.causedBy(computationId, byteLimit, commitRequest);
            reportFailure(computationId, workItem, e);
            LOG.error(e.toString());
            // Drop the current request in favor of a new, minimal one requesting truncation.
            // Messages, timers, counters, and other commit content will not be used by the
            // service, so we purposefully drop them here.
            commitRequest = buildWorkItemTruncationRequest(key, workItem, estimatedCommitSize);
        }
        commitQueue.put(new Commit(commitRequest, computationState, work));
        // Compute shuffle and state byte statistics; these will be flushed asynchronously.
        long stateBytesWritten = outputBuilder.clearOutputMessages().build().getSerializedSize();
        long shuffleBytesRead = 0;
        for (Windmill.InputMessageBundle bundle : workItem.getMessageBundlesList()) {
            for (Windmill.Message message : bundle.getMessagesList()) {
                shuffleBytesRead += message.getSerializedSize();
            }
        }
        long stateBytesRead = stateReader.getBytesRead() + localStateFetcher.getBytesRead();
        windmillShuffleBytesRead.addValue(shuffleBytesRead);
        windmillStateBytesRead.addValue(stateBytesRead);
        windmillStateBytesWritten.addValue(stateBytesWritten);
        LOG.debug("Processing done for work token: {}", workItem.getWorkToken());
    } catch (Throwable t) {
        if (executionState != null) {
            try {
                executionState.getContext().invalidateCache();
                executionState.getWorkExecutor().close();
            } catch (Exception e) {
                LOG.warn("Failed to close map task executor: ", e);
            } finally {
                // Release references to potentially large objects early.
                executionState = null;
            }
        }
        t = t instanceof UserCodeException ? t.getCause() : t;
        boolean retryLocally = false;
        if (KeyTokenInvalidException.isKeyTokenInvalidException(t)) {
            LOG.debug("Execution of work for computation '{}' on key '{}' failed due to token expiration. " + "Work will not be retried locally.", computationId, key.toStringUtf8());
        } else {
            LastExceptionDataProvider.reportException(t);
            LOG.debug("Failed work: {}", work);
            Duration elapsedTimeSinceStart = new Duration(work.getStartTime(), Instant.now());
            if (!reportFailure(computationId, workItem, t)) {
                LOG.error("Execution of work for computation '{}' on key '{}' failed with uncaught exception, " + "and Windmill indicated not to retry locally.", computationId, key.toStringUtf8(), t);
            } else if (isOutOfMemoryError(t)) {
                File heapDump = memoryMonitor.tryToDumpHeap();
                LOG.error("Execution of work for computation '{}' for key '{}' failed with out-of-memory. " + "Work will not be retried locally. Heap dump {}.", computationId, key.toStringUtf8(), heapDump == null ? "not written" : ("written to '" + heapDump + "'"), t);
            } else if (elapsedTimeSinceStart.isLongerThan(MAX_LOCAL_PROCESSING_RETRY_DURATION)) {
                LOG.error("Execution of work for computation '{}' for key '{}' failed with uncaught exception, " + "and it will not be retried locally because the elapsed time since start {} " + "exceeds {}.", computationId, key.toStringUtf8(), elapsedTimeSinceStart, MAX_LOCAL_PROCESSING_RETRY_DURATION, t);
            } else {
                LOG.error("Execution of work for computation '{}' on key '{}' failed with uncaught exception. " + "Work will be retried locally.", computationId, key.toStringUtf8(), t);
                retryLocally = true;
            }
        }
        if (retryLocally) {
            // Try again after some delay and at the end of the queue to avoid a tight loop.
            sleep(retryLocallyDelayMs);
            workUnitExecutor.forceExecute(work, work.getWorkItem().getSerializedSize());
        } else {
            // Consider the item invalid. It will eventually be retried by Windmill if it still needs to
            // be processed.
            computationState.completeWork(ShardedKey.create(key, workItem.getShardingKey()), workItem.getWorkToken());
        }
    } finally {
        // Update total processing time counters. Updating in the finally clause ensures that
        // work items that throw exceptions are also accounted for in time spent.
        long processingTimeMsecs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - processingStartTimeNanos);
        stageInfo.totalProcessingMsecs.addValue(processingTimeMsecs);
        // Attribute all processing time to timers if the work item contains any timers; a more
        // precise split between timers and messages could be computed either here or in DFE.
        if (work.getWorkItem().hasTimers()) {
            stageInfo.timerProcessingMsecs.addValue(processingTimeMsecs);
        }
        DataflowWorkerLoggingMDC.setWorkId(null);
        DataflowWorkerLoggingMDC.setStageName(null);
    }
}
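
The comment in process() above says that, when a key coder is present, it is the key half of a WindowedValueCoder(KvCoder) stack. A minimal, self-contained sketch of that decode step follows, using a hypothetical KvCoder and a sample key in place of a real Windmill key:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarLongCoder;

public class KeyDecodingSketch {
    public static void main(String[] args) throws Exception {
        // Key coder extracted from a hypothetical WindowedValueCoder(KvCoder(...)) read coder.
        Coder<String> keyCoder = KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of()).getKeyCoder();
        // Windmill hands process() the key as raw bytes; encode a sample key the same way.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        keyCoder.encode("user-42", bytes, Coder.Context.OUTER);
        // Mirrors keyCoder.decode(key.newInput(), Coder.Context.OUTER) in process().
        String executionKey = keyCoder.decode(new ByteArrayInputStream(bytes.toByteArray()), Coder.Context.OUTER);
        // Prints "user-42".
        System.out.println(executionKey);
    }
}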

Example 12 with WindowedValueCoder

use of org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder in project beam by apache.

In the class IntrinsicMapTaskExecutorFactory, the method createPartialGroupByKeyOperation:

<K> OperationNode createPartialGroupByKeyOperation(Network<Node, Edge> network, ParallelInstructionNode node, PipelineOptions options, DataflowExecutionContext<?> executionContext, DataflowOperationContext operationContext) throws Exception {
    ParallelInstruction instruction = node.getParallelInstruction();
    PartialGroupByKeyInstruction pgbk = instruction.getPartialGroupByKey();
    OutputReceiver[] receivers = getOutputReceivers(network, node);
    Coder<?> windowedCoder = CloudObjects.coderFromCloudObject(CloudObject.fromSpec(pgbk.getInputElementCodec()));
    if (!(windowedCoder instanceof WindowedValueCoder)) {
        throw new IllegalArgumentException(String.format("unexpected kind of input coder for PartialGroupByKeyOperation: %s", windowedCoder));
    }
    Coder<?> elemCoder = ((WindowedValueCoder<?>) windowedCoder).getValueCoder();
    if (!(elemCoder instanceof KvCoder)) {
        throw new IllegalArgumentException(String.format("unexpected kind of input element coder for PartialGroupByKeyOperation: %s", elemCoder));
    }
    @SuppressWarnings("unchecked") KvCoder<K, ?> keyedElementCoder = (KvCoder<K, ?>) elemCoder;
    CloudObject cloudUserFn = pgbk.getValueCombiningFn() != null ? CloudObject.fromSpec(pgbk.getValueCombiningFn()) : null;
    ParDoFn fn = PartialGroupByKeyParDoFns.create(options, keyedElementCoder, cloudUserFn, pgbk.getSideInputs(), Arrays.<Receiver>asList(receivers), executionContext, operationContext);
    return OperationNode.create(new ParDoOperation(fn, receivers, operationContext));
}
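
The two guard clauses above enforce a WindowedValueCoder-wrapping-KvCoder input shape before unwrapping it. A minimal sketch of that unwrap, with illustrative element coders in place of a real MapTask codec:

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder;
import org.apache.beam.sdk.values.KV;

public class PgbkCoderSketch {
    public static void main(String[] args) {
        // Stand-in for the coder deserialized from pgbk.getInputElementCodec().
        Coder<WindowedValue<KV<String, Integer>>> windowedCoder =
            WindowedValue.getFullCoder(
                KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()), GlobalWindow.Coder.INSTANCE);
        // Same two-step unwrap as createPartialGroupByKeyOperation.
        Coder<?> elemCoder = ((WindowedValueCoder<?>) windowedCoder).getValueCoder();
        KvCoder<?, ?> keyedElementCoder = (KvCoder<?, ?>) elemCoder;
        // Prints StringUtf8Coder and VarIntCoder respectively.
        System.out.println(keyedElementCoder.getKeyCoder());
        System.out.println(keyedElementCoder.getValueCoder());
    }
}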

Example 13 with WindowedValueCoder

use of org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder in project beam by apache.

In the class IsmReaderFactory, the method createImpl:

<V> NativeReader<?> createImpl(CloudObject spec, Coder<?> coder, PipelineOptions options, DataflowExecutionContext executionContext, DataflowOperationContext operationContext) throws Exception {
    final ResourceId resourceId = FileSystems.matchNewResource(getString(spec, WorkerPropertyNames.FILENAME), false);
    checkArgument(coder instanceof WindowedValueCoder, "%s only supports using %s but got %s.", IsmReader.class, WindowedValueCoder.class, coder);
    @SuppressWarnings("unchecked") WindowedValueCoder<IsmRecord<V>> windowedCoder = (WindowedValueCoder<IsmRecord<V>>) coder;
    checkArgument(windowedCoder.getValueCoder() instanceof IsmRecordCoder, "%s only supports using %s but got %s.", IsmReader.class, IsmRecordCoder.class, windowedCoder.getValueCoder());
    @SuppressWarnings("unchecked") final IsmRecordCoder<V> ismCoder = (IsmRecordCoder<V>) windowedCoder.getValueCoder();
    checkArgument(executionContext instanceof BatchModeExecutionContext, "%s only supports using %s but got %s.", IsmReader.class, BatchModeExecutionContext.class, executionContext);
    final BatchModeExecutionContext execContext = (BatchModeExecutionContext) executionContext;
    // Use the execution context's logical reference cache so concurrent work items share
    // a single reader for the same file.
    return execContext.<IsmReaderKey, NativeReader<?>>getLogicalReferenceCache().get(new IsmReaderKey(resourceId.toString()), () -> new IsmReaderImpl<V>(resourceId, ismCoder, execContext.<IsmReaderImpl.IsmShardKey, WeightedValue<NavigableMap<RandomAccessData, WindowedValue<IsmRecord<V>>>>>getDataCache()));
}
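
The checkArgument calls above follow a fail-fast pattern: reject a coder of the wrong shape early, with a descriptive message, instead of failing later with a ClassCastException. A small sketch of that pattern, using a hypothetical guard method and a deliberately wrong coder:

import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument;

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder;

public class CoderGuardSketch {
    static void requireWindowedValueCoder(Coder<?> coder) {
        // Mirrors the IsmReaderFactory guard: validate the coder's shape up front.
        checkArgument(
            coder instanceof WindowedValueCoder,
            "IsmReader only supports using %s but got %s.",
            WindowedValueCoder.class,
            coder);
    }

    public static void main(String[] args) {
        try {
            // Not a windowed coder, so the guard throws.
            requireWindowedValueCoder(VarIntCoder.of());
        } catch (IllegalArgumentException expected) {
            System.out.println(expected.getMessage());
        }
    }
}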

Example 14 with WindowedValueCoder

use of org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder in project beam by apache.

In the class GroupAlsoByWindowParDoFnFactory, the method create:

@Override
public ParDoFn create(PipelineOptions options, CloudObject cloudUserFn, @Nullable List<SideInputInfo> sideInputInfos, TupleTag<?> mainOutputTag, Map<TupleTag<?>, Integer> outputTupleTagsToReceiverIndices, final DataflowExecutionContext<?> executionContext, DataflowOperationContext operationContext) throws Exception {
    Map.Entry<TupleTag<?>, Integer> entry = Iterables.getOnlyElement(outputTupleTagsToReceiverIndices.entrySet());
    checkArgument(entry.getKey().equals(mainOutputTag), "Output tags should reference only the main output tag: %s vs %s", entry.getKey(), mainOutputTag);
    checkArgument(entry.getValue() == 0, "There should be a single receiver, but using receiver index %s", entry.getValue());
    byte[] encodedWindowingStrategy = getBytes(cloudUserFn, PropertyNames.SERIALIZED_FN);
    WindowingStrategy windowingStrategy;
    try {
        windowingStrategy = deserializeWindowingStrategy(encodedWindowingStrategy);
    } catch (Exception e) {
        // TODO: Catch block disappears, becoming an error once Python SDK is compliant.
        if (DataflowRunner.hasExperiment(options.as(DataflowPipelineDebugOptions.class), "beam_fn_api")) {
            LOG.info("FnAPI: Unable to deserialize windowing strategy, assuming default", e);
            windowingStrategy = WindowingStrategy.globalDefault();
        } else {
            throw e;
        }
    }
    byte[] serializedCombineFn = getBytes(cloudUserFn, WorkerPropertyNames.COMBINE_FN, null);
    AppliedCombineFn<?, ?, ?, ?> combineFn = null;
    if (serializedCombineFn != null) {
        Object combineFnObj = SerializableUtils.deserializeFromByteArray(serializedCombineFn, "serialized combine fn");
        checkArgument(combineFnObj instanceof AppliedCombineFn, "unexpected kind of AppliedCombineFn: " + combineFnObj.getClass().getName());
        combineFn = (AppliedCombineFn<?, ?, ?, ?>) combineFnObj;
    }
    Map<String, Object> inputCoderObject = getObject(cloudUserFn, WorkerPropertyNames.INPUT_CODER);
    Coder<?> inputCoder = CloudObjects.coderFromCloudObject(CloudObject.fromSpec(inputCoderObject));
    checkArgument(inputCoder instanceof WindowedValueCoder, "Expected WindowedValueCoder for inputCoder, got: " + inputCoder.getClass().getName());
    @SuppressWarnings("unchecked") WindowedValueCoder<?> windowedValueCoder = (WindowedValueCoder<?>) inputCoder;
    Coder<?> elemCoder = windowedValueCoder.getValueCoder();
    checkArgument(elemCoder instanceof KvCoder, "Expected KvCoder for inputCoder, got: " + elemCoder.getClass().getName());
    @SuppressWarnings("unchecked") KvCoder<?, ?> kvCoder = (KvCoder<?, ?>) elemCoder;
    boolean isStreamingPipeline = options.as(StreamingOptions.class).isStreaming();
    SideInputReader sideInputReader = NullSideInputReader.empty();
    @Nullable AppliedCombineFn<?, ?, ?, ?> maybeMergingCombineFn = null;
    if (combineFn != null) {
        sideInputReader = executionContext.getSideInputReader(sideInputInfos, combineFn.getSideInputViews(), operationContext);
        String phase = getString(cloudUserFn, WorkerPropertyNames.PHASE, CombinePhase.ALL);
        checkArgument(phase.equals(CombinePhase.ALL) || phase.equals(CombinePhase.MERGE), "Unexpected phase: %s", phase);
        if (phase.equals(CombinePhase.MERGE)) {
            maybeMergingCombineFn = makeAppliedMergingFunction(combineFn);
        } else {
            maybeMergingCombineFn = combineFn;
        }
    }
    StateInternalsFactory<?> stateInternalsFactory = key -> executionContext.getStepContext(operationContext).stateInternals();
    // This will be a GABW Fn for either batch or streaming, with combiner in it or not
    GroupAlsoByWindowFn<?, ?> fn;
    // This will be a FakeKeyedWorkItemCoder for streaming or null for batch
    Coder<?> gabwInputCoder;
    // TODO: do not do this with mess of "if"
    if (isStreamingPipeline) {
        if (maybeMergingCombineFn == null) {
            fn = StreamingGroupAlsoByWindowsDoFns.createForIterable(windowingStrategy, stateInternalsFactory, ((KvCoder) kvCoder).getValueCoder());
            gabwInputCoder = WindmillKeyedWorkItem.FakeKeyedWorkItemCoder.of(kvCoder);
        } else {
            fn = StreamingGroupAlsoByWindowsDoFns.create(windowingStrategy, stateInternalsFactory, (AppliedCombineFn) maybeMergingCombineFn, ((KvCoder) kvCoder).getKeyCoder());
            gabwInputCoder = WindmillKeyedWorkItem.FakeKeyedWorkItemCoder.of(((AppliedCombineFn) maybeMergingCombineFn).getKvCoder());
        }
    } else {
        if (maybeMergingCombineFn == null) {
            fn = BatchGroupAlsoByWindowsDoFns.createForIterable(windowingStrategy, stateInternalsFactory, ((KvCoder) kvCoder).getValueCoder());
            gabwInputCoder = null;
        } else {
            fn = BatchGroupAlsoByWindowsDoFns.create(windowingStrategy, (AppliedCombineFn) maybeMergingCombineFn);
            gabwInputCoder = null;
        }
    }
    // TODO: or anyhow related to it, do not do this with mess of "if"
    if (maybeMergingCombineFn != null) {
        return new GroupAlsoByWindowsParDoFn(options, fn, windowingStrategy, ((AppliedCombineFn) maybeMergingCombineFn).getSideInputViews(), gabwInputCoder, sideInputReader, mainOutputTag, executionContext.getStepContext(operationContext));
    } else {
        return new GroupAlsoByWindowsParDoFn(options, fn, windowingStrategy, null, gabwInputCoder, sideInputReader, mainOutputTag, executionContext.getStepContext(operationContext));
    }
}
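
Behind the factory's inputCoder checks is the assumption that every element arrives as a WindowedValue<KV<...>> encoded by a WindowedValueCoder. A minimal round-trip sketch of that input shape, with illustrative element coders rather than a real pipeline's:

import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder;
import org.apache.beam.sdk.values.KV;

public class GabwInputCoderSketch {
    public static void main(String[] args) throws Exception {
        // The WindowedValueCoder(KvCoder) shape the factory expects for its input.
        FullWindowedValueCoder<KV<String, Integer>> coder =
            WindowedValue.getFullCoder(
                KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()), GlobalWindow.Coder.INSTANCE);
        WindowedValue<KV<String, Integer>> value = WindowedValue.valueInGlobalWindow(KV.of("key", 1));
        // Encode and decode one element, as the runner does at stage boundaries.
        byte[] encoded = CoderUtils.encodeToByteArray(coder, value);
        WindowedValue<KV<String, Integer>> decoded = CoderUtils.decodeFromByteArray(coder, encoded);
        // Prints KV{key, 1}.
        System.out.println(decoded.getValue());
    }
}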

Example 15 with WindowedValueCoder

use of org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder in project beam by apache.

In the class SparkBatchPortablePipelineTranslator, the method broadcastSideInputs:

/**
 * Broadcast the side inputs of an executable stage. *This can be expensive.*
 *
 * @return Map from PCollection ID to Spark broadcast variable and coder to decode its contents.
 */
private static <SideInputT> ImmutableMap<String, Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>>> broadcastSideInputs(RunnerApi.ExecutableStagePayload stagePayload, SparkTranslationContext context) {
    Map<String, Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>>> broadcastVariables = new HashMap<>();
    for (SideInputId sideInputId : stagePayload.getSideInputsList()) {
        RunnerApi.Components stagePayloadComponents = stagePayload.getComponents();
        String collectionId = stagePayloadComponents.getTransformsOrThrow(sideInputId.getTransformId()).getInputsOrThrow(sideInputId.getLocalName());
        if (broadcastVariables.containsKey(collectionId)) {
            // This PCollection has already been broadcast.
            continue;
        }
        Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>> tuple2 = broadcastSideInput(collectionId, stagePayloadComponents, context);
        broadcastVariables.put(collectionId, tuple2);
    }
    return ImmutableMap.copyOf(broadcastVariables);
}
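
On the executor side, each broadcast side input is a list of encoded byte[] elements plus the WindowedValueCoder needed to decode them. A minimal sketch of that decode step, building the broadcast list by hand where the real code would call Broadcast.value():

import java.util.Arrays;
import java.util.List;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder;

public class SideInputDecodeSketch {
    public static void main(String[] args) throws Exception {
        // The coder half of the Tuple2 returned by broadcastSideInputs.
        WindowedValueCoder<String> coder =
            WindowedValue.getFullCoder(StringUtf8Coder.of(), GlobalWindow.Coder.INSTANCE);
        // Stand-in for Broadcast<List<byte[]>>.value() on a Spark executor.
        List<byte[]> broadcastBytes = Arrays.asList(
            CoderUtils.encodeToByteArray(coder, WindowedValue.valueInGlobalWindow("a")),
            CoderUtils.encodeToByteArray(coder, WindowedValue.valueInGlobalWindow("b")));
        // Decode each element back into a WindowedValue; prints "a" then "b".
        for (byte[] bytes : broadcastBytes) {
            WindowedValue<String> value = CoderUtils.decodeFromByteArray(coder, bytes);
            System.out.println(value.getValue());
        }
    }
}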
