Search in sources :

Example 1 with JobBundleFactory

use of org.apache.beam.runners.fnexecution.control.JobBundleFactory in project flink by apache.

the class PythonTestUtils method createMockJobBundleFactory.

/**
 * Creates a Mockito spy of {@link JobBundleFactory} for tests.
 *
 * <p>The returned spy answers {@code forStage(any())} with a spied {@link StageBundleFactory}.
 * That stage factory is stubbed to return a spied process bundle descriptor and a spied
 * {@link RemoteBundle}; the remote bundle reports a single spied receiver under the key
 * {@code "input"}.
 *
 * @return the stubbed job bundle factory spy
 */
public static JobBundleFactory createMockJobBundleFactory() {
    // Plain container; it is filled in once the receiver spy has been created below.
    Map<String, FnDataReceiver> receiversByName = new HashMap<>();

    // NOTE: the Mockito calls are kept in a fixed order on purpose, since stubbing relies on
    // Mockito's internal (thread-local) state.
    JobBundleFactory jobFactory = spy(JobBundleFactory.class);
    StageBundleFactory stageFactory = spy(StageBundleFactory.class);
    when(jobFactory.forStage(any())).thenReturn(stageFactory);

    ProcessBundleDescriptors.ExecutableProcessBundleDescriptor descriptor =
            spy(ProcessBundleDescriptors.ExecutableProcessBundleDescriptor.class);
    when(stageFactory.getProcessBundleDescriptor()).thenReturn(descriptor);

    RemoteBundle bundle = spy(RemoteBundle.class);
    try {
        when(stageFactory.getBundle(
                        any(OutputReceiverFactory.class),
                        any(TimerReceiverFactory.class),
                        any(StateRequestHandler.class),
                        any(BundleProgressHandler.class)))
                .thenReturn(bundle);
    } catch (Exception ignored) {
        // Deliberately ignored: presumably the catch is only needed because getBundle
        // declares a checked exception (TODO confirm).
    }

    FnDataReceiver<WindowedValue<?>> receiver = spy(FnDataReceiver.class);
    receiversByName.put("input", receiver);
    when(bundle.getInputReceivers()).thenReturn(receiversByName);

    return jobFactory;
}
Also used : StateRequestHandler(org.apache.beam.runners.fnexecution.state.StateRequestHandler) FnDataReceiver(org.apache.beam.sdk.fn.data.FnDataReceiver) HashMap(java.util.HashMap) ProcessBundleDescriptors(org.apache.beam.runners.fnexecution.control.ProcessBundleDescriptors) JobBundleFactory(org.apache.beam.runners.fnexecution.control.JobBundleFactory) StageBundleFactory(org.apache.beam.runners.fnexecution.control.StageBundleFactory) OutputReceiverFactory(org.apache.beam.runners.fnexecution.control.OutputReceiverFactory) WindowedValue(org.apache.beam.sdk.util.WindowedValue) TimerReceiverFactory(org.apache.beam.runners.fnexecution.control.TimerReceiverFactory) RemoteBundle(org.apache.beam.runners.fnexecution.control.RemoteBundle) BundleProgressHandler(org.apache.beam.runners.fnexecution.control.BundleProgressHandler)

Example 2 with JobBundleFactory

use of org.apache.beam.runners.fnexecution.control.JobBundleFactory in project beam by apache.

the class BeamFnMapTaskExecutorFactory method create.

/**
 * Creates a new {@link DataflowMapTaskExecutor} by rewriting the nodes of the given network into
 * operations and wiring each operation to the output receivers that precede it.
 *
 * <p>The {@code network} argument is modified in place: every
 * {@code Networks.replaceDirectedNetworkNodes} call below swaps one family of nodes for its
 * executable counterpart. The replacement steps run in a fixed order; the comments on the
 * individual steps note where one step expects an earlier one to have completed.
 *
 * @param instructionRequestHandler handler passed to the operation transforms and, when executable
 *     stage bundle execution is enabled, to the remote environment factory
 * @param grpcDataFnServer server whose data service is handed to the gRPC port and side input
 *     transforms
 * @param dataApiServiceDescriptor descriptor passed to the streaming side input transform
 * @param grpcStateFnServer server whose state service is handed to the register-fn transform
 * @param network the network to rewrite (mutated by this method)
 * @param options pipeline options; must have the {@code beam_fn_api} experiment enabled
 * @param stageName name used when creating receivers, operations and the name context
 * @param readerFactory factory passed to the parallel instruction transform
 * @param sinkFactory factory passed to the parallel instruction transform
 * @param executionContext context used to create operation contexts and the state tracker
 * @param counterSet counter set shared with the returned executor
 * @param idGenerator id generator passed to the transforms that need fresh ids
 * @return an executor over the operations of the rewritten network, in topological order
 * @throws IllegalArgumentException if the {@code beam_fn_api} experiment is not enabled
 */
@Override
public DataflowMapTaskExecutor create(InstructionRequestHandler instructionRequestHandler, GrpcFnServer<GrpcDataService> grpcDataFnServer, Endpoints.ApiServiceDescriptor dataApiServiceDescriptor, GrpcFnServer<GrpcStateService> grpcStateFnServer, MutableNetwork<Node, Edge> network, PipelineOptions options, String stageName, ReaderFactory readerFactory, SinkFactory sinkFactory, DataflowExecutionContext<?> executionContext, CounterSet counterSet, IdGenerator idGenerator) {
    // TODO: remove this once we trust the code paths
    checkArgument(DataflowRunner.hasExperiment(options.as(DataflowPipelineDebugOptions.class), "beam_fn_api"), "%s should only be used when beam_fn_api is enabled", getClass().getSimpleName());
    // Swap out all the InstructionOutput nodes with OutputReceiver nodes
    Networks.replaceDirectedNetworkNodes(network, createOutputReceiversTransform(stageName, counterSet));
    if (DataflowRunner.hasExperiment(options.as(DataflowPipelineDebugOptions.class), "use_executable_stage_bundle_execution")) {
        LOG.debug("Using SingleEnvironmentInstanceJobBundleFactory");
        JobBundleFactory jobBundleFactory = SingleEnvironmentInstanceJobBundleFactory.create(StaticRemoteEnvironmentFactory.forService(instructionRequestHandler), grpcDataFnServer, grpcStateFnServer, idGenerator);
        // If the use_executable_stage_bundle_execution is enabled, use ExecutableStage instead.
        Networks.replaceDirectedNetworkNodes(network, createOperationTransformForExecutableStageNode(network, stageName, executionContext, jobBundleFactory));
    } else {
        // Swap out all the RegisterFnRequest nodes with Operation nodes
        Networks.replaceDirectedNetworkNodes(network, createOperationTransformForRegisterFnNodes(idGenerator, instructionRequestHandler, grpcStateFnServer.getService(), stageName, executionContext));
        // Swap out all the RemoteGrpcPort nodes with Operation nodes, note that it is expected
        // that the RegisterFnRequest nodes have already been replaced.
        // TODO: Set NameContext properly for these operations.
        Networks.replaceDirectedNetworkNodes(network, createOperationTransformForGrpcPortNodes(network, grpcDataFnServer.getService(), executionContext.createOperationContext(NameContext.create(stageName, stageName, stageName, stageName))));
    }
    // Swap out all the FetchAndFilterStreamingSideInput nodes with operation nodes
    Networks.replaceDirectedNetworkNodes(network, createOperationTransformForFetchAndFilterStreamingSideInputNodes(network, idGenerator, instructionRequestHandler, grpcDataFnServer.getService(), dataApiServiceDescriptor, executionContext, stageName));
    // Swap out all the ParallelInstruction nodes with Operation nodes
    Networks.replaceDirectedNetworkNodes(network, createOperationTransformForParallelInstructionNodes(stageName, network, options, readerFactory, sinkFactory, executionContext));
    // Collect all the operations within the network and attach all the operations as receivers
    // to preceding output receivers.
    List<Operation> topoSortedOperations = new ArrayList<>();
    for (OperationNode node : Iterables.filter(Networks.topologicalOrder(network), OperationNode.class)) {
        topoSortedOperations.add(node.getOperation());
        // The class-based filter already yields OutputReceiverNode elements, so no cast is needed.
        for (OutputReceiverNode predecessor : Iterables.filter(network.predecessors(node), OutputReceiverNode.class)) {
            predecessor.getOutputReceiver().addOutput((Receiver) node.getOperation());
        }
    }
    // The guard avoids building the DOT string unless it will be logged; the log call uses the
    // same level as the guard so the dump is reported as debug output.
    if (LOG.isDebugEnabled()) {
        LOG.debug("Map task network: {}", Networks.toDot(network));
    }
    return BeamFnMapTaskExecutor.withSharedCounterSet(topoSortedOperations, counterSet, executionContext.getExecutionStateTracker());
}
Also used : JobBundleFactory(org.apache.beam.runners.fnexecution.control.JobBundleFactory) SingleEnvironmentInstanceJobBundleFactory(org.apache.beam.runners.fnexecution.control.SingleEnvironmentInstanceJobBundleFactory) OperationNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.OperationNode) RegisterRequestNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.RegisterRequestNode) FetchAndFilterStreamingSideInputsNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.FetchAndFilterStreamingSideInputsNode) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) OperationNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.OperationNode) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) Node(org.apache.beam.runners.dataflow.worker.graph.Nodes.Node) ExecutableStageNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ExecutableStageNode) RemoteGrpcPortNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.RemoteGrpcPortNode) OutputReceiverNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.OutputReceiverNode) ArrayList(java.util.ArrayList) DataflowPipelineDebugOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions) FlattenOperation(org.apache.beam.runners.dataflow.worker.util.common.worker.FlattenOperation) ParDoOperation(org.apache.beam.runners.dataflow.worker.util.common.worker.ParDoOperation) ReadOperation(org.apache.beam.runners.dataflow.worker.util.common.worker.ReadOperation) WriteOperation(org.apache.beam.runners.dataflow.worker.util.common.worker.WriteOperation) RemoteGrpcPortReadOperation(org.apache.beam.runners.dataflow.worker.fn.data.RemoteGrpcPortReadOperation) RegisterAndProcessBundleOperation(org.apache.beam.runners.dataflow.worker.fn.control.RegisterAndProcessBundleOperation) RemoteGrpcPortWriteOperation(org.apache.beam.runners.dataflow.worker.fn.data.RemoteGrpcPortWriteOperation) 
ProcessRemoteBundleOperation(org.apache.beam.runners.dataflow.worker.fn.control.ProcessRemoteBundleOperation) Operation(org.apache.beam.runners.dataflow.worker.util.common.worker.Operation)

Example 3 with JobBundleFactory

use of org.apache.beam.runners.fnexecution.control.JobBundleFactory in project flink by apache.

the class BeamPythonFunctionRunner method open.

// ------------------------------------------------------------------------
/**
 * Prepares this runner for use: resets the bundle bookkeeping, opens the environment manager,
 * and creates the stage bundle factory and the progress handler.
 *
 * <p>When a memory manager is present and {@code config.isUsingManagedMemory()} is true, the job
 * bundle factory and environment are obtained from a shared managed-memory resource; otherwise
 * a job bundle factory is created directly and stored in the {@code jobBundleFactory} field.
 *
 * @param config the Python configuration consulted for the managed memory setting
 * @throws IllegalArgumentException if managed memory is used and the managed memory fraction is
 *     not within (0, 1]
 * @throws Exception if opening the environment manager or creating the factories fails
 */
@Override
public void open(PythonConfig config) throws Exception {
    this.bundleStarted = false;
    this.resultBuffer = new LinkedBlockingQueue<>();
    this.reusableResultTuple = new Tuple2<>();
    // The creation of stageBundleFactory depends on the initialized environment manager.
    environmentManager.open();
    PortablePipelineOptions portableOptions = PipelineOptionsFactory.as(PortablePipelineOptions.class);
    if (jobOptions.containsKey(PythonOptions.STATE_CACHE_SIZE.key())) {
        // Forward the configured state cache size as a "<name>=<value>" experiment.
        portableOptions.as(ExperimentalOptions.class).setExperiments(Collections.singletonList(ExperimentalOptions.STATE_CACHE_SIZE + "=" + jobOptions.get(PythonOptions.STATE_CACHE_SIZE.key())));
    }
    Struct pipelineOptions = PipelineOptionsTranslation.toProto(portableOptions);
    if (memoryManager != null && config.isUsingManagedMemory()) {
        // The fraction is passed as the template argument so that it replaces the "%s"
        // placeholder instead of being appended after the final sentence.
        Preconditions.checkArgument(managedMemoryFraction > 0 && managedMemoryFraction <= 1.0, "The configured managed memory fraction for Python worker process must be within (0, 1], was: %s. " + "It may be because the consumer type \"Python\" was missing or set to 0 for the config option \"taskmanager.memory.managed.consumer-weights\".", managedMemoryFraction);
        final LongFunctionWithException<PythonSharedResources, Exception> initializer = (size) -> new PythonSharedResources(createJobBundleFactory(pipelineOptions), createPythonExecutionEnvironment(size));
        sharedResources = memoryManager.getSharedMemoryResourceForManagedMemory(MANAGED_MEMORY_RESOURCE_ID, initializer, managedMemoryFraction);
        LOG.info("Obtained shared Python process of size {} bytes", sharedResources.getSize());
        sharedResources.getResourceHandle().addPythonEnvironmentManager(environmentManager);
        // Held in a local, so the jobBundleFactory field is not assigned on this path.
        // NOTE(review): presumably intentional because the factory belongs to the shared
        // resource - confirm against the close/cleanup logic.
        JobBundleFactory sharedJobBundleFactory = sharedResources.getResourceHandle().getJobBundleFactory();
        RunnerApi.Environment environment = sharedResources.getResourceHandle().getEnvironment();
        stageBundleFactory = createStageBundleFactory(sharedJobBundleFactory, environment);
    } else {
        // there is no way to access the MemoryManager for the batch job of old planner,
        // fallback to the way that spawning a Python process for each Python operator
        jobBundleFactory = createJobBundleFactory(pipelineOptions);
        stageBundleFactory = createStageBundleFactory(jobBundleFactory, createPythonExecutionEnvironment(-1));
    }
    progressHandler = getProgressHandler(flinkMetricContainer);
}
Also used : PythonOptions(org.apache.flink.python.PythonOptions) OpaqueMemoryResource(org.apache.flink.runtime.memory.OpaqueMemoryResource) Arrays(java.util.Arrays) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Tuple2(org.apache.flink.api.java.tuple.Tuple2) PortablePipelineOptions(org.apache.beam.sdk.options.PortablePipelineOptions) LoggerFactory(org.slf4j.LoggerFactory) TimerInternals(org.apache.beam.runners.core.TimerInternals) UserStateReference(org.apache.beam.runners.core.construction.graph.UserStateReference) PythonFunctionRunner(org.apache.flink.python.PythonFunctionRunner) WINDOW_CODER_ID(org.apache.flink.python.Constants.WINDOW_CODER_ID) SideInputReference(org.apache.beam.runners.core.construction.graph.SideInputReference) JobBundleFactory(org.apache.beam.runners.fnexecution.control.JobBundleFactory) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) Map(java.util.Map) TimerReference(org.apache.beam.runners.core.construction.graph.TimerReference) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) FlinkFnApi(org.apache.flink.fnexecution.v1.FlinkFnApi) JobInfo(org.apache.beam.runners.fnexecution.provisioning.JobInfo) TimerReceiverFactory(org.apache.beam.runners.fnexecution.control.TimerReceiverFactory) TimerRegistration(org.apache.flink.streaming.api.operators.python.timer.TimerRegistration) INPUT_COLLECTION_ID(org.apache.flink.python.Constants.INPUT_COLLECTION_ID) TypeSerializer(org.apache.flink.api.common.typeutils.TypeSerializer) StageBundleFactory(org.apache.beam.runners.fnexecution.control.StageBundleFactory) PythonEnvironment(org.apache.flink.python.env.PythonEnvironment) FnDataReceiver(org.apache.beam.sdk.fn.data.FnDataReceiver) Collection(java.util.Collection) ImmutableExecutableStage(org.apache.beam.runners.core.construction.graph.ImmutableExecutableStage) BundleProgressHandler(org.apache.beam.runners.fnexecution.control.BundleProgressHandler) 
FlinkMetricContainer(org.apache.flink.python.metric.FlinkMetricContainer) BeamFnApi(org.apache.beam.model.fnexecution.v1.BeamFnApi) ExecutableStage(org.apache.beam.runners.core.construction.graph.ExecutableStage) Preconditions(org.apache.flink.util.Preconditions) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) ModelCoders(org.apache.beam.runners.core.construction.ModelCoders) LongFunctionWithException(org.apache.flink.util.function.LongFunctionWithException) List(java.util.List) WINDOW_STRATEGY(org.apache.flink.python.Constants.WINDOW_STRATEGY) Optional(java.util.Optional) OUTPUT_COLLECTION_ID(org.apache.flink.python.Constants.OUTPUT_COLLECTION_ID) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ExperimentalOptions(org.apache.beam.sdk.options.ExperimentalOptions) Coder(org.apache.beam.sdk.coders.Coder) ProcessPythonEnvironmentManager(org.apache.flink.python.env.process.ProcessPythonEnvironmentManager) PipelineOptionsTranslation(org.apache.beam.runners.core.construction.PipelineOptionsTranslation) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) Environments(org.apache.beam.runners.core.construction.Environments) WRAPPER_TIMER_CODER_ID(org.apache.flink.python.Constants.WRAPPER_TIMER_CODER_ID) RemoteBundle(org.apache.beam.runners.fnexecution.control.RemoteBundle) BiConsumer(java.util.function.BiConsumer) DefaultJobBundleFactory(org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory) StateRequestHandler(org.apache.beam.runners.fnexecution.state.StateRequestHandler) Nullable(javax.annotation.Nullable) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) Logger(org.slf4j.Logger) ProtoUtils.createCoderProto(org.apache.flink.streaming.api.utils.ProtoUtils.createCoderProto) OutputReceiverFactory(org.apache.beam.runners.fnexecution.control.OutputReceiverFactory) 
ProcessPythonEnvironment(org.apache.flink.python.env.process.ProcessPythonEnvironment) IOException(java.io.IOException) KeyedStateBackend(org.apache.flink.runtime.state.KeyedStateBackend) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) Timer(org.apache.beam.runners.core.construction.Timer) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) TIMER_CODER_ID(org.apache.flink.python.Constants.TIMER_CODER_ID) Internal(org.apache.flink.annotation.Internal) Struct(org.apache.beam.vendor.grpc.v1p26p0.com.google.protobuf.Struct) PythonConfig(org.apache.flink.python.PythonConfig) Collections(java.util.Collections) BeamUrns.getUrn(org.apache.beam.runners.core.construction.BeamUrns.getUrn) JobBundleFactory(org.apache.beam.runners.fnexecution.control.JobBundleFactory) DefaultJobBundleFactory(org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) PortablePipelineOptions(org.apache.beam.sdk.options.PortablePipelineOptions) ExperimentalOptions(org.apache.beam.sdk.options.ExperimentalOptions) LongFunctionWithException(org.apache.flink.util.function.LongFunctionWithException) IOException(java.io.IOException) Struct(org.apache.beam.vendor.grpc.v1p26p0.com.google.protobuf.Struct)

Aggregations

JobBundleFactory (org.apache.beam.runners.fnexecution.control.JobBundleFactory)3 BundleProgressHandler (org.apache.beam.runners.fnexecution.control.BundleProgressHandler)2 OutputReceiverFactory (org.apache.beam.runners.fnexecution.control.OutputReceiverFactory)2 RemoteBundle (org.apache.beam.runners.fnexecution.control.RemoteBundle)2 StageBundleFactory (org.apache.beam.runners.fnexecution.control.StageBundleFactory)2 TimerReceiverFactory (org.apache.beam.runners.fnexecution.control.TimerReceiverFactory)2 StateRequestHandler (org.apache.beam.runners.fnexecution.state.StateRequestHandler)2 FnDataReceiver (org.apache.beam.sdk.fn.data.FnDataReceiver)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 Collection (java.util.Collection)1 Collections (java.util.Collections)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 Optional (java.util.Optional)1 LinkedBlockingQueue (java.util.concurrent.LinkedBlockingQueue)1 BiConsumer (java.util.function.BiConsumer)1