use of org.apache.beam.sdk.common.runner.v1.RunnerApi.PTransform in project beam by apache.
the class ProcessBundleHandler method createBundleProcessor.
private BundleProcessor createBundleProcessor(String bundleId, BeamFnApi.ProcessBundleRequest processBundleRequest) throws IOException {
BeamFnApi.ProcessBundleDescriptor bundleDescriptor = fnApiRegistry.apply(bundleId);
SetMultimap<String, String> pCollectionIdsToConsumingPTransforms = HashMultimap.create();
MetricsContainerStepMap metricsContainerRegistry = new MetricsContainerStepMap();
ExecutionStateTracker stateTracker = new ExecutionStateTracker(ExecutionStateSampler.instance());
PCollectionConsumerRegistry pCollectionConsumerRegistry = new PCollectionConsumerRegistry(metricsContainerRegistry, stateTracker);
HashSet<String> processedPTransformIds = new HashSet<>();
PTransformFunctionRegistry startFunctionRegistry = new PTransformFunctionRegistry(metricsContainerRegistry, stateTracker, ExecutionStateTracker.START_STATE_NAME);
PTransformFunctionRegistry finishFunctionRegistry = new PTransformFunctionRegistry(metricsContainerRegistry, stateTracker, ExecutionStateTracker.FINISH_STATE_NAME);
List<ThrowingRunnable> resetFunctions = new ArrayList<>();
List<ThrowingRunnable> tearDownFunctions = new ArrayList<>();
List<ProgressRequestCallback> progressRequestCallbacks = new ArrayList<>();
// Build a multimap of PCollection ids to PTransform ids which consume said PCollections
for (Map.Entry<String, RunnerApi.PTransform> entry : bundleDescriptor.getTransformsMap().entrySet()) {
for (String pCollectionId : entry.getValue().getInputsMap().values()) {
pCollectionIdsToConsumingPTransforms.put(pCollectionId, entry.getKey());
}
}
// Instantiate a State API call handler depending on whether a State ApiServiceDescriptor was
// specified.
HandleStateCallsForBundle beamFnStateClient;
if (bundleDescriptor.hasStateApiServiceDescriptor()) {
BeamFnStateClient underlyingClient = beamFnStateGrpcClientCache.forApiServiceDescriptor(bundleDescriptor.getStateApiServiceDescriptor());
beamFnStateClient = new BlockTillStateCallsFinish(underlyingClient);
} else {
beamFnStateClient = new FailAllStateCallsForBundle(processBundleRequest);
}
BundleSplitListener.InMemory splitListener = BundleSplitListener.InMemory.create();
Collection<CallbackRegistration> bundleFinalizationCallbackRegistrations = new ArrayList<>();
BundleFinalizer bundleFinalizer = new BundleFinalizer() {
@Override
public void afterBundleCommit(Instant callbackExpiry, Callback callback) {
bundleFinalizationCallbackRegistrations.add(CallbackRegistration.create(callbackExpiry, callback));
}
};
BundleProcessor bundleProcessor = BundleProcessor.create(processWideCache, bundleDescriptor, startFunctionRegistry, finishFunctionRegistry, resetFunctions, tearDownFunctions, progressRequestCallbacks, splitListener, pCollectionConsumerRegistry, metricsContainerRegistry, stateTracker, beamFnStateClient, bundleFinalizationCallbackRegistrations, runnerCapabilities);
// Create a BeamFnStateClient
for (Map.Entry<String, RunnerApi.PTransform> entry : bundleDescriptor.getTransformsMap().entrySet()) {
// TODO: Remove source as a root and have it be triggered by the Runner.
if (!DATA_INPUT_URN.equals(entry.getValue().getSpec().getUrn()) && !DATA_OUTPUT_URN.equals(entry.getValue().getSpec().getUrn()) && !JAVA_SOURCE_URN.equals(entry.getValue().getSpec().getUrn()) && !PTransformTranslation.READ_TRANSFORM_URN.equals(entry.getValue().getSpec().getUrn())) {
continue;
}
createRunnerAndConsumersForPTransformRecursively(beamFnStateClient, beamFnDataClient, entry.getKey(), entry.getValue(), bundleProcessor::getInstructionId, bundleProcessor::getCacheTokens, bundleProcessor::getBundleCache, bundleDescriptor, pCollectionIdsToConsumingPTransforms, pCollectionConsumerRegistry, processedPTransformIds, startFunctionRegistry, finishFunctionRegistry, resetFunctions::add, tearDownFunctions::add, (apiServiceDescriptor, dataEndpoint) -> {
if (!bundleProcessor.getInboundEndpointApiServiceDescriptors().contains(apiServiceDescriptor)) {
bundleProcessor.getInboundEndpointApiServiceDescriptors().add(apiServiceDescriptor);
}
bundleProcessor.getInboundDataEndpoints().add(dataEndpoint);
}, (timerEndpoint) -> {
if (!bundleDescriptor.hasTimerApiServiceDescriptor()) {
throw new IllegalStateException(String.format("Timers are unsupported because the " + "ProcessBundleRequest %s does not provide a timer ApiServiceDescriptor.", bundleId));
}
bundleProcessor.getTimerEndpoints().add(timerEndpoint);
}, progressRequestCallbacks::add, splitListener, bundleFinalizer, bundleProcessor.getChannelRoots(), bundleProcessor.getOutboundAggregators(), bundleProcessor.getRunnerCapabilities());
}
bundleProcessor.finish();
return bundleProcessor;
}
use of org.apache.beam.sdk.common.runner.v1.RunnerApi.PTransform in project beam by apache.
the class RemoteGrpcPortWriteTest method toFromPTransform.
@Test
public void toFromPTransform() throws InvalidProtocolBufferException {
RemoteGrpcPort port = RemoteGrpcPort.newBuilder().setApiServiceDescriptor(ApiServiceDescriptor.newBuilder().setUrl("foo").setAuthentication(AuthenticationSpec.getDefaultInstance()).build()).build();
RemoteGrpcPortWrite write = RemoteGrpcPortWrite.writeToPort("myPort", port);
PTransform ptransform = PTransform.parseFrom(write.toPTransform().toByteArray());
RemoteGrpcPortWrite serDeWrite = RemoteGrpcPortWrite.fromPTransform(ptransform);
assertThat(serDeWrite, equalTo(write));
assertThat(serDeWrite.getPort(), equalTo(write.getPort()));
assertThat(serDeWrite.toPTransform(), equalTo(ptransform));
}
use of org.apache.beam.sdk.common.runner.v1.RunnerApi.PTransform in project beam by apache.
the class GreedyPipelineFuser method sanitizeDanglingPTransformInputs.
private static ExecutableStage sanitizeDanglingPTransformInputs(ExecutableStage stage) {
/* Possible inputs to a PTransform can only be those which are:
* <ul>
* <li>Explicit input PCollection to the stage
* <li>Outputs of a PTransform within the same stage
* <li>Timer PCollections
* <li>Side input PCollections
* <li>Explicit outputs from the stage
* </ul>
*/
Set<String> possibleInputs = new HashSet<>();
possibleInputs.add(stage.getInputPCollection().getId());
possibleInputs.addAll(stage.getOutputPCollections().stream().map(PCollectionNode::getId).collect(Collectors.toSet()));
possibleInputs.addAll(stage.getSideInputs().stream().map(s -> s.collection().getId()).collect(Collectors.toSet()));
possibleInputs.addAll(stage.getTransforms().stream().flatMap(t -> t.getTransform().getOutputsMap().values().stream()).collect(Collectors.toSet()));
Set<String> danglingInputs = stage.getTransforms().stream().flatMap(t -> t.getTransform().getInputsMap().values().stream()).filter(in -> !possibleInputs.contains(in)).collect(Collectors.toSet());
ImmutableList.Builder<PTransformNode> pTransformNodesBuilder = ImmutableList.builder();
for (PTransformNode transformNode : stage.getTransforms()) {
PTransform transform = transformNode.getTransform();
Map<String, String> validInputs = transform.getInputsMap().entrySet().stream().filter(e -> !danglingInputs.contains(e.getValue())).collect(Collectors.toMap(Entry::getKey, Entry::getValue));
if (!validInputs.equals(transform.getInputsMap())) {
// Dangling inputs found so recreate pTransform without the dangling inputs.
transformNode = PipelineNode.pTransform(transformNode.getId(), transform.toBuilder().clearInputs().putAllInputs(validInputs).build());
}
pTransformNodesBuilder.add(transformNode);
}
ImmutableList<PTransformNode> pTransformNodes = pTransformNodesBuilder.build();
Components.Builder componentBuilder = stage.getComponents().toBuilder();
// Update the pTransforms in components.
componentBuilder.clearTransforms().putAllTransforms(pTransformNodes.stream().collect(Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform)));
Map<String, PCollection> validPCollectionMap = stage.getComponents().getPcollectionsMap().entrySet().stream().filter(e -> !danglingInputs.contains(e.getKey())).collect(Collectors.toMap(Entry::getKey, Entry::getValue));
// Update pCollections in the components.
componentBuilder.clearPcollections().putAllPcollections(validPCollectionMap);
return ImmutableExecutableStage.of(componentBuilder.build(), stage.getEnvironment(), stage.getInputPCollection(), stage.getSideInputs(), stage.getUserStates(), stage.getTimers(), pTransformNodes, stage.getOutputPCollections(), stage.getWireCoderSettings());
}
use of org.apache.beam.sdk.common.runner.v1.RunnerApi.PTransform in project beam by apache.
the class GreedyPipelineFuser method fusePipeline.
/**
* Fuses a {@link Pipeline} into a collection of {@link ExecutableStage}.
*
* <p>The input is the initial collection of siblings sets which will be fused into {@link
* ExecutableStage stages}. A sibling in this context represents a pair of (PCollection,
* PTransform), where the PTransform consumes input elements on a per-element basis from the
* PCollection, represented by a {@link CollectionConsumer}. A sibling set is a collection of
* siblings which can execute within a single {@link ExecutableStage}, determined by {@link
* GreedyPCollectionFusers#isCompatible(PTransformNode, PTransformNode, QueryablePipeline)}.
*
* <p>While a pending sibling set exists:
*
* <ul>
* <li>Retrieve a pending sibling set from the front of the queue.
* <li>If the pending sibling set has already been created, continue. Each materialized {@link
* PTransformNode} can be consumed by any number of {@link ExecutableStage stages}, but each
* {@link PTransformNode} may only be present in a single stage rooted at a single {@link
* PCollectionNode}, otherwise it will process elements of that {@link PCollectionNode}
* multiple times.
* <li>Create a {@link GreedyStageFuser} with those siblings as the initial consuming transforms
* of the stage
* <li>For each materialized {@link PCollectionNode}, find all of the descendant in-environment
* consumers. See {@link #getDescendantConsumers(PCollectionNode)} for details.
* <li>Construct all of the sibling sets from the descendant in-environment consumers, and add
* them to the queue of sibling sets.
* </ul>
*/
private FusedPipeline fusePipeline(Collection<PTransformNode> initialUnfusedTransforms, NavigableSet<NavigableSet<CollectionConsumer>> initialConsumers, Set<String> requirements) {
Map<CollectionConsumer, ExecutableStage> consumedCollectionsAndTransforms = new HashMap<>();
Set<ExecutableStage> stages = new LinkedHashSet<>();
Set<PTransformNode> unfusedTransforms = new LinkedHashSet<>(initialUnfusedTransforms);
Queue<Set<CollectionConsumer>> pendingSiblingSets = new ArrayDeque<>(initialConsumers);
while (!pendingSiblingSets.isEmpty()) {
// Only introduce new PCollection consumers. Not performing this introduces potential
// duplicate paths through the pipeline.
Set<CollectionConsumer> candidateSiblings = pendingSiblingSets.poll();
Set<CollectionConsumer> siblingSet = Sets.difference(candidateSiblings, consumedCollectionsAndTransforms.keySet());
checkState(siblingSet.equals(candidateSiblings) || siblingSet.isEmpty(), "Inconsistent collection of siblings reported for a %s. Initial attempt missed %s", PCollectionNode.class.getSimpleName(), siblingSet);
if (siblingSet.isEmpty()) {
LOG.debug("Filtered out duplicate stage root {}", candidateSiblings);
continue;
}
// Create the stage with these siblings as the initial consuming transforms
ExecutableStage stage = fuseSiblings(siblingSet);
// don't place them in multiple stages.
for (CollectionConsumer sibling : siblingSet) {
consumedCollectionsAndTransforms.put(sibling, stage);
}
stages.add(stage);
for (PCollectionNode materializedOutput : stage.getOutputPCollections()) {
// Get all of the descendant consumers of each materialized PCollection, and add them to the
// queue of pending siblings.
DescendantConsumers descendantConsumers = getDescendantConsumers(materializedOutput);
unfusedTransforms.addAll(descendantConsumers.getUnfusedNodes());
NavigableSet<NavigableSet<CollectionConsumer>> siblings = groupSiblings(descendantConsumers.getFusibleConsumers());
pendingSiblingSets.addAll(siblings);
}
}
// TODO: Figure out where to store this.
DeduplicationResult deduplicated = OutputDeduplicator.ensureSingleProducer(pipeline, stages, unfusedTransforms);
// as can compatible producers/consumers if a PCollection is only materialized once.
return FusedPipeline.of(deduplicated.getDeduplicatedComponents(), stages.stream().map(stage -> deduplicated.getDeduplicatedStages().getOrDefault(stage, stage)).map(GreedyPipelineFuser::sanitizeDanglingPTransformInputs).collect(Collectors.toSet()), Sets.union(deduplicated.getIntroducedTransforms(), unfusedTransforms.stream().map(transform -> deduplicated.getDeduplicatedTransforms().getOrDefault(transform.getId(), transform)).collect(Collectors.toSet())), requirements);
}
use of org.apache.beam.sdk.common.runner.v1.RunnerApi.PTransform in project beam by apache.
the class ExecutableStage method toPTransform.
/**
* Returns a composite {@link PTransform} which is equivalent to this {@link ExecutableStage} as
* follows:
*
* <ul>
* <li>The {@link PTransform#getSubtransformsList()} is empty. This ensures that executable
* stages are treated as primitive transforms.
* <li>The only {@link PCollection PCollections} in the {@link PTransform#getInputsMap()} is the
* result of {@link #getInputPCollection()} and {@link #getSideInputs()}.
* <li>The output {@link PCollection PCollections} in the values of {@link
* PTransform#getOutputsMap()} are the {@link PCollectionNode PCollections} returned by
* {@link #getOutputPCollections()}.
* <li>The {@link PTransform#getSpec()} contains an {@link ExecutableStagePayload} with inputs
* and outputs equal to the PTransform's inputs and outputs, and transforms equal to the
* result of {@link #getTransforms}.
* </ul>
*
* <p>The executable stage can be reconstructed from the resulting {@link ExecutableStagePayload}
* via {@link #fromPayload(ExecutableStagePayload)}.
*/
default PTransform toPTransform(String uniqueName) {
PTransform.Builder pt = PTransform.newBuilder().setUniqueName(uniqueName);
ExecutableStagePayload.Builder payload = ExecutableStagePayload.newBuilder();
payload.setEnvironment(getEnvironment());
payload.addAllWireCoderSettings(getWireCoderSettings());
// Populate inputs and outputs of the stage payload and outer PTransform simultaneously.
PCollectionNode input = getInputPCollection();
pt.putInputs("input", getInputPCollection().getId());
payload.setInput(input.getId());
for (SideInputReference sideInput : getSideInputs()) {
// Side inputs of the ExecutableStage itself can be uniquely identified by inner PTransform
// name and local name.
String outerLocalName = String.format("%s:%s", sideInput.transform().getId(), sideInput.localName());
pt.putInputs(outerLocalName, sideInput.collection().getId());
payload.addSideInputs(SideInputId.newBuilder().setTransformId(sideInput.transform().getId()).setLocalName(sideInput.localName()));
}
for (UserStateReference userState : getUserStates()) {
payload.addUserStates(UserStateId.newBuilder().setTransformId(userState.transform().getId()).setLocalName(userState.localName()));
}
for (TimerReference timer : getTimers()) {
payload.addTimers(TimerId.newBuilder().setTransformId(timer.transform().getId()).setLocalName(timer.localName()));
}
int outputIndex = 0;
for (PCollectionNode output : getOutputPCollections()) {
pt.putOutputs(String.format("materialized_%d", outputIndex), output.getId());
payload.addOutputs(output.getId());
outputIndex++;
}
// stage payload.
for (PTransformNode transform : getTransforms()) {
payload.addTransforms(transform.getId());
}
payload.setComponents(getComponents().toBuilder().clearTransforms().putAllTransforms(getTransforms().stream().collect(Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform))));
pt.setSpec(FunctionSpec.newBuilder().setUrn(ExecutableStage.URN).setPayload(payload.build().toByteString()).build());
return pt.build();
}
Aggregations