Example use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in the Apache Beam project.
From the class PipelineValidator, method validateComponents.
/**
 * Checks that the entries of {@code components} are internally consistent.
 *
 * <p>The following is verified, in this order:
 *
 * <ul>
 *   <li>no two transforms share a {@code unique_name}, and each transform passes {@code
 *       validateTransform};
 *   <li>every PCollection has a non-empty {@code unique_name} that no other PCollection uses, and
 *       refers to a coder and a windowing strategy present in {@code components};
 *   <li>every windowing strategy refers to a window coder present in {@code components};
 *   <li>every coder's component coder ids are present in {@code components}.
 * </ul>
 *
 * @param context prefix placed at the start of every failure message
 * @param components the components to validate
 * @param requirements passed through unchanged to {@code validateTransform}
 * @throws IllegalArgumentException (via {@code checkArgument}) on the first violated check
 */
private static void validateComponents(String context, Components components, Set<String> requirements) {
  // Maps each unique_name seen so far to the id of the transform that claimed it.
  Map<String, String> transformIdsByName = Maps.newHashMap();
  for (Map.Entry<String, PTransform> entry : components.getTransformsMap().entrySet()) {
    String id = entry.getKey();
    PTransform current = entry.getValue();
    String name = current.getUniqueName();
    String clashingId = transformIdsByName.put(name, id);
    // An unset (empty) unique_name is tolerated for transforms, e.g. for the single root
    // transform, but because it is tracked like any other name at most one transform can use it.
    checkArgument(clashingId == null, "%s: Transforms %s and %s both have unique_name \"%s\"", context, id, clashingId, name);
    validateTransform(id, current, components, requirements);
  }

  // Same bookkeeping for PCollections, which additionally must have a name at all.
  Map<String, String> pcollectionIdsByName = Maps.newHashMap();
  for (Map.Entry<String, PCollection> entry : components.getPcollectionsMap().entrySet()) {
    String id = entry.getKey();
    PCollection current = entry.getValue();
    String name = current.getUniqueName();
    checkArgument(!name.isEmpty(), "%s: PCollection %s does not have a unique_name set", context, id);
    String clashingId = pcollectionIdsByName.put(name, id);
    checkArgument(clashingId == null, "%s: PCollections %s and %s both have unique_name \"%s\"", context, id, clashingId, name);
    String coderId = current.getCoderId();
    checkArgument(components.containsCoders(coderId), "%s: PCollection %s uses unknown coder %s", context, id, coderId);
    String windowingStrategyId = current.getWindowingStrategyId();
    checkArgument(components.containsWindowingStrategies(windowingStrategyId), "%s: PCollection %s uses unknown windowing strategy %s", context, id, windowingStrategyId);
  }

  for (Map.Entry<String, WindowingStrategy> entry : components.getWindowingStrategiesMap().entrySet()) {
    String windowCoderId = entry.getValue().getWindowCoderId();
    checkArgument(components.containsCoders(windowCoderId), "%s: WindowingStrategy %s uses unknown coder %s", context, entry.getKey(), windowCoderId);
  }

  for (String id : components.getCodersMap().keySet()) {
    for (String nestedCoderId : components.getCodersOrThrow(id).getComponentCoderIdsList()) {
      checkArgument(components.containsCoders(nestedCoderId), "%s: Coder %s uses unknown component coder %s", context, id, nestedCoderId);
    }
  }
}
Example use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in the Apache Beam project.
From the class QueryablePipeline, method buildNetwork.
/**
 * Builds a directed network whose nodes are the given transforms together with every PCollection
 * they produce or consume.
 *
 * <p>An edge runs from each transform to each of its outputs, and from each input PCollection to
 * the transform that consumes it. Input edges are {@code SingletonEdge}s when the input's local
 * name is one of the transform's side input names, and {@code PerElementEdge}s otherwise.
 *
 * @param transformIds ids of the transforms to add to the network
 * @param components components used to look up each transform and its produced PCollections
 * @return the populated network
 * @throws IllegalArgumentException (via {@code checkArgument}) if a PCollection ends up with a
 *     number of producers other than one, or if a PCollection is consumed but never produced
 */
private MutableNetwork<PipelineNode, PipelineEdge> buildNetwork(Collection<String> transformIds, Components components) {
  MutableNetwork<PipelineNode, PipelineEdge> graph = NetworkBuilder.directed().allowsParallelEdges(true).allowsSelfLoops(false).build();
  // PCollections first seen as an input; an entry is removed once a producer shows up.
  Set<PCollectionNode> consumedWithoutProducer = new HashSet<>();

  for (String transformId : transformIds) {
    PTransform transform = components.getTransformsOrThrow(transformId);
    // NOTE(review): the node payload is read from this.components while the lookup above uses the
    // components parameter; confirm that both always refer to the same Components.
    PTransformNode transformNode = PipelineNode.pTransform(transformId, this.components.getTransformsOrThrow(transformId));
    graph.addNode(transformNode);

    for (String outputId : transform.getOutputsMap().values()) {
      PCollectionNode outputNode = PipelineNode.pCollection(outputId, components.getPcollectionsOrThrow(outputId));
      graph.addNode(outputNode);
      graph.addEdge(transformNode, outputNode, new PerElementEdge());
      Set<PipelineNode> producers = graph.predecessors(outputNode);
      checkArgument(graph.inDegree(outputNode) == 1, "A %s should have exactly one producing %s, but found %s:\nPCollection:\n%s\nProducers:\n%s", PCollectionNode.class.getSimpleName(), PTransformNode.class.getSimpleName(), producers.size(), outputNode, producers);
      consumedWithoutProducer.remove(outputNode);
    }

    for (Map.Entry<String, String> input : transform.getInputsMap().entrySet()) {
      // The key is the input's local name within the transform; the value is the PCollection id.
      String localName = input.getKey();
      String inputId = input.getValue();
      PCollectionNode inputNode = PipelineNode.pCollection(inputId, this.components.getPcollectionsOrThrow(inputId));
      boolean firstSighting = graph.addNode(inputNode);
      if (firstSighting) {
        // Not in the network before now, so no producer has been seen for it yet.
        consumedWithoutProducer.add(inputNode);
      }
      // The local name decides the edge type: side inputs get a singleton edge.
      boolean isSideInput = getLocalSideInputNames(transform).contains(localName);
      PipelineEdge edge = isSideInput ? new SingletonEdge() : new PerElementEdge();
      graph.addEdge(inputNode, transformNode, edge);
    }
  }

  checkArgument(consumedWithoutProducer.isEmpty(), "%ss %s were consumed but never produced", PCollectionNode.class.getSimpleName(), consumedWithoutProducer);
  return graph;
}
Example use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in the Apache Beam project.
From the class PCollectionTranslationTest, method testEncodeDecodeCycle.
/**
 * Round-trips {@code testCollection} through its proto form and checks that the coder, the
 * windowing strategy and the boundedness of the decoded collection match the original.
 */
@Test
public void testEncodeDecodeCycle() throws Exception {
  // Translate the collection to its proto representation, registering what it needs along the way.
  SdkComponents encodingComponents = SdkComponents.create();
  encodingComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  RunnerApi.PCollection encoded = PCollectionTranslation.toProto(testCollection, encodingComponents);
  RehydratedComponents rehydrated = RehydratedComponents.forComponents(encodingComponents.toComponents());

  // Rebuild a collection from the proto inside a fresh pipeline.
  Pipeline targetPipeline = Pipeline.create();
  PCollection<?> roundTripped = PCollectionTranslation.fromProto(encoded, targetPipeline, rehydrated);

  // The expected windowing strategy is the original one with the registered environment id
  // applied and defaults fixed.
  assertThat(roundTripped.getCoder(), equalTo(testCollection.getCoder()));
  assertThat(roundTripped.getWindowingStrategy(), equalTo(testCollection.getWindowingStrategy().withEnvironmentId(encodingComponents.getOnlyEnvironmentId()).fixDefaults()));
  assertThat(roundTripped.isBounded(), equalTo(testCollection.isBounded()));
}
Example use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in the Apache Beam project.
From the class OutputDeduplicator, method deduplicateStageOutput.
/**
 * Produces a copy of {@code stage} in which every reference to an original {@link PCollectionNode
 * PCollection} listed in {@code originalToPartial} points at its partial replacement instead.
 *
 * <p>Each transform of the stage is rewritten with {@code updateOutputs}, each output PCollection
 * is swapped for its replacement when one exists, and the stage components are rebuilt with the
 * rewritten transforms plus all of the partial PCollections.
 *
 * @param stage the stage to rewrite
 * @param originalToPartial replacement partial PCollection keyed by the id of the original
 * @return the rewritten stage; all other stage attributes are carried over unchanged
 */
private static ExecutableStage deduplicateStageOutput(ExecutableStage stage, Map<String, PCollectionNode> originalToPartial) {
  Collection<PTransformNode> rewrittenTransforms = new ArrayList<>();
  for (PTransformNode node : stage.getTransforms()) {
    PTransform rewritten = updateOutputs(node.getTransform(), originalToPartial);
    rewrittenTransforms.add(PipelineNode.pTransform(node.getId(), rewritten));
  }

  Collection<PCollectionNode> rewrittenOutputs = new ArrayList<>();
  for (PCollectionNode original : stage.getOutputPCollections()) {
    // Outputs without a replacement are kept as they are.
    rewrittenOutputs.add(originalToPartial.getOrDefault(original.getId(), original));
  }

  Map<String, PTransform> transformsById = rewrittenTransforms.stream().collect(Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform));
  Map<String, RunnerApi.PCollection> partialsById = originalToPartial.values().stream().collect(Collectors.toMap(PCollectionNode::getId, PCollectionNode::getPCollection));

  // Replace the transforms wholesale; the partial PCollections are added to the existing ones.
  RunnerApi.Components rewrittenComponents = stage.getComponents().toBuilder().clearTransforms().putAllTransforms(transformsById).putAllPcollections(partialsById).build();

  return ImmutableExecutableStage.of(rewrittenComponents, stage.getEnvironment(), stage.getInputPCollection(), stage.getSideInputs(), stage.getUserStates(), stage.getTimers(), rewrittenTransforms, rewrittenOutputs, stage.getWireCoderSettings());
}
Example use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in the Apache Beam project.
From the class OutputDeduplicator, method ensureSingleProducer.
/**
 * Guarantees that each {@link PCollection} output by the given {@code stages} and {@code
 * unfusedTransforms} has a single producer among them.
 *
 * <p>Whenever a {@link PCollection} is output by more than one stage and/or transform, every one
 * of its producers is rewritten to emit its own partial {@link PCollection}. A Flatten transform
 * is then introduced that consumes those partials and produces the original output.
 *
 * @param pipeline the pipeline whose components serve as the starting point for the result
 * @param stages the executable stages to inspect
 * @param unfusedTransforms the transforms outside any stage to inspect
 * @return the updated components, the introduced flattens, and the rewritten stages and
 *     transforms
 * @throws IllegalStateException if a producer holds neither a stage nor a transform
 */
static DeduplicationResult ensureSingleProducer(QueryablePipeline pipeline, Collection<ExecutableStage> stages, Collection<PTransformNode> unfusedTransforms) {
  RunnerApi.Components.Builder componentsBuilder = pipeline.getComponents().toBuilder();
  Multimap<PCollectionNode, StageOrTransform> producersByOutput = getProducers(pipeline, stages, unfusedTransforms);

  // Collect, per producer, the outputs it shares with at least one other producer.
  Multimap<StageOrTransform, PCollectionNode> outputsToReplace = HashMultimap.create();
  for (Map.Entry<PCollectionNode, Collection<StageOrTransform>> entry : producersByOutput.asMap().entrySet()) {
    Collection<StageOrTransform> producers = entry.getValue();
    if (producers.size() <= 1) {
      continue;
    }
    PCollectionNode sharedOutput = entry.getKey();
    for (StageOrTransform producer : producers) {
      outputsToReplace.put(producer, sharedOutput);
    }
  }

  // Rewrite each affected producer and record the partial PCollections it now emits.
  Map<ExecutableStage, ExecutableStage> updatedStages = new LinkedHashMap<>();
  Map<String, PTransformNode> updatedTransforms = new LinkedHashMap<>();
  Multimap<String, PCollectionNode> partialsByOriginalId = HashMultimap.create();
  for (Map.Entry<StageOrTransform, Collection<PCollectionNode>> target : outputsToReplace.asMap().entrySet()) {
    StageOrTransform producer = target.getKey();
    Collection<PCollectionNode> duplicatedOutputs = target.getValue();
    if (producer.getStage() != null) {
      ExecutableStage originalStage = producer.getStage();
      StageDeduplication deduplication = deduplicatePCollections(originalStage, duplicatedOutputs, componentsBuilder::containsPcollections);
      for (Entry<String, PCollectionNode> replacement : deduplication.getOriginalToPartialPCollections().entrySet()) {
        PCollectionNode partial = replacement.getValue();
        partialsByOriginalId.put(replacement.getKey(), partial);
        componentsBuilder.putPcollections(partial.getId(), partial.getPCollection());
      }
      updatedStages.put(originalStage, deduplication.getUpdatedStage());
    } else if (producer.getTransform() != null) {
      PTransformNode originalTransform = producer.getTransform();
      PTransformDeduplication deduplication = deduplicatePCollections(originalTransform, duplicatedOutputs, componentsBuilder::containsPcollections);
      for (Entry<String, PCollectionNode> replacement : deduplication.getOriginalToPartialPCollections().entrySet()) {
        PCollectionNode partial = replacement.getValue();
        partialsByOriginalId.put(replacement.getKey(), partial);
        componentsBuilder.putPcollections(partial.getId(), partial.getPCollection());
      }
      updatedTransforms.put(originalTransform.getId(), deduplication.getUpdatedTransform());
    } else {
      throw new IllegalStateException(String.format("%s with no %s or %s", StageOrTransform.class.getSimpleName(), ExecutableStage.class.getSimpleName(), PTransformNode.class.getSimpleName()));
    }
  }

  // One new transform per original PCollection, built from all of its partials.
  Set<PTransformNode> introducedFlattens = new LinkedHashSet<>();
  for (Map.Entry<String, Collection<PCollectionNode>> group : partialsByOriginalId.asMap().entrySet()) {
    String flattenId = SyntheticComponents.uniqueId("unzipped_flatten", componentsBuilder::containsTransforms);
    PTransform flatten = createFlattenOfPartials(flattenId, group.getKey(), group.getValue());
    componentsBuilder.putTransforms(flattenId, flatten);
    introducedFlattens.add(PipelineNode.pTransform(flattenId, flatten));
  }

  return DeduplicationResult.of(componentsBuilder.build(), introducedFlattens, updatedStages, updatedTransforms);
}
Aggregations