Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.
From class EnvironmentsTest, method getEnvironmentPTransform.
@Test
public void getEnvironmentPTransform() throws IOException {
  Pipeline p = Pipeline.create();
  SdkComponents components = SdkComponents.create();
  Environment env = Environments.createDockerEnvironment("java");
  components.registerEnvironment(env);
  ParDoPayload payload =
      ParDoTranslation.translateParDo(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void process(ProcessContext ctxt) {}
                  })
              .withOutputTags(new TupleTag<>(), TupleTagList.empty()),
          PCollection.createPrimitiveOutputInternal(
              p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, StringUtf8Coder.of()),
          DoFnSchemaInformation.create(),
          Pipeline.create(),
          components);
  RehydratedComponents rehydratedComponents =
      RehydratedComponents.forComponents(components.toComponents());
  PTransform ptransform =
      PTransform.newBuilder()
          .setSpec(
              FunctionSpec.newBuilder()
                  .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
                  .setPayload(payload.toByteString())
                  .build())
          .setEnvironmentId(components.getOnlyEnvironmentId())
          .build();
  Environment env1 = Environments.getEnvironment(ptransform, rehydratedComponents).get();
  assertThat(
      env1,
      equalTo(components.toComponents().getEnvironmentsOrThrow(ptransform.getEnvironmentId())));
}
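Environments.getEnvironment returns an Optional, which the test unwraps with a bare get(). A hedged sketch of a more defensive lookup, reusing the ptransform and rehydratedComponents built above and only the isPresent()/get() methods common to both Optional flavors:

  // Assumes the ptransform and rehydratedComponents from the test above.
  Optional<Environment> maybeEnv =
      Environments.getEnvironment(ptransform, rehydratedComponents);
  if (!maybeEnv.isPresent()) {
    throw new IllegalStateException(
        "No environment registered for " + ptransform.getSpec().getUrn());
  }
  Environment resolved = maybeEnv.get();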
Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.
From class PCollectionTranslationTest, method testEncodeDecodeCycle.
@Test
public void testEncodeDecodeCycle() throws Exception {
  // Encode
  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  RunnerApi.PCollection protoCollection =
      PCollectionTranslation.toProto(testCollection, sdkComponents);
  RehydratedComponents protoComponents =
      RehydratedComponents.forComponents(sdkComponents.toComponents());
  // Decode
  Pipeline pipeline = Pipeline.create();
  PCollection<?> decodedCollection =
      PCollectionTranslation.fromProto(protoCollection, pipeline, protoComponents);
  // Verify
  assertThat(decodedCollection.getCoder(), equalTo(testCollection.getCoder()));
  assertThat(
      decodedCollection.getWindowingStrategy(),
      equalTo(
          testCollection
              .getWindowingStrategy()
              .withEnvironmentId(sdkComponents.getOnlyEnvironmentId())
              .fixDefaults()));
  assertThat(decodedCollection.isBounded(), equalTo(testCollection.isBounded()));
}
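The testCollection field above is not defined in this snippet; in Beam it is supplied by the test's parameterized runner, which iterates over PCollections with different coders and windowing strategies. A hedged sketch of one value it could take, assuming the Beam Java SDK's GenerateSequence transform:

  // Hypothetical example of a testCollection parameter.
  PCollection<Long> testCollection =
      Pipeline.create().apply("sequence", GenerateSequence.from(0).to(10));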
Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.
From class OutputDeduplicator, method ensureSingleProducer.
/**
 * Ensure that no {@link PCollection} output by any of the {@code stages} or {@code
 * unfusedTransforms} is produced by more than one of those stages or transforms.
 *
 * <p>Each producer of a {@link PCollection} that is output by multiple stages and/or transforms
 * is rewritten to produce a partial {@link PCollection}; the partial PCollections are then
 * flattened back together by an introduced Flatten node that produces the original output.
 */
static DeduplicationResult ensureSingleProducer(
    QueryablePipeline pipeline,
    Collection<ExecutableStage> stages,
    Collection<PTransformNode> unfusedTransforms) {
  RunnerApi.Components.Builder unzippedComponents = pipeline.getComponents().toBuilder();
  Multimap<PCollectionNode, StageOrTransform> pcollectionProducers =
      getProducers(pipeline, stages, unfusedTransforms);
  Multimap<StageOrTransform, PCollectionNode> requiresNewOutput = HashMultimap.create();
  // ExecutableStages must also be rewritten to have updated outputs and transforms.
  for (Map.Entry<PCollectionNode, Collection<StageOrTransform>> collectionProducer :
      pcollectionProducers.asMap().entrySet()) {
    if (collectionProducer.getValue().size() > 1) {
      for (StageOrTransform producer : collectionProducer.getValue()) {
        requiresNewOutput.put(producer, collectionProducer.getKey());
      }
    }
  }
  Map<ExecutableStage, ExecutableStage> updatedStages = new LinkedHashMap<>();
  Map<String, PTransformNode> updatedTransforms = new LinkedHashMap<>();
  Multimap<String, PCollectionNode> originalToPartial = HashMultimap.create();
  for (Map.Entry<StageOrTransform, Collection<PCollectionNode>> deduplicationTargets :
      requiresNewOutput.asMap().entrySet()) {
    if (deduplicationTargets.getKey().getStage() != null) {
      StageDeduplication deduplication =
          deduplicatePCollections(
              deduplicationTargets.getKey().getStage(),
              deduplicationTargets.getValue(),
              unzippedComponents::containsPcollections);
      for (Entry<String, PCollectionNode> originalToPartialReplacement :
          deduplication.getOriginalToPartialPCollections().entrySet()) {
        originalToPartial.put(
            originalToPartialReplacement.getKey(), originalToPartialReplacement.getValue());
        unzippedComponents.putPcollections(
            originalToPartialReplacement.getValue().getId(),
            originalToPartialReplacement.getValue().getPCollection());
      }
      updatedStages.put(deduplicationTargets.getKey().getStage(), deduplication.getUpdatedStage());
    } else if (deduplicationTargets.getKey().getTransform() != null) {
      PTransformDeduplication deduplication =
          deduplicatePCollections(
              deduplicationTargets.getKey().getTransform(),
              deduplicationTargets.getValue(),
              unzippedComponents::containsPcollections);
      for (Entry<String, PCollectionNode> originalToPartialReplacement :
          deduplication.getOriginalToPartialPCollections().entrySet()) {
        originalToPartial.put(
            originalToPartialReplacement.getKey(), originalToPartialReplacement.getValue());
        unzippedComponents.putPcollections(
            originalToPartialReplacement.getValue().getId(),
            originalToPartialReplacement.getValue().getPCollection());
      }
      updatedTransforms.put(
          deduplicationTargets.getKey().getTransform().getId(),
          deduplication.getUpdatedTransform());
    } else {
      throw new IllegalStateException(
          String.format(
              "%s with no %s or %s",
              StageOrTransform.class.getSimpleName(),
              ExecutableStage.class.getSimpleName(),
              PTransformNode.class.getSimpleName()));
    }
  }
  Set<PTransformNode> introducedFlattens = new LinkedHashSet<>();
  for (Map.Entry<String, Collection<PCollectionNode>> partialFlattenTargets :
      originalToPartial.asMap().entrySet()) {
    String flattenId =
        SyntheticComponents.uniqueId("unzipped_flatten", unzippedComponents::containsTransforms);
    PTransform flattenPartialPCollections =
        createFlattenOfPartials(
            flattenId, partialFlattenTargets.getKey(), partialFlattenTargets.getValue());
    unzippedComponents.putTransforms(flattenId, flattenPartialPCollections);
    introducedFlattens.add(PipelineNode.pTransform(flattenId, flattenPartialPCollections));
  }
  Components components = unzippedComponents.build();
  return DeduplicationResult.of(components, introducedFlattens, updatedStages, updatedTransforms);
}
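The introduced Flatten is an ordinary RunnerApi proto transform whose inputs are the partial PCollections and whose single output is the originally duplicated one. A hedged sketch of its shape (createFlattenOfPartials is private to OutputDeduplicator, and all ids below are hypothetical):

  // Hypothetical ids; the real method derives them from the duplicated output.
  PTransform flatten =
      PTransform.newBuilder()
          .setUniqueName("unzipped_flatten")
          .putInputs("partial0", "pc_out_partial_0")
          .putInputs("partial1", "pc_out_partial_1")
          .putOutputs("output", "pc_out") // the originally duplicated PCollection
          .setSpec(
              FunctionSpec.newBuilder()
                  .setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN)
                  .build())
          .build();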
Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.
From class FlinkStreamingPortablePipelineTranslator, method translateFlatten.
private <T> void translateFlatten(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform transform = pipeline.getComponents().getTransformsOrThrow(id);
  Map<String, String> allInputs = transform.getInputsMap();
  if (allInputs.isEmpty()) {
    // Create an empty dummy source to satisfy downstream operations. We cannot
    // create an empty source in Flink, so we add a flatMap that simply never
    // forwards its single element.
    long shutdownAfterIdleSourcesMs =
        context.getPipelineOptions().getShutdownSourcesAfterIdleMs();
    DataStreamSource<WindowedValue<byte[]>> dummySource =
        context
            .getExecutionEnvironment()
            .addSource(new ImpulseSourceFunction(shutdownAfterIdleSourcesMs));
    DataStream<WindowedValue<T>> result =
        dummySource
            .<WindowedValue<T>>flatMap(
                (s, collector) -> {
                  // Never emit anything.
                })
            .returns(
                new CoderTypeInformation<>(
                    WindowedValue.getFullCoder(
                        (Coder<T>) VoidCoder.of(), GlobalWindow.Coder.INSTANCE),
                    context.getPipelineOptions()));
    context.addDataStream(Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
  } else {
    DataStream<T> result = null;
    // Determine which DataStreams we use as input more than once. Those input
    // streams must be uniquified, because Flink seems to swallow watermarks
    // when we union a stream with itself.
    HashMultiset<DataStream<T>> inputCounts = HashMultiset.create();
    for (String input : allInputs.values()) {
      DataStream<T> current = context.getDataStreamOrThrow(input);
      inputCounts.add(current, 1);
    }
    for (String input : allInputs.values()) {
      DataStream<T> current = context.getDataStreamOrThrow(input);
      final int timesRequired = inputCounts.count(current);
      if (timesRequired > 1) {
        // Insert an identity flatMap so Flink sees a distinct operator for each
        // occurrence of the stream in the union.
        current =
            current.flatMap(
                new FlatMapFunction<T, T>() {
                  private static final long serialVersionUID = 1L;

                  @Override
                  public void flatMap(T t, Collector<T> collector) {
                    collector.collect(t);
                  }
                });
      }
      result = (result == null) ? current : result.union(current);
    }
    context.addDataStream(Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
  }
}
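The identity-flatMap trick above can be reproduced in plain Flink. A minimal, self-contained sketch, assuming a recent Flink (fromSequence is available from 1.12 on); the stream contents are hypothetical:

  import org.apache.flink.api.common.typeinfo.Types;
  import org.apache.flink.streaming.api.datastream.DataStream;
  import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
  import org.apache.flink.util.Collector;

  public class UnionOfSameStream {
    public static void main(String[] args) throws Exception {
      StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
      DataStream<Long> source = env.fromSequence(0, 9);
      // Uniquify the second occurrence with an identity flatMap so the union
      // does not combine a stream with itself.
      DataStream<Long> copy =
          source
              .flatMap((Long value, Collector<Long> out) -> out.collect(value))
              .returns(Types.LONG);
      source.union(copy).print();
      env.execute("union-of-same-stream");
    }
  }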
Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.
From class FlinkStreamingPortablePipelineTranslator, method translateStreamingImpulse.
private void translateStreamingImpulse(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform pTransform = pipeline.getComponents().getTransformsOrThrow(id);
  TypeInformation<WindowedValue<byte[]>> typeInfo =
      new CoderTypeInformation<>(
          WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE),
          context.getPipelineOptions());
  ObjectMapper objectMapper = new ObjectMapper();
  final int intervalMillis;
  final int messageCount;
  try {
    JsonNode config = objectMapper.readTree(pTransform.getSpec().getPayload().toByteArray());
    intervalMillis = config.path("interval_ms").asInt(100);
    messageCount = config.path("message_count").asInt(0);
  } catch (IOException e) {
    throw new RuntimeException("Failed to parse configuration for streaming impulse", e);
  }
  SingleOutputStreamOperator<WindowedValue<byte[]>> source =
      context
          .getExecutionEnvironment()
          .addSource(
              new StreamingImpulseSource(intervalMillis, messageCount),
              StreamingImpulseSource.class.getSimpleName())
          .returns(typeInfo);
  context.addDataStream(Iterables.getOnlyElement(pTransform.getOutputsMap().values()), source);
}
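The transform's payload is a small JSON object; absent fields fall back to an interval of 100 ms and a message count of 0 (emit forever). A minimal sketch of the same Jackson parsing, with a hypothetical payload:

  import com.fasterxml.jackson.databind.JsonNode;
  import com.fasterxml.jackson.databind.ObjectMapper;
  import java.io.IOException;
  import java.nio.charset.StandardCharsets;

  public class ImpulseConfigDemo {
    public static void main(String[] args) throws IOException {
      // Hypothetical payload; the translator receives these bytes in the transform spec.
      byte[] payload =
          "{\"interval_ms\": 500, \"message_count\": 10}".getBytes(StandardCharsets.UTF_8);
      JsonNode config = new ObjectMapper().readTree(payload);
      int intervalMillis = config.path("interval_ms").asInt(100); // 500
      int messageCount = config.path("message_count").asInt(0);   // 10; 0 means emit forever
      System.out.println(intervalMillis + " " + messageCount);
    }
  }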