Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
In class GroupByKeyTranslatorBatch, the method translateNode:
@Override
public void translateNode(GroupByKey<K, V> transform, Twister2BatchTranslationContext context) {
  PCollection<KV<K, V>> input = context.getInput(transform);
  BatchTSetImpl<WindowedValue<KV<K, V>>> inputTTSet = context.getInputDataSet(input);
  final KvCoder<K, V> coder = (KvCoder<K, V>) input.getCoder();
  Coder<K> inputKeyCoder = coder.getKeyCoder();
  WindowingStrategy windowingStrategy = input.getWindowingStrategy();
  WindowFn<KV<K, V>, BoundedWindow> windowFn =
      (WindowFn<KV<K, V>, BoundedWindow>) windowingStrategy.getWindowFn();
  final WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());
  // Encode each element into a (key bytes, windowed-value bytes) tuple so Twister2 can shuffle it.
  KeyedTSet<byte[], byte[]> keyedTSet =
      inputTTSet.mapToTuple(new MapToTupleFunction<K, V>(inputKeyCoder, wvCoder));
  // todo add support for a partition function to be specified, this would use
  // todo keyedPartition function instead of KeyedGather
  ComputeTSet<KV<K, Iterable<WindowedValue<V>>>, Iterator<Tuple<byte[], Iterator<byte[]>>>> groupedbyKeyTset =
      keyedTSet.keyedGather().map(new ByteToWindowFunction(inputKeyCoder, wvCoder));
  // --- now group also by window.
  SystemReduceFnBuffering reduceFnBuffering = new SystemReduceFnBuffering(coder.getValueCoder());
  ComputeTSet<WindowedValue<KV<K, Iterable<V>>>, Iterable<KV<K, Iterator<WindowedValue<V>>>>> outputTset =
      groupedbyKeyTset
          .direct()
          .<WindowedValue<KV<K, Iterable<V>>>>flatmap(
              new GroupByWindowFunction(windowingStrategy, reduceFnBuffering, context.getOptions()));
  PCollection output = context.getOutput(transform);
  context.setOutputDataSet(output, outputTset);
}
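For context, this translator handles pipelines that apply a plain GroupByKey. The standalone sketch below shows the user-facing side of that transform; it is not taken from the Beam repository, and the class name and Create data are illustrative.

import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class GroupByKeyExample {
  public static void main(String[] args) {
    // To exercise the Twister2 translator, run with the Twister2 runner on the classpath;
    // without runner options this sketch falls back to the direct runner.
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    // A small keyed input; a real job would read from an IO connector instead.
    PCollection<KV<String, Integer>> input =
        pipeline.apply(Create.of(Arrays.asList(KV.of("a", 1), KV.of("a", 2), KV.of("b", 3))));
    // GroupByKey gathers all values sharing a key into one Iterable per key and window.
    PCollection<KV<String, Iterable<Integer>>> grouped =
        input.apply(GroupByKey.<String, Integer>create());
    pipeline.run().waitUntilFinish();
  }
}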
Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
In class FlattenTranslatorBatch, the method translateNode:
@Override
public void translateNode(Flatten.PCollections<T> transform, Twister2BatchTranslationContext context) {
  Collection<PCollection<?>> pcs = context.getInputs().values();
  List<BatchTSetImpl<WindowedValue<T>>> tSets = new ArrayList<>();
  BatchTSetImpl<WindowedValue<T>> unionTSet;
  if (pcs.isEmpty()) {
    final TSetEnvironment tsetEnv = context.getEnvironment();
    unionTSet =
        ((BatchTSetEnvironment) tsetEnv)
            .createSource(new Twister2EmptySource(), context.getOptions().getParallelism());
  } else {
    for (PValue pc : pcs) {
      BatchTSetImpl<WindowedValue<T>> curr = context.getInputDataSet(pc);
      tSets.add(curr);
    }
    BatchTSetImpl<WindowedValue<T>> first = tSets.remove(0);
    Collection<TSet<WindowedValue<T>>> others = new ArrayList<>();
    others.addAll(tSets);
    if (tSets.size() > 0) {
      unionTSet = first.union(others);
    } else {
      unionTSet = first;
    }
  }
  context.setOutputDataSet(context.getOutput(transform), unionTSet);
}
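For context, this translator backs the user-facing Flatten.pCollections() transform. A minimal sketch follows; the class name and inline data are illustrative and not from the Beam repository.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

public class FlattenExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    PCollection<String> first = pipeline.apply("first", Create.of("a", "b"));
    PCollection<String> second = pipeline.apply("second", Create.of("c", "d"));
    // Flatten merges every PCollection in the list into a single PCollection;
    // with an empty list the Twister2 translator above substitutes an empty source.
    PCollection<String> merged =
        PCollectionList.of(first).and(second).apply(Flatten.pCollections());
    pipeline.run().waitUntilFinish();
  }
}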
Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
In class ExpansionService, the method expand:
@VisibleForTesting
/*package*/ ExpansionApi.ExpansionResponse expand(ExpansionApi.ExpansionRequest request) {
  LOG.info(
      "Expanding '{}' with URN '{}'",
      request.getTransform().getUniqueName(),
      request.getTransform().getSpec().getUrn());
  LOG.debug("Full transform: {}", request.getTransform());
  Set<String> existingTransformIds = request.getComponents().getTransformsMap().keySet();
  Pipeline pipeline = createPipeline();
  boolean isUseDeprecatedRead =
      ExperimentalOptions.hasExperiment(pipelineOptions, "use_deprecated_read")
          || ExperimentalOptions.hasExperiment(pipelineOptions, "beam_fn_api_use_deprecated_read");
  if (!isUseDeprecatedRead) {
    ExperimentalOptions.addExperiment(pipeline.getOptions().as(ExperimentalOptions.class), "beam_fn_api");
    // TODO(BEAM-10670): Remove this when we address performance issue.
    ExperimentalOptions.addExperiment(pipeline.getOptions().as(ExperimentalOptions.class), "use_sdf_read");
  } else {
    LOG.warn(
        "Using use_deprecated_read in portable runners is runner-dependent. The "
            + "ExpansionService will respect that, but if your runner does not have support for "
            + "native Read transform, your Pipeline will fail during Pipeline submission.");
  }
  RehydratedComponents rehydratedComponents =
      RehydratedComponents.forComponents(request.getComponents()).withPipeline(pipeline);
  Map<String, PCollection<?>> inputs =
      request.getTransform().getInputsMap().entrySet().stream()
          .collect(
              Collectors.toMap(
                  Map.Entry::getKey,
                  input -> {
                    try {
                      return rehydratedComponents.getPCollection(input.getValue());
                    } catch (IOException exn) {
                      throw new RuntimeException(exn);
                    }
                  }));
  String urn = request.getTransform().getSpec().getUrn();
  TransformProvider transformProvider = null;
  if (getUrn(ExpansionMethods.Enum.JAVA_CLASS_LOOKUP).equals(urn)) {
    AllowList allowList =
        pipelineOptions.as(ExpansionServiceOptions.class).getJavaClassLookupAllowlist();
    assert allowList != null;
    transformProvider = new JavaClassLookupTransformProvider(allowList);
  } else {
    transformProvider = getRegisteredTransforms().get(urn);
    if (transformProvider == null) {
      throw new UnsupportedOperationException(
          "Unknown urn: " + request.getTransform().getSpec().getUrn());
    }
  }
  List<String> classpathResources =
      transformProvider.getDependencies(request.getTransform().getSpec(), pipeline.getOptions());
  pipeline.getOptions().as(PortablePipelineOptions.class).setFilesToStage(classpathResources);
  Map<String, PCollection<?>> outputs =
      transformProvider.apply(
          pipeline, request.getTransform().getUniqueName(), request.getTransform().getSpec(), inputs);
  // Needed to find which transform was new...
  SdkComponents sdkComponents =
      rehydratedComponents.getSdkComponents(Collections.emptyList()).withNewIdPrefix(request.getNamespace());
  sdkComponents.registerEnvironment(
      Environments.createOrGetDefaultEnvironment(pipeline.getOptions().as(PortablePipelineOptions.class)));
  Map<String, String> outputMap =
      outputs.entrySet().stream()
          .collect(
              Collectors.toMap(
                  Map.Entry::getKey,
                  output -> {
                    try {
                      return sdkComponents.registerPCollection(output.getValue());
                    } catch (IOException exn) {
                      throw new RuntimeException(exn);
                    }
                  }));
  if (isUseDeprecatedRead) {
    SplittableParDo.convertReadBasedSplittableDoFnsToPrimitiveReadsIfNecessary(pipeline);
  }
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents);
  String expandedTransformId =
      Iterables.getOnlyElement(
          pipelineProto.getRootTransformIdsList().stream()
              .filter(id -> !existingTransformIds.contains(id))
              .collect(Collectors.toList()));
  RunnerApi.Components components = pipelineProto.getComponents();
  RunnerApi.PTransform expandedTransform =
      components
          .getTransformsOrThrow(expandedTransformId)
          .toBuilder()
          .setUniqueName(expandedTransformId)
          .clearOutputs()
          .putAllOutputs(outputMap)
          .build();
  LOG.debug("Expanded to {}", expandedTransform);
  return ExpansionApi.ExpansionResponse.newBuilder()
      .setComponents(components.toBuilder().removeTransforms(expandedTransformId))
      .setTransform(expandedTransform)
      .addAllRequirements(pipelineProto.getRequirementsList())
      .build();
}
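The read-translation branch above hinges on the use_deprecated_read experiment. The small sketch below shows how that experiment would be supplied and detected; the class name and printed output are illustrative, and only the ExperimentalOptions calls already used in expand() are relied on.

import org.apache.beam.sdk.options.ExperimentalOptions;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class ExperimentCheckExample {
  public static void main(String[] args) {
    // Options as they might be handed to the expansion service process.
    PipelineOptions options =
        PipelineOptionsFactory.fromArgs("--experiments=use_deprecated_read").create();
    // The same check expand() performs before deciding how reads are translated.
    boolean useDeprecatedRead = ExperimentalOptions.hasExperiment(options, "use_deprecated_read");
    System.out.println("use_deprecated_read enabled: " + useDeprecatedRead);
    if (!useDeprecatedRead) {
      // expand() adds "beam_fn_api" (and "use_sdf_read") so reads expand as splittable DoFns.
      ExperimentalOptions.addExperiment(options.as(ExperimentalOptions.class), "beam_fn_api");
    }
  }
}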
Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
In class ReduceByKeyTranslator, the method translate:
@Override
public PCollection<KV<KeyT, OutputT>> translate(
    ReduceByKey<InputT, KeyT, ValueT, ?, OutputT> operator, PCollectionList<InputT> inputs) {
  // todo Could we even do values sorting in Beam? And do we want it?
  checkState(!operator.getValueComparator().isPresent(), "Values sorting is not supported.");
  final UnaryFunction<InputT, KeyT> keyExtractor = operator.getKeyExtractor();
  final UnaryFunction<InputT, ValueT> valueExtractor = operator.getValueExtractor();
  final PCollection<InputT> input =
      operator.getWindow()
          .map(window -> PCollectionLists.getOnlyElement(inputs).apply(window))
          .orElseGet(() -> PCollectionLists.getOnlyElement(inputs));
  // ~ create key & value extractor
  final MapElements<InputT, KV<KeyT, ValueT>> extractor =
      MapElements.via(new KeyValueExtractor<>(keyExtractor, valueExtractor));
  final PCollection<KV<KeyT, ValueT>> extracted =
      input.apply("extract-keys", extractor)
          .setTypeDescriptor(TypeDescriptors.kvs(
              TypeAwareness.orObjects(operator.getKeyType()),
              TypeAwareness.orObjects(operator.getValueType())));
  final AccumulatorProvider accumulators =
      new LazyAccumulatorProvider(AccumulatorProvider.of(inputs.getPipeline()));
  if (operator.isCombinable()) {
    // if the operator is combinable, we can process it in a more efficient way
    @SuppressWarnings("unchecked")
    final PCollection combined;
    if (operator.isCombineFnStyle()) {
      combined = extracted.apply("combine", Combine.perKey(asCombineFn(operator)));
    } else {
      combined = extracted.apply("combine", Combine.perKey(
          asCombiner(operator.getReducer(), accumulators, operator.getName().orElse(null))));
    }
    @SuppressWarnings("unchecked")
    final PCollection<KV<KeyT, OutputT>> cast = (PCollection) combined;
    return cast.setTypeDescriptor(operator.getOutputType()
        .orElseThrow(() -> new IllegalStateException("Unable to infer output type descriptor.")));
  }
  return extracted
      .apply("group", GroupByKey.create())
      .setTypeDescriptor(TypeDescriptors.kvs(
          TypeAwareness.orObjects(operator.getKeyType()),
          TypeDescriptors.iterables(TypeAwareness.orObjects(operator.getValueType()))))
      .apply("reduce", ParDo.of(
          new ReduceDoFn<>(operator.getReducer(), accumulators, operator.getName().orElse(null))))
      .setTypeDescriptor(operator.getOutputType()
          .orElseThrow(() -> new IllegalStateException("Unable to infer output type descriptor.")));
}
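For context, this translator serves the Euphoria extension's ReduceByKey operator. Below is a hedged sketch of typical usage; the class name and data are illustrative, and the typed keyBy/valueBy overloads plus the Fold helper are assumed from the Euphoria DSL's documented API rather than taken from this translator.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.euphoria.core.client.operator.ReduceByKey;
import org.apache.beam.sdk.extensions.euphoria.core.client.util.Fold;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class ReduceByKeyExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    PCollection<String> animals = pipeline.apply(Create.of("cat", "dog", "horse", "cow"));
    // Count animal names by name length. Supplying combineBy marks the operator as combinable,
    // which should let the translator above take its Combine.perKey path.
    PCollection<KV<Integer, Long>> countByLength =
        ReduceByKey.of(animals)
            .keyBy(String::length, TypeDescriptors.integers())
            .valueBy(e -> 1L, TypeDescriptors.longs())
            .combineBy(Fold.of((a, b) -> a + b))
            .output();
    pipeline.run().waitUntilFinish();
  }
}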
Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
In class PubsubTableProviderIT, the method testSQLWithBytePayload:
@Test
public void testSQLWithBytePayload() throws Exception {
  // Prepare messages to send later
  List<PubsubMessage> messages =
      ImmutableList.of(
          objectsProvider.messageIdName(ts(1), 3, "foo"),
          objectsProvider.messageIdName(ts(2), 5, "bar"),
          objectsProvider.messageIdName(ts(3), 7, "baz"));
  String createTableString =
      String.format(
          "CREATE EXTERNAL TABLE message (\n"
              + "event_timestamp TIMESTAMP, \n"
              + "attributes MAP<VARCHAR, VARCHAR>, \n"
              + "payload VARBINARY \n"
              + ") \n"
              + "TYPE '%s' \n"
              + "LOCATION '%s' \n"
              + "TBLPROPERTIES '{ "
              + "\"protoClass\" : \"%s\", "
              + "\"timestampAttributeKey\" : \"ts\" }'",
          tableProvider.getTableType(),
          eventsTopic.topicPath(),
          PayloadMessages.SimpleMessage.class.getName());
  String queryString = "SELECT message.payload AS some_bytes FROM message";
  // Initialize SQL environment and create the pubsub table
  BeamSqlEnv sqlEnv = BeamSqlEnv.inMemory(new PubsubTableProvider());
  sqlEnv.executeDdl(createTableString);
  // Apply the PTransform to query the pubsub topic
  PCollection<Row> queryOutput = query(sqlEnv, pipeline, queryString);
  // Observe the query results and send success signal after seeing the expected messages
  Schema justBytesSchema =
      Schema.builder().addField("some_bytes", FieldType.BYTES.withNullable(true)).build();
  Row expectedRow0 = row(justBytesSchema, (Object) messages.get(0).getPayload());
  Row expectedRow1 = row(justBytesSchema, (Object) messages.get(1).getPayload());
  Row expectedRow2 = row(justBytesSchema, (Object) messages.get(2).getPayload());
  Set<Row> expected = ImmutableSet.of(expectedRow0, expectedRow1, expectedRow2);
  queryOutput.apply(
      "waitForSuccess",
      resultSignal.signalSuccessWhen(
          SchemaCoder.of(justBytesSchema), observedRows -> observedRows.equals(expected)));
  // Start the pipeline
  pipeline.run();
  // Block until a subscription for this topic exists
  eventsTopic.assertSubscriptionEventuallyCreated(
      pipeline.getOptions().as(GcpOptions.class).getProject(), Duration.standardMinutes(5));
  // Start publishing the messages when main pipeline is started and signaling topic is ready
  eventsTopic.publish(messages);
  // Poll the signaling topic for success message
  resultSignal.waitForSuccess(timeout);
}
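The test drives the query through the internal BeamSqlEnv helper. For a runnable starting point outside the test harness, here is a hedged sketch using the public SqlTransform API instead; the schema, values, and class name are illustrative.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;

public class SqlQueryExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    Schema schema = Schema.builder().addStringField("name").addInt32Field("height").build();
    PCollection<Row> rows =
        pipeline.apply(
            Create.of(
                    Row.withSchema(schema).addValues("foo", 3).build(),
                    Row.withSchema(schema).addValues("bar", 5).build())
                .withRowSchema(schema));
    // Beam SQL registers the input as the implicit PCOLLECTION table.
    PCollection<Row> selected =
        rows.apply(SqlTransform.query("SELECT name FROM PCOLLECTION WHERE height > 4"));
    pipeline.run().waitUntilFinish();
  }
}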