Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
The class ParDoTranslator, method translate.
@Override
public void translate(ParDo.MultiOutput<InputT, OutputT> transform, TranslationContext context) {
  DoFn<InputT, OutputT> doFn = transform.getFn();
  DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
  if (signature.processElement().isSplittable()) {
    throw new UnsupportedOperationException(String.format(
        "%s does not support splittable DoFn: %s", ApexRunner.class.getSimpleName(), doFn));
  }
  if (signature.stateDeclarations().size() > 0) {
    throw new UnsupportedOperationException(String.format(
        "Found %s annotations on %s, but %s cannot yet be used with state in the %s.",
        DoFn.StateId.class.getSimpleName(), doFn.getClass().getName(),
        DoFn.class.getSimpleName(), ApexRunner.class.getSimpleName()));
  }
  if (signature.timerDeclarations().size() > 0) {
    throw new UnsupportedOperationException(String.format(
        "Found %s annotations on %s, but %s cannot yet be used with timers in the %s.",
        DoFn.TimerId.class.getSimpleName(), doFn.getClass().getName(),
        DoFn.class.getSimpleName(), ApexRunner.class.getSimpleName()));
  }
  Map<TupleTag<?>, PValue> outputs = context.getOutputs();
  PCollection<InputT> input = context.getInput();
  List<PCollectionView<?>> sideInputs = transform.getSideInputs();
  Coder<InputT> inputCoder = input.getCoder();
  WindowedValueCoder<InputT> wvInputCoder = FullWindowedValueCoder.of(
      inputCoder, input.getWindowingStrategy().getWindowFn().windowCoder());
  ApexParDoOperator<InputT, OutputT> operator = new ApexParDoOperator<>(
      context.getPipelineOptions(), doFn, transform.getMainOutputTag(),
      transform.getAdditionalOutputTags().getAll(), input.getWindowingStrategy(),
      sideInputs, wvInputCoder, context.getStateBackend());
  Map<PCollection<?>, OutputPort<?>> ports = Maps.newHashMapWithExpectedSize(outputs.size());
  for (Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
    checkArgument(output.getValue() instanceof PCollection,
        "%s %s outputs non-PCollection %s of type %s",
        ParDo.MultiOutput.class.getSimpleName(), context.getFullName(),
        output.getValue(), output.getValue().getClass().getSimpleName());
    PCollection<?> pc = (PCollection<?>) output.getValue();
    if (output.getKey().equals(transform.getMainOutputTag())) {
      ports.put(pc, operator.output);
    } else {
      int portIndex = 0;
      for (TupleTag<?> tag : transform.getAdditionalOutputTags().getAll()) {
        if (tag.equals(output.getKey())) {
          ports.put(pc, operator.additionalOutputPorts[portIndex]);
          break;
        }
        portIndex++;
      }
    }
  }
  context.addOperator(operator, ports);
  context.addStream(context.getInput(), operator.input);
  if (!sideInputs.isEmpty()) {
    addSideInputs(operator.sideInput1, sideInputs, context);
  }
}
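For context, this translator consumes Beam's multi-output ParDo: the main output tag is wired to operator.output and each additional tag to an indexed port in operator.additionalOutputPorts. A minimal sketch of the pipeline-side shape it translates, assuming an existing Pipeline p; the tags and DoFn below are illustrative and not taken from the runner source:

final TupleTag<String> mainTag = new TupleTag<String>() {};
final TupleTag<String> emptyTag = new TupleTag<String>() {};
PCollectionTuple results = p
    .apply(Create.of("a", "", "b"))
    .apply(ParDo.of(new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
            if (c.element().isEmpty()) {
              c.output(emptyTag, c.element()); // routed to an additional output port
            } else {
              c.output(c.element()); // routed to the main output port
            }
          }
        }).withOutputTags(mainTag, TupleTagList.of(emptyTag)));
PCollection<String> mainOut = results.get(mainTag);
PCollection<String> empties = results.get(emptyTag);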
Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
The class TranslationContext, method populateDAG.
public void populateDAG(DAG dag) {
  for (Map.Entry<String, Operator> nameAndOperator : this.operators.entrySet()) {
    dag.addOperator(nameAndOperator.getKey(), nameAndOperator.getValue());
  }
  int streamIndex = 0;
  for (Map.Entry<PCollection, Pair<OutputPortInfo, List<InputPortInfo>>> streamEntry : this.streams.entrySet()) {
    List<InputPortInfo> destInfo = streamEntry.getValue().getRight();
    InputPort[] sinks = new InputPort[destInfo.size()];
    for (int i = 0; i < sinks.length; i++) {
      sinks[i] = destInfo.get(i).port;
    }
    if (sinks.length > 0) {
      DAG.StreamMeta streamMeta =
          dag.addStream("stream" + streamIndex++, streamEntry.getValue().getLeft().port, sinks);
      if (pipelineOptions.isParDoFusionEnabled()) {
        optimizeStreams(streamMeta, streamEntry);
      }
      for (InputPort port : sinks) {
        PCollection pc = streamEntry.getKey();
        Coder coder = pc.getCoder();
        if (pc.getWindowingStrategy() != null) {
          coder = FullWindowedValueCoder.of(
              pc.getCoder(), pc.getWindowingStrategy().getWindowFn().windowCoder());
        }
        Coder<Object> wrapperCoder = ApexStreamTuple.ApexStreamTupleCoder.of(coder);
        CoderAdapterStreamCodec streamCodec = new CoderAdapterStreamCodec(wrapperCoder);
        dag.setInputPortAttribute(port, PortContext.STREAM_CODEC, streamCodec);
      }
    }
  }
}
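The stream codec above layers three coders: the element coder of the PCollection, a FullWindowedValueCoder that adds the window metadata, and the Apex-specific ApexStreamTupleCoder wrapper. A minimal Beam-only sketch of the windowed layer, written as a hypothetical helper (the name windowedCoderFor is not part of the runner):

static <T> Coder<WindowedValue<T>> windowedCoderFor(PCollection<T> pc) {
  // Combine the element coder with the coder of the collection's window type,
  // matching what populateDAG does before wrapping in the Apex tuple coder.
  return WindowedValue.FullWindowedValueCoder.of(
      pc.getCoder(), pc.getWindowingStrategy().getWindowFn().windowCoder());
}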
Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
The class FlattenPCollectionTranslatorTest, method test.
@Test
public void test() throws Exception {
  ApexPipelineOptions options = PipelineOptionsFactory.as(ApexPipelineOptions.class);
  options.setApplicationName("FlattenPCollection");
  options.setRunner(ApexRunner.class);
  Pipeline p = Pipeline.create(options);
  String[][] collections = { { "1" }, { "2" }, { "3" }, { "4" }, { "5" } };
  Set<String> expected = Sets.newHashSet();
  List<PCollection<String>> pcList = new ArrayList<PCollection<String>>();
  for (String[] collection : collections) {
    pcList.add(p.apply(Create.of(ImmutableList.copyOf(collection)).withCoder(StringUtf8Coder.of())));
    expected.addAll(Arrays.asList(collection));
  }
  PCollection<String> actual = PCollectionList.of(pcList).apply(Flatten.<String>pCollections());
  actual.apply(ParDo.of(new EmbeddedCollector()));
  ApexRunnerResult result = (ApexRunnerResult) p.run();
  // TODO: verify translation
  result.getApexDAG();
  long timeout = System.currentTimeMillis() + 30000;
  while (System.currentTimeMillis() < timeout && EmbeddedCollector.RESULTS.size() < expected.size()) {
    LOG.info("Waiting for expected results.");
    Thread.sleep(500);
  }
  Assert.assertEquals("number results", expected.size(), EmbeddedCollector.RESULTS.size());
  Assert.assertEquals(expected, Sets.newHashSet(EmbeddedCollector.RESULTS));
}
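The EmbeddedCollector referenced above is not included in this excerpt; it is a DoFn that records every element it sees in a static set so the test can poll for results. A minimal sketch of such a collector, with member names assumed from the test's usage rather than copied from the source:

static class EmbeddedCollector extends DoFn<Object, Void> {
  // Static so results remain visible after the DoFn is serialized into the embedded cluster.
  static final Set<Object> RESULTS = Collections.synchronizedSet(new HashSet<>());

  @ProcessElement
  public void processElement(ProcessContext c) {
    RESULTS.add(c.element());
  }
}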
Use of org.apache.beam.sdk.values.PCollection in project java-docs-samples by GoogleCloudPlatform.
The class SpannerReadAll, method main.
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  SpannerConfig spannerConfig = SpannerConfig.create()
      .withInstanceId(options.getInstanceId())
      .withDatabaseId(options.getDatabaseId());
  // [START spanner_dataflow_readall]
  PCollection<Struct> allRecords = p
      .apply(SpannerIO.read()
          .withSpannerConfig(spannerConfig)
          .withQuery("SELECT t.table_name FROM information_schema.tables AS t WHERE t"
              + ".table_catalog = '' AND t.table_schema = ''"))
      .apply(MapElements.into(TypeDescriptor.of(ReadOperation.class))
          .via((SerializableFunction<Struct, ReadOperation>) input -> {
            String tableName = input.getString(0);
            return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
          }))
      .apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
  // [END spanner_dataflow_readall]
  PCollection<Long> dbEstimatedSize = allRecords.apply(EstimateSize.create()).apply(Sum.longsGlobally());
  dbEstimatedSize.apply(ToString.elements()).apply(TextIO.write().to(options.getOutput()).withoutSharding());
  p.run().waitUntilFinish();
}
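The Options interface used above is not shown in this excerpt. A minimal sketch compatible with the getters the sample calls (the annotations and setter names are assumptions, not the sample's actual definition):

public interface Options extends PipelineOptions {
  @Description("Cloud Spanner instance ID")
  String getInstanceId();
  void setInstanceId(String value);

  @Description("Cloud Spanner database ID")
  String getDatabaseId();
  void setDatabaseId(String value);

  @Description("Output file for the estimated database size")
  String getOutput();
  void setOutput(String value);
}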
Use of org.apache.beam.sdk.values.PCollection in project components by Talend.
The class SimpleFileIORoundTripRuntimeTest, method runRoundTripPipelines.
/**
* Tests a round-trip on the data when writing to the data source using the given output properties, then
* subsequently reading using the given input properties. This is the equivalent of two pipeline jobs.
*
* @param initialData The initial data set to write, then read.
* @param outputProps The properties used to create the output runtime.
* @param inputProps The properties used to create the input runtime.
* @return The data returned from the round-trip.
*/
protected static List<IndexedRecord> runRoundTripPipelines(
    BeamDirectTestResource beam,
    List<IndexedRecord> initialData,
    SimpleFileIOOutputProperties outputProps,
    SimpleFileIOInputProperties inputProps) {
  // Create the runtimes.
  SimpleFileIOOutputRuntime outputRuntime = new SimpleFileIOOutputRuntime();
  outputRuntime.initialize(null, outputProps);
  SimpleFileIOInputRuntime inputRuntime = new SimpleFileIOInputRuntime();
  inputRuntime.initialize(null, inputProps);
  // Use the runtime in a direct pipeline to test.
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(DirectRunner.class);
  // Create a pipeline to write the records to the output.
  {
    final Pipeline p = beam.createPipeline();
    PCollection<IndexedRecord> input = p.apply(Create.<IndexedRecord>of(initialData));
    input.apply(outputRuntime);
    p.run().waitUntilFinish();
  }
  // Read the records that were written.
  try (DirectCollector<IndexedRecord> collector = DirectCollector.of()) {
    final Pipeline p = beam.createPipeline();
    PCollection<IndexedRecord> input = p.apply(inputRuntime);
    input.apply(collector);
    p.run().waitUntilFinish();
    // Return the list of records from the round trip.
    return collector.getRecords();
  }
}
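A hypothetical caller of this helper; the record and property construction is elided because those factories are not part of this excerpt, and beam is the BeamDirectTestResource JUnit rule from the method signature:

List<IndexedRecord> initialData = createTestRecords(); // hypothetical helper
SimpleFileIOOutputProperties outputProps = createOutputProps(); // hypothetical helper
SimpleFileIOInputProperties inputProps = createInputProps(); // hypothetical helper
List<IndexedRecord> roundTrip = runRoundTripPipelines(beam, initialData, outputProps, inputProps);
// Hamcrest assertion: every written record should come back from the read pipeline.
assertThat(roundTrip, containsInAnyOrder(initialData.toArray()));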