Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class UserParDoFnFactory, method create.
@Override
public ParDoFn create(
    PipelineOptions options,
    CloudObject cloudUserFn,
    @Nullable List<SideInputInfo> sideInputInfos,
    TupleTag<?> mainOutputTag,
    Map<TupleTag<?>, Integer> outputTupleTagsToReceiverIndices,
    DataflowExecutionContext<?> executionContext,
    DataflowOperationContext operationContext)
    throws Exception {
  // Reuse pooled DoFn instances per step, keyed by the step's system name.
  DoFnInstanceManager instanceManager =
      fnCache.get(
          operationContext.nameContext().systemName(),
          () -> DoFnInstanceManagers.cloningPool(doFnExtractor.getDoFnInfo(cloudUserFn), options));
  DoFnInfo<?, ?> doFnInfo = instanceManager.peek();
  DataflowExecutionContext.DataflowStepContext stepContext =
      executionContext.getStepContext(operationContext);
  Iterable<PCollectionView<?>> sideInputViews = doFnInfo.getSideInputViews();
  SideInputReader sideInputReader =
      executionContext.getSideInputReader(sideInputInfos, sideInputViews, operationContext);
  if (doFnInfo.getDoFn() instanceof BatchStatefulParDoOverrides.BatchStatefulDoFn) {
    // HACK: BatchStatefulDoFn is a class from DataflowRunner's overrides
    // that just instructs the worker to execute it differently. This will
    // be replaced by metadata in the Runner API payload.
    BatchStatefulParDoOverrides.BatchStatefulDoFn fn =
        (BatchStatefulParDoOverrides.BatchStatefulDoFn) doFnInfo.getDoFn();
    DoFn underlyingFn = fn.getUnderlyingDoFn();
    return new BatchModeUngroupingParDoFn(
        (BatchModeExecutionContext.StepContext) stepContext,
        new SimpleParDoFn(
            options,
            DoFnInstanceManagers.singleInstance(doFnInfo.withFn(underlyingFn)),
            sideInputReader,
            doFnInfo.getMainOutput(),
            outputTupleTagsToReceiverIndices,
            stepContext,
            operationContext,
            doFnInfo.getDoFnSchemaInformation(),
            doFnInfo.getSideInputMapping(),
            runnerFactory));
  } else if (doFnInfo.getDoFn() instanceof StreamingPCollectionViewWriterFn) {
    // HACK: StreamingPCollectionViewWriterFn is a class from
    // DataflowPipelineTranslator. Using the class as an indicator is a migration
    // path to simply having an indicator string.
    checkArgument(
        stepContext instanceof StreamingModeExecutionContext.StreamingModeStepContext,
        "stepContext must be a StreamingModeStepContext to use StreamingPCollectionViewWriterFn");
    StreamingPCollectionViewWriterFn<Object> writerFn =
        (StreamingPCollectionViewWriterFn<Object>) doFnInfo.getDoFn();
    return new StreamingPCollectionViewWriterParDoFn(
        (StreamingModeExecutionContext.StreamingModeStepContext) stepContext,
        writerFn.getView().getTagInternal(),
        writerFn.getDataCoder(),
        (Coder<BoundedWindow>) doFnInfo.getWindowingStrategy().getWindowFn().windowCoder());
  } else {
    return new SimpleParDoFn(
        options,
        instanceManager,
        sideInputReader,
        doFnInfo.getMainOutput(),
        outputTupleTagsToReceiverIndices,
        stepContext,
        operationContext,
        doFnInfo.getDoFnSchemaInformation(),
        doFnInfo.getSideInputMapping(),
        runnerFactory);
  }
}
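The fnCache consulted above is a per-worker cache of DoFn instance managers keyed by the step's system name, so repeated bundles of the same step reuse pooled DoFn instances instead of deserializing the user function again. A minimal sketch of such a declaration, assuming Guava's Cache (the actual field in UserParDoFnFactory may be declared differently):

// Sketch only: a Guava cache whose get(key, loader) matches the
// fnCache.get(...) call in create() above; the real worker field may differ.
private final Cache<String, DoFnInstanceManager> fnCache = CacheBuilder.newBuilder().build();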
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class Query13, method expand.
@Override
public PCollection<Event> expand(PCollection<Event> events) {
  final Coder<Event> coder = events.getCoder();
  return events
      .apply("Pair with random key", ParDo.of(new AssignShardFn<>(configuration.numKeyBuckets)))
      .apply(GroupByKey.create())
      .apply(
          "ExpandIterable",
          ParDo.of(
              new DoFn<KV<Integer, Iterable<Event>>, Event>() {
                @ProcessElement
                public void processElement(
                    @Element KV<Integer, Iterable<Event>> element, OutputReceiver<Event> r) {
                  for (Event value : element.getValue()) {
                    r.output(value);
                  }
                }
              }))
      .apply(
          name + ".Serialize",
          ParDo.of(
              new DoFn<Event, Event>() {
                private final Counter bytesMetric = Metrics.counter(name, "serde-bytes");
                private final Random random = new Random();
                // Clamp the configured CPU factor to [0.0, 1.0]; fall back to 1.0 otherwise.
                private final double pardoCPUFactor =
                    (configuration.pardoCPUFactor >= 0.0 && configuration.pardoCPUFactor <= 1.0)
                        ? configuration.pardoCPUFactor
                        : 1.0;

                @ProcessElement
                public void processElement(ProcessContext c) throws CoderException, IOException {
                  Event event;
                  if (random.nextDouble() <= pardoCPUFactor) {
                    // Burn CPU by round-tripping the element through its coder.
                    event = encodeDecode(coder, c.element(), bytesMetric);
                  } else {
                    event = c.element();
                  }
                  c.output(event);
                }
              }));
}
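The encodeDecode helper is not shown in this snippet. Given how it is called, a plausible sketch is a coder round trip that also records the serialized size; treat the body below as an illustration rather than the exact Nexmark implementation:

// Illustrative sketch of encodeDecode: serialize the event, count the bytes,
// then deserialize it again so the ParDo pays a realistic CPU cost.
private static Event encodeDecode(Coder<Event> coder, Event event, Counter bytesMetric)
    throws CoderException, IOException {
  ByteArrayOutputStream outStream = new ByteArrayOutputStream();
  coder.encode(event, outStream, Coder.Context.OUTER);
  byte[] encoded = outStream.toByteArray();
  bytesMetric.inc(encoded.length);
  return coder.decode(new ByteArrayInputStream(encoded), Coder.Context.OUTER);
}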
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class Query3, method expand.
@Override
public PCollection<NameCityStateId> expand(PCollection<Event> events) {
  PCollection<KV<Long, Event>> auctionsBySellerId =
      events
          // Only new-auction events, restricted to category 10.
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply(name + ".InCategory", Filter.by(auction -> auction.category == 10))
          // Key each auction by its seller id, re-wrapped as an Event.
          .apply(
              "EventByAuctionSeller",
              ParDo.of(
                  new DoFn<Auction, KV<Long, Event>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      Event e = new Event();
                      e.newAuction = c.element();
                      c.output(KV.of(c.element().seller, e));
                    }
                  }));
  PCollection<KV<Long, Event>> personsById =
      events
          // Only new-person events, restricted to OR, ID, and CA.
          .apply(NexmarkQueryUtil.JUST_NEW_PERSONS)
          .apply(
              name + ".InState",
              Filter.by(
                  person ->
                      "OR".equals(person.state)
                          || "ID".equals(person.state)
                          || "CA".equals(person.state)))
          // Key each person by their id, re-wrapped as an Event.
          .apply(
              "EventByPersonId",
              ParDo.of(
                  new DoFn<Person, KV<Long, Event>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      Event e = new Event();
                      e.newPerson = c.element();
                      c.output(KV.of(c.element().id, e));
                    }
                  }));
  // Join auctions and people, then project the fields Query3 reports.
  return PCollectionList.of(auctionsBySellerId)
      .and(personsById)
      .apply(Flatten.pCollections())
      .apply(name + ".Join", ParDo.of(joinDoFn))
      .apply(
          name + ".Project",
          ParDo.of(
              new DoFn<KV<Auction, Person>, NameCityStateId>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  Auction auction = c.element().getKey();
                  Person person = c.element().getValue();
                  c.output(
                      new NameCityStateId(person.name, person.city, person.state, auction.id));
                }
              }));
}
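joinDoFn is defined elsewhere in Query3; in the Nexmark suite it is a stateful DoFn that buffers auctions per seller until the matching Person arrives. The sketch below is a simplified, hypothetical rendering of that shape (the state ids are illustrative, and the real implementation also uses a garbage-collection timer omitted here), not the actual Query3 code:

// Simplified sketch of a stateful auction/person join, keyed by seller id.
private static class JoinDoFn extends DoFn<KV<Long, Event>, KV<Auction, Person>> {

  // Auctions seen before their seller's Person event, buffered per key.
  @StateId("auctions")
  private final StateSpec<BagState<Auction>> auctionsSpec = StateSpecs.bag();

  // The seller's Person event, once seen.
  @StateId("person")
  private final StateSpec<ValueState<Person>> personSpec = StateSpecs.value();

  @ProcessElement
  public void processElement(
      ProcessContext c,
      @StateId("auctions") BagState<Auction> auctions,
      @StateId("person") ValueState<Person> person) {
    Event event = c.element().getValue();
    if (event.newPerson != null) {
      // Person arrived: emit any buffered auctions, then remember the person.
      person.write(event.newPerson);
      for (Auction auction : auctions.read()) {
        c.output(KV.of(auction, event.newPerson));
      }
      auctions.clear();
    } else if (person.read() != null) {
      // Person already known: join immediately.
      c.output(KV.of(event.newAuction, person.read()));
    } else {
      // Person not yet seen: buffer the auction.
      auctions.add(event.newAuction);
    }
  }
}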
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class BigQueryIOReadTest, method checkSetsProject.
private void checkSetsProject(String projectId) throws Exception {
  fakeDatasetService.createDataset(projectId, "dataset-id", "", "", null);
  String tableId = "sometable";
  TableReference tableReference =
      new TableReference().setProjectId(projectId).setDatasetId("dataset-id").setTableId(tableId);
  fakeDatasetService.createTable(
      new Table()
          .setTableReference(tableReference)
          .setSchema(
              new TableSchema()
                  .setFields(
                      ImmutableList.of(
                          new TableFieldSchema().setName("name").setType("STRING"),
                          new TableFieldSchema().setName("number").setType("INTEGER")))));
  FakeBigQueryServices fakeBqServices =
      new FakeBigQueryServices()
          .withJobService(new FakeJobService())
          .withDatasetService(fakeDatasetService);
  List<TableRow> expected =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));
  fakeDatasetService.insertAll(tableReference, expected, null);
  // No project id here: BigQueryIO should fill it in from the pipeline options.
  TableReference tableRef = new TableReference().setDatasetId("dataset-id").setTableId(tableId);
  PCollection<KV<String, Long>> output =
      p.apply(BigQueryIO.read().from(tableRef).withTestServices(fakeBqServices))
          .apply(
              ParDo.of(
                  new DoFn<TableRow, KV<String, Long>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                      c.output(
                          KV.of(
                              (String) c.element().get("name"),
                              Long.valueOf((String) c.element().get("number"))));
                    }
                  }));
  PAssert.that(output)
      .containsInAnyOrder(
          ImmutableList.of(
              KV.of("a", 1L),
              KV.of("b", 2L),
              KV.of("c", 3L),
              KV.of("d", 4L),
              KV.of("e", 5L),
              KV.of("f", 6L)));
  p.run();
}
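The pipeline p is a fixture declared elsewhere in BigQueryIOReadTest. A typical declaration, assuming the test sets a default GCP project so BigQueryIO can fill in the project missing from tableRef (the exact option values below are assumptions):

// Assumed test fixture; the real BigQueryIOReadTest may wire options differently.
private static PipelineOptions readTestOptions() {
  BigQueryOptions options = PipelineOptionsFactory.as(BigQueryOptions.class);
  options.setProject("project-id"); // default project BigQueryIO falls back to
  options.setTempLocation("gs://testbucket/testdata");
  return options;
}

@Rule public final transient TestPipeline p = TestPipeline.fromOptions(readTestOptions());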
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.
The class BigQueryIOReadTest, method testReadFromTable.
private void testReadFromTable(boolean useTemplateCompatibility, boolean useReadTableRows)
    throws IOException, InterruptedException {
  Table sometable = new Table();
  sometable.setSchema(
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("name").setType("STRING"),
                  new TableFieldSchema().setName("number").setType("INTEGER"))));
  sometable.setTableReference(
      new TableReference()
          .setProjectId("non-executing-project")
          .setDatasetId("somedataset")
          .setTableId("sometable"));
  sometable.setNumBytes(1024L * 1024L);
  FakeDatasetService fakeDatasetService = new FakeDatasetService();
  fakeDatasetService.createDataset("non-executing-project", "somedataset", "", "", null);
  fakeDatasetService.createTable(sometable);
  List<TableRow> records =
      Lists.newArrayList(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L));
  fakeDatasetService.insertAll(sometable.getTableReference(), records, null);
  FakeBigQueryServices fakeBqServices =
      new FakeBigQueryServices()
          .withJobService(new FakeJobService())
          .withDatasetService(fakeDatasetService);
  PTransform<PBegin, PCollection<TableRow>> readTransform;
  // useReadTableRows selects the readTableRows() entry point; otherwise the plain read().
  if (useReadTableRows) {
    BigQueryIO.TypedRead<TableRow> read =
        BigQueryIO.readTableRows()
            .from("non-executing-project:somedataset.sometable")
            .withTestServices(fakeBqServices)
            .withoutValidation();
    readTransform = useTemplateCompatibility ? read.withTemplateCompatibility() : read;
  } else {
    BigQueryIO.Read read =
        BigQueryIO.read()
            .from("non-executing-project:somedataset.sometable")
            .withTestServices(fakeBqServices)
            .withoutValidation();
    readTransform = useTemplateCompatibility ? read.withTemplateCompatibility() : read;
  }
  PCollection<KV<String, Long>> output =
      p.apply(readTransform)
          .apply(
              ParDo.of(
                  new DoFn<TableRow, KV<String, Long>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                      c.output(
                          KV.of(
                              (String) c.element().get("name"),
                              Long.valueOf((String) c.element().get("number"))));
                    }
                  }));
  PAssert.that(output)
      .containsInAnyOrder(ImmutableList.of(KV.of("a", 1L), KV.of("b", 2L), KV.of("c", 3L)));
  p.run();
}
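Parameterized this way, the helper covers the template-compatibility and read-entry-point combinations; hypothetical @Test entry points (the real test class may name and group them differently) would look like:

// Illustrative callers of testReadFromTable; the method names are assumptions.
@Test
public void testReadTableWithReadTableRows() throws Exception {
  testReadFromTable(/* useTemplateCompatibility= */ false, /* useReadTableRows= */ true);
}

@Test
public void testReadTableWithTemplateCompatibility() throws Exception {
  testReadFromTable(/* useTemplateCompatibility= */ true, /* useReadTableRows= */ false);
}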