use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class SqlBoundedSideInputJoin method expand.
/**
 * Joins bids against the configured side input by running a SQL query over both.
 *
 * <p>Steps, in order:
 *
 * <ol>
 *   <li>Keep only the events accepted by {@code NexmarkQueryUtil.IS_BID} and turn them into rows
 *       with {@code SelectEvent(Event.Type.BID)}.
 *   <li>Verify that a side input has been set.
 *   <li>Attach a two-field schema ({@code id}: INT64, {@code extra}: STRING) to the side input
 *       and convert it to rows.
 *   <li>Expose both row collections to the SQL query under the tuple tags {@code "bid"} and
 *       {@code "side"}; the query text is {@code query} formatted with {@code
 *       configuration.sideInputRowCount}.
 *   <li>Convert the resulting rows back to {@link Bid}.
 * </ol>
 *
 * @param events the input events
 * @return the query result converted to {@link Bid} values
 */
@Override
public PCollection<Bid> expand(PCollection<Event> events) {
  PCollection<Event> bidEvents = events.apply(Filter.by(NexmarkQueryUtil.IS_BID));
  PCollection<Row> bids =
      bidEvents.apply(getName() + ".SelectEvent", new SelectEvent(Event.Type.BID));

  // The precondition is checked only after the bid branch has been added, as before.
  checkState(getSideInput() != null, "Configuration error: side input is null");

  // Anonymous subclasses, declared in the same order as before.
  TupleTag<Row> sideTag = new TupleTag<Row>("side") {};
  TupleTag<Row> bidTag = new TupleTag<Row>("bid") {};

  Schema.Field idField = Schema.Field.of("id", Schema.FieldType.INT64);
  Schema.Field extraField = Schema.Field.of("extra", Schema.FieldType.STRING);
  Schema schema = Schema.of(idField, extraField);

  // Register the schema together with the KV <-> Row conversion functions.
  PCollection<KV<Long, String>> sideWithSchema =
      getSideInput()
          .setSchema(
              schema,
              TypeDescriptors.kvs(TypeDescriptors.longs(), TypeDescriptors.strings()),
              kv -> Row.withSchema(schema).addValues(kv.getKey(), kv.getValue()).build(),
              row -> KV.of(row.getInt64("id"), row.getString("extra")));
  PCollection<Row> sideRows = sideWithSchema.apply("SideToRows", Convert.toRows());

  PCollectionTuple joinInputs = PCollectionTuple.of(bidTag, bids).and(sideTag, sideRows);

  String sql = String.format(query, configuration.sideInputRowCount);
  SqlTransform joinTransform = SqlTransform.query(sql).withQueryPlannerClass(plannerClass);

  PCollection<Row> joinedRows = joinInputs.apply(joinTransform);
  return joinedRows.apply("ResultToBid", Convert.fromRows(Bid.class));
}
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class BoundedSideInputJoinTest method queryMatchesModel.
/**
 * Runs {@code query} on generated events and asserts that its output satisfies {@code model}.
 *
 * <p>A randomly suffixed resource under the pipeline's temp location is assigned to {@code
 * config.sideInputUrl} before the side input is prepared. The side input is cleaned up in a
 * {@code finally} block whether or not the pipeline succeeds.
 *
 * @param name prefix for the name of the read step
 * @param config Nexmark configuration; its {@code sideInputUrl} field is overwritten
 * @param query the query under test; receives the prepared side input
 * @param model the model supplying the assertion applied to the query output
 * @param streamingMode selects the streaming events source when true, the batch source otherwise
 */
private <T extends KnownSize> void queryMatchesModel(
    String name,
    NexmarkConfiguration config,
    NexmarkQueryTransform<T> query,
    NexmarkQueryModel<T> model,
    boolean streamingMode)
    throws Exception {
  String sideInputPath =
      String.format(
          "%s/BoundedSideInputJoin-%s",
          p.getOptions().getTempLocation(), new Random().nextInt());
  config.sideInputUrl = FileSystems.matchNewResource(sideInputPath, false).toString();

  try {
    query.setSideInput(NexmarkUtils.prepareSideInput(p, config));

    PCollection<Event> events;
    if (streamingMode) {
      events = p.apply(name + ".Read", NexmarkUtils.streamEventsSource(config));
    } else {
      events = p.apply(name + ".Read", NexmarkUtils.batchEventsSource(config));
    }

    PCollection<TimestampedValue<T>> queryOutput =
        (PCollection<TimestampedValue<T>>) events.apply(new NexmarkQuery<>(config, query));
    PAssert.that(queryOutput).satisfies(model.assertionFor());

    p.run().waitUntilFinish();
  } finally {
    NexmarkUtils.cleanUpSideInput(config);
  }
}
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class BoundedSideInputJoinTest method inputOutputSameEvents.
/**
 * A smoke test that the count of input bids and outputs are the same, to help diagnose flakiness
 * in more complex tests.
 *
 * <p>The two counts are flattened into a single collection, so the assertion checks that there
 * are exactly two values, that the first is positive, and that both are equal; it does not rely
 * on which count comes first.
 */
@Test
@Category(NeedsRunner.class)
public void inputOutputSameEvents() throws Exception {
  NexmarkConfiguration config = NexmarkConfiguration.DEFAULT.copy();
  config.sideInputType = NexmarkUtils.SideInputType.DIRECT;
  config.numEventGenerators = 1;
  config.numEvents = 5000;
  config.sideInputRowCount = 10;
  config.sideInputNumShards = 3;

  try {
    // Prepared inside the try (as queryMatchesModel does) so that the cleanup below still runs
    // if preparation fails part-way.
    PCollection<KV<Long, String>> sideInput = NexmarkUtils.prepareSideInput(p, config);

    PCollection<Event> input = p.apply(NexmarkUtils.batchEventsSource(config));
    PCollection<Bid> justBids = input.apply(NexmarkQueryUtil.JUST_BIDS);
    PCollection<Long> bidCount = justBids.apply("Count Bids", Count.globally());

    NexmarkQueryTransform<Bid> query = new BoundedSideInputJoin(config);
    query.setSideInput(sideInput);

    // Diamond instead of a raw NexmarkQuery, consistent with the other query tests.
    PCollection<TimestampedValue<Bid>> output =
        (PCollection<TimestampedValue<Bid>>) input.apply(new NexmarkQuery<>(config, query));
    PCollection<Long> outputCount = output.apply("Count outputs", Count.globally());

    PAssert.that(PCollectionList.of(bidCount).and(outputCount).apply(Flatten.pCollections()))
        .satisfies(
            counts -> {
              assertThat(Iterables.size(counts), equalTo(2));
              assertThat(Iterables.get(counts, 0), greaterThan(0L));
              assertThat(Iterables.get(counts, 0), equalTo(Iterables.get(counts, 1)));
              return null;
            });

    // Wait for the pipeline to finish before the finally block cleans up the side input,
    // as queryMatchesModel does.
    p.run().waitUntilFinish();
  } finally {
    NexmarkUtils.cleanUpSideInput(config);
  }
}
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class SessionSideInputJoinTest method queryMatchesModel.
/**
 * Checks that the output of {@code query} satisfies the assertion provided by {@code model}.
 *
 * <p>Before the side input is prepared, {@code config.sideInputUrl} is pointed at a randomly
 * suffixed resource under the pipeline's temp location. The side input is always cleaned up
 * afterwards, even when the pipeline or the assertion fails.
 *
 * @param name prefix for the name of the read step
 * @param config Nexmark configuration; its {@code sideInputUrl} field is overwritten
 * @param query the query under test; receives the prepared side input
 * @param model the model supplying the assertion applied to the query output
 * @param streamingMode selects the streaming events source when true, the batch source otherwise
 */
private <T extends KnownSize> void queryMatchesModel(
    String name,
    NexmarkConfiguration config,
    NexmarkQueryTransform<T> query,
    NexmarkQueryModel<T> model,
    boolean streamingMode)
    throws Exception {
  String sideInputLocation =
      String.format(
          "%s/SessionSideInputJoin-%s",
          p.getOptions().getTempLocation(), new Random().nextInt());
  ResourceId sideInputResourceId = FileSystems.matchNewResource(sideInputLocation, false);
  config.sideInputUrl = sideInputResourceId.toString();

  try {
    PCollection<KV<Long, String>> preparedSideInput = NexmarkUtils.prepareSideInput(p, config);
    query.setSideInput(preparedSideInput);

    String readStepName = name + ".Read";
    PCollection<Event> events =
        p.apply(
            readStepName,
            streamingMode
                ? NexmarkUtils.streamEventsSource(config)
                : NexmarkUtils.batchEventsSource(config));

    PCollection<TimestampedValue<T>> actual =
        (PCollection<TimestampedValue<T>>) events.apply(new NexmarkQuery<>(config, query));
    PAssert.that(actual).satisfies(model.assertionFor());

    p.run().waitUntilFinish();
  } finally {
    NexmarkUtils.cleanUpSideInput(config);
  }
}
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class QueryTest method queryMatchesModel.
/**
 * Runs {@code query} against events generated from {@code CONFIG} and asserts that the output
 * satisfies {@code model}.
 *
 * @param name prefix for the name of the read step
 * @param query the query under test
 * @param model the model supplying the assertion applied to the query output
 * @param streamingMode selects the streaming events source when true, the batch source otherwise
 */
private <T extends KnownSize> void queryMatchesModel(
    String name,
    NexmarkQueryTransform<T> query,
    NexmarkQueryModel<T> model,
    boolean streamingMode) {
  NexmarkUtils.setupPipeline(NexmarkUtils.CoderStrategy.HAND, p);

  PCollection<Event> events;
  if (streamingMode) {
    events = p.apply(name + ".Read", NexmarkUtils.streamEventsSource(CONFIG));
  } else {
    events = p.apply(name + ".Read", NexmarkUtils.batchEventsSource(CONFIG));
  }

  PCollection<TimestampedValue<T>> queryOutput =
      (PCollection<TimestampedValue<T>>) events.apply(new NexmarkQuery<>(CONFIG, query));
  PAssert.that(queryOutput).satisfies(model.assertionFor());

  p.run().waitUntilFinish();
}
Aggregations