Search in sources :

Example 26 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class SqlBoundedSideInputJoin method expand.

/**
 * Joins bid events with the configured side input through a SQL query.
 *
 * <p>Events matching {@code NexmarkQueryUtil.IS_BID} are converted to rows and registered under
 * the tag {@code "bid"}; the side input is given an {@code (id: INT64, extra: STRING)} schema,
 * converted to rows and registered under the tag {@code "side"}. The SQL text is produced by
 * formatting {@code query} with {@code configuration.sideInputRowCount}, and the resulting rows
 * are converted back to {@link Bid}.
 *
 * @param events the input events
 * @return the rows produced by the SQL query, converted to {@link Bid}
 * @throws IllegalStateException if no side input has been set on this transform
 */
@Override
public PCollection<Bid> expand(PCollection<Event> events) {
    // Keep only the bid events and turn them into schema rows.
    PCollection<Row> bidRows =
            events
                    .apply(Filter.by(NexmarkQueryUtil.IS_BID))
                    .apply(getName() + ".SelectEvent", new SelectEvent(Event.Type.BID));

    checkState(getSideInput() != null, "Configuration error: side input is null");

    // The trailing "{}" makes each tag an anonymous subclass, exactly as before.
    TupleTag<Row> sideTag = new TupleTag<Row>("side") {
    };
    TupleTag<Row> bidTag = new TupleTag<Row>("bid") {
    };

    Schema sideSchema =
            Schema.of(
                    Schema.Field.of("id", Schema.FieldType.INT64),
                    Schema.Field.of("extra", Schema.FieldType.STRING));

    // Attach the schema to the KV side input (with conversions in both directions),
    // then convert it to rows.
    PCollection<Row> sideRows =
            getSideInput()
                    .setSchema(
                            sideSchema,
                            TypeDescriptors.kvs(TypeDescriptors.longs(), TypeDescriptors.strings()),
                            entry -> Row.withSchema(sideSchema).addValues(entry.getKey(), entry.getValue()).build(),
                            sideRow -> KV.of(sideRow.getInt64("id"), sideRow.getString("extra")))
                    .apply("SideToRows", Convert.toRows());

    PCollectionTuple joinInputs = PCollectionTuple.of(bidTag, bidRows).and(sideTag, sideRows);
    String sql = String.format(query, configuration.sideInputRowCount);

    return joinInputs
            .apply(SqlTransform.query(sql).withQueryPlannerClass(plannerClass))
            .apply("ResultToBid", Convert.fromRows(Bid.class));
}
Also used : NexmarkConfiguration(org.apache.beam.sdk.nexmark.NexmarkConfiguration) KV(org.apache.beam.sdk.values.KV) QueryPlanner(org.apache.beam.sdk.extensions.sql.impl.QueryPlanner) ZetaSQLQueryPlanner(org.apache.beam.sdk.extensions.sql.zetasql.ZetaSQLQueryPlanner) Bid(org.apache.beam.sdk.nexmark.model.Bid) SelectEvent(org.apache.beam.sdk.nexmark.model.sql.SelectEvent) PCollection(org.apache.beam.sdk.values.PCollection) SqlTransform(org.apache.beam.sdk.extensions.sql.SqlTransform) Schema(org.apache.beam.sdk.schemas.Schema) Convert(org.apache.beam.sdk.schemas.transforms.Convert) Filter(org.apache.beam.sdk.transforms.Filter) CalciteQueryPlanner(org.apache.beam.sdk.extensions.sql.impl.CalciteQueryPlanner) Event(org.apache.beam.sdk.nexmark.model.Event) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) TupleTag(org.apache.beam.sdk.values.TupleTag) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) NexmarkQueryTransform(org.apache.beam.sdk.nexmark.queries.NexmarkQueryTransform) NexmarkQueryUtil(org.apache.beam.sdk.nexmark.queries.NexmarkQueryUtil) Row(org.apache.beam.sdk.values.Row) Schema(org.apache.beam.sdk.schemas.Schema) TupleTag(org.apache.beam.sdk.values.TupleTag) Row(org.apache.beam.sdk.values.Row) Bid(org.apache.beam.sdk.nexmark.model.Bid) SelectEvent(org.apache.beam.sdk.nexmark.model.sql.SelectEvent)

Example 27 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class BoundedSideInputJoinTest method queryMatchesModel.

/**
 * Runs {@code query} on the test pipeline and checks its output with the assertion supplied by
 * {@code model}.
 *
 * <p>Before the pipeline is built, {@code config.sideInputUrl} is pointed at a resource named
 * {@code BoundedSideInputJoin-<random int>} under the pipeline's temp location, and the side
 * input prepared from {@code config} is set on {@code query}. The side input is cleaned up in a
 * {@code finally} block whether or not the run succeeds.
 *
 * @param name prefix for the name of the read step
 * @param config configuration used for the side input, the event source and the query
 * @param query the query transform under test
 * @param model the model providing the assertion applied to the results
 * @param streamingMode {@code true} to read from the streaming event source, {@code false} for
 *     the batch event source
 * @throws Exception if preparing the side input, running the pipeline or cleaning up fails
 */
private <T extends KnownSize> void queryMatchesModel(
        String name,
        NexmarkConfiguration config,
        NexmarkQueryTransform<T> query,
        NexmarkQueryModel<T> model,
        boolean streamingMode)
        throws Exception {
    String sideInputPath =
            String.format(
                    "%s/BoundedSideInputJoin-%s",
                    p.getOptions().getTempLocation(),
                    new Random().nextInt());
    config.sideInputUrl = FileSystems.matchNewResource(sideInputPath, false).toString();

    try {
        query.setSideInput(NexmarkUtils.prepareSideInput(p, config));

        PCollection<Event> source =
                p.apply(
                        name + ".Read",
                        streamingMode
                                ? NexmarkUtils.streamEventsSource(config)
                                : NexmarkUtils.batchEventsSource(config));
        PCollection<TimestampedValue<T>> actual =
                (PCollection<TimestampedValue<T>>) source.apply(new NexmarkQuery<>(config, query));

        PAssert.that(actual).satisfies(model.assertionFor());
        p.run().waitUntilFinish();
    } finally {
        NexmarkUtils.cleanUpSideInput(config);
    }
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) Random(java.util.Random) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) Event(org.apache.beam.sdk.nexmark.model.Event) PipelineResult(org.apache.beam.sdk.PipelineResult) KV(org.apache.beam.sdk.values.KV)

Example 28 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class BoundedSideInputJoinTest method inputOutputSameEvents.

/**
 * A smoke test that the count of input bids and outputs are the same, to help diagnose flakiness
 * in more complex tests.
 *
 * <p>Uses a direct side input with 10 rows in 3 shards and 5000 events from a single generator.
 * The bid count and the query output count are flattened into one collection and compared.
 *
 * @throws Exception if preparing the side input, running the pipeline or cleaning up fails
 */
@Test
@Category(NeedsRunner.class)
public void inputOutputSameEvents() throws Exception {
    NexmarkConfiguration config = NexmarkConfiguration.DEFAULT.copy();
    config.sideInputType = NexmarkUtils.SideInputType.DIRECT;
    config.numEventGenerators = 1;
    config.numEvents = 5000;
    config.sideInputRowCount = 10;
    config.sideInputNumShards = 3;
    try {
        // Prepared inside the try so that the finally block still cleans up if preparation throws.
        PCollection<KV<Long, String>> sideInput = NexmarkUtils.prepareSideInput(p, config);
        PCollection<Event> input = p.apply(NexmarkUtils.batchEventsSource(config));
        PCollection<Bid> justBids = input.apply(NexmarkQueryUtil.JUST_BIDS);
        PCollection<Long> bidCount = justBids.apply("Count Bids", Count.globally());
        NexmarkQueryTransform<Bid> query = new BoundedSideInputJoin(config);
        query.setSideInput(sideInput);
        // Diamond instead of the raw NexmarkQuery type, so the type argument is inferred from query.
        PCollection<TimestampedValue<Bid>> output = (PCollection<TimestampedValue<Bid>>) input.apply(new NexmarkQuery<>(config, query));
        PCollection<Long> outputCount = output.apply("Count outputs", Count.globally());
        PAssert.that(PCollectionList.of(bidCount).and(outputCount).apply(Flatten.pCollections())).satisfies(counts -> {
            // Exactly two counts are expected; since they must be equal and the first must be
            // positive, it does not matter which of the two comes first.
            assertThat(Iterables.size(counts), equalTo(2));
            assertThat(Iterables.get(counts, 0), greaterThan(0L));
            assertThat(Iterables.get(counts, 0), equalTo(Iterables.get(counts, 1)));
            return null;
        });
        // Wait for the run to finish before the side input is cleaned up.
        p.run().waitUntilFinish();
    } finally {
        NexmarkUtils.cleanUpSideInput(config);
    }
}
Also used : KV(org.apache.beam.sdk.values.KV) PCollection(org.apache.beam.sdk.values.PCollection) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) NexmarkConfiguration(org.apache.beam.sdk.nexmark.NexmarkConfiguration) Event(org.apache.beam.sdk.nexmark.model.Event) Bid(org.apache.beam.sdk.nexmark.model.Bid) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Example 29 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class SessionSideInputJoinTest method queryMatchesModel.

/**
 * Runs {@code query} on the test pipeline and checks its output with the assertion supplied by
 * {@code model}.
 *
 * <p>Before the pipeline is built, {@code config.sideInputUrl} is pointed at a resource named
 * {@code SessionSideInputJoin-<random int>} under the pipeline's temp location, and the side
 * input prepared from {@code config} is set on {@code query}. The side input is cleaned up in a
 * {@code finally} block whether or not the run succeeds.
 *
 * @param name prefix for the name of the read step
 * @param config configuration used for the side input, the event source and the query
 * @param query the query transform under test
 * @param model the model providing the assertion applied to the results
 * @param streamingMode {@code true} to read from the streaming event source, {@code false} for
 *     the batch event source
 * @throws Exception if preparing the side input, running the pipeline or cleaning up fails
 */
private <T extends KnownSize> void queryMatchesModel(
        String name,
        NexmarkConfiguration config,
        NexmarkQueryTransform<T> query,
        NexmarkQueryModel<T> model,
        boolean streamingMode)
        throws Exception {
    String tempLocation = p.getOptions().getTempLocation();
    int randomSuffix = new Random().nextInt();
    ResourceId sideInputResource =
            FileSystems.matchNewResource(
                    String.format("%s/SessionSideInputJoin-%s", tempLocation, randomSuffix), false);
    config.sideInputUrl = sideInputResource.toString();

    try {
        PCollection<KV<Long, String>> preparedSideInput = NexmarkUtils.prepareSideInput(p, config);
        query.setSideInput(preparedSideInput);

        PCollection<Event> eventStream =
                p.apply(
                        name + ".Read",
                        streamingMode
                                ? NexmarkUtils.streamEventsSource(config)
                                : NexmarkUtils.batchEventsSource(config));
        PCollection<TimestampedValue<T>> queryOutput =
                (PCollection<TimestampedValue<T>>) eventStream.apply(new NexmarkQuery<>(config, query));

        PAssert.that(queryOutput).satisfies(model.assertionFor());

        p.run().waitUntilFinish();
    } finally {
        NexmarkUtils.cleanUpSideInput(config);
    }
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) Random(java.util.Random) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) Event(org.apache.beam.sdk.nexmark.model.Event) PipelineResult(org.apache.beam.sdk.PipelineResult) KV(org.apache.beam.sdk.values.KV)

Example 30 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class QueryTest method queryMatchesModel.

/**
 * Runs {@code query} on the test pipeline and checks its output with the assertion supplied by
 * {@code model}.
 *
 * <p>The pipeline is set up with the {@code HAND} coder strategy and reads events generated
 * from the shared {@code CONFIG}.
 *
 * @param name prefix for the name of the read step
 * @param query the query transform under test
 * @param model the model providing the assertion applied to the results
 * @param streamingMode {@code true} to read from the streaming event source, {@code false} for
 *     the batch event source
 */
private <T extends KnownSize> void queryMatchesModel(
        String name,
        NexmarkQueryTransform<T> query,
        NexmarkQueryModel<T> model,
        boolean streamingMode) {
    NexmarkUtils.setupPipeline(NexmarkUtils.CoderStrategy.HAND, p);

    PCollection<Event> source =
            p.apply(
                    name + ".Read",
                    streamingMode
                            ? NexmarkUtils.streamEventsSource(CONFIG)
                            : NexmarkUtils.batchEventsSource(CONFIG));
    PCollection<TimestampedValue<T>> actual =
            (PCollection<TimestampedValue<T>>) source.apply(new NexmarkQuery<>(CONFIG, query));

    PAssert.that(actual).satisfies(model.assertionFor());
    p.run().waitUntilFinish();
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) Event(org.apache.beam.sdk.nexmark.model.Event) PipelineResult(org.apache.beam.sdk.PipelineResult)

Aggregations

PCollection (org.apache.beam.sdk.values.PCollection): 198, Test (org.junit.Test): 133, TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 61, KV (org.apache.beam.sdk.values.KV): 61, Map (java.util.Map): 59, List (java.util.List): 58, Rule (org.junit.Rule): 57, RunWith (org.junit.runner.RunWith): 54, PAssert (org.apache.beam.sdk.testing.PAssert): 52, Instant (org.joda.time.Instant): 46, Duration (org.joda.time.Duration): 45, JUnit4 (org.junit.runners.JUnit4): 45, ParDo (org.apache.beam.sdk.transforms.ParDo): 44, TupleTag (org.apache.beam.sdk.values.TupleTag): 42, Pipeline (org.apache.beam.sdk.Pipeline): 41, Create (org.apache.beam.sdk.transforms.Create): 41, ArrayList (java.util.ArrayList): 40, Serializable (java.io.Serializable): 39, PTransform (org.apache.beam.sdk.transforms.PTransform): 37, Row (org.apache.beam.sdk.values.Row): 37