Search in sources:

Example 1 with CoGbkResult

use of org.apache.beam.sdk.transforms.join.CoGbkResult in project beam by apache.

the class SparkCoGroupByKeyStreamingTest method testInStreamingMode.

/**
 * Checks the output of {@code CoGroupByKey} when both inputs are streaming sources.
 *
 * <p>Each source is built with an empty batch, then a batch of three values for key 1 stamped at
 * {@code new Instant(0)}, then a batch of three values for key 2 stamped one second later, and
 * finally {@code advanceNextBatchWatermarkToInfinity()}. Both inputs are windowed into
 * three-second fixed windows with zero allowed lateness before being joined. The assertion
 * expects exactly two output elements (keys 1 and 2), each carrying the three values of both
 * inputs; any other key fails the test.
 */
@Category(StreamingTest.class)
@Test
public void testInStreamingMode() throws Exception {
    Instant start = new Instant(0);
    Instant oneSecondLater = start.plus(Duration.standardSeconds(1L));
    Duration windowSize = Duration.standardSeconds(3L);

    CreateStream<KV<Integer, Integer>> source1 =
        CreateStream.of(KvCoder.of(VarIntCoder.of(), VarIntCoder.of()), batchDuration())
            .emptyBatch()
            .advanceWatermarkForNextBatch(start)
            .nextBatch(
                TimestampedValue.of(KV.of(1, 1), start),
                TimestampedValue.of(KV.of(1, 2), start),
                TimestampedValue.of(KV.of(1, 3), start))
            .advanceWatermarkForNextBatch(oneSecondLater)
            .nextBatch(
                TimestampedValue.of(KV.of(2, 4), oneSecondLater),
                TimestampedValue.of(KV.of(2, 5), oneSecondLater),
                TimestampedValue.of(KV.of(2, 6), oneSecondLater))
            .advanceNextBatchWatermarkToInfinity();

    CreateStream<KV<Integer, Integer>> source2 =
        CreateStream.of(KvCoder.of(VarIntCoder.of(), VarIntCoder.of()), batchDuration())
            .emptyBatch()
            .advanceWatermarkForNextBatch(start)
            .nextBatch(
                TimestampedValue.of(KV.of(1, 11), start),
                TimestampedValue.of(KV.of(1, 12), start),
                TimestampedValue.of(KV.of(1, 13), start))
            .advanceWatermarkForNextBatch(oneSecondLater)
            .nextBatch(
                TimestampedValue.of(KV.of(2, 14), oneSecondLater),
                TimestampedValue.of(KV.of(2, 15), oneSecondLater),
                TimestampedValue.of(KV.of(2, 16), oneSecondLater))
            .advanceNextBatchWatermarkToInfinity();

    // Both sides use the same windowing so that matching keys land in the same window.
    PCollection<KV<Integer, Integer>> input1 =
        pipeline
            .apply("create source1", source1)
            .apply(
                "window input1",
                Window.<KV<Integer, Integer>>into(FixedWindows.of(windowSize))
                    .withAllowedLateness(Duration.ZERO));
    PCollection<KV<Integer, Integer>> input2 =
        pipeline
            .apply("create source2", source2)
            .apply(
                "window input2",
                Window.<KV<Integer, Integer>>into(FixedWindows.of(windowSize))
                    .withAllowedLateness(Duration.ZERO));

    PCollection<KV<Integer, CoGbkResult>> output =
        KeyedPCollectionTuple.of(INPUT1_TAG, input1)
            .and(INPUT2_TAG, input2)
            .apply(CoGroupByKey.create());

    // Verification logic, run against the materialized output of the join.
    SerializableFunction<Iterable<KV<Integer, CoGbkResult>>, Void> verifyJoin =
        input -> {
            assertEquals("Wrong size of the output PCollection", 2, Iterables.size(input));
            for (KV<Integer, CoGbkResult> element : input) {
                switch (element.getKey()) {
                    case 1: {
                        Iterable<Integer> fromInput1 = element.getValue().getAll(INPUT1_TAG);
                        assertEquals("Wrong number of values for output elements for tag input1 and key 1", 3, Iterables.size(fromInput1));
                        assertThat("Elements of PCollection input1 for key \"1\" are not present in the output PCollection", fromInput1, containsInAnyOrder(1, 2, 3));
                        Iterable<Integer> fromInput2 = element.getValue().getAll(INPUT2_TAG);
                        assertEquals("Wrong number of values for output elements for tag input2 and key 1", 3, Iterables.size(fromInput2));
                        assertThat("Elements of PCollection input2 for key \"1\" are not present in the output PCollection", fromInput2, containsInAnyOrder(11, 12, 13));
                        break;
                    }
                    case 2: {
                        Iterable<Integer> fromInput1 = element.getValue().getAll(INPUT1_TAG);
                        assertEquals("Wrong number of values for output elements for tag input1 and key 2", 3, Iterables.size(fromInput1));
                        assertThat("Elements of PCollection input1 for key \"2\" are not present in the output PCollection", fromInput1, containsInAnyOrder(4, 5, 6));
                        Iterable<Integer> fromInput2 = element.getValue().getAll(INPUT2_TAG);
                        assertEquals("Wrong number of values for output elements for tag input2 and key 2", 3, Iterables.size(fromInput2));
                        assertThat("Elements of PCollection input2 for key \"2\" are not present in the output PCollection", fromInput2, containsInAnyOrder(14, 15, 16));
                        break;
                    }
                    default:
                        fail("Unknown key in the output PCollection");
                }
            }
            return null;
        };
    PAssert.that("Wrong output of the join using CoGroupByKey in streaming mode", output)
        .satisfies(verifyJoin);

    pipeline.run();
}
Also used : KV(org.apache.beam.sdk.values.KV) StreamingTest(org.apache.beam.runners.spark.StreamingTest) Duration(org.joda.time.Duration) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) ReuseSparkContextRule(org.apache.beam.runners.spark.ReuseSparkContextRule) CoGbkResult(org.apache.beam.sdk.transforms.join.CoGbkResult) TupleTag(org.apache.beam.sdk.values.TupleTag) CreateStream(org.apache.beam.runners.spark.io.CreateStream) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) Window(org.apache.beam.sdk.transforms.windowing.Window) Assert.fail(org.junit.Assert.fail) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) KeyedPCollectionTuple(org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) KvCoder(org.apache.beam.sdk.coders.KvCoder) PAssert(org.apache.beam.sdk.testing.PAssert) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Test(org.junit.Test) PCollection(org.apache.beam.sdk.values.PCollection) Category(org.junit.experimental.categories.Category) CoGroupByKey(org.apache.beam.sdk.transforms.join.CoGroupByKey) Rule(org.junit.Rule) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) Instant(org.joda.time.Instant) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) SparkPipelineOptions(org.apache.beam.runners.spark.SparkPipelineOptions) Assert.assertEquals(org.junit.Assert.assertEquals) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) CoGbkResult(org.apache.beam.sdk.transforms.join.CoGbkResult) Category(org.junit.experimental.categories.Category) StreamingTest(org.apache.beam.runners.spark.StreamingTest) Test(org.junit.Test)

Example 2 with CoGbkResult

use of org.apache.beam.sdk.transforms.join.CoGbkResult in project beam by apache.

the class Task method applyTransform.

/**
 * Joins fruits and countries on their first character and renders one string per key.
 *
 * <p>Every input word is keyed by {@code word.substring(0, 1)}; the two keyed collections are
 * co-grouped, and for each key a {@code WordsAlphabet} built from the key, the fruit and the
 * country is emitted via its {@code toString()}.
 *
 * @param fruits words for the "fruit" side of the join
 * @param countries words for the "country" side of the join
 * @return the string form of a {@code WordsAlphabet} for each co-grouped key
 */
static PCollection<String> applyTransform(PCollection<String> fruits, PCollection<String> countries) {
    TupleTag<String> fruitsTag = new TupleTag<>();
    TupleTag<String> countriesTag = new TupleTag<>();

    // The same keying transform is applied to both inputs.
    MapElements<String, KV<String, String>> keyByFirstLetter =
        MapElements.into(kvs(strings(), strings()))
            .via(word -> KV.of(word.substring(0, 1), word));

    PCollection<KV<String, String>> keyedFruits = fruits.apply("Fruit to KV", keyByFirstLetter);
    PCollection<KV<String, String>> keyedCountries =
        countries.apply("Country to KV", keyByFirstLetter);

    PCollection<KV<String, CoGbkResult>> joined =
        KeyedPCollectionTuple.of(fruitsTag, keyedFruits)
            .and(countriesTag, keyedCountries)
            .apply(CoGroupByKey.create());

    return joined.apply(
        ParDo.of(
            new DoFn<KV<String, CoGbkResult>, String>() {

                @ProcessElement
                public void processElement(
                        @Element KV<String, CoGbkResult> element, OutputReceiver<String> out) {
                    CoGbkResult grouped = element.getValue();
                    // NOTE(review): getOnly presumably requires exactly one value per tag
                    // for the key - confirm the inputs guarantee that.
                    WordsAlphabet wordsAlphabet =
                        new WordsAlphabet(
                            element.getKey(),
                            grouped.getOnly(fruitsTag),
                            grouped.getOnly(countriesTag));
                    out.output(wordsAlphabet.toString());
                }
            }));
}
Also used : DoFn(org.apache.beam.sdk.transforms.DoFn) TupleTag(org.apache.beam.sdk.values.TupleTag) KV(org.apache.beam.sdk.values.KV) CoGbkResult(org.apache.beam.sdk.transforms.join.CoGbkResult)

Example 3 with CoGbkResult

use of org.apache.beam.sdk.transforms.join.CoGbkResult in project beam by apache.

the class WinningBids method expand.

/**
 * Pairs each auction with its winning bid.
 *
 * <p>Events are windowed with {@code auctionOrBidWindowFn}, new auctions are keyed by
 * {@code AUCTION_BY_ID} and bids by {@code BID_BY_AUCTION}, and the two are co-grouped. For every
 * key that has an auction, the bids at or above the auction's reserve are compared with
 * {@code Bid.PRICE_THEN_DESCENDING_TIME} and the greatest one is emitted together with the
 * auction. Keys without an auction, bids under the reserve, and auctions without any valid bid
 * are only counted.
 *
 * @param events the incoming event stream
 * @return one {@code AuctionBid} per auction that received at least one valid bid
 */
@Override
public PCollection<AuctionBid> expand(PCollection<Event> events) {
    // Custom auction windows let bids and auctions meet regardless of how long each auction
    // stays open. New-person events are discarded along the way.
    PCollection<Event> windowedEvents = events.apply("Window", Window.into(auctionOrBidWindowFn));

    // Auctions keyed by their own id.
    PCollection<KV<Long, Auction>> auctionsById =
        windowedEvents
            .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
            .apply("AuctionById:", NexmarkQueryUtil.AUCTION_BY_ID);

    // Bids keyed by the id of the auction they target.
    PCollection<KV<Long, Bid>> bidsByAuctionId =
        windowedEvents
            .apply(NexmarkQueryUtil.JUST_BIDS)
            .apply("BidByAuction", NexmarkQueryUtil.BID_BY_AUCTION);

    // Join auctions and bids.
    PCollection<KV<Long, CoGbkResult>> auctionsWithBids =
        KeyedPCollectionTuple.of(NexmarkQueryUtil.AUCTION_TAG, auctionsById)
            .and(NexmarkQueryUtil.BID_TAG, bidsByAuctionId)
            .apply(CoGroupByKey.create());

    // Pick the highest-priced valid bid for each closed auction.
    return auctionsWithBids.apply(
        name + ".Join",
        ParDo.of(
            new DoFn<KV<Long, CoGbkResult>, AuctionBid>() {

                private final Counter noAuctionCounter = Metrics.counter(name, "noAuction");

                private final Counter underReserveCounter = Metrics.counter(name, "underReserve");

                private final Counter noValidBidsCounter = Metrics.counter(name, "noValidBids");

                @ProcessElement
                public void processElement(ProcessContext c) {
                    CoGbkResult joined = c.element().getValue();
                    @Nullable Auction auction = joined.getOnly(NexmarkQueryUtil.AUCTION_TAG, null);
                    if (auction == null) {
                        // Bids arrived without a matching auction: nothing to decide.
                        noAuctionCounter.inc();
                        return;
                    }

                    // The winner is the earliest bid carrying the maximum price above the reserve.
                    Bid winner = null;
                    for (Bid candidate : joined.getAll(NexmarkQueryUtil.BID_TAG)) {
                        // Bids that came too late for their auction should already have been
                        // removed by the window merge function.
                        checkState(candidate.dateTime.compareTo(auction.expires) < 0);
                        if (candidate.price < auction.reserve) {
                            // Below the reserve price: count it and move on.
                            underReserveCounter.inc();
                        } else if (winner == null
                                || Bid.PRICE_THEN_DESCENDING_TIME.compare(candidate, winner) > 0) {
                            winner = candidate;
                        }
                    }

                    if (winner != null) {
                        c.output(new AuctionBid(auction, winner));
                    } else {
                        // No bid qualified for this auction.
                        noValidBidsCounter.inc();
                    }
                }
            }));
}
Also used : DoFn(org.apache.beam.sdk.transforms.DoFn) Counter(org.apache.beam.sdk.metrics.Counter) AuctionBid(org.apache.beam.sdk.nexmark.model.AuctionBid) Auction(org.apache.beam.sdk.nexmark.model.Auction) KV(org.apache.beam.sdk.values.KV) Bid(org.apache.beam.sdk.nexmark.model.Bid) AuctionBid(org.apache.beam.sdk.nexmark.model.AuctionBid) Nullable(org.checkerframework.checker.nullness.qual.Nullable) CoGbkResult(org.apache.beam.sdk.transforms.join.CoGbkResult)

Example 4 with CoGbkResult

use of org.apache.beam.sdk.transforms.join.CoGbkResult in project beam by apache.

the class Query8 method expand.

/**
 * Joins new persons with the new auctions keyed by seller inside the same fixed window.
 *
 * <p>Both sides are placed in fixed windows of {@code configuration.windowSizeSec} seconds,
 * keyed ({@code PERSON_BY_ID} and {@code AUCTION_BY_SELLER} respectively) and co-grouped. For
 * every key that has a person, one {@code IdNameReserve} is emitted per auction, carrying the
 * person's id and name and the auction's reserve.
 *
 * @param events the incoming event stream
 * @return one {@code IdNameReserve} per (person, auction) match
 */
@Override
public PCollection<IdNameReserve> expand(PCollection<Event> events) {
    Duration windowSize = Duration.standardSeconds(configuration.windowSizeSec);

    // Window new people and key them by their id.
    PCollection<KV<Long, Person>> personsById =
        events
            .apply(NexmarkQueryUtil.JUST_NEW_PERSONS)
            .apply("Query8.WindowPersons", Window.into(FixedWindows.of(windowSize)))
            .apply("PersonById", NexmarkQueryUtil.PERSON_BY_ID);

    // Window new auctions and key them by seller (AUCTION_BY_SELLER).
    PCollection<KV<Long, Auction>> auctionsBySeller =
        events
            .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
            .apply("Query8.WindowAuctions", Window.into(FixedWindows.of(windowSize)))
            .apply("AuctionBySeller", NexmarkQueryUtil.AUCTION_BY_SELLER);

    // Join both sides, then project person id, person name and auction reserve price.
    PCollection<KV<Long, CoGbkResult>> personsWithAuctions =
        KeyedPCollectionTuple.of(NexmarkQueryUtil.PERSON_TAG, personsById)
            .and(NexmarkQueryUtil.AUCTION_TAG, auctionsBySeller)
            .apply(CoGroupByKey.create());

    return personsWithAuctions.apply(
        name + ".Select",
        ParDo.of(
            new DoFn<KV<Long, CoGbkResult>, IdNameReserve>() {

                @ProcessElement
                public void processElement(ProcessContext c) {
                    CoGbkResult joined = c.element().getValue();
                    @Nullable Person person = joined.getOnly(NexmarkQueryUtil.PERSON_TAG, null);
                    // A missing person means none was created in this window: emit nothing.
                    if (person != null) {
                        for (Auction auction : joined.getAll(NexmarkQueryUtil.AUCTION_TAG)) {
                            c.output(new IdNameReserve(person.id, person.name, auction.reserve));
                        }
                    }
                }
            }));
}
Also used : DoFn(org.apache.beam.sdk.transforms.DoFn) Auction(org.apache.beam.sdk.nexmark.model.Auction) KV(org.apache.beam.sdk.values.KV) IdNameReserve(org.apache.beam.sdk.nexmark.model.IdNameReserve) Person(org.apache.beam.sdk.nexmark.model.Person) Nullable(org.checkerframework.checker.nullness.qual.Nullable) CoGbkResult(org.apache.beam.sdk.transforms.join.CoGbkResult)

Example 5 with CoGbkResult

use of org.apache.beam.sdk.transforms.join.CoGbkResult in project beam by apache.

the class BigQueryIOStorageReadTableRowIT method runPipeline.

/**
 * Reads the input table twice, once with {@code Method.EXPORT} and once with
 * {@code Method.DIRECT_READ}, and asserts that both reads yield the same rows.
 *
 * <p>Rows from both reads are mapped to key/value pairs by {@code TableRowToKVPairFn} and
 * co-grouped by key. For each key, rows that end up on only one side are collected and emitted;
 * the pipeline asserts that nothing is emitted, then runs and waits for completion.
 *
 * @param pipelineOptions options providing, among others, the input table to read
 */
private static void runPipeline(BigQueryIOStorageReadTableRowOptions pipelineOptions) {
    Pipeline pipeline = Pipeline.create(pipelineOptions);

    PCollection<KV<String, String>> jsonTableRowsFromExport =
        pipeline
            .apply(
                "ExportTable",
                BigQueryIO.readTableRows()
                    .from(pipelineOptions.getInputTable())
                    .withMethod(Method.EXPORT))
            .apply("MapExportedRows", MapElements.via(new TableRowToKVPairFn()));

    PCollection<KV<String, String>> jsonTableRowsFromDirectRead =
        pipeline
            .apply(
                "DirectReadTable",
                BigQueryIO.readTableRows()
                    .from(pipelineOptions.getInputTable())
                    .withMethod(Method.DIRECT_READ))
            .apply("MapDirectReadRows", MapElements.via(new TableRowToKVPairFn()));

    final TupleTag<String> exportTag = new TupleTag<>();
    final TupleTag<String> directReadTag = new TupleTag<>();

    PCollection<KV<String, CoGbkResult>> rowsByKey =
        KeyedPCollectionTuple.of(exportTag, jsonTableRowsFromExport)
            .and(directReadTag, jsonTableRowsFromDirectRead)
            .apply(CoGroupByKey.create());

    PCollection<KV<String, Set<String>>> unmatchedRows =
        rowsByKey.apply(
            ParDo.of(
                new DoFn<KV<String, CoGbkResult>, KV<String, Set<String>>>() {

                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                        KV<String, CoGbkResult> element = c.element();
                        CoGbkResult grouped = element.getValue();

                        // Start from every exported row for this key.
                        Set<String> difference = new HashSet<>();
                        grouped.getAll(exportTag).forEach(difference::add);

                        // Toggle each direct-read row: drop it when already present,
                        // otherwise add it, leaving the rows found on one side only.
                        for (String row : grouped.getAll(directReadTag)) {
                            if (!difference.remove(row)) {
                                difference.add(row);
                            }
                        }

                        // Only keys with leftover rows produce output.
                        if (!difference.isEmpty()) {
                            c.output(KV.of(element.getKey(), difference));
                        }
                    }
                }));

    PAssert.that(unmatchedRows).empty();
    pipeline.run().waitUntilFinish();
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) TupleTag(org.apache.beam.sdk.values.TupleTag) KV(org.apache.beam.sdk.values.KV) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) CoGbkResult(org.apache.beam.sdk.transforms.join.CoGbkResult) DoFn(org.apache.beam.sdk.transforms.DoFn) HashSet(java.util.HashSet)

Aggregations

CoGbkResult (org.apache.beam.sdk.transforms.join.CoGbkResult)5 KV (org.apache.beam.sdk.values.KV)5 DoFn (org.apache.beam.sdk.transforms.DoFn)4 TupleTag (org.apache.beam.sdk.values.TupleTag)3 Auction (org.apache.beam.sdk.nexmark.model.Auction)2 TestPipeline (org.apache.beam.sdk.testing.TestPipeline)2 Nullable (org.checkerframework.checker.nullness.qual.Nullable)2 HashSet (java.util.HashSet)1 Set (java.util.Set)1 ReuseSparkContextRule (org.apache.beam.runners.spark.ReuseSparkContextRule)1 SparkPipelineOptions (org.apache.beam.runners.spark.SparkPipelineOptions)1 StreamingTest (org.apache.beam.runners.spark.StreamingTest)1 CreateStream (org.apache.beam.runners.spark.io.CreateStream)1 Pipeline (org.apache.beam.sdk.Pipeline)1 KvCoder (org.apache.beam.sdk.coders.KvCoder)1 VarIntCoder (org.apache.beam.sdk.coders.VarIntCoder)1 Counter (org.apache.beam.sdk.metrics.Counter)1 AuctionBid (org.apache.beam.sdk.nexmark.model.AuctionBid)1 Bid (org.apache.beam.sdk.nexmark.model.Bid)1 IdNameReserve (org.apache.beam.sdk.nexmark.model.IdNameReserve)1