Search in sources:

Example 61 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class BigQueryIOTest method testWritePartition.

/**
 * Drives a {@code WritePartition} DoFn through {@code DoFnTester} and checks the emitted
 * partitions and the files grouped under each table.
 *
 * @param numTables number of destination tables to generate input files for
 * @param numFilesPerTable number of input files generated per table
 * @param fileSize size reported for every generated file
 * @param expectedNumPartitionsPerTable number of partitions expected per table; values greater
 *     than one make the test read the multi-partition output, otherwise the single-partition one
 * @throws Exception if creating the temp folder or running the DoFn fails
 */
private void testWritePartition(long numTables, long numFilesPerTable, long fileSize, long expectedNumPartitionsPerTable) throws Exception {
    p.enableAbandonedNodeEnforcement(false);

    // With a static destination (i.e. not a dynamic table function) and no input data,
    // WritePartition generates an empty table. That path is exercised when exactly one table
    // with zero files is requested.
    final boolean emptySingleton = (numTables == 1) && (numFilesPerTable == 0);

    // Keys the DoFn is expected to emit.
    List<ShardedKey<String>> wantedKeys = Lists.newArrayList();
    if (!emptySingleton) {
        for (int tableIdx = 0; tableIdx < numTables; tableIdx++) {
            String destination = String.format("project-id:dataset-id.tables%05d", tableIdx);
            for (int shard = 1; shard <= expectedNumPartitionsPerTable; shard++) {
                wantedKeys.add(ShardedKey.of(destination, shard));
            }
        }
    } else {
        wantedKeys.add(ShardedKey.<String>of(null, 1));
    }

    // Input side: one Result per generated file, plus the expected per-table file listing.
    List<WriteBundlesToFiles.Result<String>> inputFiles = Lists.newArrayList();
    Map<String, List<String>> wantedFilesByTable = Maps.newHashMap();
    for (int tableIdx = 0; tableIdx < numTables; tableIdx++) {
        String destination = String.format("project-id:dataset-id.tables%05d", tableIdx);
        if (!wantedFilesByTable.containsKey(destination)) {
            wantedFilesByTable.put(destination, Lists.<String>newArrayList());
        }
        List<String> namesForTable = wantedFilesByTable.get(destination);
        for (int fileIdx = 0; fileIdx < numFilesPerTable; fileIdx++) {
            String name = String.format("%s_files%05d", destination, fileIdx);
            namesForTable.add(name);
            inputFiles.add(new Result<>(name, fileSize, destination));
        }
    }

    // Anonymous subclasses so the tags keep their generic type information.
    TupleTag<KV<ShardedKey<String>, List<String>>> multiPartitionsTag =
            new TupleTag<KV<ShardedKey<String>, List<String>>>("multiPartitionsTag") {
            };
    TupleTag<KV<ShardedKey<String>, List<String>>> singlePartitionTag =
            new TupleTag<KV<ShardedKey<String>, List<String>>>("singlePartitionTag") {
            };

    // Side-input views handed to the DoFn under test.
    PCollectionView<Iterable<WriteBundlesToFiles.Result<String>>> resultsView =
            p.apply(Create.of(inputFiles).withCoder(WriteBundlesToFiles.ResultCoder.of(StringUtf8Coder.of())))
                    .apply(View.<WriteBundlesToFiles.Result<String>>asIterable());
    String tempFilePrefix = testFolder.newFolder("BigQueryIOTest").getAbsolutePath();
    PCollectionView<String> tempFilePrefixView =
            p.apply(Create.of(tempFilePrefix)).apply(View.<String>asSingleton());

    WritePartition<String> writePartition =
            new WritePartition<>(emptySingleton, tempFilePrefixView, resultsView, multiPartitionsTag, singlePartitionTag);
    DoFnTester<Void, KV<ShardedKey<String>, List<String>>> tester = DoFnTester.of(writePartition);
    tester.setSideInput(resultsView, GlobalWindow.INSTANCE, inputFiles);
    tester.setSideInput(tempFilePrefixView, GlobalWindow.INSTANCE, tempFilePrefix);
    tester.processElement(null);

    // More than one expected partition per table means the output is read from the multi tag.
    TupleTag<KV<ShardedKey<String>, List<String>>> outputTag =
            expectedNumPartitionsPerTable > 1 ? multiPartitionsTag : singlePartitionTag;
    List<KV<ShardedKey<String>, List<String>>> emitted = tester.takeOutputElements(outputTag);

    // Collect the emitted keys and regroup the emitted file names by table.
    List<ShardedKey<String>> actualKeys = Lists.newArrayList();
    Map<String, List<String>> actualFilesByTable = Maps.newHashMap();
    for (KV<ShardedKey<String>, List<String>> entry : emitted) {
        ShardedKey<String> key = entry.getKey();
        String table = key.getKey();
        actualKeys.add(key);
        if (!actualFilesByTable.containsKey(table)) {
            actualFilesByTable.put(table, Lists.<String>newArrayList());
        }
        actualFilesByTable.get(table).addAll(entry.getValue());
    }

    assertThat(actualKeys, containsInAnyOrder(Iterables.toArray(wantedKeys, ShardedKey.class)));

    if (!emptySingleton) {
        assertEquals(wantedFilesByTable, actualFilesByTable);
        return;
    }

    // Singleton path: exactly one table entry whose first file exists and is zero bytes long.
    assertEquals(1, actualFilesByTable.size());
    List<String> onlyTableFiles = actualFilesByTable.values().iterator().next();
    String emptyFile = onlyTableFiles.get(0);
    assertTrue(Files.exists(Paths.get(emptyFile)));
    assertThat(Files.readAllBytes(Paths.get(emptyFile)).length, Matchers.equalTo(0));
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) KV(org.apache.beam.sdk.values.KV) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList)

Example 62 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class BigQueryIOTest method testValidateReadSetsDefaultProject.

/**
 * Reads a table through {@code BigQueryIO.read()} using a {@code TableReference} that carries no
 * project id, while the pipeline options carry one, and checks that the rows stored in the fake
 * dataset service come back as name/number pairs.
 */
@Test
public void testValidateReadSetsDefaultProject() throws Exception {
    final String project = "someproject";
    final String dataset = "somedataset";
    final String table = "sometable";

    // Pipeline options: the project is set here, not on the reference used for reading.
    BigQueryOptions options = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
    options.setProject(project);
    Path tempDir = Files.createTempDirectory(tempFolder, "testValidateReadSetsDefaultProject");
    options.setTempLocation(tempDir.toString());

    // Fake dataset service holding a fully-qualified table with a two-column schema.
    FakeDatasetService datasets = new FakeDatasetService();
    datasets.createDataset(project, dataset, "", "");
    TableReference qualifiedRef = new TableReference()
            .setProjectId(project)
            .setDatasetId(dataset)
            .setTableId(table);
    List<TableFieldSchema> columns = ImmutableList.of(
            new TableFieldSchema().setName("name").setType("STRING"),
            new TableFieldSchema().setName("number").setType("INTEGER"));
    TableSchema schema = new TableSchema().setFields(columns);
    datasets.createTable(new Table().setTableReference(qualifiedRef).setSchema(schema));

    FakeBigQueryServices services = new FakeBigQueryServices()
            .withJobService(new FakeJobService())
            .withDatasetService(datasets);

    List<TableRow> storedRows = ImmutableList.of(
            new TableRow().set("name", "a").set("number", 1L),
            new TableRow().set("name", "b").set("number", 2L),
            new TableRow().set("name", "c").set("number", 3L),
            new TableRow().set("name", "d").set("number", 4L),
            new TableRow().set("name", "e").set("number", 5L),
            new TableRow().set("name", "f").set("number", 6L));
    datasets.insertAll(qualifiedRef, storedRows, null);

    Pipeline pipeline = TestPipeline.create(options);

    // Reference used for the read: dataset and table only, no project id.
    TableReference unqualifiedRef = new TableReference().setDatasetId(dataset).setTableId(table);

    // Maps each row read to (name, number); the "number" value is cast to String and parsed.
    DoFn<TableRow, KV<String, Long>> toNameAndNumber = new DoFn<TableRow, KV<String, Long>>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            String name = (String) c.element().get("name");
            Long number = Long.valueOf((String) c.element().get("number"));
            c.output(KV.of(name, number));
        }
    };

    PCollection<KV<String, Long>> pairs = pipeline
            .apply(BigQueryIO.read().from(unqualifiedRef).withTestServices(services))
            .apply(ParDo.of(toNameAndNumber));

    PAssert.that(pairs).containsInAnyOrder(ImmutableList.of(
            KV.of("a", 1L),
            KV.of("b", 2L),
            KV.of("c", 3L),
            KV.of("d", 4L),
            KV.of("e", 5L),
            KV.of("f", 6L)));
    pipeline.run();
}
Also used : Path(java.nio.file.Path) HashBasedTable(com.google.common.collect.HashBasedTable) Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) KV(org.apache.beam.sdk.values.KV) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) BigQueryHelpers.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) TableReference(com.google.api.services.bigquery.model.TableReference) DoFn(org.apache.beam.sdk.transforms.DoFn) TableRow(com.google.api.services.bigquery.model.TableRow) Test(org.junit.Test)

Example 63 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class LeaderBoardTest method testTeamScoresDroppablyLate.

/**
 * Verifies the handling of elements that show up after the maximum allowed lateness has
 * passed. Such elements are dropped within {@link CalculateTeamScores} and must not change the
 * final result.
 */
@Test
public void testTeamScoresDroppablyLate() {
    String redTeam = TestUser.RED_ONE.getTeam();
    String blueTeam = TestUser.BLUE_ONE.getTeam();
    BoundedWindow teamWindow = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION);

    Duration atWindowStart = Duration.ZERO;
    Duration fiveSecondsBeforeWindowEnd = TEAM_WINDOW_DURATION.minus(Duration.standardSeconds(5));

    TestStream<GameActionInfo> events =
            TestStream.create(AvroCoder.of(GameActionInfo.class))
                    // First batch, added before the watermark is moved.
                    .addElements(
                            event(TestUser.BLUE_ONE, 12, atWindowStart),
                            event(TestUser.RED_ONE, 3, atWindowStart))
                    .advanceWatermarkTo(teamWindow.maxTimestamp())
                    // Second batch, added once the watermark sits at the window's max timestamp.
                    .addElements(
                            event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)),
                            event(TestUser.BLUE_TWO, 3, atWindowStart),
                            event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(3)))
                    // Watermark moves one minute past base time + allowed lateness + window duration.
                    .advanceWatermarkTo(
                            baseTime
                                    .plus(ALLOWED_LATENESS)
                                    .plus(TEAM_WINDOW_DURATION)
                                    .plus(Duration.standardMinutes(1)))
                    // Last batch: the elements this test expects to be dropped.
                    .addElements(
                            event(TestUser.BLUE_TWO, 3, fiveSecondsBeforeWindowEnd),
                            event(TestUser.RED_ONE, 7, Duration.standardMinutes(4)))
                    .advanceWatermarkToInfinity();

    PCollection<KV<String, Integer>> scores =
            p.apply(events).apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS));

    // Only one on-time pane and no late panes should be emitted
    PAssert.that(scores)
            .inWindow(teamWindow)
            .containsInAnyOrder(KV.of(redTeam, 7), KV.of(blueTeam, 18));
    // No elements are added before the watermark passes the end of the window plus the allowed
    // lateness, so no refinement should be emitted
    PAssert.that(scores).inFinalPane(teamWindow).empty();

    p.run().waitUntilFinish();
}
Also used : GameActionInfo(org.apache.beam.examples.complete.game.UserScore.GameActionInfo) CalculateTeamScores(org.apache.beam.examples.complete.game.LeaderBoard.CalculateTeamScores) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) KV(org.apache.beam.sdk.values.KV) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Test(org.junit.Test)

Example 64 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class LeaderBoardTest method testTeamScoresObservablyLate.

/**
 * Verifies the handling of late data that is still within the allowed lateness: elements that
 * arrive behind the watermark after it has passed the end of the window, but before the maximum
 * allowed lateness is reached. These elements are emitted in a late pane.
 */
@Test
public void testTeamScoresObservablyLate() {
    Instant firstWindowCloses = baseTime.plus(ALLOWED_LATENESS).plus(TEAM_WINDOW_DURATION);
    Instant threeMinutesIn = baseTime.plus(Duration.standardMinutes(3));
    Instant oneMinuteBeforeClose = firstWindowCloses.minus(Duration.standardMinutes(1));

    TestStream<GameActionInfo> createEvents =
            TestStream.create(AvroCoder.of(GameActionInfo.class))
                    .advanceWatermarkTo(baseTime)
                    // Added while the watermark is at the base time.
                    .addElements(
                            event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)),
                            event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(8)))
                    .advanceProcessingTime(Duration.standardMinutes(10))
                    .advanceWatermarkTo(threeMinutesIn)
                    // Added while the watermark is three minutes past the base time.
                    .addElements(
                            event(TestUser.RED_ONE, 3, Duration.standardMinutes(1)),
                            event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)),
                            event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(5)))
                    .advanceWatermarkTo(oneMinuteBeforeClose)
                    // Added one minute before firstWindowCloses.
                    .addElements(
                            event(TestUser.RED_TWO, 2, Duration.ZERO),
                            event(TestUser.RED_TWO, 5, Duration.standardMinutes(1)),
                            event(TestUser.RED_TWO, 3, Duration.standardMinutes(3)))
                    .advanceProcessingTime(Duration.standardMinutes(12))
                    // Added after a further 12 minutes of processing time, same watermark.
                    .addElements(
                            event(TestUser.RED_TWO, 9, Duration.standardMinutes(1)),
                            event(TestUser.RED_TWO, 1, Duration.standardMinutes(3)))
                    .advanceWatermarkToInfinity();

    PCollection<KV<String, Integer>> scores =
            p.apply(createEvents).apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS));

    BoundedWindow teamWindow = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION);
    String blueTeam = TestUser.BLUE_ONE.getTeam();
    String redTeam = TestUser.RED_ONE.getTeam();

    // Across all panes of the window, both of these totals must appear.
    SerializableFunction<Iterable<KV<String, Integer>>, Void> containsBothTotals = emitted -> {
        assertThat(emitted, hasItem(KV.of(blueTeam, 11)));
        assertThat(emitted, hasItem(KV.of(redTeam, 27)));
        return null;
    };
    PAssert.that(scores).inWindow(teamWindow).satisfies(containsBothTotals);

    PAssert.thatMap(scores)
            .inOnTimePane(teamWindow)
            .isEqualTo(ImmutableMap.of(redTeam, 7, blueTeam, 11));

    // No final pane is emitted for the blue team, as all of their updates have been taken into
    // account in earlier panes
    PAssert.that(scores).inFinalPane(teamWindow).containsInAnyOrder(KV.of(redTeam, 27));

    p.run().waitUntilFinish();
}
Also used : KV(org.apache.beam.sdk.values.KV) GameActionInfo(org.apache.beam.examples.complete.game.UserScore.GameActionInfo) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) PTransform(org.apache.beam.sdk.transforms.PTransform) Assert.assertThat(org.junit.Assert.assertThat) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) ImmutableMap(com.google.common.collect.ImmutableMap) PAssert(org.apache.beam.sdk.testing.PAssert) CalculateTeamScores(org.apache.beam.examples.complete.game.LeaderBoard.CalculateTeamScores) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) AvroCoder(org.apache.beam.sdk.coders.AvroCoder) Serializable(java.io.Serializable) Matchers.hasItem(org.hamcrest.Matchers.hasItem) Rule(org.junit.Rule) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Instant(org.joda.time.Instant) CalculateUserScores(org.apache.beam.examples.complete.game.LeaderBoard.CalculateUserScores) TestStream(org.apache.beam.sdk.testing.TestStream) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) GameActionInfo(org.apache.beam.examples.complete.game.UserScore.GameActionInfo) Instant(org.joda.time.Instant) CalculateTeamScores(org.apache.beam.examples.complete.game.LeaderBoard.CalculateTeamScores) KV(org.apache.beam.sdk.values.KV) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Test(org.junit.Test)

Example 65 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class UserScoreTest method testTeamScoreSums.

/**
 * Tests ExtractAndSumScore("team"): feeds {@code GAME_EVENTS} through {@code ParseEventFn} and
 * the transform, then compares the output with {@code TEAM_SUMS}.
 */
@Test
@Category(ValidatesRunner.class)
public void testTeamScoreSums() throws Exception {
    ParseEventFn parseEvents = new ParseEventFn();
    ExtractAndSumScore sumByTeam = new ExtractAndSumScore("team");

    PCollection<KV<String, Integer>> teamTotals =
            p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of()))
                    .apply(ParDo.of(parseEvents))
                    .apply("ExtractTeamScore", sumByTeam);

    // The per-team totals must match the expected sums, in any order.
    PAssert.that(teamTotals).containsInAnyOrder(TEAM_SUMS);

    p.run().waitUntilFinish();
}
Also used : ExtractAndSumScore(org.apache.beam.examples.complete.game.UserScore.ExtractAndSumScore) KV(org.apache.beam.sdk.values.KV) ParseEventFn(org.apache.beam.examples.complete.game.UserScore.ParseEventFn) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Aggregations

KV (org.apache.beam.sdk.values.KV): 192; Test (org.junit.Test): 143; Instant (org.joda.time.Instant): 66; Category (org.junit.experimental.categories.Category): 62; Pipeline (org.apache.beam.sdk.Pipeline): 35; IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 34; StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString): 33; Matchers.containsString (org.hamcrest.Matchers.containsString): 33; StateSpec (org.apache.beam.sdk.state.StateSpec): 25; BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 22; ArrayList (java.util.ArrayList): 19; WindowedValue (org.apache.beam.sdk.util.WindowedValue): 19; TupleTag (org.apache.beam.sdk.values.TupleTag): 16; TableRow (com.google.api.services.bigquery.model.TableRow): 15; Map (java.util.Map): 15; ValueState (org.apache.beam.sdk.state.ValueState): 15; List (java.util.List): 14; ImmutableList (com.google.common.collect.ImmutableList): 12; HashMap (java.util.HashMap): 12; Timer (org.apache.beam.sdk.state.Timer): 12