Search in sources :

Example 56 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class WatermarkManagerTest method updateWatermarkWithLateData.

/**
   * Demonstrates that updateWatermarks in the presence of late data is monotonic.
   */
@Test
public void updateWatermarkWithLateData() {
    Instant sourceWatermark = new Instant(1_000_000L);
    CommittedBundle<Integer> createdBundle = timestampedBundle(createdInts, TimestampedValue.of(1, sourceWatermark), TimestampedValue.of(2, new Instant(1234L)));
    manager.updateWatermarks(null, TimerUpdate.empty(), result(graph.getProducer(createdInts), null, Collections.<CommittedBundle<?>>singleton(createdBundle)), sourceWatermark);
    CommittedBundle<KV<String, Integer>> keyBundle = timestampedBundle(keyed, TimestampedValue.of(KV.of("MyKey", 1), sourceWatermark), TimestampedValue.of(KV.of("MyKey", 2), new Instant(1234L)));
    // Finish processing the on-time data. The watermarks should progress to be equal to the source
    manager.updateWatermarks(createdBundle, TimerUpdate.empty(), result(graph.getProducer(keyed), createdBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()), Collections.<CommittedBundle<?>>singleton(keyBundle)), BoundedWindow.TIMESTAMP_MAX_VALUE);
    manager.refreshAll();
    TransformWatermarks onTimeWatermarks = manager.getWatermarks(graph.getProducer(keyed));
    assertThat(onTimeWatermarks.getInputWatermark(), equalTo(sourceWatermark));
    assertThat(onTimeWatermarks.getOutputWatermark(), equalTo(sourceWatermark));
    CommittedBundle<Integer> lateDataBundle = timestampedBundle(createdInts, TimestampedValue.of(3, new Instant(-1000L)));
    // the late data arrives in a downstream PCollection after its watermark has advanced past it;
    // we don't advance the watermark past the current watermark until we've consumed the late data
    manager.updateWatermarks(null, TimerUpdate.empty(), result(graph.getProducer(createdInts), createdBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()), Collections.<CommittedBundle<?>>singleton(lateDataBundle)), new Instant(2_000_000L));
    manager.refreshAll();
    TransformWatermarks bufferedLateWm = manager.getWatermarks(graph.getProducer(createdInts));
    assertThat(bufferedLateWm.getOutputWatermark(), equalTo(new Instant(2_000_000L)));
    // The input watermark should be held to its previous value (not advanced due to late data; not
    // moved backwards in the presence of watermarks due to monotonicity).
    TransformWatermarks lateDataBufferedWatermark = manager.getWatermarks(graph.getProducer(keyed));
    assertThat(lateDataBufferedWatermark.getInputWatermark(), not(earlierThan(sourceWatermark)));
    assertThat(lateDataBufferedWatermark.getOutputWatermark(), not(earlierThan(sourceWatermark)));
    CommittedBundle<KV<String, Integer>> lateKeyedBundle = timestampedBundle(keyed, TimestampedValue.of(KV.of("MyKey", 3), new Instant(-1000L)));
    manager.updateWatermarks(lateDataBundle, TimerUpdate.empty(), result(graph.getProducer(keyed), lateDataBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()), Collections.<CommittedBundle<?>>singleton(lateKeyedBundle)), BoundedWindow.TIMESTAMP_MAX_VALUE);
    manager.refreshAll();
}
Also used : TransformWatermarks(org.apache.beam.runners.direct.WatermarkManager.TransformWatermarks) ReadableInstant(org.joda.time.ReadableInstant) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Test(org.junit.Test)

Example 57 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class WatermarkManagerTest method updateWatermarkWithUnprocessedElements.

@Test
public void updateWatermarkWithUnprocessedElements() {
    WindowedValue<Integer> first = WindowedValue.valueInGlobalWindow(1);
    WindowedValue<Integer> second = WindowedValue.timestampedValueInGlobalWindow(2, new Instant(-1000L));
    WindowedValue<Integer> third = WindowedValue.timestampedValueInGlobalWindow(3, new Instant(1234L));
    CommittedBundle<Integer> createdBundle = bundleFactory.createBundle(createdInts).add(first).add(second).add(third).commit(clock.now());
    manager.updateWatermarks(null, TimerUpdate.empty(), result(graph.getProducer(createdInts), null, Collections.<CommittedBundle<?>>singleton(createdBundle)), BoundedWindow.TIMESTAMP_MAX_VALUE);
    CommittedBundle<KV<String, Integer>> keyBundle = timestampedBundle(keyed, TimestampedValue.of(KV.of("MyKey", 1), BoundedWindow.TIMESTAMP_MIN_VALUE));
    manager.updateWatermarks(createdBundle, TimerUpdate.empty(), result(graph.getProducer(keyed), createdBundle.withElements(ImmutableList.of(second, third)), Collections.<CommittedBundle<?>>singleton(keyBundle)), BoundedWindow.TIMESTAMP_MAX_VALUE);
    TransformWatermarks keyedWatermarks = manager.getWatermarks(graph.getProducer(keyed));
    // the unprocessed second and third are readded to pending
    assertThat(keyedWatermarks.getInputWatermark(), not(laterThan(new Instant(-1000L))));
}
Also used : TransformWatermarks(org.apache.beam.runners.direct.WatermarkManager.TransformWatermarks) ReadableInstant(org.joda.time.ReadableInstant) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Test(org.junit.Test)

Example 58 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class WatermarkManagerTest method updateWatermarkWithWatermarkHolds.

/**
   * Demonstrates that the watermark of an {@link AppliedPTransform} is held to the provided
   * watermark hold.
   */
@Test
public void updateWatermarkWithWatermarkHolds() {
    CommittedBundle<Integer> createdBundle = timestampedBundle(createdInts, TimestampedValue.of(1, new Instant(1_000_000L)), TimestampedValue.of(2, new Instant(1234L)), TimestampedValue.of(3, new Instant(-1000L)));
    manager.updateWatermarks(null, TimerUpdate.empty(), result(graph.getProducer(createdInts), null, Collections.<CommittedBundle<?>>singleton(createdBundle)), new Instant(Long.MAX_VALUE));
    CommittedBundle<KV<String, Integer>> keyBundle = timestampedBundle(keyed, TimestampedValue.of(KV.of("MyKey", 1), new Instant(1_000_000L)), TimestampedValue.of(KV.of("MyKey", 2), new Instant(1234L)), TimestampedValue.of(KV.of("MyKey", 3), new Instant(-1000L)));
    manager.updateWatermarks(createdBundle, TimerUpdate.empty(), result(graph.getProducer(keyed), createdBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()), Collections.<CommittedBundle<?>>singleton(keyBundle)), new Instant(500L));
    manager.refreshAll();
    TransformWatermarks keyedWatermarks = manager.getWatermarks(graph.getProducer(keyed));
    assertThat(keyedWatermarks.getInputWatermark(), not(earlierThan(BoundedWindow.TIMESTAMP_MAX_VALUE)));
    assertThat(keyedWatermarks.getOutputWatermark(), not(laterThan(new Instant(500L))));
}
Also used : TransformWatermarks(org.apache.beam.runners.direct.WatermarkManager.TransformWatermarks) ReadableInstant(org.joda.time.ReadableInstant) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Test(org.junit.Test)

Example 59 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class KeyedPValueTrackingVisitorTest method keyedInputWithKeyPreserving.

@Test
public void keyedInputWithKeyPreserving() {
    PCollection<KV<String, WindowedValue<KV<String, Integer>>>> input = p.apply(Create.of(KV.of("hello", WindowedValue.of(KV.of("hello", 3), new Instant(0), new IntervalWindow(new Instant(0), new Instant(9)), PaneInfo.NO_FIRING))).withCoder(KvCoder.of(StringUtf8Coder.of(), WindowedValue.getValueOnlyCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())))));
    TupleTag<KeyedWorkItem<String, KV<String, Integer>>> keyedTag = new TupleTag<>();
    PCollection<KeyedWorkItem<String, KV<String, Integer>>> keyed = input.apply(new DirectGroupByKeyOnly<String, WindowedValue<KV<String, Integer>>>()).apply(new DirectGroupAlsoByWindow<String, WindowedValue<KV<String, Integer>>>(WindowingStrategy.globalDefault(), WindowingStrategy.globalDefault())).apply(ParDo.of(new ParDoMultiOverrideFactory.ToKeyedWorkItem<String, Integer>()).withOutputTags(keyedTag, TupleTagList.empty())).get(keyedTag).setCoder(KeyedWorkItemCoder.of(StringUtf8Coder.of(), KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()), GlobalWindow.Coder.INSTANCE));
    p.traverseTopologically(visitor);
    assertThat(visitor.getKeyedPValues(), hasItem(keyed));
}
Also used : Instant(org.joda.time.Instant) TupleTag(org.apache.beam.sdk.values.TupleTag) KV(org.apache.beam.sdk.values.KV) KeyedWorkItem(org.apache.beam.runners.core.KeyedWorkItem) DirectGroupAlsoByWindow(org.apache.beam.runners.direct.DirectGroupByKey.DirectGroupAlsoByWindow) WindowedValue(org.apache.beam.sdk.util.WindowedValue) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Test(org.junit.Test)

Example 60 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class BigQueryIOTest method testWriteTables.

@Test
public void testWriteTables() throws Exception {
    p.enableAbandonedNodeEnforcement(false);
    FakeDatasetService datasetService = new FakeDatasetService();
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(datasetService);
    datasetService.createDataset("project-id", "dataset-id", "", "");
    long numTables = 3;
    long numPartitions = 3;
    long numFilesPerPartition = 10;
    String jobIdToken = "jobIdToken";
    String stepUuid = "stepUuid";
    Map<TableDestination, List<String>> expectedTempTables = Maps.newHashMap();
    Path baseDir = Files.createTempDirectory(tempFolder, "testWriteTables");
    List<KV<ShardedKey<String>, List<String>>> partitions = Lists.newArrayList();
    for (int i = 0; i < numTables; ++i) {
        String tableName = String.format("project-id:dataset-id.table%05d", i);
        TableDestination tableDestination = new TableDestination(tableName, tableName);
        for (int j = 0; j < numPartitions; ++j) {
            String tempTableId = BigQueryHelpers.createJobId(jobIdToken, tableDestination, j);
            List<String> filesPerPartition = Lists.newArrayList();
            for (int k = 0; k < numFilesPerPartition; ++k) {
                String filename = Paths.get(baseDir.toString(), String.format("files0x%08x_%05d", tempTableId.hashCode(), k)).toString();
                ResourceId fileResource = FileSystems.matchNewResource(filename, false);
                try (WritableByteChannel channel = FileSystems.create(fileResource, MimeTypes.TEXT)) {
                    try (OutputStream output = Channels.newOutputStream(channel)) {
                        TableRow tableRow = new TableRow().set("name", tableName);
                        TableRowJsonCoder.of().encode(tableRow, output, Context.OUTER);
                        output.write("\n".getBytes(StandardCharsets.UTF_8));
                    }
                }
                filesPerPartition.add(filename);
            }
            partitions.add(KV.of(ShardedKey.of(tableDestination.getTableSpec(), j), filesPerPartition));
            List<String> expectedTables = expectedTempTables.get(tableDestination);
            if (expectedTables == null) {
                expectedTables = Lists.newArrayList();
                expectedTempTables.put(tableDestination, expectedTables);
            }
            String json = String.format("{\"datasetId\":\"dataset-id\",\"projectId\":\"project-id\",\"tableId\":\"%s\"}", tempTableId);
            expectedTables.add(json);
        }
    }
    PCollectionView<String> jobIdTokenView = p.apply("CreateJobId", Create.of("jobId")).apply(View.<String>asSingleton());
    PCollectionView<Map<String, String>> schemaMapView = p.apply("CreateEmptySchema", Create.empty(new TypeDescriptor<KV<String, String>>() {
    })).apply(View.<String, String>asMap());
    WriteTables<String> writeTables = new WriteTables<>(false, fakeBqServices, jobIdTokenView, schemaMapView, WriteDisposition.WRITE_EMPTY, CreateDisposition.CREATE_IF_NEEDED, new IdentityDynamicTables());
    DoFnTester<KV<ShardedKey<String>, List<String>>, KV<TableDestination, String>> tester = DoFnTester.of(writeTables);
    tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
    tester.setSideInput(schemaMapView, GlobalWindow.INSTANCE, ImmutableMap.<String, String>of());
    tester.getPipelineOptions().setTempLocation("tempLocation");
    for (KV<ShardedKey<String>, List<String>> partition : partitions) {
        tester.processElement(partition);
    }
    Map<TableDestination, List<String>> tempTablesResult = Maps.newHashMap();
    for (KV<TableDestination, String> element : tester.takeOutputElements()) {
        List<String> tables = tempTablesResult.get(element.getKey());
        if (tables == null) {
            tables = Lists.newArrayList();
            tempTablesResult.put(element.getKey(), tables);
        }
        tables.add(element.getValue());
    }
    assertEquals(expectedTempTables, tempTablesResult);
}
Also used : OutputStream(java.io.OutputStream) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Path(java.nio.file.Path) WritableByteChannel(java.nio.channels.WritableByteChannel) KV(org.apache.beam.sdk.values.KV) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) TableRow(com.google.api.services.bigquery.model.TableRow) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Test(org.junit.Test)

Aggregations

KV (org.apache.beam.sdk.values.KV)192 Test (org.junit.Test)143 Instant (org.joda.time.Instant)66 Category (org.junit.experimental.categories.Category)62 Pipeline (org.apache.beam.sdk.Pipeline)35 IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow)34 StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString)33 Matchers.containsString (org.hamcrest.Matchers.containsString)33 StateSpec (org.apache.beam.sdk.state.StateSpec)25 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)22 ArrayList (java.util.ArrayList)19 WindowedValue (org.apache.beam.sdk.util.WindowedValue)19 TupleTag (org.apache.beam.sdk.values.TupleTag)16 TableRow (com.google.api.services.bigquery.model.TableRow)15 Map (java.util.Map)15 ValueState (org.apache.beam.sdk.state.ValueState)15 List (java.util.List)14 ImmutableList (com.google.common.collect.ImmutableList)12 HashMap (java.util.HashMap)12 Timer (org.apache.beam.sdk.state.Timer)12