Example use of org.apache.beam.sdk.values.KV from the Apache Beam project (apache/beam).
Source: class WatermarkManagerTest, method updateWatermarkWithWatermarkHolds.
/**
 * Demonstrates that the watermark of an {@link AppliedPTransform} is held to the provided
 * watermark hold rather than advancing past it.
 */
@Test
public void updateWatermarkWithWatermarkHolds() {
  // Produce three timestamped elements from the root create transform.
  CommittedBundle<Integer> createdBundle =
      timestampedBundle(
          createdInts,
          TimestampedValue.of(1, new Instant(1_000_000L)),
          TimestampedValue.of(2, new Instant(1234L)),
          TimestampedValue.of(3, new Instant(-1000L)));
  manager.updateWatermarks(
      null,
      TimerUpdate.empty(),
      result(
          graph.getProducer(createdInts),
          null,
          Collections.<CommittedBundle<?>>singleton(createdBundle)),
      new Instant(Long.MAX_VALUE));

  // Consume the created bundle, emitting keyed output while requesting a
  // watermark hold at timestamp 500.
  CommittedBundle<KV<String, Integer>> keyBundle =
      timestampedBundle(
          keyed,
          TimestampedValue.of(KV.of("MyKey", 1), new Instant(1_000_000L)),
          TimestampedValue.of(KV.of("MyKey", 2), new Instant(1234L)),
          TimestampedValue.of(KV.of("MyKey", 3), new Instant(-1000L)));
  manager.updateWatermarks(
      createdBundle,
      TimerUpdate.empty(),
      result(
          graph.getProducer(keyed),
          createdBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()),
          Collections.<CommittedBundle<?>>singleton(keyBundle)),
      new Instant(500L));
  manager.refreshAll();

  // The input watermark is unaffected by the hold, but the output watermark
  // must not advance past the hold at 500.
  TransformWatermarks watermarks = manager.getWatermarks(graph.getProducer(keyed));
  assertThat(watermarks.getInputWatermark(), not(earlierThan(BoundedWindow.TIMESTAMP_MAX_VALUE)));
  assertThat(watermarks.getOutputWatermark(), not(laterThan(new Instant(500L))));
}
Example use of org.apache.beam.sdk.values.KV from the Apache Beam project (apache/beam).
Source: class KeyedPValueTrackingVisitorTest, method keyedInputWithKeyPreserving.
@Test
public void keyedInputWithKeyPreserving() {
  // Element coder for the inner KV<String, Integer>, shared by the input coder
  // and the final KeyedWorkItem coder.
  KvCoder<String, Integer> kvCoder = KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of());

  // A KV input whose value is itself a windowed KV, fed through the key-preserving
  // GroupByKeyOnly / GroupAlsoByWindow / ToKeyedWorkItem chain.
  PCollection<KV<String, WindowedValue<KV<String, Integer>>>> input =
      p.apply(
          Create.of(
                  KV.of(
                      "hello",
                      WindowedValue.of(
                          KV.of("hello", 3),
                          new Instant(0),
                          new IntervalWindow(new Instant(0), new Instant(9)),
                          PaneInfo.NO_FIRING)))
              .withCoder(
                  KvCoder.of(StringUtf8Coder.of(), WindowedValue.getValueOnlyCoder(kvCoder))));

  TupleTag<KeyedWorkItem<String, KV<String, Integer>>> keyedTag = new TupleTag<>();
  PCollection<KeyedWorkItem<String, KV<String, Integer>>> keyed =
      input
          .apply(new DirectGroupByKeyOnly<String, WindowedValue<KV<String, Integer>>>())
          .apply(
              new DirectGroupAlsoByWindow<String, WindowedValue<KV<String, Integer>>>(
                  WindowingStrategy.globalDefault(), WindowingStrategy.globalDefault()))
          .apply(
              ParDo.of(new ParDoMultiOverrideFactory.ToKeyedWorkItem<String, Integer>())
                  .withOutputTags(keyedTag, TupleTagList.empty()))
          .get(keyedTag)
          .setCoder(
              KeyedWorkItemCoder.of(StringUtf8Coder.of(), kvCoder, GlobalWindow.Coder.INSTANCE));

  // The visitor should classify the final collection as keyed.
  p.traverseTopologically(visitor);
  assertThat(visitor.getKeyedPValues(), hasItem(keyed));
}
Example use of org.apache.beam.sdk.values.KV from the Apache Beam project (apache/beam).
Source: class BigQueryIOTest, method testWriteTables.
/**
 * Verifies that {@link WriteTables} loads each partition of files into its own temporary table,
 * emitting one (destination, temp-table JSON) pair per partition.
 *
 * <p>Improvements over the previous version: removed the unused local {@code stepUuid}, and
 * replaced the duplicated get/null-check/put map pattern with {@link Map#computeIfAbsent}.
 */
@Test
public void testWriteTables() throws Exception {
  p.enableAbandonedNodeEnforcement(false);
  FakeDatasetService datasetService = new FakeDatasetService();
  FakeBigQueryServices fakeBqServices =
      new FakeBigQueryServices()
          .withJobService(new FakeJobService())
          .withDatasetService(datasetService);
  datasetService.createDataset("project-id", "dataset-id", "", "");
  long numTables = 3;
  long numPartitions = 3;
  long numFilesPerPartition = 10;
  String jobIdToken = "jobIdToken";
  Map<TableDestination, List<String>> expectedTempTables = Maps.newHashMap();
  Path baseDir = Files.createTempDirectory(tempFolder, "testWriteTables");

  // Build the input: for each (table, partition) pair, write a set of files each containing one
  // JSON-encoded TableRow, and record the temp table WriteTables is expected to create.
  List<KV<ShardedKey<String>, List<String>>> partitions = Lists.newArrayList();
  for (int i = 0; i < numTables; ++i) {
    String tableName = String.format("project-id:dataset-id.table%05d", i);
    TableDestination tableDestination = new TableDestination(tableName, tableName);
    for (int j = 0; j < numPartitions; ++j) {
      String tempTableId = BigQueryHelpers.createJobId(jobIdToken, tableDestination, j);
      List<String> filesPerPartition = Lists.newArrayList();
      for (int k = 0; k < numFilesPerPartition; ++k) {
        String filename =
            Paths.get(
                    baseDir.toString(),
                    String.format("files0x%08x_%05d", tempTableId.hashCode(), k))
                .toString();
        ResourceId fileResource = FileSystems.matchNewResource(filename, false);
        // try-with-resources guarantees the channel and stream are flushed and closed.
        try (WritableByteChannel channel = FileSystems.create(fileResource, MimeTypes.TEXT)) {
          try (OutputStream output = Channels.newOutputStream(channel)) {
            TableRow tableRow = new TableRow().set("name", tableName);
            TableRowJsonCoder.of().encode(tableRow, output, Context.OUTER);
            output.write("\n".getBytes(StandardCharsets.UTF_8));
          }
        }
        filesPerPartition.add(filename);
      }
      partitions.add(KV.of(ShardedKey.of(tableDestination.getTableSpec(), j), filesPerPartition));
      String json =
          String.format(
              "{\"datasetId\":\"dataset-id\",\"projectId\":\"project-id\",\"tableId\":\"%s\"}",
              tempTableId);
      // computeIfAbsent replaces the previous get/null-check/put pattern.
      expectedTempTables.computeIfAbsent(tableDestination, t -> Lists.newArrayList()).add(json);
    }
  }

  PCollectionView<String> jobIdTokenView =
      p.apply("CreateJobId", Create.of("jobId")).apply(View.<String>asSingleton());
  PCollectionView<Map<String, String>> schemaMapView =
      p.apply("CreateEmptySchema", Create.empty(new TypeDescriptor<KV<String, String>>() {
      })).apply(View.<String, String>asMap());
  WriteTables<String> writeTables =
      new WriteTables<>(
          false,
          fakeBqServices,
          jobIdTokenView,
          schemaMapView,
          WriteDisposition.WRITE_EMPTY,
          CreateDisposition.CREATE_IF_NEEDED,
          new IdentityDynamicTables());
  DoFnTester<KV<ShardedKey<String>, List<String>>, KV<TableDestination, String>> tester =
      DoFnTester.of(writeTables);
  tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
  tester.setSideInput(schemaMapView, GlobalWindow.INSTANCE, ImmutableMap.<String, String>of());
  tester.getPipelineOptions().setTempLocation("tempLocation");
  for (KV<ShardedKey<String>, List<String>> partition : partitions) {
    tester.processElement(partition);
  }

  // Regroup the actual outputs by destination table and compare with expectations.
  Map<TableDestination, List<String>> tempTablesResult = Maps.newHashMap();
  for (KV<TableDestination, String> element : tester.takeOutputElements()) {
    tempTablesResult
        .computeIfAbsent(element.getKey(), t -> Lists.newArrayList())
        .add(element.getValue());
  }
  assertEquals(expectedTempTables, tempTablesResult);
}
Example use of org.apache.beam.sdk.values.KV from the Apache Beam project (apache/beam).
Source: class BigQueryIOTest, method testWritePartition.
/**
 * Exercises {@link WritePartition} with the given table/file counts and verifies that files are
 * grouped into the expected number of partitions per destination table. With a single table and
 * no input files (the "singleton" case), a single empty partition with a null destination must
 * be produced instead.
 *
 * @param numTables number of destination tables to simulate
 * @param numFilesPerTable number of input files generated per table
 * @param fileSize size reported for each file, which drives partition splitting
 * @param expectedNumPartitionsPerTable expected partition count per table
 * @throws Exception if file or pipeline setup fails
 */
private void testWritePartition(
    long numTables, long numFilesPerTable, long fileSize, long expectedNumPartitionsPerTable)
    throws Exception {
  p.enableAbandonedNodeEnforcement(false);
  // In the case where a static destination is specified (i.e. not through a dynamic table
  // function) and there is no input data, WritePartition will generate an empty table. This
  // code is to test that path.
  boolean isSingleton = numTables == 1 && numFilesPerTable == 0;
  List<ShardedKey<String>> expectedPartitions = Lists.newArrayList();
  if (isSingleton) {
    // Empty-input case: exactly one partition, null destination key, shard number 1.
    expectedPartitions.add(ShardedKey.<String>of(null, 1));
  } else {
    for (int i = 0; i < numTables; ++i) {
      for (int j = 1; j <= expectedNumPartitionsPerTable; ++j) {
        String tableName = String.format("project-id:dataset-id.tables%05d", i);
        expectedPartitions.add(ShardedKey.of(tableName, j));
      }
    }
  }

  // Generate the input file results and remember which filenames belong to which table.
  List<WriteBundlesToFiles.Result<String>> files = Lists.newArrayList();
  Map<String, List<String>> filenamesPerTable = Maps.newHashMap();
  for (int i = 0; i < numTables; ++i) {
    String tableName = String.format("project-id:dataset-id.tables%05d", i);
    // computeIfAbsent replaces the previous get/null-check/put pattern.
    List<String> filenames =
        filenamesPerTable.computeIfAbsent(tableName, t -> Lists.newArrayList());
    for (int j = 0; j < numFilesPerTable; ++j) {
      String fileName = String.format("%s_files%05d", tableName, j);
      filenames.add(fileName);
      files.add(new Result<>(fileName, fileSize, tableName));
    }
  }

  TupleTag<KV<ShardedKey<String>, List<String>>> multiPartitionsTag =
      new TupleTag<KV<ShardedKey<String>, List<String>>>("multiPartitionsTag") {
      };
  TupleTag<KV<ShardedKey<String>, List<String>>> singlePartitionTag =
      new TupleTag<KV<ShardedKey<String>, List<String>>>("singlePartitionTag") {
      };
  PCollectionView<Iterable<WriteBundlesToFiles.Result<String>>> resultsView =
      p.apply(
              Create.of(files)
                  .withCoder(WriteBundlesToFiles.ResultCoder.of(StringUtf8Coder.of())))
          .apply(View.<WriteBundlesToFiles.Result<String>>asIterable());
  String tempFilePrefix = testFolder.newFolder("BigQueryIOTest").getAbsolutePath();
  PCollectionView<String> tempFilePrefixView =
      p.apply(Create.of(tempFilePrefix)).apply(View.<String>asSingleton());
  WritePartition<String> writePartition =
      new WritePartition<>(
          isSingleton, tempFilePrefixView, resultsView, multiPartitionsTag, singlePartitionTag);
  DoFnTester<Void, KV<ShardedKey<String>, List<String>>> tester = DoFnTester.of(writePartition);
  tester.setSideInput(resultsView, GlobalWindow.INSTANCE, files);
  tester.setSideInput(tempFilePrefixView, GlobalWindow.INSTANCE, tempFilePrefix);
  tester.processElement(null);

  // Output lands on the multi-partition tag only when tables split into more than one partition.
  List<KV<ShardedKey<String>, List<String>>> partitions;
  if (expectedNumPartitionsPerTable > 1) {
    partitions = tester.takeOutputElements(multiPartitionsTag);
  } else {
    partitions = tester.takeOutputElements(singlePartitionTag);
  }

  // Collect emitted partition keys and regroup the emitted files by destination table.
  List<ShardedKey<String>> partitionsResult = Lists.newArrayList();
  Map<String, List<String>> filesPerTableResult = Maps.newHashMap();
  for (KV<ShardedKey<String>, List<String>> partition : partitions) {
    String table = partition.getKey().getKey();
    partitionsResult.add(partition.getKey());
    filesPerTableResult
        .computeIfAbsent(table, t -> Lists.newArrayList())
        .addAll(partition.getValue());
  }
  assertThat(
      partitionsResult,
      containsInAnyOrder(Iterables.toArray(expectedPartitions, ShardedKey.class)));
  if (isSingleton) {
    // The singleton partition must reference exactly one generated, zero-length file.
    assertEquals(1, filesPerTableResult.size());
    List<String> singletonFiles = filesPerTableResult.values().iterator().next();
    assertTrue(Files.exists(Paths.get(singletonFiles.get(0))));
    assertThat(Files.readAllBytes(Paths.get(singletonFiles.get(0))).length, Matchers.equalTo(0));
  } else {
    assertEquals(filenamesPerTable, filesPerTableResult);
  }
}
Example use of org.apache.beam.sdk.values.KV from the Apache Beam project (apache/beam).
Source: class BigQueryIOTest, method testValidateReadSetsDefaultProject.
@Test
public void testValidateReadSetsDefaultProject() throws Exception {
  // The table reference passed to the read deliberately omits the project id; the
  // pipeline options' default project must be applied instead.
  String projectId = "someproject";
  String datasetId = "somedataset";
  String tableId = "sometable";

  BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
  bqOptions.setProject(projectId);
  Path baseDir = Files.createTempDirectory(tempFolder, "testValidateReadSetsDefaultProject");
  bqOptions.setTempLocation(baseDir.toString());

  // Seed the fake services with a table holding six (name, number) rows.
  FakeDatasetService fakeDatasetService = new FakeDatasetService();
  fakeDatasetService.createDataset(projectId, datasetId, "", "");
  TableReference tableReference =
      new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId);
  TableSchema schema =
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("name").setType("STRING"),
                  new TableFieldSchema().setName("number").setType("INTEGER")));
  fakeDatasetService.createTable(new Table().setTableReference(tableReference).setSchema(schema));
  FakeBigQueryServices fakeBqServices =
      new FakeBigQueryServices()
          .withJobService(new FakeJobService())
          .withDatasetService(fakeDatasetService);
  List<TableRow> expected =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));
  fakeDatasetService.insertAll(tableReference, expected, null);

  Pipeline p = TestPipeline.create(bqOptions);
  // No project id set here: it must come from the pipeline options' default.
  TableReference tableRef = new TableReference().setDatasetId(datasetId).setTableId(tableId);
  PCollection<KV<String, Long>> output =
      p.apply(BigQueryIO.read().from(tableRef).withTestServices(fakeBqServices))
          .apply(
              ParDo.of(
                  new DoFn<TableRow, KV<String, Long>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                      c.output(
                          KV.of(
                              (String) c.element().get("name"),
                              Long.valueOf((String) c.element().get("number"))));
                    }
                  }));
  PAssert.that(output)
      .containsInAnyOrder(
          ImmutableList.of(
              KV.of("a", 1L),
              KV.of("b", 2L),
              KV.of("c", 3L),
              KV.of("d", 4L),
              KV.of("e", 5L),
              KV.of("f", 6L)));
  p.run();
}
Aggregations