Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
From the class CreateStreamTest, method testInStreamingModeCountByKey:
@Test
public void testInStreamingModeCountByKey() throws Exception {
  Instant instant = new Instant(0);
  CreateStream<KV<Integer, Long>> kvSource =
      CreateStream.of(KvCoder.of(VarIntCoder.of(), VarLongCoder.of()), batchDuration())
          .emptyBatch()
          .advanceWatermarkForNextBatch(instant)
          .nextBatch(
              TimestampedValue.of(KV.of(1, 100L), instant.plus(Duration.standardSeconds(3L))),
              TimestampedValue.of(KV.of(1, 300L), instant.plus(Duration.standardSeconds(4L))))
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardSeconds(7L)))
          .nextBatch(
              TimestampedValue.of(KV.of(1, 400L), instant.plus(Duration.standardSeconds(8L))))
          .advanceNextBatchWatermarkToInfinity();
  PCollection<KV<Integer, Long>> output =
      p.apply("create kv Source", kvSource)
          .apply(
              "window input",
              Window.<KV<Integer, Long>>into(FixedWindows.of(Duration.standardSeconds(3L)))
                  .withAllowedLateness(Duration.ZERO))
          .apply(Count.perKey());
  PAssert.that("Wrong count value ", output)
      .satisfies(
          (SerializableFunction<Iterable<KV<Integer, Long>>, Void>) input -> {
            for (KV<Integer, Long> element : input) {
              if (element.getKey() == 1) {
                Long countValue = element.getValue();
                assertNotEquals("Count Value is 0 !!!", 0L, countValue.longValue());
              } else {
                fail("Unknown key in the output PCollection");
              }
            }
            return null;
          });
  p.run();
}
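With 3-second fixed windows starting at the epoch, the elements timestamped at 3s and 4s fall into window [3s, 6s) and the element at 8s falls into [6s, 9s), so key 1 should produce counts of 2 and 1. As a sketch only (not part of the original test), the assertion could be tightened to pin those per-window counts:

// Hypothetical stricter assertion: one count per fired window for key 1.
PAssert.that("Unexpected per-window counts", output)
    .containsInAnyOrder(KV.of(1, 2L), KV.of(1, 1L));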
Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
From the class SparkCoGroupByKeyStreamingTest, method testInStreamingMode:
@Category(StreamingTest.class)
@Test
public void testInStreamingMode() throws Exception {
  Instant instant = new Instant(0);
  CreateStream<KV<Integer, Integer>> source1 =
      CreateStream.of(KvCoder.of(VarIntCoder.of(), VarIntCoder.of()), batchDuration())
          .emptyBatch()
          .advanceWatermarkForNextBatch(instant)
          .nextBatch(
              TimestampedValue.of(KV.of(1, 1), instant),
              TimestampedValue.of(KV.of(1, 2), instant),
              TimestampedValue.of(KV.of(1, 3), instant))
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardSeconds(1L)))
          .nextBatch(
              TimestampedValue.of(KV.of(2, 4), instant.plus(Duration.standardSeconds(1L))),
              TimestampedValue.of(KV.of(2, 5), instant.plus(Duration.standardSeconds(1L))),
              TimestampedValue.of(KV.of(2, 6), instant.plus(Duration.standardSeconds(1L))))
          .advanceNextBatchWatermarkToInfinity();
  CreateStream<KV<Integer, Integer>> source2 =
      CreateStream.of(KvCoder.of(VarIntCoder.of(), VarIntCoder.of()), batchDuration())
          .emptyBatch()
          .advanceWatermarkForNextBatch(instant)
          .nextBatch(
              TimestampedValue.of(KV.of(1, 11), instant),
              TimestampedValue.of(KV.of(1, 12), instant),
              TimestampedValue.of(KV.of(1, 13), instant))
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardSeconds(1L)))
          .nextBatch(
              TimestampedValue.of(KV.of(2, 14), instant.plus(Duration.standardSeconds(1L))),
              TimestampedValue.of(KV.of(2, 15), instant.plus(Duration.standardSeconds(1L))),
              TimestampedValue.of(KV.of(2, 16), instant.plus(Duration.standardSeconds(1L))))
          .advanceNextBatchWatermarkToInfinity();
  PCollection<KV<Integer, Integer>> input1 =
      pipeline
          .apply("create source1", source1)
          .apply(
              "window input1",
              Window.<KV<Integer, Integer>>into(FixedWindows.of(Duration.standardSeconds(3L)))
                  .withAllowedLateness(Duration.ZERO));
  PCollection<KV<Integer, Integer>> input2 =
      pipeline
          .apply("create source2", source2)
          .apply(
              "window input2",
              Window.<KV<Integer, Integer>>into(FixedWindows.of(Duration.standardSeconds(3L)))
                  .withAllowedLateness(Duration.ZERO));
  PCollection<KV<Integer, CoGbkResult>> output =
      KeyedPCollectionTuple.of(INPUT1_TAG, input1)
          .and(INPUT2_TAG, input2)
          .apply(CoGroupByKey.create());
  PAssert.that("Wrong output of the join using CoGroupByKey in streaming mode", output)
      .satisfies(
          (SerializableFunction<Iterable<KV<Integer, CoGbkResult>>, Void>) input -> {
            assertEquals("Wrong size of the output PCollection", 2, Iterables.size(input));
            for (KV<Integer, CoGbkResult> element : input) {
              if (element.getKey() == 1) {
                Iterable<Integer> input1Elements = element.getValue().getAll(INPUT1_TAG);
                assertEquals("Wrong number of values for output elements for tag input1 and key 1", 3, Iterables.size(input1Elements));
                assertThat("Elements of PCollection input1 for key \"1\" are not present in the output PCollection", input1Elements, containsInAnyOrder(1, 2, 3));
                Iterable<Integer> input2Elements = element.getValue().getAll(INPUT2_TAG);
                assertEquals("Wrong number of values for output elements for tag input2 and key 1", 3, Iterables.size(input2Elements));
                assertThat("Elements of PCollection input2 for key \"1\" are not present in the output PCollection", input2Elements, containsInAnyOrder(11, 12, 13));
              } else if (element.getKey() == 2) {
                Iterable<Integer> input1Elements = element.getValue().getAll(INPUT1_TAG);
                assertEquals("Wrong number of values for output elements for tag input1 and key 2", 3, Iterables.size(input1Elements));
                assertThat("Elements of PCollection input1 for key \"2\" are not present in the output PCollection", input1Elements, containsInAnyOrder(4, 5, 6));
                Iterable<Integer> input2Elements = element.getValue().getAll(INPUT2_TAG);
                assertEquals("Wrong number of values for output elements for tag input2 and key 2", 3, Iterables.size(input2Elements));
                assertThat("Elements of PCollection input2 for key \"2\" are not present in the output PCollection", input2Elements, containsInAnyOrder(14, 15, 16));
              } else {
                fail("Unknown key in the output PCollection");
              }
            }
            return null;
          });
  pipeline.run();
}
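The same CoGroupByKey shape can be written without the Spark CreateStream harness. A minimal batch-mode sketch, assuming a pipeline named pipeline and two hypothetical tuple tags standing in for INPUT1_TAG and INPUT2_TAG:

// Sketch: join two keyed PCollections with CoGroupByKey using Create.of inputs.
TupleTag<Integer> tagA = new TupleTag<>();
TupleTag<Integer> tagB = new TupleTag<>();
PCollection<KV<Integer, Integer>> left =
    pipeline.apply("left", Create.of(KV.of(1, 1), KV.of(1, 2), KV.of(2, 4)));
PCollection<KV<Integer, Integer>> right =
    pipeline.apply("right", Create.of(KV.of(1, 11), KV.of(2, 14)));
PCollection<KV<Integer, CoGbkResult>> joined =
    KeyedPCollectionTuple.of(tagA, left).and(tagB, right).apply(CoGroupByKey.create());
// For each joined element, getValue().getAll(tagA) returns the left-side values for that key,
// mirroring the assertions in the test above.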
Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
From the class BigQueryIOWriteTest, method writeDynamicDestinations:
public void writeDynamicDestinations(boolean schemas, boolean autoSharding) throws Exception {
  final Schema schema =
      Schema.builder().addField("name", FieldType.STRING).addField("id", FieldType.INT32).build();
  final Pattern userPattern = Pattern.compile("([a-z]+)([0-9]+)");
  final PCollectionView<List<String>> sideInput1 =
      p.apply("Create SideInput 1", Create.of("a", "b", "c").withCoder(StringUtf8Coder.of()))
          .apply("asList", View.asList());
  final PCollectionView<Map<String, String>> sideInput2 =
      p.apply("Create SideInput2", Create.of(KV.of("a", "a"), KV.of("b", "b"), KV.of("c", "c")))
          .apply("AsMap", View.asMap());
  final List<String> allUsernames = ImmutableList.of("bill", "bob", "randolph");
  List<String> userList = Lists.newArrayList();
  // Create enough distinct users to exceed the per-bundle writer limit and exercise
  // WriteGroupedRecordsToFiles.
  for (int i = 0; i < BatchLoads.DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE * 10; ++i) {
    // Every user has 10 nicknames.
    for (int j = 0; j < 10; ++j) {
      String nickname =
          allUsernames.get(ThreadLocalRandom.current().nextInt(allUsernames.size()));
      userList.add(nickname + i);
    }
  }
  PCollection<String> users =
      p.apply("CreateUsers", Create.of(userList))
          .apply(Window.into(new PartitionedGlobalWindows<>(arg -> arg)));
  if (useStreaming) {
    users = users.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
  }
  if (schemas) {
    users =
        users.setSchema(
            schema,
            TypeDescriptors.strings(),
            user -> {
              Matcher matcher = userPattern.matcher(user);
              checkState(matcher.matches());
              return Row.withSchema(schema)
                  .addValue(matcher.group(1))
                  .addValue(Integer.valueOf(matcher.group(2)))
                  .build();
            },
            r -> r.getString(0) + r.getInt32(1));
  }
  // Use a partition decorator to verify that partition decorators are supported.
  final String partitionDecorator = "20171127";
  BigQueryIO.Write<String> write =
      BigQueryIO.<String>write()
          .withTestServices(fakeBqServices)
          .withMaxFilesPerBundle(5)
          .withMaxFileSize(10)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .to(
              new StringLongDestinations() {
                @Override
                public Long getDestination(ValueInSingleWindow<String> element) {
                  assertThat(element.getWindow(), Matchers.instanceOf(PartitionedGlobalWindow.class));
                  Matcher matcher = userPattern.matcher(element.getValue());
                  checkState(matcher.matches());
                  // Tables are named by user id, so a Long is enough to identify a table.
                  return Long.valueOf(matcher.group(2));
                }
                @Override
                public TableDestination getTable(Long userId) {
                  verifySideInputs();
                  // Each user in its own table.
                  return new TableDestination(
                      "dataset-id.userid-" + userId + "$" + partitionDecorator,
                      "table for userid " + userId);
                }
                @Override
                public TableSchema getSchema(Long userId) {
                  verifySideInputs();
                  return new TableSchema()
                      .setFields(
                          ImmutableList.of(
                              new TableFieldSchema().setName("name").setType("STRING"),
                              new TableFieldSchema().setName("id").setType("INTEGER")));
                }
                @Override
                public List<PCollectionView<?>> getSideInputs() {
                  return ImmutableList.of(sideInput1, sideInput2);
                }
                private void verifySideInputs() {
                  assertThat(sideInput(sideInput1), containsInAnyOrder("a", "b", "c"));
                  Map<String, String> mapSideInput = sideInput(sideInput2);
                  assertEquals(3, mapSideInput.size());
                  assertThat(
                      mapSideInput,
                      allOf(hasEntry("a", "a"), hasEntry("b", "b"), hasEntry("c", "c")));
                }
              })
          .withoutValidation();
  if (schemas) {
    write = write.useBeamSchema();
  } else {
    write =
        write.withFormatFunction(
            user -> {
              Matcher matcher = userPattern.matcher(user);
              checkState(matcher.matches());
              return new TableRow().set("name", matcher.group(1)).set("id", matcher.group(2));
            });
  }
  if (autoSharding) {
    write = write.withAutoSharding();
  }
  WriteResult results = users.apply("WriteBigQuery", write);
  if (!useStreaming && !useStorageApi) {
    PCollection<TableDestination> successfulBatchInserts = results.getSuccessfulTableLoads();
    TableDestination[] expectedTables =
        userList.stream()
            .map(
                user -> {
                  Matcher matcher = userPattern.matcher(user);
                  checkState(matcher.matches());
                  String userId = matcher.group(2);
                  return new TableDestination(
                      String.format("project-id:dataset-id.userid-%s$20171127", userId),
                      String.format("table for userid %s", userId));
                })
            .distinct()
            .toArray(TableDestination[]::new);
    PAssert.that(successfulBatchInserts.apply(Distinct.create()))
        .containsInAnyOrder(expectedTables);
  }
  p.run();
  Map<Long, List<TableRow>> expectedTableRows = Maps.newHashMap();
  for (String user : userList) {
    Matcher matcher = userPattern.matcher(user);
    checkState(matcher.matches());
    String nickname = matcher.group(1);
    Long userid = Long.valueOf(matcher.group(2));
    List<TableRow> expected = expectedTableRows.computeIfAbsent(userid, k -> Lists.newArrayList());
    expected.add(new TableRow().set("name", nickname).set("id", userid.toString()));
  }
  for (Map.Entry<Long, List<TableRow>> entry : expectedTableRows.entrySet()) {
    assertThat(
        fakeDatasetService.getAllRows("project-id", "dataset-id", "userid-" + entry.getKey()),
        containsInAnyOrder(Iterables.toArray(entry.getValue(), TableRow.class)));
  }
}
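When no side inputs or per-destination schemas are needed, BigQueryIO.Write can also route elements with a plain table function instead of a full DynamicDestinations subclass. A minimal sketch under that assumption; the events PCollection and table names are illustrative, not taken from the test:

// Sketch: route each "name<id>" string to a per-user table using a table function.
SerializableFunction<ValueInSingleWindow<String>, TableDestination> tableFn =
    v -> {
      String id = v.getValue().replaceAll("[^0-9]", "");
      return new TableDestination("dataset-id.userid-" + id, "table for userid " + id);
    };
events.apply(
    BigQueryIO.<String>write()
        .to(tableFn)
        .withFormatFunction(
            s -> new TableRow()
                .set("name", s.replaceAll("[0-9]", ""))
                .set("id", s.replaceAll("[^0-9]", "")))
        .withSchema(
            new TableSchema()
                .setFields(
                    ImmutableList.of(
                        new TableFieldSchema().setName("name").setType("STRING"),
                        new TableFieldSchema().setName("id").setType("INTEGER"))))
        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));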
Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
From the class BigQueryIOReadTest, method testReadFromTable:
private void testReadFromTable(boolean useTemplateCompatibility, boolean useReadTableRows) throws IOException, InterruptedException {
  Table sometable = new Table();
  sometable.setSchema(
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("name").setType("STRING"),
                  new TableFieldSchema().setName("number").setType("INTEGER"))));
  sometable.setTableReference(
      new TableReference()
          .setProjectId("non-executing-project")
          .setDatasetId("somedataset")
          .setTableId("sometable"));
  sometable.setNumBytes(1024L * 1024L);
  FakeDatasetService fakeDatasetService = new FakeDatasetService();
  fakeDatasetService.createDataset("non-executing-project", "somedataset", "", "", null);
  fakeDatasetService.createTable(sometable);
  List<TableRow> records =
      Lists.newArrayList(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L));
  fakeDatasetService.insertAll(sometable.getTableReference(), records, null);
  FakeBigQueryServices fakeBqServices =
      new FakeBigQueryServices()
          .withJobService(new FakeJobService())
          .withDatasetService(fakeDatasetService);
  PTransform<PBegin, PCollection<TableRow>> readTransform;
  if (useReadTableRows) {
    BigQueryIO.Read read =
        BigQueryIO.read()
            .from("non-executing-project:somedataset.sometable")
            .withTestServices(fakeBqServices)
            .withoutValidation();
    readTransform = useTemplateCompatibility ? read.withTemplateCompatibility() : read;
  } else {
    BigQueryIO.TypedRead<TableRow> read =
        BigQueryIO.readTableRows()
            .from("non-executing-project:somedataset.sometable")
            .withTestServices(fakeBqServices)
            .withoutValidation();
    readTransform = useTemplateCompatibility ? read.withTemplateCompatibility() : read;
  }
  PCollection<KV<String, Long>> output =
      p.apply(readTransform)
          .apply(
              ParDo.of(
                  new DoFn<TableRow, KV<String, Long>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                      c.output(
                          KV.of(
                              (String) c.element().get("name"),
                              Long.valueOf((String) c.element().get("number"))));
                    }
                  }));
  PAssert.that(output)
      .containsInAnyOrder(ImmutableList.of(KV.of("a", 1L), KV.of("b", 2L), KV.of("c", 3L)));
  p.run();
}
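The reshaping DoFn above can also be expressed with MapElements. A minimal sketch against a placeholder table spec rather than the fake services used in the test:

// Sketch: read TableRows and reshape them into KV<String, Long> with MapElements.
// "project:dataset.table" is a placeholder, not the fake table created above.
PCollection<KV<String, Long>> namesAndNumbers =
    p.apply(BigQueryIO.readTableRows().from("project:dataset.table"))
        .apply(
            MapElements
                .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.longs()))
                .via((TableRow row) ->
                    KV.of((String) row.get("name"), Long.valueOf((String) row.get("number")))));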
Use of org.apache.beam.sdk.values.PCollection in project beam by apache.
From the class DeadLetteredTransform, method expandInternal:
// Required to capture the generic type parameter of the PCollection.
private <RealInputT extends InputT> PCollection<OutputT> expandInternal(
    PCollection<RealInputT> input) {
  Coder<RealInputT> coder = input.getCoder();
  SerializableFunction<RealInputT, OutputT> localTransform = transform::apply;
  MapElements.MapWithFailures<RealInputT, OutputT, Failure> mapWithFailures =
      MapElements.into(transform.getOutputTypeDescriptor())
          .via(localTransform)
          .exceptionsInto(TypeDescriptor.of(Failure.class))
          .exceptionsVia(
              x -> {
                try (ByteArrayOutputStream os = new ByteArrayOutputStream()) {
                  coder.encode(x.element(), os);
                  return Failure.newBuilder()
                      .setPayload(os.toByteArray())
                      .setError(
                          String.format(
                              "%s%n%n%s",
                              x.exception().getMessage(),
                              ExceptionUtils.getStackTrace(x.exception())))
                      .build();
                }
              });
  Result<PCollection<OutputT>, Failure> result = mapWithFailures.expand(input);
  result.failures().apply(deadLetter);
  return result.output();
}
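The same exceptions-handling chain can be applied directly to a PCollection. A minimal sketch with plain Strings, where lines and deadLetterSink are hypothetical stand-ins for the input and the deadLetter transform above:

// Sketch: parse failures are captured as strings and routed to a dead-letter sink,
// while successfully parsed elements continue downstream.
WithFailures.Result<PCollection<Integer>, String> parsed =
    lines.apply(
        MapElements.into(TypeDescriptors.integers())
            .via((String s) -> Integer.parseInt(s))
            .exceptionsInto(TypeDescriptors.strings())
            .exceptionsVia(ee -> ee.element() + ": " + ee.exception().getMessage()));
parsed.failures().apply("WriteDeadLetters", deadLetterSink); // deadLetterSink is hypothetical
PCollection<Integer> numbers = parsed.output();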