Use of org.apache.beam.sdk.util.ShardedKey in the Apache Beam project.
Class DataflowRunnerTest, method verifyGroupIntoBatchesOverrideCount.
/**
 * Builds and runs a pipeline that applies an element-count based {@code GroupIntoBatches}
 * (batch size 2) to five fixed key/value pairs, then checks whether one of the
 * {@code GroupIntoBatchesOverride} transforms is present in the pipeline graph.
 *
 * <p>The pipeline output is verified with {@code PAssert}: exactly the keys "A" and "B" must
 * appear, no batch may hold more than two elements, and the per-key sums must be 15 for "A"
 * and 0 for "B".
 *
 * @param p the pipeline to populate, run and inspect
 * @param withShardedKey when true, {@code withShardedKey()} is applied and the shard id is
 *     stripped from the key again before the output is checked
 * @param expectOverriden whether an override transform is expected to be seen after the run
 */
private void verifyGroupIntoBatchesOverrideCount(Pipeline p, Boolean withShardedKey, Boolean expectOverriden) {
  final int maxBatchSize = 2;
  List<KV<String, Integer>> elements =
      Arrays.asList(KV.of("A", 1), KV.of("B", 0), KV.of("A", 2), KV.of("A", 4), KV.of("A", 8));
  PCollection<KV<String, Integer>> input = p.apply("CreateValuesCount", Create.of(elements));

  PCollection<KV<String, Iterable<Integer>>> output;
  if (!withShardedKey) {
    output = input.apply("GroupIntoBatchesCount", GroupIntoBatches.ofSize(maxBatchSize));
  } else {
    PCollection<KV<ShardedKey<String>, Iterable<Integer>>> sharded =
        input.apply(
            "GroupIntoBatchesCount",
            GroupIntoBatches.<String, Integer>ofSize(maxBatchSize).withShardedKey());
    // Drop the shard id so both branches produce the same output type.
    output =
        sharded.apply(
            "StripShardIdCount",
            MapElements.via(
                new SimpleFunction<
                    KV<ShardedKey<String>, Iterable<Integer>>, KV<String, Iterable<Integer>>>() {
                  @Override
                  public KV<String, Iterable<Integer>> apply(
                      KV<ShardedKey<String>, Iterable<Integer>> shardedBatch) {
                    return KV.of(shardedBatch.getKey().getKey(), shardedBatch.getValue());
                  }
                }));
  }

  PAssert.thatMultimap(output)
      .satisfies(
          batchesByKey -> {
            assertEquals(2, batchesByKey.size());
            assertThat(batchesByKey.keySet(), containsInAnyOrder("A", "B"));
            // Accumulate the sum of all values per key across every batch of that key.
            Map<String, Integer> totals = new HashMap<>();
            for (Map.Entry<String, Iterable<Iterable<Integer>>> keyed : batchesByKey.entrySet()) {
              String key = keyed.getKey();
              for (Iterable<Integer> batch : keyed.getValue()) {
                assertThat(Iterables.size(batch), lessThanOrEqualTo(maxBatchSize));
                for (Integer element : batch) {
                  totals.merge(key, element, Integer::sum);
                }
              }
            }
            assertEquals(15, (int) totals.get("A"));
            assertEquals(0, (int) totals.get("B"));
            return null;
          });
  p.run();

  // Walk the pipeline graph after the run and record whether an override transform shows up.
  AtomicBoolean overrideSeen = new AtomicBoolean(false);
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public CompositeBehavior enterCompositeTransform(Node node) {
          boolean streaming = p.getOptions().as(StreamingOptions.class).isStreaming();
          // Streaming mode counts only the sharded-key streaming override; batch mode counts
          // either of the two batch overrides.
          boolean isOverride =
              streaming
                  ? node.getTransform()
                      instanceof GroupIntoBatchesOverride.StreamingGroupIntoBatchesWithShardedKey
                  : (node.getTransform() instanceof GroupIntoBatchesOverride.BatchGroupIntoBatches
                      || node.getTransform()
                          instanceof GroupIntoBatchesOverride.BatchGroupIntoBatchesWithShardedKey);
          if (isOverride) {
            overrideSeen.set(true);
          }
          return CompositeBehavior.ENTER_TRANSFORM;
        }
      });

  if (expectOverriden) {
    assertTrue(overrideSeen.get());
  } else {
    assertFalse(overrideSeen.get());
  }
}
Use of org.apache.beam.sdk.util.ShardedKey in the Apache Beam project.
Class DataflowRunnerTest, method verifyGroupIntoBatchesOverrideBytes.
/**
 * Builds and runs a pipeline that applies a byte-size based {@code GroupIntoBatches}
 * (byte limit 2) to five string values under the single key "A", then checks whether one of
 * the {@code GroupIntoBatchesOverride} transforms is present in the pipeline graph.
 *
 * <p>The pipeline output is verified with {@code PAssert}: only the key "A" may appear and it
 * must map to exactly five batches.
 *
 * @param p the pipeline to populate, run and inspect
 * @param withShardedKey when true, {@code withShardedKey()} is applied and the shard id is
 *     stripped from the key again before the output is checked
 * @param expectOverriden whether an override transform is expected to be seen after the run
 */
private void verifyGroupIntoBatchesOverrideBytes(Pipeline p, Boolean withShardedKey, Boolean expectOverriden) {
  final long maxBatchBytes = 2;
  List<KV<String, String>> elements =
      Arrays.asList(
          KV.of("A", "a"),
          KV.of("A", "ab"),
          KV.of("A", "abc"),
          KV.of("A", "abcd"),
          KV.of("A", "abcde"));
  PCollection<KV<String, String>> input = p.apply("CreateValuesBytes", Create.of(elements));

  PCollection<KV<String, Iterable<String>>> output;
  if (!withShardedKey) {
    output = input.apply("GroupIntoBatchesBytes", GroupIntoBatches.ofByteSize(maxBatchBytes));
  } else {
    PCollection<KV<ShardedKey<String>, Iterable<String>>> sharded =
        input.apply(
            "GroupIntoBatchesBytes",
            GroupIntoBatches.<String, String>ofByteSize(maxBatchBytes).withShardedKey());
    // Drop the shard id so both branches produce the same output type.
    output =
        sharded.apply(
            "StripShardIdBytes",
            MapElements.via(
                new SimpleFunction<
                    KV<ShardedKey<String>, Iterable<String>>, KV<String, Iterable<String>>>() {
                  @Override
                  public KV<String, Iterable<String>> apply(
                      KV<ShardedKey<String>, Iterable<String>> shardedBatch) {
                    return KV.of(shardedBatch.getKey().getKey(), shardedBatch.getValue());
                  }
                }));
  }

  PAssert.thatMultimap(output)
      .satisfies(
          batchesByKey -> {
            assertEquals(1, batchesByKey.size());
            assertThat(batchesByKey.keySet(), containsInAnyOrder("A"));
            // Five input elements are expected to yield five batches for key "A".
            assertEquals(5, Iterables.size(batchesByKey.get("A")));
            return null;
          });
  p.run();

  // Walk the pipeline graph after the run and record whether an override transform shows up.
  AtomicBoolean overrideSeen = new AtomicBoolean(false);
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public CompositeBehavior enterCompositeTransform(Node node) {
          boolean streaming = p.getOptions().as(StreamingOptions.class).isStreaming();
          // Streaming mode counts only the sharded-key streaming override; batch mode counts
          // either of the two batch overrides.
          boolean isOverride =
              streaming
                  ? node.getTransform()
                      instanceof GroupIntoBatchesOverride.StreamingGroupIntoBatchesWithShardedKey
                  : (node.getTransform() instanceof GroupIntoBatchesOverride.BatchGroupIntoBatches
                      || node.getTransform()
                          instanceof GroupIntoBatchesOverride.BatchGroupIntoBatchesWithShardedKey);
          if (isOverride) {
            overrideSeen.set(true);
          }
          return CompositeBehavior.ENTER_TRANSFORM;
        }
      });

  if (expectOverriden) {
    assertTrue(overrideSeen.get());
  } else {
    assertFalse(overrideSeen.get());
  }
}
Aggregations