Search in sources :

Example 1 with ShardedKey

Use of org.apache.beam.sdk.util.ShardedKey in the Apache Beam project.

From the class DataflowRunnerTest, method verifyGroupIntoBatchesOverrideCount.

/**
 * Runs a small element-count-based {@code GroupIntoBatches} pipeline and checks whether one of
 * the {@code GroupIntoBatchesOverride} transforms ended up in the pipeline graph.
 *
 * <p>The batched output is verified first: exactly the keys "A" and "B" are present, no batch
 * holds more than the configured batch size, and the per-key sums of all batched values are 15
 * for "A" and 0 for "B".
 *
 * @param p the pipeline to build on, run, and then traverse
 * @param withShardedKey whether to apply {@code withShardedKey()} (the shard id is stripped
 *     again before the output is asserted on)
 * @param expectOverriden whether an override transform is expected to be found after the run
 */
private void verifyGroupIntoBatchesOverrideCount(Pipeline p, Boolean withShardedKey, Boolean expectOverriden) {
    final int maxBatchSize = 2;
    List<KV<String, Integer>> elements = Arrays.asList(KV.of("A", 1), KV.of("B", 0), KV.of("A", 2), KV.of("A", 4), KV.of("A", 8));
    PCollection<KV<String, Integer>> input = p.apply("CreateValuesCount", Create.of(elements));

    PCollection<KV<String, Iterable<Integer>>> output;
    if (withShardedKey) {
        PCollection<KV<ShardedKey<String>, Iterable<Integer>>> sharded =
            input.apply("GroupIntoBatchesCount", GroupIntoBatches.<String, Integer>ofSize(maxBatchSize).withShardedKey());
        // Drop the shard id so both branches produce the same output type.
        output = sharded.apply("StripShardIdCount", MapElements.via(new SimpleFunction<KV<ShardedKey<String>, Iterable<Integer>>, KV<String, Iterable<Integer>>>() {

            @Override
            public KV<String, Iterable<Integer>> apply(KV<ShardedKey<String>, Iterable<Integer>> element) {
                return KV.of(element.getKey().getKey(), element.getValue());
            }
        }));
    } else {
        output = input.apply("GroupIntoBatchesCount", GroupIntoBatches.ofSize(maxBatchSize));
    }

    PAssert.thatMultimap(output).satisfies(grouped -> {
        assertEquals(2, grouped.size());
        assertThat(grouped.keySet(), containsInAnyOrder("A", "B"));
        // Accumulate the total of every batched value per key while checking batch sizes.
        Map<String, Integer> totals = new HashMap<>();
        for (Map.Entry<String, Iterable<Iterable<Integer>>> perKey : grouped.entrySet()) {
            String key = perKey.getKey();
            for (Iterable<Integer> batch : perKey.getValue()) {
                assertThat(Iterables.size(batch), lessThanOrEqualTo(maxBatchSize));
                for (Integer value : batch) {
                    totals.merge(key, value, Integer::sum);
                }
            }
        }
        assertEquals(15, (int) totals.get("A"));
        assertEquals(0, (int) totals.get("B"));
        return null;
    });
    p.run();

    // Walk the pipeline graph after the run and look for an override transform.
    AtomicBoolean overrideSeen = new AtomicBoolean(false);
    p.traverseTopologically(new PipelineVisitor.Defaults() {

        @Override
        public CompositeBehavior enterCompositeTransform(Node node) {
            boolean streaming = p.getOptions().as(StreamingOptions.class).isStreaming();
            // Streaming mode only counts the sharded-key override; batch mode counts either batch override.
            boolean isOverride = streaming
                ? (node.getTransform() instanceof GroupIntoBatchesOverride.StreamingGroupIntoBatchesWithShardedKey)
                : (node.getTransform() instanceof GroupIntoBatchesOverride.BatchGroupIntoBatches
                    || node.getTransform() instanceof GroupIntoBatchesOverride.BatchGroupIntoBatchesWithShardedKey);
            if (isOverride) {
                overrideSeen.set(true);
            }
            return CompositeBehavior.ENTER_TRANSFORM;
        }
    });

    if (!expectOverriden) {
        assertFalse(overrideSeen.get());
    } else {
        assertTrue(overrideSeen.get());
    }
}
Also used : HashMap(java.util.HashMap) Node(org.apache.beam.sdk.runners.TransformHierarchy.Node) JsonNode(com.fasterxml.jackson.databind.JsonNode) StreamingOptions(org.apache.beam.sdk.options.StreamingOptions) Matchers.containsString(org.hamcrest.Matchers.containsString) ShardedKey(org.apache.beam.sdk.util.ShardedKey) PipelineVisitor(org.apache.beam.sdk.Pipeline.PipelineVisitor) KV(org.apache.beam.sdk.values.KV) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Map(java.util.Map) HashMap(java.util.HashMap)

Example 2 with ShardedKey

Use of org.apache.beam.sdk.util.ShardedKey in the Apache Beam project.

From the class DataflowRunnerTest, method verifyGroupIntoBatchesOverrideBytes.

/**
 * Runs a small byte-size-based {@code GroupIntoBatches} pipeline and checks whether one of the
 * {@code GroupIntoBatchesOverride} transforms ended up in the pipeline graph.
 *
 * <p>The batched output is verified first: only the key "A" is present and its five input
 * values come out as five batches.
 *
 * @param p the pipeline to build on, run, and then traverse
 * @param withShardedKey whether to apply {@code withShardedKey()} (the shard id is stripped
 *     again before the output is asserted on)
 * @param expectOverriden whether an override transform is expected to be found after the run
 */
private void verifyGroupIntoBatchesOverrideBytes(Pipeline p, Boolean withShardedKey, Boolean expectOverriden) {
    final long maxBatchBytes = 2;
    List<KV<String, String>> elements = Arrays.asList(KV.of("A", "a"), KV.of("A", "ab"), KV.of("A", "abc"), KV.of("A", "abcd"), KV.of("A", "abcde"));
    PCollection<KV<String, String>> input = p.apply("CreateValuesBytes", Create.of(elements));

    PCollection<KV<String, Iterable<String>>> output;
    if (withShardedKey) {
        PCollection<KV<ShardedKey<String>, Iterable<String>>> sharded =
            input.apply("GroupIntoBatchesBytes", GroupIntoBatches.<String, String>ofByteSize(maxBatchBytes).withShardedKey());
        // Drop the shard id so both branches produce the same output type.
        output = sharded.apply("StripShardIdBytes", MapElements.via(new SimpleFunction<KV<ShardedKey<String>, Iterable<String>>, KV<String, Iterable<String>>>() {

            @Override
            public KV<String, Iterable<String>> apply(KV<ShardedKey<String>, Iterable<String>> element) {
                return KV.of(element.getKey().getKey(), element.getValue());
            }
        }));
    } else {
        output = input.apply("GroupIntoBatchesBytes", GroupIntoBatches.ofByteSize(maxBatchBytes));
    }

    PAssert.thatMultimap(output).satisfies(grouped -> {
        assertEquals(1, grouped.size());
        assertThat(grouped.keySet(), containsInAnyOrder("A"));
        assertEquals(5, Iterables.size(grouped.get("A")));
        return null;
    });
    p.run();

    // Walk the pipeline graph after the run and look for an override transform.
    AtomicBoolean overrideSeen = new AtomicBoolean(false);
    p.traverseTopologically(new PipelineVisitor.Defaults() {

        @Override
        public CompositeBehavior enterCompositeTransform(Node node) {
            boolean streaming = p.getOptions().as(StreamingOptions.class).isStreaming();
            // Streaming mode only counts the sharded-key override; batch mode counts either batch override.
            boolean isOverride = streaming
                ? (node.getTransform() instanceof GroupIntoBatchesOverride.StreamingGroupIntoBatchesWithShardedKey)
                : (node.getTransform() instanceof GroupIntoBatchesOverride.BatchGroupIntoBatches
                    || node.getTransform() instanceof GroupIntoBatchesOverride.BatchGroupIntoBatchesWithShardedKey);
            if (isOverride) {
                overrideSeen.set(true);
            }
            return CompositeBehavior.ENTER_TRANSFORM;
        }
    });

    if (!expectOverriden) {
        assertFalse(overrideSeen.get());
    } else {
        assertTrue(overrideSeen.get());
    }
}
Also used : Node(org.apache.beam.sdk.runners.TransformHierarchy.Node) JsonNode(com.fasterxml.jackson.databind.JsonNode) StreamingOptions(org.apache.beam.sdk.options.StreamingOptions) KV(org.apache.beam.sdk.values.KV) Matchers.containsString(org.hamcrest.Matchers.containsString) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ShardedKey(org.apache.beam.sdk.util.ShardedKey) PipelineVisitor(org.apache.beam.sdk.Pipeline.PipelineVisitor)

Aggregations

JsonNode (com.fasterxml.jackson.databind.JsonNode)2 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)2 PipelineVisitor (org.apache.beam.sdk.Pipeline.PipelineVisitor)2 StreamingOptions (org.apache.beam.sdk.options.StreamingOptions)2 Node (org.apache.beam.sdk.runners.TransformHierarchy.Node)2 ShardedKey (org.apache.beam.sdk.util.ShardedKey)2 KV (org.apache.beam.sdk.values.KV)2 Matchers.containsString (org.hamcrest.Matchers.containsString)2 HashMap (java.util.HashMap)1 Map (java.util.Map)1