use of org.apache.beam.sdk.transforms.Materializations.MultimapView in project beam by apache.
the class SparkSideInputReader method initializeBroadcastVariable.
/**
 * Builds, for every window that occurs in {@code inputValues}, the side-input value that
 * {@code view}'s {@link ViewFn} produces from the elements assigned to that window.
 *
 * @param inputValues windowed elements of the side input; an element that belongs to several
 *     windows contributes to each of them
 * @param view the view whose materialization URN and view function decide how each window's
 *     elements are turned into a value
 * @return a map from window to the materialized view value; empty when {@code inputValues} is
 *     empty
 * @throws IllegalStateException if at least one window is present and the view requests a
 *     materialization other than iterable or multimap
 */
private <T> Map<BoundedWindow, T> initializeBroadcastVariable(Iterable<WindowedValue<?>> inputValues, PCollectionView<T> view) {
  // Step 1: bucket the incoming values by each window they are assigned to.
  final Map<BoundedWindow, List<WindowedValue<?>>> valuesByWindow = new HashMap<>();
  for (WindowedValue<?> windowedValue : inputValues) {
    for (BoundedWindow window : windowedValue.getWindows()) {
      valuesByWindow.computeIfAbsent(window, unused -> new ArrayList<>()).add(windowedValue);
    }
  }

  // Step 2: materialize one view value per bucket.
  final Map<BoundedWindow, T> viewsByWindow = new HashMap<>();
  for (Map.Entry<BoundedWindow, List<WindowedValue<?>>> bucket : valuesByWindow.entrySet()) {
    final BoundedWindow window = bucket.getKey();
    final List<WindowedValue<?>> windowValues = bucket.getValue();
    final String urn = view.getViewFn().getMaterialization().getUrn();

    final T materialized;
    switch (urn) {
      case Materializations.ITERABLE_MATERIALIZATION_URN:
        {
          ViewFn<IterableView, T> iterableFn = (ViewFn<IterableView, T>) view.getViewFn();
          // The supplier unwraps the values anew on every call, so it stays lazy.
          materialized = iterableFn.apply(() -> windowValues.stream().map(WindowedValue::getValue).collect(Collectors.toList()));
          break;
        }
      case Materializations.MULTIMAP_MATERIALIZATION_URN:
        {
          ViewFn<MultimapView, T> multimapFn = (ViewFn<MultimapView, T>) view.getViewFn();
          // The view's coder is cast to a KvCoder; its key coder is handed to the multimap view.
          Coder<?> keyCoder = ((KvCoder<?, ?>) view.getCoderInternal()).getKeyCoder();
          // Raw Iterable on purpose: the element type is only known to be a wildcard here.
          Iterable unwrapped = (Iterable) windowValues.stream().map(WindowedValue::getValue).collect(Collectors.toList());
          materialized = multimapFn.apply(InMemoryMultimapSideInputView.fromIterable(keyCoder, unwrapped));
          break;
        }
      default:
        throw new IllegalStateException(String.format("Unknown side input materialization format requested '%s'", urn));
    }
    viewsByWindow.put(window, materialized);
  }
  return viewsByWindow;
}
use of org.apache.beam.sdk.transforms.Materializations.MultimapView in project beam by apache.
the class SparkSideInputReader method get.
/**
 * Returns the value of the side input {@code view} as seen from the main-input {@code window}.
 *
 * <p>The broadcast registered under the view's tag holds the windowed values of all windows.
 * The main-input window is mapped to a side-input window through the view's window mapping
 * function, the non-null values whose windows contain that side-input window are unwrapped, and
 * the result is passed to the view's {@link ViewFn} according to its materialization URN.
 *
 * @param view the side input to read; must not be null and must have a registered broadcast
 * @param window the main-input window for which the side input is requested
 * @return whatever the view function produces for the matching values
 * @throws IllegalStateException if the view requests a materialization other than iterable or
 *     multimap
 */
@Override
@Nullable
public <T> T get(PCollectionView<T> view, BoundedWindow window) {
  // --- validate sideInput.
  checkNotNull(view, "The PCollectionView passed to sideInput cannot be null ");
  KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>> broadcastForTag = sideInputs.get(view.getTagInternal());
  checkNotNull(broadcastForTag, "SideInput for view " + view + " is not available.");

  // --- sideInput window
  final BoundedWindow sideInputWindow = view.getWindowMappingFn().getSideInputWindow(window);

  // --- match the appropriate sideInput window.
  // The tag resolves to the values of every window, so keep only those that live in
  // sideInputWindow and strip their windowing information.
  Iterable<WindowedValue<?>> candidates = (Iterable<WindowedValue<?>>) broadcastForTag.getValue().getValue();
  Iterable<?> sideInputForWindow =
      StreamSupport.stream(candidates.spliterator(), false)
          .filter(candidate -> candidate != null && Iterables.contains(candidate.getWindows(), sideInputWindow))
          .map(WindowedValue::getValue)
          .collect(Collectors.toList());

  final String urn = view.getViewFn().getMaterialization().getUrn();
  switch (urn) {
    case Materializations.ITERABLE_MATERIALIZATION_URN:
      {
        ViewFn<IterableView, T> iterableFn = (ViewFn<IterableView, T>) view.getViewFn();
        return iterableFn.apply(() -> sideInputForWindow);
      }
    case Materializations.MULTIMAP_MATERIALIZATION_URN:
      {
        ViewFn<MultimapView, T> multimapFn = (ViewFn<MultimapView, T>) view.getViewFn();
        // The view's coder is cast to a KvCoder; its key coder is handed to the multimap view.
        Coder<?> keyCoder = ((KvCoder<?, ?>) view.getCoderInternal()).getKeyCoder();
        return multimapFn.apply(InMemoryMultimapSideInputView.fromIterable(keyCoder, (Iterable) sideInputForWindow));
      }
    default:
      throw new IllegalStateException(String.format("Unknown side input materialization format requested '%s'", urn));
  }
}
use of org.apache.beam.sdk.transforms.Materializations.MultimapView in project beam by apache.
the class Twister2SideInputReader method getMultimapSideInput.
/**
 * Returns the multimap side input {@code view} for a single {@code window}.
 *
 * <p>Only the elements partitioned under {@code window} are materialized. Earlier this method
 * built a multimap view for every window on each call and then discarded all but one of them.
 * If the window has no elements (or the view function yields {@code null} for them), the view
 * function is applied to an empty multimap view instead.
 *
 * <p>NOTE(review): the direct lookup assumes the map returned by {@code getPartitionedElements}
 * compares keys with {@code equals}/{@code hashCode}, like the {@code HashMap} used for the
 * previous per-call result map — confirm against that helper.
 *
 * @param view the multimap-materialized side input to read
 * @param window the window whose side-input contents are requested
 * @return the value produced by the view's {@link ViewFn} for that window's elements
 */
private <T> T getMultimapSideInput(PCollectionView<T> view, BoundedWindow window) {
  Map<BoundedWindow, List<WindowedValue<?>>> partitionedElements = getPartitionedElements(view);
  ViewFn<MultimapView, T> viewFn = (ViewFn<MultimapView, T>) view.getViewFn();

  T result = null;
  List<WindowedValue<?>> windowElements = partitionedElements.get(window);
  if (windowElements != null) {
    // The view's coder is cast to a KvCoder; its key coder is handed to the multimap view.
    Coder<?> keyCoder = ((KvCoder<?, ?>) view.getCoderInternal()).getKeyCoder();
    // Raw Iterable on purpose: the element type is only known to be a wildcard here.
    Iterable unwrapped = (Iterable) windowElements.stream().map(WindowedValue::getValue).collect(Collectors.toList());
    result = viewFn.apply(InMemoryMultimapSideInputView.fromIterable(keyCoder, unwrapped));
  }
  if (result == null) {
    // Nothing was recorded for this window: fall back to an empty multimap view.
    result = viewFn.apply(InMemoryMultimapSideInputView.empty());
  }
  return result;
}
use of org.apache.beam.sdk.transforms.Materializations.MultimapView in project beam by apache.
the class SideInputInitializer method initializeBroadcastVariable.
/**
 * Builds, for every window that occurs in {@code inputValues}, the side-input value that this
 * initializer's {@code view} produces from the elements assigned to that window.
 *
 * @param inputValues windowed elements of the side input; an element that belongs to several
 *     windows contributes to each of them
 * @return a map from window to the materialized view value; empty when {@code inputValues} is
 *     empty
 * @throws IllegalStateException if at least one window is present and the view requests a
 *     materialization other than iterable or multimap
 */
@Override
public Map<BoundedWindow, ViewT> initializeBroadcastVariable(Iterable<WindowedValue<?>> inputValues) {
  // Step 1: bucket the incoming values by each window they are assigned to.
  final Map<BoundedWindow, List<WindowedValue<?>>> valuesByWindow = new HashMap<>();
  for (WindowedValue<?> windowedValue : inputValues) {
    for (BoundedWindow window : windowedValue.getWindows()) {
      valuesByWindow.computeIfAbsent(window, unused -> new ArrayList<>()).add(windowedValue);
    }
  }

  // Step 2: materialize one view value per bucket.
  final Map<BoundedWindow, ViewT> viewsByWindow = new HashMap<>();
  for (Map.Entry<BoundedWindow, List<WindowedValue<?>>> bucket : valuesByWindow.entrySet()) {
    final BoundedWindow window = bucket.getKey();
    final List<WindowedValue<?>> windowValues = bucket.getValue();
    final String urn = view.getViewFn().getMaterialization().getUrn();

    final ViewT materialized;
    switch (urn) {
      case Materializations.ITERABLE_MATERIALIZATION_URN:
        {
          ViewFn<IterableView, ViewT> iterableFn = (ViewFn<IterableView, ViewT>) view.getViewFn();
          // The supplier unwraps the values anew on every call, so it stays lazy.
          materialized = iterableFn.apply(() -> windowValues.stream().map(WindowedValue::getValue).collect(Collectors.toList()));
          break;
        }
      case Materializations.MULTIMAP_MATERIALIZATION_URN:
        {
          ViewFn<MultimapView, ViewT> multimapFn = (ViewFn<MultimapView, ViewT>) view.getViewFn();
          // The view's coder is cast to a KvCoder; its key coder is handed to the multimap view.
          Coder<?> keyCoder = ((KvCoder<?, ?>) view.getCoderInternal()).getKeyCoder();
          // Raw Iterable on purpose: the element type is only known to be a wildcard here.
          Iterable unwrapped = (Iterable) windowValues.stream().map(WindowedValue::getValue).collect(Collectors.toList());
          materialized = multimapFn.apply(InMemoryMultimapSideInputView.fromIterable(keyCoder, unwrapped));
          break;
        }
      default:
        throw new IllegalStateException(String.format("Unknown side input materialization format requested '%s'", urn));
    }
    viewsByWindow.put(window, materialized);
  }
  return viewsByWindow;
}
use of org.apache.beam.sdk.transforms.Materializations.MultimapView in project beam by apache.
the class IsmSideInputReaderTest method testMultimapViewInWindow.
/**
 * Reads a windowed multimap side input back through an {@code IsmSideInputReader} backed by two
 * sources and checks both its contents and the identity of the returned views.
 *
 * <p>Outline: three expected multimaps (one per interval window) are converted to ISM records
 * with {@code forMap}, split into two groups by the parity of their map key, and turned into two
 * sources. Three identical tasks are then run on the pipeline options' executor service; each
 * reads the views for {@code intervalWindow(10)}, {@code intervalWindow(20)} and
 * {@code intervalWindow(30)}, verifies every key's values, and asserts that a repeated read
 * returns the same object. Finally every task's result is compared with the first task's result.
 */
@Test
public void testMultimapViewInWindow() throws Exception {
// Note that we purposely use byte[]s as keys to force structural equality testing
// versus using java equality testing. Since we want to define a duplicate key for
// the multimap, we specifically use the same instance of the byte[].
byte[] duplicateKey = new byte[] { 0x01 };
// Values are coded as full windowed values: a var-long payload with the interval window coder.
Coder<WindowedValue<Long>> valueCoder = WindowedValue.getFullCoder(VarLongCoder.of(), INTERVAL_WINDOW_CODER);
// Expected contents, one multimap per window. Only firstWindow contains duplicateKey twice.
// The second argument of valueInIntervalWindow (10, 20, 30) presumably selects the window that
// is later requested through intervalWindow(10/20/30) — confirm against those helpers.
final ListMultimap<byte[], WindowedValue<Long>> firstWindow = ImmutableListMultimap.<byte[], WindowedValue<Long>>builder().put(new byte[] { 0x00 }, valueInIntervalWindow(12L, 10)).put(duplicateKey, valueInIntervalWindow(22L, 10)).put(duplicateKey, valueInIntervalWindow(23L, 10)).put(new byte[] { 0x02 }, valueInIntervalWindow(32L, 10)).build();
final ListMultimap<byte[], WindowedValue<Long>> secondWindow = ImmutableListMultimap.<byte[], WindowedValue<Long>>builder().put(new byte[] { 0x00 }, valueInIntervalWindow(42L, 20)).put(new byte[] { 0x03 }, valueInIntervalWindow(52L, 20)).put(new byte[] { 0x02 }, valueInIntervalWindow(62L, 20)).build();
final ListMultimap<byte[], WindowedValue<Long>> thirdWindow = ImmutableListMultimap.<byte[], WindowedValue<Long>>builder().put(new byte[] { 0x02 }, valueInIntervalWindow(73L, 30)).put(new byte[] { 0x04 }, valueInIntervalWindow(82L, 30)).put(new byte[] { 0x05 }, valueInIntervalWindow(92L, 30)).build();
// The view under test; its coder describes KV<byte[], WindowedValue<Long>> elements in interval windows.
final PCollectionView<MultimapView<byte[], WindowedValue<Long>>> view = DataflowPortabilityPCollectionView.with(new TupleTag<>(), FullWindowedValueCoder.of(KvCoder.of(ByteArrayCoder.of(), valueCoder), INTERVAL_WINDOW_CODER));
// Record coder with three key-component coders (byte[] key, interval window, big-endian long).
// NOTE(review): the meaning of the leading 1 and 0 arguments is not visible here — see IsmRecordCoder.of.
IsmRecordCoder<WindowedValue<Long>> ismCoder = IsmRecordCoder.of(1, 0, ImmutableList.of(ByteArrayCoder.of(), INTERVAL_WINDOW_CODER, BigEndianLongCoder.of()), valueCoder);
// Convert all three windows into ISM records collected in one multimap; going by the variable
// name, the Integer key is presumably a shard id — confirm against forMap.
Multimap<Integer, IsmRecord<WindowedValue<Long>>> elementsPerShard = forMap(ismCoder, firstWindow);
elementsPerShard.putAll(forMap(ismCoder, secondWindow));
elementsPerShard.putAll(forMap(ismCoder, thirdWindow));
// Split the records into two groups by the parity of their map key, one group per source.
List<IsmRecord<WindowedValue<Long>>> firstElements = new ArrayList<>();
List<IsmRecord<WindowedValue<Long>>> secondElements = new ArrayList<>();
for (Map.Entry<Integer, Collection<IsmRecord<WindowedValue<Long>>>> entry : elementsPerShard.asMap().entrySet()) {
if (entry.getKey() % 2 == 0) {
firstElements.addAll(entry.getValue());
} else {
secondElements.addAll(entry.getValue());
}
}
// Ensure that each file will have some records.
checkState(!firstElements.isEmpty());
checkState(!secondElements.isEmpty());
Source sourceA = initInputFile(firstElements, ismCoder);
Source sourceB = initInputFile(secondElements, ismCoder);
// A single reader, registered under the view's tag id, is shared by all tasks below.
final IsmSideInputReader reader = sideInputReader(view.getTagInternal().getId(), sourceA, sourceB);
// Three identical tasks; each reads and verifies all three windows and returns the views it saw.
List<Callable<Map<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>>>> tasks = new ArrayList<>();
for (int i = 0; i < 3; ++i) {
tasks.add(() -> {
// Store a strong reference to the returned value so that the logical reference
// cache is not cleared for this test.
MultimapView<byte[], WindowedValue<Long>> firstValues = reader.get(view, intervalWindow(10));
MultimapView<byte[], WindowedValue<Long>> secondValues = reader.get(view, intervalWindow(20));
MultimapView<byte[], WindowedValue<Long>> thirdValues = reader.get(view, intervalWindow(30));
// For every expected key, compare the expected values with what the view returns for that key.
for (Map.Entry<byte[], Collection<WindowedValue<Long>>> entry : firstWindow.asMap().entrySet()) {
verifyIterable(entry.getValue(), firstValues.get(entry.getKey()));
}
for (Map.Entry<byte[], Collection<WindowedValue<Long>>> entry : secondWindow.asMap().entrySet()) {
verifyIterable(entry.getValue(), secondValues.get(entry.getKey()));
}
for (Map.Entry<byte[], Collection<WindowedValue<Long>>> entry : thirdWindow.asMap().entrySet()) {
verifyIterable(entry.getValue(), thirdValues.get(entry.getKey()));
}
// Assert that the same value reference was returned showing that it was cached.
assertSame(firstValues, reader.get(view, intervalWindow(10)));
assertSame(secondValues, reader.get(view, intervalWindow(20)));
assertSame(thirdValues, reader.get(view, intervalWindow(30)));
return ImmutableMap.of(intervalWindow(10), firstValues, intervalWindow(20), secondValues, intervalWindow(30), thirdValues);
});
}
// Run all tasks on the pipeline options' executor service; the first task's result is the baseline.
List<Future<Map<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>>>> results = pipelineOptions.getExecutorService().invokeAll(tasks);
Map<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>> value = results.get(0).get();
// Assert that all threads got back the same reference
for (Future<Map<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>>> result : results) {
assertEquals(value, result.get());
for (Map.Entry<BoundedWindow, MultimapView<byte[], WindowedValue<Long>>> entry : result.get().entrySet()) {
assertSame(value.get(entry.getKey()), entry.getValue());
}
}
}
Aggregations