Search in sources :

Example 6 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

the class IsmSinkFactory method create.

@Override
public Sink<?> create(CloudObject spec, @Nullable Coder<?> coder, @Nullable PipelineOptions options, @Nullable DataflowExecutionContext executionContext, DataflowOperationContext operationContext) throws Exception {
    options = checkArgumentNotNull(options);
    coder = checkArgumentNotNull(coder);
    // The validity of this coder is checked in detail by the typed create, below
    @SuppressWarnings("unchecked") Coder<WindowedValue<IsmRecord<Object>>> typedCoder = (Coder<WindowedValue<IsmRecord<Object>>>) coder;
    String filename = getString(spec, WorkerPropertyNames.FILENAME);
    checkArgument(typedCoder instanceof WindowedValueCoder, "%s only supports using %s but got %s.", IsmSink.class, WindowedValueCoder.class, typedCoder);
    WindowedValueCoder<IsmRecord<Object>> windowedCoder = (WindowedValueCoder<IsmRecord<Object>>) typedCoder;
    checkArgument(windowedCoder.getValueCoder() instanceof IsmRecordCoder, "%s only supports using %s but got %s.", IsmSink.class, IsmRecordCoder.class, windowedCoder.getValueCoder());
    @SuppressWarnings("unchecked") IsmRecordCoder<Object> ismCoder = (IsmRecordCoder<Object>) windowedCoder.getValueCoder();
    long bloomFilterSizeLimitBytes = Math.max(MIN_BLOOM_FILTER_SIZE_BYTES, DoubleMath.roundToLong(BLOOM_FILTER_SIZE_LIMIT_MULTIPLIER * options.as(DataflowWorkerHarnessOptions.class).getWorkerCacheMb() * // Note the conversion from MiB to bytes
    1024 * 1024, RoundingMode.DOWN));
    return new IsmSink<>(FileSystems.matchNewResource(filename, false), ismCoder, bloomFilterSizeLimitBytes);
}
Also used : WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) Coder(org.apache.beam.sdk.coders.Coder) IsmRecordCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) IsmRecordCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) WindowedValue(org.apache.beam.sdk.util.WindowedValue) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject)

Example 7 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

the class BatchViewOverridesTest method testToMultimapDoFn.

@Test
public void testToMultimapDoFn() throws Exception {
    Coder<IntervalWindow> windowCoder = IntervalWindow.getCoder();
    DoFnTester<KV<Integer, Iterable<KV<IntervalWindow, WindowedValue<KV<Long, Long>>>>>, IsmRecord<WindowedValue<TransformedMap<Long, Iterable<WindowedValue<Long>>, Iterable<Long>>>>> doFnTester = DoFnTester.of(new BatchViewOverrides.BatchViewAsMultimap.ToMultimapDoFn<Long, Long, IntervalWindow>(windowCoder));
    IntervalWindow windowA = new IntervalWindow(new Instant(0), new Instant(10));
    IntervalWindow windowB = new IntervalWindow(new Instant(10), new Instant(20));
    IntervalWindow windowC = new IntervalWindow(new Instant(20), new Instant(30));
    Iterable<KV<Integer, Iterable<KV<IntervalWindow, WindowedValue<KV<Long, Long>>>>>> inputElements = ImmutableList.of(KV.of(1, (Iterable<KV<IntervalWindow, WindowedValue<KV<Long, Long>>>>) ImmutableList.of(KV.of(windowA, WindowedValue.of(KV.of(1L, 11L), new Instant(3), windowA, PaneInfo.NO_FIRING)), // duplicate key/values are not lost.
    KV.of(windowA, WindowedValue.of(KV.of(1L, 11L), new Instant(3), windowA, PaneInfo.NO_FIRING)), KV.of(windowA, WindowedValue.of(KV.of(1L, 12L), new Instant(5), windowA, PaneInfo.NO_FIRING)), KV.of(windowA, WindowedValue.of(KV.of(2L, 21L), new Instant(7), windowA, PaneInfo.NO_FIRING)), KV.of(windowB, WindowedValue.of(KV.of(2L, 21L), new Instant(13), windowB, PaneInfo.NO_FIRING)), KV.of(windowB, WindowedValue.of(KV.of(3L, 31L), new Instant(15), windowB, PaneInfo.NO_FIRING)))), KV.of(2, (Iterable<KV<IntervalWindow, WindowedValue<KV<Long, Long>>>>) ImmutableList.of(KV.of(windowC, WindowedValue.of(KV.of(4L, 41L), new Instant(25), windowC, PaneInfo.NO_FIRING)))));
    // The order of the output elements is important relative to processing order
    List<IsmRecord<WindowedValue<TransformedMap<Long, Iterable<WindowedValue<Long>>, Iterable<Long>>>>> output = doFnTester.processBundle(inputElements);
    assertEquals(3, output.size());
    Map<Long, Iterable<Long>> outputMap;
    outputMap = output.get(0).getValue().getValue();
    assertEquals(2, outputMap.size());
    assertThat(outputMap.get(1L), containsInAnyOrder(11L, 11L, 12L));
    assertThat(outputMap.get(2L), containsInAnyOrder(21L));
    outputMap = output.get(1).getValue().getValue();
    assertEquals(2, outputMap.size());
    assertThat(outputMap.get(2L), containsInAnyOrder(21L));
    assertThat(outputMap.get(3L), containsInAnyOrder(31L));
    outputMap = output.get(2).getValue().getValue();
    assertEquals(1, outputMap.size());
    assertThat(outputMap.get(4L), containsInAnyOrder(41L));
}
Also used : Instant(org.joda.time.Instant) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) KV(org.apache.beam.sdk.values.KV) TransformedMap(org.apache.beam.runners.dataflow.BatchViewOverrides.TransformedMap) WindowedValue(org.apache.beam.sdk.util.WindowedValue) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Test(org.junit.Test)

Example 8 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

the class BatchViewOverridesTest method testBatchViewAsListToIsmRecordForNonGlobalWindow.

@Test
public void testBatchViewAsListToIsmRecordForNonGlobalWindow() throws Exception {
    DoFnTester<KV<Integer, Iterable<KV<IntervalWindow, WindowedValue<Long>>>>, IsmRecord<WindowedValue<Long>>> doFnTester = DoFnTester.of(new BatchViewOverrides.BatchViewAsList.ToIsmRecordForNonGlobalWindowDoFn<Long, IntervalWindow>(IntervalWindow.getCoder()));
    IntervalWindow windowA = new IntervalWindow(new Instant(0), new Instant(10));
    IntervalWindow windowB = new IntervalWindow(new Instant(10), new Instant(20));
    IntervalWindow windowC = new IntervalWindow(new Instant(20), new Instant(30));
    Iterable<KV<Integer, Iterable<KV<IntervalWindow, WindowedValue<Long>>>>> inputElements = ImmutableList.of(KV.of(1, (Iterable<KV<IntervalWindow, WindowedValue<Long>>>) ImmutableList.of(KV.of(windowA, WindowedValue.of(110L, new Instant(1), windowA, PaneInfo.NO_FIRING)), KV.of(windowA, WindowedValue.of(111L, new Instant(3), windowA, PaneInfo.NO_FIRING)), KV.of(windowA, WindowedValue.of(112L, new Instant(4), windowA, PaneInfo.NO_FIRING)), KV.of(windowB, WindowedValue.of(120L, new Instant(12), windowB, PaneInfo.NO_FIRING)), KV.of(windowB, WindowedValue.of(121L, new Instant(14), windowB, PaneInfo.NO_FIRING)))), KV.of(2, (Iterable<KV<IntervalWindow, WindowedValue<Long>>>) ImmutableList.of(KV.of(windowC, WindowedValue.of(210L, new Instant(25), windowC, PaneInfo.NO_FIRING)))));
    // The order of the output elements is important relative to processing order
    assertThat(doFnTester.processBundle(inputElements), contains(IsmRecord.of(ImmutableList.of(windowA, 0L), WindowedValue.of(110L, new Instant(1), windowA, PaneInfo.NO_FIRING)), IsmRecord.of(ImmutableList.of(windowA, 1L), WindowedValue.of(111L, new Instant(3), windowA, PaneInfo.NO_FIRING)), IsmRecord.of(ImmutableList.of(windowA, 2L), WindowedValue.of(112L, new Instant(4), windowA, PaneInfo.NO_FIRING)), IsmRecord.of(ImmutableList.of(windowB, 0L), WindowedValue.of(120L, new Instant(12), windowB, PaneInfo.NO_FIRING)), IsmRecord.of(ImmutableList.of(windowB, 1L), WindowedValue.of(121L, new Instant(14), windowB, PaneInfo.NO_FIRING)), IsmRecord.of(ImmutableList.of(windowC, 0L), WindowedValue.of(210L, new Instant(25), windowC, PaneInfo.NO_FIRING))));
}
Also used : WindowedValue(org.apache.beam.sdk.util.WindowedValue) Instant(org.joda.time.Instant) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) KV(org.apache.beam.sdk.values.KV) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Test(org.junit.Test)

Example 9 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

the class BatchViewOverridesTest method testToIsmMetadataRecordForSizeDoFn.

@Test
public void testToIsmMetadataRecordForSizeDoFn() throws Exception {
    Coder<Long> keyCoder = VarLongCoder.of();
    Coder<IntervalWindow> windowCoder = IntervalWindow.getCoder();
    IsmRecordCoder<WindowedValue<Long>> ismCoder = IsmRecordCoder.of(1, 2, ImmutableList.of(MetadataKeyCoder.of(keyCoder), IntervalWindow.getCoder(), BigEndianLongCoder.of()), FullWindowedValueCoder.of(VarLongCoder.of(), windowCoder));
    DoFnTester<KV<Integer, Iterable<KV<IntervalWindow, Long>>>, IsmRecord<WindowedValue<Long>>> doFnTester = DoFnTester.of(new BatchViewOverrides.BatchViewAsMultimap.ToIsmMetadataRecordForSizeDoFn<Long, Long, IntervalWindow>(windowCoder));
    IntervalWindow windowA = new IntervalWindow(new Instant(0), new Instant(10));
    IntervalWindow windowB = new IntervalWindow(new Instant(10), new Instant(20));
    IntervalWindow windowC = new IntervalWindow(new Instant(20), new Instant(30));
    Iterable<KV<Integer, Iterable<KV<IntervalWindow, Long>>>> inputElements = ImmutableList.of(KV.of(1, (Iterable<KV<IntervalWindow, Long>>) ImmutableList.of(KV.of(windowA, 2L), KV.of(windowA, 3L), KV.of(windowB, 7L))), KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(), windowB)), (Iterable<KV<IntervalWindow, Long>>) ImmutableList.of(KV.of(windowC, 9L))));
    // The order of the output elements is important relative to processing order
    assertThat(doFnTester.processBundle(inputElements), contains(IsmRecord.<WindowedValue<Long>>meta(ImmutableList.of(IsmFormat.getMetadataKey(), windowA, 0L), CoderUtils.encodeToByteArray(VarLongCoder.of(), 5L)), IsmRecord.<WindowedValue<Long>>meta(ImmutableList.of(IsmFormat.getMetadataKey(), windowB, 0L), CoderUtils.encodeToByteArray(VarLongCoder.of(), 7L)), IsmRecord.<WindowedValue<Long>>meta(ImmutableList.of(IsmFormat.getMetadataKey(), windowC, 0L), CoderUtils.encodeToByteArray(VarLongCoder.of(), 9L))));
}
Also used : Instant(org.joda.time.Instant) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Test(org.junit.Test)

Example 10 with IsmRecord

use of org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord in project beam by apache.

the class BatchViewOverridesTest method testToIsmRecordForMapLikeDoFn.

@Test
public void testToIsmRecordForMapLikeDoFn() throws Exception {
    TupleTag<KV<Integer, KV<IntervalWindow, Long>>> outputForSizeTag = new TupleTag<>();
    TupleTag<KV<Integer, KV<IntervalWindow, Long>>> outputForEntrySetTag = new TupleTag<>();
    Coder<Long> keyCoder = VarLongCoder.of();
    Coder<IntervalWindow> windowCoder = IntervalWindow.getCoder();
    IsmRecordCoder<WindowedValue<Long>> ismCoder = IsmRecordCoder.of(1, 2, ImmutableList.of(MetadataKeyCoder.of(keyCoder), IntervalWindow.getCoder(), BigEndianLongCoder.of()), FullWindowedValueCoder.of(VarLongCoder.of(), windowCoder));
    DoFnTester<KV<Integer, Iterable<KV<KV<Long, IntervalWindow>, WindowedValue<Long>>>>, IsmRecord<WindowedValue<Long>>> doFnTester = DoFnTester.of(new BatchViewOverrides.BatchViewAsMultimap.ToIsmRecordForMapLikeDoFn<>(outputForSizeTag, outputForEntrySetTag, windowCoder, keyCoder, ismCoder, false));
    IntervalWindow windowA = new IntervalWindow(new Instant(0), new Instant(10));
    IntervalWindow windowB = new IntervalWindow(new Instant(10), new Instant(20));
    IntervalWindow windowC = new IntervalWindow(new Instant(20), new Instant(30));
    Iterable<KV<Integer, Iterable<KV<KV<Long, IntervalWindow>, WindowedValue<Long>>>>> inputElements = ImmutableList.of(KV.of(1, (Iterable<KV<KV<Long, IntervalWindow>, WindowedValue<Long>>>) ImmutableList.of(KV.of(KV.of(1L, windowA), WindowedValue.of(110L, new Instant(1), windowA, PaneInfo.NO_FIRING)), // same window same key as to previous
    KV.of(KV.of(1L, windowA), WindowedValue.of(111L, new Instant(2), windowA, PaneInfo.NO_FIRING)), // same window different key as to previous
    KV.of(KV.of(2L, windowA), WindowedValue.of(120L, new Instant(3), windowA, PaneInfo.NO_FIRING)), // different window same key as to previous
    KV.of(KV.of(2L, windowB), WindowedValue.of(210L, new Instant(11), windowB, PaneInfo.NO_FIRING)), // different window and different key as to previous
    KV.of(KV.of(3L, windowB), WindowedValue.of(220L, new Instant(12), windowB, PaneInfo.NO_FIRING)))), KV.of(2, (Iterable<KV<KV<Long, IntervalWindow>, WindowedValue<Long>>>) ImmutableList.of(// different shard
    KV.of(KV.of(4L, windowC), WindowedValue.of(330L, new Instant(21), windowC, PaneInfo.NO_FIRING)))));
    // The order of the output elements is important relative to processing order
    assertThat(doFnTester.processBundle(inputElements), contains(IsmRecord.of(ImmutableList.of(1L, windowA, 0L), WindowedValue.of(110L, new Instant(1), windowA, PaneInfo.NO_FIRING)), IsmRecord.of(ImmutableList.of(1L, windowA, 1L), WindowedValue.of(111L, new Instant(2), windowA, PaneInfo.NO_FIRING)), IsmRecord.of(ImmutableList.of(2L, windowA, 0L), WindowedValue.of(120L, new Instant(3), windowA, PaneInfo.NO_FIRING)), IsmRecord.of(ImmutableList.of(2L, windowB, 0L), WindowedValue.of(210L, new Instant(11), windowB, PaneInfo.NO_FIRING)), IsmRecord.of(ImmutableList.of(3L, windowB, 0L), WindowedValue.of(220L, new Instant(12), windowB, PaneInfo.NO_FIRING)), IsmRecord.of(ImmutableList.of(4L, windowC, 0L), WindowedValue.of(330L, new Instant(21), windowC, PaneInfo.NO_FIRING))));
    // Verify the number of unique keys per window.
    assertThat(doFnTester.takeOutputElements(outputForSizeTag), contains(KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(), windowA)), KV.of(windowA, 2L)), KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(), windowB)), KV.of(windowB, 2L)), KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(), windowC)), KV.of(windowC, 1L))));
    // Verify the output for the unique keys.
    assertThat(doFnTester.takeOutputElements(outputForEntrySetTag), contains(KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(), windowA)), KV.of(windowA, 1L)), KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(), windowA)), KV.of(windowA, 2L)), KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(), windowB)), KV.of(windowB, 2L)), KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(), windowB)), KV.of(windowB, 3L)), KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(), windowC)), KV.of(windowC, 4L))));
}
Also used : Instant(org.joda.time.Instant) TupleTag(org.apache.beam.sdk.values.TupleTag) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Test(org.junit.Test)

Aggregations

IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord)26 Test (org.junit.Test)17 WindowedValue (org.apache.beam.sdk.util.WindowedValue)16 ArrayList (java.util.ArrayList)12 File (java.io.File)8 KV (org.apache.beam.sdk.values.KV)8 HashMap (java.util.HashMap)7 IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow)7 Instant (org.joda.time.Instant)7 SortedMap (java.util.SortedMap)6 TreeMap (java.util.TreeMap)6 Callable (java.util.concurrent.Callable)6 Future (java.util.concurrent.Future)6 Source (com.google.api.services.dataflow.model.Source)5 Collection (java.util.Collection)5 Map (java.util.Map)5 RandomAccessData (org.apache.beam.runners.dataflow.util.RandomAccessData)5 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)5 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)3 TransformedMap (org.apache.beam.runners.dataflow.BatchViewOverrides.TransformedMap)2