
Example 46 with Source

Use of com.google.api.services.dataflow.model.Source in project beam by apache.

The class WorkerCustomSourcesTest, method testLargeNumberOfSplitsReturnsSplittableOnlyBoundedSources.

@Test
public void testLargeNumberOfSplitsReturnsSplittableOnlyBoundedSources() throws Exception {
    final long apiSizeLimitForTest = 500 * 1024;
    // Generate a CountingSource and split it into the desired number of splits
    // (desired size = 1 byte), triggering the re-split with a larger bundle size.
    // Thus below we expect to produce 451 splits.
    com.google.api.services.dataflow.model.Source source = translateIOToCloudSource(CountingSource.upTo(451), options);
    SourceSplitResponse split = performSplit(source, options, 1L, null /* numBundles limit */, apiSizeLimitForTest);
    assertEquals(WorkerCustomSources.DEFAULT_NUM_BUNDLES_LIMIT, split.getBundles().size());
    // Expected ranges: [0, 100), [100, 200), [200, 300), [300, 355) from the splittable-only
    // bundles, then single-element sources [355, 356), ..., [450, 451).
    for (int i = 0; i <= 3; ++i) {
        DerivedSource derivedSource = split.getBundles().get(i);
        // Make sure that we are setting the flag telling Dataflow that we need further splits.
        assertFalse(derivedSource.getSource().getDoesNotNeedSplitting());
        Object deserializedSource = WorkerCustomSources.deserializeFromCloudSource(derivedSource.getSource().getSpec());
        assertTrue(deserializedSource instanceof SplittableOnlyBoundedSource);
        SplittableOnlyBoundedSource<?> splittableOnlySource = (SplittableOnlyBoundedSource<?>) deserializedSource;
        List<? extends BoundedSource<?>> splitSources = splittableOnlySource.split(1L, options);
        int expectedNumSplits = i < 3 ? 100 : 55;
        assertEquals(expectedNumSplits, splitSources.size());
        for (int j = 0; j < splitSources.size(); ++j) {
            assertTrue(splitSources.get(j) instanceof OffsetBasedSource);
            OffsetBasedSource<?> offsetBasedSource = (OffsetBasedSource<?>) splitSources.get(j);
            assertEquals(i * 100 + j, offsetBasedSource.getStartOffset());
            assertEquals(i * 100 + j + 1, offsetBasedSource.getEndOffset());
        }
    }
    for (int i = 4; i < WorkerCustomSources.DEFAULT_NUM_BUNDLES_LIMIT; ++i) {
        DerivedSource derivedSource = split.getBundles().get(i);
        // Make sure that we are not setting the flag telling Dataflow that we need further splits
        // for the individual counting sources
        assertTrue(derivedSource.getSource().getDoesNotNeedSplitting());
        Object deserializedSource = WorkerCustomSources.deserializeFromCloudSource(derivedSource.getSource().getSpec());
        assertTrue(deserializedSource instanceof OffsetBasedSource);
        OffsetBasedSource<?> offsetBasedSource = (OffsetBasedSource<?>) deserializedSource;
        assertEquals(351 + i, offsetBasedSource.getStartOffset());
        assertEquals(351 + i + 1, offsetBasedSource.getEndOffset());
    }
}
Also used : Source(com.google.api.services.dataflow.model.Source) OffsetBasedSource(org.apache.beam.sdk.io.OffsetBasedSource) SourceSplitResponse(com.google.api.services.dataflow.model.SourceSplitResponse) DerivedSource(com.google.api.services.dataflow.model.DerivedSource) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) Structs.getObject(org.apache.beam.runners.dataflow.util.Structs.getObject) SplittableOnlyBoundedSource(org.apache.beam.runners.dataflow.worker.WorkerCustomSources.SplittableOnlyBoundedSource) Test(org.junit.Test)
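A hedged aside: performSplit and translateIOToCloudSource above are worker-test helpers, so their implementations are not shown here. As a minimal sketch using only the public Beam SDK API (the class name below is made up for illustration), this shows why a 1-byte desired bundle size initially yields 451 single-element splits of CountingSource.upTo(451) before the worker wraps them into SplittableOnlyBoundedSource bundles:

// A minimal sketch, assuming only the public Beam SDK API; it is not part of
// WorkerCustomSourcesTest and does not use the worker-side helpers above.
import java.util.List;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class CountingSourceSplitSketch {
    public static void main(String[] args) throws Exception {
        PipelineOptions options = PipelineOptionsFactory.create();
        BoundedSource<Long> source = CountingSource.upTo(451);
        // Each element is one long (8 bytes), so a 1-byte desired bundle size
        // forces the finest-grained split: one element per bundle.
        List<? extends BoundedSource<Long>> splits = source.split(1L, options);
        System.out.println("splits: " + splits.size()); // expected: 451
    }
}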

Example 47 with Source

Use of com.google.api.services.dataflow.model.Source in project beam by apache.

The class WorkerCustomSourcesTest, method testOversplittingDesiredBundleSizeScaledFirst.

@Test
public void testOversplittingDesiredBundleSizeScaledFirst() throws Exception {
    // Create a source that greatly oversplits but with coalescing/compression it would still fit
    // under the API limit. Test that the API limit gets applied first, so oversplitting is
    // reduced.
    com.google.api.services.dataflow.model.Source source = translateIOToCloudSource(CountingSource.upTo(8000), options);
    // Without either limit, produces 1000 bundles, total size ~500kb.
    // With only numBundles limit 100, produces 100 bundles, total size ~72kb.
    // With only apiSize limit = 10kb, 72 bundles, total size ~40kb (over the limit but oh well).
    // With numBundles limit 100 and apiSize limit 10kb, should produce 72 bundles.
    // On the other hand, if the numBundles limit of 100 was applied first, we'd get 100 bundles.
    SourceSplitResponse bundledWithOnlyNumBundlesLimit = performSplit(source, options, 8L, 100 /* numBundles limit */, 10000 * 1024L);
    assertEquals(100, bundledWithOnlyNumBundlesLimit.getBundles().size());
    assertThat(DataflowApiUtils.computeSerializedSizeBytes(bundledWithOnlyNumBundlesLimit), greaterThan(10 * 1024L));
    SourceSplitResponse bundledWithOnlySizeLimit = performSplit(source, options, 8L, 1000000 /* numBundles limit */, 10 * 1024L);
    int numBundlesWithOnlySizeLimit = bundledWithOnlySizeLimit.getBundles().size();
    assertThat(numBundlesWithOnlySizeLimit, lessThan(100));
    SourceSplitResponse bundledWithSizeLimit = performSplit(source, options, 8L, 100, 10 * 1024L);
    assertEquals(numBundlesWithOnlySizeLimit, bundledWithSizeLimit.getBundles().size());
}
Also used : SourceSplitResponse(com.google.api.services.dataflow.model.SourceSplitResponse) Source(com.google.api.services.dataflow.model.Source) Test(org.junit.Test)
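The in-code comments above carry the whole argument; restated as a tiny runnable sketch (the class name and the hard-coded figures below are just the approximate numbers quoted in those comments, not measurements), the point is that applying the API size limit before the numBundles cap changes the outcome:

// Illustrative only: not the WorkerCustomSources implementation, just the
// arithmetic behind the ordering the test asserts.
public class SplitLimitOrderingSketch {
    public static void main(String[] args) {
        int numBundlesLimit = 100;
        int bundlesUnderSizeLimitOnly = 72;  // ~40kb response, per the comments above
        // API size limit applied first, then the bundle cap: min(72, 100) = 72 bundles,
        // which is what the test expects when both limits are set.
        System.out.println(Math.min(bundlesUnderSizeLimitOnly, numBundlesLimit));
        // Applying the numBundles cap first would stop at exactly 100 bundles (~72kb),
        // far over the 10kb API budget -- the outcome the test guards against.
    }
}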

Example 48 with Source

Use of com.google.api.services.dataflow.model.Source in project beam by apache.

The class WorkerCustomSourcesTest, method testTooLargeSplitResponseFails.

@Test
public void testTooLargeSplitResponseFails() throws Exception {
    com.google.api.services.dataflow.model.Source source = translateIOToCloudSource(CountingSource.upTo(1000), options);
    expectedException.expectMessage("[0, 1000)");
    expectedException.expectMessage("larger than the limit 100");
    performSplit(source, options, 8L, 10, 100L);
}
Also used : Source(com.google.api.services.dataflow.model.Source) Test(org.junit.Test)
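The expectedException field used here is not part of the excerpt; presumably it is the standard JUnit 4 ExpectedException rule declared on the test class, roughly as in this sketch (the declaration itself is an assumption, only the field name comes from the test):

// Presumed wiring for the excerpt above, not the actual class body.
import org.junit.Rule;
import org.junit.rules.ExpectedException;

public class WorkerCustomSourcesTest {
    @Rule public ExpectedException expectedException = ExpectedException.none();
}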

Example 49 with Source

Use of com.google.api.services.dataflow.model.Source in project beam by apache.

The class WorkerCustomSourcesTest, method testSplitAndReadBundlesBack.

@Test
public void testSplitAndReadBundlesBack() throws Exception {
    com.google.api.services.dataflow.model.Source source = translateIOToCloudSource(CountingSource.upTo(10L), options);
    List<WindowedValue<Integer>> elems = readElemsFromSource(options, source);
    assertEquals(10L, elems.size());
    for (long i = 0; i < 10L; i++) {
        assertEquals(valueInGlobalWindow(i), elems.get((int) i));
    }
    SourceSplitResponse response = performSplit(source, options, 16L /* desiredBundleSizeBytes for two longs */, null /* numBundles limit */, null);
    assertEquals("SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED", response.getOutcome());
    List<DerivedSource> bundles = response.getBundles();
    assertEquals(5, bundles.size());
    for (int i = 0; i < 5; ++i) {
        DerivedSource bundle = bundles.get(i);
        assertEquals("SOURCE_DERIVATION_MODE_INDEPENDENT", bundle.getDerivationMode());
        com.google.api.services.dataflow.model.Source bundleSource = bundle.getSource();
        assertTrue(bundleSource.getDoesNotNeedSplitting());
        bundleSource.setCodec(source.getCodec());
        List<WindowedValue<Integer>> xs = readElemsFromSource(options, bundleSource);
        assertThat("Failed on bundle " + i, xs, contains(valueInGlobalWindow(0L + 2 * i), valueInGlobalWindow(1L + 2 * i)));
        assertTrue(bundle.getSource().getMetadata().getEstimatedSizeBytes() > 0);
    }
}
Also used : SourceSplitResponse(com.google.api.services.dataflow.model.SourceSplitResponse) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Source(com.google.api.services.dataflow.model.Source) DerivedSource(com.google.api.services.dataflow.model.DerivedSource) Test(org.junit.Test)
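readElemsFromSource above is another worker-test helper. Here is a minimal sketch of the same read-back idea using only the public BoundedSource reader API (the class name is made up, and this skips the cloud Source translation the test performs):

// Reads every element from a BoundedSource through its reader; assumes only
// the public Beam SDK API, not the Dataflow worker helpers used in the test.
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class ReadBoundedSourceSketch {
    public static void main(String[] args) throws Exception {
        PipelineOptions options = PipelineOptionsFactory.create();
        BoundedSource<Long> source = CountingSource.upTo(10L);
        List<Long> elems = new ArrayList<>();
        try (BoundedSource.BoundedReader<Long> reader = source.createReader(options)) {
            // Reader contract: start() positions at the first element, advance()
            // moves to the next; both return false once the source is exhausted.
            for (boolean more = reader.start(); more; more = reader.advance()) {
                elems.add(reader.getCurrent());
            }
        }
        System.out.println(elems); // expected: [0, 1, ..., 9]
    }
}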

Example 50 with Source

Use of com.google.api.services.dataflow.model.Source in project beam by apache.

The class WorkerCustomSourcesTest, method testLargeSerializedSizeResplits.

@Test
public void testLargeSerializedSizeResplits() throws Exception {
    final long apiSizeLimitForTest = 5 * 1024;
    // Figure out how many splits of CountingSource are needed to exceed the API limits, using an
    // extra factor of 2 to ensure that we go over the limits.
    BoundedSource<Long> justForSizing = CountingSource.upTo(1000000L);
    long size = DataflowApiUtils.computeSerializedSizeBytes(translateIOToCloudSource(justForSizing, options));
    long numberToSplitToExceedLimit = 2 * apiSizeLimitForTest / size;
    checkState(numberToSplitToExceedLimit < WorkerCustomSources.DEFAULT_NUM_BUNDLES_LIMIT, "This test expects the number of splits to be less than %s " + "to avoid using SplittableOnlyBoundedSource", WorkerCustomSources.DEFAULT_NUM_BUNDLES_LIMIT);
    // Generate a CountingSource and split it into the desired number of splits
    // (desired size = 8 bytes, 1 long), triggering the re-split with a larger bundle size.
    com.google.api.services.dataflow.model.Source source = translateIOToCloudSource(CountingSource.upTo(numberToSplitToExceedLimit), options);
    SourceSplitResponse split = performSplit(source, options, 8L, null /* numBundles limit */, apiSizeLimitForTest);
    logged.verifyWarn("too large for the Google Cloud Dataflow API");
    logged.verifyWarn(String.format("%d bundles", numberToSplitToExceedLimit));
    assertThat((long) split.getBundles().size(), lessThan(numberToSplitToExceedLimit));
}
Also used : SourceSplitResponse(com.google.api.services.dataflow.model.SourceSplitResponse) Source(com.google.api.services.dataflow.model.Source) Test(org.junit.Test)
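For reference, DataflowApiUtils.computeSerializedSizeBytes, used in this test and in Example 47, is what measures the wire size being compared against the API limit. A small hedged sketch of that size check (the class and method names below are made up, and this is not the worker's actual re-split logic):

// Sketch: estimate whether a split response made of many copies of a cloud
// Source would blow a given API byte budget; illustrative only.
import com.google.api.services.dataflow.model.Source;
import org.apache.beam.runners.dataflow.util.DataflowApiUtils;

public class SourceSizeCheckSketch {
    // True when numBundles sources of this serialized size would exceed the
    // budget, i.e. when a coarser split (larger bundles) should be attempted.
    static boolean needsCoarserSplit(Source source, long numBundles, long apiByteLimit)
            throws Exception {
        long perSourceBytes = DataflowApiUtils.computeSerializedSizeBytes(source);
        return perSourceBytes * numBundles > apiByteLimit;
    }
}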

Aggregations (usage counts across the indexed examples)

Source (com.google.api.services.dataflow.model.Source): 51
Test (org.junit.Test): 31
ArrayList (java.util.ArrayList): 20
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 18
CloudObject (org.apache.beam.runners.dataflow.util.CloudObject): 16
Map (java.util.Map): 15
Callable (java.util.concurrent.Callable): 15
Future (java.util.concurrent.Future): 15
HashMap (java.util.HashMap): 13
ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap): 12
SortedMap (java.util.SortedMap): 11
TreeMap (java.util.TreeMap): 11
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 8
ParallelInstruction (com.google.api.services.dataflow.model.ParallelInstruction): 7
ReadInstruction (com.google.api.services.dataflow.model.ReadInstruction): 6
KV (org.apache.beam.sdk.values.KV): 6
Collection (java.util.Collection): 5
List (java.util.List): 5
IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord): 5
Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString): 5