Use of com.google.api.services.dataflow.model.DerivedSource in project beam by apache.
From the class WorkerCustomSourcesSplitOnlySourceTest, the method testAllSplitsAreReturned:
@Test
public void testAllSplitsAreReturned() throws Exception {
  final long apiSizeLimitForTest = 500 * 1024;
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setAppName("TestAppName");
  options.setProject("test-project");
  options.setRegion("some-region1");
  options.setTempLocation("gs://test/temp/location");
  options.setGcpCredential(new TestCredential());
  options.setRunner(DataflowRunner.class);
  options.setPathValidatorClass(NoopPathValidator.class);
  // Generate a CountingSource and split it into the desired number of splits
  // (desired size = 1 byte), triggering the re-split with a larger bundle size.
  // Thus below we expect to produce 'numberOfSplits' splits.
  com.google.api.services.dataflow.model.Source source =
      WorkerCustomSourcesTest.translateIOToCloudSource(CountingSource.upTo(numberOfSplits), options);
  SourceSplitResponse split =
      WorkerCustomSourcesTest.performSplit(
          source, options, 1L, null /* numBundles limit */, apiSizeLimitForTest);
  assertThat(
      split.getBundles().size(), lessThanOrEqualTo(WorkerCustomSources.DEFAULT_NUM_BUNDLES_LIMIT));
  List<OffsetBasedSource<?>> originalSplits = new ArrayList<>(numberOfSplits);
  // Collect all the splits
  for (DerivedSource derivedSource : split.getBundles()) {
    Object deserializedSource =
        WorkerCustomSources.deserializeFromCloudSource(derivedSource.getSource().getSpec());
    if (deserializedSource instanceof SplittableOnlyBoundedSource) {
      SplittableOnlyBoundedSource<?> splittableOnlySource =
          (SplittableOnlyBoundedSource<?>) deserializedSource;
      originalSplits.addAll((List) splittableOnlySource.split(1L, options));
    } else {
      originalSplits.add((OffsetBasedSource<?>) deserializedSource);
    }
  }
  assertEquals(numberOfSplits, originalSplits.size());
  for (int i = 0; i < originalSplits.size(); i++) {
    OffsetBasedSource<?> offsetBasedSource = (OffsetBasedSource<?>) originalSplits.get(i);
    assertEquals(i, offsetBasedSource.getStartOffset());
    assertEquals(i + 1, offsetBasedSource.getEndOffset());
  }
}
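Note that numberOfSplits is a field of the enclosing test class rather than a local variable; the class appears to be a parameterized test that runs the same check for several split counts. A minimal sketch of such a fixture, assuming JUnit 4's Parameterized runner; the parameter values shown are illustrative only and may differ from the actual Beam test:

@RunWith(Parameterized.class)
public class WorkerCustomSourcesSplitOnlySourceTest {

  // Illustrative parameter values only; the real test may use different split counts.
  @Parameterized.Parameters(name = "numberOfSplits = {0}")
  public static Iterable<Object[]> data() {
    return Arrays.asList(new Object[][] {{100}, {1000}, {10000}});
  }

  private final int numberOfSplits;

  public WorkerCustomSourcesSplitOnlySourceTest(int numberOfSplits) {
    this.numberOfSplits = numberOfSplits;
  }

  // ... testAllSplitsAreReturned() as shown above ...
}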
Use of com.google.api.services.dataflow.model.DerivedSource in project beam by apache.
From the class WorkerCustomSources, the method toSourceSplit:
public static DynamicSourceSplit toSourceSplit(BoundedSourceSplit<?> sourceSplitResult) {
  DynamicSourceSplit sourceSplit = new DynamicSourceSplit();
  com.google.api.services.dataflow.model.Source primarySource;
  com.google.api.services.dataflow.model.Source residualSource;
  try {
    primarySource = serializeSplitToCloudSource(sourceSplitResult.primary);
    residualSource = serializeSplitToCloudSource(sourceSplitResult.residual);
  } catch (Exception e) {
    throw new RuntimeException("Failed to serialize one of the parts of the source split", e);
  }
  sourceSplit.setPrimary(
      new DerivedSource()
          .setDerivationMode("SOURCE_DERIVATION_MODE_INDEPENDENT")
          .setSource(primarySource));
  sourceSplit.setResidual(
      new DerivedSource()
          .setDerivationMode("SOURCE_DERIVATION_MODE_INDEPENDENT")
          .setSource(residualSource));
  return sourceSplit;
}
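Both halves of the split are wrapped in DerivedSource objects whose derivation mode is SOURCE_DERIVATION_MODE_INDEPENDENT, i.e. each part can be processed without reference to the source it was derived from. A minimal sketch of reading those fields back out of the resulting DynamicSourceSplit; inspectSplit is a hypothetical helper (not part of WorkerCustomSources) and relies only on the bean-style getters of the generated model classes:

// Hypothetical helper for illustration only.
static void inspectSplit(DynamicSourceSplit sourceSplit) throws Exception {
  for (DerivedSource part : Arrays.asList(sourceSplit.getPrimary(), sourceSplit.getResidual())) {
    // Each half is independent of the source it was derived from.
    System.out.println(part.getDerivationMode()); // SOURCE_DERIVATION_MODE_INDEPENDENT
    com.google.api.services.dataflow.model.Source cloudSource = part.getSource();
    // The spec map carries the serialized BoundedSource; turn it back into a Java object.
    Object deserialized = WorkerCustomSources.deserializeFromCloudSource(cloudSource.getSpec());
    System.out.println(deserialized.getClass());
  }
}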
Use of com.google.api.services.dataflow.model.DerivedSource in project beam by apache.
From the class WorkerCustomSourcesTest, the method testLargeNumberOfSplitsReturnsSplittableOnlyBoundedSources:
@Test
public void testLargeNumberOfSplitsReturnsSplittableOnlyBoundedSources() throws Exception {
  final long apiSizeLimitForTest = 500 * 1024;
  // Generate a CountingSource and split it into the desired number of splits
  // (desired size = 1 byte), triggering the re-split with a larger bundle size.
  // Thus below we expect to produce 451 splits.
  com.google.api.services.dataflow.model.Source source =
      translateIOToCloudSource(CountingSource.upTo(451), options);
  SourceSplitResponse split =
      performSplit(source, options, 1L, null /* numBundles limit */, apiSizeLimitForTest);
  assertEquals(WorkerCustomSources.DEFAULT_NUM_BUNDLES_LIMIT, split.getBundles().size());
  // Bundle coverage: 0-99, 100-199, 200-299, 300-354 (splittable-only sources),
  // then 355, 356, ..., 450 (individual sources).
  for (int i = 0; i <= 3; ++i) {
    DerivedSource derivedSource = split.getBundles().get(i);
    // Make sure that we are setting the flag telling Dataflow that we need further splits.
    assertFalse(derivedSource.getSource().getDoesNotNeedSplitting());
    Object deserializedSource =
        WorkerCustomSources.deserializeFromCloudSource(derivedSource.getSource().getSpec());
    assertTrue(deserializedSource instanceof SplittableOnlyBoundedSource);
    SplittableOnlyBoundedSource<?> splittableOnlySource =
        (SplittableOnlyBoundedSource<?>) deserializedSource;
    List<? extends BoundedSource<?>> splitSources = splittableOnlySource.split(1L, options);
    int expectedNumSplits = i < 3 ? 100 : 55;
    assertEquals(expectedNumSplits, splitSources.size());
    for (int j = 0; j < splitSources.size(); ++j) {
      assertTrue(splitSources.get(j) instanceof OffsetBasedSource);
      OffsetBasedSource<?> offsetBasedSource = (OffsetBasedSource<?>) splitSources.get(j);
      assertEquals(i * 100 + j, offsetBasedSource.getStartOffset());
      assertEquals(i * 100 + j + 1, offsetBasedSource.getEndOffset());
    }
  }
  for (int i = 4; i < WorkerCustomSources.DEFAULT_NUM_BUNDLES_LIMIT; ++i) {
    DerivedSource derivedSource = split.getBundles().get(i);
    // Make sure that we are not setting the flag telling Dataflow that we need further splits
    // for the individual counting sources.
    assertTrue(derivedSource.getSource().getDoesNotNeedSplitting());
    Object deserializedSource =
        WorkerCustomSources.deserializeFromCloudSource(derivedSource.getSource().getSpec());
    assertTrue(deserializedSource instanceof OffsetBasedSource);
    OffsetBasedSource<?> offsetBasedSource = (OffsetBasedSource<?>) deserializedSource;
    assertEquals(351 + i, offsetBasedSource.getStartOffset());
    assertEquals(351 + i + 1, offsetBasedSource.getEndOffset());
  }
}
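The doesNotNeedSplitting flag distinguishes the two kinds of bundles asserted above: a SplittableOnlyBoundedSource is a placeholder that Dataflow must split again, while the remaining OffsetBasedSources are final. A hedged sketch of a helper that expands a SourceSplitResponse into leaf sources by re-splitting any placeholder bundles; expandBundles is hypothetical and not part of the Beam worker code:

// Hypothetical helper for illustration only.
static List<BoundedSource<?>> expandBundles(SourceSplitResponse response, PipelineOptions options)
    throws Exception {
  List<BoundedSource<?>> leaves = new ArrayList<>();
  for (DerivedSource bundle : response.getBundles()) {
    Object source =
        WorkerCustomSources.deserializeFromCloudSource(bundle.getSource().getSpec());
    if (source instanceof SplittableOnlyBoundedSource) {
      // Placeholder bundle: ask it to split again (1-byte desired bundle size, as in the tests).
      leaves.addAll(((SplittableOnlyBoundedSource<?>) source).split(1L, options));
    } else {
      // Already a concrete leaf source.
      leaves.add((BoundedSource<?>) source);
    }
  }
  return leaves;
}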
Use of com.google.api.services.dataflow.model.DerivedSource in project beam by apache.
From the class WorkerCustomSourcesTest, the method testSplitAndReadBundlesBack:
@Test
public void testSplitAndReadBundlesBack() throws Exception {
  com.google.api.services.dataflow.model.Source source =
      translateIOToCloudSource(CountingSource.upTo(10L), options);
  List<WindowedValue<Integer>> elems = readElemsFromSource(options, source);
  assertEquals(10L, elems.size());
  for (long i = 0; i < 10L; i++) {
    assertEquals(valueInGlobalWindow(i), elems.get((int) i));
  }
  SourceSplitResponse response =
      performSplit(
          source,
          options,
          16L /* desiredBundleSizeBytes for two longs */,
          null /* numBundles limit */,
          null);
  assertEquals("SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED", response.getOutcome());
  List<DerivedSource> bundles = response.getBundles();
  assertEquals(5, bundles.size());
  for (int i = 0; i < 5; ++i) {
    DerivedSource bundle = bundles.get(i);
    assertEquals("SOURCE_DERIVATION_MODE_INDEPENDENT", bundle.getDerivationMode());
    com.google.api.services.dataflow.model.Source bundleSource = bundle.getSource();
    assertTrue(bundleSource.getDoesNotNeedSplitting());
    bundleSource.setCodec(source.getCodec());
    List<WindowedValue<Integer>> xs = readElemsFromSource(options, bundleSource);
    assertThat(
        "Failed on bundle " + i,
        xs,
        contains(valueInGlobalWindow(0L + 2 * i), valueInGlobalWindow(1L + 2 * i)));
    assertTrue(bundle.getSource().getMetadata().getEstimatedSizeBytes() > 0);
  }
}
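The assertions above touch most fields of the generated cloud Source model: spec, codec, doesNotNeedSplitting, and metadata with its estimated size. For orientation, a minimal sketch of assembling such a bundle by hand with the model's bean-style setters; every field value here is an illustrative placeholder, not what the Dataflow worker actually serializes:

// Illustrative only: the bean-style model fields exercised by the assertions above.
java.util.Map<String, Object> spec = new java.util.HashMap<>();
spec.put("@type", "SomeCustomSourceType"); // placeholder; real specs are built by the worker
java.util.Map<String, Object> codec = new java.util.HashMap<>();
codec.put("@type", "SomeCoderType"); // placeholder codec description
com.google.api.services.dataflow.model.Source bundleSource =
    new com.google.api.services.dataflow.model.Source()
        .setSpec(spec)
        .setCodec(codec)
        .setDoesNotNeedSplitting(true)
        .setMetadata(new SourceMetadata().setEstimatedSizeBytes(16L));
DerivedSource bundle =
    new DerivedSource()
        .setDerivationMode("SOURCE_DERIVATION_MODE_INDEPENDENT")
        .setSource(bundleSource);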