Use of org.apache.beam.sdk.options.PipelineOptions in the Apache Beam project.
From the class BigQueryIOTest, method testBigQueryTableSourceInitSplit.
/**
 * Exercises reading and splitting of a {@code BigQueryTableSource} backed by fake BigQuery
 * services.
 *
 * <p>Six rows are inserted into the fake table {@code project:data_set.table_name}. The test then
 * asserts that reading the source yields exactly those rows (in any order), runs the
 * split-at-fraction consistency check, and verifies that two successive {@code split()} calls each
 * return two sources, that the first returned source is a {@code TransformingSource}, and that the
 * fake job service recorded exactly one extract job call.
 *
 * @throws Exception if any of the fake services, the temp directory creation or the source
 *     utilities fail
 */
@Test
public void testBigQueryTableSourceInitSplit() throws Exception {
  // Fake service wiring.
  FakeJobService jobService = new FakeJobService();
  FakeDatasetService datasetService = new FakeDatasetService();
  FakeBigQueryServices services =
      new FakeBigQueryServices().withJobService(jobService).withDatasetService(datasetService);

  // Table definition and contents.
  TableReference tableRef = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
  TableSchema schema =
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("name").setType("STRING"),
                  new TableFieldSchema().setName("number").setType("INTEGER")));
  List<TableRow> expectedRows =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));

  datasetService.createDataset("project", "data_set", "", "");
  datasetService.createTable(new Table().setTableReference(tableRef).setSchema(schema));
  datasetService.insertAll(tableRef, expectedRows, null);

  Path tempDir = Files.createTempDirectory(tempFolder, "testBigQueryTableSourceInitSplit");

  BoundedSource<TableRow> source =
      BigQueryTableSource.create("testStepUuid", StaticValueProvider.of(tableRef), services);

  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  pipelineOptions.setTempLocation(tempDir.toString());
  pipelineOptions.as(BigQueryOptions.class).setProject("project");

  // Reading must return every inserted row.
  List<TableRow> actualRows = SourceTestUtils.readFromSource(source, pipelineOptions);
  TableRow[] expectedArray = Iterables.toArray(expectedRows, TableRow.class);
  assertThat(actualRows, containsInAnyOrder(expectedArray));

  SourceTestUtils.assertSplitAtFractionBehavior(
      source, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, pipelineOptions);

  List<? extends BoundedSource<TableRow>> splits = source.split(100, pipelineOptions);
  assertEquals(2, splits.size());

  // Simulate a repeated call to split(), like a Dataflow worker will sometimes do.
  splits = source.split(200, pipelineOptions);
  assertEquals(2, splits.size());

  BoundedSource<TableRow> firstSplit = splits.get(0);
  assertThat(firstSplit, CoreMatchers.instanceOf(TransformingSource.class));

  // A repeated call to split() should not have caused a duplicate extract job.
  assertEquals(1, jobService.getNumExtractJobCalls());
}
Use of org.apache.beam.sdk.options.PipelineOptions in the Apache Beam project.
From the class BigQueryIOTest, method testBigQueryNoTableQuerySourceInitSplit.
/**
 * Exercises reading and splitting of a {@code BigQueryQuerySource} backed by fake BigQuery
 * services.
 *
 * <p>The test registers a temporary table (derived from the project and a job id token) in the
 * fake dataset service, encodes the expected rows as the query string, and registers a dry-run
 * expectation for that query on the fake job service. It then asserts that reading the source
 * yields the expected rows (in any order), runs the split-at-fraction consistency check, and
 * verifies that {@code split()} returns two sources, the first of which is a
 * {@code TransformingSource}.
 *
 * @throws Exception if any of the fake services, the temp directory creation or the source
 *     utilities fail
 */
@Test
public void testBigQueryNoTableQuerySourceInitSplit() throws Exception {
  FakeDatasetService datasetService = new FakeDatasetService();
  FakeJobService jobService = new FakeJobService();
  FakeBigQueryServices fakeBqServices =
      new FakeBigQueryServices().withJobService(jobService).withDatasetService(datasetService);

  PipelineOptions options = PipelineOptionsFactory.create();
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  bqOptions.setProject("project");

  String stepUuid = "testStepUuid";
  TableReference tempTableReference =
      createTempTableReference(
          bqOptions.getProject(), createJobIdToken(bqOptions.getJobName(), stepUuid));

  List<TableRow> expected =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));

  // Register the temporary table with the fake dataset service.
  datasetService.createDataset(
      tempTableReference.getProjectId(), tempTableReference.getDatasetId(), "", "");
  Table table =
      new Table()
          .setTableReference(tempTableReference)
          .setSchema(
              new TableSchema()
                  .setFields(
                      ImmutableList.of(
                          new TableFieldSchema().setName("name").setType("STRING"),
                          new TableFieldSchema().setName("number").setType("INTEGER"))));
  datasetService.createTable(table);

  // The query string is the encoded form of the expected rows; the dry run for it reports
  // 100 bytes processed and references the temporary table registered above.
  String query = FakeBigQueryServices.encodeQuery(expected);
  jobService.expectDryRunQuery(
      "project",
      query,
      new JobStatistics()
          .setQuery(
              new JobStatistics2()
                  .setTotalBytesProcessed(100L)
                  .setReferencedTables(ImmutableList.of(table.getTableReference()))));

  Path baseDir =
      Files.createTempDirectory(tempFolder, "testBigQueryNoTableQuerySourceInitSplit");
  BoundedSource<TableRow> bqSource =
      BigQueryQuerySource.create(
          stepUuid,
          StaticValueProvider.of(query),
          /* flattenResults */ true,
          /* useLegacySql */ true,
          fakeBqServices);
  options.setTempLocation(baseDir.toString());

  List<TableRow> read =
      convertBigDecimaslToLong(SourceTestUtils.readFromSource(bqSource, options));
  assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));

  SourceTestUtils.assertSplitAtFractionBehavior(
      bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);

  List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
  assertEquals(2, sources.size());
  BoundedSource<TableRow> actual = sources.get(0);
  assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
}
Use of org.apache.beam.sdk.options.PipelineOptions in the Apache Beam project.
From the class BigQueryIOTest, method testTransformingSource.
/**
 * Verifies that a {@code TransformingSource} wrapping a counting source of 10000 longs, with a
 * function that converts each {@code Long} to its string form, reads the strings "0" through
 * "9999" in order, passes the split-at-fraction check, and that its splits are equal to the
 * unsplit source when compared by {@code SourceTestUtils.assertSourcesEqualReferenceSource}.
 *
 * @throws Exception if reading or splitting the source fails
 */
@Test
public void testTransformingSource() throws Exception {
  final int count = 10000;

  // Expected output: the decimal string of every index in [0, count).
  List<String> expectedStrings = Lists.newArrayList();
  int next = 0;
  while (next < count) {
    expectedStrings.add(Integer.toString(next));
    next++;
  }

  SerializableFunction<Long, String> stringify =
      new SerializableFunction<Long, String>() {
        @Override
        public String apply(Long input) {
          return input.toString();
        }
      };

  @SuppressWarnings("deprecation")
  BoundedSource<Long> numbers = CountingSource.upTo(count);
  BoundedSource<String> transformed =
      new TransformingSource<>(numbers, stringify, StringUtf8Coder.of());

  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();

  List<String> actualStrings = SourceTestUtils.readFromSource(transformed, pipelineOptions);
  Assert.assertThat(actualStrings, CoreMatchers.is(expectedStrings));

  SourceTestUtils.assertSplitAtFractionBehavior(
      transformed,
      100,
      0.3,
      ExpectedSplitOutcome.MUST_SUCCEED_AND_BE_CONSISTENT,
      pipelineOptions);

  List<? extends BoundedSource<String>> bundles = transformed.split(100, pipelineOptions);
  SourceTestUtils.assertSourcesEqualReferenceSource(transformed, bundles, pipelineOptions);
}
Use of org.apache.beam.sdk.options.PipelineOptions in the Apache Beam project.
From the class UnboundedReadFromBoundedSourceTest, method testBoundedToUnboundedSourceAdapterCheckpointRestart.
/**
 * Reads {@code boundedSource} through a {@code BoundedToUnboundedSourceAdapter}, restarting the
 * reader from a checkpoint after every ninth element, and checks the elements read.
 *
 * <p>At each restart the checkpoint mark is round-tripped through the adapter's checkpoint coder,
 * the current reader is closed, the original checkpoint is finalized, and a new reader is created
 * from the decoded checkpoint. Once the input is exhausted the test asserts that the final
 * checkpoint carries no residual elements, that the number of elements read equals
 * {@code expectedElements.size()}, and that the sets of read and expected elements are equal.
 *
 * @param boundedSource the bounded source to wrap and read
 * @param expectedElements the elements the wrapped source is expected to produce
 * @param <T> the element type of the source
 * @throws Exception if reading, checkpoint encoding/decoding or closing a reader fails
 */
private <T> void testBoundedToUnboundedSourceAdapterCheckpointRestart(BoundedSource<T> boundedSource, List<T> expectedElements) throws Exception {
  BoundedToUnboundedSourceAdapter<T> unboundedSource =
      new BoundedToUnboundedSourceAdapter<>(boundedSource);
  PipelineOptions options = PipelineOptionsFactory.create();
  BoundedToUnboundedSourceAdapter<T>.Reader<T> reader =
      unboundedSource.createReader(options, null);

  List<T> actual = Lists.newArrayList();
  for (boolean hasNext = reader.start(); hasNext; ) {
    actual.add(reader.getCurrent());
    // checkpoint every 9 elements
    if (actual.size() % 9 == 0) {
      Checkpoint<T> checkpoint = reader.getCheckpointMark();
      Coder<Checkpoint<T>> checkpointCoder = unboundedSource.getCheckpointMarkCoder();
      // Round-trip the checkpoint through its coder so the restart uses the decoded copy.
      Checkpoint<T> decodedCheckpoint =
          CoderUtils.decodeFromByteArray(
              checkpointCoder, CoderUtils.encodeToByteArray(checkpointCoder, checkpoint));
      reader.close();
      checkpoint.finalizeCheckpoint();

      // Resume from the decoded checkpoint with a fresh reader.
      reader = unboundedSource.createReader(options, decodedCheckpoint);
      hasNext = reader.start();
    } else {
      hasNext = reader.advance();
    }
  }

  Checkpoint<T> checkpointDone = reader.getCheckpointMark();
  // Close the last reader as well; intermediate readers are closed inside the loop, using the
  // same checkpoint-then-close order.
  reader.close();

  assertTrue(
      checkpointDone.getResidualElements() == null
          || checkpointDone.getResidualElements().isEmpty());
  assertEquals(expectedElements.size(), actual.size());
  assertEquals(Sets.newHashSet(expectedElements), Sets.newHashSet(actual));
}
Use of org.apache.beam.sdk.options.PipelineOptions in the Apache Beam project.
From the class UnboundedReadFromBoundedSourceTest, method testReadFromCheckpointBeforeStart.
/**
 * Expects a {@link NoSuchElementException} when {@code getCurrent()} is called on a reader that
 * was created from a checkpoint holding one residual element but has not been started.
 *
 * @throws Exception if creating the reader fails
 */
@Test
public void testReadFromCheckpointBeforeStart() throws Exception {
  thrown.expect(NoSuchElementException.class);

  BoundedSource<Long> bounded = CountingSource.upTo(100);
  BoundedToUnboundedSourceAdapter<Long> adapter = new BoundedToUnboundedSourceAdapter<>(bounded);
  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();

  // Checkpoint with a single residual element (value 1 at timestamp 1) over the counting source.
  TimestampedValue<Long> residual = TimestampedValue.of(1L, new Instant(1L));
  List<TimestampedValue<Long>> residualElements = ImmutableList.of(residual);
  Checkpoint<Long> restoredFrom = new Checkpoint<>(residualElements, bounded);

  // The reader is deliberately not started before reading the current element.
  adapter.createReader(pipelineOptions, restoredFrom).getCurrent();
}
Aggregations