Example 11 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache, from class BigQueryIOTest, method testBigQueryTableSourceInitSplit.

@Test
public void testBigQueryTableSourceInitSplit() throws Exception {
    FakeDatasetService fakeDatasetService = new FakeDatasetService();
    FakeJobService fakeJobService = new FakeJobService();
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(fakeJobService).withDatasetService(fakeDatasetService);
    List<TableRow> expected =
        ImmutableList.of(
            new TableRow().set("name", "a").set("number", 1L),
            new TableRow().set("name", "b").set("number", 2L),
            new TableRow().set("name", "c").set("number", 3L),
            new TableRow().set("name", "d").set("number", 4L),
            new TableRow().set("name", "e").set("number", 5L),
            new TableRow().set("name", "f").set("number", 6L));
    TableReference table = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
    fakeDatasetService.createDataset("project", "data_set", "", "");
    fakeDatasetService.createTable(
        new Table()
            .setTableReference(table)
            .setSchema(
                new TableSchema()
                    .setFields(
                        ImmutableList.of(
                            new TableFieldSchema().setName("name").setType("STRING"),
                            new TableFieldSchema().setName("number").setType("INTEGER")))));
    fakeDatasetService.insertAll(table, expected, null);
    Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryTableSourceInitSplit");
    String stepUuid = "testStepUuid";
    BoundedSource<TableRow> bqSource = BigQueryTableSource.create(stepUuid, StaticValueProvider.of(table), fakeBqServices);
    PipelineOptions options = PipelineOptionsFactory.create();
    options.setTempLocation(baseDir.toString());
    BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
    bqOptions.setProject("project");
    List<TableRow> read = SourceTestUtils.readFromSource(bqSource, options);
    assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
    SourceTestUtils.assertSplitAtFractionBehavior(bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
    List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
    assertEquals(2, sources.size());
    // Simulate a repeated call to split(), like a Dataflow worker will sometimes do.
    sources = bqSource.split(200, options);
    assertEquals(2, sources.size());
    BoundedSource<TableRow> actual = sources.get(0);
    assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
    // A repeated call to split() should not have caused a duplicate extract job.
    assertEquals(1, fakeJobService.getNumExtractJobCalls());
}
Also used : Path(java.nio.file.Path) HashBasedTable(com.google.common.collect.HashBasedTable) Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) BigQueryHelpers.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) TableReference(com.google.api.services.bigquery.model.TableReference) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) TableRow(com.google.api.services.bigquery.model.TableRow) Test(org.junit.Test)
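
Examples 11 and 12 share the same options setup: default PipelineOptions are created with PipelineOptionsFactory.create(), a temp location is set for extract files, and the same object is viewed as BigQueryOptions through as(). A minimal sketch of that pattern, with the class name and the temp path chosen here only for illustration (the tests use a JUnit temp folder instead):

import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class OptionsSetupSketch {
    public static void main(String[] args) {
        // Default options; the tests do no command-line parsing.
        PipelineOptions options = PipelineOptionsFactory.create();
        // Placeholder path; the tests point this at a temporary directory.
        options.setTempLocation("/tmp/bq-test");
        // as() exposes the same underlying options through a typed facet,
        // so the project set here is visible to anything holding `options`.
        BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
        bqOptions.setProject("project");
    }
}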

Example 12 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache, from class BigQueryIOTest, method testBigQueryNoTableQuerySourceInitSplit.

@Test
public void testBigQueryNoTableQuerySourceInitSplit() throws Exception {
    TableReference dryRunTable = new TableReference();
    Job queryJob = new Job();
    JobStatistics queryJobStats = new JobStatistics();
    JobStatistics2 queryStats = new JobStatistics2();
    queryStats.setReferencedTables(ImmutableList.of(dryRunTable));
    queryJobStats.setQuery(queryStats);
    queryJob.setStatus(new JobStatus()).setStatistics(queryJobStats);
    Job extractJob = new Job();
    JobStatistics extractJobStats = new JobStatistics();
    JobStatistics4 extractStats = new JobStatistics4();
    extractStats.setDestinationUriFileCounts(ImmutableList.of(1L));
    extractJobStats.setExtract(extractStats);
    extractJob.setStatus(new JobStatus()).setStatistics(extractJobStats);
    FakeDatasetService datasetService = new FakeDatasetService();
    FakeJobService jobService = new FakeJobService();
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(jobService).withDatasetService(datasetService);
    PipelineOptions options = PipelineOptionsFactory.create();
    BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
    bqOptions.setProject("project");
    String stepUuid = "testStepUuid";
    TableReference tempTableReference = createTempTableReference(bqOptions.getProject(), createJobIdToken(bqOptions.getJobName(), stepUuid));
    List<TableRow> expected =
        ImmutableList.of(
            new TableRow().set("name", "a").set("number", 1L),
            new TableRow().set("name", "b").set("number", 2L),
            new TableRow().set("name", "c").set("number", 3L),
            new TableRow().set("name", "d").set("number", 4L),
            new TableRow().set("name", "e").set("number", 5L),
            new TableRow().set("name", "f").set("number", 6L));
    datasetService.createDataset(tempTableReference.getProjectId(), tempTableReference.getDatasetId(), "", "");
    Table table =
        new Table()
            .setTableReference(tempTableReference)
            .setSchema(
                new TableSchema()
                    .setFields(
                        ImmutableList.of(
                            new TableFieldSchema().setName("name").setType("STRING"),
                            new TableFieldSchema().setName("number").setType("INTEGER"))));
    datasetService.createTable(table);
    String query = FakeBigQueryServices.encodeQuery(expected);
    jobService.expectDryRunQuery(
        "project",
        query,
        new JobStatistics()
            .setQuery(
                new JobStatistics2()
                    .setTotalBytesProcessed(100L)
                    .setReferencedTables(ImmutableList.of(table.getTableReference()))));
    Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryNoTableQuerySourceInitSplit");
    BoundedSource<TableRow> bqSource =
        BigQueryQuerySource.create(
            stepUuid,
            StaticValueProvider.of(query),
            true, /* flattenResults */
            true, /* useLegacySql */
            fakeBqServices);
    options.setTempLocation(baseDir.toString());
    List<TableRow> read = convertBigDecimaslToLong(SourceTestUtils.readFromSource(bqSource, options));
    assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
    SourceTestUtils.assertSplitAtFractionBehavior(bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
    List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
    assertEquals(2, sources.size());
    BoundedSource<TableRow> actual = sources.get(0);
    assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
}
Also used : Path(java.nio.file.Path) JobStatistics(com.google.api.services.bigquery.model.JobStatistics) JobStatistics2(com.google.api.services.bigquery.model.JobStatistics2) HashBasedTable(com.google.common.collect.HashBasedTable) Table(com.google.api.services.bigquery.model.Table) JobStatistics4(com.google.api.services.bigquery.model.JobStatistics4) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) JobStatus(com.google.api.services.bigquery.model.JobStatus) BigQueryHelpers.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) TableReference(com.google.api.services.bigquery.model.TableReference) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) TableRow(com.google.api.services.bigquery.model.TableRow) Job(com.google.api.services.bigquery.model.Job) Test(org.junit.Test)
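
The read and split assertions in Examples 11 and 12 follow a generic recipe for checking any BoundedSource with SourceTestUtils. A minimal sketch of that recipe using CountingSource (the same source used in Example 13); the class and method names here are illustrative:

import java.util.List;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.SourceTestUtils;

public class SourceSplitSketch {
    @SuppressWarnings("deprecation")
    public static void verify() throws Exception {
        BoundedSource<Long> source = CountingSource.upTo(1000);
        PipelineOptions options = PipelineOptionsFactory.create();

        // Reading the source directly should yield every element exactly once.
        List<Long> read = SourceTestUtils.readFromSource(source, options);
        assert read.size() == 1000;

        // Splitting must neither drop nor duplicate elements: the union of the
        // split sources has to read the same data as the original source.
        SourceTestUtils.assertSourcesEqualReferenceSource(
            source, source.split(100 /* desiredBundleSizeBytes */, options), options);
    }
}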

Example 13 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache, from class BigQueryIOTest, method testTransformingSource.

@Test
public void testTransformingSource() throws Exception {
    int numElements = 10000;
    @SuppressWarnings("deprecation") BoundedSource<Long> longSource = CountingSource.upTo(numElements);
    SerializableFunction<Long, String> toStringFn = new SerializableFunction<Long, String>() {

        @Override
        public String apply(Long input) {
            return input.toString();
        }
    };
    BoundedSource<String> stringSource = new TransformingSource<>(longSource, toStringFn, StringUtf8Coder.of());
    List<String> expected = Lists.newArrayList();
    for (int i = 0; i < numElements; i++) {
        expected.add(String.valueOf(i));
    }
    PipelineOptions options = PipelineOptionsFactory.create();
    Assert.assertThat(SourceTestUtils.readFromSource(stringSource, options), CoreMatchers.is(expected));
    SourceTestUtils.assertSplitAtFractionBehavior(stringSource, 100, 0.3, ExpectedSplitOutcome.MUST_SUCCEED_AND_BE_CONSISTENT, options);
    SourceTestUtils.assertSourcesEqualReferenceSource(stringSource, stringSource.split(100, options), options);
}
Also used : SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) Test(org.junit.Test)
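
SerializableFunction declares a single apply method, so on Java 8 the anonymous class in this example can be written as a lambda. A brief, equivalent sketch (the wrapper class name is only for illustration):

import org.apache.beam.sdk.transforms.SerializableFunction;

public class ToStringFnSketch {
    // Same behavior as the anonymous SerializableFunction above: a lambda works
    // here because the interface has exactly one abstract method, apply().
    static final SerializableFunction<Long, String> TO_STRING_FN = input -> input.toString();
}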

Example 14 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache, from class UnboundedReadFromBoundedSourceTest, method testBoundedToUnboundedSourceAdapterCheckpointRestart.

private <T> void testBoundedToUnboundedSourceAdapterCheckpointRestart(BoundedSource<T> boundedSource, List<T> expectedElements) throws Exception {
    BoundedToUnboundedSourceAdapter<T> unboundedSource = new BoundedToUnboundedSourceAdapter<>(boundedSource);
    PipelineOptions options = PipelineOptionsFactory.create();
    BoundedToUnboundedSourceAdapter<T>.Reader<T> reader = unboundedSource.createReader(options, null);
    List<T> actual = Lists.newArrayList();
    for (boolean hasNext = reader.start(); hasNext; ) {
        actual.add(reader.getCurrent());
        // checkpoint every 9 elements
        if (actual.size() % 9 == 0) {
            Checkpoint<T> checkpoint = reader.getCheckpointMark();
            Coder<Checkpoint<T>> checkpointCoder = unboundedSource.getCheckpointMarkCoder();
            Checkpoint<T> decodedCheckpoint = CoderUtils.decodeFromByteArray(checkpointCoder, CoderUtils.encodeToByteArray(checkpointCoder, checkpoint));
            reader.close();
            checkpoint.finalizeCheckpoint();
            BoundedToUnboundedSourceAdapter<T>.Reader<T> restarted = unboundedSource.createReader(options, decodedCheckpoint);
            reader = restarted;
            hasNext = reader.start();
        } else {
            hasNext = reader.advance();
        }
    }
    Checkpoint<T> checkpointDone = reader.getCheckpointMark();
    assertTrue(checkpointDone.getResidualElements() == null || checkpointDone.getResidualElements().isEmpty());
    assertEquals(expectedElements.size(), actual.size());
    assertEquals(Sets.newHashSet(expectedElements), Sets.newHashSet(actual));
}
Also used : Checkpoint(org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter.Checkpoint) BoundedToUnboundedSourceAdapter(org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions)
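
The round trip through CoderUtils in this example mimics what a runner does when it persists and later restores a checkpoint mark. A minimal sketch of the same encode/decode round trip with a standard coder (VarLongCoder here is just a convenient stand-in for the checkpoint coder):

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.util.CoderUtils;

public class CoderRoundTripSketch {
    public static void main(String[] args) throws Exception {
        Coder<Long> coder = VarLongCoder.of();
        // Encode to bytes and decode back, as the test does with the checkpoint coder.
        byte[] encoded = CoderUtils.encodeToByteArray(coder, 42L);
        Long decoded = CoderUtils.decodeFromByteArray(coder, encoded);
        assert decoded == 42L; // the round trip must preserve the value
    }
}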

Example 15 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache, from class UnboundedReadFromBoundedSourceTest, method testReadFromCheckpointBeforeStart.

@Test
public void testReadFromCheckpointBeforeStart() throws Exception {
    thrown.expect(NoSuchElementException.class);
    BoundedSource<Long> countingSource = CountingSource.upTo(100);
    BoundedToUnboundedSourceAdapter<Long> unboundedSource = new BoundedToUnboundedSourceAdapter<>(countingSource);
    PipelineOptions options = PipelineOptionsFactory.create();
    List<TimestampedValue<Long>> elements = ImmutableList.of(TimestampedValue.of(1L, new Instant(1L)));
    Checkpoint<Long> checkpoint = new Checkpoint<>(elements, countingSource);
    unboundedSource.createReader(options, checkpoint).getCurrent();
}
Also used : Checkpoint(org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter.Checkpoint) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) BoundedToUnboundedSourceAdapter(org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Instant(org.joda.time.Instant) Test(org.junit.Test)

Aggregations

PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 92 usages
Test (org.junit.Test): 79 usages
File (java.io.File): 26 usages
ArrayList (java.util.ArrayList): 16 usages
Pipeline (org.apache.beam.sdk.Pipeline): 10 usages
Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata): 9 usages
Path (java.nio.file.Path): 6 usages
BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString): 6 usages
SerializedPipelineOptions (org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions): 5 usages
KV (org.apache.beam.sdk.values.KV): 5 usages
Matchers.containsString (org.hamcrest.Matchers.containsString): 5 usages
Table (com.google.api.services.bigquery.model.Table): 4 usages
TableReference (com.google.api.services.bigquery.model.TableReference): 4 usages
TableRow (com.google.api.services.bigquery.model.TableRow): 4 usages
HashBasedTable (com.google.common.collect.HashBasedTable): 4 usages
BoundedToUnboundedSourceAdapter (org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter): 4 usages
BigQueryHelpers.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference): 4 usages
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 4 usages
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 3 usages
TableSchema (com.google.api.services.bigquery.model.TableSchema): 3 usages
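
All of the examples above create options with PipelineOptionsFactory.create() because they run inside tests. In a real pipeline, options are usually declared as a custom sub-interface of PipelineOptions and parsed from command-line arguments; a minimal sketch, with the interface, property, and default value invented here purely for illustration:

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class CustomOptionsSketch {
    // Hypothetical options interface; the property name maps to --inputTable.
    public interface MyOptions extends PipelineOptions {
        @Description("BigQuery table to read, in project:dataset.table form")
        @Default.String("project:data_set.table_name")
        String getInputTable();

        void setInputTable(String value);
    }

    public static void main(String[] args) {
        // Parses --inputTable=... from args and validates required properties.
        MyOptions options =
            PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class);
        System.out.println(options.getInputTable());
    }
}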