Example 1 with SchemaUpdateOption

use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption in project beam by apache.

From the class WriteTables, method startLoad:

private PendingJob startLoad(
        JobService jobService,
        DatasetService datasetService,
        String jobIdPrefix,
        TableReference ref,
        TimePartitioning timePartitioning,
        Clustering clustering,
        @Nullable TableSchema schema,
        List<String> gcsUris,
        WriteDisposition writeDisposition,
        CreateDisposition createDisposition,
        Set<SchemaUpdateOption> schemaUpdateOptions) {
    JobConfigurationLoad loadConfig =
        new JobConfigurationLoad()
            .setDestinationTable(ref)
            .setSchema(schema)
            .setSourceUris(gcsUris)
            .setWriteDisposition(writeDisposition.name())
            .setCreateDisposition(createDisposition.name())
            .setSourceFormat(sourceFormat)
            .setIgnoreUnknownValues(ignoreUnknownValues)
            .setUseAvroLogicalTypes(useAvroLogicalTypes);
    if (schemaUpdateOptions != null) {
        List<String> options =
            schemaUpdateOptions.stream()
                .map(Enum<SchemaUpdateOption>::name)
                .collect(Collectors.toList());
        loadConfig.setSchemaUpdateOptions(options);
    }
    if (timePartitioning != null) {
        loadConfig.setTimePartitioning(timePartitioning);
        // only set clustering if timePartitioning is set
        if (clustering != null) {
            loadConfig.setClustering(clustering);
        }
    }
    if (kmsKey != null) {
        loadConfig.setDestinationEncryptionConfiguration(new EncryptionConfiguration().setKmsKeyName(kmsKey));
    }
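    // Run the load job in the configured load-job project when one is provided; otherwise use the destination table's project.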
    String projectId = loadJobProjectId == null || loadJobProjectId.get() == null ? ref.getProjectId() : loadJobProjectId.get();
    String bqLocation = BigQueryHelpers.getDatasetLocation(datasetService, ref.getProjectId(), ref.getDatasetId());
    PendingJob retryJob = new PendingJob(
    // Function to load the data.
    jobId -> {
        JobReference jobRef = new JobReference().setProjectId(projectId).setJobId(jobId.getJobId()).setLocation(bqLocation);
        LOG.info("Loading {} files into {} using job {}, job id iteration {}", gcsUris.size(), ref, jobRef, jobId.getRetryIndex());
        try {
            jobService.startLoadJob(jobRef, loadConfig);
        } catch (IOException | InterruptedException e) {
            LOG.warn("Load job {} failed with {}", jobRef, e.toString());
            throw new RuntimeException(e);
        }
        return null;
    },
    // Function to poll the result of a load job.
    jobId -> {
        JobReference jobRef = new JobReference().setProjectId(projectId).setJobId(jobId.getJobId()).setLocation(bqLocation);
        try {
            return jobService.pollJob(jobRef, BatchLoads.LOAD_JOB_POLL_MAX_RETRIES);
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    },
    // Function to lookup a job.
    jobId -> {
        JobReference jobRef = new JobReference().setProjectId(projectId).setJobId(jobId.getJobId()).setLocation(bqLocation);
        try {
            return jobService.getJob(jobRef);
        } catch (InterruptedException | IOException e) {
            throw new RuntimeException(e);
        }
    }, maxRetryJobs, jobIdPrefix);
    return retryJob;
}
Also used: JobConfigurationLoad(com.google.api.services.bigquery.model.JobConfigurationLoad) JobReference(com.google.api.services.bigquery.model.JobReference) SchemaUpdateOption(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) EncryptionConfiguration(com.google.api.services.bigquery.model.EncryptionConfiguration) PendingJob(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.PendingJob) IOException(java.io.IOException)
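The schemaUpdateOptions set consumed by startLoad above is declared on the BigQueryIO.Write transform and carried through to the JobConfigurationLoad built here. A minimal caller-side sketch, assuming a hypothetical destination spec "my-project:my_dataset.my_table" and a TableSchema variable named tableSchema:

Set<SchemaUpdateOption> updateOptions = EnumSet.of(SchemaUpdateOption.ALLOW_FIELD_ADDITION);

BigQueryIO.Write<TableRow> write =
    BigQueryIO.writeTableRows()
        // Hypothetical destination table and schema; replace with real values.
        .to("my-project:my_dataset.my_table")
        .withSchema(tableSchema)
        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
        .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
        // These options end up in JobConfigurationLoad.setSchemaUpdateOptions in startLoad.
        .withSchemaUpdateOptions(updateOptions);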

Example 2 with SchemaUpdateOption

use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption in project beam by apache.

From the class BigQuerySchemaUpdateOptionsIT, method testAllowFieldAddition:

@Test
public void testAllowFieldAddition() throws Exception {
    String tableName = makeTestTable();
    Set<SchemaUpdateOption> schemaUpdateOptions = EnumSet.of(BigQueryIO.Write.SchemaUpdateOption.ALLOW_FIELD_ADDITION);
    TableSchema newSchema =
        new TableSchema()
            .setFields(
                ImmutableList.of(
                    new TableFieldSchema().setName("new_field").setType("STRING"),
                    new TableFieldSchema().setName("optional_field").setType("STRING"),
                    new TableFieldSchema().setName("required_field").setType("STRING").setMode("REQUIRED")));
    String[] values = { "meow", "bark" };
    TableRow rowToInsert = new TableRow().set("new_field", values[0]).set("required_field", values[1]);
    String testQuery = String.format("SELECT new_field, required_field FROM [%s.%s];", BIG_QUERY_DATASET_ID, tableName);
    List<List<String>> expectedResult = Arrays.asList(Arrays.asList(values));
    runWriteTest(schemaUpdateOptions, tableName, newSchema, rowToInsert, testQuery, expectedResult);
}
Also used: TableSchema(com.google.api.services.bigquery.model.TableSchema) SchemaUpdateOption(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) TableRow(com.google.api.services.bigquery.model.TableRow) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Test(org.junit.Test)
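SchemaUpdateOption is an ordinary enum, so both behaviours can be requested for the same write. A small sketch (only the option set changes relative to the test above):

// Allow new columns and REQUIRED-to-NULLABLE relaxation in one load job.
Set<SchemaUpdateOption> schemaUpdateOptions =
    EnumSet.of(
        SchemaUpdateOption.ALLOW_FIELD_ADDITION,
        SchemaUpdateOption.ALLOW_FIELD_RELAXATION);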

Example 3 with SchemaUpdateOption

use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption in project beam by apache.

From the class BigQuerySchemaUpdateOptionsIT, method runWriteTest:

/**
 * Runs a write test against a BigQuery table to check that the given SchemaUpdateOption set takes
 * effect.
 *
 * <p>Attempts to write a row via BigQueryIO.writeTableRows with the given parameters, then runs
 * the given query, and finally checks the results of the query.
 *
 * @param schemaUpdateOptions The SchemaUpdateOption set to use
 * @param tableName The table to write to
 * @param schema The schema to use for the table
 * @param rowToInsert The row to insert
 * @param testQuery A testing SQL query to run after writing the row
 * @param expectedResult The expected result of the query as a nested list of column values (one
 *     list per result row)
 */
private void runWriteTest(Set<SchemaUpdateOption> schemaUpdateOptions, String tableName, TableSchema schema, TableRow rowToInsert, String testQuery, List<List<String>> expectedResult) throws Exception {
    Options options = TestPipeline.testingPipelineOptions().as(Options.class);
    options.setTempLocation(options.getTempRoot() + "/bq_it_temp");
    Pipeline p = Pipeline.create(options);
    Create.Values<TableRow> input = Create.<TableRow>of(rowToInsert);
    Write<TableRow> writer =
        BigQueryIO.writeTableRows()
            .to(String.format("%s:%s.%s", options.getProject(), BIG_QUERY_DATASET_ID, tableName))
            .withSchema(schema)
            .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
            .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
            .withSchemaUpdateOptions(schemaUpdateOptions);
    p.apply(input).apply(writer);
    p.run().waitUntilFinish();
    QueryResponse response = BQ_CLIENT.queryWithRetries(testQuery, project);
    List<List<String>> result =
        response.getRows().stream()
            .map(row -> row.getF().stream().map(cell -> cell.getV().toString()).collect(Collectors.toList()))
            .collect(Collectors.toList());
    assertEquals(expectedResult, result);
}
Also used: Arrays(java.util.Arrays) TestPipelineOptions(org.apache.beam.sdk.testing.TestPipelineOptions) BeforeClass(org.junit.BeforeClass) RunWith(org.junit.runner.RunWith) LoggerFactory(org.slf4j.LoggerFactory) SecureRandom(java.security.SecureRandom) SchemaUpdateOption(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) Create(org.apache.beam.sdk.transforms.Create) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) TableRow(com.google.api.services.bigquery.model.TableRow) TableSchema(com.google.api.services.bigquery.model.TableSchema) Pipeline(org.apache.beam.sdk.Pipeline) EnumSet(java.util.EnumSet) BigqueryClient(org.apache.beam.sdk.io.gcp.testing.BigqueryClient) QueryResponse(com.google.api.services.bigquery.model.QueryResponse) TableReference(com.google.api.services.bigquery.model.TableReference) AfterClass(org.junit.AfterClass) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Logger(org.slf4j.Logger) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) Set(java.util.Set) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) Collectors(java.util.stream.Collectors) Table(com.google.api.services.bigquery.model.Table) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Write(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write) Assert.assertEquals(org.junit.Assert.assertEquals)
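The Options interface used in runWriteTest is not shown in this excerpt. A minimal stand-in, assuming it only needs getTempRoot (from TestPipelineOptions) and getProject (from GcpOptions):

// Hypothetical sketch of the pipeline options interface referenced above;
// the real interface in BigQuerySchemaUpdateOptionsIT may declare additional options.
public interface Options extends TestPipelineOptions, GcpOptions {}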

Example 4 with SchemaUpdateOption

use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption in project beam by apache.

From the class BigQuerySchemaUpdateOptionsIT, method testAllowFieldRelaxation:

@Test
public void testAllowFieldRelaxation() throws Exception {
    String tableName = makeTestTable();
    Set<SchemaUpdateOption> schemaUpdateOptions = EnumSet.of(BigQueryIO.Write.SchemaUpdateOption.ALLOW_FIELD_RELAXATION);
    TableSchema newSchema =
        new TableSchema()
            .setFields(ImmutableList.of(new TableFieldSchema().setName("optional_field").setType("STRING")));
    String value = "hellooo";
    TableRow rowToInsert = new TableRow().set("optional_field", value);
    String testQuery = String.format("SELECT optional_field FROM [%s.%s];", BIG_QUERY_DATASET_ID, tableName);
    List<List<String>> expectedResult = Arrays.asList(Arrays.asList(value));
    runWriteTest(schemaUpdateOptions, tableName, newSchema, rowToInsert, testQuery, expectedResult);
}
Also used: TableSchema(com.google.api.services.bigquery.model.TableSchema) SchemaUpdateOption(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) TableRow(com.google.api.services.bigquery.model.TableRow) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Test(org.junit.Test)

Aggregations

SchemaUpdateOption (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) 4
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema) 3
TableRow (com.google.api.services.bigquery.model.TableRow) 3
TableSchema (com.google.api.services.bigquery.model.TableSchema) 3
List (java.util.List) 3
ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) 3
Test (org.junit.Test) 3
EncryptionConfiguration (com.google.api.services.bigquery.model.EncryptionConfiguration) 1
JobConfigurationLoad (com.google.api.services.bigquery.model.JobConfigurationLoad) 1
JobReference (com.google.api.services.bigquery.model.JobReference) 1
QueryResponse (com.google.api.services.bigquery.model.QueryResponse) 1
Table (com.google.api.services.bigquery.model.Table) 1
TableReference (com.google.api.services.bigquery.model.TableReference) 1
IOException (java.io.IOException) 1
SecureRandom (java.security.SecureRandom) 1
Arrays (java.util.Arrays) 1
EnumSet (java.util.EnumSet) 1
Set (java.util.Set) 1
Collectors (java.util.stream.Collectors) 1
Pipeline (org.apache.beam.sdk.Pipeline) 1