Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption in the Apache Beam project.
Class WriteTables, method startLoad.
/**
 * Builds the BigQuery load-job configuration for {@code ref} and wraps it in a {@link PendingJob}
 * that knows how to start the job, poll it, and look it up.
 *
 * <p>Optional settings are applied only when present: schema update options, time partitioning
 * (clustering is applied only together with time partitioning), and the KMS key. The job runs in
 * {@code loadJobProjectId} when that provides a value, otherwise in the destination table's
 * project, and in the location reported for the destination dataset.
 *
 * @param jobService service used to start, poll and look up the load job
 * @param datasetService service used to resolve the destination dataset's location
 * @param jobIdPrefix prefix handed to the returned {@link PendingJob}
 * @param ref destination table
 * @param timePartitioning time partitioning for the destination table, or {@code null}
 * @param clustering clustering for the destination table, or {@code null}; ignored when {@code
 *     timePartitioning} is {@code null}
 * @param schema schema for the destination table, or {@code null}
 * @param gcsUris source URIs to load
 * @param writeDisposition write disposition; its enum name is sent to BigQuery
 * @param createDisposition create disposition; its enum name is sent to BigQuery
 * @param schemaUpdateOptions schema update options, or {@code null} to leave them unset
 * @return the pending job wrapping the start, poll and lookup callbacks
 */
private PendingJob startLoad(JobService jobService, DatasetService datasetService, String jobIdPrefix, TableReference ref, TimePartitioning timePartitioning, Clustering clustering, @Nullable TableSchema schema, List<String> gcsUris, WriteDisposition writeDisposition, CreateDisposition createDisposition, Set<SchemaUpdateOption> schemaUpdateOptions) {
  JobConfigurationLoad loadConfig =
      new JobConfigurationLoad()
          .setDestinationTable(ref)
          .setSchema(schema)
          .setSourceUris(gcsUris)
          .setWriteDisposition(writeDisposition.name())
          .setCreateDisposition(createDisposition.name())
          .setSourceFormat(sourceFormat)
          .setIgnoreUnknownValues(ignoreUnknownValues)
          .setUseAvroLogicalTypes(useAvroLogicalTypes);
  if (schemaUpdateOptions != null) {
    // The load configuration takes the options as their enum constant names.
    List<String> options =
        schemaUpdateOptions.stream().map(SchemaUpdateOption::name).collect(Collectors.toList());
    loadConfig.setSchemaUpdateOptions(options);
  }
  if (timePartitioning != null) {
    loadConfig.setTimePartitioning(timePartitioning);
    // only set clustering if timePartitioning is set
    if (clustering != null) {
      loadConfig.setClustering(clustering);
    }
  }
  if (kmsKey != null) {
    loadConfig.setDestinationEncryptionConfiguration(
        new EncryptionConfiguration().setKmsKeyName(kmsKey));
  }
  // Fall back to the destination table's project when no load-job project is configured.
  String projectId =
      loadJobProjectId == null || loadJobProjectId.get() == null
          ? ref.getProjectId()
          : loadJobProjectId.get();
  String bqLocation =
      BigQueryHelpers.getDatasetLocation(datasetService, ref.getProjectId(), ref.getDatasetId());
  PendingJob retryJob =
      new PendingJob(
          // Function to load the data.
          jobId -> {
            JobReference jobRef =
                new JobReference()
                    .setProjectId(projectId)
                    .setJobId(jobId.getJobId())
                    .setLocation(bqLocation);
            LOG.info(
                "Loading {} files into {} using job {}, job id iteration {}",
                gcsUris.size(),
                ref,
                jobRef,
                jobId.getRetryIndex());
            try {
              jobService.startLoadJob(jobRef, loadConfig);
            } catch (IOException | InterruptedException e) {
              if (e instanceof InterruptedException) {
                // Restore the interrupt flag so code further up the stack can observe it.
                Thread.currentThread().interrupt();
              }
              LOG.warn("Load job {} failed with {}", jobRef, e.toString());
              throw new RuntimeException(e);
            }
            return null;
          },
          // Function to poll the result of a load job.
          jobId -> {
            JobReference jobRef =
                new JobReference()
                    .setProjectId(projectId)
                    .setJobId(jobId.getJobId())
                    .setLocation(bqLocation);
            try {
              return jobService.pollJob(jobRef, BatchLoads.LOAD_JOB_POLL_MAX_RETRIES);
            } catch (InterruptedException e) {
              // Restore the interrupt flag so code further up the stack can observe it.
              Thread.currentThread().interrupt();
              throw new RuntimeException(e);
            }
          },
          // Function to lookup a job.
          jobId -> {
            JobReference jobRef =
                new JobReference()
                    .setProjectId(projectId)
                    .setJobId(jobId.getJobId())
                    .setLocation(bqLocation);
            try {
              return jobService.getJob(jobRef);
            } catch (InterruptedException | IOException e) {
              if (e instanceof InterruptedException) {
                // Restore the interrupt flag so code further up the stack can observe it.
                Thread.currentThread().interrupt();
              }
              throw new RuntimeException(e);
            }
          },
          maxRetryJobs,
          jobIdPrefix);
  return retryJob;
}
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption in the Apache Beam project.
Class BigQuerySchemaUpdateOptionsIT, method testAllowFieldAddition.
/**
 * Writes a row using a schema that contains {@code new_field} with {@code ALLOW_FIELD_ADDITION}
 * enabled, then checks that querying the table returns the written values.
 */
@Test
public void testAllowFieldAddition() throws Exception {
  // Obtain the destination table name first; everything below only builds local values.
  String tableName = makeTestTable();

  String newFieldValue = "meow";
  String requiredFieldValue = "bark";

  TableFieldSchema newField = new TableFieldSchema().setName("new_field").setType("STRING");
  TableFieldSchema optionalField =
      new TableFieldSchema().setName("optional_field").setType("STRING");
  TableFieldSchema requiredField =
      new TableFieldSchema().setName("required_field").setType("STRING").setMode("REQUIRED");
  TableSchema schemaWithNewField =
      new TableSchema().setFields(ImmutableList.of(newField, optionalField, requiredField));

  TableRow row =
      new TableRow().set("new_field", newFieldValue).set("required_field", requiredFieldValue);

  String query =
      String.format(
          "SELECT new_field, required_field FROM [%s.%s];", BIG_QUERY_DATASET_ID, tableName);

  // A single result row holding both written values, in SELECT order.
  List<List<String>> expectedRows =
      Arrays.asList(Arrays.asList(newFieldValue, requiredFieldValue));

  Set<SchemaUpdateOption> updateOptions =
      EnumSet.of(BigQueryIO.Write.SchemaUpdateOption.ALLOW_FIELD_ADDITION);

  runWriteTest(updateOptions, tableName, schemaWithNewField, row, query, expectedRows);
}
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption in the Apache Beam project.
Class BigQuerySchemaUpdateOptionsIT, method runWriteTest.
/**
 * Runs a write test against a BigQuery table to check that a set of SchemaUpdateOptions takes
 * effect.
 *
 * <p>Writes one row through {@code BigQueryIO.writeTableRows()} with the given parameters, runs
 * the given query, and finally compares the query output with the expected result.
 *
 * @param schemaUpdateOptions The SchemaUpdateOption set to use
 * @param tableName The table to write to
 * @param schema The schema to use for the table
 * @param rowToInsert The row to insert
 * @param testQuery A testing SQL query to run after writing the row
 * @param expectedResult The expected result of the query as a nested list of column values (one
 *     list per result row)
 */
private void runWriteTest(Set<SchemaUpdateOption> schemaUpdateOptions, String tableName, TableSchema schema, TableRow rowToInsert, String testQuery, List<List<String>> expectedResult) throws Exception {
  Options pipelineOptions = TestPipeline.testingPipelineOptions().as(Options.class);
  String tempLocation = pipelineOptions.getTempRoot() + "/bq_it_temp";
  pipelineOptions.setTempLocation(tempLocation);

  Pipeline pipeline = Pipeline.create(pipelineOptions);
  Create.Values<TableRow> singleRow = Create.<TableRow>of(rowToInsert);

  // Destination in "project:dataset.table" form.
  String tableSpec =
      String.format(
          "%s:%s.%s", pipelineOptions.getProject(), BIG_QUERY_DATASET_ID, tableName);
  Write<TableRow> write =
      BigQueryIO.writeTableRows()
          .to(tableSpec)
          .withSchema(schema)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
          .withSchemaUpdateOptions(schemaUpdateOptions);

  pipeline.apply(singleRow).apply(write);
  pipeline.run().waitUntilFinish();

  QueryResponse queryResponse = BQ_CLIENT.queryWithRetries(testQuery, project);

  // Flatten every result row into the string form of its cell values.
  List<List<String>> actualRows =
      queryResponse.getRows().stream()
          .map(
              resultRow ->
                  resultRow.getF().stream()
                      .map(resultCell -> resultCell.getV().toString())
                      .collect(Collectors.toList()))
          .collect(Collectors.toList());

  assertEquals(expectedResult, actualRows);
}
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption in the Apache Beam project.
Class BigQuerySchemaUpdateOptionsIT, method testAllowFieldRelaxation.
/**
 * Writes a row that only sets {@code optional_field}, using a schema that declares just that
 * field, with {@code ALLOW_FIELD_RELAXATION} enabled, then checks that querying the table returns
 * the written value.
 */
@Test
public void testAllowFieldRelaxation() throws Exception {
  // Obtain the destination table name first; everything below only builds local values.
  String tableName = makeTestTable();

  String optionalFieldValue = "hellooo";

  TableFieldSchema optionalField =
      new TableFieldSchema().setName("optional_field").setType("STRING");
  TableSchema relaxedSchema = new TableSchema().setFields(ImmutableList.of(optionalField));

  TableRow row = new TableRow().set("optional_field", optionalFieldValue);

  String query =
      String.format("SELECT optional_field FROM [%s.%s];", BIG_QUERY_DATASET_ID, tableName);

  // A single result row with a single column.
  List<List<String>> expectedRows = Arrays.asList(Arrays.asList(optionalFieldValue));

  Set<SchemaUpdateOption> updateOptions =
      EnumSet.of(BigQueryIO.Write.SchemaUpdateOption.ALLOW_FIELD_RELAXATION);

  runWriteTest(updateOptions, tableName, relaxedSchema, row, query, expectedRows);
}
Aggregations