Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService in project beam by apache.
From the class CreateTables, the method possibleCreateTable:
private void possibleCreateTable(
    BigQueryOptions options, TableDestination tableDestination, TableSchema tableSchema)
    throws InterruptedException, IOException {
  String tableSpec = tableDestination.getTableSpec();
  TableReference tableReference = tableDestination.getTableReference();
  String tableDescription = tableDestination.getTableDescription();
  if (createDisposition != CreateDisposition.CREATE_NEVER && !createdTables.contains(tableSpec)) {
    synchronized (createdTables) {
      // Another thread may have succeeded in creating the table in the meanwhile, so
      // check again. This check isn't needed for correctness, but we add it to prevent
      // every thread from attempting a create and overwhelming our BigQuery quota.
      DatasetService datasetService = bqServices.getDatasetService(options);
      if (!createdTables.contains(tableSpec)) {
        if (datasetService.getTable(tableReference) == null) {
          datasetService.createTable(
              new Table()
                  .setTableReference(tableReference)
                  .setSchema(tableSchema)
                  .setDescription(tableDescription));
        }
        createdTables.add(tableSpec);
      }
    }
  }
}
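The guarded create above is a double-checked test on the shared createdTables set: a cheap unsynchronized check first, then a re-check under the lock before issuing the RPC. A minimal standalone sketch of the same pattern, with hypothetical names:

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

// Minimal sketch of the double-checked pattern used above; names are hypothetical.
class OncePerKey {
  private final Set<String> created = Collections.synchronizedSet(new HashSet<>());

  void ensureCreated(String key, Runnable createFn) {
    if (created.contains(key)) {
      return; // fast path: skip the lock entirely for keys already handled
    }
    synchronized (created) {
      // Re-check under the lock: another thread may have created it meanwhile.
      if (!created.contains(key)) {
        createFn.run();
        created.add(key);
      }
    }
  }
}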
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService in project beam by apache.
From the class BigQueryQueryHelper, the method executeQuery:
public static TableReference executeQuery(
    BigQueryServices bqServices,
    BigQueryOptions options,
    AtomicReference<JobStatistics> dryRunJobStats,
    String stepUuid,
    String query,
    Boolean flattenResults,
    Boolean useLegacySql,
    QueryPriority priority,
    @Nullable String location,
    @Nullable String queryTempDatasetId,
    @Nullable String kmsKey)
    throws InterruptedException, IOException {
  // Step 1: Find the effective location of the query.
  String effectiveLocation = location;
  DatasetService tableService = bqServices.getDatasetService(options);
  if (effectiveLocation == null) {
    List<TableReference> referencedTables =
        dryRunQueryIfNeeded(
                bqServices, options, dryRunJobStats, query, flattenResults, useLegacySql, location)
            .getQuery()
            .getReferencedTables();
    if (referencedTables != null && !referencedTables.isEmpty()) {
      TableReference referencedTable = referencedTables.get(0);
      effectiveLocation =
          tableService
              .getDataset(referencedTable.getProjectId(), referencedTable.getDatasetId())
              .getLocation();
    }
  }
  // Step 2: Create a temporary dataset in the query location, only if the user has not
  // specified a temp dataset.
  String queryJobId =
      BigQueryResourceNaming.createJobIdPrefix(options.getJobName(), stepUuid, JobType.QUERY);
  Optional<String> queryTempDatasetOpt = Optional.ofNullable(queryTempDatasetId);
  TableReference queryResultTable =
      createTempTableReference(
          options.getBigQueryProject() == null
              ? options.getProject()
              : options.getBigQueryProject(),
          queryJobId,
          queryTempDatasetOpt);
  boolean beamToCreateTempDataset = !queryTempDatasetOpt.isPresent();
  // Create the dataset only if it has not been set by the user.
  if (beamToCreateTempDataset) {
    LOG.info("Creating temporary dataset {} for query results", queryResultTable.getDatasetId());
    tableService.createDataset(
        queryResultTable.getProjectId(),
        queryResultTable.getDatasetId(),
        effectiveLocation,
        "Temporary tables for query results of job " + options.getJobName(),
        TimeUnit.DAYS.toMillis(1));
  } else {
    // If the user specified a temp dataset, check that the destination table does not exist.
    Table destTable = tableService.getTable(queryResultTable);
    checkArgument(
        destTable == null,
        "Refusing to write on existing table %s in the specified temp dataset %s",
        queryResultTable.getTableId(),
        queryResultTable.getDatasetId());
  }
  // Step 3: Execute the query. Generate a transient (random) query job ID, because this code may
  // be retried after the temporary dataset and table have been deleted by a previous attempt --
  // in that case, we want to regenerate the temporary dataset and table, and we'll need a fresh
  // query ID to do that.
  LOG.info(
      "Exporting query results into temporary table {} using job {}",
      queryResultTable,
      queryJobId);
  JobReference jobReference =
      new JobReference()
          .setProjectId(
              options.getBigQueryProject() == null
                  ? options.getProject()
                  : options.getBigQueryProject())
          .setLocation(effectiveLocation)
          .setJobId(queryJobId);
  JobConfigurationQuery queryConfiguration =
      createBasicQueryConfig(query, flattenResults, useLegacySql)
          .setAllowLargeResults(true)
          .setDestinationTable(queryResultTable)
          .setCreateDisposition("CREATE_IF_NEEDED")
          .setWriteDisposition("WRITE_TRUNCATE")
          .setPriority(priority.name());
  if (kmsKey != null) {
    queryConfiguration.setDestinationEncryptionConfiguration(
        new EncryptionConfiguration().setKmsKeyName(kmsKey));
  }
  JobService jobService = bqServices.getJobService(options);
  jobService.startQueryJob(jobReference, queryConfiguration);
  Job job = jobService.pollJob(jobReference, JOB_POLL_MAX_RETRIES);
  if (BigQueryHelpers.parseStatus(job) != Status.SUCCEEDED) {
    throw new IOException(
        String.format(
            "Query job %s failed, status: %s",
            queryJobId, BigQueryHelpers.statusToPrettyString(job.getStatus())));
  }
  LOG.info("Query job {} completed", queryJobId);
  return queryResultTable;
}
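For orientation, here is a hypothetical call site, assuming bqServices, options, and stepUuid are already configured; every argument value below is illustrative, not taken from the Beam source:

// Hypothetical invocation of executeQuery; values are illustrative only.
TableReference results =
    BigQueryQueryHelper.executeQuery(
        bqServices,                     // BigQueryServices implementation
        options,                        // pipeline's BigQueryOptions
        new AtomicReference<>(),        // cache for dry-run job statistics
        stepUuid,                       // unique ID for this read step
        "SELECT word FROM `bigquery-public-data.samples.shakespeare`",
        /* flattenResults */ false,
        /* useLegacySql */ false,
        QueryPriority.BATCH,
        /* location */ null,            // inferred from the referenced tables
        /* queryTempDatasetId */ null,  // Beam creates a temporary dataset
        /* kmsKey */ null);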
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService in project beam by apache.
From the class StorageApiFinalizeWritesDoFn, the method process:
@ProcessElement
@SuppressWarnings({"nullness"})
public void process(PipelineOptions pipelineOptions, @Element KV<String, String> element)
    throws Exception {
  String tableId = element.getKey();
  String streamId = element.getValue();
  DatasetService datasetService = getDatasetService(pipelineOptions);
  RetryManager<FinalizeWriteStreamResponse, Context<FinalizeWriteStreamResponse>> retryManager =
      new RetryManager<>(Duration.standardSeconds(1), Duration.standardMinutes(1), 3);
  retryManager.addOperation(
      c -> {
        finalizeOperationsSent.inc();
        return datasetService.finalizeWriteStream(streamId);
      },
      contexts -> {
        LOG.error(
            "Finalize of stream "
                + streamId
                + " failed with "
                + Iterables.getFirst(contexts, null).getError());
        finalizeOperationsFailed.inc();
        return RetryType.RETRY_ALL_OPERATIONS;
      },
      c -> {
        LOG.info("Finalize of stream " + streamId + " finished with " + c.getResult());
        finalizeOperationsSucceeded.inc();
        commitStreams.computeIfAbsent(tableId, d -> Lists.newArrayList()).add(streamId);
      },
      new Context<>());
  retryManager.run(true);
}
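The three lambdas passed to addOperation follow RetryManager's contract: a run callback that issues the RPC, an error callback that returns a RetryType, and a success callback. Stripped of the metrics and logging, the skeleton looks like this (stream ID and service are assumed to be in scope):

// Skeleton of the RetryManager pattern above, minus metrics and logging.
RetryManager<FinalizeWriteStreamResponse, Context<FinalizeWriteStreamResponse>> rm =
    new RetryManager<>(
        Duration.standardSeconds(1),  // initial backoff
        Duration.standardMinutes(1),  // backoff cap
        3);                           // maximum attempts
rm.addOperation(
    c -> datasetService.finalizeWriteStream(streamId),  // run: issue the RPC
    failures -> RetryType.RETRY_ALL_OPERATIONS,         // onError: choose retry policy
    c -> { /* onSuccess: record c.getResult() */ },     // onSuccess
    new Context<>());
rm.run(true);  // mirrors the usage above: run and wait for completion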
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService in project beam by apache.
From the class StorageApiFlushAndFinalizeDoFn, the method process:
@SuppressWarnings({"nullness"})
@ProcessElement
public void process(PipelineOptions pipelineOptions, @Element KV<String, Operation> element)
    throws Exception {
  final String streamId = element.getKey();
  final Operation operation = element.getValue();
  final DatasetService datasetService = getDatasetService(pipelineOptions);
  // Flush the stream. If the flush offset is < 0, that means we only need to finalize.
  long offset = operation.flushOffset;
  if (offset >= 0) {
    Instant now = Instant.now();
    RetryManager<FlushRowsResponse, Context<FlushRowsResponse>> retryManager =
        new RetryManager<>(Duration.standardSeconds(1), Duration.standardMinutes(1), 3);
    retryManager.addOperation(
        // runOperation
        c -> {
          try {
            flushOperationsSent.inc();
            return datasetService.flush(streamId, offset);
          } catch (Exception e) {
            throw new RuntimeException(e);
          }
        },
        // onError
        contexts -> {
          Throwable error = Iterables.getFirst(contexts, null).getError();
          LOG.warn(
              "Flush of stream " + streamId + " to offset " + offset + " failed with " + error);
          flushOperationsFailed.inc();
          if (error instanceof ApiException) {
            Code statusCode = ((ApiException) error).getStatusCode().getCode();
            if (statusCode.equals(Code.ALREADY_EXISTS)) {
              flushOperationsAlreadyExists.inc();
              // Implies that we have already flushed up to this point, so don't retry.
              return RetryType.DONT_RETRY;
            }
            if (statusCode.equals(Code.INVALID_ARGUMENT)) {
              flushOperationsInvalidArgument.inc();
              // TODO: Storage API should provide a more-specific way of identifying this failure.
              return RetryType.DONT_RETRY;
            }
          }
          return RetryType.RETRY_ALL_OPERATIONS;
        },
        // onSuccess
        c -> {
          flushOperationsSucceeded.inc();
        },
        new Context<>());
    retryManager.run(true);
    java.time.Duration timeElapsed = java.time.Duration.between(now, Instant.now());
    flushLatencyDistribution.update(timeElapsed.toMillis());
  }
  // Finalize the stream if requested. Records beyond the flushed offset are left to be
  // retried on a new stream rather than flushed here, or we would end up with duplicates.
  if (operation.finalizeStream) {
    RetryManager<FinalizeWriteStreamResponse, Context<FinalizeWriteStreamResponse>> retryManager =
        new RetryManager<>(Duration.standardSeconds(1), Duration.standardMinutes(1), 3);
    retryManager.addOperation(
        c -> {
          finalizeOperationsSent.inc();
          return datasetService.finalizeWriteStream(streamId);
        },
        contexts -> {
          LOG.warn(
              "Finalize of stream "
                  + streamId
                  + " failed with "
                  + Iterables.getFirst(contexts, null).getError());
          finalizeOperationsFailed.inc();
          return RetryType.RETRY_ALL_OPERATIONS;
        },
        r -> {
          finalizeOperationsSucceeded.inc();
        },
        new Context<>());
    retryManager.run(true);
  }
}
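The onError branch is the interesting part: ALREADY_EXISTS and INVALID_ARGUMENT both indicate the offset was already flushed, so retrying would be pointless. The same classification, pulled out as a standalone helper purely for illustration (the helper name is hypothetical):

// Hypothetical helper restating the flush error classification above.
static RetryType classifyFlushError(Throwable error) {
  if (error instanceof ApiException) {
    Code code = ((ApiException) error).getStatusCode().getCode();
    // ALREADY_EXISTS: rows up to this offset were already flushed.
    // INVALID_ARGUMENT: treated the same way, pending a more specific signal
    // from the Storage API.
    if (code.equals(Code.ALREADY_EXISTS) || code.equals(Code.INVALID_ARGUMENT)) {
      return RetryType.DONT_RETRY;
    }
  }
  return RetryType.RETRY_ALL_OPERATIONS;
}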
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService in project beam by apache.
From the class WriteRename, the method finishBundle:
@FinishBundle
public void finishBundle(FinishBundleContext c) throws Exception {
  DatasetService datasetService =
      getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class));
  PendingJobManager jobManager = new PendingJobManager();
  for (PendingJobData pendingJob : pendingJobs) {
    jobManager.addPendingJob(
        pendingJob.retryJob,
        j -> {
          try {
            if (pendingJob.tableDestination.getTableDescription() != null) {
              TableReference ref = pendingJob.tableDestination.getTableReference();
              datasetService.patchTableDescription(
                  ref.clone()
                      .setTableId(BigQueryHelpers.stripPartitionDecorator(ref.getTableId())),
                  pendingJob.tableDestination.getTableDescription());
            }
            c.output(
                pendingJob.tableDestination, pendingJob.window.maxTimestamp(), pendingJob.window);
            removeTemporaryTables(datasetService, pendingJob.tempTables);
            return null;
          } catch (IOException | InterruptedException e) {
            return e;
          }
        });
  }
  jobManager.waitForDone();
}
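One subtlety above: the table reference is cloned and its partition decorator stripped before patching, so the description lands on the base table rather than on a single partition. An illustration with hypothetical values:

// Hypothetical values: "events$20240101" addresses one partition of "events".
TableReference ref =
    new TableReference()
        .setProjectId("my-project")
        .setDatasetId("my_dataset")
        .setTableId("events$20240101");
TableReference base =
    ref.clone().setTableId(BigQueryHelpers.stripPartitionDecorator(ref.getTableId()));
// base.getTableId() is now "events"; patching it updates the whole table.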