
Example 6 with DatasetService

Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService in the apache/beam project.

From the class CreateTables, method possibleCreateTable.

private void possibleCreateTable(BigQueryOptions options, TableDestination tableDestination, TableSchema tableSchema) throws InterruptedException, IOException {
    String tableSpec = tableDestination.getTableSpec();
    TableReference tableReference = tableDestination.getTableReference();
    String tableDescription = tableDestination.getTableDescription();
    if (createDisposition != CreateDisposition.CREATE_NEVER && !createdTables.contains(tableSpec)) {
        synchronized (createdTables) {
            // Another thread may have succeeded in creating the table in the meanwhile, so
            // check again. This check isn't needed for correctness, but we add it to prevent
            // every thread from attempting a create and overwhelming our BigQuery quota.
            DatasetService datasetService = bqServices.getDatasetService(options);
            if (!createdTables.contains(tableSpec)) {
                if (datasetService.getTable(tableReference) == null) {
                    datasetService.createTable(new Table().setTableReference(tableReference).setSchema(tableSchema).setDescription(tableDescription));
                }
                createdTables.add(tableSpec);
            }
        }
    }
}
Also used : TableReference(com.google.api.services.bigquery.model.TableReference) Table(com.google.api.services.bigquery.model.Table) DatasetService(org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService)
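
The pattern above is a double-check around a shared set: the unsynchronized contains() call is a fast path, and the check is repeated under the lock because another thread may have created the table in between. As the comment notes, the lock is a quota-saving measure rather than a correctness requirement. A minimal standalone sketch of the same idiom follows; the class name and the createRemoteTable call are hypothetical placeholders, not Beam API:

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

class TableCreationCache {
    // Table specs this worker has already created or verified.
    private final Set<String> createdTables = Collections.synchronizedSet(new HashSet<>());

    void ensureTable(String tableSpec) {
        // Fast path: no lock needed if the table is already known.
        if (createdTables.contains(tableSpec)) {
            return;
        }
        synchronized (createdTables) {
            // Re-check under the lock: another thread may have created the
            // table while we waited. As in the Beam code above, this avoids
            // redundant create calls rather than guaranteeing correctness.
            if (!createdTables.contains(tableSpec)) {
                createRemoteTable(tableSpec); // hypothetical remote create
                createdTables.add(tableSpec);
            }
        }
    }

    private void createRemoteTable(String tableSpec) {
        // Placeholder for a remote call such as DatasetService.createTable.
    }
}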

Example 7 with DatasetService

Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService in the apache/beam project.

From the class BigQueryQueryHelper, method executeQuery.

public static TableReference executeQuery(BigQueryServices bqServices, BigQueryOptions options, AtomicReference<JobStatistics> dryRunJobStats, String stepUuid, String query, Boolean flattenResults, Boolean useLegacySql, QueryPriority priority, @Nullable String location, @Nullable String queryTempDatasetId, @Nullable String kmsKey) throws InterruptedException, IOException {
    // Step 1: Find the effective location of the query.
    String effectiveLocation = location;
    DatasetService tableService = bqServices.getDatasetService(options);
    if (effectiveLocation == null) {
        List<TableReference> referencedTables = dryRunQueryIfNeeded(bqServices, options, dryRunJobStats, query, flattenResults, useLegacySql, location).getQuery().getReferencedTables();
        if (referencedTables != null && !referencedTables.isEmpty()) {
            TableReference referencedTable = referencedTables.get(0);
            effectiveLocation = tableService.getDataset(referencedTable.getProjectId(), referencedTable.getDatasetId()).getLocation();
        }
    }
    // Step 2: Create a temporary dataset in the query location only if the user has not specified a
    // temp dataset.
    String queryJobId = BigQueryResourceNaming.createJobIdPrefix(options.getJobName(), stepUuid, JobType.QUERY);
    Optional<String> queryTempDatasetOpt = Optional.ofNullable(queryTempDatasetId);
    TableReference queryResultTable = createTempTableReference(options.getBigQueryProject() == null ? options.getProject() : options.getBigQueryProject(), queryJobId, queryTempDatasetOpt);
    boolean beamToCreateTempDataset = !queryTempDatasetOpt.isPresent();
    // Create dataset only if it has not been set by the user
    if (beamToCreateTempDataset) {
        LOG.info("Creating temporary dataset {} for query results", queryResultTable.getDatasetId());
        tableService.createDataset(queryResultTable.getProjectId(), queryResultTable.getDatasetId(), effectiveLocation, "Temporary tables for query results of job " + options.getJobName(), TimeUnit.DAYS.toMillis(1));
    } else {
        // If the user specified a temp dataset, check that the destination table does not
        // exist
        Table destTable = tableService.getTable(queryResultTable);
        checkArgument(destTable == null, "Refusing to write on existing table %s in the specified temp dataset %s", queryResultTable.getTableId(), queryResultTable.getDatasetId());
    }
    // Step 3: Execute the query. Generate a transient (random) query job ID, because this code may
    // be retried after the temporary dataset and table have been deleted by a previous attempt --
    // in that case, we want to regenerate the temporary dataset and table, and we'll need a fresh
    // query ID to do that.
    LOG.info("Exporting query results into temporary table {} using job {}", queryResultTable, queryJobId);
    JobReference jobReference = new JobReference().setProjectId(options.getBigQueryProject() == null ? options.getProject() : options.getBigQueryProject()).setLocation(effectiveLocation).setJobId(queryJobId);
    JobConfigurationQuery queryConfiguration = createBasicQueryConfig(query, flattenResults, useLegacySql).setAllowLargeResults(true).setDestinationTable(queryResultTable).setCreateDisposition("CREATE_IF_NEEDED").setWriteDisposition("WRITE_TRUNCATE").setPriority(priority.name());
    if (kmsKey != null) {
        queryConfiguration.setDestinationEncryptionConfiguration(new EncryptionConfiguration().setKmsKeyName(kmsKey));
    }
    JobService jobService = bqServices.getJobService(options);
    jobService.startQueryJob(jobReference, queryConfiguration);
    Job job = jobService.pollJob(jobReference, JOB_POLL_MAX_RETRIES);
    if (BigQueryHelpers.parseStatus(job) != Status.SUCCEEDED) {
        throw new IOException(String.format("Query job %s failed, status: %s", queryJobId, BigQueryHelpers.statusToPrettyString(job.getStatus())));
    }
    LOG.info("Query job {} completed", queryJobId);
    return queryResultTable;
}
Also used : Table(com.google.api.services.bigquery.model.Table) JobReference(com.google.api.services.bigquery.model.JobReference) EncryptionConfiguration(com.google.api.services.bigquery.model.EncryptionConfiguration) JobConfigurationQuery(com.google.api.services.bigquery.model.JobConfigurationQuery) DatasetService(org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService) IOException(java.io.IOException) JobService(org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService) TableReference(com.google.api.services.bigquery.model.TableReference) BigQueryResourceNaming.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.createTempTableReference) Job(com.google.api.services.bigquery.model.Job)
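
Step 3 above is plain google-api-services-bigquery model code: the setter chain on JobConfigurationQuery points the query at a destination table and sets dispositions that make the job safe to re-run. A rough isolated sketch of that chain, using made-up project, dataset, and table ids:

import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.TableReference;

public class QueryConfigExample {
    public static void main(String[] args) {
        // Hypothetical destination for query results; in Beam this is the
        // temp table derived from the job id and (possibly user-supplied) dataset.
        TableReference destination = new TableReference()
                .setProjectId("my-project")      // assumed value
                .setDatasetId("temp_dataset")    // assumed value
                .setTableId("query_results");    // assumed value

        JobConfigurationQuery config = new JobConfigurationQuery()
                .setQuery("SELECT 1")
                .setUseLegacySql(false)
                .setAllowLargeResults(true)
                .setDestinationTable(destination)
                // CREATE_IF_NEEDED + WRITE_TRUNCATE make the job retryable:
                // a rerun recreates the table and overwrites partial results.
                .setCreateDisposition("CREATE_IF_NEEDED")
                .setWriteDisposition("WRITE_TRUNCATE");

        System.out.println(config);
    }
}

WRITE_TRUNCATE matters here for the reason the step 3 comment gives: a retried attempt must be able to regenerate the temporary table and overwrite anything a previous attempt left behind.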

Example 8 with DatasetService

Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService in the apache/beam project.

From the class StorageApiFinalizeWritesDoFn, method process.

@ProcessElement
@SuppressWarnings({ "nullness" })
public void process(PipelineOptions pipelineOptions, @Element KV<String, String> element) throws Exception {
    String tableId = element.getKey();
    String streamId = element.getValue();
    DatasetService datasetService = getDatasetService(pipelineOptions);
    RetryManager<FinalizeWriteStreamResponse, Context<FinalizeWriteStreamResponse>> retryManager = new RetryManager<>(Duration.standardSeconds(1), Duration.standardMinutes(1), 3);
    retryManager.addOperation(c -> {
        finalizeOperationsSent.inc();
        return datasetService.finalizeWriteStream(streamId);
    }, contexts -> {
        LOG.error("Finalize of stream " + streamId + " failed with " + Iterables.getFirst(contexts, null).getError());
        finalizeOperationsFailed.inc();
        return RetryType.RETRY_ALL_OPERATIONS;
    }, c -> {
        LOG.info("Finalize of stream " + streamId + " finished with " + c.getResult());
        finalizeOperationsSucceeded.inc();
        commitStreams.computeIfAbsent(tableId, d -> Lists.newArrayList()).add(streamId);
    }, new Context<>());
    retryManager.run(true);
}
Also used : Context(org.apache.beam.sdk.io.gcp.bigquery.RetryManager.Operation.Context) DoFn(org.apache.beam.sdk.transforms.DoFn) KV(org.apache.beam.sdk.values.KV) Logger(org.slf4j.Logger) Collection(java.util.Collection) Counter(org.apache.beam.sdk.metrics.Counter) Duration(org.joda.time.Duration) LoggerFactory(org.slf4j.LoggerFactory) Set(java.util.Set) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) BatchCommitWriteStreamsResponse(com.google.cloud.bigquery.storage.v1.BatchCommitWriteStreamsResponse) IOException(java.io.IOException) RetryType(org.apache.beam.sdk.io.gcp.bigquery.RetryManager.RetryType) Metrics(org.apache.beam.sdk.metrics.Metrics) FinalizeWriteStreamResponse(com.google.cloud.bigquery.storage.v1.FinalizeWriteStreamResponse) DatasetService(org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService) Sets(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets) StorageError(com.google.cloud.bigquery.storage.v1.StorageError) Map(java.util.Map) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) Maps(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps) StorageErrorCode(com.google.cloud.bigquery.storage.v1.StorageError.StorageErrorCode) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Nullable(javax.annotation.Nullable)
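
RetryManager is driven by three callbacks: a runOperation that issues the RPC, an onError that maps the failure to a RetryType, and an onSuccess that records the result. Beam's implementation batches operations and applies bounded exponential backoff; a deliberately simplified sketch of just that control flow (not the Beam class itself) looks like this:

import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Supplier;

class SimpleRetry {
    enum RetryType { DONT_RETRY, RETRY_ALL_OPERATIONS }

    // Runs `operation`, consulting `onError` after each failure and
    // `onSuccess` once a result is obtained. Backoff is a fixed sleep here;
    // Beam's RetryManager uses bounded exponential backoff instead.
    static <T> T runWithRetries(
            Supplier<T> operation,
            Function<Exception, RetryType> onError,
            Consumer<T> onSuccess,
            int maxAttempts) throws Exception {
        for (int attempt = 1; ; attempt++) {
            try {
                T result = operation.get();
                onSuccess.accept(result);
                return result;
            } catch (Exception e) {
                if (attempt >= maxAttempts || onError.apply(e) == RetryType.DONT_RETRY) {
                    throw e;
                }
                Thread.sleep(1000L); // fixed backoff, for the sketch only
            }
        }
    }
}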

Example 9 with DatasetService

Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService in the apache/beam project.

From the class StorageApiFlushAndFinalizeDoFn, method process.

@SuppressWarnings({ "nullness" })
@ProcessElement
public void process(PipelineOptions pipelineOptions, @Element KV<String, Operation> element) throws Exception {
    final String streamId = element.getKey();
    final Operation operation = element.getValue();
    final DatasetService datasetService = getDatasetService(pipelineOptions);
    // Flush the stream. If the flush offset < 0, that means we only need to finalize.
    long offset = operation.flushOffset;
    if (offset >= 0) {
        Instant now = Instant.now();
        RetryManager<FlushRowsResponse, Context<FlushRowsResponse>> retryManager = new RetryManager<>(Duration.standardSeconds(1), Duration.standardMinutes(1), 3);
        retryManager.addOperation(// runOperation
        c -> {
            try {
                flushOperationsSent.inc();
                return datasetService.flush(streamId, offset);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }, // onError
        contexts -> {
            Throwable error = Iterables.getFirst(contexts, null).getError();
            LOG.warn("Flush of stream " + streamId + " to offset " + offset + " failed with " + error);
            flushOperationsFailed.inc();
            if (error instanceof ApiException) {
                Code statusCode = ((ApiException) error).getStatusCode().getCode();
                if (statusCode.equals(Code.ALREADY_EXISTS)) {
                    flushOperationsAlreadyExists.inc();
                    // Implies that we have already flushed up to this point, so don't retry.
                    return RetryType.DONT_RETRY;
                }
                if (statusCode.equals(Code.INVALID_ARGUMENT)) {
                    flushOperationsInvalidArgument.inc();
                    // TODO: Storage API should provide a more-specific way of identifying this failure.
                    return RetryType.DONT_RETRY;
                }
            }
            return RetryType.RETRY_ALL_OPERATIONS;
        }, // onSuccess
        c -> {
            flushOperationsSucceeded.inc();
        }, new Context<>());
        retryManager.run(true);
        java.time.Duration timeElapsed = java.time.Duration.between(now, Instant.now());
        flushLatencyDistribution.update(timeElapsed.toMillis());
    }
    // Finalize the stream if requested. This must happen only after the flush above has completed,
    // or we would end up with duplicates.
    if (operation.finalizeStream) {
        RetryManager<FinalizeWriteStreamResponse, Context<FinalizeWriteStreamResponse>> retryManager = new RetryManager<>(Duration.standardSeconds(1), Duration.standardMinutes(1), 3);
        retryManager.addOperation(c -> {
            finalizeOperationsSent.inc();
            return datasetService.finalizeWriteStream(streamId);
        }, contexts -> {
            LOG.warn("Finalize of stream " + streamId + " failed with " + Iterables.getFirst(contexts, null).getError());
            finalizeOperationsFailed.inc();
            return RetryType.RETRY_ALL_OPERATIONS;
        }, r -> {
            finalizeOperationsSucceeded.inc();
        }, new Context<>());
        retryManager.run(true);
    }
}
Also used : Context(org.apache.beam.sdk.io.gcp.bigquery.RetryManager.Operation.Context) FinalizeWriteStreamResponse(com.google.cloud.bigquery.storage.v1.FinalizeWriteStreamResponse) Instant(java.time.Instant) DatasetService(org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService) Operation(org.apache.beam.sdk.io.gcp.bigquery.StorageApiFlushAndFinalizeDoFn.Operation) Code(com.google.api.gax.rpc.StatusCode.Code) IOException(java.io.IOException) ApiException(com.google.api.gax.rpc.ApiException) FlushRowsResponse(com.google.cloud.bigquery.storage.v1.FlushRowsResponse)
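
The onError callback above turns gRPC status codes into retry decisions: ALREADY_EXISTS indicates the stream was already flushed to the requested offset, and INVALID_ARGUMENT is likewise treated as terminal. The same classification pulled out into a standalone helper might read as follows (the RetryType enum is a local stand-in for Beam's):

import com.google.api.gax.rpc.ApiException;
import com.google.api.gax.rpc.StatusCode.Code;

class FlushErrorClassifier {
    enum RetryType { DONT_RETRY, RETRY_ALL_OPERATIONS }

    // Decide whether a failed flush should be retried. ALREADY_EXISTS means
    // the stream was already flushed to (at least) the requested offset, so
    // retrying would be redundant; INVALID_ARGUMENT is likewise terminal.
    static RetryType classify(Throwable error) {
        if (error instanceof ApiException) {
            Code code = ((ApiException) error).getStatusCode().getCode();
            if (code == Code.ALREADY_EXISTS || code == Code.INVALID_ARGUMENT) {
                return RetryType.DONT_RETRY;
            }
        }
        return RetryType.RETRY_ALL_OPERATIONS;
    }
}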

Example 10 with DatasetService

Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService in the apache/beam project.

From the class WriteRename, method finishBundle.

@FinishBundle
public void finishBundle(FinishBundleContext c) throws Exception {
    DatasetService datasetService = getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class));
    PendingJobManager jobManager = new PendingJobManager();
    for (PendingJobData pendingJob : pendingJobs) {
        jobManager.addPendingJob(pendingJob.retryJob, j -> {
            try {
                if (pendingJob.tableDestination.getTableDescription() != null) {
                    TableReference ref = pendingJob.tableDestination.getTableReference();
                    datasetService.patchTableDescription(ref.clone().setTableId(BigQueryHelpers.stripPartitionDecorator(ref.getTableId())), pendingJob.tableDestination.getTableDescription());
                }
                c.output(pendingJob.tableDestination, pendingJob.window.maxTimestamp(), pendingJob.window);
                removeTemporaryTables(datasetService, pendingJob.tempTables);
                return null;
            } catch (IOException | InterruptedException e) {
                return e;
            }
        });
    }
    jobManager.waitForDone();
}
Also used : TableReference(com.google.api.services.bigquery.model.TableReference) PendingJobManager(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.PendingJobManager) DatasetService(org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService) IOException(java.io.IOException)
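
Before patching the description, the code clones the table reference and strips the partition decorator from the table id. Assuming the decorator is BigQuery's "$" suffix for partitioned writes (e.g. events$20230101), a plausible sketch of such a helper follows; the real BigQueryHelpers.stripPartitionDecorator may differ in details:

class PartitionDecorator {
    // Remove a trailing $-decorator from a BigQuery table id, if present.
    // Assumption: decorators take the form "table$20230101" or similar.
    static String stripPartitionDecorator(String tableId) {
        int idx = tableId.indexOf('$');
        return idx == -1 ? tableId : tableId.substring(0, idx);
    }

    public static void main(String[] args) {
        System.out.println(stripPartitionDecorator("events$20230101")); // events
        System.out.println(stripPartitionDecorator("events"));          // events
    }
}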

Aggregations

DatasetService (org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService) 10
TableReference (com.google.api.services.bigquery.model.TableReference) 7
IOException (java.io.IOException) 4
Table (com.google.api.services.bigquery.model.Table) 3
Map (java.util.Map) 3
Context (org.apache.beam.sdk.io.gcp.bigquery.RetryManager.Operation.Context) 3
EncryptionConfiguration (com.google.api.services.bigquery.model.EncryptionConfiguration) 2
BatchCommitWriteStreamsResponse (com.google.cloud.bigquery.storage.v1.BatchCommitWriteStreamsResponse) 2
FinalizeWriteStreamResponse (com.google.cloud.bigquery.storage.v1.FinalizeWriteStreamResponse) 2
StorageError (com.google.cloud.bigquery.storage.v1.StorageError) 2
Collection (java.util.Collection) 2
BigQueryHelpers.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) 2
ApiException (com.google.api.gax.rpc.ApiException) 1
Code (com.google.api.gax.rpc.StatusCode.Code) 1
Job (com.google.api.services.bigquery.model.Job) 1
JobConfigurationQuery (com.google.api.services.bigquery.model.JobConfigurationQuery) 1
JobReference (com.google.api.services.bigquery.model.JobReference) 1
TableSchema (com.google.api.services.bigquery.model.TableSchema) 1
FlushRowsResponse (com.google.cloud.bigquery.storage.v1.FlushRowsResponse) 1
StorageErrorCode (com.google.cloud.bigquery.storage.v1.StorageError.StorageErrorCode) 1