Search in sources :

Example 1 with Step

Use of `bio.terra.stairway.Step` in the project terra-workspace-manager by DataBiosphere.

The example below is the `doStep` method of the class `CreateTableCopyJobsStep`.

/**
 * Submits BigQuery table copy jobs for the source dataset, one table per invocation.
 *
 * <p>A map from table ID to submitted job ID is kept in the flight's working map. Each call looks
 * for the first listed source table that has an ID and is not yet a key in that map, submits a
 * copy job for it, records the job ID, and asks to be rerun so the updated map can be persisted
 * before the next table is handled. Once no such table remains, the step succeeds.
 *
 * <p>On retry, only tables without an entry in the map get a new job. The copy jobs are expected
 * to use WRITE_TRUNCATE so that a resubmission cannot duplicate data (the job itself is assembled
 * in {@code buildTableCopyJob}).
 *
 * @param flightContext flight context supplying the input parameters and the working map
 * @return success if the cloning instructions are not {@code COPY_RESOURCE} or every table
 *     already has a job; a rerun result right after a job is submitted; a retryable failure if
 *     an {@link IOException} is raised while talking to BigQuery
 */
@Override
public StepResult doStep(FlightContext flightContext) throws InterruptedException, RetryException {
    final FlightMap stepState = flightContext.getWorkingMap();
    final CloningInstructions requestedInstructions = flightContext.getInputParameters().get(ControlledResourceKeys.CLONING_INSTRUCTIONS, CloningInstructions.class);
    // Anything other than a full resource copy leaves this step with nothing to do.
    if (CloningInstructions.COPY_RESOURCE != requestedInstructions) {
        return StepResult.getStepResultSuccess();
    }

    // Resolve both ends of the copy and publish them to the working map.
    final DatasetCloneInputs source = getSourceInputs();
    stepState.put(ControlledResourceKeys.SOURCE_CLONE_INPUTS, source);
    final DatasetCloneInputs destination = getDestinationInputs(flightContext);
    stepState.put(ControlledResourceKeys.DESTINATION_CLONE_INPUTS, destination);

    final BigQueryCow cow = crlService.createWsmSaBigQueryCow();
    // TODO(jaycarlton):  remove usage of this client when it's all in CRL PF-942
    final Bigquery rawClient = crlService.createWsmSaNakedBigQueryClient();

    try {
        // NOTE(review): only the tables returned by this single list call are examined; confirm
        // whether additional pages need to be fetched for large datasets.
        final TableList listing = cow.tables().list(source.getProjectId(), source.getDatasetName()).execute();

        // Jobs recorded by earlier runs of this step; start from an empty map on the first run.
        final Map<String, String> storedJobIds = stepState.get(ControlledResourceKeys.TABLE_TO_JOB_ID_MAP, new TypeReference<Map<String, String>>() {
        });
        final Map<String, String> jobIdByTableId = (storedJobIds != null) ? storedJobIds : new HashMap<>();

        final List<Tables> listedTables = listing.getTables();
        final List<Tables> candidates = (listedTables != null) ? listedTables : Collections.emptyList();

        // Pick the first table that has an ID and no recorded job yet.
        Tables pending = null;
        for (Tables candidate : candidates) {
            if (null != candidate.getId() && !jobIdByTableId.containsKey(candidate.getId())) {
                pending = candidate;
                break;
            }
        }

        if (pending == null) {
            // Every table is accounted for. Store the map even if it is empty.
            stepState.put(ControlledResourceKeys.TABLE_TO_JOB_ID_MAP, jobIdByTableId);
            return StepResult.getStepResultSuccess();
        }

        checkStreamingBuffer(source, cow, pending);
        final Job copyRequest = buildTableCopyJob(source, destination, pending);
        // The job is inserted under the destination project, which is therefore billed for it.
        final Job startedJob = rawClient.jobs().insert(destination.getProjectId(), copyRequest).execute();

        // Record the new job and rerun so the map is persisted before the next table.
        jobIdByTableId.put(pending.getId(), startedJob.getId());
        stepState.put(ControlledResourceKeys.TABLE_TO_JOB_ID_MAP, jobIdByTableId);
        return new StepResult(StepStatus.STEP_RESULT_RERUN);
    } catch (IOException e) {
        return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY, e);
    }
}
Also used : TableList(com.google.api.services.bigquery.model.TableList) BigQueryCow(bio.terra.cloudres.google.bigquery.BigQueryCow) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) Tables(com.google.api.services.bigquery.model.TableList.Tables) StepResult(bio.terra.stairway.StepResult) Step(bio.terra.stairway.Step) RetryException(bio.terra.stairway.exception.RetryException) Duration(java.time.Duration) Map(java.util.Map) TypeReference(com.fasterxml.jackson.core.type.TypeReference) Job(com.google.api.services.bigquery.model.Job) CrlService(bio.terra.workspace.service.crl.CrlService) TableReference(com.google.api.services.bigquery.model.TableReference) ControlledBigQueryDatasetResource(bio.terra.workspace.service.resource.controlled.cloud.gcp.bqdataset.ControlledBigQueryDatasetResource) Logger(org.slf4j.Logger) FlightMap(bio.terra.stairway.FlightMap) IOException(java.io.IOException) UUID(java.util.UUID) Instant(java.time.Instant) JobConfigurationTableCopy(com.google.api.services.bigquery.model.JobConfigurationTableCopy) Table(com.google.api.services.bigquery.model.Table) List(java.util.List) GcpCloudContextService(bio.terra.workspace.service.workspace.GcpCloudContextService) Bigquery(com.google.api.services.bigquery.Bigquery) CloningInstructions(bio.terra.workspace.service.resource.model.CloningInstructions) Optional(java.util.Optional) ControlledResourceKeys(bio.terra.workspace.service.workspace.flight.WorkspaceFlightMapKeys.ControlledResourceKeys) StepStatus(bio.terra.stairway.StepStatus) Collections(java.util.Collections) FlightContext(bio.terra.stairway.FlightContext) JobConfiguration(com.google.api.services.bigquery.model.JobConfiguration) HashMap(java.util.HashMap) Bigquery(com.google.api.services.bigquery.Bigquery) TableList(com.google.api.services.bigquery.model.TableList) IOException(java.io.IOException) BigQueryCow(bio.terra.cloudres.google.bigquery.BigQueryCow) 
CloningInstructions(bio.terra.workspace.service.resource.model.CloningInstructions) Tables(com.google.api.services.bigquery.model.TableList.Tables) FlightMap(bio.terra.stairway.FlightMap) Job(com.google.api.services.bigquery.model.Job) StepResult(bio.terra.stairway.StepResult) HashMap(java.util.HashMap) Map(java.util.Map) FlightMap(bio.terra.stairway.FlightMap)

Aggregations

BigQueryCow (bio.terra.cloudres.google.bigquery.BigQueryCow)1 FlightContext (bio.terra.stairway.FlightContext)1 FlightMap (bio.terra.stairway.FlightMap)1 Step (bio.terra.stairway.Step)1 StepResult (bio.terra.stairway.StepResult)1 StepStatus (bio.terra.stairway.StepStatus)1 RetryException (bio.terra.stairway.exception.RetryException)1 CrlService (bio.terra.workspace.service.crl.CrlService)1 ControlledBigQueryDatasetResource (bio.terra.workspace.service.resource.controlled.cloud.gcp.bqdataset.ControlledBigQueryDatasetResource)1 CloningInstructions (bio.terra.workspace.service.resource.model.CloningInstructions)1 GcpCloudContextService (bio.terra.workspace.service.workspace.GcpCloudContextService)1 ControlledResourceKeys (bio.terra.workspace.service.workspace.flight.WorkspaceFlightMapKeys.ControlledResourceKeys)1 TypeReference (com.fasterxml.jackson.core.type.TypeReference)1 Bigquery (com.google.api.services.bigquery.Bigquery)1 Job (com.google.api.services.bigquery.model.Job)1 JobConfiguration (com.google.api.services.bigquery.model.JobConfiguration)1 JobConfigurationTableCopy (com.google.api.services.bigquery.model.JobConfigurationTableCopy)1 Table (com.google.api.services.bigquery.model.Table)1 TableList (com.google.api.services.bigquery.model.TableList)1 Tables (com.google.api.services.bigquery.model.TableList.Tables)1