Use of bio.terra.stairway.Step in the project terra-workspace-manager by DataBiosphere.
The doStep method of the class CreateTableCopyJobsStep.
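A Stairway Step pairs a forward operation (doStep) with a compensating operation (undoStep), and each returns a StepResult that tells the flight engine whether to continue, rerun the step, or retry after a failure. Below is a minimal sketch of that contract, assuming the standard bio.terra.stairway.Step interface; the undoStep body is a placeholder for illustration, not this class's actual undo logic.

public class CreateTableCopyJobsStep implements Step {

  @Override
  public StepResult doStep(FlightContext flightContext)
      throws InterruptedException, RetryException {
    // Forward work goes here; the full implementation is shown below.
    return StepResult.getStepResultSuccess();
  }

  @Override
  public StepResult undoStep(FlightContext flightContext) throws InterruptedException {
    // Placeholder only: the real class supplies its own compensation for submitted copy jobs.
    return StepResult.getStepResultSuccess();
  }
}

Returning new StepResult(StepStatus.STEP_RESULT_RERUN) from doStep asks Stairway to persist the working map and invoke doStep again, which is how the method below works through the source tables one at a time.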
/**
* Create one BigQuery copy job for each table in the source dataset. Keep a running map from
* table ID to job ID as new jobs are created, and only create jobs for tables that aren't in the
* map already. Rerun the step after every table is processed so that the map may be persisted
* incrementally.
*
* <p>On retry, create the jobs for any tables that don't have them. Use WRITE_TRUNCATE to avoid
* the possibility of duplicate data.
*/
@Override
public StepResult doStep(FlightContext flightContext)
    throws InterruptedException, RetryException {
  final FlightMap workingMap = flightContext.getWorkingMap();
  final CloningInstructions effectiveCloningInstructions =
      flightContext
          .getInputParameters()
          .get(ControlledResourceKeys.CLONING_INSTRUCTIONS, CloningInstructions.class);
  if (CloningInstructions.COPY_RESOURCE != effectiveCloningInstructions) {
    return StepResult.getStepResultSuccess();
  }
  // Gather inputs
  final DatasetCloneInputs sourceInputs = getSourceInputs();
  workingMap.put(ControlledResourceKeys.SOURCE_CLONE_INPUTS, sourceInputs);
  final DatasetCloneInputs destinationInputs = getDestinationInputs(flightContext);
  workingMap.put(ControlledResourceKeys.DESTINATION_CLONE_INPUTS, destinationInputs);
  final BigQueryCow bigQueryCow = crlService.createWsmSaBigQueryCow();
  // TODO(jaycarlton): remove usage of this client when it's all in CRL PF-942
  final Bigquery bigQueryClient = crlService.createWsmSaNakedBigQueryClient();
  try {
    // Get a list of all tables in the source dataset
    final TableList sourceTables =
        bigQueryCow
            .tables()
            .list(sourceInputs.getProjectId(), sourceInputs.getDatasetName())
            .execute();
    // Start a copy job for each source table
    final Map<String, String> tableToJobId =
        Optional.ofNullable(
                workingMap.get(
                    ControlledResourceKeys.TABLE_TO_JOB_ID_MAP,
                    new TypeReference<Map<String, String>>() {}))
            .orElseGet(HashMap::new);
    final List<Tables> tables =
        Optional.ofNullable(sourceTables.getTables()).orElse(Collections.emptyList());
    // Find the first table whose ID isn't a key in the map.
    final Optional<Tables> tableMaybe =
        tables.stream()
            .filter(t -> null != t.getId() && !tableToJobId.containsKey(t.getId()))
            .findFirst();
    if (tableMaybe.isPresent()) {
      final Tables table = tableMaybe.get();
      checkStreamingBuffer(sourceInputs, bigQueryCow, table);
      final Job inputJob = buildTableCopyJob(sourceInputs, destinationInputs, table);
      // bill the job to the destination project
      final Job submittedJob =
          bigQueryClient.jobs().insert(destinationInputs.getProjectId(), inputJob).execute();
      // Update the map, which will be persisted
      tableToJobId.put(table.getId(), submittedJob.getId());
      workingMap.put(ControlledResourceKeys.TABLE_TO_JOB_ID_MAP, tableToJobId);
      return new StepResult(StepStatus.STEP_RESULT_RERUN);
    } else {
      // All tables have entries in the map, so all jobs are started.
      workingMap.put(
          ControlledResourceKeys.TABLE_TO_JOB_ID_MAP, tableToJobId); // in case it's empty
      return StepResult.getStepResultSuccess();
    }
  } catch (IOException e) {
    return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY, e);
  }
}
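The javadoc above depends on WRITE_TRUNCATE so that a retried copy overwrites any partial data rather than appending duplicates. The helper buildTableCopyJob is not shown on this page; the following is a hedged sketch of what such a helper could look like using the Google BigQuery API model classes (Job, JobConfiguration, JobConfigurationTableCopy, TableReference). The accessor names on DatasetCloneInputs and the way the table name is derived from the list entry's ID are assumptions for illustration, not the project's actual code.

private Job buildTableCopyJob(
    DatasetCloneInputs sourceInputs, DatasetCloneInputs destinationInputs, Tables table) {
  // Assumption: tables().list() returns IDs of the form "project:dataset.table";
  // keep only the table name for the copy references.
  final String tableName = table.getId().substring(table.getId().lastIndexOf('.') + 1);
  final TableReference sourceTableRef =
      new TableReference()
          .setProjectId(sourceInputs.getProjectId())
          .setDatasetId(sourceInputs.getDatasetName())
          .setTableId(tableName);
  final TableReference destinationTableRef =
      new TableReference()
          .setProjectId(destinationInputs.getProjectId())
          .setDatasetId(destinationInputs.getDatasetName())
          .setTableId(tableName);
  final JobConfigurationTableCopy copyConfig =
      new JobConfigurationTableCopy()
          .setSourceTable(sourceTableRef)
          .setDestinationTable(destinationTableRef)
          // WRITE_TRUNCATE overwrites the destination table, so rerunning the step
          // after a failure cannot leave duplicate rows behind.
          .setWriteDisposition("WRITE_TRUNCATE");
  return new Job().setConfiguration(new JobConfiguration().setCopy(copyConfig));
}

Because the submitted job is inserted into the destination project (see the insert call above), the copy is billed there; a later step in the flight can use the persisted table-to-job-ID map to poll each job for completion.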