Use of bio.terra.common.PdaoLoadStatistics in project jade-data-repo by DataBiosphere.
The doStep method of the class IngestInsertIntoDatasetTableStep.
@Override
public StepResult doStep(FlightContext context) throws InterruptedException {
    Dataset dataset = IngestUtils.getDataset(context, datasetService);
    DatasetTable targetTable = IngestUtils.getDatasetTable(context, dataset);
    String stagingTableName = IngestUtils.getStagingTableName(context);
    IngestRequestModel ingestRequest = IngestUtils.getIngestRequestModel(context);
    PdaoLoadStatistics loadStatistics = IngestUtils.getIngestStatistics(context);

    // Build the ingest response from the request plus the load statistics that the
    // earlier load step stashed in the working map.
    IngestResponseModel ingestResponse =
        new IngestResponseModel()
            .dataset(dataset.getName())
            .datasetId(dataset.getId().toString())
            .table(ingestRequest.getTable())
            .path(ingestRequest.getPath())
            .loadTag(ingestRequest.getLoadTag())
            .badRowCount(loadStatistics.getBadRecords())
            .rowCount(loadStatistics.getRowCount());
    context.getWorkingMap().put(JobMapKeys.RESPONSE.getKeyName(), ingestResponse);

    bigQueryPdao.insertIntoDatasetTable(dataset, targetTable, stagingTableName);
    return StepResult.getStepResultSuccess();
}
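The snippets on this page exercise PdaoLoadStatistics only through fluent setters (badRecords, rowCount, startTime, endTime) and matching getters. As a point of reference, a minimal sketch consistent with that usage might look like the following; the actual class in bio.terra.common may carry additional fields or validation.

import java.time.Instant;

// Hypothetical sketch of a statistics holder matching the usage on this page;
// see bio.terra.common.PdaoLoadStatistics for the real definition.
public class PdaoLoadStatistics {
    private Long badRecords;
    private Long rowCount;
    private Instant startTime;
    private Instant endTime;

    public Long getBadRecords() { return badRecords; }
    public Long getRowCount() { return rowCount; }
    public Instant getStartTime() { return startTime; }
    public Instant getEndTime() { return endTime; }

    // Fluent setters so construction can be chained, as in the snippets here.
    public PdaoLoadStatistics badRecords(Long badRecords) { this.badRecords = badRecords; return this; }
    public PdaoLoadStatistics rowCount(Long rowCount) { this.rowCount = rowCount; return this; }
    public PdaoLoadStatistics startTime(Instant startTime) { this.startTime = startTime; return this; }
    public PdaoLoadStatistics endTime(Instant endTime) { this.endTime = endTime; return this; }
}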
Use of bio.terra.common.PdaoLoadStatistics in project jade-data-repo by DataBiosphere.
The doStep method of the class IngestLoadTableStep.
@Override
public StepResult doStep(FlightContext context) throws InterruptedException {
    Dataset dataset = IngestUtils.getDataset(context, datasetService);
    DatasetTable targetTable = IngestUtils.getDatasetTable(context, dataset);
    String stagingTableName = IngestUtils.getStagingTableName(context);
    IngestRequestModel ingestRequest = IngestUtils.getIngestRequestModel(context);

    PdaoLoadStatistics ingestStatistics =
        bigQueryPdao.loadToStagingTable(dataset, targetTable, stagingTableName, ingestRequest);

    // Save away the stats in the working map. We will use some of them later
    // when we make the annotations. Others are returned on the ingest response.
    IngestUtils.putIngestStatistics(context, ingestStatistics);
    return StepResult.getStepResultSuccess();
}
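putIngestStatistics and getIngestStatistics shuttle the statistics through the flight's working map so a later step (such as IngestInsertIntoDatasetTableStep above) can read them back. A plausible sketch of that pair, assuming Stairway's FlightMap and a hypothetical key constant, could be:

import bio.terra.common.PdaoLoadStatistics;
import bio.terra.stairway.FlightContext;
import bio.terra.stairway.FlightMap;

// Hypothetical sketch of the working-map plumbing; the real helpers live in IngestUtils.
public final class IngestUtilsSketch {
    private static final String INGEST_STATISTICS = "ingestStatistics"; // assumed key name

    public static void putIngestStatistics(FlightContext context, PdaoLoadStatistics statistics) {
        FlightMap workingMap = context.getWorkingMap();
        workingMap.put(INGEST_STATISTICS, statistics);
    }

    public static PdaoLoadStatistics getIngestStatistics(FlightContext context) {
        FlightMap workingMap = context.getWorkingMap();
        return workingMap.get(INGEST_STATISTICS, PdaoLoadStatistics.class);
    }
}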
Use of bio.terra.common.PdaoLoadStatistics in project jade-data-repo by DataBiosphere.
The loadToStagingTable method of the class BigQueryPdao.
// Load data from the ingest source into the staging table.
public PdaoLoadStatistics loadToStagingTable(
    Dataset dataset,
    DatasetTable targetTable,
    String stagingTableName,
    IngestRequestModel ingestRequest) throws InterruptedException {

    BigQueryProject bigQueryProject = bigQueryProjectForDataset(dataset);
    BigQuery bigQuery = bigQueryProject.getBigQuery();
    TableId tableId = TableId.of(prefixName(dataset.getName()), stagingTableName);
    Schema schema = buildSchema(targetTable, true); // Source does not have row_id

    LoadJobConfiguration.Builder loadBuilder =
        LoadJobConfiguration.builder(tableId, ingestRequest.getPath())
            .setFormatOptions(buildFormatOptions(ingestRequest))
            .setMaxBadRecords(
                (ingestRequest.getMaxBadRecords() == null)
                    ? Integer.valueOf(0)
                    : ingestRequest.getMaxBadRecords())
            .setIgnoreUnknownValues(
                (ingestRequest.isIgnoreUnknownValues() == null)
                    ? Boolean.TRUE
                    : ingestRequest.isIgnoreUnknownValues())
            // docs say this is for the target, but the CLI provides one for the source
            .setSchema(schema)
            .setCreateDisposition(JobInfo.CreateDisposition.CREATE_IF_NEEDED)
            .setWriteDisposition(JobInfo.WriteDisposition.WRITE_TRUNCATE);

    // The null marker applies only to CSV input, but it cannot be set on the CSV
    // format options, so we have to special-case here. Grumble...
    if (ingestRequest.getFormat() == IngestRequestModel.FormatEnum.CSV) {
        loadBuilder.setNullMarker(
            (ingestRequest.getCsvNullMarker() == null) ? "" : ingestRequest.getCsvNullMarker());
    }

    LoadJobConfiguration configuration = loadBuilder.build();
    Job loadJob = bigQuery.create(JobInfo.of(configuration));

    // Poll the load job ourselves so we can enforce an overall timeout and cancel it.
    Instant loadJobMaxTime = Instant.now().plusSeconds(TimeUnit.MINUTES.toSeconds(20L));
    while (!loadJob.isDone()) {
        logger.info("Waiting for staging table load job " + loadJob.getJobId().getJob() + " to complete");
        TimeUnit.SECONDS.sleep(5L);
        if (loadJobMaxTime.isBefore(Instant.now())) {
            loadJob.cancel();
            throw new PdaoException("Staging table load failed to complete within timeout - canceled");
        }
    }
    loadJob = loadJob.reload();

    BigQueryError loadJobError = loadJob.getStatus().getError();
    if (loadJobError == null) {
        logger.info("Staging table load job " + loadJob.getJobId().getJob() + " succeeded");
    } else {
        logger.info("Staging table load job " + loadJob.getJobId().getJob() + " failed: " + loadJobError);
        if ("notFound".equals(loadJobError.getReason())) {
            throw new IngestFileNotFoundException("Ingest source file not found: " + ingestRequest.getPath());
        }
        List<String> loadErrors = new ArrayList<>();
        List<BigQueryError> bigQueryErrors = loadJob.getStatus().getExecutionErrors();
        for (BigQueryError bigQueryError : bigQueryErrors) {
            loadErrors.add("BigQueryError: reason=" + bigQueryError.getReason()
                + " message=" + bigQueryError.getMessage());
        }
        throw new IngestFailureException(
            "Ingest failed with " + loadErrors.size() + " errors - see error details", loadErrors);
    }

    // Job completed successfully; translate the BigQuery load statistics into our own model.
    JobStatistics.LoadStatistics loadStatistics = loadJob.getStatistics();
    return new PdaoLoadStatistics()
        .badRecords(loadStatistics.getBadRecords())
        .rowCount(loadStatistics.getOutputRows())
        .startTime(Instant.ofEpochMilli(loadStatistics.getStartTime()))
        .endTime(Instant.ofEpochMilli(loadStatistics.getEndTime()));
}
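The hand-rolled polling loop above exists to bound the job's total runtime and cancel it on timeout. For comparison, the BigQuery Java client can perform a bounded wait itself via Job.waitFor; a sketch of that variant (note that waitFor does not cancel the job when the timeout elapses, and the exact Duration type accepted depends on the client library version):

import com.google.cloud.RetryOption;
import org.threeten.bp.Duration; // newer client versions also offer java.time overloads

// Sketch of the same bounded wait using the client library's built-in polling.
// waitFor returns the completed Job, or null if the job no longer exists.
Job completedJob = loadJob.waitFor(RetryOption.totalTimeout(Duration.ofMinutes(20)));
if (completedJob == null) {
    throw new PdaoException("Staging table load job no longer exists");
}
BigQueryError loadJobError = completedJob.getStatus().getError();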