Search in sources :

Example 1 with SnapshotSource

use of bio.terra.service.snapshot.SnapshotSource in project jade-data-repo by DataBiosphere.

the class BigQueryPdao method matchRowIds.

/**
 * Checks which of the supplied row ids exist in one source table of a snapshot.
 *
 * <p>Renders the {@code mapValuesToRowsTemplate} query for the source dataset's table, runs it
 * in the snapshot's BigQuery project, and sorts every returned row into the result: a row whose
 * first field is null is recorded as a mismatch for its input value; any other row is recorded
 * as a match from input value to row id.
 *
 * @param snapshot snapshot used to look up the BigQuery project to query
 * @param source snapshot source whose map tables and dataset name are consulted
 * @param tableName name of the source ("from") table the row ids are expected to be in
 * @param rowIds input values handed to the query template
 * @return the matched and mismatched input values
 * @throws java.util.NoSuchElementException if {@code source} has no map table whose from-table
 *     is named {@code tableName}
 * @throws InterruptedException propagated from the project lookup / query calls
 */
public RowIdMatch matchRowIds(Snapshot snapshot, SnapshotSource source, String tableName, List<String> rowIds) throws InterruptedException {
    // One source: grab it and navigate to the relevant parts
    BigQueryProject bigQueryProject = bigQueryProjectForSnapshot(snapshot);
    // Fail with a message naming the table instead of the bare exception that an unchecked
    // Optional.get() produces; the exception type is kept the same for existing callers.
    SnapshotMapTable mapTable = source.getSnapshotMapTables().stream()
        .filter(table -> table.getFromTable().getName().equals(tableName))
        .findFirst()
        .orElseThrow(() -> new java.util.NoSuchElementException(
            "No snapshot map table found for source table '" + tableName + "'"));
    // create a column to point to the row id column in the source table to check that passed row ids exist in it
    Column rowIdColumn = new Column().table(mapTable.getFromTable()).name(PDAO_ROW_ID_COLUMN);
    ST sqlTemplate = new ST(mapValuesToRowsTemplate);
    sqlTemplate.add("project", bigQueryProject.getProjectId());
    sqlTemplate.add("dataset", prefixName(source.getDataset().getName()));
    sqlTemplate.add("table", tableName);
    sqlTemplate.add("column", rowIdColumn.getName());
    sqlTemplate.add("inputVals", rowIds);
    // Execute the query building the row id match structure that tracks the matching
    // ids and the mismatched ids
    RowIdMatch rowIdMatch = new RowIdMatch();
    String sql = sqlTemplate.render();
    logger.debug("mapValuesToRows sql: {}", sql);
    TableResult result = bigQueryProject.query(sql);
    for (FieldValueList row : result.iterateAll()) {
        // Fields are read by position: 0 is the row id (null when nothing matched), 1 is the input value.
        // Test getting these by name
        FieldValue rowId = row.get(0);
        String inputValue = row.get(1).getStringValue();
        if (rowId.isNull()) {
            rowIdMatch.addMismatch(inputValue);
            logger.debug("rowId=<NULL>  inVal={}", inputValue);
        } else {
            String matchedRowId = rowId.getStringValue();
            rowIdMatch.addMatch(inputValue, matchedRowId);
            logger.debug("rowId={}  inVal={}", matchedRowId, inputValue);
        }
    }
    return rowIdMatch;
}
Also used : FieldValue(com.google.cloud.bigquery.FieldValue) IngestFileNotFoundException(bio.terra.service.dataset.exception.IngestFileNotFoundException) DatasetService(bio.terra.service.dataset.DatasetService) BigQueryError(com.google.cloud.bigquery.BigQueryError) ViewDefinition(com.google.cloud.bigquery.ViewDefinition) PrimaryDataAccess(bio.terra.common.PrimaryDataAccess) SnapshotDataProject(bio.terra.service.snapshot.SnapshotDataProject) PDAO_EXTERNAL_TABLE_PREFIX(bio.terra.common.PdaoConstant.PDAO_EXTERNAL_TABLE_PREFIX) TableId(com.google.cloud.bigquery.TableId) LoggerFactory(org.slf4j.LoggerFactory) Autowired(org.springframework.beans.factory.annotation.Autowired) StringUtils(org.apache.commons.lang3.StringUtils) PDAO_TEMP_TABLE(bio.terra.common.PdaoConstant.PDAO_TEMP_TABLE) DatasetTable(bio.terra.service.dataset.DatasetTable) BigQuery(com.google.cloud.bigquery.BigQuery) FieldValueList(com.google.cloud.bigquery.FieldValueList) Schema(com.google.cloud.bigquery.Schema) Map(java.util.Map) Table(bio.terra.common.Table) TableResult(com.google.cloud.bigquery.TableResult) PdaoException(bio.terra.common.exception.PdaoException) DataLocationService(bio.terra.service.resourcemanagement.DataLocationService) MismatchedRowIdException(bio.terra.service.tabulardata.exception.MismatchedRowIdException) Field(com.google.cloud.bigquery.Field) DataDeletionTableModel(bio.terra.model.DataDeletionTableModel) LoadJobConfiguration(com.google.cloud.bigquery.LoadJobConfiguration) PDAO_PREFIX(bio.terra.common.PdaoConstant.PDAO_PREFIX) SnapshotRequestRowIdTableModel(bio.terra.model.SnapshotRequestRowIdTableModel) Collection(java.util.Collection) QueryJobConfiguration(com.google.cloud.bigquery.QueryJobConfiguration) SnapshotMapColumn(bio.terra.service.snapshot.SnapshotMapColumn) Instant(java.time.Instant) SnapshotRequestContentsModel(bio.terra.model.SnapshotRequestContentsModel) Collectors(java.util.stream.Collectors) 
CorruptMetadataException(bio.terra.service.snapshot.exception.CorruptMetadataException) MismatchedValueException(bio.terra.service.snapshot.exception.MismatchedValueException) List(java.util.List) AssetSpecification(bio.terra.service.dataset.AssetSpecification) ST(org.stringtemplate.v4.ST) PDAO_LOAD_HISTORY_TABLE(bio.terra.common.PdaoConstant.PDAO_LOAD_HISTORY_TABLE) BadExternalFileException(bio.terra.service.tabulardata.exception.BadExternalFileException) ExternalTableDefinition(com.google.cloud.bigquery.ExternalTableDefinition) Optional(java.util.Optional) PDAO_ROW_ID_COLUMN(bio.terra.common.PdaoConstant.PDAO_ROW_ID_COLUMN) PDAO_ROW_ID_TABLE(bio.terra.common.PdaoConstant.PDAO_ROW_ID_TABLE) FormatOptions(com.google.cloud.bigquery.FormatOptions) HashMap(java.util.HashMap) Column(bio.terra.common.Column) RowIdMatch(bio.terra.service.snapshot.RowIdMatch) CsvOptions(com.google.cloud.bigquery.CsvOptions) ArrayList(java.util.ArrayList) PDAO_TABLE_ID_COLUMN(bio.terra.common.PdaoConstant.PDAO_TABLE_ID_COLUMN) AssetTable(bio.terra.service.dataset.AssetTable) Snapshot(bio.terra.service.snapshot.Snapshot) Job(com.google.cloud.bigquery.Job) InvalidQueryException(bio.terra.grammar.exception.InvalidQueryException) PdaoLoadStatistics(bio.terra.common.PdaoLoadStatistics) DatasetDataProject(bio.terra.service.dataset.DatasetDataProject) LegacySQLTypeName(com.google.cloud.bigquery.LegacySQLTypeName) Logger(org.slf4j.Logger) JobInfo(com.google.cloud.bigquery.JobInfo) Acl(com.google.cloud.bigquery.Acl) SnapshotTable(bio.terra.service.snapshot.SnapshotTable) Profile(org.springframework.context.annotation.Profile) JobStatistics(com.google.cloud.bigquery.JobStatistics) PDAO_LOAD_HISTORY_STAGING_TABLE_PREFIX(bio.terra.common.PdaoConstant.PDAO_LOAD_HISTORY_STAGING_TABLE_PREFIX) TimeUnit(java.util.concurrent.TimeUnit) BulkLoadHistoryModel(bio.terra.model.BulkLoadHistoryModel) Component(org.springframework.stereotype.Component) IngestRequestModel(bio.terra.model.IngestRequestModel) 
IngestFailureException(bio.terra.service.dataset.exception.IngestFailureException) ApplicationConfiguration(bio.terra.app.configuration.ApplicationConfiguration) SnapshotRequestRowIdModel(bio.terra.model.SnapshotRequestRowIdModel) PdaoConstant(bio.terra.common.PdaoConstant) BigQueryPartitionConfigV1(bio.terra.service.dataset.BigQueryPartitionConfigV1) TableInfo(com.google.cloud.bigquery.TableInfo) Dataset(bio.terra.service.dataset.Dataset) SnapshotSource(bio.terra.service.snapshot.SnapshotSource) Collections(java.util.Collections) SnapshotMapTable(bio.terra.service.snapshot.SnapshotMapTable) ST(org.stringtemplate.v4.ST) TableResult(com.google.cloud.bigquery.TableResult) SnapshotMapTable(bio.terra.service.snapshot.SnapshotMapTable) RowIdMatch(bio.terra.service.snapshot.RowIdMatch) SnapshotMapColumn(bio.terra.service.snapshot.SnapshotMapColumn) Column(bio.terra.common.Column) FieldValueList(com.google.cloud.bigquery.FieldValueList) FieldValue(com.google.cloud.bigquery.FieldValue)

Example 2 with SnapshotSource

use of bio.terra.service.snapshot.SnapshotSource in project jade-data-repo by DataBiosphere.

the class DeleteSnapshotPrimaryDataStep method doStep.

/**
 * Deletes the primary data of the snapshot identified by {@code snapshotId}: the BigQuery
 * snapshot, the snapshot file dependencies recorded for each source dataset, and the
 * snapshot's files.
 *
 * <p>A missing snapshot or dataset is not treated as an error; the step still succeeds.
 *
 * @param context flight context (not read by this step)
 * @return always the success step result
 * @throws InterruptedException if interrupted, e.g. while sleeping on the test fault
 */
@Override
public StepResult doStep(FlightContext context) throws InterruptedException {
    try {
        holdForLockConflictFault();

        final Snapshot target = snapshotService.retrieve(snapshotId);
        bigQueryPdao.deleteSnapshot(target);

        // Drop the snapshot's file references held by each underlying dataset
        for (SnapshotSource source : target.getSnapshotSources()) {
            final Dataset sourceDataset = datasetService.retrieve(source.getDataset().getId());
            dependencyDao.deleteSnapshotFileDependencies(sourceDataset, snapshotId.toString());
        }

        fileDao.deleteFilesFromSnapshot(target);
    } catch (SnapshotNotFoundException | DatasetNotFoundException ignored) {
        // Snapshot or dataset not found: we assume things are already clean
    }
    return StepResult.getStepResultSuccess();
}

/**
 * Test hook used by SnapshotConnectedTest > testOverlappingDeletes: when the STOP fault is
 * enabled, polls every five seconds until the CONTINUE fault is enabled as well.
 */
private void holdForLockConflictFault() throws InterruptedException {
    if (!configService.testInsertFault(ConfigEnum.SNAPSHOT_DELETE_LOCK_CONFLICT_STOP_FAULT)) {
        return;
    }
    logger.info("SNAPSHOT_DELETE_LOCK_CONFLICT_STOP_FAULT");
    while (!configService.testInsertFault(ConfigEnum.SNAPSHOT_DELETE_LOCK_CONFLICT_CONTINUE_FAULT)) {
        logger.info("Sleeping for CONTINUE FAULT");
        TimeUnit.SECONDS.sleep(5);
    }
    logger.info("SNAPSHOT_DELETE_LOCK_CONFLICT_CONTINUE_FAULT");
}
Also used : Snapshot(bio.terra.service.snapshot.Snapshot) Dataset(bio.terra.service.dataset.Dataset) SnapshotNotFoundException(bio.terra.service.snapshot.exception.SnapshotNotFoundException) SnapshotSource(bio.terra.service.snapshot.SnapshotSource) DatasetNotFoundException(bio.terra.service.dataset.exception.DatasetNotFoundException)

Example 3 with SnapshotSource

use of bio.terra.service.snapshot.SnapshotSource in project jade-data-repo by DataBiosphere.

the class CreateSnapshotPrimaryDataRowIdsStep method doStep.

/**
 * Validates the row ids supplied in the snapshot request against the snapshot's source and,
 * when every id matches, creates the snapshot from the provided ids.
 *
 * <p>Tables with an empty row id list are skipped. On the first table with unmatched ids the
 * step records a BAD_REQUEST error response on the flight and returns a fatal failure.
 *
 * @param context flight context that receives the error response on a mismatch
 * @return success, or a fatal failure wrapping a {@code MismatchedValueException}
 * @throws InterruptedException propagated from the data access calls
 */
@Override
public StepResult doStep(FlightContext context) throws InterruptedException {
    // TODO: this assumes single-dataset snapshots, will need to add a loop for multiple
    final SnapshotRequestContentsModel requestContents = snapshotReq.getContents().get(0);
    final Snapshot snapshot = snapshotDao.retrieveSnapshotByName(snapshotReq.getName());
    final SnapshotSource firstSource = snapshot.getSnapshotSources().get(0);
    final SnapshotRequestRowIdModel rowIdSpec = requestContents.getRowIdSpec();

    // Every requested table must have all of its row ids present in the source
    for (SnapshotRequestRowIdTableModel requestedTable : rowIdSpec.getTables()) {
        final List<String> requestedIds = requestedTable.getRowIds();
        if (requestedIds.isEmpty()) {
            continue;
        }

        final RowIdMatch match =
            bigQueryPdao.matchRowIds(snapshot, firstSource, requestedTable.getTableName(), requestedIds);
        if (match.getUnmatchedInputValues().isEmpty()) {
            continue;
        }

        final String message = String.format(
            "Mismatched row ids: '%s'", String.join("', '", match.getUnmatchedInputValues()));
        FlightUtils.setErrorResponse(context, message, HttpStatus.BAD_REQUEST);
        return new StepResult(StepStatus.STEP_RESULT_FAILURE_FATAL, new MismatchedValueException(message));
    }

    bigQueryPdao.createSnapshotWithProvidedIds(snapshot, requestContents);
    return StepResult.getStepResultSuccess();
}
Also used : Snapshot(bio.terra.service.snapshot.Snapshot) SnapshotRequestRowIdTableModel(bio.terra.model.SnapshotRequestRowIdTableModel) RowIdMatch(bio.terra.service.snapshot.RowIdMatch) SnapshotSource(bio.terra.service.snapshot.SnapshotSource) SnapshotRequestRowIdModel(bio.terra.model.SnapshotRequestRowIdModel) SnapshotRequestContentsModel(bio.terra.model.SnapshotRequestContentsModel) MismatchedValueException(bio.terra.service.snapshot.exception.MismatchedValueException) StepResult(bio.terra.stairway.StepResult)

Example 4 with SnapshotSource

use of bio.terra.service.snapshot.SnapshotSource in project jade-data-repo by DataBiosphere.

the class SnapshotAuthzFileAclStep method undoStep.

/**
 * Undoes the ACL grant: removes the snapshot readers policy from the files the snapshot
 * references in its (single) source dataset.
 *
 * <p>A {@code StorageException} raised while removing the ACLs is logged as a warning and
 * otherwise ignored, so the undo always reports success once the lookups have completed.
 *
 * @param context flight context whose working map supplies the snapshot id and policy map
 * @return always the success step result
 * @throws InterruptedException propagated from the service / DAO calls
 */
@Override
public StepResult undoStep(FlightContext context) throws InterruptedException {
    FlightMap workingMap = context.getWorkingMap();
    UUID snapshotId = workingMap.get(SnapshotWorkingMapKeys.SNAPSHOT_ID, UUID.class);
    Snapshot snapshot = snapshotService.retrieve(snapshotId);
    Map<IamRole, String> policies = workingMap.get(SnapshotWorkingMapKeys.POLICY_MAP, Map.class);
    String readersPolicyEmail = policies.get(IamRole.READER);
    // TODO: when we support multiple datasets, we can generate more than one copy of this
    // step: one for each dataset. That is because each dataset keeps its file dependencies
    // in its own scope. For now, we know there is exactly one dataset and we take shortcuts.
    SnapshotSource snapshotSource = snapshot.getSnapshotSources().get(0);
    // Pass the dataset id straight through; converting it to a String and parsing it back
    // into a UUID added nothing.
    Dataset dataset = datasetService.retrieve(snapshotSource.getDataset().getId());
    List<String> fileIds = fireStoreDao.getDatasetSnapshotFileIds(dataset, snapshotId.toString());
    try {
        gcsPdao.removeAclOnFiles(dataset, fileIds, readersPolicyEmail);
    } catch (StorageException ex) {
        // We don't let the exception stop us from continuing to remove the rest of the snapshot parts.
        // TODO: change this to whatever our alert-a-human log message is.
        logger.warn("NEEDS CLEANUP: Failed to remove snapshot reader ACLs from files", ex);
    }
    return StepResult.getStepResultSuccess();
}
Also used : Snapshot(bio.terra.service.snapshot.Snapshot) Dataset(bio.terra.service.dataset.Dataset) SnapshotSource(bio.terra.service.snapshot.SnapshotSource) IamRole(bio.terra.service.iam.IamRole) FlightMap(bio.terra.stairway.FlightMap) UUID(java.util.UUID) StorageException(com.google.cloud.storage.StorageException)

Example 5 with SnapshotSource

use of bio.terra.service.snapshot.SnapshotSource in project jade-data-repo by DataBiosphere.

the class SnapshotAuthzFileAclStep method doStep.

/**
 * Grants the snapshot readers policy access to the files the snapshot references in its
 * (single) source dataset.
 *
 * <p>A {@code StorageException} with code 400 and reason {@code "badRequest"} is treated as a
 * possible ACL propagation delay and reported as a retryable failure. Any other
 * {@code StorageException} is rethrown, so a failed grant is never reported as success.
 *
 * @param context flight context whose working map supplies the snapshot id and policy map
 * @return success, or a retry result wrapping the suspected propagation error
 * @throws InterruptedException propagated from the service / DAO calls
 */
@Override
public StepResult doStep(FlightContext context) throws InterruptedException {
    FlightMap workingMap = context.getWorkingMap();
    UUID snapshotId = workingMap.get(SnapshotWorkingMapKeys.SNAPSHOT_ID, UUID.class);
    Snapshot snapshot = snapshotService.retrieve(snapshotId);
    Map<IamRole, String> policies = workingMap.get(SnapshotWorkingMapKeys.POLICY_MAP, Map.class);
    String readersPolicyEmail = policies.get(IamRole.READER);
    // TODO: when we support multiple datasets, we can generate more than one copy of this
    // step: one for each dataset. That is because each dataset keeps its file dependencies
    // in its own scope. For now, we know there is exactly one dataset and we take shortcuts.
    SnapshotSource snapshotSource = snapshot.getSnapshotSources().get(0);
    // Pass the dataset id straight through; converting it to a String and parsing it back
    // into a UUID added nothing.
    Dataset dataset = datasetService.retrieve(snapshotSource.getDataset().getId());
    List<String> fileIds = fireStoreDao.getDatasetSnapshotFileIds(dataset, snapshotId.toString());
    try {
        // Test hook: simulate the failure shape that is handled as retryable below.
        if (configService.testInsertFault(SNAPSHOT_GRANT_FILE_ACCESS_FAULT)) {
            throw new StorageException(400, "Fake IAM failure", "badRequest", null);
        }
        gcsPdao.setAclOnFiles(dataset, fileIds, readersPolicyEmail);
    } catch (StorageException ex) {
        // A 400 with reason "badRequest" may be an ACL propagation error: log a lot and retry on that.
        if (ex.getCode() == 400 && StringUtils.equals(ex.getReason(), "badRequest")) {
            logger.info("Maybe caught an ACL propagation error: {} reason: {}", ex.getMessage(), ex.getReason(), ex);
            return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY, ex);
        }
        // Anything else is a real failure; swallowing it would report success without the ACLs set.
        throw ex;
    }
    return StepResult.getStepResultSuccess();
}
Also used : Snapshot(bio.terra.service.snapshot.Snapshot) Dataset(bio.terra.service.dataset.Dataset) SnapshotSource(bio.terra.service.snapshot.SnapshotSource) IamRole(bio.terra.service.iam.IamRole) FlightMap(bio.terra.stairway.FlightMap) UUID(java.util.UUID) StepResult(bio.terra.stairway.StepResult) StorageException(com.google.cloud.storage.StorageException)

Aggregations

SnapshotSource (bio.terra.service.snapshot.SnapshotSource)11 Snapshot (bio.terra.service.snapshot.Snapshot)9 Dataset (bio.terra.service.dataset.Dataset)7 SnapshotMapTable (bio.terra.service.snapshot.SnapshotMapTable)4 MismatchedValueException (bio.terra.service.snapshot.exception.MismatchedValueException)4 StepResult (bio.terra.stairway.StepResult)4 Table (bio.terra.common.Table)3 PdaoException (bio.terra.common.exception.PdaoException)3 SnapshotRequestContentsModel (bio.terra.model.SnapshotRequestContentsModel)3 SnapshotRequestRowIdModel (bio.terra.model.SnapshotRequestRowIdModel)3 SnapshotRequestRowIdTableModel (bio.terra.model.SnapshotRequestRowIdTableModel)3 AssetTable (bio.terra.service.dataset.AssetTable)3 RowIdMatch (bio.terra.service.snapshot.RowIdMatch)3 AssetSpecification (bio.terra.service.dataset.AssetSpecification)2 DatasetService (bio.terra.service.dataset.DatasetService)2 DatasetTable (bio.terra.service.dataset.DatasetTable)2 IamRole (bio.terra.service.iam.IamRole)2 SnapshotMapColumn (bio.terra.service.snapshot.SnapshotMapColumn)2 SnapshotTable (bio.terra.service.snapshot.SnapshotTable)2 FlightMap (bio.terra.stairway.FlightMap)2