Search in sources :

Example 1 with Column

use of bio.terra.common.Column in project jade-data-repo by DataBiosphere.

the class DatasetJsonConversion method tableModelToTable.

/**
 * Builds a {@link DatasetTable} from an API {@link TableModel}.
 *
 * <p>Each column model is converted and attached to the new table, the primary key
 * names are resolved to those converted columns, and a BigQuery partition
 * configuration is chosen from the model's partition mode.
 *
 * <p>A primary key name that matches no column resolves to a {@code null} entry in
 * the primary key list, because the lookup is a plain map {@code get}.
 *
 * @param tableModel the table description to convert
 * @return the populated dataset table
 */
public static DatasetTable tableModelToTable(TableModel tableModel) {
    DatasetTable result = new DatasetTable().name(tableModel.getName());

    // Convert each column exactly once, keeping both the declared order and a by-name index.
    List<Column> convertedColumns = new ArrayList<>();
    Map<String, Column> columnsByName = new HashMap<>();
    for (ColumnModel model : tableModel.getColumns()) {
        Column converted = columnModelToDatasetColumn(model).table(result);
        columnsByName.put(converted.getName(), converted);
        convertedColumns.add(converted);
    }

    // A missing primary key is treated the same as an empty one.
    List<Column> keyColumns = Optional.ofNullable(tableModel.getPrimaryKey())
        .orElse(Collections.emptyList())
        .stream()
        .map(columnsByName::get)
        .collect(Collectors.toList());
    result.primaryKey(keyColumns);

    BigQueryPartitionConfigV1 partitioning;
    switch (tableModel.getPartitionMode()) {
        case DATE: {
            String partitionColumn = tableModel.getDatePartitionOptions().getColumn();
            // The ingest-date alias selects ingest-date partitioning instead of a named date column.
            if (partitionColumn.equals(PdaoConstant.PDAO_INGEST_DATE_COLUMN_ALIAS)) {
                partitioning = BigQueryPartitionConfigV1.ingestDate();
            } else {
                partitioning = BigQueryPartitionConfigV1.date(partitionColumn);
            }
            break;
        }
        case INT: {
            IntPartitionOptionsModel intOptions = tableModel.getIntPartitionOptions();
            partitioning = BigQueryPartitionConfigV1.intRange(
                intOptions.getColumn(),
                intOptions.getMin(),
                intOptions.getMax(),
                intOptions.getInterval());
            break;
        }
        default:
            partitioning = BigQueryPartitionConfigV1.none();
            break;
    }

    return result.bigQueryPartitionConfig(partitioning).columns(convertedColumns);
}
Also used : Column(bio.terra.common.Column) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList)

Example 2 with Column

use of bio.terra.common.Column in project jade-data-repo by DataBiosphere.

the class IngestValidateRefsStep method doStep.

/**
 * Validates every file reference found in the staging table.
 *
 * <p>For each column whose type is {@code FILEREF} (case-insensitive), the ref ids are
 * read from the staging table and checked through {@code fileDao.validateRefIds}. All
 * columns are checked before failing, so a single error reports every invalid id found,
 * with the returned details capped at {@code MAX_ERROR_REF_IDS} entries.
 *
 * @param context the flight context used to look up the dataset, table and staging table name
 * @return a success result when no invalid ids were found
 * @throws InvalidFileRefException if any invalid ids were found
 * @throws InterruptedException declared by the step contract
 */
@Override
public StepResult doStep(FlightContext context) throws InterruptedException {
    Dataset dataset = IngestUtils.getDataset(context, datasetService);
    Table table = IngestUtils.getDatasetTable(context, dataset);
    String stagingTableName = IngestUtils.getStagingTableName(context);
    // For each fileref column, scan the staging table and build an array of file ids
    // Then probe the file system to validate that the file exists and is part
    // of this dataset. We check all ids and return one complete error.
    List<String> invalidRefIds = new ArrayList<>();
    for (Column column : table.getColumns()) {
        if (StringUtils.equalsIgnoreCase(column.getType(), "FILEREF")) {
            List<String> refIdArray = bigQueryPdao.getRefIds(dataset, stagingTableName, column);
            List<String> badRefIds = fileDao.validateRefIds(dataset, refIdArray);
            if (badRefIds != null) {
                invalidRefIds.addAll(badRefIds);
            }
        }
    }
    int invalidIdCount = invalidRefIds.size();
    if (invalidIdCount != 0) {
        // Return at most MAX_ERROR_REF_IDS ids; the cap is tested before adding so the
        // number of details matches the number quoted in the message.
        List<String> errorDetails = new ArrayList<>();
        for (String badId : invalidRefIds) {
            if (errorDetails.size() >= MAX_ERROR_REF_IDS) {
                break;
            }
            errorDetails.add(badId);
        }
        StringBuilder errorMessage = new StringBuilder("Invalid file ids found during ingest (");
        if (errorDetails.size() < invalidIdCount) {
            // Truncated: report how many of the total are included in the details.
            errorMessage.append(errorDetails.size()).append(" out of ");
        }
        errorMessage.append(invalidIdCount).append(" returned in details)");
        throw new InvalidFileRefException(errorMessage.toString(), errorDetails);
    }
    return StepResult.getStepResultSuccess();
}
Also used : Table(bio.terra.common.Table) Column(bio.terra.common.Column) InvalidFileRefException(bio.terra.service.dataset.exception.InvalidFileRefException) Dataset(bio.terra.service.dataset.Dataset) ArrayList(java.util.ArrayList)

Example 3 with Column

use of bio.terra.common.Column in project jade-data-repo by DataBiosphere.

the class BigQueryPdao method matchRowIds.

/**
 * For one table of a snapshot source, matches the supplied row ids against the row id
 * column of that table.
 *
 * <p>The query is rendered from {@code mapValuesToRowsTemplate}. Each result row is read
 * positionally: field 0 is the row id and field 1 is the input value. A null row id
 * records the input value as a mismatch; otherwise the pair is recorded as a match.
 *
 * @param snapshot the snapshot used to select the BigQuery project
 * @param source the snapshot source whose map tables are searched for {@code tableName}
 * @param tableName name of the source ("from") table to check
 * @param rowIds the row ids to look up
 * @return the matched and mismatched ids
 * @throws java.util.NoSuchElementException if the source has no map table whose from-table
 *     is named {@code tableName}
 * @throws InterruptedException if the query is interrupted
 */
public RowIdMatch matchRowIds(Snapshot snapshot, SnapshotSource source, String tableName, List<String> rowIds) throws InterruptedException {
    // One source: grab it and navigate to the relevant parts
    BigQueryProject bigQueryProject = bigQueryProjectForSnapshot(snapshot);
    // Fail with a descriptive message rather than the bare "No value present" from Optional.get().
    SnapshotMapTable mapTable = source.getSnapshotMapTables().stream()
        .filter(table -> table.getFromTable().getName().equals(tableName))
        .findFirst()
        .orElseThrow(() -> new java.util.NoSuchElementException(
            "No snapshot map table found for source table: " + tableName));
    // create a column to point to the row id column in the source table to check that passed row ids exist in it
    Column rowIdColumn = new Column().table(mapTable.getFromTable()).name(PDAO_ROW_ID_COLUMN);
    ST sqlTemplate = new ST(mapValuesToRowsTemplate);
    sqlTemplate.add("project", bigQueryProject.getProjectId());
    sqlTemplate.add("dataset", prefixName(source.getDataset().getName()));
    sqlTemplate.add("table", tableName);
    sqlTemplate.add("column", rowIdColumn.getName());
    sqlTemplate.add("inputVals", rowIds);
    // Execute the query building the row id match structure that tracks the matching
    // ids and the mismatched ids
    RowIdMatch rowIdMatch = new RowIdMatch();
    String sql = sqlTemplate.render();
    logger.debug("mapValuesToRows sql: {}", sql);
    TableResult result = bigQueryProject.query(sql);
    for (FieldValueList row : result.iterateAll()) {
        // Test getting these by name
        FieldValue rowId = row.get(0);
        FieldValue inputValue = row.get(1);
        String inputString = inputValue.getStringValue();
        if (rowId.isNull()) {
            rowIdMatch.addMismatch(inputString);
            logger.debug("rowId=<NULL>  inVal={}", inputString);
        } else {
            String rowIdString = rowId.getStringValue();
            rowIdMatch.addMatch(inputString, rowIdString);
            logger.debug("rowId={}  inVal={}", rowIdString, inputString);
        }
    }
    return rowIdMatch;
}
Also used : FieldValue(com.google.cloud.bigquery.FieldValue) IngestFileNotFoundException(bio.terra.service.dataset.exception.IngestFileNotFoundException) DatasetService(bio.terra.service.dataset.DatasetService) BigQueryError(com.google.cloud.bigquery.BigQueryError) ViewDefinition(com.google.cloud.bigquery.ViewDefinition) PrimaryDataAccess(bio.terra.common.PrimaryDataAccess) SnapshotDataProject(bio.terra.service.snapshot.SnapshotDataProject) PDAO_EXTERNAL_TABLE_PREFIX(bio.terra.common.PdaoConstant.PDAO_EXTERNAL_TABLE_PREFIX) TableId(com.google.cloud.bigquery.TableId) LoggerFactory(org.slf4j.LoggerFactory) Autowired(org.springframework.beans.factory.annotation.Autowired) StringUtils(org.apache.commons.lang3.StringUtils) PDAO_TEMP_TABLE(bio.terra.common.PdaoConstant.PDAO_TEMP_TABLE) DatasetTable(bio.terra.service.dataset.DatasetTable) BigQuery(com.google.cloud.bigquery.BigQuery) FieldValueList(com.google.cloud.bigquery.FieldValueList) Schema(com.google.cloud.bigquery.Schema) Map(java.util.Map) Table(bio.terra.common.Table) TableResult(com.google.cloud.bigquery.TableResult) PdaoException(bio.terra.common.exception.PdaoException) DataLocationService(bio.terra.service.resourcemanagement.DataLocationService) MismatchedRowIdException(bio.terra.service.tabulardata.exception.MismatchedRowIdException) Field(com.google.cloud.bigquery.Field) DataDeletionTableModel(bio.terra.model.DataDeletionTableModel) LoadJobConfiguration(com.google.cloud.bigquery.LoadJobConfiguration) PDAO_PREFIX(bio.terra.common.PdaoConstant.PDAO_PREFIX) SnapshotRequestRowIdTableModel(bio.terra.model.SnapshotRequestRowIdTableModel) Collection(java.util.Collection) QueryJobConfiguration(com.google.cloud.bigquery.QueryJobConfiguration) SnapshotMapColumn(bio.terra.service.snapshot.SnapshotMapColumn) Instant(java.time.Instant) SnapshotRequestContentsModel(bio.terra.model.SnapshotRequestContentsModel) Collectors(java.util.stream.Collectors) 
CorruptMetadataException(bio.terra.service.snapshot.exception.CorruptMetadataException) MismatchedValueException(bio.terra.service.snapshot.exception.MismatchedValueException) List(java.util.List) AssetSpecification(bio.terra.service.dataset.AssetSpecification) ST(org.stringtemplate.v4.ST) PDAO_LOAD_HISTORY_TABLE(bio.terra.common.PdaoConstant.PDAO_LOAD_HISTORY_TABLE) BadExternalFileException(bio.terra.service.tabulardata.exception.BadExternalFileException) ExternalTableDefinition(com.google.cloud.bigquery.ExternalTableDefinition) Optional(java.util.Optional) PDAO_ROW_ID_COLUMN(bio.terra.common.PdaoConstant.PDAO_ROW_ID_COLUMN) PDAO_ROW_ID_TABLE(bio.terra.common.PdaoConstant.PDAO_ROW_ID_TABLE) FormatOptions(com.google.cloud.bigquery.FormatOptions) HashMap(java.util.HashMap) Column(bio.terra.common.Column) RowIdMatch(bio.terra.service.snapshot.RowIdMatch) CsvOptions(com.google.cloud.bigquery.CsvOptions) ArrayList(java.util.ArrayList) PDAO_TABLE_ID_COLUMN(bio.terra.common.PdaoConstant.PDAO_TABLE_ID_COLUMN) AssetTable(bio.terra.service.dataset.AssetTable) Snapshot(bio.terra.service.snapshot.Snapshot) Job(com.google.cloud.bigquery.Job) InvalidQueryException(bio.terra.grammar.exception.InvalidQueryException) PdaoLoadStatistics(bio.terra.common.PdaoLoadStatistics) DatasetDataProject(bio.terra.service.dataset.DatasetDataProject) LegacySQLTypeName(com.google.cloud.bigquery.LegacySQLTypeName) Logger(org.slf4j.Logger) JobInfo(com.google.cloud.bigquery.JobInfo) Acl(com.google.cloud.bigquery.Acl) SnapshotTable(bio.terra.service.snapshot.SnapshotTable) Profile(org.springframework.context.annotation.Profile) JobStatistics(com.google.cloud.bigquery.JobStatistics) PDAO_LOAD_HISTORY_STAGING_TABLE_PREFIX(bio.terra.common.PdaoConstant.PDAO_LOAD_HISTORY_STAGING_TABLE_PREFIX) TimeUnit(java.util.concurrent.TimeUnit) BulkLoadHistoryModel(bio.terra.model.BulkLoadHistoryModel) Component(org.springframework.stereotype.Component) IngestRequestModel(bio.terra.model.IngestRequestModel) 
IngestFailureException(bio.terra.service.dataset.exception.IngestFailureException) ApplicationConfiguration(bio.terra.app.configuration.ApplicationConfiguration) SnapshotRequestRowIdModel(bio.terra.model.SnapshotRequestRowIdModel) PdaoConstant(bio.terra.common.PdaoConstant) BigQueryPartitionConfigV1(bio.terra.service.dataset.BigQueryPartitionConfigV1) TableInfo(com.google.cloud.bigquery.TableInfo) Dataset(bio.terra.service.dataset.Dataset) SnapshotSource(bio.terra.service.snapshot.SnapshotSource) Collections(java.util.Collections) SnapshotMapTable(bio.terra.service.snapshot.SnapshotMapTable) ST(org.stringtemplate.v4.ST) TableResult(com.google.cloud.bigquery.TableResult) SnapshotMapTable(bio.terra.service.snapshot.SnapshotMapTable) RowIdMatch(bio.terra.service.snapshot.RowIdMatch) SnapshotMapColumn(bio.terra.service.snapshot.SnapshotMapColumn) Column(bio.terra.common.Column) FieldValueList(com.google.cloud.bigquery.FieldValueList) FieldValue(com.google.cloud.bigquery.FieldValue)

Example 4 with Column

use of bio.terra.common.Column in project jade-data-repo by DataBiosphere.

the class BigQueryPdao method buildSchema.

/**
 * Builds the BigQuery schema for a dataset table.
 *
 * <p>Field mode is chosen per column: primary key columns are REQUIRED, array columns
 * that are not part of the primary key are REPEATED, and everything else is NULLABLE.
 *
 * @param table the dataset table supplying the columns and primary key
 * @param addRowIdColumn when true, a STRING row id field is placed first in the schema
 * @return the schema with one field per column, preceded by the row id field if requested
 */
private Schema buildSchema(DatasetTable table, boolean addRowIdColumn) {
    // Collect primary key names up front so each column can be tested by name.
    List<String> keyNames = new ArrayList<>();
    for (Column keyColumn : table.getPrimaryKey()) {
        keyNames.add(keyColumn.getName());
    }

    List<Field> fields = new ArrayList<>();
    if (addRowIdColumn) {
        fields.add(Field.of(PDAO_ROW_ID_COLUMN, LegacySQLTypeName.STRING));
    }

    for (Column column : table.getColumns()) {
        String columnName = column.getName();
        // Primary key membership takes precedence over the array flag.
        Field.Mode fieldMode = keyNames.contains(columnName)
            ? Field.Mode.REQUIRED
            : (column.isArrayOf() ? Field.Mode.REPEATED : Field.Mode.NULLABLE);
        fields.add(
            Field.newBuilder(columnName, translateType(column.getType()))
                .setMode(fieldMode)
                .build());
    }
    return Schema.of(fields);
}
Also used : Field(com.google.cloud.bigquery.Field) SnapshotMapColumn(bio.terra.service.snapshot.SnapshotMapColumn) Column(bio.terra.common.Column) ArrayList(java.util.ArrayList)

Example 5 with Column

use of bio.terra.common.Column in project jade-data-repo by DataBiosphere.

the class BigQueryPdao method buildLiveView.

/**
 * Builds the view definition for a dataset table's live view.
 *
 * <p>The view SQL is rendered from {@code liveViewTemplate} using the project, dataset,
 * raw table name, soft-delete table name and the column list. The column list starts
 * with the row id column, followed by the table's own columns; for tables partitioned
 * by ingest date, {@code _PARTITIONDATE} is appended under the ingest-date alias.
 *
 * @param bigQueryProject the project id substituted into the template
 * @param datasetName the dataset name, used both in the SQL and in the view's table id
 * @param table the dataset table the view is built for
 * @return table info whose id is the table's name in {@code datasetName} and whose
 *     definition is the rendered view SQL
 */
private TableInfo buildLiveView(String bigQueryProject, String datasetName, DatasetTable table) {
    ST template = new ST(liveViewTemplate)
        .add("project", bigQueryProject)
        .add("dataset", datasetName)
        .add("rawTable", table.getRawTableName())
        .add("sdTable", table.getSoftDeleteTableName())
        .add("columns", PDAO_ROW_ID_COLUMN);

    List<String> columnNames = table.getColumns().stream()
        .map(Column::getName)
        .collect(Collectors.toList());
    template.add("columns", columnNames);

    boolean partitionedByIngestDate =
        table.getBigQueryPartitionConfig().getMode() == BigQueryPartitionConfigV1.Mode.INGEST_DATE;
    if (partitionedByIngestDate) {
        template.add("columns", "_PARTITIONDATE AS " + PdaoConstant.PDAO_INGEST_DATE_COLUMN_ALIAS);
    }

    String viewSql = template.render();
    return TableInfo.of(TableId.of(datasetName, table.getName()), ViewDefinition.of(viewSql));
}
Also used : TableId(com.google.cloud.bigquery.TableId) ST(org.stringtemplate.v4.ST) SnapshotMapColumn(bio.terra.service.snapshot.SnapshotMapColumn) Column(bio.terra.common.Column)

Aggregations

Column (bio.terra.common.Column)18 ArrayList (java.util.ArrayList)9 UUID (java.util.UUID)8 MapSqlParameterSource (org.springframework.jdbc.core.namedparam.MapSqlParameterSource)6 Table (bio.terra.common.Table)5 Relationship (bio.terra.common.Relationship)4 AssetColumn (bio.terra.service.dataset.AssetColumn)4 AssetTable (bio.terra.service.dataset.AssetTable)4 Dataset (bio.terra.service.dataset.Dataset)4 DatasetTable (bio.terra.service.dataset.DatasetTable)4 SnapshotMapColumn (bio.terra.service.snapshot.SnapshotMapColumn)4 AssetSpecification (bio.terra.service.dataset.AssetSpecification)3 CorruptMetadataException (bio.terra.service.snapshot.exception.CorruptMetadataException)3 HashMap (java.util.HashMap)3 DaoKeyHolder (bio.terra.common.DaoKeyHolder)2 SnapshotRequestContentsModel (bio.terra.model.SnapshotRequestContentsModel)2 SnapshotRequestRowIdModel (bio.terra.model.SnapshotRequestRowIdModel)2 SnapshotRequestRowIdTableModel (bio.terra.model.SnapshotRequestRowIdTableModel)2 Field (com.google.cloud.bigquery.Field)2 TableId (com.google.cloud.bigquery.TableId)2