
Example 1 with Filter

Use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in the DataflowTemplates project by GoogleCloudPlatform.

The class DataStreamToSQL, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   2) Write JSON Strings to SQL DML Objects
     *   3) Filter stale rows using stateful PK transform
     *   4) Write DML statements to SQL Database via jdbc
     */
    Pipeline pipeline = Pipeline.create(options);
    CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration = getDataSourceConfiguration(options);
    validateOptions(options, dataSourceConfiguration);
    Map<String, String> schemaMap = parseSchemaMap(options.getSchemaMap());
    /*
     * Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
     */
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
        pipeline.apply(
            new DataStreamIO(
                    options.getStreamName(),
                    options.getInputFilePattern(),
                    options.getInputFileFormat(),
                    options.getGcsPubSubSubscription(),
                    options.getRfcStartDateTime())
                .withLowercaseSourceColumns()
                .withHashColumnValue("_metadata_row_id", "rowid"));
    /*
     * Stage 2: Write JSON Strings to SQL Insert Strings
     *   a) Convert JSON String FailsafeElements to TableRow's (tableRowRecords)
     * Stage 3) Filter stale rows using stateful PK transform
     */
    PCollection<DmlInfo> dmlStatements =
        datastreamJsonRecords
            .apply("Format to DML", CreateDml.of(dataSourceConfiguration).withSchemaMap(schemaMap))
            .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
    /*
     * Stage 4: Write Inserts to CloudSQL
     */
    dmlStatements.apply(
        "Write to SQL",
        CdcJdbcIO.<DmlInfo>write()
            .withDataSourceConfiguration(dataSourceConfiguration)
            .withStatementFormatter(
                new CdcJdbcIO.StatementFormatter<DmlInfo>() {
                    @Override
                    public String formatStatement(DmlInfo element) {
                        return element.getDmlSql();
                    }
                }));
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used : DataStreamIO(com.google.cloud.teleport.v2.cdc.sources.DataStreamIO) DmlInfo(com.google.cloud.teleport.v2.values.DmlInfo) CdcJdbcIO(com.google.cloud.teleport.v2.io.CdcJdbcIO) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement)
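
For context, the run method above is normally driven from a small main method that binds command-line flags to the template's Options interface (assumed here to be a nested interface of DataStreamToSQL, as the run signature suggests). The sketch below shows one plausible wiring; the launcher class name is hypothetical and the real template's entry point may differ.

import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Hypothetical launcher: parse args into the template's Options and call run(...).
public final class DataStreamToSQLLauncher {

    public static void main(String[] args) {
        // Binds flags such as --streamName and --inputFilePattern to the Options interface.
        DataStreamToSQL.Options options =
            PipelineOptionsFactory.fromArgs(args).withValidation().as(DataStreamToSQL.Options.class);
        PipelineResult result = DataStreamToSQL.run(options);
        // Optional for template launches; blocks until the streaming pipeline terminates.
        result.waitUntilFinish();
    }
}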

Example 2 with Filter

Use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in the DataflowTemplates project by GoogleCloudPlatform.

The class DataStreamToPostgres, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   2) Write JSON Strings to Postgres DML Objects
     *   3) Filter stale rows using stateful PK transform
     *   4) Write DML statements to Postgres
     */
    Pipeline pipeline = Pipeline.create(options);
    String jdbcDriverConnectionString =
        String.format(
            "jdbc:postgresql://%s:%s/%s",
            options.getDatabaseHost(), options.getDatabasePort(), options.getDatabaseName());
    CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration =
        CdcJdbcIO.DataSourceConfiguration.create("org.postgresql.Driver", jdbcDriverConnectionString)
            .withUsername(options.getDatabaseUser())
            .withPassword(options.getDatabasePassword())
            .withMaxIdleConnections(Integer.valueOf(0));
    validateOptions(options, dataSourceConfiguration);
    /*
     * Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
     */
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
        pipeline.apply(
            new DataStreamIO(
                    options.getStreamName(),
                    options.getInputFilePattern(),
                    options.getInputFileFormat(),
                    options.getGcsPubSubSubscription(),
                    options.getRfcStartDateTime())
                .withLowercaseSourceColumns()
                .withHashColumnValue("_metadata_row_id", "rowid"));
    /*
     * Stage 2: Write JSON Strings to Postgres Insert Strings
     *   a) Convert JSON String FailsafeElements to TableRow's (tableRowRecords)
     * Stage 3) Filter stale rows using stateful PK transform
     */
    PCollection<DmlInfo> dmlStatements =
        datastreamJsonRecords
            .apply("Format to Postgres DML", CreateDml.createDmlObjects(dataSourceConfiguration))
            .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
    /*
     * Stage 4: Write Inserts to Postgres
     */
    dmlStatements.apply(
        "Write to Postgres",
        CdcJdbcIO.<DmlInfo>write()
            .withDataSourceConfiguration(dataSourceConfiguration)
            .withStatementFormatter(
                new CdcJdbcIO.StatementFormatter<DmlInfo>() {
                    @Override
                    public String formatStatement(DmlInfo element) {
                        return element.getDmlSql();
                    }
                }));
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used : DataStreamIO(com.google.cloud.teleport.v2.cdc.sources.DataStreamIO) DmlInfo(com.google.cloud.teleport.v2.values.DmlInfo) CdcJdbcIO(com.google.cloud.teleport.v2.io.CdcJdbcIO) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement)
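
The connection string format above ("jdbc:postgresql://host:port/db") can be sanity-checked outside Beam before launching the template. The following standalone sketch uses plain JDBC and needs the Postgres driver on the classpath; the host, port, database name, and credentials are placeholders for the corresponding pipeline options.

import java.sql.Connection;
import java.sql.DriverManager;

// Standalone connectivity check (not part of the template).
public final class PostgresConnectionCheck {

    public static void main(String[] args) throws Exception {
        String host = "10.0.0.5";      // placeholder for --databaseHost
        String port = "5432";          // placeholder for --databasePort
        String database = "inventory"; // placeholder for --databaseName
        // Same URL shape the template builds with String.format(...).
        String url = String.format("jdbc:postgresql://%s:%s/%s", host, port, database);
        try (Connection conn = DriverManager.getConnection(url, "postgres", "secret")) {
            System.out.println("Connected to: " + conn.getMetaData().getDatabaseProductVersion());
        }
    }
}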

Example 3 with Filter

Use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in the DataflowTemplates project by GoogleCloudPlatform.

The class ShadowTableCreator, method constructShadowTable.

/*
   * Constructs a shadow table for a data table in the information schema.
   * Note: Shadow tables for interleaved tables are not interleaved to
   * their shadow parent table.
   */
Table constructShadowTable(Ddl informationSchema, String dataTableName) {
    // Create a new shadow table with the given prefix.
    Table.Builder shadowTableBuilder = Table.builder();
    String shadowTableName = shadowTablePrefix + dataTableName;
    shadowTableBuilder.name(shadowTableName);
    // Add key columns from the data table to the shadow table builder.
    Table dataTable = informationSchema.table(dataTableName);
    Set<String> primaryKeyColNames =
        dataTable.primaryKeys().stream().map(k -> k.name()).collect(Collectors.toSet());
    List<Column> primaryKeyCols =
        dataTable.columns().stream()
            .filter(col -> primaryKeyColNames.contains(col.name()))
            .collect(Collectors.toList());
    for (Column col : primaryKeyCols) {
        shadowTableBuilder.addColumn(col);
    }
    // Add primary key constraints.
    for (IndexColumn keyColumn : dataTable.primaryKeys()) {
        if (keyColumn.order() == IndexColumn.Order.ASC) {
            shadowTableBuilder.primaryKey().asc(keyColumn.name()).end();
        } else if (keyColumn.order() == IndexColumn.Order.DESC) {
            shadowTableBuilder.primaryKey().desc(keyColumn.name()).end();
        }
    }
    // Add extra column to track ChangeEventSequence information
    addChangeEventSequenceColumns(shadowTableBuilder);
    return shadowTableBuilder.build();
}
Also used : List(java.util.List) Pair(org.apache.commons.lang3.tuple.Pair) DatastreamConstants(com.google.cloud.teleport.v2.templates.datastream.DatastreamConstants) Ddl(com.google.cloud.teleport.v2.templates.spanner.ddl.Ddl) IndexColumn(com.google.cloud.teleport.v2.templates.spanner.ddl.IndexColumn) Column(com.google.cloud.teleport.v2.templates.spanner.ddl.Column) Map(java.util.Map) Table(com.google.cloud.teleport.v2.templates.spanner.ddl.Table) Set(java.util.Set) Collectors(java.util.stream.Collectors)
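
The key step in constructShadowTable is selecting only the primary-key columns of the data table. The sketch below replays that stream filter on plain strings, independent of the Spanner DDL classes; the table and column names are made up for illustration.

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

// Illustration of the primary-key column selection using plain collections.
public final class ShadowColumnSelection {

    public static void main(String[] args) {
        List<String> dataTableColumns = List.of("singer_id", "album_id", "title", "release_date");
        Set<String> primaryKeyColNames = Set.of("singer_id", "album_id");

        // Same filter pattern as constructShadowTable: keep key columns, preserve column order.
        List<String> shadowColumns =
            dataTableColumns.stream()
                .filter(primaryKeyColNames::contains)
                .collect(Collectors.toList());

        System.out.println(shadowColumns); // prints [singer_id, album_id]
    }
}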

Example 4 with Filter

Use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in the DataflowTemplates project by GoogleCloudPlatform.

The class DataplexFileFormatConversion, method run.

/**
 * Runs the pipeline to completion with the specified options.
 *
 * @return The pipeline result.
 */
public static PipelineResult run(
    Pipeline pipeline,
    FileFormatConversionOptions options,
    DataplexClient dataplex,
    OutputPathProvider outputPathProvider)
    throws IOException {
    boolean isInputAsset = ASSET_PATTERN.matcher(options.getInputAssetOrEntitiesList()).matches();
    if (!isInputAsset && !ENTITIES_PATTERN.matcher(options.getInputAssetOrEntitiesList()).matches()) {
        throw new IllegalArgumentException("Either input asset or input entities list must be provided");
    }
    GoogleCloudDataplexV1Asset outputAsset = dataplex.getAsset(options.getOutputAsset());
    if (outputAsset == null
        || outputAsset.getResourceSpec() == null
        || !DataplexAssetResourceSpec.STORAGE_BUCKET.name().equals(outputAsset.getResourceSpec().getType())
        || outputAsset.getResourceSpec().getName() == null) {
        throw new IllegalArgumentException(
            "Output asset must be an existing asset with resource spec name being a GCS bucket and"
                + " resource spec type of "
                + DataplexAssetResourceSpec.STORAGE_BUCKET.name());
    }
    String outputBucket = outputAsset.getResourceSpec().getName();
    Predicate<String> inputFilesFilter;
    switch (options.getWriteDisposition()) {
        case OVERWRITE:
            inputFilesFilter = inputFilePath -> true;
            break;
        case FAIL:
            // Captured by the lambda below, so it must be effectively final.
            Set<String> existingOutputFilePaths = getAllOutputFilePaths(outputBucket);
            inputFilesFilter = inputFilePath -> {
                if (existingOutputFilePaths.contains(
                        inputFilePathToOutputFilePath(
                            outputPathProvider, inputFilePath, outputBucket, options.getOutputFileFormat()))) {
                    throw new WriteDispositionException(
                        String.format(
                            "The file %s already exists in the output asset bucket: %s",
                            inputFilePath, outputBucket));
                }
                return true;
            };
            break;
        case SKIP:
            // A separately named, effectively final set for this case's lambda.
            Set<String> skippableOutputFilePaths = getAllOutputFilePaths(outputBucket);
            inputFilesFilter = inputFilePath ->
                !skippableOutputFilePaths.contains(
                    inputFilePathToOutputFilePath(
                        outputPathProvider, inputFilePath, outputBucket, options.getOutputFileFormat()));
            break;
        default:
            throw new UnsupportedOperationException(
                "Unsupported existing file behaviour: " + options.getWriteDisposition());
    }
    ImmutableList<GoogleCloudDataplexV1Entity> entities =
        isInputAsset
            ? dataplex.getCloudStorageEntities(options.getInputAssetOrEntitiesList())
            : dataplex.getEntities(
                Splitter.on(',').trimResults().splitToList(options.getInputAssetOrEntitiesList()));
    boolean convertingFiles = false;
    for (GoogleCloudDataplexV1Entity entity : entities) {
        ImmutableList<GoogleCloudDataplexV1Partition> partitions = dataplex.getPartitions(entity.getName());
        if (partitions.isEmpty()) {
            String outputPath = outputPathProvider.outputPathFrom(entity.getDataPath(), outputBucket);
            Iterator<String> inputFilePaths =
                getFilesFromFilePattern(entityToFileSpec(entity)).filter(inputFilesFilter).iterator();
            convertingFiles = inputFilePaths.hasNext();
            inputFilePaths.forEachRemaining(
                inputFilePath ->
                    pipeline.apply(
                        "Convert " + shortenDataplexName(entity.getName()),
                        new ConvertFiles(entity, inputFilePath, options, outputPath)));
        } else {
            for (GoogleCloudDataplexV1Partition partition : partitions) {
                String outputPath = outputPathProvider.outputPathFrom(partition.getLocation(), outputBucket);
                Iterator<String> inputFilePaths =
                    getFilesFromFilePattern(partitionToFileSpec(partition)).filter(inputFilesFilter).iterator();
                convertingFiles = inputFilePaths.hasNext();
                inputFilePaths.forEachRemaining(
                    inputFilePath ->
                        pipeline.apply(
                            "Convert " + shortenDataplexName(partition.getName()),
                            new ConvertFiles(entity, inputFilePath, options, outputPath)));
            }
        }
    }
    if (!convertingFiles) {
        pipeline.apply("Nothing to convert", new NoopTransform());
    }
    return pipeline.run();
}
Also used : NoopTransform(com.google.cloud.teleport.v2.transforms.NoopTransform) GoogleCloudDataplexV1Partition(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Partition) WriteDispositionException(com.google.cloud.teleport.v2.utils.WriteDisposition.WriteDispositionException) GoogleCloudDataplexV1Asset(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset) GoogleCloudDataplexV1Entity(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Entity)
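
The switch over getWriteDisposition() is the core of this example: it turns a write disposition into a Predicate<String> over input file paths. The sketch below isolates that logic with a local enum and a plain RuntimeException standing in for the template's WriteDispositionOptions and WriteDispositionException; unlike the template, it compares paths directly instead of first mapping each input path to its output path.

import java.util.Set;
import java.util.function.Predicate;

// Standalone sketch of the write-disposition filter logic.
public final class WriteDispositionFilters {

    enum Disposition { OVERWRITE, FAIL, SKIP }

    static Predicate<String> filterFor(Disposition disposition, Set<String> existingOutputPaths) {
        switch (disposition) {
            case OVERWRITE:
                // Convert everything; existing output is overwritten.
                return path -> true;
            case FAIL:
                // Fail fast as soon as a conversion would collide with existing output.
                return path -> {
                    if (existingOutputPaths.contains(path)) {
                        throw new IllegalStateException("Output already exists: " + path);
                    }
                    return true;
                };
            case SKIP:
                // Quietly drop inputs whose output already exists.
                return path -> !existingOutputPaths.contains(path);
            default:
                throw new UnsupportedOperationException("Unsupported disposition: " + disposition);
        }
    }

    public static void main(String[] args) {
        Set<String> existing = Set.of("gs://bucket/out/a.parquet");
        System.out.println(filterFor(Disposition.SKIP, existing).test("gs://bucket/out/a.parquet")); // false
        System.out.println(filterFor(Disposition.SKIP, existing).test("gs://bucket/out/b.parquet")); // true
    }
}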

Example 5 with Filter

Use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in the DataflowTemplates project by GoogleCloudPlatform.

The class BigQueryMetadataLoader, method loadTableMetadata.

/**
 * Populates {@code table} builder with additional metadata like partition names and schema.
 *
 * @param filter optional filter to skip a subset of tables
 * @return {@code true} if the table matches all filters and should be included in the results,
 *     {@code false} if it should be skipped
 */
private boolean loadTableMetadata(BigQueryTable.Builder table, Filter filter) throws InterruptedException {
    TableReadOptions.Builder readOptions = TableReadOptions.newBuilder();
    if (table.getPartitioningColumn() == null) {
        if (filter != null && filter.shouldSkipUnpartitionedTable(table)) {
            return false;
        }
    } else {
        List<BigQueryTablePartition> partitions = loadTablePartitions(table, filter);
        if (filter != null && filter.shouldSkipPartitionedTable(table, partitions)) {
            return false;
        }
        table.setPartitions(partitions);
        LOG.info("Loaded {} partitions for table {}: {}", partitions.size(), table.getTableName(), partitions);
        // Creating a ReadSession without a WHERE clause for a partitioned table that has
        // "require partition filter" param set to true would fail with the error:
        // "Cannot query over table ... without a filter over column(s) ...
        // that can be used for partition elimination".
        // The following is a hack that adds an "is null and is not null" filter over the
        // partitioning column, which shouldn't select any data but should make the query
        // analyzer happy and should be enough to extract the table schema.
        // TODO(an2x): do this only when "require partition filter" = true
        // or load schema differently?
        readOptions.setRowRestriction(
            String.format(
                "%s is null and %s is not null",
                table.getPartitioningColumn(), table.getPartitioningColumn()));
    }
    ReadSession session =
        BigQueryUtils.createReadSession(
            bqsClient,
            DatasetId.of(table.getProject(), table.getDataset()),
            table.getTableName(),
            readOptions.build());
    table.setSchema(new Schema.Parser().parse(session.getAvroSchema().getSchema()));
    LOG.info("Loaded schema for table {}: {}", table.getTableName(), table.getSchema());
    return true;
}
Also used : BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) ReadSession(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) TableReadOptions(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions)
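
Example 5 is the only snippet that exercises the Filter type directly: loadTableMetadata calls shouldSkipUnpartitionedTable and shouldSkipPartitionedTable, and loadTablePartitions receives the same filter. The sketch below implements a filter that keeps only partitioned tables that already have partitions; the method signatures are inferred from those call sites, and the real interface may declare its callbacks (including the assumed per-partition check) slightly differently.

import com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter;
import com.google.cloud.teleport.v2.values.BigQueryTable;
import com.google.cloud.teleport.v2.values.BigQueryTablePartition;
import java.util.List;

// Sketch of a Filter that keeps only partitioned tables with at least one partition.
public class PartitionedTablesOnlyFilter implements Filter {

    @Override
    public boolean shouldSkipUnpartitionedTable(BigQueryTable.Builder table) {
        // Unpartitioned tables are not of interest here.
        return true;
    }

    @Override
    public boolean shouldSkipPartitionedTable(
            BigQueryTable.Builder table, List<BigQueryTablePartition> partitions) {
        // Nothing to read if the table has no partitions yet.
        return partitions.isEmpty();
    }

    // Assumed per-partition callback used by loadTablePartitions; keep every partition.
    @Override
    public boolean shouldSkipPartition(BigQueryTable.Builder table, BigQueryTablePartition partition) {
        return false;
    }
}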

Aggregations

Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter) 10
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable) 10
Test (org.junit.Test) 10
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition) 8
TableResult (com.google.cloud.bigquery.TableResult) 2
DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO) 2
CdcJdbcIO (com.google.cloud.teleport.v2.io.CdcJdbcIO) 2
DmlInfo (com.google.cloud.teleport.v2.values.DmlInfo) 2
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement) 2
ArrayList (java.util.ArrayList) 2
List (java.util.List) 2
Collectors (java.util.stream.Collectors) 2
Pipeline (org.apache.beam.sdk.Pipeline) 2
GoogleCloudDataplexV1Asset (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset) 1
GoogleCloudDataplexV1Entity (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Entity) 1
GoogleCloudDataplexV1Partition (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Partition) 1
TableReadOptions (com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions) 1
ReadSession (com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) 1
AvroSinkWithJodaDatesConversion (com.google.cloud.teleport.v2.io.AvroSinkWithJodaDatesConversion) 1
DatastreamConstants (com.google.cloud.teleport.v2.templates.datastream.DatastreamConstants) 1