Example 1 with DmlInfo

Use of com.google.cloud.teleport.v2.values.DmlInfo in project DataflowTemplates by GoogleCloudPlatform.

The class DataStreamToSQL, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   2) Write JSON Strings to SQL DML Objects
     *   3) Filter stale rows using stateful PK transform
     *   4) Write DML statements to SQL Database via JDBC
     */
    Pipeline pipeline = Pipeline.create(options);
    CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration = getDataSourceConfiguration(options);
    validateOptions(options, dataSourceConfiguration);
    Map<String, String> schemaMap = parseSchemaMap(options.getSchemaMap());
    /*
     * Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
     */
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
        pipeline.apply(
            new DataStreamIO(
                    options.getStreamName(),
                    options.getInputFilePattern(),
                    options.getInputFileFormat(),
                    options.getGcsPubSubSubscription(),
                    options.getRfcStartDateTime())
                .withLowercaseSourceColumns()
                .withHashColumnValue("_metadata_row_id", "rowid"));
    /*
     * Stage 2: Write JSON Strings to SQL DML Objects
     *   a) Convert JSON String FailsafeElements to DmlInfo objects (dmlStatements)
     * Stage 3: Filter stale rows using the stateful PK transform
     */
    PCollection<DmlInfo> dmlStatements =
        datastreamJsonRecords
            .apply(
                "Format to DML",
                CreateDml.of(dataSourceConfiguration).withSchemaMap(schemaMap))
            .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
    /*
     * Stage 4: Write DML statements to the target SQL database via JDBC
     */
    dmlStatements.apply(
        "Write to SQL",
        CdcJdbcIO.<DmlInfo>write()
            .withDataSourceConfiguration(dataSourceConfiguration)
            .withStatementFormatter(
                new CdcJdbcIO.StatementFormatter<DmlInfo>() {
                    @Override
                    public String formatStatement(DmlInfo element) {
                        return element.getDmlSql();
                    }
                }));
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used: DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO), DmlInfo (com.google.cloud.teleport.v2.values.DmlInfo), CdcJdbcIO (com.google.cloud.teleport.v2.io.CdcJdbcIO), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement)
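
For orientation, here is a minimal sketch of wiring this entry point into a main() method. The launcher class name is hypothetical and the package path for DataStreamToSQL is assumed; only the PipelineOptionsFactory calls are standard Beam API.

import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
// Assumed location of the template class; adjust to your checkout.
import com.google.cloud.teleport.v2.templates.DataStreamToSQL;

public class RunDataStreamToSQL {
    public static void main(String[] args) {
        // Parse flags such as --streamName and --inputFilePattern into the
        // template's Options interface, validating required fields.
        DataStreamToSQL.Options options =
            PipelineOptionsFactory.fromArgs(args)
                .withValidation()
                .as(DataStreamToSQL.Options.class);
        PipelineResult result = DataStreamToSQL.run(options);
        // Block until the pipeline finishes (useful with the direct runner).
        result.waitUntilFinish();
    }
}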

Example 2 with DmlInfo

Use of com.google.cloud.teleport.v2.values.DmlInfo in project DataflowTemplates by GoogleCloudPlatform.

The class DataStreamToPostgres, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   2) Write JSON Strings to Postgres DML Objects
     *   3) Filter stale rows using stateful PK transform
     *   4) Write DML statements to Postgres
     */
    Pipeline pipeline = Pipeline.create(options);
    String jdbcDriverConnectionString =
        String.format(
            "jdbc:postgresql://%s:%s/%s",
            options.getDatabaseHost(), options.getDatabasePort(), options.getDatabaseName());
    CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration =
        CdcJdbcIO.DataSourceConfiguration.create(
                "org.postgresql.Driver", jdbcDriverConnectionString)
            .withUsername(options.getDatabaseUser())
            .withPassword(options.getDatabasePassword())
            .withMaxIdleConnections(0);
    validateOptions(options, dataSourceConfiguration);
    /*
     * Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
     */
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
        pipeline.apply(
            new DataStreamIO(
                    options.getStreamName(),
                    options.getInputFilePattern(),
                    options.getInputFileFormat(),
                    options.getGcsPubSubSubscription(),
                    options.getRfcStartDateTime())
                .withLowercaseSourceColumns()
                .withHashColumnValue("_metadata_row_id", "rowid"));
    /*
     * Stage 2: Write JSON Strings to Postgres DML Objects
     *   a) Convert JSON String FailsafeElements to DmlInfo objects (dmlStatements)
     * Stage 3: Filter stale rows using the stateful PK transform
     */
    PCollection<DmlInfo> dmlStatements =
        datastreamJsonRecords
            .apply("Format to Postgres DML", CreateDml.createDmlObjects(dataSourceConfiguration))
            .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
    /*
     * Stage 4: Write DML statements to Postgres
     */
    dmlStatements.apply(
        "Write to Postgres",
        CdcJdbcIO.<DmlInfo>write()
            .withDataSourceConfiguration(dataSourceConfiguration)
            .withStatementFormatter(
                new CdcJdbcIO.StatementFormatter<DmlInfo>() {
                    @Override
                    public String formatStatement(DmlInfo element) {
                        return element.getDmlSql();
                    }
                }));
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used: DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO), DmlInfo (com.google.cloud.teleport.v2.values.DmlInfo), CdcJdbcIO (com.google.cloud.teleport.v2.io.CdcJdbcIO), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement)
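
Both templates assemble the JDBC URL the same way, so a plain-JDBC sketch like the following can sanity-check the URL shape and credentials before a job is launched. Host, port, database, and credentials are placeholders; only java.sql and the Postgres driver are required.

import java.sql.Connection;
import java.sql.DriverManager;

public class PostgresConnectionCheck {
    public static void main(String[] args) throws Exception {
        // Same URL shape the template builds from its --database* options.
        String url = String.format("jdbc:postgresql://%s:%s/%s", "localhost", "5432", "postgres");
        // DriverManager locates org.postgresql.Driver on the classpath.
        try (Connection conn = DriverManager.getConnection(url, "postgres", "secret")) {
            System.out.println("Connected to " + conn.getMetaData().getURL());
        }
    }
}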

Example 3 with DmlInfo

Use of com.google.cloud.teleport.v2.values.DmlInfo in project DataflowTemplates by GoogleCloudPlatform.

The class DatabaseMigrationUtils, method convertJsonToDmlInfo.

public KV<String, DmlInfo> convertJsonToDmlInfo(FailsafeElement<String, String> element) {
    String jsonString = element.getPayload();
    ObjectMapper mapper = new ObjectMapper();
    JsonNode rowObj;
    try {
        rowObj = mapper.readTree(jsonString);
    } catch (IOException e) {
        LOG.error("IOException: {} :: {}", jsonString, e.toString());
        DmlInfo dmlInfo =
            DmlInfo.of(
                element.getOriginalPayload(), "", "", "",
                new ArrayList<String>(), new ArrayList<String>(),
                new ArrayList<String>(), new ArrayList<String>());
        // TODO(dhercher): how should we handle bad data?
        return KV.of(jsonString, dmlInfo);
    }
    try {
        // Oracle uses upper case while Postgres uses all lowercase.
        // We lowercase the values of these metadata fields to align with
        // our schema conversion rules.
        String schemaName = this.getPostgresSchemaName(rowObj);
        String tableName = this.getPostgresTableName(rowObj);
        Map<String, String> tableSchema = this.getTableSchema(schemaName, tableName);
        List<String> primaryKeys = this.getPrimaryKeys(schemaName, tableName, rowObj);
        List<String> orderByFields = Arrays.asList("_metadata_timestamp", "_metadata_scn");
        List<String> primaryKeyValues = getFieldValues(rowObj, primaryKeys);
        List<String> orderByValues = getFieldValues(rowObj, orderByFields);
        if (tableSchema.isEmpty()) {
            // If the table does not exist, supply an empty SQL value (no-op).
            DmlInfo dmlInfo =
                DmlInfo.of(
                    element.getOriginalPayload(), "", schemaName, tableName,
                    primaryKeys, orderByFields, primaryKeyValues, orderByValues);
            return KV.of(jsonString, dmlInfo);
        }
        String dmlSql;
        if (rowObj.get("_metadata_deleted").asBoolean()) {
            dmlSql = convertJsonToDeleteSql(rowObj, tableSchema, schemaName, tableName, primaryKeys);
        } else if (primaryKeys.isEmpty()) {
            // TODO(dhercher): Do we choose to support this case?
            dmlSql = convertJsonToInsertSql(rowObj, tableSchema, schemaName, tableName);
        } else {
            dmlSql = convertJsonToUpsertSql(rowObj, tableSchema, schemaName, tableName, primaryKeys);
        }
        DmlInfo dmlInfo =
            DmlInfo.of(
                element.getOriginalPayload(), dmlSql, schemaName, tableName,
                primaryKeys, orderByFields, primaryKeyValues, orderByValues);
        return KV.of(dmlInfo.getStateWindowKey(), dmlInfo);
    } catch (Exception e) {
        LOG.error("Value Error: {} :: {}", rowObj.toString(), e.toString());
        DmlInfo dmlInfo =
            DmlInfo.of(
                element.getOriginalPayload(), "", "", "",
                new ArrayList<String>(), new ArrayList<String>(),
                new ArrayList<String>(), new ArrayList<String>());
        // TODO(dhercher): how should we handle bad data?
        return KV.of(jsonString, dmlInfo);
    }
}
Also used: ArrayList (java.util.ArrayList), JsonNode (org.codehaus.jackson.JsonNode), IOException (java.io.IOException), DmlInfo (com.google.cloud.teleport.v2.values.DmlInfo), ObjectMapper (org.codehaus.jackson.map.ObjectMapper), SQLException (java.sql.SQLException)
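
A hand-built record makes the two return paths easier to follow. The fragment below is a test-style sketch, not part of the class: utils stands for a constructed DatabaseMigrationUtils, the record fields are illustrative, and FailsafeElement.of(original, payload) is assumed to be the project's factory method.

// Illustrative Datastream-style event; real records carry more _metadata_* fields.
String json = "{\"_metadata_schema\": \"HR\", \"_metadata_table\": \"EMPLOYEES\","
    + " \"_metadata_timestamp\": 1640995200, \"_metadata_scn\": 12345,"
    + " \"_metadata_deleted\": false, \"employee_id\": 7, \"name\": \"Ada\"}";
FailsafeElement<String, String> element = FailsafeElement.of(json, json);
KV<String, DmlInfo> result = utils.convertJsonToDmlInfo(element);
// On success the key is dmlInfo.getStateWindowKey(), which the stateful PK
// transform uses to order mutations; on a parse or value error the method
// falls back to an empty DmlInfo keyed by the raw JSON string.
System.out.println(result.getKey() + " -> " + result.getValue().getDmlSql());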

Example 4 with DmlInfo

Use of com.google.cloud.teleport.v2.values.DmlInfo in project DataflowTemplates by GoogleCloudPlatform.

The class DatastreamToDML, method convertJsonToDmlInfo.

public DmlInfo convertJsonToDmlInfo(JsonNode rowObj, String failsafeValue) {
    DatastreamRow row = DatastreamRow.of(rowObj);
    try {
        // Oracle uses upper case while Postgres uses all lowercase.
        // We lowercase the values of these metadata fields to align with
        // our schema conversion rules.
        String catalogName = this.getTargetCatalogName(row);
        String schemaName = this.getTargetSchemaName(row);
        String tableName = this.getTargetTableName(row);
        Map<String, String> tableSchema = this.getTableSchema(catalogName, schemaName, tableName);
        if (tableSchema.isEmpty()) {
            // If the table does not exist, return null (no-op).
            return null;
        }
        List<String> primaryKeys = this.getPrimaryKeys(catalogName, schemaName, tableName, rowObj);
        List<String> orderByFields = row.getSortFields();
        List<String> primaryKeyValues = getFieldValues(rowObj, primaryKeys, tableSchema);
        List<String> orderByValues = getFieldValues(rowObj, orderByFields, tableSchema);
        String dmlSqlTemplate = getDmlTemplate(rowObj, primaryKeys);
        Map<String, String> sqlTemplateValues =
            getSqlTemplateValues(
                rowObj, catalogName, schemaName, tableName, primaryKeys, tableSchema);
        String dmlSql = StringSubstitutor.replace(dmlSqlTemplate, sqlTemplateValues, "{", "}");
        return DmlInfo.of(
            failsafeValue, dmlSql, schemaName, tableName,
            primaryKeys, orderByFields, primaryKeyValues, orderByValues);
    } catch (DeletedWithoutPrimaryKey e) {
        LOG.error("CDC Error: {} :: {}", rowObj.toString(), e.toString());
        return null;
    } catch (Exception e) {
        // TODO(dhercher): Consider raising an error and pushing to DLQ
        LOG.error("Value Error: {} :: {}", rowObj.toString(), e.toString());
        return null;
    }
}
Also used: DatastreamRow (com.google.cloud.teleport.v2.values.DatastreamRow), SQLException (java.sql.SQLException), IOException (java.io.IOException)
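
The final substitution step can be demonstrated in isolation. Apache Commons Text provides the StringSubstitutor.replace(source, valueMap, prefix, suffix) overload used above; the template and value map below are illustrative, not the exact output of getDmlTemplate and getSqlTemplateValues.

import java.util.HashMap;
import java.util.Map;
import org.apache.commons.text.StringSubstitutor;

public class DmlTemplateDemo {
    public static void main(String[] args) {
        // Illustrative template; the real one comes from getDmlTemplate(rowObj, primaryKeys).
        String template =
            "INSERT INTO {schema_name}.{table_name} ({columns}) VALUES ({values})";
        Map<String, String> values = new HashMap<>();
        values.put("schema_name", "hr");
        values.put("table_name", "employees");
        values.put("columns", "employee_id, name");
        values.put("values", "7, 'Ada'");
        // "{" and "}" override StringSubstitutor's default "${" / "}" delimiters.
        System.out.println(StringSubstitutor.replace(template, values, "{", "}"));
        // Prints: INSERT INTO hr.employees (employee_id, name) VALUES (7, 'Ada')
    }
}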

Example 5 with DmlInfo

Use of com.google.cloud.teleport.v2.values.DmlInfo in project DataflowTemplates by GoogleCloudPlatform.

The class DatastreamToDML, method processElement.

@ProcessElement
public void processElement(ProcessContext context) {
    FailsafeElement<String, String> element = context.element();
    String jsonString = element.getPayload();
    ObjectMapper mapper = new ObjectMapper();
    JsonNode rowObj;
    try {
        rowObj = mapper.readTree(jsonString);
        DmlInfo dmlInfo = convertJsonToDmlInfo(rowObj, element.getOriginalPayload());
        // A null DmlInfo means no DML is required.
        if (dmlInfo != null) {
            context.output(KV.of(dmlInfo.getStateWindowKey(), dmlInfo));
        }
    } catch (IOException e) {
        // TODO(dhercher): Push failure to DLQ collection
        LOG.error("IOException: {} :: {}", jsonString, e.toString());
    }
}
Also used: JsonNode (org.codehaus.jackson.JsonNode), IOException (java.io.IOException), DmlInfo (com.google.cloud.teleport.v2.values.DmlInfo), ObjectMapper (org.codehaus.jackson.map.ObjectMapper)
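
The TODO above calls for pushing failures to a DLQ collection. A common Beam pattern is a second TupleTag consumed via withOutputTags; the sketch below is hypothetical (the template wires its dead-letter handling elsewhere) and stubs out the conversion from Example 4.

import java.io.IOException;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.TupleTag;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import com.google.cloud.teleport.v2.values.DmlInfo;
import com.google.cloud.teleport.v2.values.FailsafeElement;

public class DatastreamToDmlWithDlq
        extends DoFn<FailsafeElement<String, String>, KV<String, DmlInfo>> {

    // Main output: keyed DmlInfo, as in processElement above.
    public static final TupleTag<KV<String, DmlInfo>> DML_TAG =
        new TupleTag<KV<String, DmlInfo>>() {};
    // Side output: records that failed JSON parsing.
    public static final TupleTag<FailsafeElement<String, String>> DLQ_TAG =
        new TupleTag<FailsafeElement<String, String>>() {};

    @ProcessElement
    public void processElement(ProcessContext context) {
        FailsafeElement<String, String> element = context.element();
        try {
            JsonNode rowObj = new ObjectMapper().readTree(element.getPayload());
            DmlInfo dmlInfo = convertJsonToDmlInfo(rowObj, element.getOriginalPayload());
            if (dmlInfo != null) {
                context.output(KV.of(dmlInfo.getStateWindowKey(), dmlInfo));
            }
        } catch (IOException e) {
            // Emit the unparseable record on the DLQ side output instead of dropping it.
            context.output(DLQ_TAG, element);
        }
    }

    // Stand-in for DatastreamToDML.convertJsonToDmlInfo; see Example 4.
    private DmlInfo convertJsonToDmlInfo(JsonNode rowObj, String failsafeValue) {
        return null; // hypothetical: delegate to the real implementation
    }
}

At apply time this DoFn would be attached with ParDo.of(new DatastreamToDmlWithDlq()).withOutputTags(DML_TAG, TupleTagList.of(DLQ_TAG)), and the DLQ PCollection retrieved from the resulting PCollectionTuple.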

Aggregations

DmlInfo (com.google.cloud.teleport.v2.values.DmlInfo) 4
IOException (java.io.IOException) 3
DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO) 2
CdcJdbcIO (com.google.cloud.teleport.v2.io.CdcJdbcIO) 2
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement) 2
SQLException (java.sql.SQLException) 2
Pipeline (org.apache.beam.sdk.Pipeline) 2
JsonNode (org.codehaus.jackson.JsonNode) 2
ObjectMapper (org.codehaus.jackson.map.ObjectMapper) 2
DatastreamRow (com.google.cloud.teleport.v2.values.DatastreamRow) 1
ArrayList (java.util.ArrayList) 1