Use of com.google.cloud.teleport.v2.values.DmlInfo in project DataflowTemplates by GoogleCloudPlatform.
The class DataStreamToSQL, method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to SQL DML Objects
* 3) Filter stale rows using stateful PK transform
* 4) Write DML statements to SQL Database via jdbc
*/
Pipeline pipeline = Pipeline.create(options);
CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration = getDataSourceConfiguration(options);
validateOptions(options, dataSourceConfiguration);
Map<String, String> schemaMap = parseSchemaMap(options.getSchemaMap());
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
    pipeline.apply(
        new DataStreamIO(
                options.getStreamName(),
                options.getInputFilePattern(),
                options.getInputFileFormat(),
                options.getGcsPubSubSubscription(),
                options.getRfcStartDateTime())
            .withLowercaseSourceColumns()
            .withHashColumnValue("_metadata_row_id", "rowid"));
/*
 * Stage 2: Write JSON Strings to SQL DML Objects
 *   a) Convert JSON String FailsafeElements to DmlInfo objects (dmlStatements)
 * Stage 3: Filter stale rows using stateful PK transform
 */
PCollection<DmlInfo> dmlStatements =
    datastreamJsonRecords
        .apply("Format to DML", CreateDml.of(dataSourceConfiguration).withSchemaMap(schemaMap))
        .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
/*
* Stage 4: Write Inserts to CloudSQL
*/
dmlStatements.apply(
    "Write to SQL",
    CdcJdbcIO.<DmlInfo>write()
        .withDataSourceConfiguration(dataSourceConfiguration)
        .withStatementFormatter(
            new CdcJdbcIO.StatementFormatter<DmlInfo>() {
              @Override
              public String formatStatement(DmlInfo element) {
                return element.getDmlSql();
              }
            }));
// Execute the pipeline and return the result.
return pipeline.run();
}
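For reference, here is a minimal sketch of launching this template directly from a main method. It is not taken from the source: the package, the assumption that Options is nested inside DataStreamToSQL, and the flag names are inferred from the option getters used above.

// Hedged sketch: runs DataStreamToSQL.run() with options parsed from the command line.
// The package below and the exact flag names are assumptions, not confirmed by the source.
import com.google.cloud.teleport.v2.templates.DataStreamToSQL; // assumed package
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class LaunchDataStreamToSql {
  public static void main(String[] args) {
    // Expected flags include --streamName, --inputFilePattern, --inputFileFormat,
    // --gcsPubSubSubscription and --schemaMap (names assumed from the getters above).
    DataStreamToSQL.Options options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(DataStreamToSQL.Options.class);
    PipelineResult result = DataStreamToSQL.run(options);
    // Optional for a streaming template; blocks until the pipeline terminates.
    result.waitUntilFinish();
  }
}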
Use of com.google.cloud.teleport.v2.values.DmlInfo in project DataflowTemplates by GoogleCloudPlatform.
The class DataStreamToPostgres, method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to Postgres DML Objects
* 3) Filter stale rows using stateful PK transform
* 4) Write DML statements to Postgres
*/
Pipeline pipeline = Pipeline.create(options);
String jdbcDriverConnectionString =
    String.format(
        "jdbc:postgresql://%s:%s/%s",
        options.getDatabaseHost(), options.getDatabasePort(), options.getDatabaseName());
CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration =
    CdcJdbcIO.DataSourceConfiguration.create("org.postgresql.Driver", jdbcDriverConnectionString)
        .withUsername(options.getDatabaseUser())
        .withPassword(options.getDatabasePassword())
        .withMaxIdleConnections(Integer.valueOf(0));
validateOptions(options, dataSourceConfiguration);
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
    pipeline.apply(
        new DataStreamIO(
                options.getStreamName(),
                options.getInputFilePattern(),
                options.getInputFileFormat(),
                options.getGcsPubSubSubscription(),
                options.getRfcStartDateTime())
            .withLowercaseSourceColumns()
            .withHashColumnValue("_metadata_row_id", "rowid"));
/*
 * Stage 2: Write JSON Strings to Postgres DML Objects
 *   a) Convert JSON String FailsafeElements to DmlInfo objects (dmlStatements)
 * Stage 3: Filter stale rows using stateful PK transform
 */
PCollection<DmlInfo> dmlStatements =
    datastreamJsonRecords
        .apply("Format to Postgres DML", CreateDml.createDmlObjects(dataSourceConfiguration))
        .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
/*
* Stage 4: Write Inserts to CloudSQL
*/
dmlStatements.apply(
    "Write to Postgres",
    CdcJdbcIO.<DmlInfo>write()
        .withDataSourceConfiguration(dataSourceConfiguration)
        .withStatementFormatter(
            new CdcJdbcIO.StatementFormatter<DmlInfo>() {
              @Override
              public String formatStatement(DmlInfo element) {
                return element.getDmlSql();
              }
            }));
// Execute the pipeline and return the result.
return pipeline.run();
}
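The JDBC URL built above follows the standard PostgreSQL driver format. Below is a small, hedged pre-flight sketch, not part of the template, that assembles the same URL from example values and verifies connectivity with plain JDBC before launching the pipeline.

// Hedged sketch: sanity-checks the jdbc:postgresql://host:port/db URL the pipeline builds.
// Host, port, database, user and password are placeholder example values.
import java.sql.Connection;
import java.sql.DriverManager;

public class CheckPostgresConnection {
  public static void main(String[] args) throws Exception {
    String url = String.format(
        "jdbc:postgresql://%s:%s/%s", "10.128.0.4", "5432", "replica_db");
    try (Connection conn = DriverManager.getConnection(url, "postgres", "secret")) {
      System.out.println("Connected to: " + conn.getMetaData().getDatabaseProductVersion());
    }
  }
}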
Use of com.google.cloud.teleport.v2.values.DmlInfo in project DataflowTemplates by GoogleCloudPlatform.
The class DatabaseMigrationUtils, method convertJsonToDmlInfo.
public KV<String, DmlInfo> convertJsonToDmlInfo(FailsafeElement<String, String> element) {
String jsonString = element.getPayload();
ObjectMapper mapper = new ObjectMapper();
JsonNode rowObj;
try {
rowObj = mapper.readTree(jsonString);
} catch (IOException e) {
LOG.error("IOException: {} :: {}", jsonString, e.toString());
DmlInfo dmlInfo =
    DmlInfo.of(element.getOriginalPayload(), "", "", "",
        new ArrayList<String>(), new ArrayList<String>(),
        new ArrayList<String>(), new ArrayList<String>());
// TODO(dhercher): how should we handle bad data?
return KV.of(jsonString, dmlInfo);
}
try {
// Oracle uses upper case while Postgres uses all lowercase.
// We lowercase the values of these metadata fields to align with
// our schema conversion rules.
String schemaName = this.getPostgresSchemaName(rowObj);
String tableName = this.getPostgresTableName(rowObj);
Map<String, String> tableSchema = this.getTableSchema(schemaName, tableName);
List<String> primaryKeys = this.getPrimaryKeys(schemaName, tableName, rowObj);
List<String> orderByFields = Arrays.asList("_metadata_timestamp", "_metadata_scn");
List<String> primaryKeyValues = getFieldValues(rowObj, primaryKeys);
List<String> orderByValues = getFieldValues(rowObj, orderByFields);
if (tableSchema.isEmpty()) {
// If the table does not exist, supply an empty SQL value (NOOP).
DmlInfo dmlInfo =
    DmlInfo.of(element.getOriginalPayload(), "", schemaName, tableName,
        primaryKeys, orderByFields, primaryKeyValues, orderByValues);
return KV.of(jsonString, dmlInfo);
}
String dmlSql;
if (rowObj.get("_metadata_deleted").asBoolean()) {
dmlSql = convertJsonToDeleteSql(rowObj, tableSchema, schemaName, tableName, primaryKeys);
} else if (primaryKeys.size() == 0) {
// TODO(dhercher): Do we choose to support this case?
dmlSql = convertJsonToInsertSql(rowObj, tableSchema, schemaName, tableName);
} else {
dmlSql = convertJsonToUpsertSql(rowObj, tableSchema, schemaName, tableName, primaryKeys);
}
DmlInfo dmlInfo =
    DmlInfo.of(element.getOriginalPayload(), dmlSql, schemaName, tableName,
        primaryKeys, orderByFields, primaryKeyValues, orderByValues);
return KV.of(dmlInfo.getStateWindowKey(), dmlInfo);
} catch (Exception e) {
LOG.error("Value Error: {} :: {}", rowObj.toString(), e.toString());
DmlInfo dmlInfo =
    DmlInfo.of(element.getOriginalPayload(), "", "", "",
        new ArrayList<String>(), new ArrayList<String>(),
        new ArrayList<String>(), new ArrayList<String>());
// TODO(dhercher): how should we handle bad data?
return KV.of(jsonString, dmlInfo);
}
}
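A hedged sketch of exercising convertJsonToDmlInfo directly with a hand-written Datastream-style record. Only _metadata_deleted, _metadata_timestamp and _metadata_scn come from the method above; the data columns, the metadata values, and the helper class name ConvertJsonSketch are illustrative, and constructing DatabaseMigrationUtils (connection and schema settings) is out of scope here.

// Hedged sketch: feeds one synthetic change record through convertJsonToDmlInfo.
// (Import for DatabaseMigrationUtils omitted: its package depends on the project layout.)
import com.google.cloud.teleport.v2.values.DmlInfo;
import com.google.cloud.teleport.v2.values.FailsafeElement;
import org.apache.beam.sdk.values.KV;

public class ConvertJsonSketch {
  // The caller supplies an already-configured DatabaseMigrationUtils instance.
  public static void printDml(DatabaseMigrationUtils utils) {
    String json = "{\"id\":1,\"name\":\"alice\","
        + "\"_metadata_deleted\":false,"
        + "\"_metadata_timestamp\":1639530000,"
        + "\"_metadata_scn\":12345}";
    FailsafeElement<String, String> element = FailsafeElement.of(json, json);
    KV<String, DmlInfo> result = utils.convertJsonToDmlInfo(element);
    // An empty getDmlSql() means bad data or an unknown table (treated as a NOOP downstream);
    // otherwise this is the delete/insert/upsert statement chosen by the branches above.
    System.out.println(result.getValue().getDmlSql());
  }
}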
Use of com.google.cloud.teleport.v2.values.DmlInfo in project DataflowTemplates by GoogleCloudPlatform.
The class DatastreamToDML, method convertJsonToDmlInfo.
public DmlInfo convertJsonToDmlInfo(JsonNode rowObj, String failsafeValue) {
DatastreamRow row = DatastreamRow.of(rowObj);
try {
// Oracle uses upper case while Postgres uses all lowercase.
// We lowercase the values of these metadata fields to align with
// our schema conversion rules.
String catalogName = this.getTargetCatalogName(row);
String schemaName = this.getTargetSchemaName(row);
String tableName = this.getTargetTableName(row);
Map<String, String> tableSchema = this.getTableSchema(catalogName, schemaName, tableName);
if (tableSchema.isEmpty()) {
// If the table does not exist, return null (NOOP).
return null;
}
List<String> primaryKeys = this.getPrimaryKeys(catalogName, schemaName, tableName, rowObj);
List<String> orderByFields = row.getSortFields();
List<String> primaryKeyValues = getFieldValues(rowObj, primaryKeys, tableSchema);
List<String> orderByValues = getFieldValues(rowObj, orderByFields, tableSchema);
String dmlSqlTemplate = getDmlTemplate(rowObj, primaryKeys);
Map<String, String> sqlTemplateValues = getSqlTemplateValues(rowObj, catalogName, schemaName, tableName, primaryKeys, tableSchema);
String dmlSql = StringSubstitutor.replace(dmlSqlTemplate, sqlTemplateValues, "{", "}");
return DmlInfo.of(failsafeValue, dmlSql, schemaName, tableName, primaryKeys, orderByFields, primaryKeyValues, orderByValues);
} catch (DeletedWithoutPrimaryKey e) {
LOG.error("CDC Error: {} :: {}", rowObj.toString(), e.toString());
return null;
} catch (Exception e) {
// TODO(dhercher): Consider raising an error and pushing to DLQ
LOG.error("Value Error: {} :: {}", rowObj.toString(), e.toString());
return null;
}
}
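The final step above renders a DML template with Apache Commons Text's StringSubstitutor, using "{" and "}" as the variable markers. The following minimal, self-contained sketch shows just that mechanism; the template string and placeholder names are illustrative, not the project's actual DML templates.

// Hedged sketch of the StringSubstitutor.replace(template, values, "{", "}") call above.
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.text.StringSubstitutor;

public class DmlTemplateSketch {
  public static void main(String[] args) {
    String template =
        "INSERT INTO {schema_name}.{table_name} ({column_names}) VALUES ({column_values})";
    Map<String, String> values = new HashMap<>();
    values.put("schema_name", "public");
    values.put("table_name", "users");
    values.put("column_names", "id, name");
    values.put("column_values", "1, 'alice'");
    // "{" and "}" are passed as the variable prefix and suffix, matching the call above.
    System.out.println(StringSubstitutor.replace(template, values, "{", "}"));
    // Prints: INSERT INTO public.users (id, name) VALUES (1, 'alice')
  }
}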
Use of com.google.cloud.teleport.v2.values.DmlInfo in project DataflowTemplates by GoogleCloudPlatform.
The class DatastreamToDML, method processElement.
@ProcessElement
public void processElement(ProcessContext context) {
FailsafeElement<String, String> element = context.element();
String jsonString = element.getPayload();
ObjectMapper mapper = new ObjectMapper();
JsonNode rowObj;
try {
rowObj = mapper.readTree(jsonString);
DmlInfo dmlInfo = convertJsonToDmlInfo(rowObj, element.getOriginalPayload());
// Null rows suggest no DML is required.
if (dmlInfo != null) {
context.output(KV.of(dmlInfo.getStateWindowKey(), dmlInfo));
}
} catch (IOException e) {
// TODO(dhercher): Push failure to DLQ collection
LOG.error("IOException: {} :: {}", jsonString, e.toString());
}
}
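Keying the output by getStateWindowKey() is what lets the downstream stateful transform (ProcessDml.statefulOrderByPK() in the pipelines above) see every version of a given row on the same key and drop stale ones. The sketch below illustrates that idea only; it is not the project's ProcessDml implementation, getOrderByValues() is an assumed accessor for the order-by values passed to DmlInfo.of, and the lexicographic comparison is a simplification of the real ordering logic.

// Hedged sketch: a stateful DoFn that keeps, per state-window key, the newest sort key
// seen so far and drops anything older. This is NOT the project's ProcessDml transform.
import com.google.cloud.teleport.v2.values.DmlInfo;
import org.apache.beam.sdk.state.StateSpec;
import org.apache.beam.sdk.state.StateSpecs;
import org.apache.beam.sdk.state.ValueState;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;

class DropStaleRowsFn extends DoFn<KV<String, DmlInfo>, DmlInfo> {

  @StateId("latestSortKey")
  private final StateSpec<ValueState<String>> latestSortKeySpec = StateSpecs.value();

  @ProcessElement
  public void processElement(
      @Element KV<String, DmlInfo> element,
      @StateId("latestSortKey") ValueState<String> latestSortKey,
      OutputReceiver<DmlInfo> out) {
    // getOrderByValues() is assumed; it stands for the order-by values
    // (e.g. _metadata_timestamp, _metadata_scn) carried by the DmlInfo.
    String incoming = String.join(":", element.getValue().getOrderByValues());
    String newest = latestSortKey.read();
    // Lexicographic comparison is a simplification of the real ordering logic.
    if (newest == null || incoming.compareTo(newest) > 0) {
      latestSortKey.write(incoming);
      out.output(element.getValue());
    }
    // Otherwise the element is stale relative to what was already emitted and is dropped.
  }
}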