Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
From the class ShadowTableCreator, method constructShadowTable:
/*
 * Constructs a shadow table for a data table in the information schema.
 * Note: Shadow tables for interleaved tables are not interleaved to
 * their shadow parent table.
 */
Table constructShadowTable(Ddl informationSchema, String dataTableName) {
  // Create a new shadow table with the given prefix.
  Table.Builder shadowTableBuilder = Table.builder();
  String shadowTableName = shadowTablePrefix + dataTableName;
  shadowTableBuilder.name(shadowTableName);

  // Add key columns from the data table to the shadow table builder.
  Table dataTable = informationSchema.table(dataTableName);
  Set<String> primaryKeyColNames =
      dataTable.primaryKeys().stream().map(k -> k.name()).collect(Collectors.toSet());
  List<Column> primaryKeyCols =
      dataTable.columns().stream()
          .filter(col -> primaryKeyColNames.contains(col.name()))
          .collect(Collectors.toList());
  for (Column col : primaryKeyCols) {
    shadowTableBuilder.addColumn(col);
  }

  // Add primary key constraints.
  for (IndexColumn keyColumn : dataTable.primaryKeys()) {
    if (keyColumn.order() == IndexColumn.Order.ASC) {
      shadowTableBuilder.primaryKey().asc(keyColumn.name()).end();
    } else if (keyColumn.order() == IndexColumn.Order.DESC) {
      shadowTableBuilder.primaryKey().desc(keyColumn.name()).end();
    }
  }

  // Add extra column to track ChangeEventSequence information
  addChangeEventSequenceColumns(shadowTableBuilder);
  return shadowTableBuilder.build();
}
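As a usage sketch, a loop like the one below could build a shadow table for every data table in an existing Ddl. The shadowTableCreator instance, the allTables() iteration, and the prefix check are assumptions for illustration, not part of the quoted source.

// Hypothetical usage: construct a shadow table for each data table in the schema.
// `shadowTableCreator`, `shadowTablePrefix`, and `informationSchema` are assumed here.
List<Table> shadowTables = new ArrayList<>();
for (Table dataTable : informationSchema.allTables()) {
  if (dataTable.name().startsWith(shadowTablePrefix)) {
    continue; // skip tables that are already shadow tables
  }
  shadowTables.add(shadowTableCreator.constructShadowTable(informationSchema, dataTable.name()));
}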
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
From the class DatabaseMigrationUtils, method convertJsonToDmlInfo:
public KV<String, DmlInfo> convertJsonToDmlInfo(FailsafeElement<String, String> element) {
  String jsonString = element.getPayload();
  ObjectMapper mapper = new ObjectMapper();
  JsonNode rowObj;
  try {
    rowObj = mapper.readTree(jsonString);
  } catch (IOException e) {
    LOG.error("IOException: {} :: {}", jsonString, e.toString());
    DmlInfo dmlInfo =
        DmlInfo.of(
            element.getOriginalPayload(), "", "", "",
            new ArrayList<String>(), new ArrayList<String>(),
            new ArrayList<String>(), new ArrayList<String>());
    // TODO(dhercher): how should we handle bad data?
    return KV.of(jsonString, dmlInfo);
  }
  try {
    // Oracle uses upper case while Postgres uses all lowercase.
    // We lowercase the values of these metadata fields to align with
    // our schema conversion rules.
    String schemaName = this.getPostgresSchemaName(rowObj);
    String tableName = this.getPostgresTableName(rowObj);
    Map<String, String> tableSchema = this.getTableSchema(schemaName, tableName);
    List<String> primaryKeys = this.getPrimaryKeys(schemaName, tableName, rowObj);
    List<String> orderByFields = Arrays.asList("_metadata_timestamp", "_metadata_scn");
    List<String> primaryKeyValues = getFieldValues(rowObj, primaryKeys);
    List<String> orderByValues = getFieldValues(rowObj, orderByFields);
    if (tableSchema.isEmpty()) {
      // If the table DNE we supply an empty SQL value (NOOP)
      DmlInfo dmlInfo =
          DmlInfo.of(
              element.getOriginalPayload(), "", schemaName, tableName,
              primaryKeys, orderByFields, primaryKeyValues, orderByValues);
      return KV.of(jsonString, dmlInfo);
    }
    String dmlSql;
    if (rowObj.get("_metadata_deleted").asBoolean()) {
      dmlSql = convertJsonToDeleteSql(rowObj, tableSchema, schemaName, tableName, primaryKeys);
    } else if (primaryKeys.size() == 0) {
      // TODO(dhercher): Do we choose to support this case?
      dmlSql = convertJsonToInsertSql(rowObj, tableSchema, schemaName, tableName);
    } else {
      dmlSql = convertJsonToUpsertSql(rowObj, tableSchema, schemaName, tableName, primaryKeys);
    }
    DmlInfo dmlInfo =
        DmlInfo.of(
            element.getOriginalPayload(), dmlSql, schemaName, tableName,
            primaryKeys, orderByFields, primaryKeyValues, orderByValues);
    return KV.of(dmlInfo.getStateWindowKey(), dmlInfo);
  } catch (Exception e) {
    LOG.error("Value Error: {} :: {}", rowObj.toString(), e.toString());
    DmlInfo dmlInfo =
        DmlInfo.of(
            element.getOriginalPayload(), "", "", "",
            new ArrayList<String>(), new ArrayList<String>(),
            new ArrayList<String>(), new ArrayList<String>());
    // TODO(dhercher): how should we handle bad data?
    return KV.of(jsonString, dmlInfo);
  }
}
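A minimal sketch of how this conversion might be applied inside a Beam step, so that each failsafe JSON record becomes a (state window key, DmlInfo) pair. The enclosing DoFn, the `utils` instance, and the `failsafeJson` input collection are assumptions for illustration, not part of the quoted source.

// Hypothetical wiring: run convertJsonToDmlInfo per element inside a ParDo.
PCollection<KV<String, DmlInfo>> dmlRecords =
    failsafeJson.apply(
        "ConvertJsonToDml",
        ParDo.of(
            new DoFn<FailsafeElement<String, String>, KV<String, DmlInfo>>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                // `utils` is an assumed DatabaseMigrationUtils instance available to the DoFn.
                c.output(utils.convertJsonToDmlInfo(c.element()));
              }
            }));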
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
From the class DataplexBigQueryToGcsTest, method setUp:
@Before
public void setUp() throws InterruptedException, IOException {
  options = TestPipeline.testingPipelineOptions().as(DataplexBigQueryToGcsOptions.class);
  options.setProject(PROJECT);
  options.setUpdateDataplexMetadata(true);
  options.setEnforceSamePartitionKey(false);
  // Required when using BigQueryIO.withMethod(EXPORT).
  options.setTempLocation(tmpDir.newFolder("bqTmp").getAbsolutePath());
  outDir = tmpDir.newFolder("out");
  bqSchema =
      new TableSchema()
          .setFields(ImmutableList.of(
              new TableFieldSchema().setName("ts").setType("TIMESTAMP"),
              new TableFieldSchema().setName("s1").setType("STRING"),
              new TableFieldSchema().setName("d1").setType("DATE"),
              new TableFieldSchema().setName("t1").setType("TIME").setMode("REQUIRED"),
              new TableFieldSchema().setName("dt").setType("DATETIME"),
              new TableFieldSchema().setName("i1").setType("INTEGER")));
  avroSchema =
      new Schema.Parser()
          .parse("{\"type\":\"record\",\"name\":\"__root__\",\"fields\":"
              + "[{\"name\":\"ts\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}]},"
              + "{\"name\":\"s1\",\"type\":[\"null\",\"string\"]},"
              + "{\"name\":\"d1\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}]},"
              + "{\"name\":\"t1\",\"type\":{\"type\":\"long\",\"logicalType\":\"time-micros\"}},"
              + "{\"name\":\"dt\",\"type\":[\"null\",{\"type\":\"string\",\"logicalType\":\"datetime\"}]},"
              + "{\"name\":\"i1\",\"type\":[\"null\",\"long\"]}]}");
  long modTime = System.currentTimeMillis() * 1000;
  BigQueryTablePartition p1 =
      BigQueryTablePartition.builder().setPartitionName("p1").setLastModificationTime(modTime).build();
  BigQueryTablePartition p2 =
      BigQueryTablePartition.builder().setPartitionName("p2").setLastModificationTime(modTime).build();
  BigQueryTable t1 =
      BigQueryTable.builder().setTableName("partitioned_table").setProject(PROJECT).setDataset(DATASET)
          .setSchema(avroSchema).setLastModificationTime(modTime).setPartitioningColumn("ts")
          .setPartitions(Arrays.asList(p1, p2)).build();
  BigQueryTable t2 =
      BigQueryTable.builder().setTableName("unpartitioned_table").setProject(PROJECT).setDataset(DATASET)
          .setSchema(avroSchema).setLastModificationTime(modTime).build();
  tableByName = new HashMap<>();
  tableByName.put(t1.getTableName(), t1);
  tableByName.put(t2.getTableName(), t2);
  defaultRecords = new TableRow[] {
    new TableRow().set("ts", 1L).set("s1", "1001").set("d1", "1970-01-01").set("t1", "00:00:00.000001").set("dt", "2020-01-01T00:42:00.123").set("i1", 2001L),
    new TableRow().set("ts", 2L).set("s1", "1002").set("d1", "1970-01-02").set("t1", "00:00:00.000002").set("dt", "2020-01-02T00:42:00.123").set("i1", 2002L),
    new TableRow().set("ts", 3L).set("s1", "1003").set("d1", "1970-01-03").set("t1", "00:00:00.000003").set("dt", "2020-01-03T00:42:00.123").set("i1", 2003L),
    new TableRow().set("ts", 4L).set("s1", "1004").set("d1", "1970-01-04").set("t1", "00:00:00.000004").set("dt", "2020-01-04T00:42:00.123").set("i1", null),
    new TableRow().set("ts", 5L).set("s1", "1005").set("d1", "1970-01-05").set("t1", "00:00:00.000005").set("dt", "2020-01-05T00:42:00.123").set("i1", 2005L)
  };
  defaultExpectedRecords = new String[] {
    "{\"ts\": 1, \"s1\": \"1001\", \"d1\": 0, \"t1\": 1, \"dt\": \"2020-01-01T00:42:00.123\", \"i1\": 2001}",
    "{\"ts\": 2, \"s1\": \"1002\", \"d1\": 1, \"t1\": 2, \"dt\": \"2020-01-02T00:42:00.123\", \"i1\": 2002}",
    "{\"ts\": 3, \"s1\": \"1003\", \"d1\": 2, \"t1\": 3, \"dt\": \"2020-01-03T00:42:00.123\", \"i1\": 2003}",
    "{\"ts\": 4, \"s1\": \"1004\", \"d1\": 3, \"t1\": 4, \"dt\": \"2020-01-04T00:42:00.123\", \"i1\": null}",
    "{\"ts\": 5, \"s1\": \"1005\", \"d1\": 4, \"t1\": 5, \"dt\": \"2020-01-05T00:42:00.123\", \"i1\": 2005}"
  };
  FakeDatasetService.setUp();
  fakeDatasetService = new FakeDatasetService();
  fakeDatasetService.createDataset(PROJECT, DATASET, "", "", null);
  fakeDatasetService.createTable(
      new Table().setTableReference(t1.toTableReference()).setSchema(bqSchema).setRequirePartitionFilter(true)
          .setTimePartitioning(new TimePartitioning().setField("ts").setType("DAY")));
  fakeDatasetService.createTable(new Table().setTableReference(t2.toTableReference()).setSchema(bqSchema));
  fakeJobService = new CustomFakeJobService();
  bqFakeServices =
      new FakeBigQueryServices().withJobService(fakeJobService).withDatasetService(fakeDatasetService);
  when(tableResultMock.iterateAll())
      .thenReturn(Collections.singleton(fields("unpartitioned_table", "0", null)));
  when(bqMock.query(any())).thenReturn(tableResultMock);
  when(bqMock.delete(any(TableId.class))).thenReturn(true);
  when(bqsMock.createReadSession(any()))
      .thenReturn(ReadSession.newBuilder()
          .setAvroSchema(AvroSchema.newBuilder().setSchema(avroSchema.toString())).build());
  metadataLoader = new BigQueryMetadataLoader(bqMock, bqsMock, MAX_PARALLEL_REQUESTS);
}
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
From the class SpannerChangeStreamsToGcsTest, method testWriteToGCSAvro:
@Test
@Category(IntegrationTest.class)
// mvn -Dexcluded.spanner.tests="" -Dtest=SpannerChangeStreamsToGcsTest test
public void testWriteToGCSAvro() throws Exception {
  // Create a test database.
  String testDatabase = generateDatabaseName();
  fakeDir = tmpDir.newFolder("output").getAbsolutePath();
  fakeTempLocation = tmpDir.newFolder("temporaryLocation").getAbsolutePath();
  spannerServer.dropDatabase(testDatabase);
  // Create a table and a change stream.
  List<String> statements = new ArrayList<String>();
  final String createTable =
      "CREATE TABLE " + TEST_TABLE + " ("
          + "user_id INT64 NOT NULL,"
          + "name STRING(MAX) "
          + ") PRIMARY KEY(user_id)";
  final String createChangeStream = "CREATE CHANGE STREAM " + TEST_CHANGE_STREAM + " FOR Users";
  statements.add(createTable);
  statements.add(createChangeStream);
  spannerServer.createDatabase(testDatabase, statements);
  Timestamp startTimestamp = Timestamp.now();
  // Create mutations for the table that will generate 1 data change record.
  List<Mutation> mutations = new ArrayList<>();
  mutations.add(Mutation.newInsertBuilder(TEST_TABLE).set("user_id").to(1).set("name").to("Name1").build());
  mutations.add(Mutation.newInsertBuilder(TEST_TABLE).set("user_id").to(2).set("name").to("Name2").build());
  spannerServer.getDbClient(testDatabase).write(mutations);
  Timestamp endTimestamp = Timestamp.now();
  SpannerChangeStreamsToGcsOptions options =
      PipelineOptionsFactory.create().as(SpannerChangeStreamsToGcsOptions.class);
  options.setSpannerProjectId(TEST_PROJECT);
  options.setSpannerInstanceId(TEST_INSTANCE);
  options.setSpannerDatabase(testDatabase);
  options.setSpannerMetadataInstanceId(TEST_INSTANCE);
  options.setSpannerMetadataDatabase(testDatabase);
  options.setSpannerChangeStreamName(TEST_CHANGE_STREAM);
  options.setStartTimestamp(startTimestamp.toString());
  options.setEndTimestamp(endTimestamp.toString());
  List<String> experiments = new ArrayList<String>();
  options.setExperiments(experiments);
  options.setOutputFileFormat(FileFormat.AVRO);
  options.setGcsOutputDirectory(fakeDir);
  options.setOutputFilenamePrefix(AVRO_FILENAME_PREFIX);
  options.setNumShards(NUM_SHARDS);
  options.setTempLocation(fakeTempLocation);
  // Run the pipeline.
  PipelineResult result = run(options);
  result.waitUntilFinish();
  // Read from the output Avro files to assert that 1 data change record has been generated.
  PCollection<com.google.cloud.teleport.v2.DataChangeRecord> dataChangeRecords =
      pipeline.apply("readRecords",
          AvroIO.read(com.google.cloud.teleport.v2.DataChangeRecord.class)
              .from(fakeDir + "/avro-output-*.avro"));
  PAssert.that(dataChangeRecords).satisfies(new VerifyDataChangeRecordAvro());
  pipeline.run();
  // Drop the database.
  spannerServer.dropDatabase(testDatabase);
}
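For reference, a verifier passed to PAssert.satisfies is simply a SerializableFunction over the materialized records. The sketch below shows what a class like VerifyDataChangeRecordAvro might look like; it is not the project's actual implementation, and the DataChangeRecord getter names and JUnit assertions here are assumptions for illustration.

// Hypothetical verifier sketch; getter names on the generated Avro class are assumed.
static class VerifyDataChangeRecordAvro
    implements SerializableFunction<Iterable<com.google.cloud.teleport.v2.DataChangeRecord>, Void> {
  @Override
  public Void apply(Iterable<com.google.cloud.teleport.v2.DataChangeRecord> actualRecords) {
    for (com.google.cloud.teleport.v2.DataChangeRecord record : actualRecords) {
      // Each record should come from the test table and carry at least one mod.
      assertEquals(TEST_TABLE, record.getTableName().toString());
      assertTrue(record.getMods().size() > 0);
    }
    return null;
  }
}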
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
From the class StreamingDataGeneratorWriteToBigQuery, method expand:
@Override
public PDone expand(PCollection<byte[]> fakeMessages) {
  WriteResult writeResults =
      fakeMessages.apply(
          "Write Json messages",
          BigQueryIO.<byte[]>write()
              .to(getPipelineOptions().getOutputTableSpec())
              .withMethod(Method.STREAMING_INSERTS)
              .ignoreInsertIds()
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(WriteDisposition.valueOf(getPipelineOptions().getWriteDisposition()))
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withExtendedErrorInfo()
              .withFormatFunction((message) -> {
                TableRow row = null;
                try {
                  row = TableRowJsonCoder.of().decode(new ByteArrayInputStream(message), Coder.Context.OUTER);
                } catch (IOException e) {
                  throw new RuntimeException("Failed converting to TableRow with an error: " + e.getMessage());
                }
                return row;
              }));
  // Write errors to the Dead Letter table.
  writeResults
      .getFailedInsertsWithErr()
      .apply("Convert to FailSafe Element",
          MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
              .via(BigQueryConverters::wrapBigQueryInsertError))
      .setCoder(FAILSAFE_ELEMENT_CODER)
      .apply("Write Failed Records",
          ErrorConverters.WriteStringMessageErrors.newBuilder()
              .setErrorRecordsTable(MoreObjects.firstNonNull(
                  getPipelineOptions().getOutputDeadletterTable(),
                  getPipelineOptions().getOutputTableSpec() + DEFAULT_DEADLETTER_TABLE_SUFFIX))
              .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA) // i.e. schema in above method
              .build());
  return PDone.in(fakeMessages.getPipeline());
}
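For context, a sink transform like this is typically attached to the generator pipeline roughly as sketched below; the pipeline wiring, the generateFakeJsonMessages step, and the writeToBigQuery factory are illustrative assumptions rather than the template's actual API.

// Hypothetical wiring sketch; the generator step and factory method names are assumed.
Pipeline pipeline = Pipeline.create(options);
PCollection<byte[]> fakeMessages =
    pipeline.apply("GenerateFakeMessages", generateFakeJsonMessages(options));
fakeMessages.apply("WriteJsonToBigQuery", writeToBigQuery(options));
pipeline.run();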