
Example 16 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in the DataflowTemplates project by GoogleCloudPlatform.

The class ShadowTableCreator, method constructShadowTable.

/*
   * Constructs a shadow table for a data table in the information schema.
   * Note: Shadow tables for interleaved tables are not interleaved to
   * their shadow parent table.
   */
Table constructShadowTable(Ddl informationSchema, String dataTableName) {
    // Create a new shadow table with the given prefix.
    Table.Builder shadowTableBuilder = Table.builder();
    String shadowTableName = shadowTablePrefix + dataTableName;
    shadowTableBuilder.name(shadowTableName);
    // Add key columns from the data table to the shadow table builder.
    Table dataTable = informationSchema.table(dataTableName);
    Set<String> primaryKeyColNames = dataTable.primaryKeys().stream().map(k -> k.name()).collect(Collectors.toSet());
    List<Column> primaryKeyCols = dataTable.columns().stream().filter(col -> primaryKeyColNames.contains(col.name())).collect(Collectors.toList());
    for (Column col : primaryKeyCols) {
        shadowTableBuilder.addColumn(col);
    }
    // Add primary key constraints.
    for (IndexColumn keyColumn : dataTable.primaryKeys()) {
        if (keyColumn.order() == IndexColumn.Order.ASC) {
            shadowTableBuilder.primaryKey().asc(keyColumn.name()).end();
        } else if (keyColumn.order() == IndexColumn.Order.DESC) {
            shadowTableBuilder.primaryKey().desc(keyColumn.name()).end();
        }
    }
    // Add extra column to track ChangeEventSequence information
    addChangeEventSequenceColumns(shadowTableBuilder);
    return shadowTableBuilder.build();
}
Also used: List(java.util.List) Pair(org.apache.commons.lang3.tuple.Pair) DatastreamConstants(com.google.cloud.teleport.v2.templates.datastream.DatastreamConstants) Ddl(com.google.cloud.teleport.v2.templates.spanner.ddl.Ddl) IndexColumn(com.google.cloud.teleport.v2.templates.spanner.ddl.IndexColumn) Column(com.google.cloud.teleport.v2.templates.spanner.ddl.Column) Map(java.util.Map) Table(com.google.cloud.teleport.v2.templates.spanner.ddl.Table) Set(java.util.Set) Collectors(java.util.stream.Collectors)
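A minimal sketch of how the constructShadowTable helper above might be driven by a caller. This is not part of the example: the method name shown there is the only thing taken from it, and the helper below (its name, the "shadow_" prefix, and the caller-supplied ShadowTableCreator, Ddl, and table-name list) is purely illustrative. Because constructShadowTable is package-private, such a caller would have to live in the same package.

// Hypothetical caller (all names here are illustrative, not part of the example above).
static List<Table> buildShadowTables(
        ShadowTableCreator shadowTableCreator, Ddl informationSchema, List<String> dataTableNames) {
    String shadowTablePrefix = "shadow_"; // assumed prefix; the real value is configured on ShadowTableCreator
    List<Table> shadowTables = new ArrayList<>();
    for (String dataTableName : dataTableNames) {
        // Never build a shadow table for a table that already carries the shadow prefix.
        if (dataTableName.startsWith(shadowTablePrefix)) {
            continue;
        }
        shadowTables.add(shadowTableCreator.constructShadowTable(informationSchema, dataTableName));
    }
    return shadowTables;
}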

Example 17 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in the DataflowTemplates project by GoogleCloudPlatform.

The class DatabaseMigrationUtils, method convertJsonToDmlInfo.

public KV<String, DmlInfo> convertJsonToDmlInfo(FailsafeElement<String, String> element) {
    String jsonString = element.getPayload();
    ObjectMapper mapper = new ObjectMapper();
    JsonNode rowObj;
    try {
        rowObj = mapper.readTree(jsonString);
    } catch (IOException e) {
        LOG.error("IOException: {} :: {}", jsonString, e.toString());
        DmlInfo dmlInfo = DmlInfo.of(element.getOriginalPayload(), "", "", "", new ArrayList<String>(), new ArrayList<String>(), new ArrayList<String>(), new ArrayList<String>());
        // TODO(dhercher): how should we handle bad data?
        return KV.of(jsonString, dmlInfo);
    }
    try {
        // Oracle uses upper case while Postgres uses all lowercase.
        // We lowercase the values of these metadata fields to align with
        // our schema conversion rules.
        String schemaName = this.getPostgresSchemaName(rowObj);
        String tableName = this.getPostgresTableName(rowObj);
        Map<String, String> tableSchema = this.getTableSchema(schemaName, tableName);
        List<String> primaryKeys = this.getPrimaryKeys(schemaName, tableName, rowObj);
        List<String> orderByFields = Arrays.asList("_metadata_timestamp", "_metadata_scn");
        List<String> primaryKeyValues = getFieldValues(rowObj, primaryKeys);
        List<String> orderByValues = getFieldValues(rowObj, orderByFields);
        if (tableSchema.isEmpty()) {
            // If the table does not exist, supply an empty SQL value (no-op).
            DmlInfo dmlInfo = DmlInfo.of(element.getOriginalPayload(), "", schemaName, tableName, primaryKeys, orderByFields, primaryKeyValues, orderByValues);
            return KV.of(jsonString, dmlInfo);
        }
        String dmlSql;
        if (rowObj.get("_metadata_deleted").asBoolean()) {
            dmlSql = convertJsonToDeleteSql(rowObj, tableSchema, schemaName, tableName, primaryKeys);
        } else if (primaryKeys.size() == 0) {
            // TODO(dhercher): Do we choose to support this case?
            dmlSql = convertJsonToInsertSql(rowObj, tableSchema, schemaName, tableName);
        } else {
            dmlSql = convertJsonToUpsertSql(rowObj, tableSchema, schemaName, tableName, primaryKeys);
        }
        DmlInfo dmlInfo = DmlInfo.of(element.getOriginalPayload(), dmlSql, schemaName, tableName, primaryKeys, orderByFields, primaryKeyValues, orderByValues);
        return KV.of(dmlInfo.getStateWindowKey(), dmlInfo);
    } catch (Exception e) {
        LOG.error("Value Error: {} :: {}", rowObj.toString(), e.toString());
        DmlInfo dmlInfo = DmlInfo.of(element.getOriginalPayload(), "", "", "", new ArrayList<String>(), new ArrayList<String>(), new ArrayList<String>(), new ArrayList<String>());
        // TODO(dhercher): how should we handle bad data?
        return KV.of(jsonString, dmlInfo);
    }
}
Also used: ArrayList(java.util.ArrayList) JsonNode(org.codehaus.jackson.JsonNode) IOException(java.io.IOException) DmlInfo(com.google.cloud.teleport.v2.values.DmlInfo) ObjectMapper(org.codehaus.jackson.map.ObjectMapper) SQLException(java.sql.SQLException)
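A hedged sketch of wrapping convertJsonToDmlInfo in a Beam DoFn, assuming a serializable DatabaseMigrationUtils instance is constructed elsewhere; the class and variable names below are illustrative, and the only call taken from the example is convertJsonToDmlInfo itself. Beyond the list above, it needs org.apache.beam.sdk.transforms.DoFn, org.apache.beam.sdk.values.KV, and com.google.cloud.teleport.v2.values.FailsafeElement.

// Illustrative DoFn wrapper around the method shown above.
static class JsonToDmlInfoFn extends DoFn<FailsafeElement<String, String>, KV<String, DmlInfo>> {

    private final DatabaseMigrationUtils migrationUtils;

    JsonToDmlInfoFn(DatabaseMigrationUtils migrationUtils) {
        this.migrationUtils = migrationUtils;
    }

    @ProcessElement
    public void processElement(ProcessContext context) {
        // The method always returns a KV, even for unparseable payloads
        // (it falls back to an empty DmlInfo), so nothing is dropped here.
        context.output(migrationUtils.convertJsonToDmlInfo(context.element()));
    }
}

It could then be applied with something like input.apply(ParDo.of(new JsonToDmlInfoFn(migrationUtils))).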

Example 18 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in the DataflowTemplates project by GoogleCloudPlatform.

The class DataplexBigQueryToGcsTest, method setUp.

@Before
public void setUp() throws InterruptedException, IOException {
    options = TestPipeline.testingPipelineOptions().as(DataplexBigQueryToGcsOptions.class);
    options.setProject(PROJECT);
    options.setUpdateDataplexMetadata(true);
    options.setEnforceSamePartitionKey(false);
    // Required when using BigQueryIO.withMethod(EXPORT).
    options.setTempLocation(tmpDir.newFolder("bqTmp").getAbsolutePath());
    outDir = tmpDir.newFolder("out");
    bqSchema =
        new TableSchema()
            .setFields(
                ImmutableList.of(
                    new TableFieldSchema().setName("ts").setType("TIMESTAMP"),
                    new TableFieldSchema().setName("s1").setType("STRING"),
                    new TableFieldSchema().setName("d1").setType("DATE"),
                    new TableFieldSchema().setName("t1").setType("TIME").setMode("REQUIRED"),
                    new TableFieldSchema().setName("dt").setType("DATETIME"),
                    new TableFieldSchema().setName("i1").setType("INTEGER")));
    avroSchema =
        new Schema.Parser()
            .parse(
                "{\"type\":\"record\",\"name\":\"__root__\",\"fields\":"
                    + "[{\"name\":\"ts\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}]},"
                    + "{\"name\":\"s1\",\"type\":[\"null\",\"string\"]},"
                    + "{\"name\":\"d1\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}]},"
                    + "{\"name\":\"t1\",\"type\":{\"type\":\"long\",\"logicalType\":\"time-micros\"}},"
                    + "{\"name\":\"dt\",\"type\":[\"null\",{\"type\":\"string\",\"logicalType\":\"datetime\"}]},"
                    + "{\"name\":\"i1\",\"type\":[\"null\",\"long\"]}]}");
    long modTime = System.currentTimeMillis() * 1000;
    BigQueryTablePartition p1 = BigQueryTablePartition.builder().setPartitionName("p1").setLastModificationTime(modTime).build();
    BigQueryTablePartition p2 = BigQueryTablePartition.builder().setPartitionName("p2").setLastModificationTime(modTime).build();
    BigQueryTable t1 =
        BigQueryTable.builder()
            .setTableName("partitioned_table")
            .setProject(PROJECT)
            .setDataset(DATASET)
            .setSchema(avroSchema)
            .setLastModificationTime(modTime)
            .setPartitioningColumn("ts")
            .setPartitions(Arrays.asList(p1, p2))
            .build();
    BigQueryTable t2 =
        BigQueryTable.builder()
            .setTableName("unpartitioned_table")
            .setProject(PROJECT)
            .setDataset(DATASET)
            .setSchema(avroSchema)
            .setLastModificationTime(modTime)
            .build();
    tableByName = new HashMap<>();
    tableByName.put(t1.getTableName(), t1);
    tableByName.put(t2.getTableName(), t2);
    defaultRecords =
        new TableRow[] {
          new TableRow().set("ts", 1L).set("s1", "1001").set("d1", "1970-01-01").set("t1", "00:00:00.000001").set("dt", "2020-01-01T00:42:00.123").set("i1", 2001L),
          new TableRow().set("ts", 2L).set("s1", "1002").set("d1", "1970-01-02").set("t1", "00:00:00.000002").set("dt", "2020-01-02T00:42:00.123").set("i1", 2002L),
          new TableRow().set("ts", 3L).set("s1", "1003").set("d1", "1970-01-03").set("t1", "00:00:00.000003").set("dt", "2020-01-03T00:42:00.123").set("i1", 2003L),
          new TableRow().set("ts", 4L).set("s1", "1004").set("d1", "1970-01-04").set("t1", "00:00:00.000004").set("dt", "2020-01-04T00:42:00.123").set("i1", null),
          new TableRow().set("ts", 5L).set("s1", "1005").set("d1", "1970-01-05").set("t1", "00:00:00.000005").set("dt", "2020-01-05T00:42:00.123").set("i1", 2005L)
        };
    defaultExpectedRecords =
        new String[] {
          "{\"ts\": 1, \"s1\": \"1001\", \"d1\": 0, \"t1\": 1, \"dt\": \"2020-01-01T00:42:00.123\"," + " \"i1\": 2001}",
          "{\"ts\": 2, \"s1\": \"1002\", \"d1\": 1, \"t1\": 2, \"dt\": \"2020-01-02T00:42:00.123\"," + " \"i1\": 2002}",
          "{\"ts\": 3, \"s1\": \"1003\", \"d1\": 2, \"t1\": 3, \"dt\": \"2020-01-03T00:42:00.123\"," + " \"i1\": 2003}",
          "{\"ts\": 4, \"s1\": \"1004\", \"d1\": 3, \"t1\": 4, \"dt\": \"2020-01-04T00:42:00.123\"," + " \"i1\": null}",
          "{\"ts\": 5, \"s1\": \"1005\", \"d1\": 4, \"t1\": 5, \"dt\": \"2020-01-05T00:42:00.123\"," + " \"i1\": 2005}"
        };
    FakeDatasetService.setUp();
    fakeDatasetService = new FakeDatasetService();
    fakeDatasetService.createDataset(PROJECT, DATASET, "", "", null);
    fakeDatasetService.createTable(
        new Table()
            .setTableReference(t1.toTableReference())
            .setSchema(bqSchema)
            .setRequirePartitionFilter(true)
            .setTimePartitioning(new TimePartitioning().setField("ts").setType("DAY")));
    fakeDatasetService.createTable(new Table().setTableReference(t2.toTableReference()).setSchema(bqSchema));
    fakeJobService = new CustomFakeJobService();
    bqFakeServices = new FakeBigQueryServices().withJobService(fakeJobService).withDatasetService(fakeDatasetService);
    when(tableResultMock.iterateAll()).thenReturn(Collections.singleton(fields("unpartitioned_table", "0", null)));
    when(bqMock.query(any())).thenReturn(tableResultMock);
    when(bqMock.delete(any(TableId.class))).thenReturn(true);
    when(bqsMock.createReadSession(any())).thenReturn(ReadSession.newBuilder().setAvroSchema(AvroSchema.newBuilder().setSchema(avroSchema.toString())).build());
    metadataLoader = new BigQueryMetadataLoader(bqMock, bqsMock, MAX_PARALLEL_REQUESTS);
}
Also used: TableId(com.google.cloud.bigquery.TableId) BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) BigQueryMetadataLoader(com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TimePartitioning(com.google.api.services.bigquery.model.TimePartitioning) FakeDatasetService(org.apache.beam.sdk.io.gcp.testing.FakeDatasetService) TableRow(com.google.api.services.bigquery.model.TableRow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices) DataplexBigQueryToGcsOptions(com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions) Before(org.junit.Before)
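A small standalone check, not tied to the template code, of the logical-type arithmetic behind defaultExpectedRecords above: Avro's date logical type stores days since 1970-01-01 and time-micros stores microseconds since midnight, which is why "1970-01-03" appears as 2 and "00:00:00.000003" as 3 in the expected output. The class name is arbitrary.

import java.time.LocalDate;
import java.time.LocalTime;

public class AvroLogicalTypeCheck {
    public static void main(String[] args) {
        // Days since the Unix epoch for the d1 column of the third record.
        long d1 = LocalDate.parse("1970-01-03").toEpochDay();                 // 2
        // Microseconds since midnight for the t1 column of the third record.
        long t1 = LocalTime.parse("00:00:00.000003").toNanoOfDay() / 1_000L;  // 3
        System.out.println("d1=" + d1 + ", t1=" + t1);
    }
}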

Example 19 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in the DataflowTemplates project by GoogleCloudPlatform.

The class SpannerChangeStreamsToGcsTest, method testWriteToGCSAvro.

@Test
@Category(IntegrationTest.class)
// mvn -Dexcluded.spanner.tests="" -Dtest=SpannerChangeStreamsToGcsTest test
public void testWriteToGCSAvro() throws Exception {
    // Create a test database.
    String testDatabase = generateDatabaseName();
    fakeDir = tmpDir.newFolder("output").getAbsolutePath();
    fakeTempLocation = tmpDir.newFolder("temporaryLocation").getAbsolutePath();
    spannerServer.dropDatabase(testDatabase);
    // Create a table.
    List<String> statements = new ArrayList<String>();
    final String createTable = "CREATE TABLE " + TEST_TABLE + " (" + "user_id INT64 NOT NULL," + "name STRING(MAX) " + ") PRIMARY KEY(user_id)";
    final String createChangeStream = "CREATE CHANGE STREAM " + TEST_CHANGE_STREAM + " FOR Users";
    statements.add(createTable);
    statements.add(createChangeStream);
    spannerServer.createDatabase(testDatabase, statements);
    Timestamp startTimestamp = Timestamp.now();
    // Create mutations for the table; both inserts are applied in one write and generate 1 data change record.
    List<Mutation> mutations = new ArrayList<>();
    mutations.add(Mutation.newInsertBuilder(TEST_TABLE).set("user_id").to(1).set("name").to("Name1").build());
    mutations.add(Mutation.newInsertBuilder(TEST_TABLE).set("user_id").to(2).set("name").to("Name2").build());
    spannerServer.getDbClient(testDatabase).write(mutations);
    Timestamp endTimestamp = Timestamp.now();
    SpannerChangeStreamsToGcsOptions options = PipelineOptionsFactory.create().as(SpannerChangeStreamsToGcsOptions.class);
    options.setSpannerProjectId(TEST_PROJECT);
    options.setSpannerInstanceId(TEST_INSTANCE);
    options.setSpannerDatabase(testDatabase);
    options.setSpannerMetadataInstanceId(TEST_INSTANCE);
    options.setSpannerMetadataDatabase(testDatabase);
    options.setSpannerChangeStreamName(TEST_CHANGE_STREAM);
    options.setStartTimestamp(startTimestamp.toString());
    options.setEndTimestamp(endTimestamp.toString());
    List<String> experiments = new ArrayList<String>();
    options.setExperiments(experiments);
    options.setOutputFileFormat(FileFormat.AVRO);
    options.setGcsOutputDirectory(fakeDir);
    options.setOutputFilenamePrefix(AVRO_FILENAME_PREFIX);
    options.setNumShards(NUM_SHARDS);
    options.setTempLocation(fakeTempLocation);
    // Run the pipeline.
    PipelineResult result = run(options);
    result.waitUntilFinish();
    // Read from the output Avro file to assert that 1 data change record has been generated.
    PCollection<com.google.cloud.teleport.v2.DataChangeRecord> dataChangeRecords = pipeline.apply("readRecords", AvroIO.read(com.google.cloud.teleport.v2.DataChangeRecord.class).from(fakeDir + "/avro-output-*.avro"));
    PAssert.that(dataChangeRecords).satisfies(new VerifyDataChangeRecordAvro());
    pipeline.run();
    // Drop the database.
    spannerServer.dropDatabase(testDatabase);
}
Also used : ArrayList(java.util.ArrayList) DataChangeRecord(org.apache.beam.sdk.io.gcp.spanner.changestreams.model.DataChangeRecord) PipelineResult(org.apache.beam.sdk.PipelineResult) Timestamp(com.google.cloud.Timestamp) SpannerChangeStreamsToGcsOptions(com.google.cloud.teleport.v2.options.SpannerChangeStreamsToGcsOptions) Mutation(com.google.cloud.spanner.Mutation) Category(org.junit.experimental.categories.Category) Test(org.junit.Test) IntegrationTest(com.google.cloud.teleport.v2.spanner.IntegrationTest)
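The VerifyDataChangeRecordAvro assertion referenced above is not shown in this example. A hedged sketch of the minimal shape such a check could take, using only Beam's PAssert.satisfies contract (a SerializableFunction from Iterable<T> to Void) and JUnit's assertTrue; the class name and the exact assertion are illustrative, and it additionally needs org.apache.beam.sdk.transforms.SerializableFunction and static org.junit.Assert.assertTrue.

// Illustrative stand-in for VerifyDataChangeRecordAvro: only checks that the
// change stream produced at least one record in the Avro output.
static class VerifyAtLeastOneDataChangeRecord
        implements SerializableFunction<Iterable<com.google.cloud.teleport.v2.DataChangeRecord>, Void> {
    @Override
    public Void apply(Iterable<com.google.cloud.teleport.v2.DataChangeRecord> records) {
        int count = 0;
        for (com.google.cloud.teleport.v2.DataChangeRecord record : records) {
            count++;
        }
        assertTrue("expected at least one data change record in the Avro output", count > 0);
        return null;
    }
}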

Example 20 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in the DataflowTemplates project by GoogleCloudPlatform.

The class StreamingDataGeneratorWriteToBigQuery, method expand.

@Override
public PDone expand(PCollection<byte[]> fakeMessages) {
    WriteResult writeResults =
        fakeMessages.apply(
            "Write Json messsages",
            BigQueryIO.<byte[]>write()
                .to(getPipelineOptions().getOutputTableSpec())
                .withMethod(Method.STREAMING_INSERTS)
                .ignoreInsertIds()
                .withCreateDisposition(CreateDisposition.CREATE_NEVER)
                .withWriteDisposition(WriteDisposition.valueOf(getPipelineOptions().getWriteDisposition()))
                .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                .withExtendedErrorInfo()
                .withFormatFunction(
                    (message) -> {
                        TableRow row = null;
                        try {
                            row = TableRowJsonCoder.of().decode(new ByteArrayInputStream(message), Coder.Context.OUTER);
                        } catch (IOException e) {
                            throw new RuntimeException("Failed converting to TableRow with an error:" + e.getMessage());
                        }
                        return row;
                    }));
    // Write errors to Dead Letter table
    writeResults
        .getFailedInsertsWithErr()
        .apply(
            "Convert to FailSafe Element",
            MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                .via(BigQueryConverters::wrapBigQueryInsertError))
        .setCoder(FAILSAFE_ELEMENT_CODER)
        .apply(
            "Write Failed Records",
            ErrorConverters.WriteStringMessageErrors.newBuilder()
                .setErrorRecordsTable(
                    MoreObjects.firstNonNull(
                        getPipelineOptions().getOutputDeadletterTable(),
                        getPipelineOptions().getOutputTableSpec() + DEFAULT_DEADLETTER_TABLE_SUFFIX))
                // i.e schema in above method
                .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
                .build());
    return PDone.in(fakeMessages.getPipeline());
}
Also used: Method(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method) MapElements(org.apache.beam.sdk.transforms.MapElements) PDone(org.apache.beam.sdk.values.PDone) BigQueryIO(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO) InsertRetryPolicy(org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy) MoreObjects(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects) Coder(org.apache.beam.sdk.coders.Coder) IOException(java.io.IOException) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) WriteResult(org.apache.beam.sdk.io.gcp.bigquery.WriteResult) PCollection(org.apache.beam.sdk.values.PCollection) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) PTransform(org.apache.beam.sdk.transforms.PTransform) SchemaUtils(com.google.cloud.teleport.v2.utils.SchemaUtils) ByteArrayInputStream(java.io.ByteArrayInputStream) FailsafeElementCoder(com.google.cloud.teleport.v2.coders.FailsafeElementCoder) WriteDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition) TableRow(com.google.api.services.bigquery.model.TableRow) AutoValue(com.google.auto.value.AutoValue) StreamingDataGenerator(com.google.cloud.teleport.v2.templates.StreamingDataGenerator) TableRowJsonCoder(org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder)
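The format function above round-trips generator payloads through TableRowJsonCoder. A standalone sketch of that decode step and its encode counterpart, assuming only the Beam coder API already imported in the example plus java.io.ByteArrayOutputStream; the row contents are illustrative.

// Round-trip check: encode a TableRow to the byte[] shape the generator pipeline
// produces, then decode it back exactly as the withFormatFunction lambda does.
public static void main(String[] args) throws Exception {
    TableRow row = new TableRow().set("name", "example").set("value", 42);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    TableRowJsonCoder.of().encode(row, out, Coder.Context.OUTER);
    byte[] message = out.toByteArray();
    TableRow decoded = TableRowJsonCoder.of().decode(new ByteArrayInputStream(message), Coder.Context.OUTER);
    System.out.println(decoded.get("name"));   // prints "example"
}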

Aggregations

Test (org.junit.Test): 26
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 15
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 12
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter): 10
ArrayList (java.util.ArrayList): 10
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 9
Pipeline (org.apache.beam.sdk.Pipeline): 9
Ddl (com.google.cloud.teleport.v2.templates.spanner.ddl.Ddl): 8
Table (com.google.cloud.teleport.v2.templates.spanner.ddl.Table): 8
Set (java.util.Set): 8
PipelineResult (org.apache.beam.sdk.PipelineResult): 8
TableRow (com.google.api.services.bigquery.model.TableRow): 6
IntegrationTest (com.google.cloud.teleport.v2.spanner.IntegrationTest): 6
PCollection (org.apache.beam.sdk.values.PCollection): 6
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 6
IOException (java.io.IOException): 5
WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult): 5
Timestamp (com.google.cloud.Timestamp): 4
Column (com.google.cloud.teleport.v2.templates.spanner.ddl.Column): 4
IndexColumn (com.google.cloud.teleport.v2.templates.spanner.ddl.IndexColumn): 4