Search in sources :

Example 1 with DynamicDataSourceConfiguration

use of com.google.cloud.teleport.v2.io.DynamicJdbcIO.DynamicDataSourceConfiguration in project DataflowTemplates by GoogleCloudPlatform.

In class DataplexJdbcIngestion, method main:

/**
 * Main entry point for pipeline execution.
 *
 * <p>Parses and validates the pipeline options, looks up the output asset through
 * {@code resolveAsset}, and then builds either the BigQuery pipeline or the GCS pipeline
 * depending on the asset's resource type before running it.
 *
 * @param args Command line arguments to the pipeline.
 * @throws IOException declared by this entry point; NOTE(review): presumably raised by the
 *     Dataplex client calls - confirm against their signatures.
 * @throws IllegalArgumentException if the asset's resource type is neither
 *     {@code BIGQUERY_DATASET} nor {@code STORAGE_BUCKET}.
 */
public static void main(String[] args) throws IOException {
    DataplexJdbcIngestionOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(DataplexJdbcIngestionOptions.class);
    Pipeline pipeline = Pipeline.create(options);
    DataplexClient dataplexClient = DefaultDataplexClient.withDefaultClient(options.getGcpCredential());
    String assetName = options.getOutputAsset();
    GoogleCloudDataplexV1Asset asset = resolveAsset(assetName, dataplexClient);
    DynamicDataSourceConfiguration dataSourceConfig = configDataSource(options);
    String assetType = asset.getResourceSpec().getType();
    if (DataplexAssetResourceSpec.BIGQUERY_DATASET.name().equals(assetType)) {
        buildBigQueryPipeline(pipeline, options, dataSourceConfig);
    } else if (DataplexAssetResourceSpec.STORAGE_BUCKET.name().equals(assetType)) {
        // The GCS target is the bucket named by the asset's resource spec plus the output table.
        String targetRootPath = "gs://" + asset.getResourceSpec().getName() + "/" + options.getOutputTable();
        buildGcsPipeline(pipeline, options, dataSourceConfig, targetRootPath);
    } else {
        // User-controlled values are passed as format arguments rather than spliced into the
        // format string, so a '%' in the asset name cannot trigger an IllegalFormatException
        // that would mask this error.
        throw new IllegalArgumentException(
            String.format(
                "Asset %s is of type %s. Only %s and %s supported.",
                assetName,
                assetType,
                DataplexAssetResourceSpec.BIGQUERY_DATASET.name(),
                DataplexAssetResourceSpec.STORAGE_BUCKET.name()));
    }
    pipeline.run();
}
Also used : DataplexJdbcIngestionOptions(com.google.cloud.teleport.v2.options.DataplexJdbcIngestionOptions) DefaultDataplexClient(com.google.cloud.teleport.v2.clients.DefaultDataplexClient) DataplexClient(com.google.cloud.teleport.v2.clients.DataplexClient) DynamicDataSourceConfiguration(com.google.cloud.teleport.v2.io.DynamicJdbcIO.DynamicDataSourceConfiguration) GoogleCloudDataplexV1Asset(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset) Pipeline(org.apache.beam.sdk.Pipeline)

Example 2 with DynamicDataSourceConfiguration

use of com.google.cloud.teleport.v2.io.DynamicJdbcIO.DynamicDataSourceConfiguration in project DataflowTemplates by GoogleCloudPlatform.

In class DataplexJdbcIngestion, method buildGcsPipeline:

/**
 * Adds to {@code pipeline} the steps that read rows over JDBC, convert them to Avro
 * {@link GenericRecord}s and hand them to {@link GenericRecordsToGcsPartitioned}.
 *
 * @param pipeline the pipeline the transforms are applied to
 * @param options supplies the query, the partition column, the partitioning scheme and the file
 *     format
 * @param dataSourceConfig JDBC data source configuration used both to infer the schema and to
 *     read the rows
 * @param targetRootPath root path passed to the GCS write transform
 */
@VisibleForTesting
static void buildGcsPipeline(Pipeline pipeline, DataplexJdbcIngestionOptions options, DynamicDataSourceConfiguration dataSourceConfig, String targetRootPath) {
    // Auto inferring beam schema
    Schema beamSchema = Schemas.jdbcSchemaToBeamSchema(dataSourceConfig.buildDatasource(), options.getQuery());
    // Convert to Avro Schema
    org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(beamSchema);
    // Read from JdbcIO and convert ResultSet to Beam Row
    PCollection<Row> resultRows =
        pipeline.apply(
            "Read from JdbcIO",
            DynamicJdbcIO.<Row>read()
                .withDataSourceConfiguration(dataSourceConfig)
                .withQuery(options.getQuery())
                .withCoder(RowCoder.of(beamSchema))
                .withRowMapper(BeamSchemaUtil.of(beamSchema)));
    // Convert Beam Row to GenericRecord
    PCollection<GenericRecord> genericRecords =
        resultRows
            .apply("convert to GenericRecord", ParDo.of(new BeamRowToGenericRecordFn(avroSchema)))
            .setCoder(AvroCoder.of(avroSchema));
    // Write to GCS bucket. The transform's output was previously stored in a local that was
    // never read, so the result is intentionally not captured here.
    genericRecords.apply(
        "Write to GCS",
        new GenericRecordsToGcsPartitioned(
            targetRootPath,
            Schemas.serialize(avroSchema),
            options.getParitionColumn(),
            options.getPartitioningScheme(),
            options.getFileFormat()));
}
Also used : BeamRowToGenericRecordFn(com.google.cloud.teleport.v2.transforms.BeamRowToGenericRecordFn) GenericRecordsToGcsPartitioned(com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned) Schema(org.apache.beam.sdk.schemas.Schema) PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) TableRow(com.google.api.services.bigquery.model.TableRow) Row(org.apache.beam.sdk.values.Row) GenericRecord(org.apache.avro.generic.GenericRecord) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 3 with DynamicDataSourceConfiguration

use of com.google.cloud.teleport.v2.io.DynamicJdbcIO.DynamicDataSourceConfiguration in project DataflowTemplates by GoogleCloudPlatform.

In class DynamicJdbcIOTest, method testReadRows:

/**
 * Reads the test table through {@code DynamicJdbcIO} and checks both the number of rows and
 * their contents against the expected rows.
 */
@Test
@Category(NeedsRunner.class)
public void testReadRows() throws Exception {
    String selectQuery = "select name, id from " + readTableName;
    DynamicDataSourceConfiguration config =
        DynamicJdbcIO.DynamicDataSourceConfiguration.create(
            "org.apache.derby.jdbc.ClientDriver",
            maybeDecrypt("jdbc:derby://localhost:" + port + "/target/beam", null));

    // Schema inferred from the query's result set metadata.
    Schema inferredSchema = Schemas.jdbcSchemaToBeamSchema(config.buildDatasource(), selectQuery);
    PCollection<Row> resultRows =
        pipeline.apply(
            DynamicJdbcIO.<Row>read()
                .withDataSourceConfiguration(config)
                .withQuery(selectQuery)
                .withCoder(RowCoder.of(inferredSchema))
                .withRowMapper(BeamSchemaUtil.of(inferredSchema)));

    // The row count must match the expected number of rows.
    PAssert.thatSingleton(resultRows.apply("Count", Count.globally()))
        .isEqualTo((long) EXPECTED_ROW_COUNT);

    // Build the rows the read is expected to produce, one per id.
    Schema expectedSchema =
        Schema.builder()
            .addField("name", Schema.FieldType.STRING)
            .addField("id", Schema.FieldType.INT32)
            .build();
    List<Row> expectedRows = Lists.newArrayListWithExpectedSize(EXPECTED_ROW_COUNT);
    for (int id = 0; id < EXPECTED_ROW_COUNT; id++) {
        expectedRows.add(
            Row.withSchema(expectedSchema).addValues(TEST_ROW_SUFFIX + "-" + id, id).build());
    }
    PAssert.that(resultRows).containsInAnyOrder(expectedRows);

    pipeline.run();
}
Also used : Schema(org.apache.beam.sdk.schemas.Schema) DynamicDataSourceConfiguration(com.google.cloud.teleport.v2.io.DynamicJdbcIO.DynamicDataSourceConfiguration) Row(org.apache.beam.sdk.values.Row) Schema(org.apache.beam.sdk.schemas.Schema) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Aggregations

DynamicDataSourceConfiguration (com.google.cloud.teleport.v2.io.DynamicJdbcIO.DynamicDataSourceConfiguration)2 Schema (org.apache.beam.sdk.schemas.Schema)2 Row (org.apache.beam.sdk.values.Row)2 TableRow (com.google.api.services.bigquery.model.TableRow)1 GoogleCloudDataplexV1Asset (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset)1 DataplexClient (com.google.cloud.teleport.v2.clients.DataplexClient)1 DefaultDataplexClient (com.google.cloud.teleport.v2.clients.DefaultDataplexClient)1 DataplexJdbcIngestionOptions (com.google.cloud.teleport.v2.options.DataplexJdbcIngestionOptions)1 BeamRowToGenericRecordFn (com.google.cloud.teleport.v2.transforms.BeamRowToGenericRecordFn)1 GenericRecordsToGcsPartitioned (com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned)1 PartitionMetadata (com.google.cloud.teleport.v2.values.PartitionMetadata)1 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 GenericRecord (org.apache.avro.generic.GenericRecord)1 Pipeline (org.apache.beam.sdk.Pipeline)1 Test (org.junit.Test)1 Category (org.junit.experimental.categories.Category)1