Example 86 with Options

use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexJdbcIngestion method buildGcsPipeline.

@VisibleForTesting
static void buildGcsPipeline(Pipeline pipeline, DataplexJdbcIngestionOptions options,
        DynamicDataSourceConfiguration dataSourceConfig, String targetRootPath) {
    List<String> existingFiles = StorageUtils.getFilesInDirectory(targetRootPath);
    // Auto-infer the Beam schema from the JDBC source.
    Schema beamSchema =
        Schemas.jdbcSchemaToBeamSchema(dataSourceConfig.buildDatasource(), options.getQuery());
    // Convert the Beam schema to an Avro schema.
    org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(beamSchema);
    // Read from JdbcIO, mapping each ResultSet row to a Beam Row.
    PCollection<Row> resultRows = pipeline.apply("Read from JdbcIO",
        DynamicJdbcIO.<Row>read()
            .withDataSourceConfiguration(dataSourceConfig)
            .withQuery(options.getQuery())
            .withCoder(RowCoder.of(beamSchema))
            .withRowMapper(BeamSchemaUtil.of(beamSchema)));
    // Convert each Beam Row to an Avro GenericRecord.
    PCollection<GenericRecord> genericRecords = resultRows
        .apply("convert to GenericRecord", ParDo.of(new BeamRowToGenericRecordFn(avroSchema)))
        .setCoder(AvroCoder.of(avroSchema));
    // Apply the write disposition against any files already present in the target.
    if (options.getParitionColumn() == null || options.getPartitioningScheme() == null) {
        if (shouldSkipUnpartitionedTable(options, targetRootPath, existingFiles)) {
            return;
        }
    } else {
        genericRecords = applyPartitionedWriteDispositionFilter(
            genericRecords, options, targetRootPath, avroSchema, existingFiles);
    }
    // Write to the GCS bucket.
    PCollection<PartitionMetadata> metadata = genericRecords.apply("Write to GCS",
        new GenericRecordsToGcsPartitioned(targetRootPath, Schemas.serialize(avroSchema),
            options.getParitionColumn(), options.getPartitioningScheme(),
            options.getFileFormat()));
}
Also used: BeamRowToGenericRecordFn (com.google.cloud.teleport.v2.transforms.BeamRowToGenericRecordFn), Schema (org.apache.beam.sdk.schemas.Schema), GenericRecordsToGcsPartitioned (com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned), PartitionMetadata (com.google.cloud.teleport.v2.values.PartitionMetadata), TableRow (com.google.api.services.bigquery.model.TableRow), Row (org.apache.beam.sdk.values.Row), GenericRecord (org.apache.avro.generic.GenericRecord), VisibleForTesting (com.google.common.annotations.VisibleForTesting)
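
The conversion step above leans on BeamRowToGenericRecordFn, whose source is not shown. As a hedged sketch of what such a DoFn plausibly does: Beam's own AvroUtils can convert a Row to a GenericRecord, so the DoFn mainly has to work around org.apache.avro.Schema not being Serializable. Illustrative only, not the template's actual implementation.

import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.schemas.utils.AvroUtils;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.Row;

/** Illustrative only: converts Beam Rows to Avro GenericRecords. */
class RowToGenericRecordFn extends DoFn<Row, GenericRecord> {

    // Avro Schema is not Serializable, so keep its JSON form and re-parse on the worker.
    private final String serializedSchema;
    private transient org.apache.avro.Schema avroSchema;

    RowToGenericRecordFn(org.apache.avro.Schema avroSchema) {
        this.serializedSchema = avroSchema.toString();
    }

    @Setup
    public void setup() {
        avroSchema = new org.apache.avro.Schema.Parser().parse(serializedSchema);
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        // AvroUtils performs the field-by-field mapping between the two data models.
        c.output(AvroUtils.toGenericRecord(c.element(), avroSchema));
    }
}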

Example 87 with Options

use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexJdbcIngestion method applyPartitionedWriteDispositionFilter.

private static PCollection<GenericRecord> applyPartitionedWriteDispositionFilter(
        PCollection<GenericRecord> genericRecords, DataplexJdbcIngestionOptions options,
        String targetRootPath, org.apache.avro.Schema avroSchema, List<String> existingFiles) {
    PCollectionTuple filteredRecordsTuple = genericRecords.apply("Filter pre-existing records",
        new DataplexJdbcIngestionFilter(targetRootPath, Schemas.serialize(avroSchema),
            options.getParitionColumn(), options.getPartitioningScheme(),
            options.getFileFormat().getFileSuffix(), options.getWriteDisposition(),
            existingFiles, FILTERED_RECORDS_OUT, EXISTING_TARGET_FILES_OUT));
    filteredRecordsTuple
        .get(EXISTING_TARGET_FILES_OUT)
        .apply(Distinct.create())
        // If no target files pre-exist, this PCollection is empty and nothing is logged.
        .apply("Log existing target file names", ParDo.of(new DoFn<String, String>() {

            @ProcessElement
            public void processElement(ProcessContext c) {
                String filename = c.element();
                LOG.info(
                    "Target File {} already exists in the output asset bucket {}. Performing"
                        + " {} writeDisposition strategy.",
                    filename, targetRootPath, options.getWriteDisposition());
            }
        }));
    return filteredRecordsTuple.get(FILTERED_RECORDS_OUT);
}
Also used: DataplexJdbcIngestionFilter (com.google.cloud.teleport.v2.utils.DataplexJdbcIngestionFilter), DoFn (org.apache.beam.sdk.transforms.DoFn), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple)
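
The filter's two outputs travel through a PCollectionTuple keyed by TupleTags. Below is a minimal sketch of the multi-output ParDo pattern it relies on; the element type and predicate are stand-ins, not DataplexJdbcIngestionFilter's real logic.

import java.util.HashSet;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

class SplitByExistence {
    // Anonymous subclasses give the tags reified element types.
    static final TupleTag<String> FILTERED_OUT = new TupleTag<String>() {};
    static final TupleTag<String> EXISTING_OUT = new TupleTag<String>() {};

    // HashSet is Serializable, so the anonymous DoFn below may capture it.
    static PCollectionTuple split(PCollection<String> paths, HashSet<String> existingFiles) {
        return paths.apply("Split by existence",
            ParDo.of(new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                    // Stand-in predicate: route each element to one of two tagged outputs.
                    if (existingFiles.contains(c.element())) {
                        c.output(EXISTING_OUT, c.element()); // secondary output
                    } else {
                        c.output(c.element()); // main output, tagged FILTERED_OUT
                    }
                }
            }).withOutputTags(FILTERED_OUT, TupleTagList.of(EXISTING_OUT)));
    }
}

Downstream code then reads each branch with get(FILTERED_OUT) and get(EXISTING_OUT), exactly as the method above does with its tuple.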

Example 88 with Options

use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

the class JdbcToPubsub method run.

/**
 * Runs a pipeline that reads rows from a JDBC source via JdbcIO and writes them to Pub/Sub.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(JdbcToPubsubOptions options) {
    // Create the pipeline.
    Pipeline pipeline = Pipeline.create(options);
    LOG.info("Starting Jdbc-To-PubSub Pipeline.");
    /*
     * Steps:
     *  1) Read data from a JDBC table.
     *  2) Write to a Pub/Sub topic.
     */
    DynamicJdbcIO.DynamicDataSourceConfiguration dataSourceConfiguration =
        DynamicJdbcIO.DynamicDataSourceConfiguration.create(
                options.getDriverClassName(),
                maybeDecrypt(options.getConnectionUrl(), options.getKMSEncryptionKey()))
            .withDriverJars(options.getDriverJars());
    if (options.getUsername() != null) {
        dataSourceConfiguration = dataSourceConfiguration.withUsername(
            maybeDecrypt(options.getUsername(), options.getKMSEncryptionKey()));
    }
    if (options.getPassword() != null) {
        dataSourceConfiguration = dataSourceConfiguration.withPassword(
            maybeDecrypt(options.getPassword(), options.getKMSEncryptionKey()));
    }
    if (options.getConnectionProperties() != null) {
        dataSourceConfiguration = dataSourceConfiguration.withConnectionProperties(
            options.getConnectionProperties());
    }
    PCollection<String> jdbcData = pipeline.apply("readFromJdbc",
        DynamicJdbcIO.<String>read()
            .withDataSourceConfiguration(dataSourceConfiguration)
            .withQuery(options.getQuery())
            .withCoder(StringUtf8Coder.of())
            .withRowMapper(new ResultSetToJSONString()));
    jdbcData.apply("writeSuccessMessages", PubsubIO.writeStrings().to(options.getOutputTopic()));
    return pipeline.run();
}
Also used: DynamicJdbcIO (com.google.cloud.teleport.v2.io.DynamicJdbcIO), Pipeline (org.apache.beam.sdk.Pipeline)
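
The interesting unshown piece here is ResultSetToJSONString, the row mapper that serializes each JDBC row to a JSON string. The sketch below is an assumption about what such a mapper looks like, written against JdbcIO.RowMapper (which the withRowMapper call suggests DynamicJdbcIO reuses), with Jackson doing the JSON encoding.

import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.beam.sdk.io.jdbc.JdbcIO;

/** Illustrative only: maps each JDBC row to a JSON string keyed by column label. */
class ResultSetToJsonMapper implements JdbcIO.RowMapper<String> {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    @Override
    public String mapRow(ResultSet resultSet) throws Exception {
        ResultSetMetaData meta = resultSet.getMetaData();
        ObjectNode json = MAPPER.createObjectNode();
        // JDBC columns are 1-indexed; labels honor any aliases in the query.
        for (int i = 1; i <= meta.getColumnCount(); i++) {
            Object value = resultSet.getObject(i);
            json.put(meta.getColumnLabel(i), value == null ? null : value.toString());
        }
        return MAPPER.writeValueAsString(json);
    }
}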

Example 89 with Options

use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

the class PubsubToJdbc method run.

/**
 * Runs a pipeline that reads messages from Pub/Sub and writes them to a JDBC table via JdbcIO.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(PubsubToJdbcOptions options) {
    // Create the pipeline.
    Pipeline pipeline = Pipeline.create(options);
    LOG.info("Starting Pubsub-to-Jdbc Pipeline.");
    /*
     * Steps:
     *  1) Read data from a Pub/Sub subscription.
     *  2) Write to a JDBC table.
     *  3) Write errors to a dead-letter topic.
     */
    PCollection<String> pubsubData = pipeline.apply("readFromPubSubSubscription",
        PubsubIO.readStrings().fromSubscription(options.getInputSubscription()));
    DynamicJdbcIO.DynamicDataSourceConfiguration dataSourceConfiguration =
        DynamicJdbcIO.DynamicDataSourceConfiguration.create(
                options.getDriverClassName(),
                maybeDecrypt(options.getConnectionUrl(), options.getKMSEncryptionKey()))
            .withDriverJars(options.getDriverJars());
    if (options.getUsername() != null) {
        dataSourceConfiguration = dataSourceConfiguration.withUsername(
            maybeDecrypt(options.getUsername(), options.getKMSEncryptionKey()));
    }
    if (options.getPassword() != null) {
        dataSourceConfiguration = dataSourceConfiguration.withPassword(
            maybeDecrypt(options.getPassword(), options.getKMSEncryptionKey()));
    }
    if (options.getConnectionProperties() != null) {
        dataSourceConfiguration = dataSourceConfiguration.withConnectionProperties(
            options.getConnectionProperties());
    }
    PCollection<FailsafeElement<String, String>> errors = pubsubData
        .apply("writeToJdbc",
            DynamicJdbcIO.<String>write()
                .withDataSourceConfiguration(dataSourceConfiguration)
                .withStatement(options.getStatement())
                .withPreparedStatementSetter(
                    new MapJsonStringToQuery(getKeyOrder(options.getStatement()))))
        .setCoder(FAILSAFE_ELEMENT_CODER);
    errors.apply("WriteFailedRecords",
        ErrorConverters.WriteStringMessageErrorsToPubSub.newBuilder()
            .setErrorRecordsTopic(options.getOutputDeadletterTopic())
            .build());
    return pipeline.run();
}
Also used: DynamicJdbcIO (com.google.cloud.teleport.v2.io.DynamicJdbcIO), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement)
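
On the write side, MapJsonStringToQuery, constructed with getKeyOrder(options.getStatement()), evidently binds fields of each JSON message to the statement's ? placeholders. Here is a hedged stand-in written against JdbcIO's PreparedStatementSetter contract, with the key order supplied explicitly; the real class may differ.

import java.sql.PreparedStatement;
import java.util.ArrayList;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.beam.sdk.io.jdbc.JdbcIO;

/** Illustrative only: binds JSON fields to a prepared statement's placeholders. */
class JsonToStatementSetter implements JdbcIO.PreparedStatementSetter<String> {

    private static final ObjectMapper MAPPER = new ObjectMapper();
    // One JSON key per ? placeholder, in placeholder order; ArrayList is Serializable.
    private final ArrayList<String> keyOrder;

    JsonToStatementSetter(ArrayList<String> keyOrder) {
        this.keyOrder = keyOrder;
    }

    @Override
    public void setParameters(String element, PreparedStatement statement) throws Exception {
        JsonNode json = MAPPER.readTree(element);
        // JDBC parameters are 1-indexed.
        for (int i = 0; i < keyOrder.size(); i++) {
            JsonNode field = json.get(keyOrder.get(i));
            statement.setObject(i + 1, field == null || field.isNull() ? null : field.asText());
        }
    }
}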

Example 90 with Options

use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

the class SpannerChangeStreamsToGcs method main.

public static void main(String[] args) {
    LOG.info("Starting Input Files to GCS");
    SpannerChangeStreamsToGcsOptions options =
        PipelineOptionsFactory.fromArgs(args).as(SpannerChangeStreamsToGcsOptions.class);
    run(options);
}
Also used: SpannerChangeStreamsToGcsOptions (com.google.cloud.teleport.v2.options.SpannerChangeStreamsToGcsOptions)
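
All five examples consume options interfaces built on Beam's PipelineOptions, parsed exactly as in the main method above. For reference, a minimal sketch of that pattern; the interface and parameter names here are hypothetical, not taken from any template.

import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.Validation;

/** Illustrative only: a Beam options interface in the style these templates use. */
interface MyTemplateOptions extends PipelineOptions {

    @Description("Output path, e.g. gs://bucket/dir")
    @Validation.Required
    String getOutputPath();

    void setOutputPath(String value);
}

class Launcher {
    public static void main(String[] args) {
        // withValidation() enforces @Validation.Required before the pipeline is built.
        MyTemplateOptions options =
            PipelineOptionsFactory.fromArgs(args).withValidation().as(MyTemplateOptions.class);
        // A run(options) method would then build and execute the pipeline, as above.
    }
}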

Aggregations

Test (org.junit.Test): 63
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 25
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 20
Pipeline (org.apache.beam.sdk.Pipeline): 19
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 19
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 15
GenericRecord (org.apache.avro.generic.GenericRecord): 12
Category (org.junit.experimental.categories.Category): 12
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter): 10
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 10
PubSubToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions): 9
TableRow (com.google.api.services.bigquery.model.TableRow): 8
DataplexClient (com.google.cloud.teleport.v2.clients.DataplexClient): 8
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions): 8
KV (org.apache.beam.sdk.values.KV): 8
ArrayList (java.util.ArrayList): 7
ElasticsearchWriteOptions (com.google.cloud.teleport.v2.elasticsearch.options.ElasticsearchWriteOptions): 6
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 6
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions): 6
PubSubProtoToBigQueryOptions (com.google.cloud.teleport.v2.templates.PubsubProtoToBigQuery.PubSubProtoToBigQueryOptions): 6