
Example 26 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

The class DataStreamToSpanner, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   2) Write JSON Strings to Cloud Spanner
     *   3) Write Failures to GCS Dead Letter Queue
     */
    Pipeline pipeline = Pipeline.create(options);
    DeadLetterQueueManager dlqManager = buildDlqManager(options);
    /*
     * Stage 1: Ingest/Normalize Data to FailsafeElement with JSON Strings and
     * read Cloud Spanner information schema.
     *   a) Prepare spanner config and process information schema
     *   b) Read DataStream data from GCS into JSON String FailsafeElements
     *   c) Reconsume Dead Letter Queue data from GCS into JSON String FailsafeElements
     *   d) Flatten DataStream and DLQ Streams
     */
    // Prepare Spanner config
    SpannerConfig spannerConfig =
        ExposedSpannerConfig.create()
            .withHost(ValueProvider.StaticValueProvider.of(options.getSpannerHost()))
            .withInstanceId(ValueProvider.StaticValueProvider.of(options.getInstanceId()))
            .withDatabaseId(ValueProvider.StaticValueProvider.of(options.getDatabaseId()));
    /* Process information schema
     * 1) Read information schema from destination Cloud Spanner database
     * 2) Check if shadow tables are present and create if necessary
     * 3) Return new information schema
     */
    PCollection<Ddl> ddl =
        pipeline.apply(
            "Process Information Schema",
            new ProcessInformationSchema(
                spannerConfig,
                options.getShouldCreateShadowTables(),
                options.getShadowTablePrefix(),
                options.getDatastreamSourceType()));
    PCollectionView<Ddl> ddlView = ddl.apply("Cloud Spanner DDL as view", View.asSingleton());
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
        pipeline.apply(
            new DataStreamIO(
                    options.getStreamName(),
                    options.getInputFilePattern(),
                    options.getInputFileFormat(),
                    options.getGcsPubSubSubscription(),
                    options.getRfcStartDateTime())
                .withFileReadConcurrency(options.getFileReadConcurrency()));
    // Elements previously sent to the Dead Letter Queue are reconsumed here.
    // The DeadLetterQueueManager, built from the pipeline options, is
    // responsible for constructing the DLQ directories and transforms.
    PCollectionTuple reconsumedElements =
        dlqManager.getReconsumerDataTransform(
            pipeline.apply(dlqManager.dlqReconsumer(options.getDlqRetryMinutes())));
    PCollection<FailsafeElement<String, String>> dlqJsonRecords =
        reconsumedElements
            .get(DeadLetterQueueManager.RETRYABLE_ERRORS)
            .setCoder(FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
    PCollection<FailsafeElement<String, String>> jsonRecords =
        PCollectionList.of(datastreamJsonRecords)
            .and(dlqJsonRecords)
            .apply(Flatten.pCollections())
            .apply("Reshuffle", Reshuffle.viaRandomKey());
    /*
     * Stage 2: Write records to Cloud Spanner
     */
    SpannerTransactionWriter.Result spannerWriteResults =
        jsonRecords.apply(
            "Write events to Cloud Spanner",
            new SpannerTransactionWriter(
                spannerConfig,
                ddlView,
                options.getShadowTablePrefix(),
                options.getDatastreamSourceType()));
    /*
     * Stage 3: Write failures to GCS Dead Letter Queue
     * a) Retryable errors are written to retry GCS Dead letter queue
     * b) Severe errors are written to severe GCS Dead letter queue
     */
    spannerWriteResults
        .retryableErrors()
        .apply(
            "DLQ: Write retryable Failures to GCS",
            MapElements.via(new StringDeadLetterQueueSanitizer()))
        .setCoder(StringUtf8Coder.of())
        .apply(
            "Write To DLQ",
            DLQWriteTransform.WriteDLQ.newBuilder()
                .withDlqDirectory(dlqManager.getRetryDlqDirectoryWithDateTime())
                .withTmpDirectory(dlqManager.getRetryDlqDirectory() + "tmp/")
                .setIncludePaneInfo(true)
                .build());
    PCollection<FailsafeElement<String, String>> dlqErrorRecords =
        reconsumedElements
            .get(DeadLetterQueueManager.PERMANENT_ERRORS)
            .setCoder(FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
    PCollection<FailsafeElement<String, String>> permanentErrors =
        PCollectionList.of(dlqErrorRecords)
            .and(spannerWriteResults.permanentErrors())
            .apply(Flatten.pCollections())
            .apply("Reshuffle", Reshuffle.viaRandomKey());
    permanentErrors
        .apply(
            "DLQ: Write Severe errors to GCS",
            MapElements.via(new StringDeadLetterQueueSanitizer()))
        .setCoder(StringUtf8Coder.of())
        .apply(
            "Write To DLQ",
            DLQWriteTransform.WriteDLQ.newBuilder()
                .withDlqDirectory(dlqManager.getSevereDlqDirectoryWithDateTime())
                .withTmpDirectory(dlqManager.getSevereDlqDirectory() + "tmp/")
                .setIncludePaneInfo(true)
                .build());
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used: SpannerConfig (org.apache.beam.sdk.io.gcp.spanner.SpannerConfig), ExposedSpannerConfig (org.apache.beam.sdk.io.gcp.spanner.ExposedSpannerConfig), DeadLetterQueueManager (com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager), Ddl (com.google.cloud.teleport.v2.templates.spanner.ddl.Ddl), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO), ProcessInformationSchema (com.google.cloud.teleport.v2.templates.spanner.ProcessInformationSchema), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), StringDeadLetterQueueSanitizer (com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer)
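
A run(Options) method like the one above is normally invoked from a small main() that parses command-line arguments into the template's Options interface. The snippet below is only a sketch modeled on the DataStreamToBigQuery entry point shown in Example 27; the actual DataStreamToSpanner main() may set additional flags before calling run(...).

public static void main(String[] args) {
    // Parse the template's Options from the command line using the standard
    // Beam pattern (same as Example 27 below), then hand off to run(...).
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    run(options);
}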

Example 27 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

The class DataStreamToBigQuery, method main.

/**
 * Main entry point for executing the pipeline.
 *
 * @param args The command-line arguments to the pipeline.
 */
public static void main(String[] args) {
    LOG.info("Starting Input Files to BigQuery");
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    options.setStreaming(true);
    options.setEnableStreamingEngine(true);
    validateOptions(options);
    run(options);
}
Also used: InputUDFOptions (com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFOptions), GcpOptions (org.apache.beam.sdk.extensions.gcp.options.GcpOptions), StreamingOptions (org.apache.beam.sdk.options.StreamingOptions), BigQueryOptions (org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions), PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions)
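
The validateOptions(options) call above is not included in this excerpt. The sketch below only illustrates the kind of eager, fail-fast check such a method performs; the method body and the getOutputDatasetTemplate() getter are assumptions, not the template's actual validation rules.

private static void validateOptions(Options options) {
    // Hypothetical check: reject missing required options before the pipeline
    // is constructed. getOutputDatasetTemplate() is an assumed getter.
    if (options.getOutputDatasetTemplate() == null || options.getOutputDatasetTemplate().isEmpty()) {
        throw new IllegalArgumentException("An output dataset template must be provided.");
    }
}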

Example 28 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

The class DataStreamToMongoDB, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   2) Push the data to MongoDB
     */
    Pipeline pipeline = Pipeline.create(options);
    /*
     * Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
     */
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
        pipeline.apply(
            new DataStreamIO(
                    options.getStreamName(),
                    options.getInputFilePattern(),
                    options.getInputFileFormat(),
                    options.getInputSubscription(),
                    options.getRfcStartDateTime())
                .withFileReadConcurrency(options.getFileReadConcurrency()));
    PCollection<FailsafeElement<String, String>> jsonRecords =
        PCollectionList.of(datastreamJsonRecords).apply(Flatten.pCollections());
    /*
     * Performs the following steps:
     *   1. Converts each JSON string to a BSON document.
     *   2. Removes the metadata fields.
     *   3. Inserts the documents into the MongoDB collection.
     */
    jsonRecords
        .apply(
            "jsonToDocuments",
            MapElements.via(
                new SimpleFunction<FailsafeElement<String, String>, Document>() {

                  @Override
                  public Document apply(FailsafeElement<String, String> jsonString) {
                    String s = jsonString.getOriginalPayload();
                    Document doc = Document.parse(s);
                    return removeTableRowFields(doc, MAPPER_IGNORE_FIELDS);
                  }
                }))
        .apply(
            "Write To MongoDB",
            MongoDbIO.write()
                .withUri(options.getMongoDBUri())
                .withDatabase(options.getDatabase())
                .withCollection(options.getCollection()));
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used: DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO), Document (org.bson.Document), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement)
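
The removeTableRowFields(doc, MAPPER_IGNORE_FIELDS) helper referenced in the SimpleFunction is not part of this excerpt. The following is only a minimal sketch of what such a helper could look like, assuming MAPPER_IGNORE_FIELDS is a Set<String> of DataStream metadata field names; the template's real implementation may differ.

// Requires java.util.Set and org.bson.Document.
private static Document removeTableRowFields(Document doc, Set<String> ignoreFields) {
    // Copy the parsed BSON document and strip the fields that should not be
    // written to MongoDB (e.g. DataStream _metadata_* columns).
    Document cleaned = new Document(doc);
    for (String field : ignoreFields) {
        cleaned.remove(field);
    }
    return cleaned;
}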

Example 29 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

The class WriteToElasticsearch, method expand.

@Override
public PDone expand(PCollection<String> jsonStrings) {
    ConnectionInformation connectionInformation = new ConnectionInformation(options().getConnectionUrl());
    ElasticsearchIO.ConnectionConfiguration config =
        ElasticsearchIO.ConnectionConfiguration.create(
            new String[] {connectionInformation.getElasticsearchURL().toString()},
            options().getIndex(),
            DOCUMENT_TYPE);
    // If username and password are not blank, use them instead of ApiKey
    if (StringUtils.isNotBlank(options().getElasticsearchUsername()) && StringUtils.isNotBlank(options().getElasticsearchPassword())) {
        config = config.withUsername(options().getElasticsearchUsername()).withPassword(options().getElasticsearchPassword());
    } else {
        config = config.withApiKey(options().getApiKey());
    }
    ElasticsearchIO.Write elasticsearchWriter =
        ElasticsearchIO.write()
            .withConnectionConfiguration(config)
            .withMaxBatchSize(options().getBatchSize())
            .withMaxBatchSizeBytes(options().getBatchSizeBytes());
    if (Optional.ofNullable(options().getMaxRetryAttempts()).isPresent()) {
        // The Write transform is immutable, so the result of withRetryConfiguration
        // must be reassigned; otherwise the retry settings are silently dropped.
        elasticsearchWriter =
            elasticsearchWriter.withRetryConfiguration(
                ElasticsearchIO.RetryConfiguration.create(
                    options().getMaxRetryAttempts(),
                    getDuration(options().getMaxRetryDuration())));
    }
    return jsonStrings.apply("WriteDocuments", elasticsearchWriter);
}
Also used: ConnectionInformation (com.google.cloud.teleport.v2.elasticsearch.utils.ConnectionInformation), ElasticsearchIO (com.google.cloud.teleport.v2.elasticsearch.utils.ElasticsearchIO)
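
The getDuration(options().getMaxRetryDuration()) call above relies on a helper that is not shown here. A minimal sketch, assuming the option carries the maximum retry duration in milliseconds and that the retry configuration expects a Joda-Time Duration (as Beam's ElasticsearchIO retry API does):

// Requires org.joda.time.Duration.
private static Duration getDuration(Long maxRetryDurationMillis) {
    // Assumed conversion: the option value is a number of milliseconds.
    return Duration.millis(maxRetryDurationMillis);
}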

Example 30 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

The class DataStreamToSQL, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   2) Write JSON Strings to SQL DML Objects
     *   3) Filter stale rows using stateful PK transform
     *   4) Write DML statements to SQL Database via jdbc
     */
    Pipeline pipeline = Pipeline.create(options);
    CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration = getDataSourceConfiguration(options);
    validateOptions(options, dataSourceConfiguration);
    Map<String, String> schemaMap = parseSchemaMap(options.getSchemaMap());
    /*
     * Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
     */
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
        pipeline.apply(
            new DataStreamIO(
                    options.getStreamName(),
                    options.getInputFilePattern(),
                    options.getInputFileFormat(),
                    options.getGcsPubSubSubscription(),
                    options.getRfcStartDateTime())
                .withLowercaseSourceColumns()
                .withRenameColumnValue("_metadata_row_id", "rowid")
                .withHashRowId());
    /*
     * Stage 2: Write JSON Strings to SQL Insert Strings
     *   a) Convert JSON String FailsafeElements to TableRow's (tableRowRecords)
     * Stage 3) Filter stale rows using stateful PK transform
     */
    PCollection<KV<String, DmlInfo>> dmlStatements =
        datastreamJsonRecords
            .apply("Format to DML", CreateDml.of(dataSourceConfiguration).withSchemaMap(schemaMap))
            .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
    /*
     * Stage 4: Write Inserts to CloudSQL
     */
    dmlStatements.apply(
        "Write to SQL",
        CdcJdbcIO.<KV<String, DmlInfo>>write()
            .withDataSourceConfiguration(dataSourceConfiguration)
            .withStatementFormatter(
                new CdcJdbcIO.StatementFormatter<KV<String, DmlInfo>>() {
                  public String formatStatement(KV<String, DmlInfo> element) {
                    LOG.debug("Executing SQL: {}", element.getValue().getDmlSql());
                    return element.getValue().getDmlSql();
                  }
                }));
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used: DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO), KV (org.apache.beam.sdk.values.KV), DmlInfo (com.google.cloud.teleport.v2.values.DmlInfo), CdcJdbcIO (com.google.cloud.teleport.v2.io.CdcJdbcIO), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement)
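
The parseSchemaMap(options.getSchemaMap()) call above uses a helper that is not included in this excerpt. The sketch below assumes a comma-separated list of "sourceSchema:destinationSchema" pairs; the template's actual parsing rules may differ.

// Requires java.util.HashMap and java.util.Map.
public static Map<String, String> parseSchemaMap(String schemaMapString) {
    // Assumed format: "sourceSchema1:destSchema1,sourceSchema2:destSchema2"
    Map<String, String> schemaMap = new HashMap<>();
    if (schemaMapString == null || schemaMapString.isEmpty()) {
        return schemaMap;
    }
    for (String pair : schemaMapString.split(",")) {
        String[] parts = pair.split(":");
        schemaMap.put(parts[0], parts[1]);
    }
    return schemaMap;
}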

Aggregations

Test (org.junit.Test): 63
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 25
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 20
Pipeline (org.apache.beam.sdk.Pipeline): 19
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 19
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 15
GenericRecord (org.apache.avro.generic.GenericRecord): 12
Category (org.junit.experimental.categories.Category): 12
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter): 10
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 10
PubSubToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions): 9
TableRow (com.google.api.services.bigquery.model.TableRow): 8
DataplexClient (com.google.cloud.teleport.v2.clients.DataplexClient): 8
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions): 8
KV (org.apache.beam.sdk.values.KV): 8
ArrayList (java.util.ArrayList): 7
ElasticsearchWriteOptions (com.google.cloud.teleport.v2.elasticsearch.options.ElasticsearchWriteOptions): 6
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 6
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions): 6
PubSubProtoToBigQueryOptions (com.google.cloud.teleport.v2.templates.PubsubProtoToBigQuery.PubSubProtoToBigQueryOptions): 6