
Example 66 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

Class WriteToElasticsearchTest, method testInvalidConnectionInformation.

/**
 * Tests that {@link WriteToElasticsearch} throws an exception if an invalid ConnectionInformation
 * is provided.
 */
@Test
public void testInvalidConnectionInformation() {
    exceptionRule.expect(IllegalStateException.class);
    ElasticsearchWriteOptions options = PipelineOptionsFactory.create().as(ElasticsearchWriteOptions.class);
    options.setConnectionUrl(",");
    options.setIndex("index");
    options.setApiKey("key");
    pipeline.apply("CreateInput", Create.of("test")).apply("TestWriteToElasticsearch", WriteToElasticsearch.newBuilder().setOptions(options).build());
    pipeline.run();
}
Also used : ElasticsearchWriteOptions(com.google.cloud.teleport.v2.elasticsearch.options.ElasticsearchWriteOptions) Test(org.junit.Test)
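
For contrast, here is a minimal sketch of the same transform wired with a syntactically valid connection URL. The URL, index, and API key are placeholder values (not taken from the project), and the snippet assumes the same test harness field (pipeline) as the example above.

// Hypothetical counterpart to the test above, using placeholder connection values.
ElasticsearchWriteOptions options = PipelineOptionsFactory.create().as(ElasticsearchWriteOptions.class);
options.setConnectionUrl("https://elasticsearch.example.com:9200");
options.setIndex("my-index");
options.setApiKey("my-api-key");
pipeline
    .apply("CreateInput", Create.of("test"))
    .apply("WriteToElasticsearch", WriteToElasticsearch.newBuilder().setOptions(options).build());
pipeline.run();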

Example 67 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

Class PubsubProtoToBigQueryTest, method testApplyUdfWithNoUdfPathSet.

@Test
public void testApplyUdfWithNoUdfPathSet() {
    PubSubProtoToBigQueryOptions options = getOptions();
    ImmutableList<String> inputs = ImmutableList.of("First", "Second", "Third");
    PCollection<String> pInput = pipeline.apply(Create.of(inputs));
    PAssert.that(runUdf(pInput, options)).containsInAnyOrder(inputs);
    pipeline.run();
}
Also used : PubSubProtoToBigQueryOptions(com.google.cloud.teleport.v2.templates.PubsubProtoToBigQuery.PubSubProtoToBigQueryOptions) Test(org.junit.Test)

Example 68 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

Class PubsubProtoToBigQueryTest, method testGetDescriptorWithInvalidSchemaPath.

@Test
public void testGetDescriptorWithInvalidSchemaPath() {
    PubSubProtoToBigQueryOptions options = getOptions();
    String path = "/some/invalid.path.pb";
    options.setProtoSchemaPath(path);
    options.setFullMessageName("some.message.Name");
    IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> PubsubProtoToBigQuery.getDescriptor(options));
    assertThat(exception).hasMessageThat().contains(path);
}
Also used : PubSubProtoToBigQueryOptions(com.google.cloud.teleport.v2.templates.PubsubProtoToBigQuery.PubSubProtoToBigQueryOptions) Test(org.junit.Test)

Example 69 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

Class PubsubProtoToBigQueryTest, method testGetDescriptorWithInvalidProtoSchemaContents.

@Test
public void testGetDescriptorWithInvalidProtoSchemaContents() {
    PubSubProtoToBigQueryOptions options = getOptions();
    options.setProtoSchemaPath(Resources.getResource("invalid_proto_schema.pb").toString());
    options.setFullMessageName("some.message.Name");
    IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> PubsubProtoToBigQuery.getDescriptor(options));
    assertThat(exception).hasCauseThat().isInstanceOf(InvalidProtocolBufferException.class);
}
Also used : PubSubProtoToBigQueryOptions(com.google.cloud.teleport.v2.templates.PubsubProtoToBigQuery.PubSubProtoToBigQueryOptions) Test(org.junit.Test)
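
For completeness, a hedged sketch of the corresponding success path. The resource name, the message name, and the assumption that getDescriptor returns a com.google.protobuf.Descriptors.Descriptor are illustrative rather than taken from the tests above, and the snippet assumes the same test class context (getOptions()).

// Hypothetical success case: a readable schema file that contains the named message.
// The resource name, message name, and Descriptor return type are assumptions.
PubSubProtoToBigQueryOptions options = getOptions();
options.setProtoSchemaPath(Resources.getResource("valid_proto_schema.pb").toString());
options.setFullMessageName("some.message.Name");
Descriptors.Descriptor descriptor = PubsubProtoToBigQuery.getDescriptor(options);
assertThat(descriptor.getFullName()).isEqualTo("some.message.Name");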

Example 70 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

Class PubSubCdcToBigQuery, method run.

/**
 * Runs the pipeline to completion with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(Options options) {
    Pipeline pipeline = Pipeline.create(options);
    DeadLetterQueueManager dlqManager = buildDlqManager(options);
    String gcsOutputDateTimeDirectory = null;
    if (options.getDeadLetterQueueDirectory() != null) {
        gcsOutputDateTimeDirectory = dlqManager.getRetryDlqDirectory() + "YYYY/MM/DD/HH/mm/";
    }
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(CODER.getEncodedTypeDescriptor(), CODER);
    coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
    InputUDFToTableRow<String> failsafeTableRowTransformer =
        new InputUDFToTableRow<String>(
            options.getJavascriptTextTransformGcsPath(),
            options.getJavascriptTextTransformFunctionName(),
            options.getPythonTextTransformGcsPath(),
            options.getPythonTextTransformFunctionName(),
            options.getRuntimeRetries(),
            FAILSAFE_ELEMENT_CODER);
    BigQueryTableConfigManager bqConfigManager =
        new BigQueryTableConfigManager(
            (String) options.as(GcpOptions.class).getProject(),
            (String) options.getOutputDatasetTemplate(),
            (String) options.getOutputTableNameTemplate(),
            (String) options.getOutputTableSpec());
    /*
     * Steps:
     *  1) Read messages in from Pub/Sub
     *  2) Transform the PubsubMessages into TableRows
     *     - Transform message payload via UDF
     *     - Convert UDF result to TableRow objects
     *  3) Write successful records out to BigQuery
     *     - Automap new objects to BigQuery if enabled
     *     - Write records to BigQuery tables
     *  4) Write failed records out to BigQuery
     */
    /*
     * Step #1: Read messages in from Pub/Sub
     */
    PCollection<PubsubMessage> messages = pipeline.apply("ReadPubSubSubscription", PubsubIO.readMessagesWithAttributes().fromSubscription(options.getInputSubscription()));
    PCollection<FailsafeElement<String, String>> jsonRecords;
    if (options.getDeadLetterQueueDirectory() != null) {
        PCollection<FailsafeElement<String, String>> failsafeMessages = messages.apply("ConvertPubSubToFailsafe", ParDo.of(new PubSubToFailSafeElement()));
        PCollection<FailsafeElement<String, String>> dlqJsonRecords = pipeline.apply(dlqManager.dlqReconsumer()).apply(ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {

            @ProcessElement
            public void process(@Element String input, OutputReceiver<FailsafeElement<String, String>> receiver) {
                receiver.output(FailsafeElement.of(input, input));
            }
        })).setCoder(FAILSAFE_ELEMENT_CODER);
        jsonRecords = PCollectionList.of(failsafeMessages).and(dlqJsonRecords).apply(Flatten.pCollections());
    } else {
        jsonRecords = messages.apply("ConvertPubSubToFailsafe", ParDo.of(new PubSubToFailSafeElement()));
    }
    PCollectionTuple convertedTableRows =
        jsonRecords
            .apply(
                Reshuffle.<FailsafeElement<String, String>>viaRandomKey()
                    .withNumBuckets(options.getThreadCount()))
            .apply("ApplyUdfAndConvertToTableRow", failsafeTableRowTransformer);
    /*
     * Step #3: Write the successful records out to BigQuery
     *   Either extract table destination only
     *   or extract table destination and auto-map new columns
     */
    PCollection<KV<TableId, TableRow>> tableEvents;
    if (options.getAutoMapTables()) {
        tableEvents =
            convertedTableRows
                .get(failsafeTableRowTransformer.transformOut)
                .apply(
                    "Map Data to BigQuery Tables",
                    new BigQueryMappers(bqConfigManager.getProjectId())
                        .buildBigQueryTableMapper(
                            bqConfigManager.getDatasetTemplate(), bqConfigManager.getTableTemplate())
                        .withDefaultSchemaFromGCS(options.getSchemaFilePath()));
    } else {
        tableEvents =
            convertedTableRows
                .get(failsafeTableRowTransformer.transformOut)
                .apply(
                    "ExtractBigQueryTableDestination",
                    BigQueryDynamicConverters.extractTableRowDestination(
                        bqConfigManager.getProjectId(),
                        bqConfigManager.getDatasetTemplate(),
                        bqConfigManager.getTableTemplate()));
    }
    /*
     * Step #3: Cont.
     *    - Write rows out to BigQuery
     */
    // TODO(https://github.com/apache/beam/pull/12004): Switch out alwaysRetry
    WriteResult writeResult =
        tableEvents.apply(
            "WriteSuccessfulRecords",
            BigQueryIO.<KV<TableId, TableRow>>write()
                .to(new BigQueryDynamicConverters().bigQueryDynamicDestination())
                .withFormatFunction(element -> element.getValue())
                .withoutValidation()
                .withCreateDisposition(CreateDisposition.CREATE_NEVER)
                .withWriteDisposition(WriteDisposition.WRITE_APPEND)
                .withExtendedErrorInfo()
                .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                .withFailedInsertRetryPolicy(InsertRetryPolicy.alwaysRetry()));
    // TODO: Cover tableRowRecords.get(TRANSFORM_DEADLETTER_OUT) error values
    if (options.getDeadLetterQueueDirectory() != null) {
        writeResult
            .getFailedInsertsWithErr()
            .apply(
                "DLQ: Write Insert Failures to GCS",
                MapElements.via(new BigQueryDeadLetterQueueSanitizer()))
            .apply(
                "Creating " + options.getWindowDuration() + " Window",
                Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))
            .apply(
                "DLQ: Write File(s)",
                TextIO.write()
                    .withWindowedWrites()
                    .withNumShards(20)
                    .to(new WindowedFilenamePolicy(gcsOutputDateTimeDirectory, "error", "-SSSSS-of-NNNNN", ".json"))
                    .withTempDirectory(
                        FileBasedSink.convertToFileResourceIfPossible(options.getDeadLetterQueueDirectory())));
        PCollection<FailsafeElement<String, String>> transformDeadletter =
            PCollectionList.of(
                    ImmutableList.of(
                        convertedTableRows.get(failsafeTableRowTransformer.udfDeadletterOut),
                        convertedTableRows.get(failsafeTableRowTransformer.transformDeadletterOut)))
                .apply("Flatten", Flatten.pCollections())
                .apply(
                    "Creating " + options.getWindowDuration() + " Window",
                    Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))));
        PCollection<String> dlqWindowing =
            transformDeadletter
                .apply("Sanitize records", MapElements.via(new StringDeadLetterQueueSanitizer()))
                .setCoder(StringUtf8Coder.of());
        dlqWindowing.apply(
            "DLQ: Write File(s)",
            TextIO.write()
                .withWindowedWrites()
                .withNumShards(20)
                .to(new WindowedFilenamePolicy(gcsOutputDateTimeDirectory, "error", "-SSSSS-of-NNNNN", ".json"))
                .withTempDirectory(
                    FileBasedSink.convertToFileResourceIfPossible(gcsOutputDateTimeDirectory + "tmp/")));
    } else {
        PCollection<FailsafeElement<String, String>> failedInserts =
            writeResult
                .getFailedInsertsWithErr()
                .apply(
                    "WrapInsertionErrors",
                    MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                        .via((BigQueryInsertError e) -> BigQueryConverters.wrapBigQueryInsertError(e)))
                .setCoder(FAILSAFE_ELEMENT_CODER);
        /*
         * Step #4: Write records that failed table row transformation
         * or conversion out to BigQuery deadletter table.
         */
        PCollectionList.of(
                ImmutableList.of(
                    convertedTableRows.get(failsafeTableRowTransformer.udfDeadletterOut),
                    convertedTableRows.get(failsafeTableRowTransformer.transformDeadletterOut)))
            .apply("Flatten", Flatten.pCollections())
            .apply(
                "WriteFailedRecords",
                ErrorConverters.WriteStringMessageErrors.newBuilder()
                    .setErrorRecordsTable(
                        BigQueryConverters.maybeUseDefaultDeadletterTable(
                            options.getOutputDeadletterTable(),
                            bqConfigManager.getOutputTableSpec(),
                            DEFAULT_DEADLETTER_TABLE_SUFFIX))
                    .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
                    .build());
        // 5) Insert records that failed insert into deadletter table
        failedInserts.apply(
            "WriteFailedRecords",
            ErrorConverters.WriteStringMessageErrors.newBuilder()
                .setErrorRecordsTable(
                    BigQueryConverters.maybeUseDefaultDeadletterTable(
                        options.getOutputDeadletterTable(),
                        bqConfigManager.getOutputTableSpec(),
                        DEFAULT_DEADLETTER_TABLE_SUFFIX))
                .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
                .build());
    }
    return pipeline.run();
}
Also used : TableId(com.google.cloud.bigquery.TableId) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) PubSubToFailSafeElement(com.google.cloud.teleport.v2.transforms.PubSubToFailSafeElement) PipelineResult(org.apache.beam.sdk.PipelineResult) TableId(com.google.cloud.bigquery.TableId) InsertRetryPolicy(org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy) LoggerFactory(org.slf4j.LoggerFactory) InputUDFOptions(com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFOptions) Description(org.apache.beam.sdk.options.Description) PubsubMessage(org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage) PCollectionList(org.apache.beam.sdk.values.PCollectionList) TableRow(com.google.api.services.bigquery.model.TableRow) Window(org.apache.beam.sdk.transforms.windowing.Window) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Flatten(org.apache.beam.sdk.transforms.Flatten) MapElements(org.apache.beam.sdk.transforms.MapElements) ErrorConverters(com.google.cloud.teleport.v2.transforms.ErrorConverters) DeadLetterQueueManager(com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) WindowedFilenamePolicy(com.google.cloud.teleport.v2.io.WindowedFilenamePolicy) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) BigQueryDynamicConverters(com.google.cloud.teleport.v2.transforms.BigQueryDynamicConverters) ParDo(org.apache.beam.sdk.transforms.ParDo) StringDeadLetterQueueSanitizer(com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) DurationUtils(com.google.cloud.teleport.v2.utils.DurationUtils) KV(org.apache.beam.sdk.values.KV) Default(org.apache.beam.sdk.options.Default) BigQueryConverters(com.google.cloud.teleport.v2.transforms.BigQueryConverters) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) PubsubMessageWithAttributesCoder(org.apache.beam.sdk.io.gcp.pubsub.PubsubMessageWithAttributesCoder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) FileBasedSink(org.apache.beam.sdk.io.FileBasedSink) FailsafeElementCoder(com.google.cloud.teleport.v2.coders.FailsafeElementCoder) Pipeline(org.apache.beam.sdk.Pipeline) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) BigQueryTableConfigManager(com.google.cloud.teleport.v2.transforms.BigQueryConverters.BigQueryTableConfigManager) BigQueryInsertError(org.apache.beam.sdk.io.gcp.bigquery.BigQueryInsertError) DoFn(org.apache.beam.sdk.transforms.DoFn) Reshuffle(org.apache.beam.sdk.transforms.Reshuffle) Logger(org.slf4j.Logger) BigQueryIO(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO) BigQueryDeadLetterQueueSanitizer(com.google.cloud.teleport.v2.cdc.dlq.BigQueryDeadLetterQueueSanitizer) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) WriteResult(org.apache.beam.sdk.io.gcp.bigquery.WriteResult) PCollection(org.apache.beam.sdk.values.PCollection) PubsubIO(org.apache.beam.sdk.io.gcp.pubsub.PubsubIO) WriteDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition) BigQueryMappers(com.google.cloud.teleport.v2.cdc.mappers.BigQueryMappers) ResourceUtils(com.google.cloud.teleport.v2.utils.ResourceUtils) InputUDFToTableRow(com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFToTableRow) TextIO(org.apache.beam.sdk.io.TextIO) 
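
As the Javadoc for run() notes, the call returns without blocking. Below is a minimal sketch of how a caller might build the template's Options and wait for completion; the main method and option parsing follow the standard Beam pattern and are not quoted from the template.

public static void main(String[] args) {
    // Parse command-line flags into the template's Options interface (standard Beam pattern).
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // run() returns immediately; block here until the pipeline finishes, per the Javadoc above.
    PipelineResult result = PubSubCdcToBigQuery.run(options);
    result.waitUntilFinish();
}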

Aggregations

Test (org.junit.Test): 63
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 25
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 20
Pipeline (org.apache.beam.sdk.Pipeline): 19
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 19
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 15
GenericRecord (org.apache.avro.generic.GenericRecord): 12
Category (org.junit.experimental.categories.Category): 12
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter): 10
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 10
PubSubToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions): 9
TableRow (com.google.api.services.bigquery.model.TableRow): 8
DataplexClient (com.google.cloud.teleport.v2.clients.DataplexClient): 8
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions): 8
KV (org.apache.beam.sdk.values.KV): 8
ArrayList (java.util.ArrayList): 7
ElasticsearchWriteOptions (com.google.cloud.teleport.v2.elasticsearch.options.ElasticsearchWriteOptions): 6
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 6
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions): 6
PubSubProtoToBigQueryOptions (com.google.cloud.teleport.v2.templates.PubsubProtoToBigQuery.PubSubProtoToBigQueryOptions): 6