Example 21 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.

The class KafkaToBigQuery, method run.

/**
 * Runs the pipeline to completion with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(KafkaToBQOptions options) {
    // Create the pipeline.
    Pipeline pipeline = Pipeline.create(options);
    // Register the coder used for FailsafeElement records throughout the pipeline.
    FailsafeElementCoder<KV<String, String>, String> coder =
        FailsafeElementCoder.of(
            KvCoder.of(
                NullableCoder.of(StringUtf8Coder.of()),
                NullableCoder.of(StringUtf8Coder.of())),
            NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    List<String> topicsList;
    if (options.getKafkaReadTopics() != null) {
        topicsList = new ArrayList<>(Arrays.asList(options.getKafkaReadTopics().split(",")));
    } else if (options.getInputTopics() != null) {
        topicsList = new ArrayList<>(Arrays.asList(options.getInputTopics().split(",")));
    } else {
        throw new IllegalArgumentException("Please Provide --kafkaReadTopic");
    }
    String bootstrapServers;
    if (options.getReadBootstrapServers() != null) {
        bootstrapServers = options.getReadBootstrapServers();
    } else if (options.getBootstrapServers() != null) {
        bootstrapServers = options.getBootstrapServers();
    } else {
        throw new IllegalArgumentException("Please Provide --bootstrapServers");
    }
    /*
     * Steps:
     *  1) Read messages in from Kafka
     *  2) Transform the messages into TableRows
     *     - Transform message payload via UDF
     *     - Convert UDF result to TableRow objects
     *  3) Write successful records out to BigQuery
     *  4) Write transform-failed records out to BigQuery
     *  5) Write records that failed BigQuery insertion out to a deadletter table
     */
    /*
     * Steps #1 & #2: Read messages from Kafka and convert them into TableRows.
     */
    PCollectionTuple convertedTableRows =
        pipeline
            .apply(
                "ReadFromKafka",
                readFromKafka(
                    bootstrapServers,
                    topicsList,
                    ImmutableMap.of(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"),
                    null))
            .apply("ConvertMessageToTableRow", new MessageToTableRow(options));
    /*
     * Step #3: Write the successful records out to BigQuery.
     */
    WriteResult writeResult =
        convertedTableRows
            .get(TRANSFORM_OUT)
            .apply(
                "WriteSuccessfulRecords",
                BigQueryIO.writeTableRows()
                    .withoutValidation()
                    .withCreateDisposition(CreateDisposition.CREATE_NEVER)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND)
                    .withExtendedErrorInfo()
                    .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                    .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                    .to(options.getOutputTableSpec()));
    /*
     * Step #3 (contd.): Extract the elements that failed insertion into BigQuery
     * and convert them to FailsafeElements.
     */
    PCollection<FailsafeElement<String, String>> failedInserts =
        writeResult
            .getFailedInsertsWithErr()
            .apply(
                "WrapInsertionErrors",
                MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                    .via(KafkaToBigQuery::wrapBigQueryInsertError))
            .setCoder(FAILSAFE_ELEMENT_CODER);
    /*
     * Step #4: Write records that failed the UDF or TableRow transforms out to BigQuery.
     */
    PCollectionList.of(convertedTableRows.get(UDF_DEADLETTER_OUT))
        .and(convertedTableRows.get(TRANSFORM_DEADLETTER_OUT))
        .apply("Flatten", Flatten.pCollections())
        .apply(
            "WriteTransformationFailedRecords",
            WriteKafkaMessageErrors.newBuilder()
                .setErrorRecordsTable(
                    ObjectUtils.firstNonNull(
                        options.getOutputDeadletterTable(),
                        options.getOutputTableSpec() + DEFAULT_DEADLETTER_TABLE_SUFFIX))
                .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
                .build());
    /*
     * Step #5: Insert records that failed BigQuery inserts into a deadletter table.
     */
    failedInserts.apply(
        "WriteInsertionFailedRecords",
        ErrorConverters.WriteStringMessageErrors.newBuilder()
            .setErrorRecordsTable(
                ObjectUtils.firstNonNull(
                    options.getOutputDeadletterTable(),
                    options.getOutputTableSpec() + DEFAULT_DEADLETTER_TABLE_SUFFIX))
            .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
            .build());
    return pipeline.run();
}
Also used : ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) WriteResult(org.apache.beam.sdk.io.gcp.bigquery.WriteResult) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple)
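
As the Javadoc above notes, run() launches the pipeline without blocking. Below is a minimal sketch of a blocking entry point, assuming KafkaToBQOptions is registered with Beam's PipelineOptionsFactory; this hypothetical main mirrors the usual template entry point rather than the repository's exact code.

import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public static void main(String[] args) {
    // Parse command-line flags into the template's options class.
    KafkaToBQOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(KafkaToBQOptions.class);
    // run() returns as soon as the pipeline is submitted...
    PipelineResult result = run(options);
    // ...so block here if synchronous execution is required.
    result.waitUntilFinish();
}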

Example 22 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.

The class DeleteBigQueryDataFnTest, method testTransform_withDeleteSourceDataEnabled_truncatesData.

@Test
@Category(NeedsRunner.class)
public void testTransform_withDeleteSourceDataEnabled_truncatesData() throws InterruptedException {
    Options options = TestPipeline.testingPipelineOptions().as(Options.class);
    options.setDeleteSourceData(true);
    PCollection<Void> actual =
        testPipeline
            .apply(
                "CreateInput",
                Create.of(
                        KV.of(partitionedTable, partition),
                        KV.of(table, (BigQueryTablePartition) null))
                    .withCoder(fnCoder))
            .apply("TestDeleteBigQueryDataFn", ParDo.of(fnUnderTest));
    PAssert.that(actual).empty();
    testPipeline.run(options);
    verify(bqMock, times(1)).query(QueryJobConfiguration.newBuilder("truncate table `pr1.d1.t1`").build());
    verify(bqMock, times(1)).delete(TableId.of("pr1", "d1", "t1p$p1"));
    verifyNoMoreInteractions(bqMock);
}
Also used : Options(com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
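
For context, here is a minimal sketch of the Mockito fixture the verifications above rely on. How bqMock is wired into fnUnderTest is repository-specific, so the field names and stubbed return value shown are assumptions, not the test's actual setup code.

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.TableId;
import org.junit.Before;

// Hypothetical fixture: a mocked BigQuery client that the DoFn under test calls into.
private final BigQuery bqMock = mock(BigQuery.class);

@Before
public void setUp() {
    // Stub the partition delete the DoFn is expected to issue (see the verify above).
    when(bqMock.delete(TableId.of("pr1", "d1", "t1p$p1"))).thenReturn(true);
}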

Example 23 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.

The class DataplexBigQueryToGcsFilterTest, method test_whenPartitionedTableHasNoPartitions_filterExcludesTable.

@Test
public void test_whenPartitionedTableHasNoPartitions_filterExcludesTable() {
    options.setTables(null);
    options.setExportDataModifiedBeforeDateTime(null);
    Filter f = new DataplexBigQueryToGcsFilter(options, new ArrayList<String>());
    // A partitioned table with no partitions has nothing to export, so it is skipped.
    assertThat(f.shouldSkipPartitionedTable(table(), Collections.emptyList())).isTrue();
}
Also used : Filter(com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter) Test(org.junit.Test)

Example 24 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.

The class DataplexBigQueryToGcsFilterTest, method test_whenNoFilterOptions_filterAcceptsAllTablesAndPartitions.

@Test
public void test_whenNoFilterOptions_filterAcceptsAllTablesAndPartitions() {
    BigQueryTable.Builder t = table();
    BigQueryTablePartition p = partition().build();
    options.setTables(null);
    options.setExportDataModifiedBeforeDateTime(null);
    Filter f = new DataplexBigQueryToGcsFilter(options, new ArrayList<String>());
    assertThat(f.shouldSkipUnpartitionedTable(t)).isFalse();
    assertThat(f.shouldSkipPartitionedTable(t, Collections.singletonList(p))).isFalse();
    assertThat(f.shouldSkipPartition(t, p)).isFalse();
}
Also used : BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) Filter(com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) Test(org.junit.Test)

Example 25 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.

The class BigQueryToParquetTest, method testReadSessionFactoryBadTable.

/**
 * Tests that {@link ReadSessionFactory} throws an exception when an invalid table reference is
 * provided.
 */
@Test(expected = IllegalArgumentException.class)
public void testReadSessionFactoryBadTable() {
    // Test input
    final String badTableRef = "fantasmic-999999;great_data.table";
    final TableReadOptions tableReadOptions = TableReadOptions.newBuilder().build();
    ReadSessionFactory trsf = new ReadSessionFactory();
    // The malformed reference should cause create() to throw IllegalArgumentException.
    ReadSession trs = trsf.create(client, badTableRef, tableReadOptions);
}
Also used : ReadSessionFactory(com.google.cloud.teleport.v2.templates.BigQueryToParquet.ReadSessionFactory) ReadSession(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) TableReadOptions(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions) Test(org.junit.Test)
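
For contrast with the malformed reference above (a semicolon where a delimiter belongs), here is a sketch of the happy path using the same create() call shown in the test; the colon-delimited project:dataset.table form and the reference value itself are assumptions for illustration.

// Hypothetical well-formed counterpart to the failing test above.
final String goodTableRef = "fantasmic-999999:great_data.table";
final TableReadOptions tableReadOptions = TableReadOptions.newBuilder().build();
ReadSession session = new ReadSessionFactory().create(client, goodTableRef, tableReadOptions);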

Aggregations

Test (org.junit.Test): 26
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 15
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 12
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter): 10
ArrayList (java.util.ArrayList): 10
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 9
Pipeline (org.apache.beam.sdk.Pipeline): 9
Ddl (com.google.cloud.teleport.v2.templates.spanner.ddl.Ddl): 8
Table (com.google.cloud.teleport.v2.templates.spanner.ddl.Table): 8
Set (java.util.Set): 8
PipelineResult (org.apache.beam.sdk.PipelineResult): 8
TableRow (com.google.api.services.bigquery.model.TableRow): 6
IntegrationTest (com.google.cloud.teleport.v2.spanner.IntegrationTest): 6
PCollection (org.apache.beam.sdk.values.PCollection): 6
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 6
IOException (java.io.IOException): 5
WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult): 5
Timestamp (com.google.cloud.Timestamp): 4
Column (com.google.cloud.teleport.v2.templates.spanner.ddl.Column): 4
IndexColumn (com.google.cloud.teleport.v2.templates.spanner.ddl.IndexColumn): 4