use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class KafkaToBigQuery method run.
/**
* Runs the pipeline to completion with the specified options. This method does not wait until the
* pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
* object to block until the pipeline is finished running if blocking programmatic execution is
* required.
*
* @param options The execution options.
* @return The pipeline result.
*/
public static PipelineResult run(KafkaToBQOptions options) {
// Create the pipeline
Pipeline pipeline = Pipeline.create(options);
// Register the coder for pipeline
FailsafeElementCoder<KV<String, String>, String> coder = FailsafeElementCoder.of(KvCoder.of(NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of())), NullableCoder.of(StringUtf8Coder.of()));
CoderRegistry coderRegistry = pipeline.getCoderRegistry();
coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
List<String> topicsList;
if (options.getKafkaReadTopics() != null) {
topicsList = new ArrayList<>(Arrays.asList(options.getKafkaReadTopics().split(",")));
} else if (options.getInputTopics() != null) {
topicsList = new ArrayList<>(Arrays.asList(options.getInputTopics().split(",")));
} else {
throw new IllegalArgumentException("Please Provide --kafkaReadTopic");
}
String bootstrapServers;
if (options.getReadBootstrapServers() != null) {
bootstrapServers = options.getReadBootstrapServers();
} else if (options.getBootstrapServers() != null) {
bootstrapServers = options.getBootstrapServers();
} else {
throw new IllegalArgumentException("Please Provide --bootstrapServers");
}
/*
* Steps:
* 1) Read messages in from Kafka
* 2) Transform the messages into TableRows
* - Transform message payload via UDF
* - Convert UDF result to TableRow objects
* 3) Write successful records out to BigQuery
* 4) Write failed records out to BigQuery
*/
PCollectionTuple convertedTableRows = pipeline.apply("ReadFromKafka", readFromKafka(bootstrapServers, topicsList, ImmutableMap.of(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"), null)).apply("ConvertMessageToTableRow", new MessageToTableRow(options));
/*
* Step #3: Write the successful records out to BigQuery
*/
WriteResult writeResult = convertedTableRows.get(TRANSFORM_OUT).apply("WriteSuccessfulRecords", BigQueryIO.writeTableRows().withoutValidation().withCreateDisposition(CreateDisposition.CREATE_NEVER).withWriteDisposition(WriteDisposition.WRITE_APPEND).withExtendedErrorInfo().withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS).withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()).to(options.getOutputTableSpec()));
/*
* Step 3 Contd.
* Elements that failed inserts into BigQuery are extracted and converted to FailsafeElement
*/
PCollection<FailsafeElement<String, String>> failedInserts = writeResult.getFailedInsertsWithErr().apply("WrapInsertionErrors", MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor()).via(KafkaToBigQuery::wrapBigQueryInsertError)).setCoder(FAILSAFE_ELEMENT_CODER);
/*
* Step #4: Write failed records out to BigQuery
*/
PCollectionList.of(convertedTableRows.get(UDF_DEADLETTER_OUT)).and(convertedTableRows.get(TRANSFORM_DEADLETTER_OUT)).apply("Flatten", Flatten.pCollections()).apply("WriteTransformationFailedRecords", WriteKafkaMessageErrors.newBuilder().setErrorRecordsTable(ObjectUtils.firstNonNull(options.getOutputDeadletterTable(), options.getOutputTableSpec() + DEFAULT_DEADLETTER_TABLE_SUFFIX)).setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA).build());
/*
* Step #5: Insert records that failed BigQuery inserts into a deadletter table.
*/
failedInserts.apply("WriteInsertionFailedRecords", ErrorConverters.WriteStringMessageErrors.newBuilder().setErrorRecordsTable(ObjectUtils.firstNonNull(options.getOutputDeadletterTable(), options.getOutputTableSpec() + DEFAULT_DEADLETTER_TABLE_SUFFIX)).setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA).build());
return pipeline.run();
}
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class KafkaToBigQueryTest method testKafkaToBigQueryE2E.
/**
* Tests the {@link KafkaToBigQuery} pipeline end-to-end.
*/
@Test
public void testKafkaToBigQueryE2E() throws Exception {
// Test input
final String key = "{\"id\": \"1001\"}";
final String badKey = "{\"id\": \"1002\"}";
final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
final String badPayload = "{\"tickets\": \"AMZ\", \"proctor\": 007";
final KV<String, String> message = KV.of(key, payload);
final KV<String, String> badMessage = KV.of(badKey, badPayload);
final Instant timestamp = new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();
final FailsafeElementCoder<KV<String, String>, String> coder = FailsafeElementCoder.of(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), StringUtf8Coder.of());
CoderRegistry coderRegistry = pipeline.getCoderRegistry();
coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
KafkaToBigQuery.KafkaToBQOptions options = PipelineOptionsFactory.create().as(KafkaToBigQuery.KafkaToBQOptions.class);
options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
options.setJavascriptTextTransformFunctionName("transform");
// Build pipeline
PCollectionTuple transformOut = pipeline.apply("CreateInput", Create.of(message).withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))).apply("ConvertMessageToTableRow", new MessageToTableRow(options));
// Assert
PAssert.that(transformOut.get(KafkaToBigQuery.UDF_DEADLETTER_OUT)).empty();
PAssert.that(transformOut.get(KafkaToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
PAssert.that(transformOut.get(KafkaToBigQuery.TRANSFORM_OUT)).satisfies(collection -> {
TableRow result = collection.iterator().next();
assertThat(result.get("ticker"), is(equalTo("GOOGL")));
assertThat(result.get("price"), is(equalTo(1006.94)));
return null;
});
// Execute pipeline
pipeline.run();
// Build pipeline with malformed payload
PCollectionTuple badTransformOut = pipeline.apply("CreateBadInput", Create.of(badMessage).withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))).apply("ConvertMessageToTableRow", new MessageToTableRow(options));
// Assert
PAssert.that(badTransformOut.get(KafkaToBigQuery.UDF_DEADLETTER_OUT)).satisfies(collection -> {
FailsafeElement badResult = collection.iterator().next();
assertThat(badResult.getOriginalPayload(), is(equalTo(badMessage)));
assertThat(badResult.getPayload(), is(equalTo(badPayload)));
return null;
});
PAssert.that(badTransformOut.get(KafkaToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
PAssert.that(badTransformOut.get(KafkaToBigQuery.TRANSFORM_OUT)).empty();
// Execute pipeline
pipeline.run();
}
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DeleteBigQueryDataFnTest method testTransform_withDeleteSourceDataEnabled_doesntTruncateSpecialPartitions.
/**
* Test that DeleteBigQueryDataFn doesn't attempt to delete special BigQuery partitions even if
* {@code deleteSourceData = true}.
*
* <p>As per <a
* href="https://cloud.google.com/bigquery/docs/managing-partitioned-tables#delete_a_partition">
* this documentation</a>, special partitions "__NULL__" and "__UNPARTITIONED__" cannot be
* deleted.
*/
@Test
@Category(NeedsRunner.class)
public void testTransform_withDeleteSourceDataEnabled_doesntTruncateSpecialPartitions() {
Options options = TestPipeline.testingPipelineOptions().as(Options.class);
options.setDeleteSourceData(true);
BigQueryTablePartition.Builder builder = BigQueryTablePartition.builder().setLastModificationTime(System.currentTimeMillis() * 1000);
BigQueryTablePartition p1 = builder.setPartitionName("__NULL__").build();
BigQueryTablePartition p2 = builder.setPartitionName("__UNPARTITIONED__").build();
BigQueryTablePartition p3 = builder.setPartitionName("NORMAL_PARTITION").build();
BigQueryTable t1 = table.toBuilder().setPartitions(Arrays.asList(p1, p2, p3)).setPartitioningColumn("column-name-doesnt-matter").build();
DeleteBigQueryDataFn fn = new DeleteBigQueryDataFn().withTestBqClientFactory(() -> bqMock);
testPipeline.apply("CreateInput", Create.of(KV.of(t1, p1), KV.of(t1, p2), KV.of(t1, p3)).withCoder(fnCoder)).apply("TestDeleteBigQueryDataFn", ParDo.of(fn));
testPipeline.run(options);
verify(bqMock, times(1)).delete(TableId.of("pr1", "d1", "t1$NORMAL_PARTITION"));
verifyNoMoreInteractions(bqMock);
}
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DeleteBigQueryDataFnTest method testTransform_withDeleteSourceDataEnabled_truncatesData.
@Test
@Category(NeedsRunner.class)
public void testTransform_withDeleteSourceDataEnabled_truncatesData() throws InterruptedException {
Options options = TestPipeline.testingPipelineOptions().as(Options.class);
options.setDeleteSourceData(true);
PCollection<Void> actual = testPipeline.apply("CreateInput", Create.of(KV.of(partitionedTable, partition), KV.of(table, (BigQueryTablePartition) null)).withCoder(fnCoder)).apply("TestDeleteBigQueryDataFn", ParDo.of(fnUnderTest));
PAssert.that(actual).empty();
testPipeline.run(options);
verify(bqMock, times(1)).query(QueryJobConfiguration.newBuilder("truncate table `pr1.d1.t1`").build());
verify(bqMock, times(1)).delete(TableId.of("pr1", "d1", "t1p$p1"));
verifyNoMoreInteractions(bqMock);
}
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexBigQueryToGcsFilterTest method test_whenPartitionedTableHasNoPartitions_filterExcludesTable.
@Test
public void test_whenPartitionedTableHasNoPartitions_filterExcludesTable() {
options.setTables(null);
options.setExportDataModifiedBeforeDateTime(null);
Filter f = new DataplexBigQueryToGcsFilter(options, new ArrayList<String>());
assertThat(f.shouldSkipPartitionedTable(table(), Collections.emptyList())).isTrue();
}
Aggregations