Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
From the class DataplexBigQueryToGcsFilterTest, the method test_whenNoFilterOptions_filterAcceptsAllTablesAndPartitions:
@Test
public void test_whenNoFilterOptions_filterAcceptsAllTablesAndPartitions() {
  BigQueryTable.Builder t = table();
  BigQueryTablePartition p = partition().build();
  options.setTables(null);
  options.setExportDataModifiedBeforeDateTime(null);
  Filter f = new DataplexBigQueryToGcsFilter(options, new ArrayList<String>());
  assertThat(f.shouldSkipUnpartitionedTable(t)).isFalse();
  assertThat(f.shouldSkipPartitionedTable(t, Collections.singletonList(p))).isFalse();
  assertThat(f.shouldSkipPartition(t, p)).isFalse();
}
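The table() and partition() helpers used above are test fixtures defined elsewhere in DataplexBigQueryToGcsFilterTest. A minimal sketch of what they might look like, assuming AutoValue-style builders on BigQueryTable and BigQueryTablePartition; the setter names and values are illustrative, not the project's actual code:

// Hypothetical fixture helpers; field names and values are assumptions for illustration only.
private static BigQueryTable.Builder table() {
  return BigQueryTable.builder()
      .setTableName("test_table")
      .setLastModificationTime(System.currentTimeMillis() * 1000L);
}

private static BigQueryTablePartition.Builder partition() {
  return BigQueryTablePartition.builder()
      .setPartitionName("p1")
      .setLastModificationTime(System.currentTimeMillis() * 1000L);
}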
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
From the class FileFormatFactory, the method expand:
@Override
public POutput expand(PCollection<KV<String, String>> records) {
  POutput output = null;
  final String errorMessage =
      "Invalid output format:" + outputFileFormat()
          + ". Supported output formats:" + FileFormatFactory.EXPECTED_FILE_FORMAT;
  /*
   * Check that the file format passed by the user is valid.
   */
  if (!WriteToGCSUtility.isValidFileFormat(outputFileFormat())) {
    LOG.info(errorMessage);
    throw new IllegalArgumentException(errorMessage);
  }
  WriteToGCSUtility.FileFormat outputFileFormat =
      WriteToGCSUtility.FileFormat.valueOf(outputFileFormat());
  /*
   * Calls the appropriate builder to perform the PTransform for the user-provided file format.
   */
  switch (outputFileFormat) {
    case TEXT:
      output =
          records.apply("Write Text File(s)",
              WriteToGCSText.newBuilder()
                  .withOutputDirectory(options().getOutputDirectory())
                  .withOutputFilenamePrefix(options().getOutputFilenamePrefix())
                  .setNumShards(options().getNumShards())
                  .withTempLocation(options().getTempLocation())
                  .build());
      break;
    case AVRO:
      output =
          records.apply("Write Avro File(s)",
              WriteToGCSAvro.newBuilder()
                  .withOutputDirectory(options().getOutputDirectory())
                  .withOutputFilenamePrefix(options().getOutputFilenamePrefix())
                  .setNumShards(options().getNumShards())
                  .withTempLocation(options().getTempLocation())
                  .build());
      break;
    case PARQUET:
      output =
          records.apply("Write Parquet File(s)",
              WriteToGCSParquet.newBuilder()
                  .withOutputDirectory(options().getOutputDirectory())
                  .withOutputFilenamePrefix(options().getOutputFilenamePrefix())
                  .setNumShards(options().getNumShards())
                  .build());
      break;
    default:
      LOG.info("Invalid output format:" + outputFileFormat());
      throw new IllegalArgumentException("Invalid output format:" + outputFileFormat());
  }
  return output;
}
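The isValidFileFormat guard above keeps FileFormat.valueOf from throwing an uninformative exception for unsupported values. A minimal sketch of the kind of check it presumably performs; the real WriteToGCSUtility implementation may differ, for example by looking the format up in a map of file suffixes:

// Hypothetical sketch only; not the project's actual WriteToGCSUtility code.
public static boolean isValidFileFormat(String fileFormat) {
  try {
    // valueOf throws for names that are not TEXT, AVRO, or PARQUET.
    WriteToGCSUtility.FileFormat.valueOf(fileFormat);
    return true;
  } catch (IllegalArgumentException | NullPointerException e) {
    // Unknown or null format name.
    return false;
  }
}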
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
From the class KafkaToPubsub, the method run:
/**
 * Runs a pipeline which reads messages from Kafka and writes them to Pub/Sub.
 *
 * @param options arguments to the pipeline
 */
public static PipelineResult run(KafkaToPubsubOptions options) {
  List<String> topicsList = new ArrayList<>(Arrays.asList(options.getInputTopics().split(",")));
  checkArgument(
      topicsList.size() > 0 && topicsList.stream().allMatch((s) -> s.trim().length() > 0),
      "inputTopics cannot be an empty string.");
  List<String> bootstrapServersList =
      new ArrayList<>(Arrays.asList(options.getBootstrapServers().split(",")));
  checkArgument(
      bootstrapServersList.size() > 0
          && bootstrapServersList.stream().allMatch((s) -> s.trim().length() > 0),
      "bootstrapServers cannot be an empty string.");
  // Configure Kafka consumer properties
  Map<String, Object> kafkaConfig = new HashMap<>();
  Map<String, String> sslConfig = null;
  if (options.getSecretStoreUrl() != null && options.getVaultToken() != null) {
    Map<String, Map<String, String>> credentials =
        getKafkaCredentialsFromVault(options.getSecretStoreUrl(), options.getVaultToken());
    kafkaConfig = configureKafka(credentials.get(KafkaPubsubConstants.KAFKA_CREDENTIALS));
    sslConfig = credentials.get(KafkaPubsubConstants.SSL_CREDENTIALS);
  } else {
    LOG.warn(
        "No information to retrieve Kafka credentials was provided. "
            + "Trying to initiate an unauthorized connection.");
  }
  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);
  // Register the coder for pipeline
  FailsafeElementCoder<KV<String, String>, String> coder =
      FailsafeElementCoder.of(
          KvCoder.of(NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of())),
          NullableCoder.of(StringUtf8Coder.of()));
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
  TypeDescriptor<String> stringTypeDescriptor = TypeDescriptors.strings();
  LOG.info(
      "Starting Kafka-To-PubSub Pipeline with parameters bootstrap servers:{} input topics:{}"
          + " output pubsub topic:{} ",
      options.getBootstrapServers(), options.getInputTopics(), options.getOutputTopic());
  /*
   * Steps:
   * 1) Read messages in from Kafka
   * 2) Transform message payload via UDF
   * 3) Write successful records out to Pub/Sub
   * 4) Write failed records out to Pub/Sub dead-letter topic
   */
  PCollectionTuple appliedUdf =
      pipeline
          .apply("readFromKafka",
              readFromKafka(options.getBootstrapServers(), topicsList, kafkaConfig, sslConfig))
          .apply("applyUDF", new FormatTransform.UdfProcess(options));
  /* Step #3: Write the successful records out to Pub/Sub */
  appliedUdf
      .get(KafkaPubsubConstants.UDF_OUT)
      .apply("getSuccessUDFOutElements",
          MapElements.into(stringTypeDescriptor).via(FailsafeElement::getPayload))
      .setCoder(NullableCoder.of(StringUtf8Coder.of()))
      .apply("writeSuccessMessages", PubsubIO.writeStrings().to(options.getOutputTopic()));
  /* Step #4: Write failed messages out to Pub/Sub */
  if (options.getOutputDeadLetterTopic() != null) {
    appliedUdf
        .get(KafkaPubsubConstants.UDF_DEADLETTER_OUT)
        .apply("getFailedMessages",
            MapElements.into(TypeDescriptors.kvs(stringTypeDescriptor, stringTypeDescriptor))
                .via(FailsafeElement::getOriginalPayload))
        .apply("extractMessageValues",
            MapElements.into(stringTypeDescriptor).via(KV<String, String>::getValue))
        .setCoder(NullableCoder.of(StringUtf8Coder.of()))
        .apply("writeFailureMessages",
            PubsubIO.writeStrings().to(options.getOutputDeadLetterTopic()));
  }
  return pipeline.run();
}
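For context, a Dataflow template entry point typically parses KafkaToPubsubOptions from command-line arguments and hands them to run. A minimal sketch of such a main method, assuming the standard Beam options pattern rather than quoting the template's actual code:

public static void main(String[] args) {
  // Parse and validate pipeline options from the command line (standard Beam pattern).
  KafkaToPubsubOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(KafkaToPubsubOptions.class);
  run(options);
}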
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
From the class PubsubProtoToBigQuery, the method runUdf:
/**
* Handles running the UDF.
*
* <p>If {@code options} is configured so as not to run the UDF, then the UDF will not be called.
*
* <p>This may add a branch to the pipeline for outputting failed UDF records to an unprocessed
* topic.
*
* @param jsonCollection {@link PCollection} of JSON strings for use as input to the UDF
* @param options the options containing info on running the UDF
* @return the {@link PCollection} of UDF output as JSON or {@code jsonCollection} if UDF not
* called
*/
@VisibleForTesting
static PCollection<String> runUdf(PCollection<String> jsonCollection, PubSubProtoToBigQueryOptions options) {
  // If the UDF file isn't set, running the UDF isn't intended; simply return the input as "success" output.
  if (Strings.isNullOrEmpty(options.getJavascriptTextTransformGcsPath())) {
    return jsonCollection;
  }
  // The UDF file is set, so the function name must also have a value.
  if (Strings.isNullOrEmpty(options.getJavascriptTextTransformFunctionName())) {
    throw new IllegalArgumentException("JavaScript function name cannot be null or empty if file is set");
  }
  PCollectionTuple maybeSuccess = jsonCollection.apply("Run UDF", new RunUdf(options));
  maybeSuccess
      .get(UDF_FAILURE_TAG)
      .setCoder(FAILSAFE_CODER)
      .apply("Get UDF Failures",
          ConvertFailsafeElementToPubsubMessage.<String, String>builder()
              .setOriginalPayloadSerializeFn(s -> ArrayUtils.toObject(s.getBytes(UTF_8)))
              .setErrorMessageAttributeKey("udfErrorMessage")
              .build())
      .apply("Write Failed UDF", writeUdfFailures(options));
  return maybeSuccess
      .get(UDF_SUCCESS_TAG)
      .setCoder(FAILSAFE_CODER)
      .apply("Get UDF Output",
          MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
      .setCoder(NullableCoder.of(StringUtf8Coder.of()));
}
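An illustrative way to exercise the UDF branch, for example from a test in the same package; the option setters are assumed to mirror the getters used above, and the GCS path and function name are placeholder values:

// Illustrative only: setter names are assumed to mirror the getters used above.
PubSubProtoToBigQueryOptions options =
    PipelineOptionsFactory.create().as(PubSubProtoToBigQueryOptions.class);
options.setJavascriptTextTransformGcsPath("gs://example-bucket/udf.js"); // placeholder path
options.setJavascriptTextTransformFunctionName("transform"); // placeholder function name
// jsonCollection is an existing PCollection<String> of JSON payloads.
PCollection<String> udfOutput = runUdf(jsonCollection, options);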
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
From the class GCSToSplunkTest, the method testGCSToSplunkReadHeaders:
@Test
public void testGCSToSplunkReadHeaders() {
  // Arrange
  String stringifiedJsonRecord = "{\"id\":\"008\",\"state\":\"CA\",\"price\":\"26.23\"}";
  SplunkEvent expectedSplunkEvent =
      SplunkEvent.newBuilder().withEvent(stringifiedJsonRecord).create();
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForClass(SplunkEvent.class, SplunkEventCoder.of());
  coderRegistry.registerCoderForType(
      FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
  GCSToSplunkOptions options = PipelineOptionsFactory.create().as(GCSToSplunkOptions.class);
  options.setContainsHeaders(true);
  options.setInputFileSpec(HEADER_CSV_FILE_PATH);
  // Act
  PCollectionTuple readCsvOut = pipeline.apply("Read CSV", readFromCsv(options));
  PCollectionTuple transformedLines =
      readCsvOut.apply("Convert to JSON", convertToFailsafeAndMaybeApplyUdf(options));
  PCollectionTuple splunkEventTuple =
      transformedLines.get(UDF_OUT).apply("Convert to Splunk Event", convertToSplunkEvent());
  // Assert
  PAssert.that(transformedLines.get(UDF_OUT))
      .satisfies(
          collection -> {
            FailsafeElement element = collection.iterator().next();
            assertThat(element.getPayload()).isEqualTo(stringifiedJsonRecord);
            return null;
          });
  PAssert.that(transformedLines.get(UDF_ERROR_OUT)).empty();
  PAssert.that(splunkEventTuple.get(SPLUNK_EVENT_OUT)).containsInAnyOrder(expectedSplunkEvent);
  PAssert.that(splunkEventTuple.get(SPLUNK_EVENT_ERROR_OUT)).empty();
  // Execute pipeline
  pipeline.run();
}
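The assertions above imply a CSV resource at HEADER_CSV_FILE_PATH whose header row supplies the JSON keys. A hypothetical fixture, shown as a comment sketch rather than the project's actual test resource:

// Hypothetical contents of the file at HEADER_CSV_FILE_PATH (the real resource may differ):
//   id,state,price
//   008,CA,26.23
// With setContainsHeaders(true), each data row is keyed by the header row, producing the JSON
// object {"id":"008","state":"CA","price":"26.23"} that the PAssert above expects.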