Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
The class BigQueryMetadataLoader, method loadTableMetadata.
/**
 * Populates {@code table} builder with additional metadata like partition names and schema.
 *
 * @param filter optional filter to skip a subset of tables
 * @return {@code true} if the table matches all filters and should be included in the results,
 *     {@code false} if it should be skipped
 */
private boolean loadTableMetadata(BigQueryTable.Builder table, Filter filter)
    throws InterruptedException {
  TableReadOptions.Builder readOptions = TableReadOptions.newBuilder();
  if (table.getPartitioningColumn() == null) {
    if (filter != null && filter.shouldSkipUnpartitionedTable(table)) {
      return false;
    }
  } else {
    List<BigQueryTablePartition> partitions = loadTablePartitions(table, filter);
    if (filter != null && filter.shouldSkipPartitionedTable(table, partitions)) {
      return false;
    }
    table.setPartitions(partitions);
    LOG.info(
        "Loaded {} partitions for table {}: {}",
        partitions.size(), table.getTableName(), partitions);
    // Creating a ReadSession without a WHERE clause for a partitioned table that has
    // "require partition filter" param set to true would fail with the error:
    // "Cannot query over table ... without a filter over column(s) ...
    // that can be used for partition elimination".
    // The following is a hack that adds an "is null and is not null" filter over the
    // partitioning column, which shouldn't select any data but should make the query
    // analyzer happy and should be enough to extract the table schema.
    // TODO(an2x): do this only when "require partition filter" = true
    //             or load schema differently?
    readOptions.setRowRestriction(
        String.format(
            "%s is null and %s is not null",
            table.getPartitioningColumn(), table.getPartitioningColumn()));
  }
  ReadSession session =
      BigQueryUtils.createReadSession(
          bqsClient,
          DatasetId.of(table.getProject(), table.getDataset()),
          table.getTableName(),
          readOptions.build());
  table.setSchema(new Schema.Parser().parse(session.getAvroSchema().getSchema()));
  LOG.info("Loaded schema for table {}: {}", table.getTableName(), table.getSchema());
  return true;
}
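For context, the contradictory row restriction used above can be reproduced in isolation. The following is a minimal sketch, not part of the template: the class name and the column name event_date are hypothetical, and it assumes the v1 BigQuery Storage client for the TableReadOptions import. The restriction matches no rows, but it lets a read session be created on a "require partition filter" table so the schema can still be extracted.

import com.google.cloud.bigquery.storage.v1.ReadSession.TableReadOptions;

public class RowRestrictionSketch {
  public static void main(String[] args) {
    // Hypothetical partitioning column; the template reads it from the table builder.
    String partitioningColumn = "event_date";
    // "col is null and col is not null" selects no data but satisfies tables that
    // require a partition filter, so a read session can still expose the schema.
    TableReadOptions readOptions =
        TableReadOptions.newBuilder()
            .setRowRestriction(
                String.format(
                    "%s is null and %s is not null", partitioningColumn, partitioningColumn))
            .build();
    System.out.println(readOptions.getRowRestriction());
  }
}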
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
The class GCSToElasticsearch, method run.
/**
* Runs the pipeline to completion with the specified options.
*
* @param options The execution options.
* @return The pipeline result.
*/
private static PipelineResult run(GCSToElasticsearchOptions options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);
  // Register the coder for the pipeline.
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
  // Throw an error if containsHeaders is true and a schema or UDF is also set.
  if (options.getContainsHeaders()) {
    checkArgument(
        options.getJavascriptTextTransformGcsPath() == null && options.getJsonSchemaPath() == null,
        "Cannot parse file containing headers with UDF or Json schema.");
  }
  // Throw an error if only one retry configuration parameter is set.
  checkArgument(
      (options.getMaxRetryAttempts() == null && options.getMaxRetryDuration() == null)
          || (options.getMaxRetryAttempts() != null && options.getMaxRetryDuration() != null),
      "To specify retry configuration both max attempts and max duration must be set.");
  /*
   * Steps: 1) Read records from CSV(s) via {@link CsvConverters.ReadCsv}.
   *        2) Convert lines to JSON strings via {@link CsvConverters.LineToFailsafeJson}.
   *        3a) Write JSON strings as documents to Elasticsearch via {@link ElasticsearchIO}.
   *        3b) Write elements that failed processing to {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}.
   */
  PCollectionTuple convertedCsvLines =
      pipeline
          .apply("ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat()).setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders()).setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CSV_HEADERS).setLineTag(CSV_LINES)
                  .setFileEncoding(options.getCsvFileEncoding()).build())
          .apply("ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CSV_HEADERS).setLineTag(CSV_LINES)
                  .setUdfOutputTag(PROCESSING_OUT).setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                  .build());
  // Step 3a: Write elements that were successfully processed to Elasticsearch using {@link WriteToElasticsearch}.
  convertedCsvLines.get(PROCESSING_OUT)
      .apply("GetJsonDocuments",
          MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
      .apply("WriteToElasticsearch",
          WriteToElasticsearch.newBuilder().setOptions(options.as(GCSToElasticsearchOptions.class)).build());
  // Step 3b: Write elements that failed processing to the deadletter table via {@link BigQueryIO}.
  convertedCsvLines.get(PROCESSING_DEADLETTER_OUT)
      .apply("AddTimestamps",
          WithTimestamps.of((FailsafeElement<String, String> failures) -> new Instant()))
      .apply("WriteFailedElementsToBigQuery",
          WriteStringMessageErrors.newBuilder()
              .setErrorRecordsTable(options.getDeadletterTable())
              .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA).build());
  return pipeline.run();
}
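The both-or-neither retry validation above is a small, reusable check. Below is a minimal sketch of it in isolation; the class name and the plain Integer/Long parameters are hypothetical stand-ins for the pipeline options and are not part of the template.

import static com.google.common.base.Preconditions.checkArgument;

public class RetryConfigCheckSketch {
  // Hypothetical stand-ins for options.getMaxRetryAttempts() / options.getMaxRetryDuration().
  static void validateRetryConfig(Integer maxRetryAttempts, Long maxRetryDuration) {
    checkArgument(
        (maxRetryAttempts == null && maxRetryDuration == null)
            || (maxRetryAttempts != null && maxRetryDuration != null),
        "To specify retry configuration both max attempts and max duration must be set.");
  }

  public static void main(String[] args) {
    validateRetryConfig(null, null); // OK: retries not configured.
    validateRetryConfig(5, 30_000L); // OK: both parameters provided.
    validateRetryConfig(5, null);    // Throws IllegalArgumentException.
  }
}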
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
The class SpannerTransactionWriterDoFn, method processElement.
@ProcessElement
public void processElement(ProcessContext c) {
  FailsafeElement<String, String> msg = c.element();
  Ddl ddl = c.sideInput(ddlView);
  processedEvents.inc();
  /*
   * Try/catch block to capture any exceptions that might occur while processing
   * DataStream events and writing to Cloud Spanner. All exceptions that are caught
   * can be retried based on the exception type.
   */
  try {
    JsonNode changeEvent = mapper.readTree(msg.getPayload());
    ChangeEventContext changeEventContext =
        ChangeEventContextFactory.createChangeEventContext(
            changeEvent, ddl, shadowTablePrefix, sourceType);
    // Sequence information for the current change event.
    ChangeEventSequence currentChangeEventSequence =
        ChangeEventSequenceFactory.createChangeEventSequenceFromChangeEventContext(
            changeEventContext);
    // Start the transaction.
    spannerAccessor
        .getDatabaseClient()
        .readWriteTransaction()
        .run(
            (TransactionCallable<Void>)
                transaction -> {
                  ChangeEventSequence previousChangeEventSequence =
                      ChangeEventSequenceFactory.createChangeEventSequenceFromShadowTable(
                          transaction, changeEventContext);
                  if (previousChangeEventSequence != null
                      && previousChangeEventSequence.compareTo(currentChangeEventSequence) >= 0) {
                    return null;
                  }
                  transaction.buffer(changeEventContext.getMutations());
                  return null;
                });
    com.google.cloud.Timestamp timestamp = com.google.cloud.Timestamp.now();
    c.output(timestamp);
    sucessfulEvents.inc();
  } catch (InvalidChangeEventException e) {
    // Errors that result from invalid change events.
    outputWithErrorTag(c, msg, e, SpannerTransactionWriter.PERMANENT_ERROR_TAG);
    skippedEvents.inc();
  } catch (ChangeEventConvertorException e) {
    // Errors that occur during event conversion are not retryable.
    outputWithErrorTag(c, msg, e, SpannerTransactionWriter.PERMANENT_ERROR_TAG);
    conversionErrors.inc();
  } catch (SpannerException se) {
    /*
     * Errors that happen when writing to Cloud Spanner are considered retryable.
     * Since all event conversion errors are caught beforehand as permanent errors,
     * any other errors encountered while writing to Cloud Spanner can be retried.
     * Examples include:
     *   1. Deadline exceeded errors from Cloud Spanner.
     *   2. Failures due to foreign key/interleaved table constraints.
     *   3. Any transient errors in Cloud Spanner.
     */
    outputWithErrorTag(c, msg, se, SpannerTransactionWriter.RETRYABLE_ERROR_TAG);
    retryableErrors.inc();
  } catch (Exception e) {
    // Any other errors are considered severe and not retryable.
    outputWithErrorTag(c, msg, e, SpannerTransactionWriter.PERMANENT_ERROR_TAG);
    failedEvents.inc();
  }
}
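The core of the DoFn above is the read-write transaction that compares sequence information from the shadow table and buffers mutations only for newer events. Below is a simplified, hypothetical sketch of that transaction pattern using the standard Cloud Spanner client; the isStale flag stands in for the shadow-table lookup performed by createChangeEventSequenceFromShadowTable, and the class and method names are assumptions.

import com.google.cloud.spanner.DatabaseClient;
import com.google.cloud.spanner.Mutation;
import com.google.cloud.spanner.TransactionRunner.TransactionCallable;

public class SpannerBufferSketch {
  // Hypothetical helper: buffers the mutation inside a read-write transaction unless
  // a newer event has already been applied (signalled here by the isStale flag).
  static void writeIfNewer(DatabaseClient dbClient, Mutation mutation, boolean isStale) {
    dbClient
        .readWriteTransaction()
        .run(
            (TransactionCallable<Void>)
                transaction -> {
                  if (isStale) {
                    // A newer change event was already written; skip this one.
                    return null;
                  }
                  transaction.buffer(mutation);
                  return null;
                });
  }
}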
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
The class ChangeEventConvertor, method changeEventToPrimaryKey.
static com.google.cloud.spanner.Key changeEventToPrimaryKey(Ddl ddl, JsonNode changeEvent)
    throws ChangeEventConvertorException {
  String tableName = changeEvent.get(DatastreamConstants.EVENT_TABLE_NAME_KEY).asText();
  try {
    Table table = ddl.table(tableName);
    ImmutableList<IndexColumn> keyColumns = table.primaryKeys();
    com.google.cloud.spanner.Key.Builder pk = com.google.cloud.spanner.Key.newBuilder();
    for (IndexColumn keyColumn : keyColumns) {
      Column key = table.column(keyColumn.name());
      Type keyColType = key.type();
      String keyColName = key.name().toLowerCase();
      switch (keyColType.getCode()) {
        case BOOL:
          pk.append(ChangeEventTypeConvertor.toBoolean(changeEvent, keyColName, /*requiredField=*/ true));
          break;
        case INT64:
          pk.append(ChangeEventTypeConvertor.toLong(changeEvent, keyColName, /*requiredField=*/ true));
          break;
        case FLOAT64:
          pk.append(ChangeEventTypeConvertor.toDouble(changeEvent, keyColName, /*requiredField=*/ true));
          break;
        case STRING:
          pk.append(ChangeEventTypeConvertor.toString(changeEvent, keyColName, /*requiredField=*/ true));
          break;
        case NUMERIC:
          pk.append(
              ChangeEventTypeConvertor.toNumericBigDecimal(changeEvent, keyColName, /*requiredField=*/ true));
          break;
        case BYTES:
          pk.append(ChangeEventTypeConvertor.toByteArray(changeEvent, keyColName, /*requiredField=*/ true));
          break;
        case TIMESTAMP:
          pk.append(ChangeEventTypeConvertor.toTimestamp(changeEvent, keyColName, /*requiredField=*/ true));
          break;
        case DATE:
          pk.append(ChangeEventTypeConvertor.toDate(changeEvent, keyColName, /*requiredField=*/ true));
          break;
          // TODO(b/179070999) - Add support for other data types.
        default:
          throw new IllegalArgumentException(
              "Column name(" + keyColName + ") has unsupported column type(" + keyColType + ")");
      }
    }
    return pk.build();
  } catch (Exception e) {
    throw new ChangeEventConvertorException(e);
  }
}
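As a small illustration of the Key assembly performed by the switch above, the sketch below builds a composite Spanner primary key directly from typed values. The class name and the key values are hypothetical; in the template each value would instead come from a ChangeEventTypeConvertor call.

import com.google.cloud.spanner.Key;

public class PrimaryKeySketch {
  public static void main(String[] args) {
    // Key parts must be appended in the table's declared primary-key column order.
    Key pk =
        Key.newBuilder()
            .append(42L)         // e.g. an INT64 key column
            .append("order-123") // e.g. a STRING key column
            .build();
    System.out.println(pk);
  }
}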
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
The class ChangeEventConvertor, method changeEventToInsertOrUpdateMutation.
private static Mutation changeEventToInsertOrUpdateMutation(Ddl ddl, JsonNode changeEvent)
    throws ChangeEventConvertorException, InvalidChangeEventException {
  String tableName = changeEvent.get(DatastreamConstants.EVENT_TABLE_NAME_KEY).asText();
  List<String> changeEventKeys = getEventColumnKeys(changeEvent);
  try {
    Table table = ddl.table(tableName);
    Mutation.WriteBuilder builder = Mutation.newInsertOrUpdateBuilder(table.name());
    Set<String> keyColumns =
        table.primaryKeys().stream()
            .map(keyCol -> keyCol.name())
            .map(colName -> colName.toLowerCase())
            .collect(Collectors.toSet());
    populateMutationBuilderWithEvent(table, builder, changeEvent, changeEventKeys, keyColumns);
    return builder.build();
  } catch (Exception e) {
    throw new ChangeEventConvertorException(e);
  }
}
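For reference, the sketch below shows what an insert-or-update mutation looks like once the builder returned by Mutation.newInsertOrUpdateBuilder has been populated. The table and column names are hypothetical; in the template the values are filled in by populateMutationBuilderWithEvent from the change event.

import com.google.cloud.spanner.Mutation;

public class InsertOrUpdateSketch {
  public static void main(String[] args) {
    // An insert-or-update mutation writes the row if it is absent and updates it otherwise.
    Mutation mutation =
        Mutation.newInsertOrUpdateBuilder("Orders") // hypothetical table name
            .set("OrderId").to(42L)                 // hypothetical key column
            .set("Status").to("SHIPPED")            // hypothetical non-key column
            .build();
    System.out.println(mutation);
  }
}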