
Example 6 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.

The class BigQueryMetadataLoader, method loadTableMetadata.

/**
 * Populates {@code table} builder with additional metadata like partition names and schema.
 *
 * @param filter optional filter to skip a subset of tables
 * @return {@code true} if the table matches all filters and should be included in the results,
 *     {@code false} if it should be skipped
 */
private boolean loadTableMetadata(BigQueryTable.Builder table, Filter filter) throws InterruptedException {
    TableReadOptions.Builder readOptions = TableReadOptions.newBuilder();
    if (table.getPartitioningColumn() == null) {
        if (filter != null && filter.shouldSkipUnpartitionedTable(table)) {
            return false;
        }
    } else {
        List<BigQueryTablePartition> partitions = loadTablePartitions(table, filter);
        if (filter != null && filter.shouldSkipPartitionedTable(table, partitions)) {
            return false;
        }
        table.setPartitions(partitions);
        LOG.info("Loaded {} partitions for table {}: {}", partitions.size(), table.getTableName(), partitions);
        // Creating a ReadSession without a WHERE clause for a partitioned table that has
        // "require partition filter" param set to true would fail with the error:
        // "Cannot query over table ... without a filter over column(s) ...
        // that can be used for partition elimination".
        // The following is a hack that adds an "is null and is not null" filter over the
        // partitioning column, which shouldn't select any data but should make the query
        // analyzer happy and should be enough to extract the table schema.
        // TODO(an2x): do this only when "require partition filter" = true
        // or load schema differently?
        readOptions.setRowRestriction(
            String.format(
                "%s is null and %s is not null",
                table.getPartitioningColumn(), table.getPartitioningColumn()));
    }
    ReadSession session =
        BigQueryUtils.createReadSession(
            bqsClient,
            DatasetId.of(table.getProject(), table.getDataset()),
            table.getTableName(),
            readOptions.build());
    table.setSchema(new Schema.Parser().parse(session.getAvroSchema().getSchema()));
    LOG.info("Loaded schema for table {}: {}", table.getTableName(), table.getSchema());
    return true;
}
Also used : BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) ReadSession(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) TableReadOptions(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions)
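
For reference, the Filter consulted above is asked two separate questions: whether to skip an unpartitioned table, and whether to skip a partitioned table given its loaded partitions. Below is a minimal sketch of what an implementation might look like; the method names mirror the calls made in loadTableMetadata, but the actual BigQueryMetadataLoader.Filter interface may declare additional methods, so treat this as an assumption rather than the template's own code.

import com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader;
import com.google.cloud.teleport.v2.values.BigQueryTable;
import com.google.cloud.teleport.v2.values.BigQueryTablePartition;
import java.util.List;

// Hypothetical filter that skips tables prefixed with "tmp_" and partitioned
// tables that currently have no partitions. Only the two checks invoked by
// loadTableMetadata above are sketched here.
class TempTableFilter implements BigQueryMetadataLoader.Filter {

    @Override
    public boolean shouldSkipUnpartitionedTable(BigQueryTable.Builder table) {
        return table.getTableName().startsWith("tmp_");
    }

    @Override
    public boolean shouldSkipPartitionedTable(
            BigQueryTable.Builder table, List<BigQueryTablePartition> partitions) {
        return table.getTableName().startsWith("tmp_") || partitions.isEmpty();
    }
}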

Example 7 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.

The class GCSToElasticsearch, method run.

/**
 * Runs the pipeline to completion with the specified options.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
private static PipelineResult run(GCSToElasticsearchOptions options) {
    // Create the pipeline
    Pipeline pipeline = Pipeline.create(options);
    // Register the coder for pipeline
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
    // Throw an error if containsHeaders is true and a schema or UDF is also set.
    if (options.getContainsHeaders()) {
        checkArgument(
            options.getJavascriptTextTransformGcsPath() == null && options.getJsonSchemaPath() == null,
            "Cannot parse file containing headers with UDF or Json schema.");
    }
    // Throw an error if only one of the two retry configuration parameters is set.
    checkArgument(
        (options.getMaxRetryAttempts() == null && options.getMaxRetryDuration() == null)
            || (options.getMaxRetryAttempts() != null && options.getMaxRetryDuration() != null),
        "To specify retry configuration both max attempts and max duration must be set.");
    /*
     * Steps: 1) Read records from CSV(s) via {@link CsvConverters.ReadCsv}.
     *        2) Convert lines to JSON strings via {@link CsvConverters.LineToFailsafeJson}.
     *        3a) Write JSON strings as documents to Elasticsearch via {@link ElasticsearchIO}.
     *        3b) Write elements that failed processing to {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}.
     */
    PCollectionTuple convertedCsvLines =
        pipeline
            .apply(
                "ReadCsv",
                CsvConverters.ReadCsv.newBuilder()
                    .setCsvFormat(options.getCsvFormat())
                    .setDelimiter(options.getDelimiter())
                    .setHasHeaders(options.getContainsHeaders())
                    .setInputFileSpec(options.getInputFileSpec())
                    .setHeaderTag(CSV_HEADERS)
                    .setLineTag(CSV_LINES)
                    .setFileEncoding(options.getCsvFileEncoding())
                    .build())
            .apply(
                "ConvertLine",
                CsvConverters.LineToFailsafeJson.newBuilder()
                    .setDelimiter(options.getDelimiter())
                    .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                    .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                    .setJsonSchemaPath(options.getJsonSchemaPath())
                    .setHeaderTag(CSV_HEADERS)
                    .setLineTag(CSV_LINES)
                    .setUdfOutputTag(PROCESSING_OUT)
                    .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                    .build());
    /*
     * Step 3a: Write elements that were successfully processed to Elasticsearch using {@link WriteToElasticsearch}.
     */
    convertedCsvLines
        .get(PROCESSING_OUT)
        .apply(
            "GetJsonDocuments",
            MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
        .apply(
            "WriteToElasticsearch",
            WriteToElasticsearch.newBuilder()
                .setOptions(options.as(GCSToElasticsearchOptions.class))
                .build());
    /*
     * Step 3b: Write elements that failed processing to deadletter table via {@link BigQueryIO}.
     */
    convertedCsvLines
        .get(PROCESSING_DEADLETTER_OUT)
        .apply(
            "AddTimestamps",
            WithTimestamps.of((FailsafeElement<String, String> failures) -> new Instant()))
        .apply(
            "WriteFailedElementsToBigQuery",
            WriteStringMessageErrors.newBuilder()
                .setErrorRecordsTable(options.getDeadletterTable())
                .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
                .build());
    return pipeline.run();
}
Also used : CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) GCSToElasticsearchOptions(com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions) Instant(org.joda.time.Instant) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement)
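
A template like this is normally launched from a main method that parses the command-line arguments into the options interface and hands them to run. Below is a minimal sketch under that assumption, written as if it lived in the same class as run; the actual main of GCSToElasticsearch may register options or coders differently.

import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Hypothetical entry point: parse flags such as --inputFileSpec, --containsHeaders
// and --deadletterTable into the options interface, validate them, then run the pipeline.
public static void main(String[] args) {
    GCSToElasticsearchOptions options =
        PipelineOptionsFactory.fromArgs(args)
            .withValidation()
            .as(GCSToElasticsearchOptions.class);
    run(options);
}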

Example 8 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.

The class SpannerTransactionWriterDoFn, method processElement.

@ProcessElement
public void processElement(ProcessContext c) {
    FailsafeElement<String, String> msg = c.element();
    Ddl ddl = c.sideInput(ddlView);
    processedEvents.inc();
    /*
     * Try/catch block to capture any exceptions that might occur while processing
     * DataStream events and writing them to Cloud Spanner. Whether a caught
     * exception is retried depends on its type.
     */
    try {
        JsonNode changeEvent = mapper.readTree(msg.getPayload());
        ChangeEventContext changeEventContext = ChangeEventContextFactory.createChangeEventContext(changeEvent, ddl, shadowTablePrefix, sourceType);
        // Sequence information for the current change event.
        ChangeEventSequence currentChangeEventSequence = ChangeEventSequenceFactory.createChangeEventSequenceFromChangeEventContext(changeEventContext);
        // Start transaction
        spannerAccessor.getDatabaseClient().readWriteTransaction().run((TransactionCallable<Void>) transaction -> {
            ChangeEventSequence previousChangeEventSequence = ChangeEventSequenceFactory.createChangeEventSequenceFromShadowTable(transaction, changeEventContext);
            if (previousChangeEventSequence != null && previousChangeEventSequence.compareTo(currentChangeEventSequence) >= 0) {
                return null;
            }
            transaction.buffer(changeEventContext.getMutations());
            return null;
        });
        com.google.cloud.Timestamp timestamp = com.google.cloud.Timestamp.now();
        c.output(timestamp);
        sucessfulEvents.inc();
    } catch (InvalidChangeEventException e) {
        // Errors that result from invalid change events.
        outputWithErrorTag(c, msg, e, SpannerTransactionWriter.PERMANENT_ERROR_TAG);
        skippedEvents.inc();
    } catch (ChangeEventConvertorException e) {
        // Errors that occur during event conversion are not retryable.
        outputWithErrorTag(c, msg, e, SpannerTransactionWriter.PERMANENT_ERROR_TAG);
        conversionErrors.inc();
    } catch (SpannerException se) {
        /*
         * Errors that happen when writing to Cloud Spanner are considered retryable.
         * Since all event conversion errors are caught beforehand as permanent errors,
         * any other errors encountered while writing to Cloud Spanner can be retried.
         * Examples include:
         * 1. Deadline exceeded errors from Cloud Spanner.
         * 2. Failures due to foreign key/interleaved table constraints.
         * 3. Any transient errors in Cloud Spanner.
         */
        outputWithErrorTag(c, msg, se, SpannerTransactionWriter.RETRYABLE_ERROR_TAG);
        retryableErrors.inc();
    } catch (Exception e) {
        // Any other errors are considered severe and not retryable.
        outputWithErrorTag(c, msg, e, SpannerTransactionWriter.PERMANENT_ERROR_TAG);
        failedEvents.inc();
    }
}
Also used : TransactionCallable(com.google.cloud.spanner.TransactionRunner.TransactionCallable) InvalidChangeEventException(com.google.cloud.teleport.v2.templates.datastream.InvalidChangeEventException) Ddl(com.google.cloud.teleport.v2.templates.spanner.ddl.Ddl) LoggerFactory(org.slf4j.LoggerFactory) Timestamp(com.google.cloud.Timestamp) DeserializationFeature(com.fasterxml.jackson.databind.DeserializationFeature) Metrics(org.apache.beam.sdk.metrics.Metrics) ChangeEventConvertorException(com.google.cloud.teleport.v2.templates.datastream.ChangeEventConvertorException) TupleTag(org.apache.beam.sdk.values.TupleTag) ChangeEventSequenceFactory(com.google.cloud.teleport.v2.templates.datastream.ChangeEventSequenceFactory) JsonNode(com.fasterxml.jackson.databind.JsonNode) ExposedSpannerAccessor(org.apache.beam.sdk.io.gcp.spanner.ExposedSpannerAccessor) ChangeEventContext(com.google.cloud.teleport.v2.templates.datastream.ChangeEventContext) ChangeEventSequence(com.google.cloud.teleport.v2.templates.datastream.ChangeEventSequence) PrintWriter(java.io.PrintWriter) DoFn(org.apache.beam.sdk.transforms.DoFn) Logger(org.slf4j.Logger) StringWriter(java.io.StringWriter) Counter(org.apache.beam.sdk.metrics.Counter) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Serializable(java.io.Serializable) SpannerConfig(org.apache.beam.sdk.io.gcp.spanner.SpannerConfig) SpannerException(com.google.cloud.spanner.SpannerException) PCollectionView(org.apache.beam.sdk.values.PCollectionView) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) Preconditions(com.google.common.base.Preconditions) ChangeEventContextFactory(com.google.cloud.teleport.v2.templates.datastream.ChangeEventContextFactory)
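
The DoFn above writes commit timestamps to its main output and tags failures as either retryable or permanent. Downstream, those side outputs would typically be pulled out of the resulting PCollectionTuple, roughly as in the fragment below; the variable spannerWriteResults and the exact wiring inside SpannerTransactionWriter are assumptions here, and only the two error tags are taken from the code above.

// Sketch: split the tagged outputs produced by SpannerTransactionWriterDoFn.
// "spannerWriteResults" is assumed to be the PCollectionTuple obtained by applying
// the DoFn with withOutputTags(...); only the two tags referenced above are used.
PCollection<FailsafeElement<String, String>> retryableFailures =
    spannerWriteResults.get(SpannerTransactionWriter.RETRYABLE_ERROR_TAG);
PCollection<FailsafeElement<String, String>> permanentFailures =
    spannerWriteResults.get(SpannerTransactionWriter.PERMANENT_ERROR_TAG);
// Retryable failures are typically routed back through a retry path, while
// permanent failures go straight to a dead-letter sink.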

Example 9 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.

The class ChangeEventConvertor, method changeEventToPrimaryKey.

static com.google.cloud.spanner.Key changeEventToPrimaryKey(Ddl ddl, JsonNode changeEvent) throws ChangeEventConvertorException {
    String tableName = changeEvent.get(DatastreamConstants.EVENT_TABLE_NAME_KEY).asText();
    try {
        Table table = ddl.table(tableName);
        ImmutableList<IndexColumn> keyColumns = table.primaryKeys();
        com.google.cloud.spanner.Key.Builder pk = com.google.cloud.spanner.Key.newBuilder();
        for (IndexColumn keyColumn : keyColumns) {
            Column key = table.column(keyColumn.name());
            Type keyColType = key.type();
            String keyColName = key.name().toLowerCase();
            switch (keyColType.getCode()) {
                case BOOL:
                    pk.append(ChangeEventTypeConvertor.toBoolean(changeEvent, keyColName, /* requiredField= */ true));
                    break;
                case INT64:
                    pk.append(ChangeEventTypeConvertor.toLong(changeEvent, keyColName, /* requiredField= */ true));
                    break;
                case FLOAT64:
                    pk.append(ChangeEventTypeConvertor.toDouble(changeEvent, keyColName, /* requiredField= */ true));
                    break;
                case STRING:
                    pk.append(ChangeEventTypeConvertor.toString(changeEvent, keyColName, /* requiredField= */ true));
                    break;
                case NUMERIC:
                    pk.append(ChangeEventTypeConvertor.toNumericBigDecimal(changeEvent, keyColName, /* requiredField= */ true));
                    break;
                case BYTES:
                    pk.append(ChangeEventTypeConvertor.toByteArray(changeEvent, keyColName, /* requiredField= */ true));
                    break;
                case TIMESTAMP:
                    pk.append(ChangeEventTypeConvertor.toTimestamp(changeEvent, keyColName, /* requiredField= */ true));
                    break;
                case DATE:
                    pk.append(ChangeEventTypeConvertor.toDate(changeEvent, keyColName, /* requiredField= */ true));
                    break;
                // TODO(b/179070999) - Add support for other data types.
                default:
                    throw new IllegalArgumentException(
                        "Column name(" + keyColName + ") has unsupported column type(" + keyColType + ")");
            }
        }
        return pk.build();
    } catch (Exception e) {
        throw new ChangeEventConvertorException(e);
    }
}
Also used : Table(com.google.cloud.teleport.v2.templates.spanner.ddl.Table) IndexColumn(com.google.cloud.teleport.v2.templates.spanner.ddl.IndexColumn) Type(com.google.cloud.spanner.Type) Column(com.google.cloud.teleport.v2.templates.spanner.ddl.Column)
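
For a concrete table, the loop above simply appends one key part per primary-key column, converted from the change event to the column's Spanner type. A hand-built equivalent for a hypothetical Singers table with primary key (SingerId INT64, FirstName STRING) and a change event carrying singerid=42 and firstname="Marc" would be:

import com.google.cloud.spanner.Key;

// Roughly what changeEventToPrimaryKey produces for the hypothetical event above:
// key parts are appended in the order of the table's primary-key columns.
Key primaryKey =
    Key.newBuilder()
        .append(42L)     // SingerId (INT64), via ChangeEventTypeConvertor.toLong
        .append("Marc")  // FirstName (STRING), via ChangeEventTypeConvertor.toString
        .build();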

Example 10 with Table

Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.

The class ChangeEventConvertor, method changeEventToInsertOrUpdateMutation.

private static Mutation changeEventToInsertOrUpdateMutation(Ddl ddl, JsonNode changeEvent) throws ChangeEventConvertorException, InvalidChangeEventException {
    String tableName = changeEvent.get(DatastreamConstants.EVENT_TABLE_NAME_KEY).asText();
    List<String> changeEventKeys = getEventColumnKeys(changeEvent);
    try {
        Table table = ddl.table(tableName);
        Mutation.WriteBuilder builder = Mutation.newInsertOrUpdateBuilder(table.name());
        Set<String> keyColumns =
            table.primaryKeys().stream()
                .map(keyCol -> keyCol.name())
                .map(colName -> colName.toLowerCase())
                .collect(Collectors.toSet());
        populateMutationBuilderWithEvent(table, builder, changeEvent, changeEventKeys, keyColumns);
        return builder.build();
    } catch (Exception e) {
        throw new ChangeEventConvertorException(e);
    }
}
Also used : Iterator(java.util.Iterator) Ddl(com.google.cloud.teleport.v2.templates.spanner.ddl.Ddl) Spliterators(java.util.Spliterators) Set(java.util.Set) Type(com.google.cloud.spanner.Type) Mutation(com.google.cloud.spanner.Mutation) Value(com.google.cloud.spanner.Value) Collectors(java.util.stream.Collectors) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) HashSet(java.util.HashSet) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) IndexColumn(com.google.cloud.teleport.v2.templates.spanner.ddl.IndexColumn) Column(com.google.cloud.teleport.v2.templates.spanner.ddl.Column) Table(com.google.cloud.teleport.v2.templates.spanner.ddl.Table) JsonNode(com.fasterxml.jackson.databind.JsonNode) StreamSupport(java.util.stream.StreamSupport) Spliterator(java.util.Spliterator)
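
populateMutationBuilderWithEvent (not shown in this example) copies each event field into the write builder according to the table's column types; the result is an ordinary insert-or-update mutation. For illustration only, a hand-built equivalent for a hypothetical Singers row would look like this:

import com.google.cloud.spanner.Mutation;

// Roughly what the builder holds after populateMutationBuilderWithEvent has
// processed a change event for a hypothetical Singers table.
Mutation mutation =
    Mutation.newInsertOrUpdateBuilder("Singers")
        .set("SingerId").to(42L)
        .set("FirstName").to("Marc")
        .set("LastName").to("Richards")
        .build();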

Aggregations

Test (org.junit.Test): 26 uses
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 15 uses
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 12 uses
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter): 10 uses
ArrayList (java.util.ArrayList): 10 uses
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 9 uses
Pipeline (org.apache.beam.sdk.Pipeline): 9 uses
Ddl (com.google.cloud.teleport.v2.templates.spanner.ddl.Ddl): 8 uses
Table (com.google.cloud.teleport.v2.templates.spanner.ddl.Table): 8 uses
Set (java.util.Set): 8 uses
PipelineResult (org.apache.beam.sdk.PipelineResult): 8 uses
TableRow (com.google.api.services.bigquery.model.TableRow): 6 uses
IntegrationTest (com.google.cloud.teleport.v2.spanner.IntegrationTest): 6 uses
PCollection (org.apache.beam.sdk.values.PCollection): 6 uses
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 6 uses
IOException (java.io.IOException): 5 uses
WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult): 5 uses
Timestamp (com.google.cloud.Timestamp): 4 uses
Column (com.google.cloud.teleport.v2.templates.spanner.ddl.Column): 4 uses
IndexColumn (com.google.cloud.teleport.v2.templates.spanner.ddl.IndexColumn): 4 uses