Example 1 with Export

use of com.google.cloud.teleport.spanner.ExportProtos.Export in project DataflowTemplates by GoogleCloudPlatform.

the class ExportTransform method expand.

/**
 * Read the Cloud Spanner schema and all the rows in all the tables of the database. Create and
 * write the exported Avro files to GCS.
 */
@Override
public WriteFilesResult<String> expand(PBegin begin) {
    Pipeline p = begin.getPipeline();
    /*
     * Allow users to specify read timestamp.
     * CreateTransaction and CreateTransactionFn classes in SpannerIO
     * only take a timestamp object for exact staleness which works when
     * parameters are provided during template compile time. They do not work with
     * a Timestamp valueProvider which can take parameters at runtime. Hence a new
     * ParDo class CreateTransactionFnWithTimestamp had to be created for this
     * purpose.
     */
    PCollectionView<Transaction> tx =
        p.apply("CreateTransaction", Create.of(1))
            .apply(
                "Create transaction",
                ParDo.of(new CreateTransactionFnWithTimestamp(spannerConfig, snapshotTime)))
            .apply("Tx As PCollectionView", View.asSingleton());
    PCollectionView<Dialect> dialectView = p.apply("Read Dialect", new ReadDialect(spannerConfig)).apply("Dialect As PCollectionView", View.asSingleton());
    PCollection<Ddl> ddl = p.apply("Read Information Schema", new ReadInformationSchema(spannerConfig, tx, dialectView));
    PCollection<Ddl> exportState = ddl.apply("Check export conditions", ParDo.of(new DoFn<Ddl, Ddl>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            Ddl ddl = c.element();
            List<String> tablesList = Collections.emptyList();
            // If the user sets shouldExportRelatedTables without providing
            // a list of export tables, throw an exception.
            if (tableNames.get().trim().isEmpty() && exportRelatedTables.get()) {
                throw new Exception("Invalid usage of --tableNames and --shouldExportRelatedTables. Set" + " --shouldExportRelatedTables=true only if --tableNames is given" + " selected tables for export.");
            }
            // If the user provides a comma-separated list of strings, parse it into a List
            if (!tableNames.get().trim().isEmpty()) {
                tablesList = Arrays.asList(tableNames.get().split(",\\s*"));
            }
            // If the user provided any invalid table names, throw an exception.
            List<String> allSpannerTables = ddl.allTables().stream().map(t -> t.name()).collect(Collectors.toList());
            List<String> invalidTables = tablesList.stream().distinct().filter(t -> !allSpannerTables.contains(t)).collect(Collectors.toList());
            if (invalidTables.size() != 0) {
                throw new Exception("INVALID_ARGUMENT: Table(s) not found: " + String.join(", ", invalidTables) + ".");
            }
            List<String> filteredTables = getFilteredTables(ddl, tablesList).stream().map(t -> t.name()).collect(Collectors.toList());
            // Save any missing necessary export table names; save a copy of the original
            // table list to bypass 'final or effectively final' condition of the lambda
            // expression below.
            List<String> usersTables = tablesList.stream().collect(Collectors.toList());
            List<String> missingTables = filteredTables.stream().distinct().filter(t -> !usersTables.contains(t)).collect(Collectors.toList());
            Collections.sort(missingTables);
            // If the requested tables need related (parent and/or foreign-key) tables that were
            // not listed and shouldExportRelatedTables is not set, throw an exception.
            if (tablesList.size() != 0 && !(tablesList.equals(filteredTables)) && !exportRelatedTables.get()) {
                throw new Exception("Attempted to export table(s) requiring parent and/or foreign keys tables" + " without setting the shouldExportRelatedTables parameter. Set" + " --shouldExportRelatedTables=true to export all necessary" + " tables, or add " + String.join(", ", missingTables) + " to --tableNames.");
            }
            c.output(ddl);
        }
    }));
    PCollection<ReadOperation> tables = ddl.apply("Build table read operations", new BuildReadFromTableOperations(tableNames));
    PCollection<KV<String, Void>> allTableAndViewNames = ddl.apply("List all table and view names", ParDo.of(new DoFn<Ddl, KV<String, Void>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            Ddl ddl = c.element();
            for (Table t : ddl.allTables()) {
                c.output(KV.of(t.name(), null));
            }
            // we need to add the names of all views separately here.
            for (com.google.cloud.teleport.spanner.ddl.View v : ddl.views()) {
                c.output(KV.of(v.name(), null));
            }
        }
    }));
    PCollection<String> allChangeStreamNames = ddl.apply("List all change stream names", ParDo.of(new DoFn<Ddl, String>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            Ddl ddl = c.element();
            for (ChangeStream changeStream : ddl.changeStreams()) {
                c.output(changeStream.name());
            }
        }
    }));
    // Generate a unique output directory name.
    final PCollectionView<String> outputDirectoryName = p.apply(Create.of(1)).apply("Create Avro output folder", ParDo.of(new DoFn<Integer, String>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            String instanceId = spannerConfig.getInstanceId().get();
            String dbId = spannerConfig.getDatabaseId().get();
            // For direct runner or tests we need a deterministic jobId.
            String testJobId = ExportTransform.this.testJobId.get();
            if (!Strings.isNullOrEmpty(testJobId)) {
                c.output(testJobId);
                return;
            }
            try {
                DataflowWorkerHarnessOptions workerHarnessOptions = c.getPipelineOptions().as(DataflowWorkerHarnessOptions.class);
                String jobId = workerHarnessOptions.getJobId();
                c.output(instanceId + "-" + dbId + "-" + jobId);
            } catch (Exception e) {
                throw new IllegalStateException("Please specify --testJobId to run with non-dataflow runner");
            }
        }
    })).apply(View.asSingleton());
    final PCollectionView<Map<String, SerializableSchemaSupplier>> avroSchemas = ddl.apply("Build Avro schemas from DDL", ParDo.of(new DoFn<Ddl, KV<String, SerializableSchemaSupplier>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            Collection<Schema> avroSchemas = new DdlToAvroSchemaConverter("spannerexport", "1.0.0", shouldExportTimestampAsLogicalType.get()).convert(c.element());
            for (Schema schema : avroSchemas) {
                c.output(KV.of(schema.getName(), new SerializableSchemaSupplier(schema)));
            }
        }
    })).apply("As view", View.asMap());
    PCollection<Struct> rows = tables.apply("Read all rows from Spanner", SpannerIO.readAll().withTransaction(tx).withSpannerConfig(spannerConfig));
    ValueProvider<ResourceId> resource = ValueProvider.NestedValueProvider.of(outputDir, (SerializableFunction<String, ResourceId>) s -> FileSystems.matchNewResource(s, true));
    ValueProvider<ResourceId> tempResource = ValueProvider.NestedValueProvider.of(eitherOrValueProvider(avroTempDirectory, outputDir), (SerializableFunction<String, ResourceId>) s -> FileSystems.matchNewResource(s, true));
    WriteFilesResult<String> fileWriteResults =
        rows.apply(
            "Store Avro files",
            AvroIO.<Struct>writeCustomTypeToGenericRecords()
                .to(new SchemaBasedDynamicDestinations(avroSchemas, outputDirectoryName, dialectView, resource))
                .withTempDirectory(tempResource));
    // Generate the manifest file.
    PCollection<KV<String, Iterable<String>>> tableFiles = fileWriteResults.getPerDestinationOutputFilenames().apply(GroupByKey.create());
    final TupleTag<Void> allTables = new TupleTag<>();
    final TupleTag<Iterable<String>> nonEmptyTables = new TupleTag<>();
    PCollection<KV<String, CoGbkResult>> groupedTables = KeyedPCollectionTuple.of(allTables, allTableAndViewNames).and(nonEmptyTables, tableFiles).apply("Group with all tables", CoGroupByKey.create());
    // The following is to export empty tables and views from the database.  Empty tables and views
    // are handled together because we do not export any rows for views, only their metadata,
    // including the queries defining them.
    PCollection<KV<String, Iterable<String>>> emptyTablesAndViews = groupedTables.apply("Export empty tables and views", ParDo.of(new DoFn<KV<String, CoGbkResult>, KV<String, Iterable<String>>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            KV<String, CoGbkResult> kv = c.element();
            String table = kv.getKey();
            CoGbkResult coGbkResult = kv.getValue();
            Iterable<String> only = coGbkResult.getOnly(nonEmptyTables, null);
            if (only == null) {
                LOG.info("Exporting empty table or view: " + table);
                // This file will contain the schema definition: column definitions for empty
                // tables or defining queries for views.
                c.output(KV.of(table, Collections.singleton(table + ".avro-00000-of-00001")));
            }
        }
    }));
    PCollection<KV<String, Iterable<String>>> changeStreams = allChangeStreamNames.apply("Export change streams", ParDo.of(new DoFn<String, KV<String, Iterable<String>>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            String changeStreamName = c.element();
            LOG.info("Exporting change stream: " + changeStreamName);
            // This file will contain the schema definition for the change stream.
            c.output(KV.of(changeStreamName, Collections.singleton(changeStreamName + ".avro-00000-of-00001")));
        }
    }));
    // Empty tables, views and change streams are handled together, because we export them as empty
    // Avro files that only contain the Avro schemas.
    PCollection<KV<String, Iterable<String>>> emptySchemaFiles = PCollectionList.of(emptyTablesAndViews).and(changeStreams).apply("Combine all empty schema files", Flatten.pCollections());
    emptySchemaFiles = emptySchemaFiles.apply("Save empty schema files", ParDo.of(new DoFn<KV<String, Iterable<String>>, KV<String, Iterable<String>>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            Map<String, SerializableSchemaSupplier> schemaMap = c.sideInput(avroSchemas);
            KV<String, Iterable<String>> kv = c.element();
            String objectName = kv.getKey();
            String fileName = kv.getValue().iterator().next();
            Schema schema = schemaMap.get(objectName).get();
            DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
            Path fullPath = createOutputPath(outputDir.get(), c.sideInput(outputDirectoryName), fileName);
            try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
                dataFileWriter.create(schema, createOutputStream(fullPath, c));
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            c.output(KV.of(objectName, Collections.singleton(fullPath.toString())));
        }

        /**
         * Resolves the complete path name for Avro files for both GCS and local FS
         * (for testing).
         *
         * @param outputDirectoryPath Initial directory path for the file.
         * @param outputDirectoryName Terminal directory for the file.
         * @param fileName Name of the Avro file
         * @return The full {@link Path} of the output Avro file.
         */
        private Path createOutputPath(String outputDirectoryPath, String outputDirectoryName, String fileName) {
            if (GcsPath.GCS_URI.matcher(outputDirectoryPath).matches()) {
                // Avro file path in GCS.
                return GcsPath.fromUri(outputDirectoryPath).resolve(outputDirectoryName).resolve(fileName);
            } else {
                // Avro file path in local filesystem
                return Paths.get(outputDirectoryPath, outputDirectoryName, fileName);
            }
        }

        /**
         * Creates the {@link OutputStream} for the output file either on GCS or on
         * local FS (for testing).
         *
         * @param outputPath The full path of the output file.
         * @param c The {@link org.apache.beam.sdk.transforms.DoFn.ProcessContext}
         * @return An {@link OutputStream} for the opened output file.
         * @throws IOException if the output file cannot be opened.
         */
        private OutputStream createOutputStream(Path outputPath, ProcessContext c) throws IOException {
            if (GcsPath.GCS_URI.matcher(outputPath.toString()).matches()) {
                // Writing the Avro file to GCS.
                org.apache.beam.sdk.extensions.gcp.util.GcsUtil gcsUtil = c.getPipelineOptions().as(GcsOptions.class).getGcsUtil();
                String gcsType = "application/octet-stream";
                WritableByteChannel gcsChannel = gcsUtil.create((GcsPath) outputPath, gcsType);
                return Channels.newOutputStream(gcsChannel);
            } else {
                // Avro file is created on local filesystem (for testing).
                Files.createDirectories(outputPath.getParent());
                return Files.newOutputStream(outputPath);
            }
        }
    }).withSideInputs(avroSchemas, outputDirectoryName));
    PCollection<KV<String, Iterable<String>>> allFiles = PCollectionList.of(tableFiles).and(emptySchemaFiles).apply("Combine all files", Flatten.pCollections());
    PCollection<KV<String, String>> tableManifests = allFiles.apply("Build table manifests", ParDo.of(new BuildTableManifests()));
    Contextful.Fn<String, FileIO.Write.FileNaming> tableManifestNaming =
        (element, c) ->
            (window, pane, numShards, shardIndex, compression) ->
                GcsUtil.joinPath(
                    outputDir.get(), c.sideInput(outputDirectoryName), tableManifestFileName(element));
    tableManifests.apply(
        "Store table manifests",
        FileIO.<String, KV<String, String>>writeDynamic()
            .by(KV::getKey)
            .withDestinationCoder(StringUtf8Coder.of())
            .withNaming(Contextful.of(tableManifestNaming, Requirements.requiresSideInputs(outputDirectoryName)))
            .via(Contextful.fn(KV::getValue), TextIO.sink())
            .withTempDirectory(eitherOrValueProvider(avroTempDirectory, outputDir)));
    PCollection<List<Export.Table>> metadataTables = tableManifests.apply("Combine table metadata", Combine.globally(new CombineTableMetadata()));
    PCollectionView<Ddl> ddlView = ddl.apply("Cloud Spanner DDL as view", View.asSingleton());
    PCollection<String> metadataContent = metadataTables.apply("Create database manifest", ParDo.of(new CreateDatabaseManifest(ddlView, dialectView)).withSideInputs(ddlView, dialectView));
    Contextful.Fn<String, FileIO.Write.FileNaming> manifestNaming =
        (element, c) ->
            (window, pane, numShards, shardIndex, compression) ->
                GcsUtil.joinPath(outputDir.get(), c.sideInput(outputDirectoryName), "spanner-export.json");
    metadataContent.apply(
        "Store the database manifest",
        FileIO.<String, String>writeDynamic()
            .by(SerializableFunctions.constant(""))
            .withDestinationCoder(StringUtf8Coder.of())
            .via(TextIO.sink())
            .withNaming(Contextful.of(manifestNaming, Requirements.requiresSideInputs(outputDirectoryName)))
            .withTempDirectory(eitherOrValueProvider(avroTempDirectory, outputDir)));
    return fileWriteResults;
}
Also used : CombineFn(org.apache.beam.sdk.transforms.Combine.CombineFn) Arrays(java.util.Arrays) AvroIO(org.apache.beam.sdk.io.AvroIO) FileIO(org.apache.beam.sdk.io.FileIO) Table(com.google.cloud.teleport.spanner.ddl.Table) PBegin(org.apache.beam.sdk.values.PBegin) WriteFilesResult(org.apache.beam.sdk.io.WriteFilesResult) Dialect(com.google.cloud.spanner.Dialect) LoggerFactory(org.slf4j.LoggerFactory) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) ValueProviderUtils.eitherOrValueProvider(com.google.cloud.teleport.util.ValueProviderUtils.eitherOrValueProvider) ReadOperation(org.apache.beam.sdk.io.gcp.spanner.ReadOperation) DataflowWorkerHarnessOptions(org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions) PCollectionList(org.apache.beam.sdk.values.PCollectionList) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) KeyedPCollectionTuple(org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple) TableManifest(com.google.cloud.teleport.spanner.ExportProtos.TableManifest) Path(java.nio.file.Path) ValueProvider(org.apache.beam.sdk.options.ValueProvider) Flatten(org.apache.beam.sdk.transforms.Flatten) InvalidProtocolBufferException(com.google.protobuf.InvalidProtocolBufferException) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) Schema(org.apache.avro.Schema) DatumWriter(org.apache.avro.io.DatumWriter) Collection(java.util.Collection) DataFileWriter(org.apache.avro.file.DataFileWriter) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) SpannerConfig(org.apache.beam.sdk.io.gcp.spanner.SpannerConfig) Export(com.google.cloud.teleport.spanner.ExportProtos.Export) Objects(java.util.Objects) List(java.util.List) JsonFormat(com.google.protobuf.util.JsonFormat) ParDo(org.apache.beam.sdk.transforms.ParDo) Struct(com.google.cloud.spanner.Struct) ResolveOptions(org.apache.beam.sdk.io.fs.ResolveOptions) Requirements(org.apache.beam.sdk.transforms.Requirements) DynamicAvroDestinations(org.apache.beam.sdk.io.DynamicAvroDestinations) Iterables(com.google.common.collect.Iterables) KV(org.apache.beam.sdk.values.KV) DefaultFilenamePolicy(org.apache.beam.sdk.io.DefaultFilenamePolicy) Combine(org.apache.beam.sdk.transforms.Combine) Supplier(com.google.common.base.Supplier) View(org.apache.beam.sdk.transforms.View) SerializableFunctions(org.apache.beam.sdk.transforms.SerializableFunctions) ArrayList(java.util.ArrayList) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) PTransform(org.apache.beam.sdk.transforms.PTransform) CoGbkResult(org.apache.beam.sdk.transforms.join.CoGbkResult) FileBasedSink(org.apache.beam.sdk.io.FileBasedSink) Contextful(org.apache.beam.sdk.transforms.Contextful) Strings(com.google.common.base.Strings) Transaction(org.apache.beam.sdk.io.gcp.spanner.Transaction) TupleTag(org.apache.beam.sdk.values.TupleTag) Pipeline(org.apache.beam.sdk.Pipeline) Ddl(com.google.cloud.teleport.spanner.ddl.Ddl) CreateTransactionFnWithTimestamp(com.google.cloud.teleport.templates.common.SpannerConverters.CreateTransactionFnWithTimestamp) SpannerTableFilter.getFilteredTables(com.google.cloud.teleport.spanner.SpannerTableFilter.getFilteredTables) OutputStream(java.io.OutputStream) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) DoFn(org.apache.beam.sdk.transforms.DoFn) GenericRecord(org.apache.avro.generic.GenericRecord) GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) 
Files(java.nio.file.Files) Channels(java.nio.channels.Channels) ProtoDialect(com.google.cloud.teleport.spanner.ExportProtos.ProtoDialect) IOException(java.io.IOException) TimestampBound(com.google.cloud.spanner.TimestampBound) PCollection(org.apache.beam.sdk.values.PCollection) SpannerIO(org.apache.beam.sdk.io.gcp.spanner.SpannerIO) SchemaBuilder(org.apache.avro.SchemaBuilder) CoGroupByKey(org.apache.beam.sdk.transforms.join.CoGroupByKey) ChangeStream(com.google.cloud.teleport.spanner.ddl.ChangeStream) Paths(java.nio.file.Paths) GcsOptions(org.apache.beam.sdk.extensions.gcp.options.GcsOptions) PCollectionView(org.apache.beam.sdk.values.PCollectionView) WritableByteChannel(java.nio.channels.WritableByteChannel) VisibleForTesting(com.google.common.annotations.VisibleForTesting) FileSystems(org.apache.beam.sdk.io.FileSystems) Collections(java.util.Collections) TextIO(org.apache.beam.sdk.io.TextIO)
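
Note: the comment at the top of this example explains why the transaction is materialized as a singleton PCollectionView and consumed as a side input. Below is a minimal, self-contained sketch of that Beam pattern; the class and step names are illustrative, and a plain String stands in for the template's Transaction and CreateTransactionFnWithTimestamp.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollectionView;

public class SingletonSideInputSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

        // Compute a single value at pipeline run time and expose it as a singleton view,
        // mirroring the "Tx As PCollectionView" step above.
        PCollectionView<String> settingView =
            p.apply("Seed", Create.of(1))
                .apply("Compute setting", ParDo.of(new DoFn<Integer, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                        // ExportTransform creates the read transaction at this point.
                        c.output("computed-at-runtime");
                    }
                }))
                .apply("As view", View.asSingleton());

        // Downstream steps read the value through c.sideInput(...), just as the export
        // pipeline hands the transaction view to SpannerIO and ReadInformationSchema.
        p.apply("Data", Create.of("a", "b"))
            .apply("Use side input", ParDo.of(new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                    c.output(c.element() + "/" + c.sideInput(settingView));
                }
            }).withSideInputs(settingView));

        p.run().waitUntilFinish();
    }
}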

Example 2 with Export

use of com.google.cloud.teleport.spanner.ExportProtos.Export in project DataflowTemplates by GoogleCloudPlatform.

the class ImportTransform method expand.

@Override
public PDone expand(PBegin begin) {
    PCollectionView<Dialect> dialectView = begin.apply("Read Dialect", new ReadDialect(spannerConfig)).apply("Dialect As PCollectionView", View.asSingleton());
    PCollection<Export> manifest = begin.apply("Read manifest", new ReadExportManifestFile(importDirectory, dialectView));
    PCollectionView<Export> manifestView = manifest.apply("Manifest as view", View.asSingleton());
    PCollection<KV<String, String>> allFiles = manifest.apply("Read all manifest files", new ReadManifestFiles(importDirectory));
    PCollection<KV<String, List<String>>> tableFiles = allFiles.apply(Combine.perKey(AsList.fn()));
    PCollection<KV<String, String>> schemas = tableFiles.apply("File per table, view or change stream", ParDo.of(new DoFn<KV<String, List<String>>, KV<String, String>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            KV<String, List<String>> kv = c.element();
            if (!kv.getValue().isEmpty()) {
                c.output(KV.of(kv.getKey(), kv.getValue().get(0)));
            }
        }
    })).apply("Extract avro schemas", ParDo.of(new ReadAvroSchemas()));
    final PCollection<List<KV<String, String>>> avroSchemas = schemas.apply("Build avro DDL", Combine.globally(AsList.fn()));
    PCollectionView<Transaction> tx = begin.apply(SpannerIO.createTransaction().withSpannerConfig(spannerConfig));
    PCollection<Ddl> informationSchemaDdl = begin.apply("Read Information Schema", new ReadInformationSchema(spannerConfig, tx, dialectView));
    final PCollectionView<List<KV<String, String>>> avroDdlView = avroSchemas.apply("Avro ddl view", View.asSingleton());
    final PCollectionView<Ddl> informationSchemaView = informationSchemaDdl.apply("Information schema view", View.asSingleton());
    final PCollectionTuple createTableOutput = begin.apply("Create Cloud Spanner Tables and indexes", new CreateTables(spannerConfig, avroDdlView, informationSchemaView, manifestView, earlyIndexCreateFlag, ddlCreationTimeoutInMinutes));
    final PCollection<Ddl> ddl = createTableOutput.get(CreateTables.getDdlObjectTag());
    final PCollectionView<List<String>> pendingIndexes = createTableOutput.get(CreateTables.getPendingIndexesTag()).apply("As Index view", View.asSingleton());
    final PCollectionView<List<String>> pendingForeignKeys = createTableOutput.get(CreateTables.getPendingForeignKeysTag()).apply("As Foreign keys view", View.asSingleton());
    final PCollectionView<List<String>> pendingChangeStreams = createTableOutput.get(CreateTables.getPendingChangeStreamsTag()).apply("As change streams view", View.asSingleton());
    PCollectionView<Ddl> ddlView = ddl.apply("Cloud Spanner DDL as view", View.asSingleton());
    PCollectionView<HashMultimap<Integer, String>> levelsView = ddl.apply("Group tables by depth", ParDo.of(new DoFn<Ddl, HashMultimap<Integer, String>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            Ddl ddl = c.element();
            c.output(ddl.perLevelView());
        }
    })).apply(View.asSingleton());
    PCollection<HashMultimap<String, String>> acc = tableFiles.apply("Combine table files", Combine.globally(AsList.fn())).apply("As HashMultimap", ParDo.of(new DoFn<List<KV<String, List<String>>>, HashMultimap<String, String>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            HashMultimap<String, String> result = HashMultimap.create();
            for (KV<String, List<String>> kv : c.element()) {
                result.putAll(kv.getKey().toLowerCase(), kv.getValue());
            }
            c.output(result);
        }
    }));
    PCollection<?> previousComputation = ddl;
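    // Tables are loaded in order of interleave depth: ddl.perLevelView() groups table names by
    // depth, each pass writes the Avro files for one depth, and Wait.on(previousComputation)
    // keeps a pass from starting until the previous depth's writes have finished, so parent
    // rows are committed before rows of tables interleaved in them.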
    for (int i = 0; i < MAX_DEPTH; i++) {
        final int depth = i;
        PCollection<KV<String, String>> levelFiles = acc.apply("Get Avro filenames depth " + depth, ParDo.of(new DoFn<HashMultimap<String, String>, KV<String, String>>() {

            @ProcessElement
            public void processElement(ProcessContext c) {
                HashMultimap<String, String> allFiles = c.element();
                HashMultimap<Integer, String> levels = c.sideInput(levelsView);
                Set<String> tables = levels.get(depth);
                for (String table : tables) {
                    for (String file : allFiles.get(table)) {
                        c.output(KV.of(file, table));
                    }
                }
            }
        }).withSideInputs(levelsView)).apply("Wait for previous depth " + depth, Wait.on(previousComputation));
        PCollection<Mutation> mutations = levelFiles.apply("Avro files as mutations " + depth, new AvroTableFileAsMutations(ddlView));
        SpannerWriteResult result =
            mutations.apply(
                "Write mutations " + depth,
                SpannerIO.write()
                    .withSchemaReadySignal(ddl)
                    .withSpannerConfig(spannerConfig)
                    .withCommitDeadline(Duration.standardMinutes(1))
                    .withMaxCumulativeBackoff(Duration.standardHours(2))
                    .withMaxNumMutations(10000)
                    .withGroupingFactor(100)
                    .withDialectView(dialectView));
        previousComputation = result.getOutput();
    }
    ddl.apply(Wait.on(previousComputation))
        .apply("Create Indexes", new ApplyDDLTransform(spannerConfig, pendingIndexes, waitForIndexes))
        .apply("Add Foreign Keys", new ApplyDDLTransform(spannerConfig, pendingForeignKeys, waitForForeignKeys))
        .apply(
            "Create Change Streams",
            new ApplyDDLTransform(spannerConfig, pendingChangeStreams, waitForChangeStreams));
    return PDone.in(begin.getPipeline());
}
Also used : Ddl(com.google.cloud.teleport.spanner.ddl.Ddl) Dialect(com.google.cloud.spanner.Dialect) ProtoDialect(com.google.cloud.teleport.spanner.ExportProtos.ProtoDialect) Export(com.google.cloud.teleport.spanner.ExportProtos.Export) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) PCollectionList(org.apache.beam.sdk.values.PCollectionList) List(java.util.List) TupleTagList(org.apache.beam.sdk.values.TupleTagList) ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) HashMultimap(com.google.common.collect.HashMultimap) DoFn(org.apache.beam.sdk.transforms.DoFn) Transaction(org.apache.beam.sdk.io.gcp.spanner.Transaction) SpannerWriteResult(org.apache.beam.sdk.io.gcp.spanner.SpannerWriteResult) Mutation(com.google.cloud.spanner.Mutation)
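
Note: the per-depth loop above relies on Wait.on to serialize writes that would otherwise run in parallel. Here is a small sketch of that ordering primitive on its own, with made-up step names rather than the template's; MapElements stands in for the actual Spanner writes.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.Wait;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class WaitOnSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

        // First stage: pretend these are the parent-table writes.
        PCollection<String> firstStageDone =
            p.apply("Parents", Create.of("p1", "p2"))
                .apply("Write parents",
                    MapElements.into(TypeDescriptors.strings()).via((String name) -> "wrote " + name));

        // Second stage: its elements exist up front, but Wait.on holds processing back until the
        // first stage has completed, mirroring how ImportTransform chains depths on
        // result.getOutput().
        p.apply("Children", Create.of("c1", "c2"))
            .apply("Wait for parents", Wait.on(firstStageDone))
            .apply("Write children",
                MapElements.into(TypeDescriptors.strings()).via((String name) -> "wrote " + name));

        p.run().waitUntilFinish();
    }
}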

Example 3 with Export

use of com.google.cloud.teleport.spanner.ExportProtos.Export in project DataflowTemplates by GoogleCloudPlatform.

the class ExportTransformTest method buildDatabaseManifestFile.

@Test
public void buildDatabaseManifestFile() throws InvalidProtocolBufferException {
    Map<String, String> tablesAndManifests = ImmutableMap.of("table1", "table1 manifest", "table2", "table2 manifest", "changeStream", "changeStream manifest");
    PCollection<List<Export.Table>> metadataTables = pipeline.apply("Initialize table manifests", Create.of(tablesAndManifests)).apply("Combine table manifests", Combine.globally(new CombineTableMetadata()));
    ImmutableList<Export.DatabaseOption> databaseOptions = ImmutableList.of(Export.DatabaseOption.newBuilder().setOptionName("version_retention_period").setOptionValue("5d").build());
    Ddl.Builder ddlBuilder = Ddl.builder();
    ddlBuilder.mergeDatabaseOptions(databaseOptions);
    ddlBuilder.createChangeStream("changeStream").endChangeStream();
    Ddl ddl = ddlBuilder.build();
    PCollectionView<Ddl> ddlView = pipeline.apply(Create.of(ddl)).apply(View.asSingleton());
    PCollectionView<Dialect> dialectView = pipeline.apply("CreateSingleton", Create.of(Dialect.GOOGLE_STANDARD_SQL)).apply("As PCollectionView", View.asSingleton());
    PCollection<String> databaseManifest = metadataTables.apply("Test adding database option to manifest", ParDo.of(new CreateDatabaseManifest(ddlView, dialectView)).withSideInputs(ddlView, dialectView));
    // The output JSON may contain the tables in any order, so a string comparison is not
    // sufficient. Have to convert the manifest string to a protobuf. Also for the checker function
    // to be serializable, it has to be written as a lambda.
    PAssert.thatSingleton(databaseManifest).satisfies((SerializableFunction<String, Void>) input -> {
        Builder builder1 = Export.newBuilder();
        try {
            JsonFormat.parser().merge(input, builder1);
        } catch (InvalidProtocolBufferException e) {
            throw new RuntimeException(e);
        }
        Export manifestProto = builder1.build();
        assertThat(manifestProto.getTablesCount(), is(2));
        assertThat(manifestProto.getDialect(), is(ProtoDialect.GOOGLE_STANDARD_SQL));
        String table1Name = manifestProto.getTables(0).getName();
        assertThat(table1Name, startsWith("table"));
        assertThat(manifestProto.getTables(0).getManifestFile(), is(table1Name + "-manifest.json"));
        Export.DatabaseOption dbOptions = manifestProto.getDatabaseOptions(0);
        String optionName = dbOptions.getOptionName();
        String optionValue = dbOptions.getOptionValue();
        assertThat(optionName, is("version_retention_period"));
        assertThat(optionValue, is("5d"));
        assertThat(manifestProto.getChangeStreamsCount(), is(1));
        assertThat(manifestProto.getChangeStreams(0).getName(), is("changeStream"));
        assertThat(manifestProto.getChangeStreams(0).getManifestFile(), is("changeStream-manifest.json"));
        return null;
    });
    pipeline.run();
}
Also used : Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) KV(org.apache.beam.sdk.values.KV) CombineTableMetadata(com.google.cloud.teleport.spanner.ExportTransform.CombineTableMetadata) Dialect(com.google.cloud.spanner.Dialect) Combine(org.apache.beam.sdk.transforms.Combine) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) View(org.apache.beam.sdk.transforms.View) Timestamp(com.google.cloud.Timestamp) BuildTableManifests(com.google.cloud.teleport.spanner.ExportTransform.BuildTableManifests) Assert.assertThat(org.junit.Assert.assertThat) Builder(com.google.cloud.teleport.spanner.ExportProtos.Export.Builder) ImmutableList(com.google.common.collect.ImmutableList) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) TableManifest(com.google.cloud.teleport.spanner.ExportProtos.TableManifest) Ddl(com.google.cloud.teleport.spanner.ddl.Ddl) Path(java.nio.file.Path) InvalidProtocolBufferException(com.google.protobuf.InvalidProtocolBufferException) ImmutableMap(com.google.common.collect.ImmutableMap) Files(java.nio.file.Files) PAssert(org.apache.beam.sdk.testing.PAssert) ProtoDialect(com.google.cloud.teleport.spanner.ExportProtos.ProtoDialect) Test(org.junit.Test) TimestampBound(com.google.cloud.spanner.TimestampBound) PCollection(org.apache.beam.sdk.values.PCollection) Matchers.startsWith(org.hamcrest.Matchers.startsWith) Export(com.google.cloud.teleport.spanner.ExportProtos.Export) List(java.util.List) Rule(org.junit.Rule) CreateDatabaseManifest(com.google.cloud.teleport.spanner.ExportTransform.CreateDatabaseManifest) JsonFormat(com.google.protobuf.util.JsonFormat) ParDo(org.apache.beam.sdk.transforms.ParDo) PCollectionView(org.apache.beam.sdk.values.PCollectionView) Matchers.is(org.hamcrest.Matchers.is) Assert.assertEquals(org.junit.Assert.assertEquals)
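
Note: the test has to parse the manifest JSON back into an Export proto because table order in the output is not fixed. A minimal sketch of that round trip on its own is shown below; the class name is ours, the database option mirrors the one used in these examples, and only the standard protobuf-java-util JsonFormat calls relied on by the test are used (whether the template's writer uses JsonFormat's default printing is not shown here).

import com.google.cloud.teleport.spanner.ExportProtos.Export;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.util.JsonFormat;

public class ManifestJsonRoundTrip {
    public static void main(String[] args) throws InvalidProtocolBufferException {
        // Build a tiny Export proto carrying the same database option as the tests.
        Export export =
            Export.newBuilder()
                .addDatabaseOptions(
                    Export.DatabaseOption.newBuilder()
                        .setOptionName("version_retention_period")
                        .setOptionValue("5d")
                        .build())
                .build();

        // Serialize to JSON, the shape stored in spanner-export.json ...
        String json = JsonFormat.printer().print(export);

        // ... and parse it back, exactly as the PAssert lambda above does.
        Export.Builder parsed = Export.newBuilder();
        JsonFormat.parser().merge(json, parsed);
        System.out.println(parsed.build().getDatabaseOptions(0).getOptionValue()); // 5d
    }
}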

Example 4 with Export

use of com.google.cloud.teleport.spanner.ExportProtos.Export in project DataflowTemplates by GoogleCloudPlatform.

the class DdlTest method simple.

@Test
public void simple() {
    Ddl.Builder builder = Ddl.builder();
    builder.createTable("Users").column("id").int64().notNull().endColumn().column("first_name").string().size(10).endColumn().column("last_name").type(Type.string()).max().endColumn().column("full_name").type(Type.string()).max().generatedAs("CONCAT(first_name, ' ', last_name)").stored().endColumn().primaryKey().asc("id").end().indexes(ImmutableList.of("CREATE INDEX `UsersByFirstName` ON `Users` (`first_name`)")).foreignKeys(ImmutableList.of("ALTER TABLE `Users` ADD CONSTRAINT `fk` FOREIGN KEY (`first_name`)" + " REFERENCES `AllowedNames` (`first_name`)")).checkConstraints(ImmutableList.of("CONSTRAINT `ck` CHECK (`first_name` != `last_name`)")).endTable();
    Export export = Export.newBuilder().addDatabaseOptions(Export.DatabaseOption.newBuilder().setOptionName("version_retention_period").setOptionValue("4d").build()).build();
    builder.mergeDatabaseOptions(export.getDatabaseOptionsList());
    Ddl ddl = builder.build();
    assertThat(ddl.prettyPrint(), equalToCompressingWhiteSpace("ALTER DATABASE `%db_name%` SET OPTIONS ( version_retention_period = 4d )" + " CREATE TABLE `Users` (" + " `id` INT64 NOT NULL," + " `first_name` STRING(10)," + " `last_name` STRING(MAX)," + " `full_name` STRING(MAX) AS (CONCAT(first_name, ' ', last_name)) STORED," + " CONSTRAINT `ck` CHECK (`first_name` != `last_name`)," + " ) PRIMARY KEY (`id` ASC)" + " CREATE INDEX `UsersByFirstName` ON `Users` (`first_name`)" + " ALTER TABLE `Users` ADD CONSTRAINT `fk` FOREIGN KEY (`first_name`)" + " REFERENCES `AllowedNames` (`first_name`)"));
}
Also used : Export(com.google.cloud.teleport.spanner.ExportProtos.Export) Test(org.junit.Test)
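
Note: once the options have been merged as above, reading a specific option back out of an Export proto is just a scan of getDatabaseOptionsList(). A hedged helper is sketched below; the class and method names are ours, not the project's, and only accessors that appear in these examples are used.

import com.google.cloud.teleport.spanner.ExportProtos.Export;
import java.util.Optional;

public class DatabaseOptions {
    /** Returns the value of the named database option, if the manifest carries it. */
    static Optional<String> optionValue(Export export, String optionName) {
        return export.getDatabaseOptionsList().stream()
            .filter(option -> option.getOptionName().equals(optionName))
            .map(Export.DatabaseOption::getOptionValue)
            .findFirst();
    }

    public static void main(String[] args) {
        Export export =
            Export.newBuilder()
                .addDatabaseOptions(
                    Export.DatabaseOption.newBuilder()
                        .setOptionName("version_retention_period")
                        .setOptionValue("4d")
                        .build())
                .build();
        System.out.println(optionValue(export, "version_retention_period").orElse("unset")); // 4d
    }
}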

Example 5 with Export

use of com.google.cloud.teleport.spanner.ExportProtos.Export in project DataflowTemplates by GoogleCloudPlatform.

the class DdlTest method pgSimple.

@Test
public void pgSimple() {
    Ddl.Builder builder = Ddl.builder(Dialect.POSTGRESQL);
    builder.createTable("Users").column("id").pgInt8().notNull().endColumn().column("first_name").pgVarchar().size(10).defaultExpression("John").endColumn().column("last_name").type(Type.pgVarchar()).max().defaultExpression("Lennon").endColumn().column("full_name").type(Type.pgVarchar()).max().generatedAs("CONCAT(first_name, ' ', last_name)").stored().endColumn().primaryKey().asc("id").end().indexes(ImmutableList.of("CREATE INDEX \"UsersByFirstName\" ON \"Users\" (\"first_name\")")).foreignKeys(ImmutableList.of("ALTER TABLE \"Users\" ADD CONSTRAINT \"fk\" FOREIGN KEY (\"first_name\")" + " REFERENCES \"AllowedNames\" (\"first_name\")")).checkConstraints(ImmutableList.of("CONSTRAINT \"ck\" CHECK (\"first_name\" != \"last_name\")")).endTable();
    Export export = Export.newBuilder().addDatabaseOptions(Export.DatabaseOption.newBuilder().setOptionName("version_retention_period").setOptionValue("4d").setOptionType("STRING").build()).build();
    builder.mergeDatabaseOptions(export.getDatabaseOptionsList());
    Ddl ddl = builder.build();
    assertThat(ddl.prettyPrint(), equalToCompressingWhiteSpace("ALTER DATABASE \"%db_name%\" SET spanner.version_retention_period = '4d'" + " CREATE TABLE \"Users\" (" + " \"id\" bigint NOT NULL," + " \"first_name\" character varying(10) DEFAULT John," + " \"last_name\" character varying DEFAULT Lennon," + " \"full_name\" character varying GENERATED ALWAYS AS" + " (CONCAT(first_name, ' ', last_name)) STORED," + " CONSTRAINT \"ck\" CHECK (\"first_name\" != \"last_name\")," + " PRIMARY KEY (\"id\")" + " ) " + " CREATE INDEX \"UsersByFirstName\" ON \"Users\" (\"first_name\")" + " ALTER TABLE \"Users\" ADD CONSTRAINT \"fk\" FOREIGN KEY (\"first_name\")" + " REFERENCES \"AllowedNames\" (\"first_name\")"));
}
Also used : Export(com.google.cloud.teleport.spanner.ExportProtos.Export) Test(org.junit.Test)
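
Note: Examples 4 and 5 differ only in dialect; the same Export option prints as a GoogleSQL ALTER DATABASE ... SET OPTIONS clause or a PostgreSQL spanner.version_retention_period setting depending on how the Ddl.Builder was created. A hedged sketch of choosing the builder from the manifest's dialect follows; the ProtoDialect.POSTGRESQL constant and the mapping itself are assumptions, not taken from the project.

import com.google.cloud.spanner.Dialect;
import com.google.cloud.teleport.spanner.ExportProtos.Export;
import com.google.cloud.teleport.spanner.ExportProtos.ProtoDialect;
import com.google.cloud.teleport.spanner.ddl.Ddl;

public class DialectAwareDdl {
    static Ddl ddlFor(Export export) {
        // Assumption: ProtoDialect exposes a POSTGRESQL value mirroring Dialect.POSTGRESQL;
        // only GOOGLE_STANDARD_SQL appears explicitly in the examples above.
        Ddl.Builder builder =
            export.getDialect() == ProtoDialect.POSTGRESQL
                ? Ddl.builder(Dialect.POSTGRESQL)
                : Ddl.builder();
        builder.mergeDatabaseOptions(export.getDatabaseOptionsList());
        return builder.build();
    }
}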

Aggregations

Export (com.google.cloud.teleport.spanner.ExportProtos.Export): 5 usages
Dialect (com.google.cloud.spanner.Dialect): 3 usages
ProtoDialect (com.google.cloud.teleport.spanner.ExportProtos.ProtoDialect): 3 usages
Ddl (com.google.cloud.teleport.spanner.ddl.Ddl): 3 usages
List (java.util.List): 3 usages
Test (org.junit.Test): 3 usages
TimestampBound (com.google.cloud.spanner.TimestampBound): 2 usages
TableManifest (com.google.cloud.teleport.spanner.ExportProtos.TableManifest): 2 usages
InvalidProtocolBufferException (com.google.protobuf.InvalidProtocolBufferException): 2 usages
JsonFormat (com.google.protobuf.util.JsonFormat): 2 usages
Files (java.nio.file.Files): 2 usages
Path (java.nio.file.Path): 2 usages
ArrayList (java.util.ArrayList): 2 usages
Map (java.util.Map): 2 usages
KV (org.apache.beam.sdk.values.KV): 2 usages
Timestamp (com.google.cloud.Timestamp): 1 usage
Mutation (com.google.cloud.spanner.Mutation): 1 usage
Struct (com.google.cloud.spanner.Struct): 1 usage
Builder (com.google.cloud.teleport.spanner.ExportProtos.Export.Builder): 1 usage
BuildTableManifests (com.google.cloud.teleport.spanner.ExportTransform.BuildTableManifests): 1 usage