
Example 1 with ImportManifest

Use of com.google.cloud.teleport.spanner.TextImportProtos.ImportManifest in the DataflowTemplates project by GoogleCloudPlatform.

From the class TextImportTransform, the method expand:

@Override
public PDone expand(PBegin begin) {
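    // Create a transaction view so that the subsequent schema reads use a consistent snapshot.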
    PCollectionView<Transaction> tx = begin.apply(SpannerIO.createTransaction().withSpannerConfig(spannerConfig));
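    // Detect the database dialect (e.g. GoogleSQL or PostgreSQL) and expose it as a singleton side input.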
    PCollectionView<Dialect> dialectView = begin.apply("Read Dialect", new ReadDialect(spannerConfig)).apply("Dialect As PCollectionView", View.asSingleton());
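    // Read the information schema to reconstruct the database DDL.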
    PCollection<Ddl> ddl = begin.apply("Read Information Schema", new ReadInformationSchema(spannerConfig, tx, dialectView));
    PCollectionView<Ddl> ddlView = ddl.apply("Cloud Spanner DDL as view", View.asSingleton());
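    // Parse the import manifest, then resolve each manifest entry into concrete per-table data files.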
    PCollection<ImportManifest> manifest = begin.apply("Read manifest file", new ReadImportManifest(importManifest, dialectView));
    PCollection<KV<String, String>> allFiles = manifest.apply("Resolve data files", new ResolveDataFiles(importManifest, ddlView));
    PCollection<Map<String, List<TableManifest.Column>>> tableColumns = manifest.apply("Read table columns from manifest", new ReadTableColumns());
    PCollectionView<Map<String, List<TableManifest.Column>>> tableColumnsView = tableColumns.apply("tableColumns as View", View.asSingleton());
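    // Collect the resolved data files into one list per table.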
    PCollection<KV<String, List<String>>> tableFiles = allFiles.apply(Combine.perKey(AsList.fn()));
    // TODO: add a step to check that the schema in the manifest matches the database schema.
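    // Map each interleave depth to the set of tables at that depth; parent tables sit at smaller depths than their interleaved children.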
    PCollection<HashMultimap<Integer, String>> levelMap = ddl.apply(
        "Group tables by depth",
        ParDo.of(new DoFn<Ddl, HashMultimap<Integer, String>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            Ddl ddl = c.element();
            c.output(ddl.perLevelView());
        }
    }));
    PCollectionView<HashMultimap<Integer, String>> levelsView = levelMap.apply("Level map as view", View.asSingleton());
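    // Flatten all (table, files) pairs into a single multimap keyed by lower-cased table name.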
    PCollection<HashMultimap<String, String>> tablesToFilesMap = tableFiles
        .apply("Combine table files", Combine.globally(AsList.fn()))
        .apply("As HashMultimap",
            ParDo.of(new DoFn<List<KV<String, List<String>>>, HashMultimap<String, String>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            HashMultimap<String, String> result = HashMultimap.create();
            for (KV<String, List<String>> kv : c.element()) {
                result.putAll(kv.getKey().toLowerCase(), kv.getValue());
            }
            c.output(result);
        }
    }));
    PCollection<?> previousComputation = ddl;
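    // Import one interleave level at a time: each level's write starts only after the previous
    // level has finished, so parent rows exist before their interleaved children are written.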
    for (int i = 0; i < MAX_DEPTH; i++) {
        final int depth = i;
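        // Emit a (file, table) pair for every data file belonging to a table at the current depth.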
        PCollection<KV<String, String>> levelFileToTables = tablesToFilesMap.apply(
            "Store depth " + depth,
            ParDo.of(new DoFn<HashMultimap<String, String>, KV<String, String>>() {

            @ProcessElement
            public void processElement(ProcessContext c) {
                HashMultimap<String, String> allFiles = c.element();
                HashMultimap<Integer, String> levels = c.sideInput(levelsView);
                Set<String> tables = levels.get(depth);
                for (String table : tables) {
                    for (String file : allFiles.get(table)) {
                        c.output(KV.of(file, table));
                    }
                }
            }
        }).withSideInputs(levelsView));
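        // Reshuffle to spread the files across workers, then parse each text file into Spanner mutations.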
        PCollection<Mutation> mutations = levelFileToTables
            .apply("Reshuffle text files " + depth, Reshuffle.viaRandomKey())
            .apply("Text files as mutations. Depth: " + depth,
                new TextTableFilesAsMutations(ddlView, tableColumnsView));
        SpannerWriteResult result = mutations
            .apply("Wait for previous depth " + depth, Wait.on(previousComputation))
            .apply("Write mutations " + depth,
                SpannerIO.write()
                    .withSpannerConfig(spannerConfig)
                    .withCommitDeadline(Duration.standardMinutes(1))
                    .withMaxCumulativeBackoff(Duration.standardHours(2))
                    .withMaxNumMutations(10000)
                    .withGroupingFactor(100)
                    .withDialectView(dialectView));
        previousComputation = result.getOutput();
    }
    return PDone.in(begin.getPipeline());
}
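
For reference, a caller would typically apply this transform once at the top of a pipeline, pointing it at a manifest on GCS. Below is a minimal sketch, assuming the TextImportTransform constructor takes the SpannerConfig and a ValueProvider<String> for the manifest path; the project, instance, database, and bucket names are hypothetical:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.spanner.SpannerConfig;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;

// Hypothetical driver code; adjust the IDs and paths for a real environment.
Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
SpannerConfig spannerConfig = SpannerConfig.create()
    .withProjectId("my-project")        // hypothetical project ID
    .withInstanceId("my-instance")      // hypothetical instance ID
    .withDatabaseId("my-database");     // hypothetical database ID
pipeline.apply("Import text files", new TextImportTransform(
    spannerConfig,
    ValueProvider.StaticValueProvider.of("gs://my-bucket/import-manifest.json")));
pipeline.run().waitUntilFinish();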
Also used:
Set (java.util.Set)
Ddl (com.google.cloud.teleport.spanner.ddl.Ddl)
ImportManifest (com.google.cloud.teleport.spanner.TextImportProtos.ImportManifest)
Column (com.google.cloud.teleport.spanner.ddl.Column)
Dialect (com.google.cloud.spanner.Dialect)
ProtoDialect (com.google.cloud.teleport.spanner.ExportProtos.ProtoDialect)
List (java.util.List)
KV (org.apache.beam.sdk.values.KV)
HashMultimap (com.google.common.collect.HashMultimap)
DoFn (org.apache.beam.sdk.transforms.DoFn)
Transaction (org.apache.beam.sdk.io.gcp.spanner.Transaction)
SpannerWriteResult (org.apache.beam.sdk.io.gcp.spanner.SpannerWriteResult)
Mutation (com.google.cloud.spanner.Mutation)
Map (java.util.Map)
HashMap (java.util.HashMap)
