Use of com.google.cloud.teleport.spanner.TextImportProtos.ImportManifest in the DataflowTemplates project by GoogleCloudPlatform.
The example is the expand method of the TextImportTransform class, which reads the import manifest, resolves the referenced text files, and writes them to Cloud Spanner one interleaving depth at a time.
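For orientation, here is a minimal sketch of what an ImportManifest instance carries. This is illustrative only: the nesting of TableManifest inside ImportManifest and the table_name / file_patterns field names are assumptions about TextImportProtos (they mirror the JSON manifest format the template accepts), not something shown in the method below.

import com.google.cloud.teleport.spanner.TextImportProtos.ImportManifest;
import com.google.cloud.teleport.spanner.TextImportProtos.ImportManifest.TableManifest;

public class ManifestSketch {
  public static void main(String[] args) {
    // Hypothetical manifest: one table whose rows live in CSV files matching a GCS pattern.
    ImportManifest manifest =
        ImportManifest.newBuilder()
            .addTables(
                TableManifest.newBuilder()
                    .setTableName("Singers")
                    .addFilePatterns("gs://my-bucket/Singers*.csv"))
            .build();
    System.out.println(manifest);
  }
}

The expand method itself: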
@Override
public PDone expand(PBegin begin) {
  PCollectionView<Transaction> tx =
      begin.apply(SpannerIO.createTransaction().withSpannerConfig(spannerConfig));
  PCollectionView<Dialect> dialectView =
      begin.apply("Read Dialect", new ReadDialect(spannerConfig))
          .apply("Dialect As PCollectionView", View.asSingleton());
  PCollection<Ddl> ddl =
      begin.apply(
          "Read Information Schema", new ReadInformationSchema(spannerConfig, tx, dialectView));
  PCollectionView<Ddl> ddlView = ddl.apply("Cloud Spanner DDL as view", View.asSingleton());
  PCollection<ImportManifest> manifest =
      begin.apply("Read manifest file", new ReadImportManifest(importManifest, dialectView));
  PCollection<KV<String, String>> allFiles =
      manifest.apply("Resolve data files", new ResolveDataFiles(importManifest, ddlView));
  PCollection<Map<String, List<TableManifest.Column>>> tableColumns =
      manifest.apply("Read table columns from manifest", new ReadTableColumns());
  PCollectionView<Map<String, List<TableManifest.Column>>> tableColumnsView =
      tableColumns.apply("tableColumns as View", View.asSingleton());
  PCollection<KV<String, List<String>>> tableFiles = allFiles.apply(Combine.perKey(AsList.fn()));

  // TODO: add a step to check that the schema in the manifest matches the database schema.

  // Group table names by their depth in the interleaving hierarchy.
  PCollection<HashMultimap<Integer, String>> levelMap =
      ddl.apply(
          "Group tables by depth",
          ParDo.of(
              new DoFn<Ddl, HashMultimap<Integer, String>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  Ddl ddl = c.element();
                  c.output(ddl.perLevelView());
                }
              }));
  PCollectionView<HashMultimap<Integer, String>> levelsView =
      levelMap.apply("Level map as view", View.asSingleton());

  // Collapse all (table, files) pairs into one multimap keyed by lower-cased table name.
  PCollection<HashMultimap<String, String>> tablesToFilesMap =
      tableFiles
          .apply("Combine table files", Combine.globally(AsList.fn()))
          .apply(
              "As HashMultimap",
              ParDo.of(
                  new DoFn<List<KV<String, List<String>>>, HashMultimap<String, String>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      HashMultimap<String, String> result = HashMultimap.create();
                      for (KV<String, List<String>> kv : c.element()) {
                        result.putAll(kv.getKey().toLowerCase(), kv.getValue());
                      }
                      c.output(result);
                    }
                  }));

  // Import one depth at a time so parent tables are written before their interleaved children.
  PCollection<?> previousComputation = ddl;
  for (int i = 0; i < MAX_DEPTH; i++) {
    final int depth = i;
    PCollection<KV<String, String>> levelFileToTables =
        tablesToFilesMap.apply(
            "Store depth " + depth,
            ParDo.of(
                    new DoFn<HashMultimap<String, String>, KV<String, String>>() {
                      @ProcessElement
                      public void processElement(ProcessContext c) {
                        HashMultimap<String, String> allFiles = c.element();
                        HashMultimap<Integer, String> levels = c.sideInput(levelsView);
                        Set<String> tables = levels.get(depth);
                        for (String table : tables) {
                          for (String file : allFiles.get(table)) {
                            c.output(KV.of(file, table));
                          }
                        }
                      }
                    })
                .withSideInputs(levelsView));
    PCollection<Mutation> mutations =
        levelFileToTables
            .apply("Reshuffle text files " + depth, Reshuffle.viaRandomKey())
            .apply(
                "Text files as mutations. Depth: " + depth,
                new TextTableFilesAsMutations(ddlView, tableColumnsView));
    SpannerWriteResult result =
        mutations
            .apply("Wait for previous depth " + depth, Wait.on(previousComputation))
            .apply(
                "Write mutations " + depth,
                SpannerIO.write()
                    .withSpannerConfig(spannerConfig)
                    .withCommitDeadline(Duration.standardMinutes(1))
                    .withMaxCumulativeBackoff(Duration.standardHours(2))
                    .withMaxNumMutations(10000)
                    .withGroupingFactor(100)
                    .withDialectView(dialectView));
    previousComputation = result.getOutput();
  }
  return PDone.in(begin.getPipeline());
}
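The per-depth loop is the interesting part: Wait.on(previousComputation) keeps the write for depth N from starting until the previous depth's SpannerWriteResult output exists, so parent tables are committed before their interleaved children. Below is a standalone sketch of that ordering pattern using only Beam core transforms; the class, step, and element names are illustrative and not taken from the template.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.Wait;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class WaitOnOrderingSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Stand-in for "write depth-0 (parent) tables"; its output doubles as the completion signal.
    PCollection<String> parentsWritten =
        p.apply("Parent rows", Create.of("Singers/1", "Singers/2"))
            .apply("Write parents",
                MapElements.into(TypeDescriptors.strings()).via((String row) -> "wrote " + row));

    // Stand-in for "write depth-1 (child) tables": held back until the parent write has produced
    // output, mirroring mutations.apply(Wait.on(previousComputation)) in expand() above.
    p.apply("Child rows", Create.of("Albums/1"))
        .apply("Wait for parents", Wait.on(parentsWritten))
        .apply("Write children",
            MapElements.into(TypeDescriptors.strings()).via((String row) -> "wrote " + row));

    p.run().waitUntilFinish();
  }
}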