* Reads the Cloud Spanner schema and all the rows of every table in the database, then creates
* and writes the exported Avro files to GCS.
// Builds the export pipeline: creates a transaction view, reads the dialect and the DDL,
// validates the table selection, reads rows, writes them as Avro files, and writes the
// per-table manifests and the database manifest. Returns the result of the Avro write step.
// NOTE(review): this listing appears to have lost lines during extraction (closing braces,
// annotations, and parts of several expressions); confirm against the upstream file before
// changing any code here.
public WriteFilesResult<String> expand(PBegin begin) {
Pipeline p = begin.getPipeline();
* Allow users to specify read timestamp.
* CreateTransaction and CreateTransactionFn classes in SpannerIO
* only take a timestamp object for exact staleness which works when
* parameters are provided during template compile time. They do not work with
* a Timestamp valueProvider which can take parameters at runtime. Hence a new
* ParDo class CreateTransactionFnWithTimestamp had to be created for this
* purpose.
// Singleton side input holding the transaction created from spannerConfig and snapshotTime.
PCollectionView<Transaction> tx = p.apply("CreateTransaction", Create.of(1)).apply("Create transaction", ParDo.of(new CreateTransactionFnWithTimestamp(spannerConfig, snapshotTime))).apply("Tx As PCollectionView", View.asSingleton());
// Singleton side input holding the database dialect.
PCollectionView<Dialect> dialectView = p.apply("Read Dialect", new ReadDialect(spannerConfig)).apply("Dialect As PCollectionView", View.asSingleton());
PCollection<Ddl> ddl = p.apply("Read Information Schema", new ReadInformationSchema(spannerConfig, tx, dialectView));
// Validates the tableNames / exportRelatedTables options against the DDL.
// NOTE(review): exportState is not referenced again in the visible part of this method.
PCollection<Ddl> exportState = ddl.apply("Check export conditions", ParDo.of(new DoFn<Ddl, Ddl>() {
public void processElement(ProcessContext c) throws Exception {
Ddl ddl = c.element();
List<String> tablesList = Collections.emptyList();
// a list of export tables, throw an exception.
// Rejects exportRelatedTables=true when the (trimmed) tableNames value is empty.
if (tableNames.get().trim().isEmpty() && exportRelatedTables.get()) {
throw new Exception("Invalid usage of --tableNames and --shouldExportRelatedTables. Set" + " --shouldExportRelatedTables=true only if --tableNames is given" + " selected tables for export.");
// If the user provides a comma-separated list of strings, parse it into a List
if (!tableNames.get().trim().isEmpty()) {
tablesList = Arrays.asList(tableNames.get().split(",\\s*"));
// If the user provided any invalid table names, throw an exception.
// NOTE(review): the lambda body on the next line and the stream source on the line after it
// are truncated in this listing.
List<String> allSpannerTables = ddl.allTables().stream().map(t ->;
List<String> invalidTables = -> !allSpannerTables.contains(t)).collect(Collectors.toList());
if (invalidTables.size() != 0) {
throw new Exception("INVALID_ARGUMENT: Table(s) not found: " + String.join(", ", invalidTables) + ".");
// NOTE(review): the lambda body below is truncated in this listing.
List<String> filteredTables = getFilteredTables(ddl, tablesList).stream().map(t ->;
// Save any missing necessary export table names; save a copy of the original
// table list to bypass 'final or effectively final' condition of the lambda
// expression below.
// NOTE(review): the right-hand side of usersTables and the stream source of missingTables
// are truncated in this listing.
List<String> usersTables =;
List<String> missingTables = -> !usersTables.contains(t)).collect(Collectors.toList());
// throw an exception.
// Fails when a non-empty requested list differs from the list derived from
// getFilteredTables and exportRelatedTables is not set.
if (tablesList.size() != 0 && !(tablesList.equals(filteredTables)) && !exportRelatedTables.get()) {
throw new Exception("Attempted to export table(s) requiring parent and/or foreign keys tables" + " without setting the shouldExportRelatedTables parameter. Set" + " --shouldExportRelatedTables=true to export all necessary" + " tables, or add " + String.join(", ", missingTables) + " to --tableNames.");
// Builds the read operations for the selected tables (BuildReadFromTableOperations).
PCollection<ReadOperation> tables = ddl.apply("Build table read operations", new BuildReadFromTableOperations(tableNames));
// Emits one KV with a null value per table and per view in the DDL.
// NOTE(review): the key expressions passed to KV.of and the loop variable type for views are
// truncated in this listing.
PCollection<KV<String, Void>> allTableAndViewNames = ddl.apply("List all table and view names", ParDo.of(new DoFn<Ddl, KV<String, Void>>() {
public void processElement(ProcessContext c) {
Ddl ddl = c.element();
for (Table t : ddl.allTables()) {
c.output(KV.of(, null));
// we need to add the names of all views separately here.
for ( v : ddl.views()) {
c.output(KV.of(, null));
// NOTE(review): the body of the change stream loop is not visible in this listing.
PCollection<String> allChangeStreamNames = ddl.apply("List all change stream names", ParDo.of(new DoFn<Ddl, String>() {
public void processElement(ProcessContext c) {
Ddl ddl = c.element();
for (ChangeStream changeStream : ddl.changeStreams()) {
// Generate a unique output directory name.
final PCollectionView<String> outputDirectoryName = p.apply(Create.of(1)).apply("Create Avro output folder", ParDo.of(new DoFn<Integer, String>() {
public void processElement(ProcessContext c) {
String instanceId = spannerConfig.getInstanceId().get();
String dbId = spannerConfig.getDatabaseId().get();
// For direct runner or tests we need a deterministic jobId.
String testJobId = ExportTransform.this.testJobId.get();
// NOTE(review): the body of this branch is not visible; presumably it outputs a name based
// on testJobId and returns - confirm.
if (!Strings.isNullOrEmpty(testJobId)) {
// Derives the directory name as <instanceId>-<dbId>-<jobId> from the worker harness options.
try {
DataflowWorkerHarnessOptions workerHarnessOptions = c.getPipelineOptions().as(DataflowWorkerHarnessOptions.class);
String jobId = workerHarnessOptions.getJobId();
c.output(instanceId + "-" + dbId + "-" + jobId);
// NOTE(review): the caught exception is not attached as the cause of the exception thrown below.
} catch (Exception e) {
throw new IllegalStateException("Please specify --testJobId to run with non-dataflow runner");
// Converts the DDL into Avro schemas and exposes them as a map side input keyed by schema name.
final PCollectionView<Map<String, SerializableSchemaSupplier>> avroSchemas = ddl.apply("Build Avro schemas from DDL", ParDo.of(new DoFn<Ddl, KV<String, SerializableSchemaSupplier>>() {
public void processElement(ProcessContext c) {
Collection<Schema> avroSchemas = new DdlToAvroSchemaConverter("spannerexport", "1.0.0", shouldExportTimestampAsLogicalType.get()).convert(c.element());
for (Schema schema : avroSchemas) {
c.output(KV.of(schema.getName(), new SerializableSchemaSupplier(schema)));
})).apply("As view", View.asMap());
// Executes every read operation using the shared transaction view.
PCollection<Struct> rows = tables.apply("Read all rows from Spanner", SpannerIO.readAll().withTransaction(tx).withSpannerConfig(spannerConfig));
// Output location, and temp location chosen by eitherOrValueProvider(avroTempDirectory, outputDir).
ValueProvider<ResourceId> resource = ValueProvider.NestedValueProvider.of(outputDir, (SerializableFunction<String, ResourceId>) s -> FileSystems.matchNewResource(s, true));
ValueProvider<ResourceId> tempResource = ValueProvider.NestedValueProvider.of(eitherOrValueProvider(avroTempDirectory, outputDir), (SerializableFunction<String, ResourceId>) s -> FileSystems.matchNewResource(s, true));
// Writes the rows as Avro generic records; destinations come from SchemaBasedDynamicDestinations.
WriteFilesResult<String> fileWriteResults = rows.apply("Store Avro files", AvroIO.<Struct>writeCustomTypeToGenericRecords().to(new SchemaBasedDynamicDestinations(avroSchemas, outputDirectoryName, dialectView, resource)).withTempDirectory(tempResource));
// Generate the manifest file.
// Groups the written file names by destination key.
PCollection<KV<String, Iterable<String>>> tableFiles = fileWriteResults.getPerDestinationOutputFilenames().apply(GroupByKey.create());
// Joins all table/view names with the names that produced files, so names without files can be found.
final TupleTag<Void> allTables = new TupleTag<>();
final TupleTag<Iterable<String>> nonEmptyTables = new TupleTag<>();
PCollection<KV<String, CoGbkResult>> groupedTables = KeyedPCollectionTuple.of(allTables, allTableAndViewNames).and(nonEmptyTables, tableFiles).apply("Group with all tables", CoGroupByKey.create());
// The following is to export empty tables and views from the database. Empty tables and views
// are handled together because we do not export any rows for views, only their metadata,
// including the queries defining them.
PCollection<KV<String, Iterable<String>>> emptyTablesAndViews = groupedTables.apply("Export empty tables and views", ParDo.of(new DoFn<KV<String, CoGbkResult>, KV<String, Iterable<String>>>() {
public void processElement(ProcessContext c) {
KV<String, CoGbkResult> kv = c.element();
String table = kv.getKey();
CoGbkResult coGbkResult = kv.getValue();
// A null result here means no file names were grouped under nonEmptyTables for this key.
Iterable<String> only = coGbkResult.getOnly(nonEmptyTables, null);
// NOTE(review): the call that consumes the message string on the next line is truncated.
if (only == null) {"Exporting empty table or view: " + table);
// This file will contain the schema definition: column definitions for empty
// tables or defining queries for views.
c.output(KV.of(table, Collections.singleton(table + ".avro-00000-of-00001")));
// Emits one placeholder file name per change stream.
PCollection<KV<String, Iterable<String>>> changeStreams = allChangeStreamNames.apply("Export change streams", ParDo.of(new DoFn<String, KV<String, Iterable<String>>>() {
public void processElement(ProcessContext c) {
// NOTE(review): the call that consumes the message string on the next line is truncated.
String changeStreamName = c.element();"Exporting change stream: " + changeStreamName);
// This file will contain the schema definition for the change stream.
c.output(KV.of(changeStreamName, Collections.singleton(changeStreamName + ".avro-00000-of-00001")));
// Empty tables, views and change streams are handled together, because we export them as empty
// Avro files that only contain the Avro schemas.
PCollection<KV<String, Iterable<String>>> emptySchemaFiles = PCollectionList.of(emptyTablesAndViews).and(changeStreams).apply("Combine all empty schema files", Flatten.pCollections());
// Writes one Avro file per entry and re-emits the entry with the full path of the written file.
emptySchemaFiles = emptySchemaFiles.apply("Save empty schema files", ParDo.of(new DoFn<KV<String, Iterable<String>>, KV<String, Iterable<String>>>() {
public void processElement(ProcessContext c) {
Map<String, SerializableSchemaSupplier> schemaMap = c.sideInput(avroSchemas);
KV<String, Iterable<String>> kv = c.element();
String objectName = kv.getKey();
// Only the first file name of the entry is used.
String fileName = kv.getValue().iterator().next();
Schema schema = schemaMap.get(objectName).get();
DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
Path fullPath = createOutputPath(outputDir.get(), c.sideInput(outputDirectoryName), fileName);
// The writer is created with the schema and closed by try-with-resources; no records are
// appended in the visible code.
try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
dataFileWriter.create(schema, createOutputStream(fullPath, c));
} catch (IOException e) {
throw new RuntimeException(e);
c.output(KV.of(objectName, Collections.singleton(fullPath.toString())));
* Resolves the complete path name for Avro files for both GCS and local FS
* (for testing).
* @param outputDirectoryPath Initial directory path for the file.
* @param outputDirectoryName Terminal directory for the file.
* @param fileName Name of the Avro file
* @return The full {@link Path} of the output Avro file.
private Path createOutputPath(String outputDirectoryPath, String outputDirectoryName, String fileName) {
if (GcsPath.GCS_URI.matcher(outputDirectoryPath).matches()) {
// Avro file path in GCS.
return GcsPath.fromUri(outputDirectoryPath).resolve(outputDirectoryName).resolve(fileName);
} else {
// Avro file path in local filesystem
return Paths.get(outputDirectoryPath, outputDirectoryName, fileName);
* Creates the {@link OutputStream} for the output file either on GCS or on
* local FS (for testing).
* @param outputPath The full path of the output file.
* @param c The {@link org.apache.beam.sdk.transforms.DoFn.ProcessContext}
* @return An {@link OutputStream} for the opened output file.
* @throws IOException if the output file cannot be opened.
private OutputStream createOutputStream(Path outputPath, ProcessContext c) throws IOException {
if (GcsPath.GCS_URI.matcher(outputPath.toString()).matches()) {
// Writing the Avro file to GCS.
org.apache.beam.sdk.extensions.gcp.util.GcsUtil gcsUtil = c.getPipelineOptions().as(GcsOptions.class).getGcsUtil();
String gcsType = "application/octet-stream";
// NOTE(review): the cast presumably relies on createOutputPath having built a GcsPath for
// GCS URIs - confirm for other callers.
WritableByteChannel gcsChannel = gcsUtil.create((GcsPath) outputPath, gcsType);
return Channels.newOutputStream(gcsChannel);
} else {
// Avro file is created on local filesystem (for testing).
return Files.newOutputStream(outputPath);
}).withSideInputs(avroSchemas, outputDirectoryName));
// Merges the data-file entries with the schema-only entries and builds one manifest per key.
PCollection<KV<String, Iterable<String>>> allFiles = PCollectionList.of(tableFiles).and(emptySchemaFiles).apply("Combine all files", Flatten.pCollections());
PCollection<KV<String, String>> tableManifests = allFiles.apply("Build table manifests", ParDo.of(new BuildTableManifests()));
// Each table manifest is named by tableManifestFileName under outputDir/outputDirectoryName.
Contextful.Fn<String, FileIO.Write.FileNaming> tableManifestNaming = (element, c) -> (window, pane, numShards, shardIndex, compression) -> GcsUtil.joinPath(outputDir.get(), c.sideInput(outputDirectoryName), tableManifestFileName(element));
tableManifests.apply("Store table manifests", FileIO.<String, KV<String, String>>writeDynamic().by(KV::getKey).withDestinationCoder(StringUtf8Coder.of()).withNaming(Contextful.of(tableManifestNaming, Requirements.requiresSideInputs(outputDirectoryName))).via(Contextful.fn(KV::getValue), TextIO.sink()).withTempDirectory(eitherOrValueProvider(avroTempDirectory, outputDir)));
// Combines all table manifests into the database manifest, using the DDL and dialect side inputs.
PCollection<List<Export.Table>> metadataTables = tableManifests.apply("Combine table metadata", Combine.globally(new CombineTableMetadata()));
PCollectionView<Ddl> ddlView = ddl.apply("Cloud Spanner DDL as view", View.asSingleton());
PCollection<String> metadataContent = metadataTables.apply("Create database manifest", ParDo.of(new CreateDatabaseManifest(ddlView, dialectView)).withSideInputs(ddlView, dialectView));
// The database manifest is written as "spanner-export.json" in the same output directory.
Contextful.Fn<String, FileIO.Write.FileNaming> manifestNaming = (element, c) -> (window, pane, numShards, shardIndex, compression) -> GcsUtil.joinPath(outputDir.get(), c.sideInput(outputDirectoryName), "spanner-export.json");
metadataContent.apply("Store the database manifest", FileIO.<String, String>writeDynamic().by(SerializableFunctions.constant("")).withDestinationCoder(StringUtf8Coder.of()).via(TextIO.sink()).withNaming(Contextful.of(manifestNaming, Requirements.requiresSideInputs(outputDirectoryName))).withTempDirectory(eitherOrValueProvider(avroTempDirectory, outputDir)));
return fileWriteResults;
// Parses one CSV text line (the KV value) for the table named by the KV key, and outputs the
// result of parseRow for that row.
// NOTE(review): some lines in this listing appear truncated by extraction (see the notes
// below); confirm against the upstream file before changing code.
public void processElement(ProcessContext c) throws IOException {
* Input string is one line but Apache CSVParser process multiple lines, so we only take the
* first item in the result list
KV<String, String> kv = c.element();
String tableName = kv.getKey();
Ddl ddl = c.sideInput(ddlView);
Map<String, List<TableManifest.Column>> tableColumnsMap = c.sideInput(tableColumnsView);
Table table = ddl.table(tableName);
Reader in = new StringReader(kv.getValue());
// The CSV format is assembled from the configured columnDelimiter, fieldQualifier,
// trailingDelimiter, escape and nullString values; empty lines are ignored.
CSVFormat csvFormat = CSVFormat.newFormat(columnDelimiter.get()).withQuote(fieldQualifier.get()).withIgnoreEmptyLines(true).withTrailingDelimiter(trailingDelimiter.get()).withEscape(escape.get()).withNullString(nullString.get());
// NOTE(review): no close() of the parser or reader is visible in this block.
CSVParser parser = new CSVParser(in, csvFormat);
List<CSVRecord> list = parser.getRecords();
// NOTE(review): the body of the empty-list branch is not visible; presumably it returns
// without emitting anything - confirm.
if (list.isEmpty()) {
// More than one record means the input line did not parse as a single row.
if (list.size() > 1) {
throw new RuntimeException("Unable to parse this row: " + c.element());
CSVRecord row = list.get(0);
// NOTE(review): the argument of newInsertOrUpdateBuilder is truncated, and writeBuilder is
// not declared in the visible code (presumably a field) - confirm.
writeBuilder = Mutation.newInsertOrUpdateBuilder(;
// IllegalArgumentException from parseRow is rethrown with the row and table in the message
// and the original exception as the cause.
try {
c.output(parseRow(writeBuilder, row, table, tableColumnsMap.get(tableName)));
} catch (IllegalArgumentException e) {
throw new RuntimeException(String.format("Error to parseRow. row: %s, table: %s", row, table), e);
use of in project DataflowTemplates by GoogleCloudPlatform.
the class AvroSchemaToDdlConverter method toTable.
// Builds a Table from an Avro schema, reading column and table metadata from the custom
// properties stored on the schema and its fields.
// NOTE(review): many builder calls and loop exits appear to have been dropped from this
// listing during extraction; confirm against the upstream file before changing code.
public Table toTable(String tableName, Schema schema) {
// Falls back to the Avro schema name when no table name is supplied.
if (tableName == null) {
tableName = schema.getName();
LOG.debug("Converting to Ddl tableName {}", tableName);
// NOTE(review): the doubled semicolon suggests a truncated statement after the builder
// creation (presumably setting the table name) - confirm.
Table.Builder table = Table.builder(dialect);;
for (Schema.Field f : schema.getFields()) {
// NOTE(review): the argument of table.column is truncated in this listing.
Column.Builder column = table.column(;
String sqlType = f.getProp("sqlType");
String expression = f.getProp("generationExpression");
if (expression != null) {
// This is a generated column.
// Generated columns must carry the sqlType, notNull and stored properties.
if (Strings.isNullOrEmpty(sqlType)) {
throw new IllegalArgumentException("Property sqlType is missing for generated column " +;
String notNull = f.getProp("notNull");
if (notNull == null) {
throw new IllegalArgumentException("Property notNull is missing for generated column " +;
String stored = f.getProp("stored");
if (stored == null) {
throw new IllegalArgumentException("Property stored is missing for generated column " +;
// NOTE(review): the body of the stored branch is not visible in this listing.
if (Boolean.parseBoolean(stored)) {
} else {
// Regular column: a UNION type is unpacked with unpackNullable, and the column is
// treated as nullable when that returns a non-null schema.
boolean nullable = false;
Schema avroType = f.schema();
if (avroType.getType() == Schema.Type.UNION) {
Schema unpacked = unpackNullable(avroType);
nullable = unpacked != null;
if (nullable) {
avroType = unpacked;
// Without an explicit sqlType property, the type is inferred from the Avro type.
if (Strings.isNullOrEmpty(sqlType)) {
Type spannerType = inferType(avroType, true);
sqlType = toString(spannerType, true);
String defaultExpression = f.getProp("defaultExpression");
ImmutableList.Builder<String> columnOptions = ImmutableList.builder();
// Reads the numbered properties spannerOption_0, spannerOption_1, ...
// NOTE(review): the loop exit and the use of each option are not visible in this listing.
for (int i = 0; ; i++) {
String spannerOption = f.getProp("spannerOption_" + i);
if (spannerOption == null) {
// Reads the numbered properties spannerPrimaryKey_0, spannerPrimaryKey_1, ...
// NOTE(review): the loop exit on a null property is not visible in this listing.
for (int i = 0; ; i++) {
String spannerPrimaryKey = schema.getProp("spannerPrimaryKey_" + i);
if (spannerPrimaryKey == null) {
// Each key part must end in " ASC" or " DESC"; the suffix is stripped to get the column name.
if (spannerPrimaryKey.endsWith(" ASC")) {
String name = spannerPrimaryKey.substring(0, spannerPrimaryKey.length() - 4);
table.primaryKey().asc(unescape(name, dialect)).end();
} else if (spannerPrimaryKey.endsWith(" DESC")) {
String name = spannerPrimaryKey.substring(0, spannerPrimaryKey.length() - 5);
table.primaryKey().desc(unescape(name, dialect)).end();
} else {
throw new IllegalArgumentException("Cannot parse spannerPrimaryKey " + spannerPrimaryKey);
// Indexes, foreign keys and check constraints are copied from numbered schema properties.
table.indexes(getNumberedPropsWithPrefix(schema, "spannerIndex_"));
table.foreignKeys(getNumberedPropsWithPrefix(schema, "spannerForeignKey_"));
table.checkConstraints(getNumberedPropsWithPrefix(schema, "spannerCheckConstraint_"));
// Table parent options.
String spannerParent = schema.getProp("spannerParent");
if (!Strings.isNullOrEmpty(spannerParent)) {
// Process the on delete action.
String onDeleteAction = schema.getProp("spannerOnDeleteAction");
if (onDeleteAction == null) {
// Preserve behavior for old versions of exporter that did not provide the
// spannerOnDeleteAction property.
onDeleteAction = "no action";
// Only "cascade" and "no action" are accepted.
// NOTE(review): the body of the cascade branch, and the statements that finish the table
// and return it, are not visible in this listing.
if (onDeleteAction.equals("cascade")) {
} else if (!onDeleteAction.equals("no action")) {
// This is an unknown on delete action.
throw new IllegalArgumentException("Unsupported ON DELETE action " + onDeleteAction);
use of in project DataflowTemplates by GoogleCloudPlatform.
the class BuildReadFromTableOperations method expand.
// For each Ddl element, builds one query-based ReadOperation per table returned by
// getFilteredTables. Each query selects the table name as the column _spanner_table plus the
// non-generated columns of the table.
// NOTE(review): the switch case labels, the table-name arguments of String.format and the
// statements that emit the read operation appear truncated in this listing; confirm against
// the upstream file before changing code.
public PCollection<ReadOperation> expand(PCollection<Ddl> ddl) {
return ddl.apply("Read from table operations", ParDo.of(new DoFn<Ddl, ReadOperation>() {
public void processElement(ProcessContext c) {
Ddl ddl = c.element();
List<String> tablesList = Collections.emptyList();
// If the user provides a comma-separated list of strings, parse it into a List
if (!tables.get().trim().isEmpty()) {
tablesList = Arrays.asList(tables.get().split(",\\s*"));
for (Table table : getFilteredTables(ddl, tablesList)) {
// Generated columns are excluded; the remaining column expressions are joined with commas.
String columnsListAsString = table.columns().stream().filter(x -> !x.isGenerated()).map(x -> createColumnExpression(x)).collect(Collectors.joining(","));
// NOTE(review): the use of partitionOptions is not visible in this listing.
PartitionOptions partitionOptions = PartitionOptions.newBuilder().setMaxPartitions(MAX_PARTITIONS).build();
// Also have to export table name to be able to identify which row belongs to
// which table.
ReadOperation read;
// Two query shapes are built: one with a double-quoted name literal and a backtick-quoted
// table identifier, one with a single-quoted name literal and a double-quoted identifier.
// NOTE(review): presumably one per supported dialect - the case labels are missing here.
switch(ddl.dialect()) {
read = ReadOperation.create().withQuery(String.format("SELECT \"%s\" AS _spanner_table, %s FROM `%s` AS t",, columnsListAsString,;
read = ReadOperation.create().withQuery(String.format("SELECT '%s' AS _spanner_table, %s FROM \"%s\" AS t",, columnsListAsString,;
throw new IllegalArgumentException(String.format("Unrecognized dialect: %s", ddl.dialect()));
use of in project DataflowTemplates by GoogleCloudPlatform.
the class SpannerTableFilter method getFilteredTables.
* Given a list of table names and database Ddl, returns a Collection of Tables from the Ddl that
* contains only the Tables with the corresponding table names.
static Collection<Table> getFilteredTables(Ddl ddl, List<String> tables) {
Collection<Table> allTables = ddl.allTables();
// If there are no tables provided, return all the tables
if (tables.isEmpty()) {
return allTables;
/* The rest of this function is for handling the export of all necessary related tables.
* We first get a Queue for the initial unprocessed Tables. Then, we iteratively gather related
* tables for each table in unprocessedTables. If a related table (either a parent or foreign
* key table) has not been processed (i.e. added to the completedTables list) and it's not
* all already been checked into the unprocessedTables Queue, then it is added to the Queue.
* All completedTables are returned at the end to the caller. */
Queue<Table> unprocessedTables = getTablesFromDdl(ddl, tables);
// Completed Tables list
Collection<Table> completedTables = Lists.newArrayList();
// Iteratively gather related tables via BFS
while (!unprocessedTables.isEmpty()) {
Table currTable = unprocessedTables.remove();
// Add currTable to completedTables before processing relationships.
// Doing this avoids adding this table again to unprocessTables in the
// corner case of self references.
Set<Table> parentTables = getParentTables(ddl, currTable);
Collection<Table> foreignKeysTables = getForeignKeyTables(ddl, currTable);
for (Table parentTable : parentTables) {
if (!completedTables.contains(parentTable) && !unprocessedTables.contains(parentTable)) {
// This table needs to be processed (and it's not already in unprocessedTables)
for (Table fkTable : foreignKeysTables) {
if (!completedTables.contains(fkTable) && !unprocessedTables.contains(fkTable)) {
// This table needs to be processed (and it's not already in unprocessedTables)
// All necessary tables have been processed
return completedTables;