use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamToSQL method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to SQL DML Objects
* 3) Filter stale rows using stateful PK transform
* 4) Write DML statements to SQL Database via jdbc
*/
Pipeline pipeline = Pipeline.create(options);
CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration = getDataSourceConfiguration(options);
validateOptions(options, dataSourceConfiguration);
Map<String, String> schemaMap = parseSchemaMap(options.getSchemaMap());
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
    pipeline.apply(
        new DataStreamIO(
                options.getStreamName(),
                options.getInputFilePattern(),
                options.getInputFileFormat(),
                options.getGcsPubSubSubscription(),
                options.getRfcStartDateTime())
            .withLowercaseSourceColumns()
            .withHashColumnValue("_metadata_row_id", "rowid"));
/*
* Stage 2: Write JSON Strings to SQL DML Objects
* a) Convert JSON String FailsafeElements to DmlInfo objects (dmlStatements)
* Stage 3: Filter stale rows using stateful PK transform
*/
PCollection<DmlInfo> dmlStatements =
    datastreamJsonRecords
        .apply("Format to DML", CreateDml.of(dataSourceConfiguration).withSchemaMap(schemaMap))
        .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
/*
* Stage 4: Write Inserts to CloudSQL
*/
dmlStatements.apply(
    "Write to SQL",
    CdcJdbcIO.<DmlInfo>write()
        .withDataSourceConfiguration(dataSourceConfiguration)
        .withStatementFormatter(
            new CdcJdbcIO.StatementFormatter<DmlInfo>() {
              @Override
              public String formatStatement(DmlInfo element) {
                return element.getDmlSql();
              }
            }));
// Execute the pipeline and return the result.
return pipeline.run();
}
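The parseSchemaMap helper called above is not included in this snippet. Below is a minimal sketch, assuming the schema map option is a comma-separated list of colon-delimited source:destination schema pairs (the exact format is defined by the template's Options, not by this excerpt):
// Hypothetical helper: parse "srcSchema1:destSchema1,srcSchema2:destSchema2" into a Map.
// Assumes java.util.HashMap/Map imports; the real template's parser may differ.
static Map<String, String> parseSchemaMap(String schemaMapString) {
  Map<String, String> schemaMap = new HashMap<>();
  if (schemaMapString == null || schemaMapString.isEmpty()) {
    return schemaMap;
  }
  for (String pair : schemaMapString.split(",")) {
    String[] parts = pair.split(":", 2);
    if (parts.length != 2) {
      throw new IllegalArgumentException("Invalid schema map entry: " + pair);
    }
    schemaMap.put(parts[0].trim(), parts[1].trim());
  }
  return schemaMap;
}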
use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamToPostgres method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to Postgres DML Objects
* 3) Filter stale rows using stateful PK transform
* 4) Write DML statements to Postgres
*/
Pipeline pipeline = Pipeline.create(options);
String jdbcDriverConnectionString =
    String.format(
        "jdbc:postgresql://%s:%s/%s",
        options.getDatabaseHost(), options.getDatabasePort(), options.getDatabaseName());
CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration =
    CdcJdbcIO.DataSourceConfiguration.create("org.postgresql.Driver", jdbcDriverConnectionString)
        .withUsername(options.getDatabaseUser())
        .withPassword(options.getDatabasePassword())
        .withMaxIdleConnections(0);
validateOptions(options, dataSourceConfiguration);
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
    pipeline.apply(
        new DataStreamIO(
                options.getStreamName(),
                options.getInputFilePattern(),
                options.getInputFileFormat(),
                options.getGcsPubSubSubscription(),
                options.getRfcStartDateTime())
            .withLowercaseSourceColumns()
            .withHashColumnValue("_metadata_row_id", "rowid"));
/*
* Stage 2: Write JSON Strings to Postgres DML Objects
* a) Convert JSON String FailsafeElements to DmlInfo objects (dmlStatements)
* Stage 3: Filter stale rows using stateful PK transform
*/
PCollection<DmlInfo> dmlStatements =
    datastreamJsonRecords
        .apply("Format to Postgres DML", CreateDml.createDmlObjects(dataSourceConfiguration))
        .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
/*
* Stage 4: Write Inserts to CloudSQL
*/
dmlStatements.apply(
    "Write to Postgres",
    CdcJdbcIO.<DmlInfo>write()
        .withDataSourceConfiguration(dataSourceConfiguration)
        .withStatementFormatter(
            new CdcJdbcIO.StatementFormatter<DmlInfo>() {
              @Override
              public String formatStatement(DmlInfo element) {
                return element.getDmlSql();
              }
            }));
// Execute the pipeline and return the result.
return pipeline.run();
}
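Neither of the two run methods above shows how they are invoked. A typical Apache Beam entry point, assuming the template's Options interface extends PipelineOptions, would look like this sketch:
// Assumed entry point: parse command-line args into the template's Options and run the pipeline.
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  run(options);
}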
use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.
the class ShadowTableCreator method constructShadowTable.
/*
* Constructs a shadow table for a data table in the information schema.
* Note: Shadow tables for interleaved tables are not interleaved to
* their shadow parent table.
*/
Table constructShadowTable(Ddl informationSchema, String dataTableName) {
// Create a new shadow table with the given prefix.
Table.Builder shadowTableBuilder = Table.builder();
String shadowTableName = shadowTablePrefix + dataTableName;
shadowTableBuilder.name(shadowTableName);
// Add key columns from the data table to the shadow table builder.
Table dataTable = informationSchema.table(dataTableName);
Set<String> primaryKeyColNames =
    dataTable.primaryKeys().stream().map(IndexColumn::name).collect(Collectors.toSet());
List<Column> primaryKeyCols =
    dataTable.columns().stream()
        .filter(col -> primaryKeyColNames.contains(col.name()))
        .collect(Collectors.toList());
for (Column col : primaryKeyCols) {
shadowTableBuilder.addColumn(col);
}
// Add primary key constraints.
for (IndexColumn keyColumn : dataTable.primaryKeys()) {
if (keyColumn.order() == IndexColumn.Order.ASC) {
shadowTableBuilder.primaryKey().asc(keyColumn.name()).end();
} else if (keyColumn.order() == IndexColumn.Order.DESC) {
shadowTableBuilder.primaryKey().desc(keyColumn.name()).end();
}
}
// Add extra column to track ChangeEventSequence information
addChangeEventSequenceColumns(shadowTableBuilder);
return shadowTableBuilder.build();
}
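As a usage sketch (the allTables() accessor and the surrounding loop are assumptions, not taken from this snippet), a caller could build a shadow table for every non-shadow table in the information schema:
// Hypothetical caller: construct a shadow table for each data table that is not itself a shadow
// table. Assumes Ddl exposes allTables() and requires java.util.ArrayList/List imports.
List<Table> shadowTables = new ArrayList<>();
for (Table dataTable : informationSchema.allTables()) {
  if (!dataTable.name().startsWith(shadowTablePrefix)) {
    shadowTables.add(constructShadowTable(informationSchema, dataTable.name()));
  }
}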
use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexFileFormatConversion method run.
/**
* Runs the pipeline to completion with the specified options.
*
* @return The pipeline result.
*/
public static PipelineResult run(
    Pipeline pipeline,
    FileFormatConversionOptions options,
    DataplexClient dataplex,
    OutputPathProvider outputPathProvider)
    throws IOException {
boolean isInputAsset = ASSET_PATTERN.matcher(options.getInputAssetOrEntitiesList()).matches();
if (!isInputAsset && !ENTITIES_PATTERN.matcher(options.getInputAssetOrEntitiesList()).matches()) {
throw new IllegalArgumentException("Either input asset or input entities list must be provided");
}
GoogleCloudDataplexV1Asset outputAsset = dataplex.getAsset(options.getOutputAsset());
if (outputAsset == null
    || outputAsset.getResourceSpec() == null
    || !DataplexAssetResourceSpec.STORAGE_BUCKET.name().equals(outputAsset.getResourceSpec().getType())
    || outputAsset.getResourceSpec().getName() == null) {
throw new IllegalArgumentException(
    "Output asset must be an existing asset with resource spec name being a GCS bucket and"
        + " resource spec type of "
        + DataplexAssetResourceSpec.STORAGE_BUCKET.name());
}
String outputBucket = outputAsset.getResourceSpec().getName();
Predicate<String> inputFilesFilter;
switch(options.getWriteDisposition()) {
case OVERWRITE:
inputFilesFilter = inputFilePath -> true;
break;
case FAIL:
Set<String> outputFilePaths = getAllOutputFilePaths(outputBucket);
inputFilesFilter =
    inputFilePath -> {
      if (outputFilePaths.contains(
          inputFilePathToOutputFilePath(
              outputPathProvider, inputFilePath, outputBucket, options.getOutputFileFormat()))) {
        throw new WriteDispositionException(
            String.format(
                "The file %s already exists in the output asset bucket: %s",
                inputFilePath, outputBucket));
      }
      return true;
    };
break;
case SKIP:
// A separate local here keeps the set captured by each lambda effectively final.
Set<String> existingOutputFilePaths = getAllOutputFilePaths(outputBucket);
inputFilesFilter =
    inputFilePath ->
        !existingOutputFilePaths.contains(
            inputFilePathToOutputFilePath(
                outputPathProvider, inputFilePath, outputBucket, options.getOutputFileFormat()));
break;
default:
throw new UnsupportedOperationException("Unsupported existing file behaviour: " + options.getWriteDisposition());
}
ImmutableList<GoogleCloudDataplexV1Entity> entities =
    isInputAsset
        ? dataplex.getCloudStorageEntities(options.getInputAssetOrEntitiesList())
        : dataplex.getEntities(
            Splitter.on(',').trimResults().splitToList(options.getInputAssetOrEntitiesList()));
boolean convertingFiles = false;
for (GoogleCloudDataplexV1Entity entity : entities) {
ImmutableList<GoogleCloudDataplexV1Partition> partitions = dataplex.getPartitions(entity.getName());
if (partitions.isEmpty()) {
String outputPath = outputPathProvider.outputPathFrom(entity.getDataPath(), outputBucket);
Iterator<String> inputFilePaths =
    getFilesFromFilePattern(entityToFileSpec(entity)).filter(inputFilesFilter).iterator();
convertingFiles = inputFilePaths.hasNext();
inputFilePaths.forEachRemaining(
    inputFilePath ->
        pipeline.apply(
            "Convert " + shortenDataplexName(entity.getName()),
            new ConvertFiles(entity, inputFilePath, options, outputPath)));
} else {
for (GoogleCloudDataplexV1Partition partition : partitions) {
String outputPath = outputPathProvider.outputPathFrom(partition.getLocation(), outputBucket);
Iterator<String> inputFilePaths =
    getFilesFromFilePattern(partitionToFileSpec(partition)).filter(inputFilesFilter).iterator();
convertingFiles = inputFilePaths.hasNext();
inputFilePaths.forEachRemaining(
    inputFilePath ->
        pipeline.apply(
            "Convert " + shortenDataplexName(partition.getName()),
            new ConvertFiles(entity, inputFilePath, options, outputPath)));
}
}
}
if (!convertingFiles) {
pipeline.apply("Nothing to convert", new NoopTransform());
}
return pipeline.run();
}
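The OutputPathProvider passed to run is only exercised here through outputPathFrom(inputPath, outputBucket). Assuming it is a single-method interface, a simple provider that mirrors the input's relative path under the output bucket might look like this sketch:
// Hypothetical provider: keep the input's path relative to its bucket, rooted in the output bucket.
// Assumes OutputPathProvider declares a single outputPathFrom(String, String) method.
OutputPathProvider mirrorPathProvider =
    (inputPath, outputBucket) ->
        "gs://" + outputBucket + "/" + inputPath.replaceFirst("^gs://[^/]+/", "");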
use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.
the class BigQueryMetadataLoader method loadTableMetadata.
/**
* Populates {@code table} builder with additional metadata like partition names and schema.
*
* @param filter optional filter to skip a subset of tables
* @return {@code true} if the table matches all filters and should be included in the results,
* {@code false} if it should be skipped
*/
private boolean loadTableMetadata(BigQueryTable.Builder table, Filter filter) throws InterruptedException {
TableReadOptions.Builder readOptions = TableReadOptions.newBuilder();
if (table.getPartitioningColumn() == null) {
if (filter != null && filter.shouldSkipUnpartitionedTable(table)) {
return false;
}
} else {
List<BigQueryTablePartition> partitions = loadTablePartitions(table, filter);
if (filter != null && filter.shouldSkipPartitionedTable(table, partitions)) {
return false;
}
table.setPartitions(partitions);
LOG.info("Loaded {} partitions for table {}: {}", partitions.size(), table.getTableName(), partitions);
// Creating a ReadSession without a WHERE clause for a partitioned table that has
// "require partition filter" param set to true would fail with the error:
// "Cannot query over table ... without a filter over column(s) ...
// that can be used for partition elimination".
// The following is a hack that adds an "is null and is not null" filter over the
// partitioning column, which shouldn't select any data but should make the query
// analyzer happy and should be enough to extract the table schema.
// TODO(an2x): do this only when "require partition filter" = true
// or load schema differently?
readOptions.setRowRestriction(
    String.format(
        "%s is null and %s is not null",
        table.getPartitioningColumn(), table.getPartitioningColumn()));
}
ReadSession session =
    BigQueryUtils.createReadSession(
        bqsClient,
        DatasetId.of(table.getProject(), table.getDataset()),
        table.getTableName(),
        readOptions.build());
table.setSchema(new Schema.Parser().parse(session.getAvroSchema().getSchema()));
LOG.info("Loaded schema for table {}: {}", table.getTableName(), table.getSchema());
return true;
}
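The two Filter callbacks exercised above are the extension point for skipping tables. Below is a minimal sketch of a filter that keeps only partitioned tables with at least one partition; the shouldSkipPartition callback is an assumption (loadTablePartitions, which is not shown, also receives the filter), so adjust it to match the real interface:
// Hypothetical Filter: skip unpartitioned tables and partitioned tables with no partitions.
// shouldSkipPartition is assumed here; only the first two methods appear in the snippet above.
Filter partitionedTablesOnly =
    new Filter() {
      @Override
      public boolean shouldSkipUnpartitionedTable(BigQueryTable.Builder table) {
        return true; // skip tables that have no partitioning column
      }

      @Override
      public boolean shouldSkipPartitionedTable(
          BigQueryTable.Builder table, List<BigQueryTablePartition> partitions) {
        return partitions.isEmpty(); // nothing to read if there are no partitions
      }

      @Override
      public boolean shouldSkipPartition(
          BigQueryTable.Builder table, BigQueryTablePartition partition) {
        return false; // keep every partition of tables that pass the checks above
      }
    };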