Example 1 with ColumnInformation

Use of com.google.cloud.solutions.autotokenize.AutoTokenizeMessages.ColumnInformation in the auto-data-tokenize project by GoogleCloudPlatform.

From the class DlpInspectionPipeline, method makePipeline:

/**
 * Creates the pipeline and applies the transforms.
 */
@VisibleForTesting
Pipeline makePipeline() {
    TupleTag<FlatRecord> recordsTag = new TupleTag<>();
    TupleTag<String> avroSchemaTag = new TupleTag<>();

    // Read the source, emitting the flattened records and the source's Avro schema.
    PCollectionTuple recordSchemaTuple =
        pipeline.apply(
            "Read" + SourceNames.forType(options.getSourceType()).asCamelCase(),
            TransformingReader.forSourceType(options.getSourceType())
                .from(options.getInputPattern())
                .withJdbcConfiguration(JdbcConfigurationExtractor.using(options).jdbcConfiguration())
                .withSecretsClient(secretsClient)
                .withRecordsTag(recordsTag)
                .withAvroSchemaTag(avroSchemaTag));

    // Sample the records column-wise and identify sensitive columns with DLP.
    var columnInfoTag = new TupleTag<ColumnInformation>();
    var errorTag = new TupleTag<KV<ShardedKey<String>, Table>>();
    var dlpInspectResults =
        recordSchemaTuple.get(recordsTag)
            .apply("RandomColumnarSample", RandomColumnarSampler.any(options.getSampleSize()))
            .apply("BatchForDlp", new BatchColumnsForDlp())
            .apply("DlpIdentify",
                DlpIdentify.builder()
                    .batchIdentifierFactory(makeDlpBatchIdentifierFactory())
                    .columnInfoTag(columnInfoTag)
                    .errorTag(errorTag)
                    .build());

    // Write DLP tables that failed inspection as per-column JSON error files.
    dlpInspectResults.get(errorTag)
        .setCoder(KvCoder.of(ShardedKey.Coder.of(StringUtf8Coder.of()), ProtoCoder.of(Table.class)))
        .apply("MakeErrorTableJson", ParDo.of(new ConvertTableToJsonFn()))
        .setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))
        .apply("WriteErrorElements",
            FileIO.<String, KV<String, String>>writeDynamic()
                .via(Contextful.fn(KV::getValue), Contextful.fn(col -> TextIO.sink()))
                .by(KV::getKey)
                .withDestinationCoder(StringUtf8Coder.of())
                .withNaming(Contextful.fn(colName ->
                    defaultNaming(
                        /*prefix=*/ String.format("col-%s", colName.replaceAll("[\\.\\$\\[\\]]+", "-"))
                            .replaceAll("[-]+", "-"),
                        /*suffix=*/ ".json")))
                .to(options.getReportLocation() + "/error"));

    // Assemble the inspection report from the identified column information.
    var inspectionReport =
        dlpInspectResults.get(columnInfoTag)
            .apply("ExtractReport",
                MakeInspectionReport.builder()
                    .setAvroSchema(recordSchemaTuple.get(avroSchemaTag).apply(View.asSingleton()))
                    .setSourceType(options.getSourceType())
                    .setClock(clock)
                    .setInputPattern(options.getInputPattern())
                    .setJdbcConfiguration(JdbcConfigurationExtractor.using(options).jdbcConfiguration())
                    .build());

    // Write the source schema next to the report.
    recordSchemaTuple.get(avroSchemaTag)
        .apply("WriteSchema",
            TextIO.write()
                .to(options.getReportLocation() + "/schema")
                .withSuffix(".json")
                .withoutSharding());

    writeReportToGcs(inspectionReport);
    writeReportToBigQuery(inspectionReport);
    writeReportToDataCatalog(inspectionReport);
    return pipeline;
}
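
The WriteErrorElements step derives each error file's destination prefix by sanitizing the column name. The standalone snippet below is an illustration (not part of the project; the sample column name is made up) of how that prefix expression behaves for a nested column name:

// Standalone illustration of the error-file prefix logic used in WriteErrorElements.
// The example column name passed to main() is hypothetical.
public class ColumnNamePrefixDemo {

  static String errorFilePrefix(String colName) {
    // Collapse runs of '.', '$', '[' and ']' into '-' and squash repeated dashes,
    // mirroring the expression passed to FileIO.Write.defaultNaming(...) above.
    return String.format("col-%s", colName.replaceAll("[\\.\\$\\[\\]]+", "-"))
        .replaceAll("[-]+", "-");
  }

  public static void main(String[] args) {
    // Prints "col-contacts-0-email".
    System.out.println(errorFilePrefix("$.contacts[0].email"));
  }
}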
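The class also imports com.google.protobuf.util.JsonFormat (see the list below), which suggests that writeReportToGcs serializes the InspectionReport proto, built from the ColumnInformation results, as JSON. Assuming that is the case (it is not confirmed by the snippet above), a downloaded report file could be parsed back into the proto roughly as sketched here; the file path is a placeholder:

// A minimal sketch, assuming the report is written as JSON via JsonFormat;
// the input path is a placeholder, not a path produced by the pipeline.
import com.google.cloud.solutions.autotokenize.AutoTokenizeMessages.InspectionReport;
import com.google.protobuf.util.JsonFormat;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class ReadInspectionReport {
  public static void main(String[] args) throws Exception {
    String json = new String(
        Files.readAllBytes(Paths.get("/tmp/inspection-report.json")), StandardCharsets.UTF_8);

    InspectionReport.Builder report = InspectionReport.newBuilder();
    // Tolerate fields added in newer report versions.
    JsonFormat.parser().ignoringUnknownFields().merge(json, report);

    System.out.println(report.build());
  }
}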
Also used : FileIO(org.apache.beam.sdk.io.FileIO) KV(org.apache.beam.sdk.values.KV) DlpBatchInspectFactory(com.google.cloud.solutions.autotokenize.dlp.DlpBatchInspectFactory) InspectionReportToTableRow(com.google.cloud.solutions.autotokenize.common.InspectionReportToTableRow) View(org.apache.beam.sdk.transforms.View) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) SecretsClient(com.google.cloud.solutions.autotokenize.common.SecretsClient) Contextful(org.apache.beam.sdk.transforms.Contextful) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) DlpClientFactory(com.google.cloud.solutions.autotokenize.dlp.DlpClientFactory) TransformingReader(com.google.cloud.solutions.autotokenize.common.TransformingReader) TupleTag(org.apache.beam.sdk.values.TupleTag) InspectionReport(com.google.cloud.solutions.autotokenize.AutoTokenizeMessages.InspectionReport) ProtoCoder(org.apache.beam.sdk.extensions.protobuf.ProtoCoder) DataCatalogWriter(com.google.cloud.solutions.autotokenize.datacatalog.DataCatalogWriter) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) ColumnInformation(com.google.cloud.solutions.autotokenize.AutoTokenizeMessages.ColumnInformation) Pipeline(org.apache.beam.sdk.Pipeline) DlpServiceClient(com.google.cloud.dlp.v2.DlpServiceClient) DoFn(org.apache.beam.sdk.transforms.DoFn) MakeDataCatalogItems(com.google.cloud.solutions.autotokenize.datacatalog.MakeDataCatalogItems) InvalidProtocolBufferException(com.google.protobuf.InvalidProtocolBufferException) KvCoder(org.apache.beam.sdk.coders.KvCoder) ImmutableSet(com.google.common.collect.ImmutableSet) BatchColumnsForDlp(com.google.cloud.solutions.autotokenize.dlp.BatchColumnsForDlp) BigQueryIO(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO) Table(com.google.privacy.dlp.v2.Table) Preconditions.checkNotNull(com.google.common.base.Preconditions.checkNotNull) DlpIdentify(com.google.cloud.solutions.autotokenize.dlp.DlpIdentify) InfoType(com.google.privacy.dlp.v2.InfoType) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) ShardedKey(org.apache.beam.sdk.util.ShardedKey) PCollection(org.apache.beam.sdk.values.PCollection) Write.defaultNaming(org.apache.beam.sdk.io.FileIO.Write.defaultNaming) GoogleLogger(com.google.common.flogger.GoogleLogger) SourceNames(com.google.cloud.solutions.autotokenize.common.SourceNames) StringUtils.isNotBlank(org.apache.commons.lang3.StringUtils.isNotBlank) JsonFormat(com.google.protobuf.util.JsonFormat) ParDo(org.apache.beam.sdk.transforms.ParDo) WriteDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition) FlatRecord(com.google.cloud.solutions.autotokenize.AutoTokenizeMessages.FlatRecord) Clock(java.time.Clock) InspectionReportFileWriter(com.google.cloud.solutions.autotokenize.common.InspectionReportFileWriter) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TextIO(org.apache.beam.sdk.io.TextIO)

Aggregations

DlpServiceClient (com.google.cloud.dlp.v2.DlpServiceClient) 1
ColumnInformation (com.google.cloud.solutions.autotokenize.AutoTokenizeMessages.ColumnInformation) 1
FlatRecord (com.google.cloud.solutions.autotokenize.AutoTokenizeMessages.FlatRecord) 1
InspectionReport (com.google.cloud.solutions.autotokenize.AutoTokenizeMessages.InspectionReport) 1
InspectionReportFileWriter (com.google.cloud.solutions.autotokenize.common.InspectionReportFileWriter) 1
InspectionReportToTableRow (com.google.cloud.solutions.autotokenize.common.InspectionReportToTableRow) 1
SecretsClient (com.google.cloud.solutions.autotokenize.common.SecretsClient) 1
SourceNames (com.google.cloud.solutions.autotokenize.common.SourceNames) 1
TransformingReader (com.google.cloud.solutions.autotokenize.common.TransformingReader) 1
DataCatalogWriter (com.google.cloud.solutions.autotokenize.datacatalog.DataCatalogWriter) 1
MakeDataCatalogItems (com.google.cloud.solutions.autotokenize.datacatalog.MakeDataCatalogItems) 1
BatchColumnsForDlp (com.google.cloud.solutions.autotokenize.dlp.BatchColumnsForDlp) 1
DlpBatchInspectFactory (com.google.cloud.solutions.autotokenize.dlp.DlpBatchInspectFactory) 1
DlpClientFactory (com.google.cloud.solutions.autotokenize.dlp.DlpClientFactory) 1
DlpIdentify (com.google.cloud.solutions.autotokenize.dlp.DlpIdentify) 1
VisibleForTesting (com.google.common.annotations.VisibleForTesting) 1
Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument) 1
Preconditions.checkNotNull (com.google.common.base.Preconditions.checkNotNull) 1
ImmutableSet (com.google.common.collect.ImmutableSet) 1
ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet) 1