
Example 1 with Table

use of com.google.bigtable.admin.v2.Table in project beam by apache.

the class BigtableWriteIT method testE2EBigtableWrite.

@Test
public void testE2EBigtableWrite() throws Exception {
    final String tableName = bigtableOptions.getInstanceName().toTableNameStr(tableId);
    final String instanceName = bigtableOptions.getInstanceName().toString();
    final int numRows = 1000;
    final List<KV<ByteString, ByteString>> testData = generateTableData(numRows);
    createEmptyTable(instanceName, tableId);
    Pipeline p = Pipeline.create(options);
    p.apply(GenerateSequence.from(0).to(numRows)).apply(ParDo.of(new DoFn<Long, KV<ByteString, Iterable<Mutation>>>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            int index = c.element().intValue();
            Iterable<Mutation> mutations = ImmutableList.of(
                Mutation.newBuilder()
                    .setSetCell(
                        Mutation.SetCell.newBuilder()
                            .setValue(testData.get(index).getValue())
                            .setFamilyName(COLUMN_FAMILY_NAME))
                    .build());
            c.output(KV.of(testData.get(index).getKey(), mutations));
        }
    })).apply(BigtableIO.write().withBigtableOptions(bigtableOptions).withTableId(tableId));
    p.run();
    // Test number of column families and column family name equality
    Table table = getTable(tableName);
    assertThat(table.getColumnFamiliesMap().keySet(), Matchers.hasSize(1));
    assertThat(table.getColumnFamiliesMap(), Matchers.hasKey(COLUMN_FAMILY_NAME));
    // Test table data equality
    List<KV<ByteString, ByteString>> tableData = getTableData(tableName);
    assertThat(tableData, Matchers.containsInAnyOrder(testData.toArray()));
}
Also used : Table(com.google.bigtable.admin.v2.Table) ByteString(com.google.protobuf.ByteString) KV(org.apache.beam.sdk.values.KV) Mutation(com.google.bigtable.v2.Mutation) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)
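
The test calls two helpers, generateTableData and getTableData, that are defined elsewhere in BigtableWriteIT and not shown above. As a rough illustration only, here is a minimal sketch of generateTableData, assuming keys and values are simply derived from the row index (the key format and value contents are placeholders, not the project's actual implementation):

private static List<KV<ByteString, ByteString>> generateTableData(int numRows) {
    // Hypothetical data generator: zero-padded row keys paired with arbitrary cell values.
    List<KV<ByteString, ByteString>> testData = new ArrayList<>(numRows);
    for (int i = 0; i < numRows; i++) {
        testData.add(KV.of(
            ByteString.copyFromUtf8(String.format("key%09d", i)),
            ByteString.copyFromUtf8("value" + i)));
    }
    return testData;
}

This sketch needs java.util.ArrayList and java.util.List in addition to the imports listed above.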

Example 2 with Table

use of com.google.privacy.dlp.v2.Table in project java-docs-samples by GoogleCloudPlatform.

the class DeIdentification method deidentifyWithDateShift.

// [END dlp_reidentify_fpe]
// [START dlp_deidentify_date_shift]
/**
 * @param inputCsvPath The path to the CSV file to deidentify
 * @param outputCsvPath (Optional) path to the output CSV file
 * @param dateFields The list of (date) fields in the CSV file to date shift
 * @param lowerBoundDays The maximum number of days to shift a date backward
 * @param upperBoundDays The maximum number of days to shift a date forward
 * @param contextFieldId (Optional) The column used to determine the date shift; if unset, a
 *     random shift amount is used
 * @param wrappedKey (Optional) The encrypted ('wrapped') AES-256 key to use when shifting dates
 * @param keyName (Optional) The name of the Cloud KMS key used to encrypt ('wrap') the AES-256
 *     key
 * @param projectId ID of Google Cloud project to run the API under.
 */
private static void deidentifyWithDateShift(
    Path inputCsvPath,
    Path outputCsvPath,
    String[] dateFields,
    int lowerBoundDays,
    int upperBoundDays,
    String contextFieldId,
    String wrappedKey,
    String keyName,
    String projectId)
    throws Exception {
    // instantiate a client
    try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
        // Set the maximum days to shift a day backward (lowerbound), forward (upperbound)
        DateShiftConfig.Builder dateShiftConfigBuilder = DateShiftConfig.newBuilder().setLowerBoundDays(lowerBoundDays).setUpperBoundDays(upperBoundDays);
        // If contextFieldId, keyName or wrappedKey is set: all three arguments must be valid
        if (contextFieldId != null && keyName != null && wrappedKey != null) {
            dateShiftConfigBuilder.setContext(FieldId.newBuilder().setName(contextFieldId).build());
            KmsWrappedCryptoKey kmsWrappedCryptoKey =
                KmsWrappedCryptoKey.newBuilder()
                    .setCryptoKeyName(keyName)
                    .setWrappedKey(ByteString.copyFrom(BaseEncoding.base64().decode(wrappedKey)))
                    .build();
            dateShiftConfigBuilder.setCryptoKey(CryptoKey.newBuilder().setKmsWrapped(kmsWrappedCryptoKey).build());
        } else if (contextFieldId != null || keyName != null || wrappedKey != null) {
            throw new IllegalArgumentException("You must set either ALL or NONE of {contextFieldId, keyName, wrappedKey}!");
        }
        // Read and parse the CSV file
        BufferedReader br = new BufferedReader(new FileReader(inputCsvPath.toFile()));
        String line;
        List<Table.Row> rows = new ArrayList<>();
        // convert csv header to FieldId
        List<FieldId> headers =
            Arrays.stream(br.readLine().split(","))
                .map(header -> FieldId.newBuilder().setName(header).build())
                .collect(Collectors.toList());
        while ((line = br.readLine()) != null) {
            // convert csv rows to Table.Row
            rows.add(convertCsvRowToTableRow(line));
        }
        br.close();
        Table table = Table.newBuilder().addAllHeaders(headers).addAllRows(rows).build();
        List<FieldId> dateFieldIds =
            Arrays.stream(dateFields)
                .map(field -> FieldId.newBuilder().setName(field).build())
                .collect(Collectors.toList());
        DateShiftConfig dateShiftConfig = dateShiftConfigBuilder.build();
        FieldTransformation fieldTransformation =
            FieldTransformation.newBuilder()
                .addAllFields(dateFieldIds)
                .setPrimitiveTransformation(
                    PrimitiveTransformation.newBuilder().setDateShiftConfig(dateShiftConfig).build())
                .build();
        DeidentifyConfig deidentifyConfig =
            DeidentifyConfig.newBuilder()
                .setRecordTransformations(
                    RecordTransformations.newBuilder().addFieldTransformations(fieldTransformation).build())
                .build();
        ContentItem tableItem = ContentItem.newBuilder().setTable(table).build();
        DeidentifyContentRequest request =
            DeidentifyContentRequest.newBuilder()
                .setParent(ProjectName.of(projectId).toString())
                .setDeidentifyConfig(deidentifyConfig)
                .setItem(tableItem)
                .build();
        // Execute the deidentification request
        DeidentifyContentResponse response = dlpServiceClient.deidentifyContent(request);
        // Write out the response as a CSV file
        List<FieldId> outputHeaderFields = response.getItem().getTable().getHeadersList();
        List<Table.Row> outputRows = response.getItem().getTable().getRowsList();
        List<String> outputHeaders = outputHeaderFields.stream().map(FieldId::getName).collect(Collectors.toList());
        File outputFile = outputCsvPath.toFile();
        if (!outputFile.exists()) {
            outputFile.createNewFile();
        }
        BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(outputFile));
        // write out headers
        bufferedWriter.append(String.join(",", outputHeaders) + "\n");
        // write out each row
        for (Table.Row outputRow : outputRows) {
            String row = outputRow.getValuesList().stream().map(value -> value.getStringValue()).collect(Collectors.joining(","));
            bufferedWriter.append(row + "\n");
        }
        bufferedWriter.flush();
        bufferedWriter.close();
        System.out.println("Successfully saved date-shift output to: " + outputCsvPath.getFileName());
    } catch (Exception e) {
        System.out.println("Error in deidentifyWithDateShift: " + e.getMessage());
    }
}
Also used : Arrays(java.util.Arrays) Date(com.google.type.Date) CryptoKey(com.google.privacy.dlp.v2.CryptoKey) DefaultParser(org.apache.commons.cli.DefaultParser) Path(java.nio.file.Path) Value(com.google.privacy.dlp.v2.Value) DateShiftConfig(com.google.privacy.dlp.v2.DateShiftConfig) InfoTypeTransformation(com.google.privacy.dlp.v2.InfoTypeTransformations.InfoTypeTransformation) FieldTransformation(com.google.privacy.dlp.v2.FieldTransformation) ContentItem(com.google.privacy.dlp.v2.ContentItem) Collectors(java.util.stream.Collectors) ByteString(com.google.protobuf.ByteString) ReidentifyContentRequest(com.google.privacy.dlp.v2.ReidentifyContentRequest) DateTimeParseException(java.time.format.DateTimeParseException) List(java.util.List) ParseException(org.apache.commons.cli.ParseException) LocalDate(java.time.LocalDate) RecordTransformations(com.google.privacy.dlp.v2.RecordTransformations) FfxCommonNativeAlphabet(com.google.privacy.dlp.v2.CryptoReplaceFfxFpeConfig.FfxCommonNativeAlphabet) CharacterMaskConfig(com.google.privacy.dlp.v2.CharacterMaskConfig) Options(org.apache.commons.cli.Options) KmsWrappedCryptoKey(com.google.privacy.dlp.v2.KmsWrappedCryptoKey) PrimitiveTransformation(com.google.privacy.dlp.v2.PrimitiveTransformation) HelpFormatter(org.apache.commons.cli.HelpFormatter) ArrayList(java.util.ArrayList) CryptoReplaceFfxFpeConfig(com.google.privacy.dlp.v2.CryptoReplaceFfxFpeConfig) ServiceOptions(com.google.cloud.ServiceOptions) DeidentifyConfig(com.google.privacy.dlp.v2.DeidentifyConfig) CommandLine(org.apache.commons.cli.CommandLine) FieldId(com.google.privacy.dlp.v2.FieldId) Option(org.apache.commons.cli.Option) DeidentifyContentResponse(com.google.privacy.dlp.v2.DeidentifyContentResponse) DlpServiceClient(com.google.cloud.dlp.v2.DlpServiceClient) InfoTypeTransformations(com.google.privacy.dlp.v2.InfoTypeTransformations) BaseEncoding(com.google.common.io.BaseEncoding) CommandLineParser(org.apache.commons.cli.CommandLineParser) BufferedWriter(java.io.BufferedWriter) Table(com.google.privacy.dlp.v2.Table) FileWriter(java.io.FileWriter) ReidentifyContentResponse(com.google.privacy.dlp.v2.ReidentifyContentResponse) SurrogateType(com.google.privacy.dlp.v2.CustomInfoType.SurrogateType) InfoType(com.google.privacy.dlp.v2.InfoType) DeidentifyContentRequest(com.google.privacy.dlp.v2.DeidentifyContentRequest) File(java.io.File) InspectConfig(com.google.privacy.dlp.v2.InspectConfig) ProjectName(com.google.privacy.dlp.v2.ProjectName) Paths(java.nio.file.Paths) OptionGroup(org.apache.commons.cli.OptionGroup) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) CustomInfoType(com.google.privacy.dlp.v2.CustomInfoType)
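
deidentifyWithDateShift also relies on a convertCsvRowToTableRow helper that is not reproduced here. A minimal sketch, assuming each comma-separated field is wrapped as a string Value in a Table.Row (quoted fields and escaped commas are not handled):

private static Table.Row convertCsvRowToTableRow(String row) {
    // Naive CSV split; a real implementation would need proper CSV parsing.
    Table.Row.Builder tableRowBuilder = Table.Row.newBuilder();
    for (String value : row.split(",")) {
        tableRowBuilder.addValues(Value.newBuilder().setStringValue(value).build());
    }
    return tableRowBuilder.build();
}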

Example 3 with Table

use of com.google.privacy.dlp.v2.Table in project java-docs-samples by GoogleCloudPlatform.

the class Inspect method main.

// [END dlp_inspect_bigquery]
/**
 * Command line application to inspect data using the Data Loss Prevention API. Supported data
 * formats: string, file, text file on GCS, BigQuery table, and Datastore entity
 */
public static void main(String[] args) throws Exception {
    OptionGroup optionsGroup = new OptionGroup();
    optionsGroup.setRequired(true);
    Option stringOption = new Option("s", "string", true, "inspect string");
    optionsGroup.addOption(stringOption);
    Option fileOption = new Option("f", "file path", true, "inspect input file path");
    optionsGroup.addOption(fileOption);
    Option gcsOption = new Option("gcs", "Google Cloud Storage", false, "inspect GCS file");
    optionsGroup.addOption(gcsOption);
    Option datastoreOption = new Option("ds", "Google Datastore", false, "inspect Datastore kind");
    optionsGroup.addOption(datastoreOption);
    Option bigqueryOption = new Option("bq", "Google BigQuery", false, "inspect BigQuery table");
    optionsGroup.addOption(bigqueryOption);
    Options commandLineOptions = new Options();
    commandLineOptions.addOptionGroup(optionsGroup);
    Option minLikelihoodOption = Option.builder("minLikelihood").hasArg(true).required(false).build();
    commandLineOptions.addOption(minLikelihoodOption);
    Option maxFindingsOption = Option.builder("maxFindings").hasArg(true).required(false).build();
    commandLineOptions.addOption(maxFindingsOption);
    Option infoTypesOption = Option.builder("infoTypes").hasArg(true).required(false).build();
    infoTypesOption.setArgs(Option.UNLIMITED_VALUES);
    commandLineOptions.addOption(infoTypesOption);
    Option includeQuoteOption = Option.builder("includeQuote").hasArg(true).required(false).build();
    commandLineOptions.addOption(includeQuoteOption);
    Option bucketNameOption = Option.builder("bucketName").hasArg(true).required(false).build();
    commandLineOptions.addOption(bucketNameOption);
    Option gcsFileNameOption = Option.builder("fileName").hasArg(true).required(false).build();
    commandLineOptions.addOption(gcsFileNameOption);
    Option datasetIdOption = Option.builder("datasetId").hasArg(true).required(false).build();
    commandLineOptions.addOption(datasetIdOption);
    Option tableIdOption = Option.builder("tableId").hasArg(true).required(false).build();
    commandLineOptions.addOption(tableIdOption);
    Option projectIdOption = Option.builder("projectId").hasArg(true).required(false).build();
    commandLineOptions.addOption(projectIdOption);
    Option topicIdOption = Option.builder("topicId").hasArg(true).required(false).build();
    commandLineOptions.addOption(topicIdOption);
    Option subscriptionIdOption = Option.builder("subscriptionId").hasArg(true).required(false).build();
    commandLineOptions.addOption(subscriptionIdOption);
    Option datastoreNamespaceOption = Option.builder("namespace").hasArg(true).required(false).build();
    commandLineOptions.addOption(datastoreNamespaceOption);
    Option datastoreKindOption = Option.builder("kind").hasArg(true).required(false).build();
    commandLineOptions.addOption(datastoreKindOption);
    CommandLineParser parser = new DefaultParser();
    HelpFormatter formatter = new HelpFormatter();
    CommandLine cmd;
    try {
        cmd = parser.parse(commandLineOptions, args);
    } catch (ParseException e) {
        System.out.println(e.getMessage());
        formatter.printHelp(Inspect.class.getName(), commandLineOptions);
        System.exit(1);
        return;
    }
    Likelihood minLikelihood = Likelihood.valueOf(cmd.getOptionValue(minLikelihoodOption.getOpt(), Likelihood.LIKELIHOOD_UNSPECIFIED.name()));
    int maxFindings = Integer.parseInt(cmd.getOptionValue(maxFindingsOption.getOpt(), "0"));
    boolean includeQuote = Boolean.parseBoolean(cmd.getOptionValue(includeQuoteOption.getOpt(), "true"));
    String projectId = cmd.getOptionValue(projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId());
    String topicId = cmd.getOptionValue(topicIdOption.getOpt());
    String subscriptionId = cmd.getOptionValue(subscriptionIdOption.getOpt());
    List<InfoType> infoTypesList = Collections.emptyList();
    if (cmd.hasOption(infoTypesOption.getOpt())) {
        infoTypesList = new ArrayList<>();
        String[] infoTypes = cmd.getOptionValues(infoTypesOption.getOpt());
        for (String infoType : infoTypes) {
            infoTypesList.add(InfoType.newBuilder().setName(infoType).build());
        }
    }
    // string inspection
    if (cmd.hasOption("s")) {
        String val = cmd.getOptionValue(stringOption.getOpt());
        inspectString(val, minLikelihood, maxFindings, infoTypesList, includeQuote, projectId);
    } else if (cmd.hasOption("f")) {
        String filePath = cmd.getOptionValue(fileOption.getOpt());
        inspectFile(filePath, minLikelihood, maxFindings, infoTypesList, includeQuote, projectId);
    // gcs file inspection
    } else if (cmd.hasOption("gcs")) {
        String bucketName = cmd.getOptionValue(bucketNameOption.getOpt());
        String fileName = cmd.getOptionValue(gcsFileNameOption.getOpt());
        inspectGcsFile(bucketName, fileName, minLikelihood, infoTypesList, maxFindings, topicId, subscriptionId, projectId);
    // datastore kind inspection
    } else if (cmd.hasOption("ds")) {
        String namespaceId = cmd.getOptionValue(datastoreNamespaceOption.getOpt(), "");
        String kind = cmd.getOptionValue(datastoreKindOption.getOpt());
        // use default project id when project id is not specified
        inspectDatastore(projectId, namespaceId, kind, minLikelihood, infoTypesList, maxFindings, topicId, subscriptionId);
    } else if (cmd.hasOption("bq")) {
        String datasetId = cmd.getOptionValue(datasetIdOption.getOpt());
        String tableId = cmd.getOptionValue(tableIdOption.getOpt());
        // use default project id when project id is not specified
        inspectBigquery(projectId, datasetId, tableId, minLikelihood, infoTypesList, maxFindings, topicId, subscriptionId);
    }
}
Also used : Options(org.apache.commons.cli.Options) BigQueryOptions(com.google.privacy.dlp.v2.BigQueryOptions) CloudStorageOptions(com.google.privacy.dlp.v2.CloudStorageOptions) ServiceOptions(com.google.cloud.ServiceOptions) DatastoreOptions(com.google.privacy.dlp.v2.DatastoreOptions) Likelihood(com.google.privacy.dlp.v2.Likelihood) ByteString(com.google.protobuf.ByteString) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) OptionGroup(org.apache.commons.cli.OptionGroup) Option(org.apache.commons.cli.Option) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) InfoType(com.google.privacy.dlp.v2.InfoType) DefaultParser(org.apache.commons.cli.DefaultParser)
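
As a usage illustration, the parsed options map directly onto the inspect helpers; for example, string inspection could be exercised by calling main with the -s flag. The argument values below are placeholders, not taken from the sample:

// Hypothetical invocation: inspect a literal string for email addresses.
Inspect.main(new String[] {
    "-s", "My email address is someone@example.com",
    "-infoTypes", "EMAIL_ADDRESS",
    "-includeQuote", "true"
});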

Example 4 with Table

use of com.google.privacy.dlp.v2.Table in project java-docs-samples by GoogleCloudPlatform.

the class Inspect method inspectBigquery.

// [END dlp_inspect_datastore]
// [START dlp_inspect_bigquery]
/**
 * Inspect a BigQuery table
 *
 * @param projectId The project ID to run the API call under
 * @param datasetId The ID of the dataset to inspect, e.g. 'my_dataset'
 * @param tableId The ID of the table to inspect, e.g. 'my_table'
 * @param minLikelihood The minimum likelihood required before returning a match
 * @param infoTypes The infoTypes of information to match
 * @param maxFindings The maximum number of findings to report (0 = server maximum)
 * @param topicId Topic ID for pubsub.
 * @param subscriptionId Subscription ID for pubsub.
 */
private static void inspectBigquery(
    String projectId,
    String datasetId,
    String tableId,
    Likelihood minLikelihood,
    List<InfoType> infoTypes,
    int maxFindings,
    String topicId,
    String subscriptionId) {
    // Instantiates a client
    try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
        // Reference to the BigQuery table
        BigQueryTable tableReference = BigQueryTable.newBuilder().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId).build();
        BigQueryOptions bigQueryOptions = BigQueryOptions.newBuilder().setTableReference(tableReference).build();
        // Construct BigQuery configuration to be inspected
        StorageConfig storageConfig = StorageConfig.newBuilder().setBigQueryOptions(bigQueryOptions).build();
        FindingLimits findingLimits = FindingLimits.newBuilder().setMaxFindingsPerRequest(maxFindings).build();
        InspectConfig inspectConfig = InspectConfig.newBuilder().addAllInfoTypes(infoTypes).setMinLikelihood(minLikelihood).setLimits(findingLimits).build();
        ProjectTopicName topic = ProjectTopicName.of(projectId, topicId);
        Action.PublishToPubSub publishToPubSub = Action.PublishToPubSub.newBuilder().setTopic(topic.toString()).build();
        Action action = Action.newBuilder().setPubSub(publishToPubSub).build();
        InspectJobConfig inspectJobConfig =
            InspectJobConfig.newBuilder()
                .setStorageConfig(storageConfig)
                .setInspectConfig(inspectConfig)
                .addActions(action)
                .build();
        // Asynchronously submit an inspect job, and wait on results
        CreateDlpJobRequest createDlpJobRequest =
            CreateDlpJobRequest.newBuilder()
                .setParent(ProjectName.of(projectId).toString())
                .setInspectJob(inspectJobConfig)
                .build();
        DlpJob dlpJob = dlpServiceClient.createDlpJob(createDlpJobRequest);
        System.out.println("Job created with ID:" + dlpJob.getName());
        // Wait for job completion semi-synchronously
        // For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
        final SettableApiFuture<Boolean> done = SettableApiFuture.create();
        // Set up a Pub/Sub subscriber to listen on the job completion status
        Subscriber subscriber = Subscriber.newBuilder(ProjectSubscriptionName.of(projectId, subscriptionId), (pubsubMessage, ackReplyConsumer) -> {
            if (pubsubMessage.getAttributesCount() > 0 && pubsubMessage.getAttributesMap().get("DlpJobName").equals(dlpJob.getName())) {
                // notify job completion
                done.set(true);
                ackReplyConsumer.ack();
            }
        }).build();
        subscriber.startAsync();
        try {
            done.get(1, TimeUnit.MINUTES);
            // Wait for the job to become available
            Thread.sleep(500);
        } catch (Exception e) {
            System.out.println("Unable to verify job completion.");
        }
        DlpJob completedJob = dlpServiceClient.getDlpJob(GetDlpJobRequest.newBuilder().setName(dlpJob.getName()).build());
        System.out.println("Job status: " + completedJob.getState());
        InspectDataSourceDetails inspectDataSourceDetails = completedJob.getInspectDetails();
        InspectDataSourceDetails.Result result = inspectDataSourceDetails.getResult();
        if (result.getInfoTypeStatsCount() > 0) {
            System.out.println("Findings: ");
            for (InfoTypeStats infoTypeStat : result.getInfoTypeStatsList()) {
                System.out.print("\tInfo type: " + infoTypeStat.getInfoType().getName());
                System.out.println("\tCount: " + infoTypeStat.getCount());
            }
        } else {
            System.out.println("No findings.");
        }
    } catch (Exception e) {
        System.out.println("inspectBigquery Problems: " + e.getMessage());
    }
}
Also used : ByteContentItem(com.google.privacy.dlp.v2.ByteContentItem) InspectResult(com.google.privacy.dlp.v2.InspectResult) KindExpression(com.google.privacy.dlp.v2.KindExpression) Options(org.apache.commons.cli.Options) Likelihood(com.google.privacy.dlp.v2.Likelihood) Subscriber(com.google.cloud.pubsub.v1.Subscriber) BigQueryOptions(com.google.privacy.dlp.v2.BigQueryOptions) HelpFormatter(org.apache.commons.cli.HelpFormatter) MimetypesFileTypeMap(javax.activation.MimetypesFileTypeMap) CloudStorageOptions(com.google.privacy.dlp.v2.CloudStorageOptions) ArrayList(java.util.ArrayList) DefaultParser(org.apache.commons.cli.DefaultParser) InspectDataSourceDetails(com.google.privacy.dlp.v2.InspectDataSourceDetails) FindingLimits(com.google.privacy.dlp.v2.InspectConfig.FindingLimits) InspectContentResponse(com.google.privacy.dlp.v2.InspectContentResponse) ServiceOptions(com.google.cloud.ServiceOptions) URLConnection(java.net.URLConnection) StorageConfig(com.google.privacy.dlp.v2.StorageConfig) CommandLine(org.apache.commons.cli.CommandLine) PartitionId(com.google.privacy.dlp.v2.PartitionId) Action(com.google.privacy.dlp.v2.Action) DatastoreOptions(com.google.privacy.dlp.v2.DatastoreOptions) ProjectTopicName(com.google.pubsub.v1.ProjectTopicName) Option(org.apache.commons.cli.Option) InspectJobConfig(com.google.privacy.dlp.v2.InspectJobConfig) DlpServiceClient(com.google.cloud.dlp.v2.DlpServiceClient) Finding(com.google.privacy.dlp.v2.Finding) CreateDlpJobRequest(com.google.privacy.dlp.v2.CreateDlpJobRequest) Files(java.nio.file.Files) CommandLineParser(org.apache.commons.cli.CommandLineParser) ContentItem(com.google.privacy.dlp.v2.ContentItem) InfoType(com.google.privacy.dlp.v2.InfoType) InfoTypeStats(com.google.privacy.dlp.v2.InfoTypeStats) SettableApiFuture(com.google.api.core.SettableApiFuture) ByteString(com.google.protobuf.ByteString) TimeUnit(java.util.concurrent.TimeUnit) InspectConfig(com.google.privacy.dlp.v2.InspectConfig) List(java.util.List) ProjectName(com.google.privacy.dlp.v2.ProjectName) GetDlpJobRequest(com.google.privacy.dlp.v2.GetDlpJobRequest) Paths(java.nio.file.Paths) ParseException(org.apache.commons.cli.ParseException) BigQueryTable(com.google.privacy.dlp.v2.BigQueryTable) ProjectSubscriptionName(com.google.pubsub.v1.ProjectSubscriptionName) OptionGroup(org.apache.commons.cli.OptionGroup) DlpJob(com.google.privacy.dlp.v2.DlpJob) InspectContentRequest(com.google.privacy.dlp.v2.InspectContentRequest) Collections(java.util.Collections)
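
One detail the sample leaves implicit is that the Pub/Sub Subscriber is never stopped after the completion message has been handled. An optional cleanup step (my addition, not part of the original sample) could shut it down once the wait finishes:

// Optional cleanup once done.get(...) has returned or timed out.
subscriber.stopAsync();
// Waiting with a timeout avoids blocking indefinitely if shutdown stalls;
// awaitTerminated throws TimeoutException, which the surrounding catch (Exception e) would absorb.
subscriber.awaitTerminated(30, TimeUnit.SECONDS);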

Example 5 with Table

use of com.google.privacy.dlp.v2.Table in project java-docs-samples by GoogleCloudPlatform.

the class RiskAnalysis method categoricalStatsAnalysis.

// [END dlp_numerical_stats]
// [START dlp_categorical_stats]
/**
 * Calculate categorical statistics for a column in a BigQuery table using the DLP API.
 *
 * @param projectId The Google Cloud Platform project ID to run the API call under.
 * @param datasetId The BigQuery dataset to analyze.
 * @param tableId The BigQuery table to analyze.
 * @param columnName The name of the column to analyze, which need not contain numerical data.
 * @param topicId The name of the Pub/Sub topic to notify once the job completes
 * @param subscriptionId The name of the Pub/Sub subscription to use when listening for job
 *     completion status.
 */
private static void categoricalStatsAnalysis(
    String projectId,
    String datasetId,
    String tableId,
    String columnName,
    String topicId,
    String subscriptionId) {
    // Instantiates a client
    try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
        FieldId fieldId = FieldId.newBuilder().setName(columnName).build();
        CategoricalStatsConfig categoricalStatsConfig = CategoricalStatsConfig.newBuilder().setField(fieldId).build();
        BigQueryTable bigQueryTable = BigQueryTable.newBuilder().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId).build();
        PrivacyMetric privacyMetric = PrivacyMetric.newBuilder().setCategoricalStatsConfig(categoricalStatsConfig).build();
        ProjectTopicName topicName = ProjectTopicName.of(projectId, topicId);
        PublishToPubSub publishToPubSub = PublishToPubSub.newBuilder().setTopic(topicName.toString()).build();
        // Create action to publish job status notifications over Google Cloud Pub/Sub
        Action action = Action.newBuilder().setPubSub(publishToPubSub).build();
        RiskAnalysisJobConfig riskAnalysisJobConfig =
            RiskAnalysisJobConfig.newBuilder()
                .setSourceTable(bigQueryTable)
                .setPrivacyMetric(privacyMetric)
                .addActions(action)
                .build();
        CreateDlpJobRequest createDlpJobRequest =
            CreateDlpJobRequest.newBuilder()
                .setParent(ProjectName.of(projectId).toString())
                .setRiskJob(riskAnalysisJobConfig)
                .build();
        DlpJob dlpJob = dlpServiceClient.createDlpJob(createDlpJobRequest);
        String dlpJobName = dlpJob.getName();
        final SettableApiFuture<Boolean> done = SettableApiFuture.create();
        // Set up a Pub/Sub subscriber to listen on the job completion status
        Subscriber subscriber = Subscriber.newBuilder(ProjectSubscriptionName.newBuilder().setProject(projectId).setSubscription(subscriptionId).build(), (pubsubMessage, ackReplyConsumer) -> {
            if (pubsubMessage.getAttributesCount() > 0 && pubsubMessage.getAttributesMap().get("DlpJobName").equals(dlpJobName)) {
                // notify job completion
                done.set(true);
                ackReplyConsumer.ack();
            }
        }).build();
        subscriber.startAsync();
        // For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
        try {
            done.get(1, TimeUnit.MINUTES);
            // Wait for the job to become available
            Thread.sleep(500);
        } catch (TimeoutException e) {
            System.out.println("Unable to verify job completion.");
        }
        // Retrieve completed job status
        DlpJob completedJob = dlpServiceClient.getDlpJob(GetDlpJobRequest.newBuilder().setName(dlpJobName).build());
        System.out.println("Job status: " + completedJob.getState());
        AnalyzeDataSourceRiskDetails riskDetails = completedJob.getRiskDetails();
        AnalyzeDataSourceRiskDetails.CategoricalStatsResult result = riskDetails.getCategoricalStatsResult();
        for (CategoricalStatsHistogramBucket bucket : result.getValueFrequencyHistogramBucketsList()) {
            System.out.printf("Most common value occurs %d time(s).\n", bucket.getValueFrequencyUpperBound());
            System.out.printf("Least common value occurs %d time(s).\n", bucket.getValueFrequencyLowerBound());
            for (ValueFrequency valueFrequency : bucket.getBucketValuesList()) {
                System.out.printf("Value %s occurs %d time(s).\n", valueFrequency.getValue().toString(), valueFrequency.getCount());
            }
        }
    } catch (Exception e) {
        System.out.println("Error in categoricalStatsAnalysis: " + e.getMessage());
    }
}
Also used : Arrays(java.util.Arrays) TimeoutException(java.util.concurrent.TimeoutException) Subscriber(com.google.cloud.pubsub.v1.Subscriber) DefaultParser(org.apache.commons.cli.DefaultParser) KMapEstimationHistogramBucket(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KMapEstimationResult.KMapEstimationHistogramBucket) LDiversityEquivalenceClass(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.LDiversityResult.LDiversityEquivalenceClass) ValueFrequency(com.google.privacy.dlp.v2.ValueFrequency) LDiversityConfig(com.google.privacy.dlp.v2.PrivacyMetric.LDiversityConfig) NumericalStatsConfig(com.google.privacy.dlp.v2.PrivacyMetric.NumericalStatsConfig) Action(com.google.privacy.dlp.v2.Action) KMapEstimationConfig(com.google.privacy.dlp.v2.PrivacyMetric.KMapEstimationConfig) CategoricalStatsHistogramBucket(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.CategoricalStatsResult.CategoricalStatsHistogramBucket) KAnonymityEquivalenceClass(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KAnonymityResult.KAnonymityEquivalenceClass) Value(com.google.privacy.dlp.v2.Value) TaggedField(com.google.privacy.dlp.v2.PrivacyMetric.KMapEstimationConfig.TaggedField) RiskAnalysisJobConfig(com.google.privacy.dlp.v2.RiskAnalysisJobConfig) Collectors(java.util.stream.Collectors) SettableApiFuture(com.google.api.core.SettableApiFuture) List(java.util.List) KAnonymityResult(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KAnonymityResult) ParseException(org.apache.commons.cli.ParseException) BigQueryTable(com.google.privacy.dlp.v2.BigQueryTable) ProjectSubscriptionName(com.google.pubsub.v1.ProjectSubscriptionName) AnalyzeDataSourceRiskDetails(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails) Options(org.apache.commons.cli.Options) HelpFormatter(org.apache.commons.cli.HelpFormatter) CategoricalStatsConfig(com.google.privacy.dlp.v2.PrivacyMetric.CategoricalStatsConfig) ArrayList(java.util.ArrayList) ServiceOptions(com.google.cloud.ServiceOptions) CommandLine(org.apache.commons.cli.CommandLine) FieldId(com.google.privacy.dlp.v2.FieldId) ProjectTopicName(com.google.pubsub.v1.ProjectTopicName) Option(org.apache.commons.cli.Option) DlpServiceClient(com.google.cloud.dlp.v2.DlpServiceClient) Iterator(java.util.Iterator) CreateDlpJobRequest(com.google.privacy.dlp.v2.CreateDlpJobRequest) CommandLineParser(org.apache.commons.cli.CommandLineParser) KMapEstimationResult(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KMapEstimationResult) KMapEstimationQuasiIdValues(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KMapEstimationResult.KMapEstimationQuasiIdValues) InfoType(com.google.privacy.dlp.v2.InfoType) LDiversityResult(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.LDiversityResult) KAnonymityConfig(com.google.privacy.dlp.v2.PrivacyMetric.KAnonymityConfig) TimeUnit(java.util.concurrent.TimeUnit) PublishToPubSub(com.google.privacy.dlp.v2.Action.PublishToPubSub) ProjectName(com.google.privacy.dlp.v2.ProjectName) GetDlpJobRequest(com.google.privacy.dlp.v2.GetDlpJobRequest) LDiversityHistogramBucket(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.LDiversityResult.LDiversityHistogramBucket) KAnonymityHistogramBucket(com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KAnonymityResult.KAnonymityHistogramBucket) PrivacyMetric(com.google.privacy.dlp.v2.PrivacyMetric) OptionGroup(org.apache.commons.cli.OptionGroup) DlpJob(com.google.privacy.dlp.v2.DlpJob) Collections(java.util.Collections) 
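
For context, a call to this method might look like the following; every argument value here is a placeholder rather than a name from the sample project:

// Hypothetical invocation with placeholder resource names.
categoricalStatsAnalysis(
    "my-project-id",        // projectId
    "my_dataset",           // datasetId
    "my_table",             // tableId
    "city",                 // columnName
    "dlp-job-status",       // topicId: Pub/Sub topic that receives the job-completion message
    "dlp-job-status-sub");  // subscriptionId: subscription used to wait for that message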

Aggregations

ServiceOptions (com.google.cloud.ServiceOptions): 8
InfoType (com.google.privacy.dlp.v2.InfoType): 8
DlpServiceClient (com.google.cloud.dlp.v2.DlpServiceClient): 7
ProjectName (com.google.privacy.dlp.v2.ProjectName): 7
ArrayList (java.util.ArrayList): 7
SettableApiFuture (com.google.api.core.SettableApiFuture): 6
Subscriber (com.google.cloud.pubsub.v1.Subscriber): 6
Action (com.google.privacy.dlp.v2.Action): 6
BigQueryTable (com.google.privacy.dlp.v2.BigQueryTable): 6
CreateDlpJobRequest (com.google.privacy.dlp.v2.CreateDlpJobRequest): 6
DlpJob (com.google.privacy.dlp.v2.DlpJob): 6
FieldId (com.google.privacy.dlp.v2.FieldId): 6
GetDlpJobRequest (com.google.privacy.dlp.v2.GetDlpJobRequest): 6
Value (com.google.privacy.dlp.v2.Value): 6
ProjectSubscriptionName (com.google.pubsub.v1.ProjectSubscriptionName): 6
ProjectTopicName (com.google.pubsub.v1.ProjectTopicName): 6
Arrays (java.util.Arrays): 6
Collections (java.util.Collections): 6
CommandLine (org.apache.commons.cli.CommandLine): 6
CommandLineParser (org.apache.commons.cli.CommandLineParser): 6