use of com.google.privacy.dlp.v2.Table in project beam by apache.
the class BigtableWriteIT method testE2EBigtableWrite.
@Test
public void testE2EBigtableWrite() throws Exception {
final String tableName = bigtableOptions.getInstanceName().toTableNameStr(tableId);
final String instanceName = bigtableOptions.getInstanceName().toString();
final int numRows = 1000;
final List<KV<ByteString, ByteString>> testData = generateTableData(numRows);
createEmptyTable(instanceName, tableId);
Pipeline p = Pipeline.create(options);
p.apply(GenerateSequence.from(0).to(numRows)).apply(ParDo.of(new DoFn<Long, KV<ByteString, Iterable<Mutation>>>() {
@ProcessElement
public void processElement(ProcessContext c) {
int index = c.element().intValue();
Iterable<Mutation> mutations = ImmutableList.of(Mutation.newBuilder().setSetCell(Mutation.SetCell.newBuilder().setValue(testData.get(index).getValue()).setFamilyName(COLUMN_FAMILY_NAME)).build());
c.output(KV.of(testData.get(index).getKey(), mutations));
}
})).apply(BigtableIO.write().withBigtableOptions(bigtableOptions).withTableId(tableId));
p.run();
// Test number of column families and column family name equality
Table table = getTable(tableName);
assertThat(table.getColumnFamiliesMap().keySet(), Matchers.hasSize(1));
assertThat(table.getColumnFamiliesMap(), Matchers.hasKey(COLUMN_FAMILY_NAME));
// Test table data equality
List<KV<ByteString, ByteString>> tableData = getTableData(tableName);
assertThat(tableData, Matchers.containsInAnyOrder(testData.toArray()));
}
use of com.google.privacy.dlp.v2.Table in project java-docs-samples by GoogleCloudPlatform.
the class DeIdentification method deidentifyWithDateShift.
// [END dlp_reidentify_fpe]
// [START dlp_deidentify_date_shift]
/**
* @param inputCsvPath The path to the CSV file to deidentify
* @param outputCsvPath (Optional) path to the output CSV file
* @param dateFields The list of (date) fields in the CSV file to date shift
* @param lowerBoundDays The maximum number of days to shift a date backward
* @param upperBoundDays The maximum number of days to shift a date forward
* @param contextFieldId (Optional) The column to determine date shift, default : a random shift
* amount
* @param wrappedKey (Optional) The encrypted ('wrapped') AES-256 key to use when shifting dates
* @param keyName (Optional) The name of the Cloud KMS key used to encrypt ('wrap') the AES-256
* key
* @param projectId ID of Google Cloud project to run the API under.
*/
private static void deidentifyWithDateShift(Path inputCsvPath, Path outputCsvPath, String[] dateFields, int lowerBoundDays, int upperBoundDays, String contextFieldId, String wrappedKey, String keyName, String projectId) throws Exception {
// instantiate a client
try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
// Set the maximum days to shift a day backward (lowerbound), forward (upperbound)
DateShiftConfig.Builder dateShiftConfigBuilder = DateShiftConfig.newBuilder().setLowerBoundDays(lowerBoundDays).setUpperBoundDays(upperBoundDays);
// If contextFieldId, keyName or wrappedKey is set: all three arguments must be valid
if (contextFieldId != null && keyName != null && wrappedKey != null) {
dateShiftConfigBuilder.setContext(FieldId.newBuilder().setName(contextFieldId).build());
KmsWrappedCryptoKey kmsWrappedCryptoKey = KmsWrappedCryptoKey.newBuilder().setCryptoKeyName(keyName).setWrappedKey(ByteString.copyFrom(BaseEncoding.base64().decode(wrappedKey))).build();
dateShiftConfigBuilder.setCryptoKey(CryptoKey.newBuilder().setKmsWrapped(kmsWrappedCryptoKey).build());
} else if (contextFieldId != null || keyName != null || wrappedKey != null) {
throw new IllegalArgumentException("You must set either ALL or NONE of {contextFieldId, keyName, wrappedKey}!");
}
// Read and parse the CSV file
BufferedReader br = null;
String line;
List<Table.Row> rows = new ArrayList<>();
List<FieldId> headers;
br = new BufferedReader(new FileReader(inputCsvPath.toFile()));
// convert csv header to FieldId
headers = Arrays.stream(br.readLine().split(",")).map(header -> FieldId.newBuilder().setName(header).build()).collect(Collectors.toList());
while ((line = br.readLine()) != null) {
// convert csv rows to Table.Row
rows.add(convertCsvRowToTableRow(line));
}
br.close();
Table table = Table.newBuilder().addAllHeaders(headers).addAllRows(rows).build();
List<FieldId> dateFieldIds = Arrays.stream(dateFields).map(field -> FieldId.newBuilder().setName(field).build()).collect(Collectors.toList());
DateShiftConfig dateShiftConfig = dateShiftConfigBuilder.build();
FieldTransformation fieldTransformation = FieldTransformation.newBuilder().addAllFields(dateFieldIds).setPrimitiveTransformation(PrimitiveTransformation.newBuilder().setDateShiftConfig(dateShiftConfig).build()).build();
DeidentifyConfig deidentifyConfig = DeidentifyConfig.newBuilder().setRecordTransformations(RecordTransformations.newBuilder().addFieldTransformations(fieldTransformation).build()).build();
ContentItem tableItem = ContentItem.newBuilder().setTable(table).build();
DeidentifyContentRequest request = DeidentifyContentRequest.newBuilder().setParent(ProjectName.of(projectId).toString()).setDeidentifyConfig(deidentifyConfig).setItem(tableItem).build();
// Execute the deidentification request
DeidentifyContentResponse response = dlpServiceClient.deidentifyContent(request);
// Write out the response as a CSV file
List<FieldId> outputHeaderFields = response.getItem().getTable().getHeadersList();
List<Table.Row> outputRows = response.getItem().getTable().getRowsList();
List<String> outputHeaders = outputHeaderFields.stream().map(FieldId::getName).collect(Collectors.toList());
File outputFile = outputCsvPath.toFile();
if (!outputFile.exists()) {
outputFile.createNewFile();
}
BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(outputFile));
// write out headers
bufferedWriter.append(String.join(",", outputHeaders) + "\n");
// write out each row
for (Table.Row outputRow : outputRows) {
String row = outputRow.getValuesList().stream().map(value -> value.getStringValue()).collect(Collectors.joining(","));
bufferedWriter.append(row + "\n");
}
bufferedWriter.flush();
bufferedWriter.close();
System.out.println("Successfully saved date-shift output to: " + outputCsvPath.getFileName());
} catch (Exception e) {
System.out.println("Error in deidentifyWithDateShift: " + e.getMessage());
}
}
use of com.google.privacy.dlp.v2.Table in project java-docs-samples by GoogleCloudPlatform.
the class Inspect method main.
// [END dlp_inspect_bigquery]
/**
* Command line application to inspect data using the Data Loss Prevention API. Supported data
* formats: string, file, text file on GCS, BigQuery table, and Datastore entity
*/
public static void main(String[] args) throws Exception {
OptionGroup optionsGroup = new OptionGroup();
optionsGroup.setRequired(true);
Option stringOption = new Option("s", "string", true, "inspect string");
optionsGroup.addOption(stringOption);
Option fileOption = new Option("f", "file path", true, "inspect input file path");
optionsGroup.addOption(fileOption);
Option gcsOption = new Option("gcs", "Google Cloud Storage", false, "inspect GCS file");
optionsGroup.addOption(gcsOption);
Option datastoreOption = new Option("ds", "Google Datastore", false, "inspect Datastore kind");
optionsGroup.addOption(datastoreOption);
Option bigqueryOption = new Option("bq", "Google BigQuery", false, "inspect BigQuery table");
optionsGroup.addOption(bigqueryOption);
Options commandLineOptions = new Options();
commandLineOptions.addOptionGroup(optionsGroup);
Option minLikelihoodOption = Option.builder("minLikelihood").hasArg(true).required(false).build();
commandLineOptions.addOption(minLikelihoodOption);
Option maxFindingsOption = Option.builder("maxFindings").hasArg(true).required(false).build();
commandLineOptions.addOption(maxFindingsOption);
Option infoTypesOption = Option.builder("infoTypes").hasArg(true).required(false).build();
infoTypesOption.setArgs(Option.UNLIMITED_VALUES);
commandLineOptions.addOption(infoTypesOption);
Option includeQuoteOption = Option.builder("includeQuote").hasArg(true).required(false).build();
commandLineOptions.addOption(includeQuoteOption);
Option bucketNameOption = Option.builder("bucketName").hasArg(true).required(false).build();
commandLineOptions.addOption(bucketNameOption);
Option gcsFileNameOption = Option.builder("fileName").hasArg(true).required(false).build();
commandLineOptions.addOption(gcsFileNameOption);
Option datasetIdOption = Option.builder("datasetId").hasArg(true).required(false).build();
commandLineOptions.addOption(datasetIdOption);
Option tableIdOption = Option.builder("tableId").hasArg(true).required(false).build();
commandLineOptions.addOption(tableIdOption);
Option projectIdOption = Option.builder("projectId").hasArg(true).required(false).build();
commandLineOptions.addOption(projectIdOption);
Option topicIdOption = Option.builder("topicId").hasArg(true).required(false).build();
commandLineOptions.addOption(topicIdOption);
Option subscriptionIdOption = Option.builder("subscriptionId").hasArg(true).required(false).build();
commandLineOptions.addOption(subscriptionIdOption);
Option datastoreNamespaceOption = Option.builder("namespace").hasArg(true).required(false).build();
commandLineOptions.addOption(datastoreNamespaceOption);
Option datastoreKindOption = Option.builder("kind").hasArg(true).required(false).build();
commandLineOptions.addOption(datastoreKindOption);
CommandLineParser parser = new DefaultParser();
HelpFormatter formatter = new HelpFormatter();
CommandLine cmd;
try {
cmd = parser.parse(commandLineOptions, args);
} catch (ParseException e) {
System.out.println(e.getMessage());
formatter.printHelp(Inspect.class.getName(), commandLineOptions);
System.exit(1);
return;
}
Likelihood minLikelihood = Likelihood.valueOf(cmd.getOptionValue(minLikelihoodOption.getOpt(), Likelihood.LIKELIHOOD_UNSPECIFIED.name()));
int maxFindings = Integer.parseInt(cmd.getOptionValue(maxFindingsOption.getOpt(), "0"));
boolean includeQuote = Boolean.parseBoolean(cmd.getOptionValue(includeQuoteOption.getOpt(), "true"));
String projectId = cmd.getOptionValue(projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId());
String topicId = cmd.getOptionValue(topicIdOption.getOpt());
String subscriptionId = cmd.getOptionValue(subscriptionIdOption.getOpt());
List<InfoType> infoTypesList = Collections.emptyList();
if (cmd.hasOption(infoTypesOption.getOpt())) {
infoTypesList = new ArrayList<>();
String[] infoTypes = cmd.getOptionValues(infoTypesOption.getOpt());
for (String infoType : infoTypes) {
infoTypesList.add(InfoType.newBuilder().setName(infoType).build());
}
}
// string inspection
if (cmd.hasOption("s")) {
String val = cmd.getOptionValue(stringOption.getOpt());
inspectString(val, minLikelihood, maxFindings, infoTypesList, includeQuote, projectId);
} else if (cmd.hasOption("f")) {
String filePath = cmd.getOptionValue(fileOption.getOpt());
inspectFile(filePath, minLikelihood, maxFindings, infoTypesList, includeQuote, projectId);
// gcs file inspection
} else if (cmd.hasOption("gcs")) {
String bucketName = cmd.getOptionValue(bucketNameOption.getOpt());
String fileName = cmd.getOptionValue(gcsFileNameOption.getOpt());
inspectGcsFile(bucketName, fileName, minLikelihood, infoTypesList, maxFindings, topicId, subscriptionId, projectId);
// datastore kind inspection
} else if (cmd.hasOption("ds")) {
String namespaceId = cmd.getOptionValue(datastoreNamespaceOption.getOpt(), "");
String kind = cmd.getOptionValue(datastoreKindOption.getOpt());
// use default project id when project id is not specified
inspectDatastore(projectId, namespaceId, kind, minLikelihood, infoTypesList, maxFindings, topicId, subscriptionId);
} else if (cmd.hasOption("bq")) {
String datasetId = cmd.getOptionValue(datasetIdOption.getOpt());
String tableId = cmd.getOptionValue(tableIdOption.getOpt());
// use default project id when project id is not specified
inspectBigquery(projectId, datasetId, tableId, minLikelihood, infoTypesList, maxFindings, topicId, subscriptionId);
}
}
use of com.google.privacy.dlp.v2.Table in project java-docs-samples by GoogleCloudPlatform.
the class Inspect method inspectBigquery.
// [END dlp_inspect_datastore]
// [START dlp_inspect_bigquery]
/**
* Inspect a BigQuery table
*
* @param projectId The project ID to run the API call under
* @param datasetId The ID of the dataset to inspect, e.g. 'my_dataset'
* @param tableId The ID of the table to inspect, e.g. 'my_table'
* @param minLikelihood The minimum likelihood required before returning a match
* @param infoTypes The infoTypes of information to match
* @param maxFindings The maximum number of findings to report (0 = server maximum)
* @param topicId Topic ID for pubsub.
* @param subscriptionId Subscription ID for pubsub.
*/
private static void inspectBigquery(String projectId, String datasetId, String tableId, Likelihood minLikelihood, List<InfoType> infoTypes, int maxFindings, String topicId, String subscriptionId) {
// Instantiates a client
try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
// Reference to the BigQuery table
BigQueryTable tableReference = BigQueryTable.newBuilder().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId).build();
BigQueryOptions bigQueryOptions = BigQueryOptions.newBuilder().setTableReference(tableReference).build();
// Construct BigQuery configuration to be inspected
StorageConfig storageConfig = StorageConfig.newBuilder().setBigQueryOptions(bigQueryOptions).build();
FindingLimits findingLimits = FindingLimits.newBuilder().setMaxFindingsPerRequest(maxFindings).build();
InspectConfig inspectConfig = InspectConfig.newBuilder().addAllInfoTypes(infoTypes).setMinLikelihood(minLikelihood).setLimits(findingLimits).build();
ProjectTopicName topic = ProjectTopicName.of(projectId, topicId);
Action.PublishToPubSub publishToPubSub = Action.PublishToPubSub.newBuilder().setTopic(topic.toString()).build();
Action action = Action.newBuilder().setPubSub(publishToPubSub).build();
InspectJobConfig inspectJobConfig = InspectJobConfig.newBuilder().setStorageConfig(storageConfig).setInspectConfig(inspectConfig).addActions(action).build();
// Asynchronously submit an inspect job, and wait on results
CreateDlpJobRequest createDlpJobRequest = CreateDlpJobRequest.newBuilder().setParent(ProjectName.of(projectId).toString()).setInspectJob(inspectJobConfig).build();
DlpJob dlpJob = dlpServiceClient.createDlpJob(createDlpJobRequest);
System.out.println("Job created with ID:" + dlpJob.getName());
// Wait for job completion semi-synchronously
// For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
final SettableApiFuture<Boolean> done = SettableApiFuture.create();
// Set up a Pub/Sub subscriber to listen on the job completion status
Subscriber subscriber = Subscriber.newBuilder(ProjectSubscriptionName.of(projectId, subscriptionId), (pubsubMessage, ackReplyConsumer) -> {
if (pubsubMessage.getAttributesCount() > 0 && pubsubMessage.getAttributesMap().get("DlpJobName").equals(dlpJob.getName())) {
// notify job completion
done.set(true);
ackReplyConsumer.ack();
}
}).build();
subscriber.startAsync();
try {
done.get(1, TimeUnit.MINUTES);
// Wait for the job to become available
Thread.sleep(500);
} catch (Exception e) {
System.out.println("Unable to verify job completion.");
}
DlpJob completedJob = dlpServiceClient.getDlpJob(GetDlpJobRequest.newBuilder().setName(dlpJob.getName()).build());
System.out.println("Job status: " + completedJob.getState());
InspectDataSourceDetails inspectDataSourceDetails = completedJob.getInspectDetails();
InspectDataSourceDetails.Result result = inspectDataSourceDetails.getResult();
if (result.getInfoTypeStatsCount() > 0) {
System.out.println("Findings: ");
for (InfoTypeStats infoTypeStat : result.getInfoTypeStatsList()) {
System.out.print("\tInfo type: " + infoTypeStat.getInfoType().getName());
System.out.println("\tCount: " + infoTypeStat.getCount());
}
} else {
System.out.println("No findings.");
}
} catch (Exception e) {
System.out.println("inspectBigquery Problems: " + e.getMessage());
}
}
use of com.google.privacy.dlp.v2.Table in project java-docs-samples by GoogleCloudPlatform.
the class RiskAnalysis method categoricalStatsAnalysis.
// [END dlp_numerical_stats]
// [START dlp_categorical_stats]
/**
* Calculate categorical statistics for a column in a BigQuery table using the DLP API.
*
* @param projectId The Google Cloud Platform project ID to run the API call under.
* @param datasetId The BigQuery dataset to analyze.
* @param tableId The BigQuery table to analyze.
* @param columnName The name of the column to analyze, which need not contain numerical data.
* @param topicId The name of the Pub/Sub topic to notify once the job completes
* @param subscriptionId The name of the Pub/Sub subscription to use when listening for job
* completion status.
*/
private static void categoricalStatsAnalysis(String projectId, String datasetId, String tableId, String columnName, String topicId, String subscriptionId) {
// Instantiates a client
try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
FieldId fieldId = FieldId.newBuilder().setName(columnName).build();
CategoricalStatsConfig categoricalStatsConfig = CategoricalStatsConfig.newBuilder().setField(fieldId).build();
BigQueryTable bigQueryTable = BigQueryTable.newBuilder().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId).build();
PrivacyMetric privacyMetric = PrivacyMetric.newBuilder().setCategoricalStatsConfig(categoricalStatsConfig).build();
ProjectTopicName topicName = ProjectTopicName.of(projectId, topicId);
PublishToPubSub publishToPubSub = PublishToPubSub.newBuilder().setTopic(topicName.toString()).build();
// Create action to publish job status notifications over Google Cloud Pub/Sub
Action action = Action.newBuilder().setPubSub(publishToPubSub).build();
RiskAnalysisJobConfig riskAnalysisJobConfig = RiskAnalysisJobConfig.newBuilder().setSourceTable(bigQueryTable).setPrivacyMetric(privacyMetric).addActions(action).build();
CreateDlpJobRequest createDlpJobRequest = CreateDlpJobRequest.newBuilder().setParent(ProjectName.of(projectId).toString()).setRiskJob(riskAnalysisJobConfig).build();
DlpJob dlpJob = dlpServiceClient.createDlpJob(createDlpJobRequest);
String dlpJobName = dlpJob.getName();
final SettableApiFuture<Boolean> done = SettableApiFuture.create();
// Set up a Pub/Sub subscriber to listen on the job completion status
Subscriber subscriber = Subscriber.newBuilder(ProjectSubscriptionName.newBuilder().setProject(projectId).setSubscription(subscriptionId).build(), (pubsubMessage, ackReplyConsumer) -> {
if (pubsubMessage.getAttributesCount() > 0 && pubsubMessage.getAttributesMap().get("DlpJobName").equals(dlpJobName)) {
// notify job completion
done.set(true);
ackReplyConsumer.ack();
}
}).build();
subscriber.startAsync();
// For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
try {
done.get(1, TimeUnit.MINUTES);
// Wait for the job to become available
Thread.sleep(500);
} catch (TimeoutException e) {
System.out.println("Unable to verify job completion.");
}
// Retrieve completed job status
DlpJob completedJob = dlpServiceClient.getDlpJob(GetDlpJobRequest.newBuilder().setName(dlpJobName).build());
System.out.println("Job status: " + completedJob.getState());
AnalyzeDataSourceRiskDetails riskDetails = completedJob.getRiskDetails();
AnalyzeDataSourceRiskDetails.CategoricalStatsResult result = riskDetails.getCategoricalStatsResult();
for (CategoricalStatsHistogramBucket bucket : result.getValueFrequencyHistogramBucketsList()) {
System.out.printf("Most common value occurs %d time(s).\n", bucket.getValueFrequencyUpperBound());
System.out.printf("Least common value occurs %d time(s).\n", bucket.getValueFrequencyLowerBound());
for (ValueFrequency valueFrequency : bucket.getBucketValuesList()) {
System.out.printf("Value %s occurs %d time(s).\n", valueFrequency.getValue().toString(), valueFrequency.getCount());
}
}
} catch (Exception e) {
System.out.println("Error in categoricalStatsAnalysis: " + e.getMessage());
}
}
Aggregations