use of com.google.privacy.dlp.v2.FieldId in project java-docs-samples by GoogleCloudPlatform.
the class DeIdentification method deidentifyWithDateShift.
// [END dlp_reidentify_fpe]
// [START dlp_deidentify_date_shift]
/**
* @param inputCsvPath The path to the CSV file to deidentify
* @param outputCsvPath (Optional) path to the output CSV file
* @param dateFields The list of (date) fields in the CSV file to date shift
* @param lowerBoundDays The maximum number of days to shift a date backward
* @param upperBoundDays The maximum number of days to shift a date forward
* @param contextFieldId (Optional) The column to determine date shift, default : a random shift
* amount
* @param wrappedKey (Optional) The encrypted ('wrapped') AES-256 key to use when shifting dates
* @param keyName (Optional) The name of the Cloud KMS key used to encrypt ('wrap') the AES-256
* key
* @param projectId ID of Google Cloud project to run the API under.
*/
private static void deidentifyWithDateShift(Path inputCsvPath, Path outputCsvPath, String[] dateFields, int lowerBoundDays, int upperBoundDays, String contextFieldId, String wrappedKey, String keyName, String projectId) throws Exception {
// instantiate a client
try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
// Set the maximum days to shift a day backward (lowerbound), forward (upperbound)
DateShiftConfig.Builder dateShiftConfigBuilder = DateShiftConfig.newBuilder().setLowerBoundDays(lowerBoundDays).setUpperBoundDays(upperBoundDays);
// If contextFieldId, keyName or wrappedKey is set: all three arguments must be valid
if (contextFieldId != null && keyName != null && wrappedKey != null) {
dateShiftConfigBuilder.setContext(FieldId.newBuilder().setName(contextFieldId).build());
KmsWrappedCryptoKey kmsWrappedCryptoKey = KmsWrappedCryptoKey.newBuilder().setCryptoKeyName(keyName).setWrappedKey(ByteString.copyFrom(BaseEncoding.base64().decode(wrappedKey))).build();
dateShiftConfigBuilder.setCryptoKey(CryptoKey.newBuilder().setKmsWrapped(kmsWrappedCryptoKey).build());
} else if (contextFieldId != null || keyName != null || wrappedKey != null) {
throw new IllegalArgumentException("You must set either ALL or NONE of {contextFieldId, keyName, wrappedKey}!");
}
// Read and parse the CSV file
BufferedReader br = null;
String line;
List<Table.Row> rows = new ArrayList<>();
List<FieldId> headers;
br = new BufferedReader(new FileReader(inputCsvPath.toFile()));
// convert csv header to FieldId
headers = Arrays.stream(br.readLine().split(",")).map(header -> FieldId.newBuilder().setName(header).build()).collect(Collectors.toList());
while ((line = br.readLine()) != null) {
// convert csv rows to Table.Row
rows.add(convertCsvRowToTableRow(line));
}
br.close();
Table table = Table.newBuilder().addAllHeaders(headers).addAllRows(rows).build();
List<FieldId> dateFieldIds = Arrays.stream(dateFields).map(field -> FieldId.newBuilder().setName(field).build()).collect(Collectors.toList());
DateShiftConfig dateShiftConfig = dateShiftConfigBuilder.build();
FieldTransformation fieldTransformation = FieldTransformation.newBuilder().addAllFields(dateFieldIds).setPrimitiveTransformation(PrimitiveTransformation.newBuilder().setDateShiftConfig(dateShiftConfig).build()).build();
DeidentifyConfig deidentifyConfig = DeidentifyConfig.newBuilder().setRecordTransformations(RecordTransformations.newBuilder().addFieldTransformations(fieldTransformation).build()).build();
ContentItem tableItem = ContentItem.newBuilder().setTable(table).build();
DeidentifyContentRequest request = DeidentifyContentRequest.newBuilder().setParent(ProjectName.of(projectId).toString()).setDeidentifyConfig(deidentifyConfig).setItem(tableItem).build();
// Execute the deidentification request
DeidentifyContentResponse response = dlpServiceClient.deidentifyContent(request);
// Write out the response as a CSV file
List<FieldId> outputHeaderFields = response.getItem().getTable().getHeadersList();
List<Table.Row> outputRows = response.getItem().getTable().getRowsList();
List<String> outputHeaders = outputHeaderFields.stream().map(FieldId::getName).collect(Collectors.toList());
File outputFile = outputCsvPath.toFile();
if (!outputFile.exists()) {
outputFile.createNewFile();
}
BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(outputFile));
// write out headers
bufferedWriter.append(String.join(",", outputHeaders) + "\n");
// write out each row
for (Table.Row outputRow : outputRows) {
String row = outputRow.getValuesList().stream().map(value -> value.getStringValue()).collect(Collectors.joining(","));
bufferedWriter.append(row + "\n");
}
bufferedWriter.flush();
bufferedWriter.close();
System.out.println("Successfully saved date-shift output to: " + outputCsvPath.getFileName());
} catch (Exception e) {
System.out.println("Error in deidentifyWithDateShift: " + e.getMessage());
}
}
use of com.google.privacy.dlp.v2.FieldId in project java-docs-samples by GoogleCloudPlatform.
the class RiskAnalysis method categoricalStatsAnalysis.
// [END dlp_numerical_stats]
// [START dlp_categorical_stats]
/**
* Calculate categorical statistics for a column in a BigQuery table using the DLP API.
*
* @param projectId The Google Cloud Platform project ID to run the API call under.
* @param datasetId The BigQuery dataset to analyze.
* @param tableId The BigQuery table to analyze.
* @param columnName The name of the column to analyze, which need not contain numerical data.
* @param topicId The name of the Pub/Sub topic to notify once the job completes
* @param subscriptionId The name of the Pub/Sub subscription to use when listening for job
* completion status.
*/
private static void categoricalStatsAnalysis(String projectId, String datasetId, String tableId, String columnName, String topicId, String subscriptionId) {
// Instantiates a client
try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
FieldId fieldId = FieldId.newBuilder().setName(columnName).build();
CategoricalStatsConfig categoricalStatsConfig = CategoricalStatsConfig.newBuilder().setField(fieldId).build();
BigQueryTable bigQueryTable = BigQueryTable.newBuilder().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId).build();
PrivacyMetric privacyMetric = PrivacyMetric.newBuilder().setCategoricalStatsConfig(categoricalStatsConfig).build();
ProjectTopicName topicName = ProjectTopicName.of(projectId, topicId);
PublishToPubSub publishToPubSub = PublishToPubSub.newBuilder().setTopic(topicName.toString()).build();
// Create action to publish job status notifications over Google Cloud Pub/Sub
Action action = Action.newBuilder().setPubSub(publishToPubSub).build();
RiskAnalysisJobConfig riskAnalysisJobConfig = RiskAnalysisJobConfig.newBuilder().setSourceTable(bigQueryTable).setPrivacyMetric(privacyMetric).addActions(action).build();
CreateDlpJobRequest createDlpJobRequest = CreateDlpJobRequest.newBuilder().setParent(ProjectName.of(projectId).toString()).setRiskJob(riskAnalysisJobConfig).build();
DlpJob dlpJob = dlpServiceClient.createDlpJob(createDlpJobRequest);
String dlpJobName = dlpJob.getName();
final SettableApiFuture<Boolean> done = SettableApiFuture.create();
// Set up a Pub/Sub subscriber to listen on the job completion status
Subscriber subscriber = Subscriber.newBuilder(ProjectSubscriptionName.newBuilder().setProject(projectId).setSubscription(subscriptionId).build(), (pubsubMessage, ackReplyConsumer) -> {
if (pubsubMessage.getAttributesCount() > 0 && pubsubMessage.getAttributesMap().get("DlpJobName").equals(dlpJobName)) {
// notify job completion
done.set(true);
ackReplyConsumer.ack();
}
}).build();
subscriber.startAsync();
// For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
try {
done.get(1, TimeUnit.MINUTES);
// Wait for the job to become available
Thread.sleep(500);
} catch (TimeoutException e) {
System.out.println("Unable to verify job completion.");
}
// Retrieve completed job status
DlpJob completedJob = dlpServiceClient.getDlpJob(GetDlpJobRequest.newBuilder().setName(dlpJobName).build());
System.out.println("Job status: " + completedJob.getState());
AnalyzeDataSourceRiskDetails riskDetails = completedJob.getRiskDetails();
AnalyzeDataSourceRiskDetails.CategoricalStatsResult result = riskDetails.getCategoricalStatsResult();
for (CategoricalStatsHistogramBucket bucket : result.getValueFrequencyHistogramBucketsList()) {
System.out.printf("Most common value occurs %d time(s).\n", bucket.getValueFrequencyUpperBound());
System.out.printf("Least common value occurs %d time(s).\n", bucket.getValueFrequencyLowerBound());
for (ValueFrequency valueFrequency : bucket.getBucketValuesList()) {
System.out.printf("Value %s occurs %d time(s).\n", valueFrequency.getValue().toString(), valueFrequency.getCount());
}
}
} catch (Exception e) {
System.out.println("Error in categoricalStatsAnalysis: " + e.getMessage());
}
}
use of com.google.privacy.dlp.v2.FieldId in project java-docs-samples by GoogleCloudPlatform.
the class RiskAnalysis method calculateLDiversity.
// [END dlp_k_anonymity]
// [START dlp_l_diversity]
/**
* Calculate l-diversity for an attribute relative to quasi-identifiers in a BigQuery table.
*
* @param projectId The Google Cloud Platform project ID to run the API call under.
* @param datasetId The BigQuery dataset to analyze.
* @param tableId The BigQuery table to analyze.
* @param sensitiveAttribute The name of the attribute to compare the quasi-ID against
* @param quasiIds A set of column names that form a composite key ('quasi-identifiers').
* @param topicId The name of the Pub/Sub topic to notify once the job completes
* @param subscriptionId The name of the Pub/Sub subscription to use when listening for job
* completion status.
*/
private static void calculateLDiversity(String projectId, String datasetId, String tableId, String sensitiveAttribute, List<String> quasiIds, String topicId, String subscriptionId) throws Exception {
// Instantiates a client
try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
FieldId sensitiveAttributeField = FieldId.newBuilder().setName(sensitiveAttribute).build();
List<FieldId> quasiIdFields = quasiIds.stream().map(columnName -> FieldId.newBuilder().setName(columnName).build()).collect(Collectors.toList());
LDiversityConfig ldiversityConfig = LDiversityConfig.newBuilder().addAllQuasiIds(quasiIdFields).setSensitiveAttribute(sensitiveAttributeField).build();
BigQueryTable bigQueryTable = BigQueryTable.newBuilder().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId).build();
PrivacyMetric privacyMetric = PrivacyMetric.newBuilder().setLDiversityConfig(ldiversityConfig).build();
String topicName = String.format("projects/%s/topics/%s", projectId, topicId);
PublishToPubSub publishToPubSub = PublishToPubSub.newBuilder().setTopic(topicName).build();
// Create action to publish job status notifications over Google Cloud Pub/Sub
Action action = Action.newBuilder().setPubSub(publishToPubSub).build();
RiskAnalysisJobConfig riskAnalysisJobConfig = RiskAnalysisJobConfig.newBuilder().setSourceTable(bigQueryTable).setPrivacyMetric(privacyMetric).addActions(action).build();
CreateDlpJobRequest createDlpJobRequest = CreateDlpJobRequest.newBuilder().setParent(ProjectName.of(projectId).toString()).setRiskJob(riskAnalysisJobConfig).build();
DlpJob dlpJob = dlpServiceClient.createDlpJob(createDlpJobRequest);
String dlpJobName = dlpJob.getName();
final SettableApiFuture<Boolean> done = SettableApiFuture.create();
// Set up a Pub/Sub subscriber to listen on the job completion status
Subscriber subscriber = Subscriber.newBuilder(ProjectSubscriptionName.newBuilder().setProject(projectId).setSubscription(subscriptionId).build(), (pubsubMessage, ackReplyConsumer) -> {
if (pubsubMessage.getAttributesCount() > 0 && pubsubMessage.getAttributesMap().get("DlpJobName").equals(dlpJobName)) {
// notify job completion
done.set(true);
ackReplyConsumer.ack();
}
}).build();
subscriber.startAsync();
// For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
try {
done.get(1, TimeUnit.MINUTES);
// Wait for the job to become available
Thread.sleep(500);
} catch (TimeoutException e) {
System.out.println("Unable to verify job completion.");
}
// retrieve completed job status
DlpJob completedJob = dlpServiceClient.getDlpJob(GetDlpJobRequest.newBuilder().setName(dlpJobName).build());
System.out.println("Job status: " + completedJob.getState());
AnalyzeDataSourceRiskDetails riskDetails = completedJob.getRiskDetails();
LDiversityResult ldiversityResult = riskDetails.getLDiversityResult();
for (LDiversityHistogramBucket result : ldiversityResult.getSensitiveValueFrequencyHistogramBucketsList()) {
for (LDiversityEquivalenceClass bucket : result.getBucketValuesList()) {
List<String> quasiIdValues = bucket.getQuasiIdsValuesList().stream().map(Value::toString).collect(Collectors.toList());
System.out.println("\tQuasi-ID values: " + String.join(", ", quasiIdValues));
System.out.println("\tClass size: " + bucket.getEquivalenceClassSize());
for (ValueFrequency valueFrequency : bucket.getTopSensitiveValuesList()) {
System.out.printf("\t\tSensitive value %s occurs %d time(s).\n", valueFrequency.getValue().toString(), valueFrequency.getCount());
}
}
}
} catch (Exception e) {
System.out.println("Error in calculateLDiversity: " + e.getMessage());
}
}
use of com.google.privacy.dlp.v2.FieldId in project java-docs-samples by GoogleCloudPlatform.
the class RiskAnalysis method calculateKAnonymity.
// [END dlp_categorical_stats]
// [START dlp_k_anonymity]
/**
* Calculate k-anonymity for quasi-identifiers in a BigQuery table using the DLP API.
*
* @param projectId The Google Cloud Platform project ID to run the API call under.
* @param datasetId The BigQuery dataset to analyze.
* @param tableId The BigQuery table to analyze.
* @param quasiIds The names of columns that form a composite key ('quasi-identifiers').
* @param topicId The name of the Pub/Sub topic to notify once the job completes
* @param subscriptionId The name of the Pub/Sub subscription to use when listening for job
* completion status.
*/
private static void calculateKAnonymity(String projectId, String datasetId, String tableId, List<String> quasiIds, String topicId, String subscriptionId) throws Exception {
// Instantiates a client
try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
List<FieldId> quasiIdFields = quasiIds.stream().map(columnName -> FieldId.newBuilder().setName(columnName).build()).collect(Collectors.toList());
KAnonymityConfig kanonymityConfig = KAnonymityConfig.newBuilder().addAllQuasiIds(quasiIdFields).build();
BigQueryTable bigQueryTable = BigQueryTable.newBuilder().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId).build();
PrivacyMetric privacyMetric = PrivacyMetric.newBuilder().setKAnonymityConfig(kanonymityConfig).build();
String topicName = String.format("projects/%s/topics/%s", projectId, topicId);
PublishToPubSub publishToPubSub = PublishToPubSub.newBuilder().setTopic(topicName).build();
// Create action to publish job status notifications over Google Cloud Pub/Sub
Action action = Action.newBuilder().setPubSub(publishToPubSub).build();
RiskAnalysisJobConfig riskAnalysisJobConfig = RiskAnalysisJobConfig.newBuilder().setSourceTable(bigQueryTable).setPrivacyMetric(privacyMetric).addActions(action).build();
CreateDlpJobRequest createDlpJobRequest = CreateDlpJobRequest.newBuilder().setParent(ProjectName.of(projectId).toString()).setRiskJob(riskAnalysisJobConfig).build();
DlpJob dlpJob = dlpServiceClient.createDlpJob(createDlpJobRequest);
String dlpJobName = dlpJob.getName();
final SettableApiFuture<Boolean> done = SettableApiFuture.create();
// Set up a Pub/Sub subscriber to listen on the job completion status
Subscriber subscriber = Subscriber.newBuilder(ProjectSubscriptionName.newBuilder().setProject(projectId).setSubscription(subscriptionId).build(), (pubsubMessage, ackReplyConsumer) -> {
if (pubsubMessage.getAttributesCount() > 0 && pubsubMessage.getAttributesMap().get("DlpJobName").equals(dlpJobName)) {
// notify job completion
done.set(true);
ackReplyConsumer.ack();
}
}).build();
subscriber.startAsync();
// For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
try {
done.get(1, TimeUnit.MINUTES);
// Wait for the job to become available
Thread.sleep(500);
} catch (TimeoutException e) {
System.out.println("Unable to verify job completion.");
}
// Retrieve completed job status
DlpJob completedJob = dlpServiceClient.getDlpJob(GetDlpJobRequest.newBuilder().setName(dlpJobName).build());
System.out.println("Job status: " + completedJob.getState());
AnalyzeDataSourceRiskDetails riskDetails = completedJob.getRiskDetails();
KAnonymityResult kanonymityResult = riskDetails.getKAnonymityResult();
for (KAnonymityHistogramBucket result : kanonymityResult.getEquivalenceClassHistogramBucketsList()) {
System.out.printf("Bucket size range: [%d, %d]\n", result.getEquivalenceClassSizeLowerBound(), result.getEquivalenceClassSizeUpperBound());
for (KAnonymityEquivalenceClass bucket : result.getBucketValuesList()) {
List<String> quasiIdValues = bucket.getQuasiIdsValuesList().stream().map(v -> v.toString()).collect(Collectors.toList());
System.out.println("\tQuasi-ID values: " + String.join(", ", quasiIdValues));
System.out.println("\tClass size: " + bucket.getEquivalenceClassSize());
}
}
} catch (Exception e) {
System.out.println("Error in calculateKAnonymity: " + e.getMessage());
}
}
use of com.google.privacy.dlp.v2.FieldId in project java-docs-samples by GoogleCloudPlatform.
the class RiskAnalysis method numericalStatsAnalysis.
// [START dlp_numerical_stats]
/**
* Calculate numerical statistics for a column in a BigQuery table using the DLP API.
*
* @param projectId The Google Cloud Platform project ID to run the API call under.
* @param datasetId The BigQuery dataset to analyze.
* @param tableId The BigQuery table to analyze.
* @param columnName The name of the column to analyze, which must contain only numerical data.
* @param topicId The name of the Pub/Sub topic to notify once the job completes
* @param subscriptionId The name of the Pub/Sub subscription to use when listening for job
* completion status.
*/
private static void numericalStatsAnalysis(String projectId, String datasetId, String tableId, String columnName, String topicId, String subscriptionId) throws Exception {
// Instantiates a client
try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
BigQueryTable bigQueryTable = BigQueryTable.newBuilder().setTableId(tableId).setDatasetId(datasetId).setProjectId(projectId).build();
FieldId fieldId = FieldId.newBuilder().setName(columnName).build();
NumericalStatsConfig numericalStatsConfig = NumericalStatsConfig.newBuilder().setField(fieldId).build();
PrivacyMetric privacyMetric = PrivacyMetric.newBuilder().setNumericalStatsConfig(numericalStatsConfig).build();
String topicName = String.format("projects/%s/topics/%s", projectId, topicId);
PublishToPubSub publishToPubSub = PublishToPubSub.newBuilder().setTopic(topicName).build();
// Create action to publish job status notifications over Google Cloud Pub/Sub
Action action = Action.newBuilder().setPubSub(publishToPubSub).build();
RiskAnalysisJobConfig riskAnalysisJobConfig = RiskAnalysisJobConfig.newBuilder().setSourceTable(bigQueryTable).setPrivacyMetric(privacyMetric).addActions(action).build();
CreateDlpJobRequest createDlpJobRequest = CreateDlpJobRequest.newBuilder().setParent(ProjectName.of(projectId).toString()).setRiskJob(riskAnalysisJobConfig).build();
DlpJob dlpJob = dlpServiceClient.createDlpJob(createDlpJobRequest);
String dlpJobName = dlpJob.getName();
final SettableApiFuture<Boolean> done = SettableApiFuture.create();
// Set up a Pub/Sub subscriber to listen on the job completion status
Subscriber subscriber = Subscriber.newBuilder(ProjectSubscriptionName.newBuilder().setProject(projectId).setSubscription(subscriptionId).build(), (pubsubMessage, ackReplyConsumer) -> {
if (pubsubMessage.getAttributesCount() > 0 && pubsubMessage.getAttributesMap().get("DlpJobName").equals(dlpJobName)) {
// notify job completion
done.set(true);
ackReplyConsumer.ack();
}
}).build();
subscriber.startAsync();
// For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
try {
done.get(1, TimeUnit.MINUTES);
// Wait for the job to become available
Thread.sleep(500);
} catch (TimeoutException e) {
System.out.println("Unable to verify job completion.");
}
// Retrieve completed job status
DlpJob completedJob = dlpServiceClient.getDlpJob(GetDlpJobRequest.newBuilder().setName(dlpJobName).build());
System.out.println("Job status: " + completedJob.getState());
AnalyzeDataSourceRiskDetails riskDetails = completedJob.getRiskDetails();
AnalyzeDataSourceRiskDetails.NumericalStatsResult result = riskDetails.getNumericalStatsResult();
System.out.printf("Value range : [%.3f, %.3f]\n", result.getMinValue().getFloatValue(), result.getMaxValue().getFloatValue());
int percent = 1;
Double lastValue = null;
for (Value quantileValue : result.getQuantileValuesList()) {
Double currentValue = quantileValue.getFloatValue();
if (lastValue == null || !lastValue.equals(currentValue)) {
System.out.printf("Value at %s %% quantile : %.3f", percent, currentValue);
}
lastValue = currentValue;
}
} catch (Exception e) {
System.out.println("Error in categoricalStatsAnalysis: " + e.getMessage());
}
}
Aggregations