use of com.thinkbiganalytics.policy.FieldPolicy in project kylo by Teradata.
the class FieldPolicyLoader method loadFieldPolicy.
/**
 * Reads the field policy JSON file at the given path and builds the field policies.
 *
 * @param path path to the field policy JSON file
 * @return a map of field name to its field policy
 */
public Map<String, FieldPolicy> loadFieldPolicy(String path) {
    log.info("Loading field policy JSON file at {}", path);
    String policyJson = "[]";
    /*
     * If Spark is running in yarn-cluster mode, the policy JSON file is passed via the --files
     * parameter and added to the driver classpath in the Application Master. In that case "path"
     * is not valid, since it points to the local file system. To handle this, check for the field
     * policy file in the current working directory (the classpath) for yarn-cluster mode as well
     * as at the given path for yarn-client mode.
     *
     * Alternatively, the SparkContext can be used to read
     * sparkContext.getConf().get("spark.submit.deployMode") and decide which path to use.
     */
    File policyFile = new File(path);
    if (policyFile.exists() && policyFile.isFile()) {
        log.info("Loading field policies at {}", path);
    } else {
        log.info("Couldn't find field policy file at {}; will check the classpath.", path);
        String fileName = policyFile.getName();
        path = "./" + fileName;
    }
    try (BufferedReader br = new BufferedReader(new FileReader(path))) {
        StringBuilder sb = new StringBuilder();
        String line = br.readLine();
        if (line == null) {
            log.error("Field policy file at {} is empty", path);
        }
        while (line != null) {
            sb.append(line);
            line = br.readLine();
        }
        policyJson = sb.toString();
    } catch (Exception e) {
        log.error("Error reading field policy file at {}. Please verify it contains valid JSON.", path, e);
    }
    FieldPoliciesJsonTransformer fieldPoliciesJsonTransformer = new FieldPoliciesJsonTransformer(policyJson);
    fieldPoliciesJsonTransformer.augmentPartitionColumnValidation();
    Map<String, FieldPolicy> map = fieldPoliciesJsonTransformer.buildPolicies();
    log.info("Finished building field policies for file: {} with entity that has {} fields", path, map.size());
    return map;
}
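The comment inside loadFieldPolicy mentions deciding between the classpath and the supplied path by reading spark.submit.deployMode. Below is a minimal sketch of that alternative, assuming an already running SparkContext; the helper class name and the fallback behavior are illustrative and not part of Kylo.

import java.io.File;

import org.apache.spark.SparkContext;

public class PolicyPathResolver {

    /**
     * Hypothetical helper: returns the supplied path in yarn-client mode, or the bare file name
     * in the working directory when the file was shipped to the driver with --files in
     * yarn-cluster mode.
     */
    public static String resolvePolicyPath(SparkContext sparkContext, String path) {
        String deployMode = sparkContext.getConf().get("spark.submit.deployMode", "client");
        if ("cluster".equalsIgnoreCase(deployMode)) {
            return "./" + new File(path).getName();
        }
        return path;
    }
}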
use of com.thinkbiganalytics.policy.FieldPolicy in project kylo by Teradata.
the class StandardDataValidator method resolvePolicies.
/**
 * Returns an array of field-level policies for data validation and cleansing
 */
private FieldPolicy[] resolvePolicies(StructField[] fields, Map<String, FieldPolicy> policyMap) {
    List<FieldPolicy> pols = new ArrayList<>(fields.length);
    for (StructField field : fields) {
        String colName = field.name().toLowerCase();
        FieldPolicy policy = policyMap.get(colName);
        if (policy == null) {
            policy = FieldPolicyBuilder.SKIP_VALIDATION;
        }
        pols.add(policy);
    }
    return pols.toArray(new FieldPolicy[0]);
}
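The lookup above implies that the policy map is keyed by lower-cased field names, and that any column without an entry silently falls back to FieldPolicyBuilder.SKIP_VALIDATION. The sketch below expresses that resolution step with Map.getOrDefault; the helper class is illustrative and the FieldPolicyBuilder import path is an assumption.

import java.util.Map;

import org.apache.spark.sql.types.StructField;

import com.thinkbiganalytics.policy.FieldPolicy;
import com.thinkbiganalytics.policy.FieldPolicyBuilder; // assumed package

final class PolicyLookupSketch {

    /** Same contract as the loop body in resolvePolicies, expressed with Map.getOrDefault. */
    static FieldPolicy policyFor(StructField field, Map<String, FieldPolicy> policyMap) {
        return policyMap.getOrDefault(field.name().toLowerCase(), FieldPolicyBuilder.SKIP_VALIDATION);
    }
}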
use of com.thinkbiganalytics.policy.FieldPolicy in project kylo by Teradata.
the class StandardDataValidator method validateTable.
@Nonnull
@Override
public DataValidatorResult validateTable(@Nonnull final String databaseName, @Nonnull final String sourceTableName, @Nonnull final String targetTableName,
                                          @Nonnull final String partition, final int numPartitions, @Nonnull final Map<String, FieldPolicy> policyMap,
                                          @Nonnull final HiveContext hiveContext) {
    // Extract fields from a source table
    String definitionsTableToUse = targetTableName;
    // TODO: should be at debug level
    log.info("Constructing field policies from table definitions of '{}'", definitionsTableToUse);
    StructField[] fields = resolveSchema(databaseName, definitionsTableToUse, hiveContext);
    FieldPolicy[] policies = resolvePolicies(fields, policyMap);
    Column[] columns = toSelectColumns(policies);
    DataSet sourceDF = scs.toDataSet(hiveContext, HiveUtils.quoteIdentifier(databaseName, sourceTableName))
        .select(columns)
        .filter("processing_dttm = '" + partition + "'");
    // Repartition if necessary
    if (numPartitions > 0) {
        log.info("Partition count: {}", numPartitions);
        sourceDF = sourceDF.repartition(numPartitions);
    }
    return validate(sourceDF, policies, fields);
}
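A hedged usage sketch of this method: the table names, partition value, and policy file path below are placeholders, and the loader, validator, and hiveContext instances are assumed to be wired up elsewhere (for example via the Spring context shown in Validator.run further down).

// Sketch only: values are placeholders, not a recommended configuration.
DataValidatorResult validateMyFeed(FieldPolicyLoader loader, StandardDataValidator validator, HiveContext hiveContext) {
    Map<String, FieldPolicy> policyMap = loader.loadFieldPolicy("/tmp/field-policy.json");
    return validator.validateTable(
        "my_db",           // databaseName
        "my_feed_feed",    // sourceTableName, read with the processing_dttm filter
        "my_feed_valid",   // targetTableName, used here only to resolve the field definitions
        "20190101000000",  // processing_dttm partition value
        0,                 // numPartitions <= 0 keeps the source partitioning
        policyMap,
        hiveContext);
}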
use of com.thinkbiganalytics.policy.FieldPolicy in project kylo by Teradata.
the class Validator method run.
private void run(@Nonnull final PrintStream out, @Nonnull final String... args) {
    // Check how many arguments were passed in
    if (args.length < 4) {
        String msg = "Proper Usage is: <targetDatabase> <entity> <partition> <path-to-policy-file>\n"
                     + "You can optionally add: --hiveConf hive.setting=value --hiveConf hive.other.setting=value\n"
                     + "You can optionally add: --storageLevel rdd_persistence_level_value\n"
                     + "You can optionally add: --numPartitions number_of_rdd_partitions\n"
                     + "You provided " + args.length + " args which are (comma separated): " + StringUtils.join(args, ",");
        out.println(msg);
        throw new IllegalArgumentException(msg);
    }
    final SparkContext sparkContext = SparkContext.getOrCreate();
    try {
        final ValidatorConfiguration params = new ValidatorConfiguration(args);
        // Initialize Spring context
        try (final ConfigurableApplicationContext ctx = new AnnotationConfigApplicationContext("com.thinkbiganalytics.spark")) {
            final DataValidator app = ctx.getBean(DataValidator.class);
            // Prepare Hive context
            final HiveContext hiveContext = new HiveContext(sparkContext);
            for (final Param param : params.getHiveParams()) {
                log.info("Adding Hive parameter {}={}", param.getName(), param.getValue());
                hiveContext.setConf(param.getName(), param.getValue());
            }
            log.info("Deployment Mode - {}", hiveContext.sparkContext().getConf().get("spark.submit.deployMode"));
            Map<String, FieldPolicy> policyMap = ctx.getBean(FieldPolicyLoader.class).loadFieldPolicy(params.getFieldPolicyJsonPath());
            // Run validation
            final DataValidatorResult results = app.validateTable(params.getTargetDatabase(), params.getFeedTableName(), params.getValidTableName(),
                                                                  params.getPartition(), params.getNumPartitions(), policyMap, hiveContext);
            log.info("Persistence level: {}", params.getStorageLevel());
            results.persist(StorageLevel.fromString(params.getStorageLevel()));
            app.saveInvalidToTable(params.getTargetDatabase(), params.getFeedTableName(), params.getInvalidTableName(), results, hiveContext);
            app.saveValidToTable(params.getTargetDatabase(), params.getFeedTableName(), params.getValidTableName(), results, hiveContext);
            app.saveProfileToTable(params.getTargetDatabase(), params.getProfileTableName(), params.getPartition(), results, hiveContext);
            results.unpersist();
        }
        log.info("Validator app finished");
    } catch (Exception e) {
        log.error("Failed to perform validation: {}", e.toString(), e);
        throw e;
    }
}
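The usage message above defines the argument contract for the validator. A hypothetical argument vector that satisfies it is sketched below; every value is an example, not a required setting.

// Sketch only: example arguments matching the usage message in run(...).
String[] args = new String[] {
    "my_db",                     // <targetDatabase>
    "my_feed",                   // <entity>
    "20190101000000",            // <partition> (processing_dttm value)
    "/tmp/field-policy.json",    // <path-to-policy-file>
    "--hiveConf", "hive.exec.dynamic.partition=true",
    "--storageLevel", "MEMORY_AND_DISK",
    "--numPartitions", "10"
};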
use of com.thinkbiganalytics.policy.FieldPolicy in project kylo by Teradata.
the class Profiler method checkCommandLineArgs.
/**
 * Checks the command line arguments and builds the query to profile.
 *
 * @param args list of command line arguments
 * @return the query to run (null if the arguments are invalid)
 */
@Nullable
private String checkCommandLineArgs(final String[] args) {
    if (log.isInfoEnabled()) {
        log.info("Running Spark Profiler with the following command line {} args (comma separated): {}", args.length, StringUtils.join(args, ","));
    }
    if (args.length < 5) {
        log.error("Invalid number of command line arguments ({})", args.length);
        showCommandLineArgs();
        return null;
    }
    String retVal;
    String profileObjectType = args[0];
    String profileObjectDesc = args[1];
    Integer n = Integer.valueOf(args[2]);
    String profileOutputTable = args[3];
    String fieldPolicyJsonPath = args[4];
    Map<String, FieldPolicy> policyMap = loader.loadFieldPolicy(fieldPolicyJsonPath);
    String inputAndOutputTablePartitionKey = "ALL";
    if (args.length >= 6) {
        inputAndOutputTablePartitionKey = args[5];
    }
    switch (profileObjectType) {
        case "table":
            // Quote source table
            final String[] tableRef = profileObjectDesc.split("\\.", 2);
            final String safeTable = tableRef.length == 1 ? HiveUtils.quoteIdentifier(tableRef[0]) : HiveUtils.quoteIdentifier(tableRef[0], tableRef[1]);
            // Create SQL
            List<String> profiledColumns = new ArrayList<>();
            for (FieldPolicy fieldPolicy : policyMap.values()) {
                if (fieldPolicy.isProfile()) {
                    profiledColumns.add(HiveUtils.quoteIdentifier(fieldPolicy.getField().toLowerCase()));
                }
            }
            if (!profiledColumns.isEmpty()) {
                retVal = "select " + StringUtils.join(profiledColumns, ',') + " from " + safeTable;
                if (inputAndOutputTablePartitionKey != null && !"ALL".equalsIgnoreCase(inputAndOutputTablePartitionKey)) {
                    retVal += " where " + HiveUtils.quoteIdentifier(profilerConfiguration.getInputTablePartitionColumnName())
                              + " = " + HiveUtils.quoteString(inputAndOutputTablePartitionKey);
                }
            } else {
                retVal = null;
            }
            break;
        case "query":
            retVal = profileObjectDesc;
            break;
        default:
            log.error("Illegal command line argument for object type ({})", profileObjectType);
            showCommandLineArgs();
            return null;
    }
    if (n <= 0) {
        log.error("Illegal command line argument for n for top_n values ({})", n);
        showCommandLineArgs();
        return null;
    } else {
        profilerConfiguration.setNumberOfTopNValues(n);
    }
    if (!setOutputTableDBAndName(profileOutputTable, profilerConfiguration)) {
        log.error("Illegal command line argument for output table ({})", profileOutputTable);
        showCommandLineArgs();
        return null;
    }
    profilerConfiguration.setInputAndOutputTablePartitionKey(inputAndOutputTablePartitionKey);
    return retVal;
}
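As a rough illustration of the "table" branch, the sketch below shows a hypothetical argument vector and, in comments, the shape of the query it would produce. The table names, path, and partition value are placeholders, and the backtick quoting and the processing_dttm default partition column are assumptions about HiveUtils and the profiler configuration.

// Sketch only: a hypothetical invocation of the profiler's "table" mode.
String[] args = new String[] {
    "table",                    // profile a table (the other supported mode is "query")
    "mydb.my_table",            // database.table to profile
    "10",                       // n for top_n values
    "mydb.profile_result",      // output table
    "/tmp/field-policy.json",   // field policy JSON controlling which columns are profiled
    "20190101000000"            // optional partition key; "ALL" (or omission) profiles all partitions
};
// With "id" and "amount" marked as profiled in the policy file, the generated query would take the form
//   select `id`,`amount` from `mydb`.`my_table` where `processing_dttm` = '20190101000000'
// assuming the default processing_dttm input partition column and backtick quoting by HiveUtils.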