Search in sources :

Example 1 with FieldPolicy

use of com.thinkbiganalytics.policy.FieldPolicy in project kylo by Teradata.

Source: class CleanseAndValidateRowTest, method rangeValidate.

/**
 * Builds a FieldPolicy containing a single RangeValidator for [min, max] and runs the
 * validator over {@code value} parsed as {@code dataType}, returning the final validation result.
 *
 * @param min      lower bound accepted by the range validator
 * @param max      upper bound accepted by the range validator
 * @param dataType Hive data type name used to interpret {@code value} (e.g. "int", "double")
 * @param value    the raw field value to validate
 * @return the final ValidationResult produced by the standardize-and-validate pass
 */
private ValidationResult rangeValidate(Number min, Number max, String dataType, String value) {
    RangeValidator validatorPolicy = new RangeValidator(min, max);
    List<BaseFieldPolicy> policies = new ArrayList<>();
    policies.add(validatorPolicy);
    // BUG FIX: addPolicies(policies) was called twice on the builder, which registered the
    // range validator twice on the resulting FieldPolicy. Register the policy list once.
    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName("field1").feedFieldName("field1").build();
    StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, value, HCatDataType.createFromDataType("field1", dataType), new HashMap<Class, Class>());
    return result.getFinalValidationResult();
}
Also used : FieldPolicy(com.thinkbiganalytics.policy.FieldPolicy) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) ArrayList(java.util.ArrayList) RangeValidator(com.thinkbiganalytics.policy.validation.RangeValidator) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) StandardizationAndValidationResult(com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult)

Example 2 with FieldPolicy

use of com.thinkbiganalytics.policy.FieldPolicy in project kylo by Teradata.

Source: class CleanseAndValidateRowTest, method invalidStandardizeAndValidate.

@Test
public void invalidStandardizeAndValidate() {
    // Policy chain: replace "foo" with "bar", check against lookup value "blah"
    // (which will NOT match), then replace "bar" with "test" and check against "aatestaa".
    // Because the first lookup fails, the overall result must be invalid even though the
    // standardizers still transform the value.
    final String fieldName = "field1";
    final List<BaseFieldPolicy> chain = new ArrayList<>();
    chain.add(new SimpleRegexReplacer("(?i)foo", "bar"));
    chain.add(new LookupValidator("blah"));
    chain.add(new SimpleRegexReplacer("(?i)bar", "test"));
    chain.add(new LookupValidator("aatestaa"));

    final FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(chain).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
    final HCatDataType stringType = HCatDataType.createFromDataType(fieldName, "string");

    final StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, "aafooaa", stringType, new HashMap<Class, Class>());

    // Only the first regex was applied before the failing lookup stopped the chain.
    assertEquals("aabaraa", result.getFieldValue());
    assertNotEquals(StandardDataValidator.VALID_RESULT, result.getFinalValidationResult());
}
Also used : FieldPolicy(com.thinkbiganalytics.policy.FieldPolicy) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) HCatDataType(com.thinkbiganalytics.spark.validation.HCatDataType) ArrayList(java.util.ArrayList) LookupValidator(com.thinkbiganalytics.policy.validation.LookupValidator) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) SimpleRegexReplacer(com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer) StandardizationAndValidationResult(com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult) Test(org.junit.Test)

Example 3 with FieldPolicy

use of com.thinkbiganalytics.policy.FieldPolicy in project kylo by Teradata.

Source: class CleanseAndValidateRowTest, method standardizeRegex.

@Test
public void standardizeRegex() {
    // Verifies the SimpleRegexReplacer standardizer: case-insensitive "foo" -> "bar",
    // and that null and empty inputs pass through unchanged.
    SimpleRegexReplacer standardizer = new SimpleRegexReplacer("(?i)foo", "bar");
    String fieldName = "field1";
    List<BaseFieldPolicy> policies = new ArrayList<>();
    policies.add(standardizer);
    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
    HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
    StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, "aafooaa", fieldDataType, new HashMap<Class, Class>());
    // BUG FIX: JUnit's assertEquals takes (expected, actual); the arguments were inverted,
    // which produces misleading failure messages. Fixed here and below to match the
    // expected-first convention used by the other tests in this class.
    assertEquals("aabaraa", result.getFieldValue());
    result = validator.standardizeAndValidateField(fieldPolicy, null, fieldDataType, new HashMap<Class, Class>());
    assertNull(result.getFieldValue());
    result = validator.standardizeAndValidateField(fieldPolicy, "", fieldDataType, new HashMap<Class, Class>());
    assertEquals("", result.getFieldValue());
}
Also used : FieldPolicy(com.thinkbiganalytics.policy.FieldPolicy) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) HashMap(java.util.HashMap) HCatDataType(com.thinkbiganalytics.spark.validation.HCatDataType) ArrayList(java.util.ArrayList) SimpleRegexReplacer(com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) StandardizationAndValidationResult(com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult) Test(org.junit.Test)

Example 4 with FieldPolicy

use of com.thinkbiganalytics.policy.FieldPolicy in project kylo by Teradata.

Source: class CleanseAndValidateRowTest, method mixedStandardizeAndValidate.

@Test
public void mixedStandardizeAndValidate() {
    // Interleaves casing standardizers with character validators: each standardizer
    // is immediately followed by a validator asserting the case it just applied.
    // The final standardizer in the chain is LOWERCASE, so the end value is lower-cased.
    final String inputValue = "TeSt_fiELd";
    final String fieldName = "field1";
    final List<BaseFieldPolicy> policies = new ArrayList<>();
    for (String expectedCase : new String[] {"UPPERCASE", "LOWERCASE", "UPPERCASE", "LOWERCASE"}) {
        if ("UPPERCASE".equals(expectedCase)) {
            policies.add(UppercaseStandardizer.instance());
        } else {
            policies.add(LowercaseStandardizer.instance());
        }
        policies.add(new CharacterValidator(expectedCase));
    }
    final FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
    final HCatDataType stringType = HCatDataType.createFromDataType(fieldName, "string");

    final StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, inputValue, stringType, new HashMap<Class, Class>());

    assertEquals(StandardDataValidator.VALID_RESULT, result.getFinalValidationResult());
    assertEquals("test_field", result.getFieldValue());
}
Also used : CharacterValidator(com.thinkbiganalytics.policy.validation.CharacterValidator) FieldPolicy(com.thinkbiganalytics.policy.FieldPolicy) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) HCatDataType(com.thinkbiganalytics.spark.validation.HCatDataType) ArrayList(java.util.ArrayList) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) StandardizationAndValidationResult(com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult) Test(org.junit.Test)

Example 5 with FieldPolicy

use of com.thinkbiganalytics.policy.FieldPolicy in project kylo by Teradata.

Source: class ModifiedSchema, method getValidTableSchema.

/**
 * Builds the schema for the _valid table from the feed fields, the valid-table fields, and the
 * field policies. Feed fields covered by a policy are kept; when a field has standardization
 * policies its type is taken from the _valid table (with date/timestamp relaxed to string, since
 * standardizers operate on string values). Two bookkeeping columns are appended at the end:
 * the reject-reason column followed by the processing-dttm partition column.
 *
 * @param feedFields  fields of the feed (source) table
 * @param validFields fields of the _valid (target) table
 * @param policies    per-field policies mapping feed fields to valid-table fields
 * @return the StructType describing the _valid table schema
 */
@Nonnull
public static StructType getValidTableSchema(@Nonnull final StructField[] feedFields, @Nonnull final StructField[] validFields, @Nonnull final FieldPolicy[] policies) {
    // Index the valid-table fields by lower-cased name for direct lookup.
    final Map<String, StructField> validFieldsMap = new HashMap<>();
    for (StructField validField : validFields) {
        String lowerFieldName = validField.name().toLowerCase();
        validFieldsMap.put(lowerFieldName, validField);
    }
    // All feed field names that appear in the policy map.
    final List<String> policyMapFeedFieldNames = new ArrayList<>();
    // Maps a feed field name to its corresponding valid-table field name.
    // BUG FIX: this map was previously populated as (fieldName -> feedFieldName) but is
    // looked up by feedFieldName below, so the valid-table name was only resolved when the
    // two names happened to be identical. Populate it in the orientation it is read.
    final Map<String, String> feedFieldToValidFieldMap = new HashMap<>();
    // Valid-table field names that have at least one standardization policy.
    final List<String> validFieldsWithStandardizers = new ArrayList<>();
    for (FieldPolicy policy : policies) {
        if (policy.getField() != null) {
            String feedFieldName = policy.getFeedField().toLowerCase();
            String fieldName = policy.getField().toLowerCase();
            policyMapFeedFieldNames.add(feedFieldName);
            feedFieldToValidFieldMap.put(feedFieldName, fieldName);
            if (policy.hasStandardizationPolicies()) {
                validFieldsWithStandardizers.add(fieldName);
            }
        }
    }
    List<StructField> fieldsList = new ArrayList<>(feedFields.length);
    for (StructField feedField : feedFields) {
        String lowerFeedFieldName = feedField.name().toLowerCase();
        if (policyMapFeedFieldNames.contains(lowerFeedFieldName)) {
            StructField field = feedField;
            // Resolve the corresponding valid-table field name for this feed field.
            String lowerFieldName = feedFieldToValidFieldMap.get(lowerFeedFieldName);
            // If the field is standardized, use the field type declared on the _valid table.
            if (validFieldsWithStandardizers.contains(lowerFieldName)) {
                field = validFieldsMap.get(lowerFieldName);
                HCatDataType dataType = HCatDataType.createFromDataType(field.name(), field.dataType().simpleString());
                // Standardizers produce strings, so date/timestamp columns are widened to string.
                if (dataType != null && dataType.isDateOrTimestamp()) {
                    field = new StructField(field.name(), DataTypes.StringType, field.nullable(), field.metadata());
                }
            }
            fieldsList.add(field);
        } else {
            // BUG FIX: the message previously said "Valid table field" but this logs a feed field.
            log.warn("Feed field {} is not present in policy map", lowerFeedFieldName);
        }
    }
    // Append processing_dttm last, then insert reject_reason just before it, so the final
    // column order is: ...fields..., reject_reason, processing_dttm (the partition column).
    fieldsList.add(new StructField(CleanseAndValidateRow.PROCESSING_DTTM_COL, DataTypes.StringType, true, Metadata.empty()));
    fieldsList.add(fieldsList.size() - 1, new StructField(CleanseAndValidateRow.REJECT_REASON_COL, DataTypes.StringType, true, Metadata.empty()));
    return new StructType(fieldsList.toArray(new StructField[0]));
}
Also used : StructField(org.apache.spark.sql.types.StructField) FieldPolicy(com.thinkbiganalytics.policy.FieldPolicy) StructType(org.apache.spark.sql.types.StructType) HashMap(java.util.HashMap) HCatDataType(com.thinkbiganalytics.spark.validation.HCatDataType) ArrayList(java.util.ArrayList) Nonnull(javax.annotation.Nonnull)

Aggregations

FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy)16 ArrayList (java.util.ArrayList)12 BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy)10 StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult)10 HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType)9 Test (org.junit.Test)7 SimpleRegexReplacer (com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer)4 LookupValidator (com.thinkbiganalytics.policy.validation.LookupValidator)3 HashMap (java.util.HashMap)3 StructField (org.apache.spark.sql.types.StructField)3 Nonnull (javax.annotation.Nonnull)2 FieldPoliciesJsonTransformer (com.thinkbiganalytics.policy.FieldPoliciesJsonTransformer)1 StandardizationPolicy (com.thinkbiganalytics.policy.standardization.StandardizationPolicy)1 CharacterValidator (com.thinkbiganalytics.policy.validation.CharacterValidator)1 NotNullValidator (com.thinkbiganalytics.policy.validation.NotNullValidator)1 RangeValidator (com.thinkbiganalytics.policy.validation.RangeValidator)1 ValidationResult (com.thinkbiganalytics.policy.validation.ValidationResult)1 DataSet (com.thinkbiganalytics.spark.DataSet)1 CleansedRowResult (com.thinkbiganalytics.spark.datavalidator.CleansedRowResult)1 FieldPolicyLoader (com.thinkbiganalytics.spark.policy.FieldPolicyLoader)1