Example 1 with StandardizationAndValidationResult

Use of com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult in project kylo by Teradata.

From the class CleanseAndValidateRow, the method standardizeAndValidateField applies each standardization policy to the field value in list order, then runs each validation policy against the (possibly converted) result, stopping at the first failure:

StandardizationAndValidationResult standardizeAndValidateField(FieldPolicy fieldPolicy, Object value, HCatDataType dataType, Map<Class, Class> validatorParamType) {
    StandardizationAndValidationResult result = new StandardizationAndValidationResult(value);
    List<BaseFieldPolicy> fieldPolicies = fieldPolicy.getAllPolicies();
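    // Count the standardizers up front so the second loop can tell when it is
    // processing the last one (standardizerCount == processedStandardizers below).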
    int standardizerCount = 0;
    for (BaseFieldPolicy p : fieldPolicies) {
        if (p instanceof StandardizationPolicy) {
            standardizerCount++;
        }
    }
    boolean validateNullValues = false;
    int processedStandardizers = 0;
    for (BaseFieldPolicy p : fieldPolicies) {
        boolean isEmpty = ((result.getFieldValue() == null) || (StringUtils.isEmpty(result.getFieldValue().toString())));
        if (p instanceof StandardizationPolicy) {
            processedStandardizers++;
            StandardizationPolicy standardizationPolicy = (StandardizationPolicy) p;
            boolean shouldStandardize = true;
            if (isEmpty && !(standardizationPolicy instanceof AcceptsEmptyValues)) {
                shouldStandardize = false;
            }
            if (!standardizationPolicy.accepts(result.getFieldValue())) {
                shouldStandardize = false;
            }
            if (shouldStandardize) {
                Object newValue = result.getFieldValue();
                try {
                    newValue = standardizationPolicy.convertRawValue(result.getFieldValue());
                } catch (Exception e) {
                    log.error("Standardizer '{}' threw exception while attempting to standardize value, original value will be kept. Exception: {}", standardizationPolicy.getClass(), e);
                }
                // If this is the last standardizer for this field and the standardized value's type does not match the target column's convertible type, validate the string form and convert it to the correct native type
                if (newValue != null && dataType.getConvertibleType() != newValue.getClass() && standardizerCount == processedStandardizers) {
                    try {
                        // Date and timestamp fields can be valid as strings
                        boolean isValueOk = dataType.isStringValueValidForHiveType(newValue.toString());
                        if (!isValueOk) {
                            // if the current string is not in a correct format attempt to convert it
                            try {
                                newValue = dataType.toNativeValue(newValue.toString());
                            } catch (RuntimeException e) {
                                result.addValidationResult(ValidationResult.failField("incompatible", dataType.getName(), "Not convertible to " + dataType.getNativeType()));
                            }
                        }
                    } catch (InvalidFormatException e) {
                        log.warn("Could not convert value {} to correct type {}", newValue.toString(), dataType.getConvertibleType().getName());
                    }
                }
                result.setFieldValue(newValue);
            }
        }
        if (p instanceof ValidationPolicy) {
            ValidationPolicy validationPolicy = (ValidationPolicy) p;
            // Skip validating empty values unless a NotNullValidator requires it or an earlier not-null check flagged them (validateNullValues)
            if (!isEmpty || validateNullValues || validationPolicy instanceof NotNullValidator) {
                ValidationResult validationResult = validateValue(validationPolicy, dataType, result.getFieldValue(), validatorParamType);
                if (isEmpty && validationPolicy instanceof NotNullValidator) {
                    validateNullValues = validationResult != VALID_RESULT;
                }
                // only need to add those that are invalid
                if (validationResult != VALID_RESULT) {
                    result.addValidationResult(validationResult);
                    // exit out of processing if invalid records found.
                    break;
                }
            }
            // reset the failOnEmpty flag back to false
            if (!(validationPolicy instanceof NotNullValidator)) {
                validateNullValues = false;
            }
        }
    }
    ValidationResult finalValidationCheck = finalValidationCheck(fieldPolicy, dataType, result.getFieldValue());
    if (finalValidationCheck != VALID_RESULT) {
        result.addValidationResult(finalValidationCheck);
    }
    return result;
}
Also used: NotNullValidator (com.thinkbiganalytics.policy.validation.NotNullValidator), AcceptsEmptyValues (com.thinkbiganalytics.policy.standardization.AcceptsEmptyValues), StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult), ValidationResult (com.thinkbiganalytics.policy.validation.ValidationResult), InvalidFormatException (com.thinkbiganalytics.spark.util.InvalidFormatException), BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy), ValidationPolicy (com.thinkbiganalytics.policy.validation.ValidationPolicy), StandardizationPolicy (com.thinkbiganalytics.policy.standardization.StandardizationPolicy)
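
For reference, here is a minimal sketch (not part of Kylo) of a custom standardizer that would plug into the loop above. It assumes StandardizationPolicy requires only the two methods the loop actually calls, accepts(Object) and convertRawValue(Object), with the return types shown; the real interface may declare more. Implementing the AcceptsEmptyValues marker opts the standardizer into receiving null or empty values, which the loop otherwise skips.

import com.thinkbiganalytics.policy.standardization.AcceptsEmptyValues;
import com.thinkbiganalytics.policy.standardization.StandardizationPolicy;

// Hypothetical example: trims surrounding whitespace from string values.
public class TrimStandardizer implements StandardizationPolicy, AcceptsEmptyValues {

    @Override
    public Boolean accepts(Object value) {
        // When this returns false, standardizeAndValidateField skips convertRawValue() entirely.
        return value != null;
    }

    @Override
    public Object convertRawValue(Object value) {
        // Null still reaches this method because of the AcceptsEmptyValues marker, so guard it here.
        return value == null ? null : value.toString().trim();
    }
}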

Example 2 with StandardizationAndValidationResult

Use of com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult in project kylo by Teradata.

From the test class CleanseAndValidateRowTest, the helper method rangeValidate builds a single-policy FieldPolicy around a RangeValidator and returns the final validation result:

private ValidationResult rangeValidate(Number min, Number max, String dataType, String value) {
    RangeValidator validatorPolicy = new RangeValidator(min, max);
    List<BaseFieldPolicy> policies = new ArrayList<>();
    policies.add(validatorPolicy);
    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName("field1").feedFieldName("field1").build();
    StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, value, HCatDataType.createFromDataType("field1", dataType), new HashMap<Class, Class>());
    return result.getFinalValidationResult();
}
Also used: FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy), BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy), ArrayList (java.util.ArrayList), RangeValidator (com.thinkbiganalytics.policy.validation.RangeValidator), StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult)
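
For context, hypothetical callers of this helper (not taken from the actual test suite) would assert the pass and fail cases; the "int" type name is assumed to be accepted by HCatDataType.createFromDataType:

// Illustrative only: a value inside [0, 100] should pass...
assertEquals(StandardDataValidator.VALID_RESULT, rangeValidate(0, 100, "int", "50"));
// ...and a value outside the range should fail.
assertNotEquals(StandardDataValidator.VALID_RESULT, rangeValidate(0, 100, "int", "150"));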

Example 3 with StandardizationAndValidationResult

Use of com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult in project kylo by Teradata.

From the test class CleanseAndValidateRowTest, the method mixedStandardizeAndValidate verifies that standardizers and validators are applied in the order they appear in the policy list:

@Test
public void mixedStandardizeAndValidate() {
    String fieldValue = "TeSt_fiELd";
    String fieldName = "field1";
    List<BaseFieldPolicy> policies = new ArrayList<>();
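    // Alternate standardizers and validators so each validator checks the value produced
    // by the standardizer immediately before it, confirming policies run in list order.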
    policies.add(UppercaseStandardizer.instance());
    policies.add(new CharacterValidator("UPPERCASE"));
    policies.add(LowercaseStandardizer.instance());
    policies.add(new CharacterValidator("LOWERCASE"));
    policies.add(UppercaseStandardizer.instance());
    policies.add(new CharacterValidator("UPPERCASE"));
    policies.add(LowercaseStandardizer.instance());
    policies.add(new CharacterValidator("LOWERCASE"));
    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
    HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
    StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, fieldValue, fieldDataType, new HashMap<Class, Class>());
    assertEquals(StandardDataValidator.VALID_RESULT, result.getFinalValidationResult());
    assertEquals("test_field", result.getFieldValue());
}
Also used: CharacterValidator (com.thinkbiganalytics.policy.validation.CharacterValidator), FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy), BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy), HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType), ArrayList (java.util.ArrayList), StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult), Test (org.junit.Test)

Example 4 with StandardizationAndValidationResult

Use of com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult in project kylo by Teradata.

From the test class CleanseAndValidateRowTest, the method invalidStandardizeAndValidate verifies that policy processing stops at the first validation failure:

@Test
public void invalidStandardizeAndValidate() {
    String fieldName = "field1";
    List<BaseFieldPolicy> policies = new ArrayList<>();
    policies.add(new SimpleRegexReplacer("(?i)foo", "bar"));
    policies.add(new LookupValidator("blah"));
    policies.add(new SimpleRegexReplacer("(?i)bar", "test"));
    policies.add(new LookupValidator("aatestaa"));
    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
    HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
    StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, "aafooaa", fieldDataType, new HashMap<Class, Class>());
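    // The first SimpleRegexReplacer turns "aafooaa" into "aabaraa"; LookupValidator("blah") then
    // rejects that value, and standardizeAndValidateField breaks out of its policy loop on the
    // first failure, so the second SimpleRegexReplacer never runs.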
    assertEquals("aabaraa", result.getFieldValue());
    assertNotEquals(StandardDataValidator.VALID_RESULT, result.getFinalValidationResult());
}
Also used: FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy), BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy), HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType), ArrayList (java.util.ArrayList), LookupValidator (com.thinkbiganalytics.policy.validation.LookupValidator), SimpleRegexReplacer (com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer), StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult), Test (org.junit.Test)

Example 5 with StandardizationAndValidationResult

Use of com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult in project kylo by Teradata.

From the test class CleanseAndValidateRowTest, the method standardizeRegex verifies SimpleRegexReplacer standardization and that null and empty values pass through unchanged:

@Test
public void standardizeRegex() {
    SimpleRegexReplacer standardizer = new SimpleRegexReplacer("(?i)foo", "bar");
    String fieldName = "field1";
    List<BaseFieldPolicy> policies = new ArrayList<>();
    policies.add(standardizer);
    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
    HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
    StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, "aafooaa", fieldDataType, new HashMap<Class, Class>());
    assertEquals(result.getFieldValue(), "aabaraa");
    result = validator.standardizeAndValidateField(fieldPolicy, null, fieldDataType, new HashMap<Class, Class>());
    assertNull(result.getFieldValue());
    result = validator.standardizeAndValidateField(fieldPolicy, "", fieldDataType, new HashMap<Class, Class>());
    assertEquals(result.getFieldValue(), "");
}
Also used: FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy), BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy), HashMap (java.util.HashMap), HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType), ArrayList (java.util.ArrayList), SimpleRegexReplacer (com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer), StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult), Test (org.junit.Test)

Aggregations

BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy): 11
StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult): 11
FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy): 10
ArrayList (java.util.ArrayList): 9
HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType): 8
Test (org.junit.Test): 7
SimpleRegexReplacer (com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer): 4
LookupValidator (com.thinkbiganalytics.policy.validation.LookupValidator): 3
StandardizationPolicy (com.thinkbiganalytics.policy.standardization.StandardizationPolicy): 2
NotNullValidator (com.thinkbiganalytics.policy.validation.NotNullValidator): 2
ValidationResult (com.thinkbiganalytics.policy.validation.ValidationResult): 2
HashMap (java.util.HashMap): 2
AcceptsEmptyValues (com.thinkbiganalytics.policy.standardization.AcceptsEmptyValues): 1
CharacterValidator (com.thinkbiganalytics.policy.validation.CharacterValidator): 1
RangeValidator (com.thinkbiganalytics.policy.validation.RangeValidator): 1
ValidationPolicy (com.thinkbiganalytics.policy.validation.ValidationPolicy): 1
CleansedRowResult (com.thinkbiganalytics.spark.datavalidator.CleansedRowResult): 1
InvalidFormatException (com.thinkbiganalytics.spark.util.InvalidFormatException): 1