Search in sources:

Example 6 with HCatDataType

use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata.

the class CleanseAndValidateRow method call.

/**
 * Cleanses and validates a single input {@link Row} against the configured field policies.
 * <p>
 * For each column this applies the column's standardizers/validators, records whether the
 * column passed, and builds the output row with an appended reject_reason column (JSON array
 * of failures, or null when the row is valid). When {@code hasProcessingDttm} is set, the
 * processing-partition value is moved to the last position so reject_reason sits before it.
 *
 * @param row the source row; may be shorter than {@code dataTypes} for malformed data
 * @return the cleansed row plus per-column and whole-row validity flags
 * @throws Exception propagated from standardization/validation routines
 */
@Override
public CleansedRowResult call(@Nonnull final Row row) throws Exception {
    /*
    Cache for performance. Validators accept different parameters (numeric, string, etc.)
    so we need to resolve the accepted type once via reflection and memoize it here.
     */
    Map<Class, Class> validatorParamType = new HashMap<>();
    // Seed the null count with the processing_dttm column (always present when enabled)
    // so a row whose data columns are all null is still detected as an empty line.
    int nulls = hasProcessingDttm ? 1 : 0;
    // Create placeholder for the new values plus one column for reject_reason
    Object[] newValues = new Object[dataTypes.length + 1];
    boolean rowValid = true;
    String sbRejectReason;
    List<ValidationResult> results = null;
    boolean[] columnsValid = new boolean[dataTypes.length];
    // Pre-standardization value per column; used to restore originals on invalid rows,
    // because the _invalid table's schema matches the source types, not the destination.
    Map<Integer, Object> originalValues = new HashMap<>();
    // Iterate through columns to cleanse and validate
    for (int idx = 0; idx < dataTypes.length; idx++) {
        FieldPolicy fieldPolicy = policies[idx];
        HCatDataType dataType = dataTypes[idx];
        boolean columnValid = true;
        boolean isBinaryType = dataType.getConvertibleType().equals(byte[].class);
        // Extract the value, allowing for a null or missing field (odd-ball data).
        // Use >= rather than == so a row that is short by MORE than one column
        // cannot trigger an out-of-bounds access.
        Object val = (idx >= row.length() || row.isNullAt(idx)) ? null : row.get(idx);
        if (dataType.isUnchecked()) {
            // Unchecked columns pass straight through untouched
            if (val == null) {
                nulls++;
            }
            newValues[idx] = val;
            originalValues.put(idx, val);
        } else {
            Object fieldValue = val;
            if (fieldValue == null) {
                nulls++;
            }
            originalValues.put(idx, fieldValue);
            StandardizationAndValidationResult standardizationAndValidationResult =
                standardizeAndValidateField(fieldPolicy, fieldValue, dataType, validatorParamType);
            ValidationResult result = standardizationAndValidationResult.getFinalValidationResult();
            // Only apply the standardized result value if the routine reported valid
            fieldValue = result.isValid() ? standardizationAndValidationResult.getFieldValue() : fieldValue;
            // Re-evaluate emptiness against the (possibly standardized) value
            boolean isEmpty = (fieldValue == null) || StringUtils.isEmpty(fieldValue.toString());
            if (result.isValid() && isBinaryType && !(fieldValue instanceof byte[]) && !(fieldValue instanceof String)) {
                // Hive will auto-convert byte[] or String fields to a target binary type;
                // any other runtime type cannot be converted, so null it out.
                fieldValue = null;
            } else if ((dataType.isNumeric() || isBinaryType) && isEmpty) {
                // An empty value in a numeric or binary column must be stored as null
                fieldValue = null;
            }
            newValues[idx] = fieldValue;
            if (!result.isValid()) {
                rowValid = false;
                results = (results == null ? new Vector<ValidationResult>() : results);
                results.addAll(standardizationAndValidationResult.getValidationResults());
                columnValid = false;
            }
        }
        // Record whether this column was valid
        columnsValid[idx] = columnValid;
    }
    // Return success unless all values were null. That would indicate a blank line in the file.
    if (nulls >= dataTypes.length) {
        rowValid = false;
        results = (results == null ? new Vector<ValidationResult>() : results);
        results.add(ValidationResult.failRow("empty", "Row is empty"));
    }
    if (!rowValid) {
        for (int idx = 0; idx < dataTypes.length; idx++) {
            // The _invalid table's dataTypes match the source, not the destination, so when
            // standardization changed the value's runtime type (or nulled it), restore the
            // original; otherwise keep the changed value since the type is still compatible.
            if (newValues[idx] == null || originalValues.get(idx) == null || newValues[idx].getClass() != originalValues.get(idx).getClass()) {
                newValues[idx] = originalValues.get(idx);
            }
        }
    }
    // Convert the reject reasons to a JSON array (null when there are none)
    sbRejectReason = toJSONArray(results);
    // Record the results in the appended columns; move the processing partition value last
    if (hasProcessingDttm) {
        // PROCESSING_DTTM_COL
        newValues[dataTypes.length] = newValues[dataTypes.length - 1];
        // REJECT_REASON_COL
        newValues[dataTypes.length - 1] = sbRejectReason;
    } else {
        newValues[dataTypes.length] = sbRejectReason;
    }
    return new CleansedRowResult(RowFactory.create(newValues), columnsValid, rowValid);
}
Also used : FieldPolicy(com.thinkbiganalytics.policy.FieldPolicy) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) HashMap(java.util.HashMap) CleansedRowResult(com.thinkbiganalytics.spark.datavalidator.CleansedRowResult) StandardizationAndValidationResult(com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult) ValidationResult(com.thinkbiganalytics.policy.validation.ValidationResult) StandardizationAndValidationResult(com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult) HCatDataType(com.thinkbiganalytics.spark.validation.HCatDataType)

Example 7 with HCatDataType

use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata.

the class CleanseAndValidateRowTest method exceptionsShouldNotStopStandardization.

/**
 * A standardizer that throws must be skipped rather than aborting the chain,
 * leaving the field value unchanged.
 */
@Test
public void exceptionsShouldNotStopStandardization() {
    final String fieldName = "field1";
    final List<BaseFieldPolicy> chain = new ArrayList<>();
    final StandardizationPolicy throwingStandardizer = EXCEPTION_POLICY;
    chain.add(throwingStandardizer);
    final FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder()
        .addPolicies(chain)
        .tableName("emp")
        .fieldName(fieldName)
        .feedFieldName(fieldName)
        .build();
    final HCatDataType stringType = HCatDataType.createFromDataType(fieldName, "string");
    final StandardizationAndValidationResult outcome =
        validator.standardizeAndValidateField(fieldPolicy, "aafooaa", stringType, new HashMap<Class, Class>());
    // The input must survive the throwing standardizer untouched
    assertEquals(outcome.getFieldValue(), "aafooaa");
}
Also used : FieldPolicy(com.thinkbiganalytics.policy.FieldPolicy) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) StandardizationPolicy(com.thinkbiganalytics.policy.standardization.StandardizationPolicy) HCatDataType(com.thinkbiganalytics.spark.validation.HCatDataType) ArrayList(java.util.ArrayList) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) StandardizationAndValidationResult(com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult) Test(org.junit.Test)

Example 8 with HCatDataType

use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata.

the class CleanseAndValidateRowTest method castStringToBoolean.

/** The string "true" should convert to a java.lang.Boolean holding true. */
@Test
public void castStringToBoolean() throws InvalidFormatException {
    final String booleanFieldName = "flag";
    final String booleanValueAsString = "true";
    final HCatDataType booleanType = HCatDataType.createFromDataType(booleanFieldName, "boolean");
    final Object nativeValue = booleanType.toNativeValue(booleanValueAsString);
    // Conversion must produce the boxed Boolean type, not a String
    assertEquals(nativeValue.getClass().getName(), "java.lang.Boolean");
    assertEquals(nativeValue.toString(), "true");
}
Also used : HCatDataType(com.thinkbiganalytics.spark.validation.HCatDataType) Test(org.junit.Test)

Example 9 with HCatDataType

use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata.

the class CleanseAndValidateRowTest method standardizeShouldNotChangeType.

/**
 * Two add-one standardizers applied to an int field must keep the value an
 * Integer (0 -> 2) and leave the final validation result as VALID.
 */
@Test
public void standardizeShouldNotChangeType() {
    final String fieldName = "field1";
    final List<BaseFieldPolicy> chain = new ArrayList<>();
    // Apply the same add-one standardizer twice in a row
    for (int i = 0; i < 2; i++) {
        chain.add(ADD_ONE_STANDARDISATION_POLICY);
    }
    final FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder()
        .addPolicies(chain)
        .tableName("temp")
        .fieldName(fieldName)
        .feedFieldName(fieldName)
        .build();
    final HCatDataType intType = HCatDataType.createFromDataType(fieldName, "int");
    final StandardizationAndValidationResult outcome =
        validator.standardizeAndValidateField(fieldPolicy, 0, intType, new HashMap<Class, Class>());
    assertEquals(2, outcome.getFieldValue());
    assertEquals(StandardDataValidator.VALID_RESULT, outcome.getFinalValidationResult());
}
Also used : FieldPolicy(com.thinkbiganalytics.policy.FieldPolicy) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) HCatDataType(com.thinkbiganalytics.spark.validation.HCatDataType) ArrayList(java.util.ArrayList) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) StandardizationAndValidationResult(com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult) Test(org.junit.Test)

Example 10 with HCatDataType

use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata.

the class CleanseAndValidateRowTest method nullValueStandardizeAndValidate.

/**
 * A null field value run through a mixed standardizer/validator chain must not
 * fail validation: the final result stays VALID.
 */
@Test
public void nullValueStandardizeAndValidate() {
    final String fieldName = "field1";
    final List<BaseFieldPolicy> chain = new ArrayList<>();
    chain.add(new SimpleRegexReplacer("(?i)foo", "bar"));
    chain.add(new LookupValidator("blah"));
    chain.add(new SimpleRegexReplacer("(?i)bar", "test"));
    chain.add(new LookupValidator("aatestaa"));
    final FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder()
        .addPolicies(chain)
        .tableName("emp")
        .fieldName(fieldName)
        .feedFieldName(fieldName)
        .build();
    final HCatDataType stringType = HCatDataType.createFromDataType(fieldName, "string");
    final StandardizationAndValidationResult outcome =
        validator.standardizeAndValidateField(fieldPolicy, null, stringType, new HashMap<Class, Class>());
    assertEquals(StandardDataValidator.VALID_RESULT, outcome.getFinalValidationResult());
}
Also used : FieldPolicy(com.thinkbiganalytics.policy.FieldPolicy) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) HCatDataType(com.thinkbiganalytics.spark.validation.HCatDataType) ArrayList(java.util.ArrayList) LookupValidator(com.thinkbiganalytics.policy.validation.LookupValidator) BaseFieldPolicy(com.thinkbiganalytics.policy.BaseFieldPolicy) SimpleRegexReplacer(com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer) StandardizationAndValidationResult(com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult) Test(org.junit.Test)

Aggregations

HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType)12 FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy)9 ArrayList (java.util.ArrayList)9 Test (org.junit.Test)9 BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy)8 StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult)8 SimpleRegexReplacer (com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer)4 LookupValidator (com.thinkbiganalytics.policy.validation.LookupValidator)3 HashMap (java.util.HashMap)3 Nonnull (javax.annotation.Nonnull)2 StructField (org.apache.spark.sql.types.StructField)2 StandardizationPolicy (com.thinkbiganalytics.policy.standardization.StandardizationPolicy)1 CharacterValidator (com.thinkbiganalytics.policy.validation.CharacterValidator)1 ValidationResult (com.thinkbiganalytics.policy.validation.ValidationResult)1 CleansedRowResult (com.thinkbiganalytics.spark.datavalidator.CleansedRowResult)1 StructType (org.apache.spark.sql.types.StructType)1