
Example 1 with HCatDataType

Use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata.

From the class ModifiedSchema, the method getValidTableSchema:

@Nonnull
public static StructType getValidTableSchema(@Nonnull final StructField[] feedFields, @Nonnull final StructField[] validFields, @Nonnull final FieldPolicy[] policies) {
    // Map of each lowercase valid field name to its StructField definition
    final Map<String, StructField> validFieldsMap = new HashMap<>();
    for (StructField validField : validFields) {
        String lowerFieldName = validField.name().toLowerCase();
        validFieldsMap.put(lowerFieldName, validField);
    }
    // List of all the feedFieldNames that are part of the policyMap
    final List<String> policyMapFeedFieldNames = new ArrayList<>();
    // Map of each lowercase feedFieldName to the valid table field name it corresponds to
    final Map<String, String> feedFieldToValidFieldMap = new HashMap<>();
    // List of all those validFieldNames that have a standardizer on them
    final List<String> validFieldsWithStandardizers = new ArrayList<>();
    for (FieldPolicy policy : policies) {
        if (policy.getField() != null) {
            String feedFieldName = policy.getFeedField().toLowerCase();
            String fieldName = policy.getField().toLowerCase();
            policyMapFeedFieldNames.add(feedFieldName);
            feedFieldToValidFieldMap.put(feedFieldName, fieldName);
            if (policy.hasStandardizationPolicies()) {
                validFieldsWithStandardizers.add(fieldName);
            }
        }
    }
    List<StructField> fieldsList = new ArrayList<>(feedFields.length);
    for (StructField feedField : feedFields) {
        String lowerFeedFieldName = feedField.name().toLowerCase();
        if (policyMapFeedFieldNames.contains(lowerFeedFieldName)) {
            StructField field = feedField;
            // look up the valid table field name that corresponds to this feed field
            String lowerFieldName = feedFieldToValidFieldMap.get(lowerFeedFieldName);
            // if we are standardizing then use the field type matching the _valid table
            if (validFieldsWithStandardizers.contains(lowerFieldName)) {
                // use the valid table's field definition rather than the feed field's
                field = validFieldsMap.get(lowerFieldName);
                HCatDataType dataType = HCatDataType.createFromDataType(field.name(), field.dataType().simpleString());
                if (dataType != null && dataType.isDateOrTimestamp()) {
                    field = new StructField(field.name(), DataTypes.StringType, field.nullable(), field.metadata());
                }
            }
            fieldsList.add(field);
        } else {
            log.warn("Feed field {} is not present in the policy map", lowerFeedFieldName);
        }
    }
    // Append the processing partition column, then insert the reject reason column immediately before it
    fieldsList.add(new StructField(CleanseAndValidateRow.PROCESSING_DTTM_COL, DataTypes.StringType, true, Metadata.empty()));
    fieldsList.add(fieldsList.size() - 1, new StructField(CleanseAndValidateRow.REJECT_REASON_COL, DataTypes.StringType, true, Metadata.empty()));
    return new StructType(fieldsList.toArray(new StructField[0]));
}
Also used: StructField (org.apache.spark.sql.types.StructField), FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy), StructType (org.apache.spark.sql.types.StructType), HashMap (java.util.HashMap), HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType), ArrayList (java.util.ArrayList), Nonnull (javax.annotation.Nonnull)
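For context, here is a minimal sketch of how this method might be invoked. It is illustrative only: the single "id" column, the "emp" table name, and the empty policy list are assumptions, with the FieldPolicyBuilder calls mirroring the test examples below.

// Illustrative sketch: a one-column feed whose valid table stores the column as
// a timestamp; the field policy carries no standardizers or validators.
StructField[] feedFields = { new StructField("id", DataTypes.StringType, true, Metadata.empty()) };
StructField[] validFields = { new StructField("id", DataTypes.TimestampType, true, Metadata.empty()) };
FieldPolicy[] policies = {
    FieldPolicyBuilder.newBuilder().addPolicies(new ArrayList<BaseFieldPolicy>()).tableName("emp").fieldName("id").feedFieldName("id").build()
};
StructType schema = ModifiedSchema.getValidTableSchema(feedFields, validFields, policies);
// With no standardization policies, "id" keeps its feed type (string), and the
// returned schema ends with REJECT_REASON_COL followed by PROCESSING_DTTM_COL.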

Example 2 with HCatDataType

Use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata.

From the class CleanseAndValidateRowTest, the method mixedStandardizeAndValidate:

@Test
public void mixedStandardizeAndValidate() {
    String fieldValue = "TeSt_fiELd";
    String fieldName = "field1";
    List<BaseFieldPolicy> policies = new ArrayList<>();
    policies.add(UppercaseStandardizer.instance());
    policies.add(new CharacterValidator("UPPERCASE"));
    policies.add(LowercaseStandardizer.instance());
    policies.add(new CharacterValidator("LOWERCASE"));
    policies.add(UppercaseStandardizer.instance());
    policies.add(new CharacterValidator("UPPERCASE"));
    policies.add(LowercaseStandardizer.instance());
    policies.add(new CharacterValidator("LOWERCASE"));
    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
    HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
    StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, fieldValue, fieldDataType, new HashMap<Class, Class>());
    assertEquals(StandardDataValidator.VALID_RESULT, result.getFinalValidationResult());
    assertEquals("test_field", result.getFieldValue());
}
Also used: CharacterValidator (com.thinkbiganalytics.policy.validation.CharacterValidator), FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy), BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy), HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType), ArrayList (java.util.ArrayList), StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult), Test (org.junit.Test)
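To make the expected "test_field" value concrete, the chain can be unrolled with the standardizers alone. This is a sketch of the net effect, assuming the convertValue method that the StandardizationPolicy implementations expose, not the actual code path inside standardizeAndValidateField.

// Each standardizer rewrites the value so that the validator following it passes.
String v = "TeSt_fiELd";
v = UppercaseStandardizer.instance().convertValue(v); // "TEST_FIELD" -> CharacterValidator("UPPERCASE") passes
v = LowercaseStandardizer.instance().convertValue(v); // "test_field" -> CharacterValidator("LOWERCASE") passes
// The second UPPERCASE/LOWERCASE pair repeats the same round trip, so the final
// value is "test_field" and every validator in the chain has passed.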

Example 3 with HCatDataType

Use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata.

From the class CleanseAndValidateRowTest, the method invalidStandardizeAndValidate:

@Test
public void invalidStandardizeAndValidate() {
    String fieldName = "field1";
    List<BaseFieldPolicy> policies = new ArrayList<>();
    policies.add(new SimpleRegexReplacer("(?i)foo", "bar"));
    policies.add(new LookupValidator("blah"));
    policies.add(new SimpleRegexReplacer("(?i)bar", "test"));
    policies.add(new LookupValidator("aatestaa"));
    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
    HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
    StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, "aafooaa", fieldDataType, new HashMap<Class, Class>());
    assertEquals("aabaraa", result.getFieldValue());
    assertNotEquals(StandardDataValidator.VALID_RESULT, result.getFinalValidationResult());
}
Also used: FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy), BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy), HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType), ArrayList (java.util.ArrayList), LookupValidator (com.thinkbiganalytics.policy.validation.LookupValidator), SimpleRegexReplacer (com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer), StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult), Test (org.junit.Test)
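The asserted value shows where evaluation stops. A sketch of the two regex steps, assuming the validator short-circuits the policy chain once a validation fails (which is what the "aabaraa" assertion implies):

// Step 1: SimpleRegexReplacer("(?i)foo", "bar") rewrites the input.
String v = "aafooaa".replaceAll("(?i)foo", "bar"); // "aabaraa"
// Step 2: LookupValidator("blah") rejects "aabaraa", so the second replacer
// "(?i)bar" -> "test" never runs; the field keeps "aabaraa" and the final
// validation result is not VALID_RESULT.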

Example 4 with HCatDataType

Use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata.

From the class CleanseAndValidateRowTest, the method standardizeRegex:

@Test
public void standardizeRegex() {
    SimpleRegexReplacer standardizer = new SimpleRegexReplacer("(?i)foo", "bar");
    String fieldName = "field1";
    List<BaseFieldPolicy> policies = new ArrayList<>();
    policies.add(standardizer);
    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
    HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
    StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, "aafooaa", fieldDataType, new HashMap<Class, Class>());
    assertEquals("aabaraa", result.getFieldValue());
    result = validator.standardizeAndValidateField(fieldPolicy, null, fieldDataType, new HashMap<Class, Class>());
    assertNull(result.getFieldValue());
    result = validator.standardizeAndValidateField(fieldPolicy, "", fieldDataType, new HashMap<Class, Class>());
    assertEquals("", result.getFieldValue());
}
Also used: FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy), BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy), HashMap (java.util.HashMap), HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType), ArrayList (java.util.ArrayList), SimpleRegexReplacer (com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer), StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult), Test (org.junit.Test)
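The three assertions pin down the standardizer's edge cases. A short sketch with plain java.util.regex shows what the "(?i)foo" pattern matches:

// The (?i) flag makes the pattern case-insensitive, so "foo", "FOO", and "FoO" all match.
java.util.regex.Pattern p = java.util.regex.Pattern.compile("(?i)foo");
String replaced = p.matcher("aafooaa").replaceAll("bar"); // "aabaraa"
// null and "" contain nothing to match, which is consistent with the assertions
// above: the pipeline returns them unchanged.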

Example 5 with HCatDataType

Use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata.

From the class CleanseAndValidateRow, the method resolveDataTypes:

/**
 * Converts the table schema into the corresponding data type structures
 */
@Nonnull
private HCatDataType[] resolveDataTypes(StructField[] fields) {
    List<HCatDataType> cols = new ArrayList<>(fields.length);
    for (StructField field : fields) {
        String colName = field.name();
        String dataType = field.dataType().simpleString();
        cols.add(HCatDataType.createFromDataType(colName, dataType));
    }
    return cols.toArray(new HCatDataType[0]);
}
Also used: StructField (org.apache.spark.sql.types.StructField), HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType), ArrayList (java.util.ArrayList), Nonnull (javax.annotation.Nonnull)
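A hypothetical call, assuming a two-column Spark schema (the column names are illustrative):

// Resolve HCat-compatible types for a two-column schema.
StructField[] fields = {
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("created_at", DataTypes.TimestampType, true, Metadata.empty())
};
HCatDataType[] types = resolveDataTypes(fields);
// simpleString() yields "int" and "timestamp", the forms that
// HCatDataType.createFromDataType parses into its type wrappers.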

Aggregations

HCatDataType (com.thinkbiganalytics.spark.validation.HCatDataType): 12 uses
FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy): 9 uses
ArrayList (java.util.ArrayList): 9 uses
Test (org.junit.Test): 9 uses
BaseFieldPolicy (com.thinkbiganalytics.policy.BaseFieldPolicy): 8 uses
StandardizationAndValidationResult (com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult): 8 uses
SimpleRegexReplacer (com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer): 4 uses
LookupValidator (com.thinkbiganalytics.policy.validation.LookupValidator): 3 uses
HashMap (java.util.HashMap): 3 uses
Nonnull (javax.annotation.Nonnull): 2 uses
StructField (org.apache.spark.sql.types.StructField): 2 uses
StandardizationPolicy (com.thinkbiganalytics.policy.standardization.StandardizationPolicy): 1 use
CharacterValidator (com.thinkbiganalytics.policy.validation.CharacterValidator): 1 use
ValidationResult (com.thinkbiganalytics.policy.validation.ValidationResult): 1 use
CleansedRowResult (com.thinkbiganalytics.spark.datavalidator.CleansedRowResult): 1 use
StructType (org.apache.spark.sql.types.StructType): 1 use