Use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata:
class ModifiedSchema, method getValidTableSchema.
/**
 * Builds the schema for the "_valid" table from the feed fields, the valid-table fields,
 * and the per-field policies.
 * <p>
 * Feed fields without an entry in the policy map are dropped (with a warning). For fields
 * that carry standardization policies, the valid-table field type is used instead of the
 * feed type, except that date/timestamp columns are widened to string so standardizers can
 * emit arbitrary text. Finally the reject-reason and processing-partition columns are
 * appended, with the reject-reason column placed before the processing partition column.
 *
 * @param feedFields  fields of the source (feed) table; order is preserved in the result
 * @param validFields fields of the destination (_valid) table, used for type lookups
 * @param policies    field policies; entries with a null field name are ignored
 * @return the schema to use for the valid table
 */
@Nonnull
public static StructType getValidTableSchema(@Nonnull final StructField[] feedFields, @Nonnull final StructField[] validFields, @Nonnull final FieldPolicy[] policies) {
    // Map of the lower-cased valid field name to the field definition
    final Map<String, StructField> validFieldsMap = new HashMap<>();
    for (StructField validField : validFields) {
        String lowerFieldName = validField.name().toLowerCase();
        validFieldsMap.put(lowerFieldName, validField);
    }

    // List of all the feedFieldNames that are part of the policyMap
    final List<String> policyMapFeedFieldNames = new ArrayList<>();
    // A map of the feedFieldName to validFieldName (keyed by feed field, per the lookup below)
    final Map<String, String> feedFieldToValidFieldMap = new HashMap<>();
    // List of all those validFieldNames that have a standardizer on them
    final List<String> validFieldsWithStandardizers = new ArrayList<>();
    for (FieldPolicy policy : policies) {
        if (policy.getField() != null) {
            String feedFieldName = policy.getFeedField().toLowerCase();
            String fieldName = policy.getField().toLowerCase();
            policyMapFeedFieldNames.add(feedFieldName);
            // BUG FIX: the map was previously populated as (fieldName -> feedFieldName),
            // which is the inverse of both its documented direction and the lookup by
            // feed field name below; that returned null whenever the feed and valid
            // field names differed, leading to an NPE on the null StructField.
            feedFieldToValidFieldMap.put(feedFieldName, fieldName);
            if (policy.hasStandardizationPolicies()) {
                validFieldsWithStandardizers.add(fieldName);
            }
        }
    }

    List<StructField> fieldsList = new ArrayList<>(feedFields.length);
    for (StructField feedField : feedFields) {
        String lowerFeedFieldName = feedField.name().toLowerCase();
        if (policyMapFeedFieldNames.contains(lowerFeedFieldName)) {
            StructField field = feedField;
            // get the corresponding valid table field name
            String lowerFieldName = feedFieldToValidFieldMap.get(lowerFeedFieldName);
            // if we are standardizing then use the field type matching the _valid table
            if (validFieldsWithStandardizers.contains(lowerFieldName)) {
                // get the valid table field; fall back to the feed field if the valid
                // table does not define it, rather than dereferencing null
                StructField validField = validFieldsMap.get(lowerFieldName);
                if (validField != null) {
                    field = validField;
                }
                HCatDataType dataType = HCatDataType.createFromDataType(field.name(), field.dataType().simpleString());
                // standardizers may produce free-form text, so widen temporal types to string
                if (dataType != null && dataType.isDateOrTimestamp()) {
                    field = new StructField(field.name(), DataTypes.StringType, field.nullable(), field.metadata());
                }
            }
            fieldsList.add(field);
        } else {
            log.warn("Valid table field {} is not present in policy map", lowerFeedFieldName);
        }
    }

    // Insert the two custom fields; REJECT_REASON_COL goes before the processing partition column
    fieldsList.add(new StructField(CleanseAndValidateRow.PROCESSING_DTTM_COL, DataTypes.StringType, true, Metadata.empty()));
    fieldsList.add(fieldsList.size() - 1, new StructField(CleanseAndValidateRow.REJECT_REASON_COL, DataTypes.StringType, true, Metadata.empty()));

    return new StructType(fieldsList.toArray(new StructField[0]));
}
Use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata:
class CleanseAndValidateRowTest, method mixedStandardizeAndValidate.
/**
 * Verifies that alternating standardizers and validators are applied in declaration order:
 * each case-changing standardizer is immediately followed by a validator that accepts its
 * output, so the whole chain validates and the final value is the lower-cased input.
 */
@Test
public void mixedStandardizeAndValidate() {
    final String fieldName = "field1";
    final String fieldValue = "TeSt_fiELd";

    // Two identical rounds of upper->check->lower->check.
    List<BaseFieldPolicy> policies = new ArrayList<>();
    for (int round = 0; round < 2; round++) {
        policies.add(UppercaseStandardizer.instance());
        policies.add(new CharacterValidator("UPPERCASE"));
        policies.add(LowercaseStandardizer.instance());
        policies.add(new CharacterValidator("LOWERCASE"));
    }

    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder()
        .addPolicies(policies)
        .tableName("emp")
        .fieldName(fieldName)
        .feedFieldName(fieldName)
        .build();
    HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");

    StandardizationAndValidationResult result =
        validator.standardizeAndValidateField(fieldPolicy, fieldValue, fieldDataType, new HashMap<Class, Class>());

    assertEquals(StandardDataValidator.VALID_RESULT, result.getFinalValidationResult());
    assertEquals("test_field", result.getFieldValue());
}
Use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata:
class CleanseAndValidateRowTest, method invalidStandardizeAndValidate.
/**
 * Verifies that when a validator fails mid-chain, processing stops there: the first
 * replacement ("foo" -> "bar") is applied, the first lookup rejects "aabaraa", and the
 * later standardizer/validator pair never changes the value, so the overall result is
 * invalid while the field keeps the value from the failing step.
 */
@Test
public void invalidStandardizeAndValidate() {
    final String name = "field1";

    List<BaseFieldPolicy> chain = new ArrayList<>();
    chain.add(new SimpleRegexReplacer("(?i)foo", "bar"));
    chain.add(new LookupValidator("blah"));
    chain.add(new SimpleRegexReplacer("(?i)bar", "test"));
    chain.add(new LookupValidator("aatestaa"));

    FieldPolicy policy = FieldPolicyBuilder.newBuilder()
        .addPolicies(chain)
        .tableName("emp")
        .fieldName(name)
        .feedFieldName(name)
        .build();
    HCatDataType stringType = HCatDataType.createFromDataType(name, "string");

    StandardizationAndValidationResult result =
        validator.standardizeAndValidateField(policy, "aafooaa", stringType, new HashMap<Class, Class>());

    assertEquals("aabaraa", result.getFieldValue());
    assertNotEquals(StandardDataValidator.VALID_RESULT, result.getFinalValidationResult());
}
Use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata:
class CleanseAndValidateRowTest, method standardizeRegex.
/**
 * Verifies {@link SimpleRegexReplacer} behavior through the validator for a match,
 * a null value, and an empty string.
 * <p>
 * Fix: the {@code assertEquals} calls passed the actual value first and the expected
 * value second; JUnit's contract is {@code assertEquals(expected, actual)}, and the
 * reversed order produces misleading failure messages. Argument order corrected.
 */
@Test
public void standardizeRegex() {
    SimpleRegexReplacer standardizer = new SimpleRegexReplacer("(?i)foo", "bar");
    String fieldName = "field1";
    List<BaseFieldPolicy> policies = new ArrayList<>();
    policies.add(standardizer);
    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
    HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");

    // Matching input: "foo" is replaced case-insensitively with "bar".
    StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, "aafooaa", fieldDataType, new HashMap<Class, Class>());
    assertEquals("aabaraa", result.getFieldValue());

    // Null passes through untouched.
    result = validator.standardizeAndValidateField(fieldPolicy, null, fieldDataType, new HashMap<Class, Class>());
    assertNull(result.getFieldValue());

    // Empty string has nothing to match and stays empty.
    result = validator.standardizeAndValidateField(fieldPolicy, "", fieldDataType, new HashMap<Class, Class>());
    assertEquals("", result.getFieldValue());
}
Use of com.thinkbiganalytics.spark.validation.HCatDataType in project kylo by Teradata:
class CleanseAndValidateRow, method resolveDataTypes.
/**
* Converts the table schema into the corresponding data type structures
*/
/**
 * Converts the table schema into the corresponding data type structures.
 *
 * @param fields the schema fields to convert; one data type is produced per field,
 *               in the same order
 * @return the resolved {@link HCatDataType} for each field
 */
@Nonnull
private HCatDataType[] resolveDataTypes(StructField[] fields) {
    // Fill the result array directly; the output length always equals the input length.
    HCatDataType[] resolved = new HCatDataType[fields.length];
    for (int i = 0; i < fields.length; i++) {
        StructField field = fields[i];
        resolved[i] = HCatDataType.createFromDataType(field.name(), field.dataType().simpleString());
    }
    return resolved;
}
Aggregations