Example use of com.thinkbiganalytics.spark.validation.HCatDataType in the kylo project by Teradata.
From class CleanseAndValidateRow, method call.
/**
 * Standardizes and validates every column of {@code row} and assembles the output row.
 *
 * <p>The output row has one slot per entry of {@code dataTypes} plus one appended slot. The
 * reject reasons (serialized by {@code toJSONArray}) go into the appended slot; when
 * {@code hasProcessingDttm} is set, the last data column is moved into the final slot and
 * the reject reason takes its former position.
 *
 * <p>The row is flagged invalid when any checked column fails validation, or when the null
 * count reaches the number of columns. For an invalid row, each output value that is null or
 * whose class differs from the original value's class is replaced by the original value.
 *
 * @param row the incoming row; fields that are null or that lie beyond the end of a short
 *            row are treated as {@code null}
 * @return the new row together with the per-column validity flags and the overall row validity
 * @throws Exception if any of the invoked standardization/validation helpers throws
 */
@Override
public CleansedRowResult call(@Nonnull final Row row) throws Exception {
    /*
    Cache for performance. Validators accept different parameters (numeric, string, etc.) so we need to resolve the type using reflection
     */
    Map<Class, Class> validatorParamType = new HashMap<>();

    // NOTE(review): the count starts at 1 when a processing-dttm column is present, presumably
    // because that column is always populated and would otherwise hide a blank line - confirm.
    int nulls = hasProcessingDttm ? 1 : 0;

    // Create placeholder for the new values plus one column for reject_reason
    Object[] newValues = new Object[dataTypes.length + 1];
    boolean rowValid = true;
    List<ValidationResult> results = null;
    boolean[] columnsValid = new boolean[dataTypes.length];
    Map<Integer, Object> originalValues = new HashMap<>();

    // Iterate through columns to cleanse and validate
    for (int idx = 0; idx < dataTypes.length; idx++) {
        FieldPolicy fieldPolicy = policies[idx];
        HCatDataType dataType = dataTypes[idx];
        boolean columnValid = true;
        boolean isBinaryType = dataType.getConvertibleType().equals(byte[].class);

        // Extract the value, allowing for null or missing fields in odd-ball data. Using >= (not ==)
        // keeps rows that are short by more than one field from indexing past the end of the row.
        Object val = (idx >= row.length() || row.isNullAt(idx)) ? null : row.get(idx);
        if (val == null) {
            nulls++;
        }
        originalValues.put(idx, val);

        if (dataType.isUnchecked()) {
            // Unchecked columns are passed through untouched
            newValues[idx] = val;
        } else {
            StandardizationAndValidationResult standardizationAndValidationResult =
                standardizeAndValidateField(fieldPolicy, val, dataType, validatorParamType);
            ValidationResult result = standardizationAndValidationResult.getFinalValidationResult();

            // only apply the standardized result value if the routine is valid
            Object fieldValue = result.isValid() ? standardizationAndValidationResult.getFieldValue() : val;

            // evaluate emptiness on the (possibly standardized) value
            boolean isEmpty = (fieldValue == null) || StringUtils.isEmpty(fieldValue.toString());

            // hive will auto convert byte[] or String fields to a target binary type.
            if (result.isValid() && isBinaryType && !(fieldValue instanceof byte[]) && !(fieldValue instanceof String)) {
                // any other value type cannot be stored in a binary column, so null it out
                fieldValue = null;
            } else if ((dataType.isNumeric() || isBinaryType) && isEmpty) {
                // an empty numeric or binary field is stored as null
                fieldValue = null;
            }
            newValues[idx] = fieldValue;

            if (!result.isValid()) {
                rowValid = false;
                if (results == null) {
                    results = new Vector<ValidationResult>();
                }
                results.addAll(standardizationAndValidationResult.getValidationResults());
                columnValid = false;
            }
        }

        // Record whether this column was valid
        columnsValid[idx] = columnValid;
    }

    // Return success unless all values were null. That would indicate a blank line in the file.
    if (nulls >= dataTypes.length) {
        rowValid = false;
        if (results == null) {
            results = new Vector<ValidationResult>();
        }
        results.add(ValidationResult.failRow("empty", "Row is empty"));
    }

    if (!rowValid) {
        for (int idx = 0; idx < dataTypes.length; idx++) {
            // the _invalid table dataTypes matches the source, not the destination
            Object original = originalValues.get(idx);
            if (newValues[idx] == null || original == null || newValues[idx].getClass() != original.getClass()) {
                newValues[idx] = original;
            }
            // otherwise the data has changed, but it is still the same data type so we can keep the newly changed value
        }
    }

    // Convert the reject reasons to JSON
    final String rejectReason = toJSONArray(results);

    // Record the results in the appended columns, move processing partition value last
    if (hasProcessingDttm) {
        // PROCESSING_DTTM_COL
        newValues[dataTypes.length] = newValues[dataTypes.length - 1];
        // REJECT_REASON_COL
        newValues[dataTypes.length - 1] = rejectReason;
    } else {
        newValues[dataTypes.length] = rejectReason;
    }

    return new CleansedRowResult(RowFactory.create(newValues), columnsValid, rowValid);
}
Example use of com.thinkbiganalytics.spark.validation.HCatDataType in the kylo project by Teradata.
From class CleanseAndValidateRowTest, method exceptionsShouldNotStopStandardization.
/**
 * Runs a string field through a field policy containing only {@code EXCEPTION_POLICY} and
 * checks that the returned field value is still the input, {@code "aafooaa"}.
 * NOTE(review): {@code EXCEPTION_POLICY} presumably throws while standardizing - confirm
 * against its definition.
 */
@Test
public void exceptionsShouldNotStopStandardization() {
    StandardizationPolicy standardizer = EXCEPTION_POLICY;
    String fieldName = "field1";

    List<BaseFieldPolicy> policies = new ArrayList<>();
    policies.add(standardizer);

    FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder()
        .addPolicies(policies)
        .tableName("emp")
        .fieldName(fieldName)
        .feedFieldName(fieldName)
        .build();
    HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");

    StandardizationAndValidationResult result =
        validator.standardizeAndValidateField(fieldPolicy, "aafooaa", fieldDataType, new HashMap<Class, Class>());

    // expected value first, so a failure message reports expected/actual the right way round
    assertEquals("aafooaa", result.getFieldValue());
}
Example use of com.thinkbiganalytics.spark.validation.HCatDataType in the kylo project by Teradata.
From class CleanseAndValidateRowTest, method castStringToBoolean.
/**
 * Converts the string {@code "true"} through a {@code boolean} {@link HCatDataType} and checks
 * that the native value is a {@code java.lang.Boolean} whose string form is {@code "true"}.
 *
 * @throws InvalidFormatException declared for {@code toNativeValue}
 */
@Test
public void castStringToBoolean() throws InvalidFormatException {
    String booleanFieldName = "flag";
    String booleanValueAsString = "true";

    HCatDataType dataType = HCatDataType.createFromDataType(booleanFieldName, "boolean");
    Object booleanValueAsBoolean = dataType.toNativeValue(booleanValueAsString);

    // expected value first, so a failure message reports expected/actual the right way round
    assertEquals("java.lang.Boolean", booleanValueAsBoolean.getClass().getName());
    assertEquals("true", booleanValueAsBoolean.toString());
}
Example use of com.thinkbiganalytics.spark.validation.HCatDataType in the kylo project by Teradata.
From class CleanseAndValidateRowTest, method standardizeShouldNotChangeType.
/**
 * Applies {@code ADD_ONE_STANDARDISATION_POLICY} twice to the {@code int} value 0 and checks
 * that the resulting field value equals 2 and that the final validation result is
 * {@code StandardDataValidator.VALID_RESULT}.
 */
@Test
public void standardizeShouldNotChangeType() {
    final String column = "field1";

    // Register the same standardisation policy twice.
    final List<BaseFieldPolicy> standardizers = new ArrayList<>();
    for (int i = 0; i < 2; i++) {
        standardizers.add(ADD_ONE_STANDARDISATION_POLICY);
    }

    final FieldPolicy policy = FieldPolicyBuilder.newBuilder()
        .addPolicies(standardizers)
        .tableName("temp")
        .fieldName(column)
        .feedFieldName(column)
        .build();
    final HCatDataType intType = HCatDataType.createFromDataType(column, "int");
    final HashMap<Class, Class> paramTypeCache = new HashMap<>();

    final StandardizationAndValidationResult outcome =
        validator.standardizeAndValidateField(policy, 0, intType, paramTypeCache);

    assertEquals(2, outcome.getFieldValue());
    assertEquals(StandardDataValidator.VALID_RESULT, outcome.getFinalValidationResult());
}
Example use of com.thinkbiganalytics.spark.validation.HCatDataType in the kylo project by Teradata.
From class CleanseAndValidateRowTest, method nullValueStandardizeAndValidate.
/**
 * Feeds a {@code null} value through an alternating chain of regex replacers and lookup
 * validators on a string field and checks that the final validation result is
 * {@code StandardDataValidator.VALID_RESULT}.
 */
@Test
public void nullValueStandardizeAndValidate() {
    final String column = "field1";

    // Order matters: replacer, validator, replacer, validator.
    final List<BaseFieldPolicy> chain = new ArrayList<>();
    chain.add(new SimpleRegexReplacer("(?i)foo", "bar"));
    chain.add(new LookupValidator("blah"));
    chain.add(new SimpleRegexReplacer("(?i)bar", "test"));
    chain.add(new LookupValidator("aatestaa"));

    final FieldPolicy policy = FieldPolicyBuilder.newBuilder()
        .addPolicies(chain)
        .tableName("emp")
        .fieldName(column)
        .feedFieldName(column)
        .build();
    final HCatDataType stringType = HCatDataType.createFromDataType(column, "string");
    final HashMap<Class, Class> paramTypeCache = new HashMap<>();

    final StandardizationAndValidationResult outcome =
        validator.standardizeAndValidateField(policy, null, stringType, paramTypeCache);

    assertEquals(StandardDataValidator.VALID_RESULT, outcome.getFinalValidationResult());
}
Aggregations