Use of com.thinkbiganalytics.policy.validation.ValidationResult in project kylo by Teradata.
From the class CleanseAndValidateRow, the method standardizeAndValidateField:
StandardizationAndValidationResult standardizeAndValidateField(FieldPolicy fieldPolicy, Object value, HCatDataType dataType, Map<Class, Class> validatorParamType) {
StandardizationAndValidationResult result = new StandardizationAndValidationResult(value);
List<BaseFieldPolicy> fieldPolicies = fieldPolicy.getAllPolicies();
int standardizerCount = 0;
for (BaseFieldPolicy p : fieldPolicies) {
if (p instanceof StandardizationPolicy) {
standardizerCount++;
}
}
boolean validateNullValues = false;
int processedStandardizers = 0;
for (BaseFieldPolicy p : fieldPolicies) {
boolean isEmpty = ((result.getFieldValue() == null) || (StringUtils.isEmpty(result.getFieldValue().toString())));
if (p instanceof StandardizationPolicy) {
processedStandardizers++;
StandardizationPolicy standardizationPolicy = (StandardizationPolicy) p;
boolean shouldStandardize = true;
if (isEmpty && !(standardizationPolicy instanceof AcceptsEmptyValues)) {
shouldStandardize = false;
}
if (!standardizationPolicy.accepts(result.getFieldValue())) {
shouldStandardize = false;
}
if (shouldStandardize) {
Object newValue = result.getFieldValue();
try {
newValue = standardizationPolicy.convertRawValue(result.getFieldValue());
} catch (Exception e) {
log.error("Standardizer '{}' threw exception while attempting to standardize value, original value will be kept. Exception: {}", standardizationPolicy.getClass(), e);
}
// If this is the last standardizer for this field and the standardized value came back as a String while the target column is not a String, validate it and convert it to the correct type
if (newValue != null && dataType.getConvertibleType() != newValue.getClass() && standardizerCount == processedStandardizers) {
try {
// Date and timestamp fields can be valid as strings
boolean isValueOk = dataType.isStringValueValidForHiveType(newValue.toString());
if (!isValueOk) {
// if the current string is not in a correct format, attempt to convert it
try {
newValue = dataType.toNativeValue(newValue.toString());
} catch (RuntimeException e) {
result.addValidationResult(ValidationResult.failField("incompatible", dataType.getName(), "Not convertible to " + dataType.getNativeType()));
}
}
} catch (InvalidFormatException e) {
log.warn("Could not convert value {} to correct type {}", newValue.toString(), dataType.getConvertibleType().getName());
}
}
result.setFieldValue(newValue);
}
}
if (p instanceof ValidationPolicy) {
ValidationPolicy validationPolicy = (ValidationPolicy) p;
// not null validator
if (!isEmpty || validateNullValues || validationPolicy instanceof NotNullValidator) {
ValidationResult validationResult = validateValue(validationPolicy, dataType, result.getFieldValue(), validatorParamType);
if (isEmpty && validationPolicy instanceof NotNullValidator) {
validateNullValues = validationResult != VALID_RESULT;
}
// only need to add those that are invalid
if (validationResult != VALID_RESULT) {
result.addValidationResult(validationResult);
// exit processing once an invalid result is found
break;
}
}
// reset the validateNullValues flag back to false
if (!(validationPolicy instanceof NotNullValidator)) {
validateNullValues = false;
}
}
}
ValidationResult finalValidationCheck = finalValidationCheck(fieldPolicy, dataType, result.getFieldValue());
if (finalValidationCheck != VALID_RESULT) {
result.addValidationResult(finalValidationCheck);
}
return result;
}
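The loop above applies each standardizer only when two gates pass: the value must not be empty unless the policy implements the AcceptsEmptyValues marker, and the policy's accepts() check must succeed. The following minimal sketch mirrors just that gating logic. It is self-contained rather than a Kylo excerpt: the two interfaces are pared-down stand-ins for the Kylo policy types, and TrimStandardizer, DefaultValueStandardizer and applyIfAllowed are hypothetical names used only for illustration.
// Simplified stand-in for the Kylo StandardizationPolicy interface
interface StandardizationPolicy {
    boolean accepts(Object value);
    Object convertRawValue(Object value);
}
// Marker interface: standardizers that opt in to receiving null/empty values
interface AcceptsEmptyValues {
}
public class StandardizerGateSketch {
    // A trimming standardizer that does NOT accept empty values
    static class TrimStandardizer implements StandardizationPolicy {
        public boolean accepts(Object value) {
            return value instanceof String;
        }
        public Object convertRawValue(Object value) {
            return ((String) value).trim();
        }
    }
    // A defaulting standardizer that DOES accept empty values
    static class DefaultValueStandardizer implements StandardizationPolicy, AcceptsEmptyValues {
        public boolean accepts(Object value) {
            return true;
        }
        public Object convertRawValue(Object value) {
            return (value == null || value.toString().isEmpty()) ? "N/A" : value;
        }
    }
    // Mirrors the two gates in standardizeAndValidateField
    static Object applyIfAllowed(StandardizationPolicy policy, Object value) {
        boolean isEmpty = (value == null || value.toString().isEmpty());
        if (isEmpty && !(policy instanceof AcceptsEmptyValues)) {
            return value; // gate 1: empty value and the policy opts out of empties
        }
        if (!policy.accepts(value)) {
            return value; // gate 2: the policy rejects this value outright
        }
        return policy.convertRawValue(value); // safe to standardize
    }
    public static void main(String[] args) {
        System.out.println(applyIfAllowed(new TrimStandardizer(), "  abc  "));    // abc
        System.out.println(applyIfAllowed(new TrimStandardizer(), null));         // null (skipped by gate 1)
        System.out.println(applyIfAllowed(new DefaultValueStandardizer(), null)); // N/A
    }
}
The real method additionally converts a final standardized String back to the column's native Hive type once the last standardizer has run, which is why it tracks standardizerCount against processedStandardizers.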
Use of com.thinkbiganalytics.policy.validation.ValidationResult in project kylo by Teradata.
From the class CleanseAndValidateRow, the method call:
@Override
public CleansedRowResult call(@Nonnull final Row row) throws Exception {
/*
Cache for performance. Validators accept different parameter types (numeric, string, etc.), so we resolve the actual type using reflection.
*/
Map<Class, Class> validatorParamType = new HashMap<>();
int nulls = hasProcessingDttm ? 1 : 0;
// Create a placeholder for the new values plus one column for reject_reason
Object[] newValues = new Object[dataTypes.length + 1];
boolean rowValid = true;
String sbRejectReason;
List<ValidationResult> results = null;
boolean[] columnsValid = new boolean[dataTypes.length];
Map<Integer, Object> originalValues = new HashMap<>();
// Iterate through columns to cleanse and validate
for (int idx = 0; idx < dataTypes.length; idx++) {
ValidationResult result;
FieldPolicy fieldPolicy = policies[idx];
HCatDataType dataType = dataTypes[idx];
boolean columnValid = true;
boolean isBinaryType = dataType.getConvertibleType().equals(byte[].class);
// Extract the value (allowing for null or missing field for odd-ball data)
Object val = (idx == row.length() || row.isNullAt(idx) ? null : row.get(idx));
if (dataType.isUnchecked()) {
if (val == null) {
nulls++;
}
newValues[idx] = val;
originalValues.put(idx, val);
} else {
Object fieldValue = val;
boolean isEmpty;
if (fieldValue == null) {
nulls++;
}
originalValues.put(idx, fieldValue);
StandardizationAndValidationResult standardizationAndValidationResult = standardizeAndValidateField(fieldPolicy, fieldValue, dataType, validatorParamType);
result = standardizationAndValidationResult.getFinalValidationResult();
// only apply the standardized value if the final validation result is valid
fieldValue = result.isValid() ? standardizationAndValidationResult.getFieldValue() : fieldValue;
// reevaluate the isEmpty flag
isEmpty = ((fieldValue == null) || (StringUtils.isEmpty(fieldValue.toString())));
// Hive will auto-convert byte[] or String fields to a target binary type.
if (result.isValid() && isBinaryType && !(fieldValue instanceof byte[]) && !(fieldValue instanceof String)) {
// set it to null
fieldValue = null;
} else if ((dataType.isNumeric() || isBinaryType) && isEmpty) {
// if it's a numeric column and the field is empty, set it to null as well
fieldValue = null;
}
newValues[idx] = fieldValue;
if (!result.isValid()) {
rowValid = false;
results = (results == null ? new Vector<ValidationResult>() : results);
results.addAll(standardizationAndValidationResult.getValidationResults());
columnValid = false;
}
}
// Record whether this column was valid
columnsValid[idx] = columnValid;
}
// Return success unless all values were null. That would indicate a blank line in the file.
if (nulls >= dataTypes.length) {
rowValid = false;
results = (results == null ? new Vector<ValidationResult>() : results);
results.add(ValidationResult.failRow("empty", "Row is empty"));
}
if (!rowValid) {
for (int idx = 0; idx < dataTypes.length; idx++) {
// the _invalid table's dataTypes match the source, not the destination
if (newValues[idx] == null || originalValues.get(idx) == null || newValues[idx].getClass() != originalValues.get(idx).getClass()) {
newValues[idx] = originalValues.get(idx);
}
// otherwise the data has changed but is still the same data type, so we can keep the newly changed value
}
}
// Convert the reject reasons to JSON
sbRejectReason = toJSONArray(results);
// Record the results in the appended columns, move processing partition value last
if (hasProcessingDttm) {
// PROCESSING_DTTM_COL
newValues[dataTypes.length] = newValues[dataTypes.length - 1];
// REJECT_REASON_COL
newValues[dataTypes.length - 1] = sbRejectReason;
} else {
newValues[dataTypes.length] = sbRejectReason;
}
return new CleansedRowResult(RowFactory.create(newValues), columnsValid, rowValid);
}
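At the tail of call(), the reject_reason column is appended and, when a processing_dttm partition column is present, that partition value is moved to the last slot so reject_reason sits just before it. Below is a self-contained sketch of that index arithmetic; AppendedColumnLayoutSketch, appendRejectReason and the sample values are hypothetical, and only the array layout mirrors the method above.
import java.util.Arrays;
public class AppendedColumnLayoutSketch {
    static Object[] appendRejectReason(Object[] sourceValues, boolean hasProcessingDttm, String rejectReasonJson) {
        int n = sourceValues.length;                              // plays the role of dataTypes.length in call()
        Object[] newValues = Arrays.copyOf(sourceValues, n + 1);  // one extra slot for reject_reason
        if (hasProcessingDttm) {
            newValues[n] = newValues[n - 1];                      // move processing_dttm to the last slot
            newValues[n - 1] = rejectReasonJson;                  // reject_reason sits just before it
        } else {
            newValues[n] = rejectReasonJson;                      // no partition column: reject_reason goes last
        }
        return newValues;
    }
    public static void main(String[] args) {
        Object[] row = {"42", "bob", "20190101120000"};           // last column stands in for processing_dttm
        System.out.println(Arrays.toString(appendRejectReason(row, true, "[{\"scope\":\"field\"}]")));
        // [42, bob, [{"scope":"field"}], 20190101120000]
    }
}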
Use of com.thinkbiganalytics.policy.validation.ValidationResult in project kylo by Teradata.
From the class CleanseAndValidateRow, the method toJSONArray:
private String toJSONArray(List<ValidationResult> results) {
// Convert the reject reasons to JSON
StringBuilder sb = null;
if (results != null) {
sb = new StringBuilder();
for (ValidationResult result : results) {
if (sb.length() > 0) {
sb.append(",");
} else {
sb.append("[");
}
sb.append(result.toJSON());
}
sb.append("]");
}
return (sb == null ? "" : sb.toString());
}
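For reference, here is the same array-building pattern in a self-contained form, with plain JSON strings standing in for ValidationResult.toJSON() output (ToJsonArraySketch and the sample field names are illustrative):
import java.util.Arrays;
import java.util.List;
public class ToJsonArraySketch {
    // Same shape as toJSONArray above, but over plain strings instead of ValidationResult
    static String toJsonArray(List<String> jsonObjects) {
        StringBuilder sb = null;
        if (jsonObjects != null) {
            sb = new StringBuilder();
            for (String json : jsonObjects) {
                sb.append(sb.length() > 0 ? "," : "[");  // open the array before the first element, comma-separate the rest
                sb.append(json);
            }
            sb.append("]");
        }
        return (sb == null ? "" : sb.toString());
    }
    public static void main(String[] args) {
        System.out.println(toJsonArray(null));  // empty string: no reject reasons
        System.out.println(toJsonArray(Arrays.asList(
                "{\"scope\":\"field\",\"field\":\"age\",\"rule\":\"range\"}",
                "{\"scope\":\"row\",\"rule\":\"empty\"}")));
        // [{"scope":"field","field":"age","rule":"range"},{"scope":"row","rule":"empty"}]
    }
}
As in the original, a null list yields an empty string; a non-null but empty list would produce a lone "]", which never occurs in practice because call() only creates the results list at the moment it records a failure.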