Use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.
The class ShortColumnStatistics, method getStatistics:
/**
 * Writes the statistics for this column to the output result table.
 */
@Override
public List<OutputRow> getStatistics() {
    final List<OutputRow> rows = new ArrayList<>();
    writeStatisticsCommon(rows);

    // If the column contains only nulls, the numeric aggregates are undefined; report zeros instead.
    if (allNulls()) {
        min = 0;
        max = 0;
        sum = 0;
        mean = 0;
        stddev = 0;
        variance = 0;
    }

    rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MAX), String.valueOf(max)));
    rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MIN), String.valueOf(min)));
    rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.SUM), String.valueOf(sum)));
    rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MEAN), String.valueOf(mean)));
    rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.STDDEV), String.valueOf(stddev)));
    rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.VARIANCE), String.valueOf(variance)));
    return rows;
}
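For orientation, the three-argument constructor used above, together with the columnname, metrictype, and metricvalue columns selected later in StandardDataValidator, suggests that OutputRow is a simple three-field bean. The following is a minimal sketch under that assumption, not the actual kylo class, which may add setters' validation, equals/hashCode, or other members:

// Minimal sketch of an OutputRow-style bean; field names are inferred
// from the constructor usage and SQL columns, not copied from kylo.
public class OutputRow implements java.io.Serializable {

    private String columnName;
    private String metricType;
    private String metricValue;

    // No-arg constructor so the bean can be used with Spark's
    // bean-based DataFrame creation (assumed requirement).
    public OutputRow() {
    }

    public OutputRow(final String columnName, final String metricType, final String metricValue) {
        this.columnName = columnName;
        this.metricType = metricType;
        this.metricValue = metricValue;
    }

    public String getColumnName() { return columnName; }
    public String getMetricType() { return metricType; }
    public String getMetricValue() { return metricValue; }

    public void setColumnName(final String columnName) { this.columnName = columnName; }
    public void setMetricType(final String metricType) { this.metricType = metricType; }
    public void setMetricValue(final String metricValue) { this.metricValue = metricValue; }
}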
Use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.
The class ValidationStage, method apply:
@Nonnull
@Override
public TransformResult apply(@Nullable final TransformResult result) {
    Preconditions.checkNotNull(result);

    // Validate the data set
    final DataValidatorResult validatorResult = validator.validate(result.getDataSet(), getPolicyMap(result.getDataSet().schema()));

    // Add the validation result to the transform result
    final List<List<TransformValidationResult>> rows = validatorResult.getCleansedRowResultRDD().map(new ListTransformValidationResults()).collect();
    result.setValidationResults(rows);

    // Add the profile to the transform result
    final List<OutputRow> profile = (result.getProfile() != null) ? new ArrayList<>(result.getProfile()) : new ArrayList<OutputRow>();
    profile.addAll(validator.getProfileStats(validatorResult));
    result.setProfile(profile);

    return result;
}
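To see where this stage fits, here is a hedged usage sketch. The constructor signature, the getValidationResults accessor (assumed to mirror the setValidationResults setter used above), and the surrounding pipeline objects are hypothetical stand-ins, since only the apply method is shown:

// Hypothetical usage: apply the validation stage to an upstream result.
// `validator` and `previousResult` are assumed to exist in scope.
final ValidationStage stage = new ValidationStage(validator); // constructor signature assumed
final TransformResult validated = stage.apply(previousResult);

// The result now carries row-level validation outcomes...
final List<List<TransformValidationResult>> rowResults = validated.getValidationResults();
// ...and a profile extended with the validator's statistics.
final List<OutputRow> profile = validated.getProfile();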
Use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.
The class StandardDataValidator, method saveProfileToTable:
@Override
public void saveProfileToTable(@Nonnull final String databaseName, @Nonnull final String tableName, @Nonnull final String partition,
                               @Nonnull final DataValidatorResult result, @Nonnull final HiveContext hiveContext) {
    try {
        // Create a temporary table that can be used to copy data from. Writing directly to the partition from a Spark dataframe doesn't work.
        final String tempTable = tableName + "_" + System.currentTimeMillis();

        // TODO: Refactor this into something common with the profile table
        @SuppressWarnings("squid:S2095")
        final JavaRDD<OutputRow> statsRDD = JavaSparkContext.fromSparkContext(hiveContext.sparkContext()).parallelize(getProfileStats(result));

        final DataSet df = scs.toDataSet(hiveContext, statsRDD, OutputRow.class);
        df.registerTempTable(tempTable);

        final String insertSQL = "INSERT OVERWRITE TABLE " + HiveUtils.quoteIdentifier(databaseName, tableName)
                                 + " PARTITION (processing_dttm='" + partition + "')"
                                 + " SELECT columnname, metrictype, metricvalue FROM " + HiveUtils.quoteIdentifier(tempTable);
        log.info("Writing profile stats {}", insertSQL);
        scs.sql(hiveContext, insertSQL);
    } catch (final Exception e) {
        log.error("Failed to insert validation stats", e);
        throw Throwables.propagate(e);
    }
}
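Because the rows are staged through a temporary table, the caller only supplies the target coordinates. A hedged usage illustration follows; every literal value is an example placeholder, not taken from kylo:

// Hypothetical call: write profile statistics into one partition of a
// Hive table. Database, table, and partition values are placeholders.
validator.saveProfileToTable("userdata", "profile_results", "20160729000000", validatorResult, hiveContext);

// Internally this registers a temp table named
// profile_results_<currentTimeMillis> and runs, roughly:
//   INSERT OVERWRITE TABLE `userdata`.`profile_results`
//   PARTITION (processing_dttm='20160729000000')
//   SELECT columnname, metrictype, metricvalue FROM `profile_results_<millis>`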