Use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in the kylo project by Teradata.
The class ProfileStage, method apply.
@Nonnull
@Override
public TransformResult apply(@Nullable final TransformResult result) {
    Preconditions.checkNotNull(result);

    // Run the profiler against the transformed data set.
    final StatisticsModel dataStats = profiler.profile(result.getDataSet(), new ProfilerConfiguration());

    // Merge the computed statistics into the result, preserving any profile rows
    // that were already attached. A null model means no stats were produced.
    if (dataStats != null) {
        final List<OutputRow> profile;
        if (result.getProfile() != null) {
            profile = new ArrayList<>(result.getProfile());
        } else {
            profile = new ArrayList<OutputRow>(dataStats.getColumnStatisticsMap().size());
        }
        for (final ColumnStatistics columnStats : dataStats.getColumnStatisticsMap().values()) {
            profile.addAll(columnStats.getStatistics());
        }
        result.setProfile(profile);
    }
    return result;
}
Use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in the kylo project by Teradata.
The class DateColumnStatistics, method getStatistics.
/**
 * Builds the profile rows for this date column: the common statistics followed by
 * the maximum- and minimum-date metrics.
 *
 * @return the rows to write to the output result table
 */
@Override
public List<OutputRow> getStatistics() {
    final List<OutputRow> rows = new ArrayList<>();
    writeStatisticsCommon(rows);

    final String columnName = columnField.name();
    rows.add(new OutputRow(columnName, MetricType.MAX_DATE.toString(), String.valueOf(maxDate)));
    rows.add(new OutputRow(columnName, MetricType.MIN_DATE.toString(), String.valueOf(minDate)));
    return rows;
}
Use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in the kylo project by Teradata.
The class DoubleColumnStatistics, method getStatistics.
/**
 * Builds the profile rows for this double column: the common statistics followed by
 * max, min, sum, mean, standard deviation, and variance.
 *
 * @return the rows to write to the output result table
 */
@Override
public List<OutputRow> getStatistics() {
    final List<OutputRow> rows = new ArrayList<>();
    writeStatisticsCommon(rows);

    // A column containing only nulls reports zero for every numeric metric.
    // NOTE(review): this intentionally resets the instance fields, matching the
    // original behavior — callers may observe the zeroed values afterwards.
    if (allNulls()) {
        min = 0;
        max = 0;
        sum = 0;
        mean = 0;
        stddev = 0;
        variance = 0;
    }

    final String columnName = columnField.name();
    rows.add(new OutputRow(columnName, MetricType.MAX.toString(), String.valueOf(max)));
    rows.add(new OutputRow(columnName, MetricType.MIN.toString(), String.valueOf(min)));
    rows.add(new OutputRow(columnName, MetricType.SUM.toString(), String.valueOf(sum)));
    rows.add(new OutputRow(columnName, MetricType.MEAN.toString(), String.valueOf(mean)));
    rows.add(new OutputRow(columnName, MetricType.STDDEV.toString(), String.valueOf(stddev)));
    rows.add(new OutputRow(columnName, MetricType.VARIANCE.toString(), String.valueOf(variance)));
    return rows;
}
Use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in the kylo project by Teradata.
The class IntegerColumnStatistics, method getStatistics.
/**
 * Builds the profile rows for this integer column: the common statistics followed by
 * max, min, sum, and the decimal-formatted mean, standard deviation, and variance.
 *
 * @return the rows to write to the output result table
 */
@Override
public List<OutputRow> getStatistics() {
    final List<OutputRow> rows = new ArrayList<>();
    writeStatisticsCommon(rows);

    // A column containing only nulls reports zero for every numeric metric.
    // NOTE(review): this intentionally resets the instance fields, matching the
    // original behavior — callers may observe the zeroed values afterwards.
    if (allNulls()) {
        min = 0;
        max = 0;
        sum = 0;
        mean = 0;
        stddev = 0;
        variance = 0;
    }

    final String columnName = columnField.name();
    rows.add(new OutputRow(columnName, MetricType.MAX.toString(), String.valueOf(max)));
    rows.add(new OutputRow(columnName, MetricType.MIN.toString(), String.valueOf(min)));
    rows.add(new OutputRow(columnName, MetricType.SUM.toString(), String.valueOf(sum)));
    // Derived metrics go through the shared decimal formatter for display.
    rows.add(new OutputRow(columnName, MetricType.MEAN.toString(), String.valueOf(df.format(mean))));
    rows.add(new OutputRow(columnName, MetricType.STDDEV.toString(), String.valueOf(df.format(stddev))));
    rows.add(new OutputRow(columnName, MetricType.VARIANCE.toString(), String.valueOf(df.format(variance))));
    return rows;
}
Use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in the kylo project by Teradata.
The class StandardDataValidator, method getProfileStats.
/**
 * Computes validation profile statistics for the given result: global total/valid/invalid
 * row counts plus the per-column invalid counts.
 *
 * @param result the validator result containing the cleansed rows and schema
 * @return the profile rows describing the validation counts
 */
@Override
public List<OutputRow> getProfileStats(@Nonnull final DataValidatorResult result) {
    final List<OutputRow> stats = new ArrayList<>();

    // The schema's last field is not a data column, so the data-column count is length - 1.
    final int dataColumnCount = result.getSchema().length() - 1;
    final long[] validationCounts = cleansedRowResultsValidationCounts(result.getCleansedRowResultRDD(), dataColumnCount);

    // Global stats: the valid/invalid totals sit immediately after the per-column slots.
    final long validCount = validationCounts[dataColumnCount];
    final long invalidCount = validationCounts[dataColumnCount + 1];
    log.info("Valid count {} invalid count {}", validCount, invalidCount);
    stats.add(new OutputRow(ALL_COLUMNS, TOTAL_COUNT, Long.toString(validCount + invalidCount)));
    stats.add(new OutputRow(ALL_COLUMNS, VALID_COUNT, Long.toString(validCount)));
    stats.add(new OutputRow(ALL_COLUMNS, INVALID_COUNT, Long.toString(invalidCount)));

    // Per-column invalid counts, one row per data column.
    final StructField[] fields = result.getSchema().fields();
    for (int i = 0; i < validationCounts.length && i < fields.length - 1; i++) {
        stats.add(new OutputRow(fields[i].name(), INVALID_COUNT, Long.toString(validationCounts[i])));
    }
    return stats;
}
Aggregations