Search in sources :

Example 6 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

the class ProfileStage method apply.

/**
 * Profiles the data set carried by {@code result} and appends the statistics rows of every
 * profiled column to the result's profile.
 *
 * <p>If the result already has a profile, its rows are copied and the new rows are added after
 * them; otherwise a fresh list is created. When the profiler returns {@code null} the result is
 * handed back without touching its profile.
 *
 * @param result the transform result to profile; rejected by {@code Preconditions.checkNotNull} when {@code null}
 * @return the same {@code result} instance that was passed in
 */
@Nonnull
@Override
public TransformResult apply(@Nullable final TransformResult result) {
    Preconditions.checkNotNull(result);

    final StatisticsModel model = profiler.profile(result.getDataSet(), new ProfilerConfiguration());
    if (model == null) {
        // Nothing was profiled, so the existing profile (if any) is left as it is.
        return result;
    }

    final List<OutputRow> mergedRows;
    if (result.getProfile() == null) {
        // Initial capacity is the number of profiled columns, as in the original sizing.
        mergedRows = new ArrayList<OutputRow>(model.getColumnStatisticsMap().size());
    } else {
        mergedRows = new ArrayList<>(result.getProfile());
    }

    for (final ColumnStatistics perColumn : model.getColumnStatisticsMap().values()) {
        mergedRows.addAll(perColumn.getStatistics());
    }

    result.setProfile(mergedRows);
    return result;
}
Also used : ColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.ColumnStatistics) StatisticsModel(com.thinkbiganalytics.spark.dataprofiler.StatisticsModel) ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow) Nonnull(javax.annotation.Nonnull)

Example 7 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

the class DateColumnStatistics method getStatistics.

/**
 * Builds the rows of the output result table for this date column.
 *
 * <p>The rows written by {@code writeStatisticsCommon} come first, followed by the
 * {@code MAX_DATE} row and then the {@code MIN_DATE} row.
 *
 * @return a newly created list holding the statistics rows
 */
@Override
public List<OutputRow> getStatistics() {
    final List<OutputRow> output = new ArrayList<>();
    writeStatisticsCommon(output);

    final String column = columnField.name();
    final OutputRow maxDateRow = new OutputRow(column, String.valueOf(MetricType.MAX_DATE), String.valueOf(maxDate));
    final OutputRow minDateRow = new OutputRow(column, String.valueOf(MetricType.MIN_DATE), String.valueOf(minDate));

    output.add(maxDateRow);
    output.add(minDateRow);
    return output;
}
Also used : ArrayList(java.util.ArrayList) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)

Example 8 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

the class DoubleColumnStatistics method getStatistics.

/**
 * Builds the rows of the output result table for this column.
 *
 * <p>The rows written by {@code writeStatisticsCommon} come first, followed by one row each for
 * {@code MAX}, {@code MIN}, {@code SUM}, {@code MEAN}, {@code STDDEV} and {@code VARIANCE}, in
 * that order.
 *
 * <p>Side effect: when {@code allNulls()} is true, the six underlying fields are reset to zero
 * before the rows are built, and they stay at zero after this method returns.
 *
 * @return a newly created list holding the statistics rows
 */
@Override
public List<OutputRow> getStatistics() {
    final List<OutputRow> output = new ArrayList<>();
    writeStatisticsCommon(output);

    // A column with only nulls reports zero for every numeric metric.
    if (allNulls()) {
        min = 0;
        max = 0;
        sum = 0;
        mean = 0;
        stddev = 0;
        variance = 0;
    }

    final String column = columnField.name();

    final String maxText = String.valueOf(max);
    final String minText = String.valueOf(min);
    final String sumText = String.valueOf(sum);
    final String meanText = String.valueOf(mean);
    final String stddevText = String.valueOf(stddev);
    final String varianceText = String.valueOf(variance);

    output.add(new OutputRow(column, String.valueOf(MetricType.MAX), maxText));
    output.add(new OutputRow(column, String.valueOf(MetricType.MIN), minText));
    output.add(new OutputRow(column, String.valueOf(MetricType.SUM), sumText));
    output.add(new OutputRow(column, String.valueOf(MetricType.MEAN), meanText));
    output.add(new OutputRow(column, String.valueOf(MetricType.STDDEV), stddevText));
    output.add(new OutputRow(column, String.valueOf(MetricType.VARIANCE), varianceText));
    return output;
}
Also used : ArrayList(java.util.ArrayList) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)

Example 9 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

the class IntegerColumnStatistics method getStatistics.

/**
 * Builds the rows of the output result table for this column.
 *
 * <p>The rows written by {@code writeStatisticsCommon} come first, followed by one row each for
 * {@code MAX}, {@code MIN}, {@code SUM}, {@code MEAN}, {@code STDDEV} and {@code VARIANCE}, in
 * that order. Mean, standard deviation and variance are rendered through {@code df.format};
 * the other three values are rendered with {@code String.valueOf}.
 *
 * <p>Side effect: when {@code allNulls()} is true, the six underlying fields are reset to zero
 * before the rows are built, and they stay at zero after this method returns.
 *
 * @return a newly created list holding the statistics rows
 */
@Override
public List<OutputRow> getStatistics() {
    final List<OutputRow> output = new ArrayList<>();
    writeStatisticsCommon(output);

    // A column with only nulls reports zero for every numeric metric.
    if (allNulls()) {
        min = 0;
        max = 0;
        sum = 0;
        mean = 0;
        stddev = 0;
        variance = 0;
    }

    final String column = columnField.name();

    // Plain values.
    final String maxText = String.valueOf(max);
    final String minText = String.valueOf(min);
    final String sumText = String.valueOf(sum);

    // Values passed through the formatter.
    final String meanText = String.valueOf(df.format(mean));
    final String stddevText = String.valueOf(df.format(stddev));
    final String varianceText = String.valueOf(df.format(variance));

    output.add(new OutputRow(column, String.valueOf(MetricType.MAX), maxText));
    output.add(new OutputRow(column, String.valueOf(MetricType.MIN), minText));
    output.add(new OutputRow(column, String.valueOf(MetricType.SUM), sumText));
    output.add(new OutputRow(column, String.valueOf(MetricType.MEAN), meanText));
    output.add(new OutputRow(column, String.valueOf(MetricType.STDDEV), stddevText));
    output.add(new OutputRow(column, String.valueOf(MetricType.VARIANCE), varianceText));
    return output;
}
Also used : ArrayList(java.util.ArrayList) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)

Example 10 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

the class StandardDataValidator method getProfileStats.

/**
 * Turns the validation counts of a validator result into profile statistics rows.
 *
 * <p>Three rows for {@code ALL_COLUMNS} are emitted first (total, valid and invalid counts),
 * followed by one {@code INVALID_COUNT} row per schema field except the last one.
 *
 * @param result the validator result supplying the cleansed rows and the schema
 * @return a newly created list holding the statistics rows
 */
@Override
public List<OutputRow> getProfileStats(@Nonnull final DataValidatorResult result) {
    final long[] counts = cleansedRowResultsValidationCounts(result.getCleansedRowResultRDD(), result.getSchema().length() - 1);

    // The two global counters sit at the schema-length-based positions of the counts array.
    final int schemaLength = result.getSchema().length();
    final long valid = counts[schemaLength - 1];
    final long invalid = counts[schemaLength];
    log.info("Valid count {} invalid count {}", valid, invalid);

    final List<OutputRow> rows = new ArrayList<>();
    rows.add(new OutputRow(ALL_COLUMNS, TOTAL_COUNT, Long.toString(valid + invalid)));
    rows.add(new OutputRow(ALL_COLUMNS, VALID_COUNT, Long.toString(valid)));
    rows.add(new OutputRow(ALL_COLUMNS, INVALID_COUNT, Long.toString(invalid)));

    // Per-column invalid counts; the last schema field is deliberately left out.
    final StructField[] fields = result.getSchema().fields();
    final int columnLimit = Math.min(counts.length, fields.length - 1);
    for (int column = 0; column < columnLimit; column++) {
        rows.add(new OutputRow(fields[column].name(), INVALID_COUNT, Long.toString(counts[column])));
    }
    return rows;
}
Also used : StructField(org.apache.spark.sql.types.StructField) ArrayList(java.util.ArrayList) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)

Aggregations

OutputRow (com.thinkbiganalytics.spark.dataprofiler.output.OutputRow): 18, ArrayList (java.util.ArrayList): 13, ProfilerConfiguration (com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration): 2, Nonnull (javax.annotation.Nonnull): 2, DataSet (com.thinkbiganalytics.spark.DataSet): 1, ColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.ColumnStatistics): 1, StatisticsModel (com.thinkbiganalytics.spark.dataprofiler.StatisticsModel): 1, TimestampColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics): 1, ProfilerTest (com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest): 1, DataValidatorResult (com.thinkbiganalytics.spark.datavalidator.DataValidatorResult): 1, List (java.util.List): 1, StructField (org.apache.spark.sql.types.StructField): 1, Test (org.junit.Test): 1