Search in sources :

Example 16 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

In class ShortColumnStatistics, method getStatistics:

/**
 * Builds the statistics rows for the output result table.
 *
 * <p>The list is first handed to {@code writeStatisticsCommon}, after which one row is
 * appended for each of MAX, MIN, SUM, MEAN, STDDEV and VARIANCE, in that order.</p>
 *
 * <p>When {@code allNulls()} returns {@code true}, the numeric statistics are reset to zero
 * before being reported. Note that this overwrites the stored values, not just the output.</p>
 *
 * @return the statistics rows for this column
 */
@Override
public List<OutputRow> getStatistics() {
    final List<OutputRow> statistics = new ArrayList<>();
    writeStatisticsCommon(statistics);

    // Report zeroes rather than whatever values were left behind when every value was null
    if (allNulls()) {
        variance = 0;
        stddev = 0;
        mean = 0;
        sum = 0;
        max = 0;
        min = 0;
    }

    final String columnName = columnField.name();

    final String maxValue = String.valueOf(max);
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.MAX), maxValue));

    final String minValue = String.valueOf(min);
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.MIN), minValue));

    final String sumValue = String.valueOf(sum);
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.SUM), sumValue));

    final String meanValue = String.valueOf(mean);
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.MEAN), meanValue));

    final String stddevValue = String.valueOf(stddev);
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.STDDEV), stddevValue));

    final String varianceValue = String.valueOf(variance);
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.VARIANCE), varianceValue));

    return statistics;
}
Also used : ArrayList(java.util.ArrayList) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)

Example 17 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

In class ValidationStage, method apply:

/**
 * Validates the data set of the given transform result and records the outcome on it.
 *
 * <p>The per-row validation results are collected into a list and stored on the result.
 * The validator's profile statistics are then appended to a copy of any profile already
 * present, and that merged list replaces the result's profile.</p>
 *
 * @param result the transform result to validate; must not be {@code null} despite the annotation
 * @return the same {@code result} instance, updated with validation results and profile
 * @throws NullPointerException if {@code result} is {@code null} (assuming Guava's {@code Preconditions})
 */
@Nonnull
@Override
public TransformResult apply(@Nullable final TransformResult result) {
    Preconditions.checkNotNull(result);

    // Run the validator against the data set using the policies derived from its schema
    final DataValidatorResult validation = validator.validate(result.getDataSet(), getPolicyMap(result.getDataSet().schema()));

    // Attach the per-row validation results
    final List<List<TransformValidationResult>> validationRows = validation.getCleansedRowResultRDD()
        .map(new ListTransformValidationResults())
        .collect();
    result.setValidationResults(validationRows);

    // Merge the existing profile (if any) with the validator's statistics
    final List<OutputRow> mergedProfile = new ArrayList<>();
    if (result.getProfile() != null) {
        mergedProfile.addAll(result.getProfile());
    }
    mergedProfile.addAll(validator.getProfileStats(validation));
    result.setProfile(mergedProfile);

    return result;
}
Also used : DataValidatorResult(com.thinkbiganalytics.spark.datavalidator.DataValidatorResult) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow) ArrayList(java.util.ArrayList) List(java.util.List) Nonnull(javax.annotation.Nonnull)

Example 18 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

In class StandardDataValidator, method saveProfileToTable:

/**
 * Writes the profile statistics of a validation result into a partition of a Hive table.
 *
 * <p>The statistics are registered as a temporary table named after {@code tableName} plus the
 * current time in milliseconds, and then copied into the target partition with an
 * {@code INSERT OVERWRITE} statement.</p>
 *
 * <p>NOTE(review): {@code partition} is placed into the SQL text unescaped — confirm callers only
 * pass trusted values.</p>
 *
 * @param databaseName the database holding the target table
 * @param tableName    the target table name
 * @param partition    the value for the {@code processing_dttm} partition
 * @param result       the validation result whose profile statistics are saved
 * @param hiveContext  the Hive context used to build the data set and run the insert
 */
@Override
public void saveProfileToTable(@Nonnull final String databaseName, @Nonnull final String tableName, @Nonnull final String partition, @Nonnull final DataValidatorResult result, @Nonnull final HiveContext hiveContext) {
    try {
        // Stage the rows in a temporary table: writing directly to the partition from a spark dataframe doesn't work.
        final String stagingTable = tableName + "_" + System.currentTimeMillis();

        // Refactor this into something common with profile table
        @SuppressWarnings("squid:S2095") final JavaSparkContext javaContext = JavaSparkContext.fromSparkContext(hiveContext.sparkContext());
        final JavaRDD<OutputRow> profileRows = javaContext.parallelize(getProfileStats(result));
        final DataSet profileData = scs.toDataSet(hiveContext, profileRows, OutputRow.class);
        profileData.registerTempTable(stagingTable);

        // Copy the staged rows into the requested partition of the target table
        final String targetIdentifier = HiveUtils.quoteIdentifier(databaseName, tableName);
        final String stagingIdentifier = HiveUtils.quoteIdentifier(stagingTable);
        final String insertStatement = "INSERT OVERWRITE TABLE " + targetIdentifier
                                       + " PARTITION (processing_dttm='" + partition + "')"
                                       + " SELECT columnname, metrictype, metricvalue FROM " + stagingIdentifier;
        log.info("Writing profile stats {}", insertStatement);
        scs.sql(hiveContext, insertStatement);
    } catch (final Exception e) {
        log.error("Failed to insert validation stats", e);
        throw Throwables.propagate(e);
    }
}
Also used : DataSet(com.thinkbiganalytics.spark.DataSet) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)

Aggregations

OutputRow (com.thinkbiganalytics.spark.dataprofiler.output.OutputRow): 18, ArrayList (java.util.ArrayList): 13, ProfilerConfiguration (com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration): 2, Nonnull (javax.annotation.Nonnull): 2, DataSet (com.thinkbiganalytics.spark.DataSet): 1, ColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.ColumnStatistics): 1, StatisticsModel (com.thinkbiganalytics.spark.dataprofiler.StatisticsModel): 1, TimestampColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics): 1, ProfilerTest (com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest): 1, DataValidatorResult (com.thinkbiganalytics.spark.datavalidator.DataValidatorResult): 1, List (java.util.List): 1, StructField (org.apache.spark.sql.types.StructField): 1, Test (org.junit.Test): 1