Search in sources :

Example 1 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

the class StandardColumnStatistics method writeStatisticsCommon.

/**
 * Appends the common statistics rows for this column to the output result table.
 * <p>
 * Rows are emitted in a fixed order: the column's schema information, the null / total / unique
 * counts, the null / unique / duplicate percentages (formatted with {@code df}), and finally
 * whatever {@code writeTopNInformation} contributes.
 *
 * @param rows the list that the generated rows are appended to
 */
void writeStatisticsCommon(@Nonnull final List<OutputRow> rows) {
    // Schema rows always lead the common section
    writeColumnSchemaInformation(rows);

    final String columnName = columnField.name();

    // Raw counts
    rows.add(new OutputRow(columnName, String.valueOf(MetricType.NULL_COUNT), String.valueOf(nullCount)));
    rows.add(new OutputRow(columnName, String.valueOf(MetricType.TOTAL_COUNT), String.valueOf(totalCount)));
    rows.add(new OutputRow(columnName, String.valueOf(MetricType.UNIQUE_COUNT), String.valueOf(uniqueCount)));

    // Percentages go through the shared formatter rather than String.valueOf
    rows.add(new OutputRow(columnName, String.valueOf(MetricType.PERC_NULL_VALUES), df.format(percNullValues)));
    rows.add(new OutputRow(columnName, String.valueOf(MetricType.PERC_UNIQUE_VALUES), df.format(percUniqueValues)));
    rows.add(new OutputRow(columnName, String.valueOf(MetricType.PERC_DUPLICATE_VALUES), df.format(percDuplicateValues)));

    // Top-N rows close the common section
    writeTopNInformation(rows);
}
Also used : OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)

Example 2 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

the class StandardColumnStatistics method writeColumnSchemaInformation.

/**
 * Appends the column's schema information to the output result table: one row each for the
 * data type, the nullable flag and the metadata of {@code columnField}, in that order.
 *
 * @param rows the list that the generated rows are appended to
 */
private void writeColumnSchemaInformation(@Nonnull final List<OutputRow> rows) {
    final String columnName = columnField.name();

    rows.add(new OutputRow(columnName, String.valueOf(MetricType.COLUMN_DATATYPE), String.valueOf(columnField.dataType())));
    rows.add(new OutputRow(columnName, String.valueOf(MetricType.COLUMN_NULLABLE), String.valueOf(columnField.nullable())));
    rows.add(new OutputRow(columnName, String.valueOf(MetricType.COLUMN_METADATA), String.valueOf(columnField.metadata())));
}
Also used : OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)

Example 3 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

the class StringColumnStatistics method getStatistics.

/**
 * Builds the statistics rows for this string column for the output result table.
 * <p>
 * The common statistics are written first, followed by the string-specific metrics: length
 * bounds, longest / shortest strings, empty-value count and percentage, and the case-sensitive
 * and case-insensitive minimum / maximum strings.
 *
 * @return a new list holding the common rows followed by the string-specific rows
 */
@Override
public List<OutputRow> getStatistics() {
    final List<OutputRow> statistics = new ArrayList<>();

    // Common statistics
    writeStatisticsCommon(statistics);

    final String columnName = columnField.name();

    // Length-related metrics
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.MAX_LENGTH), String.valueOf(maxLength)));
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.MIN_LENGTH), String.valueOf(minLength)));
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.LONGEST_STRING), String.valueOf(longestString)));
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.SHORTEST_STRING), String.valueOf(shortestString)));

    // Empty-value metrics; the percentage is formatted with the shared formatter
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.EMPTY_COUNT), String.valueOf(emptyCount)));
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.PERC_EMPTY_VALUES), df.format(percEmptyValues)));

    // Minimum / maximum strings, case-sensitive then case-insensitive
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.MIN_STRING_CASE), String.valueOf(minStringCase)));
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.MAX_STRING_CASE), String.valueOf(maxStringCase)));
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.MIN_STRING_ICASE), String.valueOf(minStringICase)));
    statistics.add(new OutputRow(columnName, String.valueOf(MetricType.MAX_STRING_ICASE), String.valueOf(maxStringICase)));

    return statistics;
}
Also used : ArrayList(java.util.ArrayList) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)

Example 4 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

the class TimestampColumnStatistics method getStatistics.

/**
 * Builds the statistics rows for this timestamp column for the output table.
 * <p>
 * The common statistics are written first, followed by the maximum and minimum timestamps.
 * A timestamp that is {@code null} is reported as an empty string.
 *
 * @return a new list holding the common rows followed by the timestamp-specific rows
 */
@Override
public List<OutputRow> getStatistics() {
    final List<OutputRow> statistics = new ArrayList<>();

    // Common statistics
    writeStatisticsCommon(statistics);

    // Maximum timestamp, or empty when none is set
    final String maxValue;
    if (maxTimestamp == null) {
        maxValue = "";
    } else {
        maxValue = maxTimestamp.toString();
    }
    statistics.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MAX_TIMESTAMP), maxValue));

    // Minimum timestamp, or empty when none is set
    final String minValue;
    if (minTimestamp == null) {
        minValue = "";
    } else {
        minValue = minTimestamp.toString();
    }
    statistics.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MIN_TIMESTAMP), minValue));

    return statistics;
}
Also used : ArrayList(java.util.ArrayList) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)

Example 5 with OutputRow

use of com.thinkbiganalytics.spark.dataprofiler.output.OutputRow in project kylo by Teradata.

the class TimestampColumnCase2Test method writeStatistics.

/**
 * Verifies the rows produced by {@code TimestampColumnStatistics.getStatistics()}, first for a
 * freshly created instance and then after five values (an empty string, three timestamps and a
 * {@code null}) have been accommodated.
 */
@Test
public void writeStatistics() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    final TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);

    // Test when empty
    final String[] expectedWhenEmpty = {
        "OutputRow [columnName=ts, metricType=COLUMN_DATATYPE, metricValue=TimestampType]",
        "OutputRow [columnName=ts, metricType=COLUMN_NULLABLE, metricValue=true]",
        "OutputRow [columnName=ts, metricType=COLUMN_METADATA, metricValue={}]",
        "OutputRow [columnName=ts, metricType=NULL_COUNT, metricValue=0]",
        "OutputRow [columnName=ts, metricType=TOTAL_COUNT, metricValue=0]",
        "OutputRow [columnName=ts, metricType=UNIQUE_COUNT, metricValue=0]",
        "OutputRow [columnName=ts, metricType=PERC_NULL_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=PERC_UNIQUE_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=PERC_DUPLICATE_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=TOP_N_VALUES, metricValue=]",
        "OutputRow [columnName=ts, metricType=MAX_TIMESTAMP, metricValue=]",
        "OutputRow [columnName=ts, metricType=MIN_TIMESTAMP, metricValue=]"
    };
    List<OutputRow> rows = stats.getStatistics();
    Assert.assertEquals(12, rows.size());
    for (int i = 0; i < expectedWhenEmpty.length; i++) {
        Assert.assertEquals(expectedWhenEmpty[i], rows.get(i).toString());
    }

    // Test with multiple values
    stats.accomodate("", 1L);
    stats.accomodate("2016-06-27 14:04:29", 1L);
    stats.accomodate("2016-06-27 14:04:30", 1L);
    stats.accomodate("2016-06-27 14:04:31", 1L);
    stats.accomodate(null, 1L);

    final String[] expectedWithValues = {
        "OutputRow [columnName=ts, metricType=COLUMN_DATATYPE, metricValue=TimestampType]",
        "OutputRow [columnName=ts, metricType=COLUMN_NULLABLE, metricValue=true]",
        "OutputRow [columnName=ts, metricType=COLUMN_METADATA, metricValue={}]",
        "OutputRow [columnName=ts, metricType=NULL_COUNT, metricValue=1]",
        "OutputRow [columnName=ts, metricType=TOTAL_COUNT, metricValue=5]",
        "OutputRow [columnName=ts, metricType=UNIQUE_COUNT, metricValue=5]",
        "OutputRow [columnName=ts, metricType=PERC_NULL_VALUES, metricValue=20]",
        "OutputRow [columnName=ts, metricType=PERC_UNIQUE_VALUES, metricValue=100]",
        "OutputRow [columnName=ts, metricType=PERC_DUPLICATE_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=TOP_N_VALUES, metricValue=1^A^A1^B2^A2016-06-27 14:04:29^A1^B3^A2016-06-27 14:04:30^A1^B]",
        "OutputRow [columnName=ts, metricType=MAX_TIMESTAMP, metricValue=2016-06-27 14:04:31.0]",
        "OutputRow [columnName=ts, metricType=MIN_TIMESTAMP, metricValue=2016-06-27 14:04:29.0]"
    };
    rows = stats.getStatistics();
    Assert.assertEquals(12, rows.size());
    for (int i = 0; i < expectedWithValues.length; i++) {
        Assert.assertEquals(expectedWithValues[i], rows.get(i).toString());
    }
}
Also used : ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow) TimestampColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics) Test(org.junit.Test) ProfilerTest(com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest)

Aggregations

OutputRow (com.thinkbiganalytics.spark.dataprofiler.output.OutputRow): 18; ArrayList (java.util.ArrayList): 13; ProfilerConfiguration (com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration): 2; Nonnull (javax.annotation.Nonnull): 2; DataSet (com.thinkbiganalytics.spark.DataSet): 1; ColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.ColumnStatistics): 1; StatisticsModel (com.thinkbiganalytics.spark.dataprofiler.StatisticsModel): 1; TimestampColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics): 1; ProfilerTest (com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest): 1; DataValidatorResult (com.thinkbiganalytics.spark.datavalidator.DataValidatorResult): 1; List (java.util.List): 1; StructField (org.apache.spark.sql.types.StructField): 1; Test (org.junit.Test): 1