Search in sources:

Example 1 with TimestampColumnStatistics

use of com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics in project kylo by Teradata.

From the class TimestampColumnCase2Test, the method writeStatistics:

/**
 * Verify that getStatistics() emits the expected output rows, both for a
 * freshly-created (empty) statistics object and after accumulating values.
 */
@Test
public void writeStatistics() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    // An empty column must report zero counts and blank min/max/top-N values.
    TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    final String[] expectedWhenEmpty = {
        "OutputRow [columnName=ts, metricType=COLUMN_DATATYPE, metricValue=TimestampType]",
        "OutputRow [columnName=ts, metricType=COLUMN_NULLABLE, metricValue=true]",
        "OutputRow [columnName=ts, metricType=COLUMN_METADATA, metricValue={}]",
        "OutputRow [columnName=ts, metricType=NULL_COUNT, metricValue=0]",
        "OutputRow [columnName=ts, metricType=TOTAL_COUNT, metricValue=0]",
        "OutputRow [columnName=ts, metricType=UNIQUE_COUNT, metricValue=0]",
        "OutputRow [columnName=ts, metricType=PERC_NULL_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=PERC_UNIQUE_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=PERC_DUPLICATE_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=TOP_N_VALUES, metricValue=]",
        "OutputRow [columnName=ts, metricType=MAX_TIMESTAMP, metricValue=]",
        "OutputRow [columnName=ts, metricType=MIN_TIMESTAMP, metricValue=]"
    };
    List<OutputRow> rows = stats.getStatistics();
    Assert.assertEquals(expectedWhenEmpty.length, rows.size());
    for (int i = 0; i < expectedWhenEmpty.length; i++) {
        Assert.assertEquals(expectedWhenEmpty[i], rows.get(i).toString());
    }
    // Accumulate a mix of empty-string, distinct timestamp, and null values.
    stats.accomodate("", 1L);
    stats.accomodate("2016-06-27 14:04:29", 1L);
    stats.accomodate("2016-06-27 14:04:30", 1L);
    stats.accomodate("2016-06-27 14:04:31", 1L);
    stats.accomodate(null, 1L);
    final String[] expectedWhenPopulated = {
        "OutputRow [columnName=ts, metricType=COLUMN_DATATYPE, metricValue=TimestampType]",
        "OutputRow [columnName=ts, metricType=COLUMN_NULLABLE, metricValue=true]",
        "OutputRow [columnName=ts, metricType=COLUMN_METADATA, metricValue={}]",
        "OutputRow [columnName=ts, metricType=NULL_COUNT, metricValue=1]",
        "OutputRow [columnName=ts, metricType=TOTAL_COUNT, metricValue=5]",
        "OutputRow [columnName=ts, metricType=UNIQUE_COUNT, metricValue=5]",
        "OutputRow [columnName=ts, metricType=PERC_NULL_VALUES, metricValue=20]",
        "OutputRow [columnName=ts, metricType=PERC_UNIQUE_VALUES, metricValue=100]",
        "OutputRow [columnName=ts, metricType=PERC_DUPLICATE_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=TOP_N_VALUES, metricValue=1^A^A1^B2^A2016-06-27 14:04:29^A1^B3^A2016-06-27 14:04:30^A1^B]",
        "OutputRow [columnName=ts, metricType=MAX_TIMESTAMP, metricValue=2016-06-27 14:04:31.0]",
        "OutputRow [columnName=ts, metricType=MIN_TIMESTAMP, metricValue=2016-06-27 14:04:29.0]"
    };
    rows = stats.getStatistics();
    Assert.assertEquals(expectedWhenPopulated.length, rows.size());
    for (int i = 0; i < expectedWhenPopulated.length; i++) {
        Assert.assertEquals(expectedWhenPopulated[i], rows.get(i).toString());
    }
}
Also used : ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow) TimestampColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics) Test(org.junit.Test) ProfilerTest(com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest)

Example 2 with TimestampColumnStatistics

use of com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics in project kylo by Teradata.

From the class TimestampColumnCase2Test, the method getVerboseStatistics:

/**
 * Verify the verbose statistics string, both for an empty statistics object
 * and after accumulating a mix of values.
 */
@Test
public void getVerboseStatistics() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    // An empty column reports zero counts and blank min/max timestamps.
    TimestampColumnStatistics statistics = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    String expectedReport = "{\nColumnInfo [name=ts, datatype=timestamp, nullable=true, metadata={}]\n"
                            + "CommonStatistics [nullCount=0, totalCount=0, uniqueCount=0, percNullValues=0, percUniqueValues=0, percDuplicateValues=0]\n"
                            + "Top 3 values [\n]\n"
                            + "TimestampColumnStatistics [maxTimestamp=, minTimestamp=]\n}";
    Assert.assertEquals(expectedReport, statistics.getVerboseStatistics());
    // Accumulate an empty string, three distinct timestamps, and a null.
    statistics.accomodate("", 1L);
    statistics.accomodate("2016-06-27 14:04:29", 1L);
    statistics.accomodate("2016-06-27 14:04:30", 1L);
    statistics.accomodate("2016-06-27 14:04:31", 1L);
    statistics.accomodate(null, 1L);
    expectedReport = "{\nColumnInfo [name=ts, datatype=timestamp, nullable=true, metadata={}]\n"
                     + "CommonStatistics [nullCount=1, totalCount=5, uniqueCount=5, percNullValues=20, percUniqueValues=100, percDuplicateValues=0]\n"
                     + "Top 3 values [\n1^A^A1^B2^A2016-06-27 14:04:29^A1^B3^A2016-06-27 14:04:30^A1^B]\n"
                     + "TimestampColumnStatistics [maxTimestamp=2016-06-27 14:04:31.0, minTimestamp=2016-06-27 14:04:29.0]\n}";
    Assert.assertEquals(expectedReport, statistics.getVerboseStatistics());
}
Also used : ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) TimestampColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics) Test(org.junit.Test) ProfilerTest(com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest)

Example 3 with TimestampColumnStatistics

use of com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics in project kylo by Teradata.

From the class TimestampColumnCase2Test, the method combine:

/**
 * Verify that combine() merges the max/min timestamps of two statistics
 * objects, covering: empty receiver, empty argument, later value, earlier value.
 */
@Test
public void combine() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    // Combining into an empty receiver adopts the other side's single value as both max and min.
    TimestampColumnStatistics other = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    other.accomodate("2016-06-27 14:04:30", 1L);
    stats.combine(other);
    Timestamp middle = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 30).getMillis());
    Assert.assertEquals(middle, stats.getMaxTimestamp());
    Assert.assertEquals(middle, stats.getMinTimestamp());
    // Combining with an empty argument leaves max and min untouched.
    other = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    stats.combine(other);
    Assert.assertEquals(middle, stats.getMaxTimestamp());
    Assert.assertEquals(middle, stats.getMinTimestamp());
    // A later timestamp on the other side raises only the max.
    other.accomodate("2016-06-27 14:04:31", 1L);
    stats.combine(other);
    Timestamp later = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 31).getMillis());
    Assert.assertEquals(later, stats.getMaxTimestamp());
    Assert.assertEquals(middle, stats.getMinTimestamp());
    // An earlier timestamp on the other side lowers only the min.
    other.accomodate("2016-06-27 14:04:29", 1L);
    stats.combine(other);
    Timestamp earlier = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 29).getMillis());
    Assert.assertEquals(later, stats.getMaxTimestamp());
    Assert.assertEquals(earlier, stats.getMinTimestamp());
}
Also used : ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) Timestamp(java.sql.Timestamp) DateTime(org.joda.time.DateTime) TimestampColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics) Test(org.junit.Test) ProfilerTest(com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest)

Example 4 with TimestampColumnStatistics

use of com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics in project kylo by Teradata.

From the class StandardStatisticsModel, the method add:

/**
 * Include a column value in calculation of profile statistics for the column.
 * <p>
 * On the first value seen for a column index, a statistics accumulator matching
 * the column's Spark SQL data type is created and cached; subsequent values for
 * the same column reuse the cached accumulator.
 *
 * @param columnIndex numeric index of column (0-based)
 * @param columnValue value in column
 * @param columnCount number of times value is found in column
 * @param columnField schema information of the column
 */
public void add(Integer columnIndex, Object columnValue, Long columnCount, StructField columnField) {
    // Single map lookup; build the accumulator only on first sight of this column.
    // (Previously a new statistics object was constructed on EVERY call — one per
    // column value — and immediately discarded when the column was already known.)
    StandardColumnStatistics currentColumnStatistics = columnStatisticsMap.get(columnIndex);
    if (currentColumnStatistics == null) {
        currentColumnStatistics = newStatisticsFor(columnField);
        columnStatisticsMap.put(columnIndex, currentColumnStatistics);
    }
    currentColumnStatistics.accomodate(columnValue, columnCount);
}

/**
 * Creates the statistics accumulator matching the Spark SQL data type of the given field.
 * Unrecognized types (e.g. Hive CHAR, BINARY, ARRAY, MAP, STRUCT, UNIONTYPE) fall back
 * to {@link UnsupportedColumnStatistics} with a warning.
 *
 * @param columnField schema information of the column
 * @return a new statistics accumulator for the field's data type
 */
private StandardColumnStatistics newStatisticsFor(StructField columnField) {
    DataType columnDataType = columnField.dataType();
    switch (columnDataType.simpleString()) {
        // Hive TINYINT / SparkSQL tinyint / Java Byte
        case "tinyint":
            return new ByteColumnStatistics(columnField, profilerConfiguration);
        // Hive SMALLINT / SparkSQL smallint / Java Short
        case "smallint":
            return new ShortColumnStatistics(columnField, profilerConfiguration);
        // Hive INT / SparkSQL int / Java Integer
        case "int":
            return new IntegerColumnStatistics(columnField, profilerConfiguration);
        // Hive BIGINT / SparkSQL bigint / Java Long
        case "bigint":
            return new LongColumnStatistics(columnField, profilerConfiguration);
        // Hive FLOAT / SparkSQL float / Java Float
        case "float":
            return new FloatColumnStatistics(columnField, profilerConfiguration);
        // Hive DOUBLE / SparkSQL double / Java Double
        case "double":
            return new DoubleColumnStatistics(columnField, profilerConfiguration);
        // Hive STRING, VARCHAR / SparkSQL string / Java String
        case "string":
            return new StringColumnStatistics(columnField, profilerConfiguration);
        // Hive BOOLEAN / SparkSQL boolean / Java Boolean
        case "boolean":
            return new BooleanColumnStatistics(columnField, profilerConfiguration);
        // Hive DATE / SparkSQL date / Java java.sql.Date
        case "date":
            return new DateColumnStatistics(columnField, profilerConfiguration);
        // Hive TIMESTAMP / SparkSQL timestamp / Java java.sql.Timestamp
        case "timestamp":
            return new TimestampColumnStatistics(columnField, profilerConfiguration);
        default:
            // Hive DECIMAL / SparkSQL decimal / Java java.math.BigDecimal.
            // Matched here rather than in a case label because the simple string
            // carries scale and precision, e.g. decimal(7,5).
            if (columnDataType.simpleString().matches("decimal\\S+")) {
                return new BigDecimalColumnStatistics(columnField, profilerConfiguration);
            }
            // Hive CHAR, BINARY, ARRAY, MAP, STRUCT, UNIONTYPE
            if (log.isWarnEnabled()) {
                log.warn("[PROFILER-INFO] Unsupported data type: {}", columnDataType.simpleString());
            }
            return new UnsupportedColumnStatistics(columnField, profilerConfiguration);
    }
}
Also used : DateColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.DateColumnStatistics) StandardColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.StandardColumnStatistics) FloatColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.FloatColumnStatistics) IntegerColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.IntegerColumnStatistics) UnsupportedColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.UnsupportedColumnStatistics) BigDecimalColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.BigDecimalColumnStatistics) TimestampColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics) ShortColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.ShortColumnStatistics) StringColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.StringColumnStatistics) BooleanColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.BooleanColumnStatistics) LongColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.LongColumnStatistics) DoubleColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.DoubleColumnStatistics) DataType(org.apache.spark.sql.types.DataType) ByteColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.ByteColumnStatistics)

Example 5 with TimestampColumnStatistics

use of com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics in project kylo by Teradata.

From the class TimestampColumnCase2Test, the method accomodate:

/**
 * Verify that accomodate() tracks the max and min timestamps, covering:
 * null input, first value, a later value, and an earlier value.
 */
@Test
public void accomodate() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    // A null value leaves both max and min uninitialized.
    TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    stats.accomodate(null, 1L);
    Assert.assertNull(stats.getMaxTimestamp());
    Assert.assertNull(stats.getMinTimestamp());
    // The first real value becomes both max and min.
    stats.accomodate("2016-06-27 14:04:30", 1L);
    Timestamp middle = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 30).getMillis());
    Assert.assertEquals(middle, stats.getMaxTimestamp());
    Assert.assertEquals(middle, stats.getMinTimestamp());
    // A later value raises only the max.
    stats.accomodate("2016-06-27 14:04:31", 1L);
    Timestamp later = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 31).getMillis());
    Assert.assertEquals(later, stats.getMaxTimestamp());
    Assert.assertEquals(middle, stats.getMinTimestamp());
    // An earlier value lowers only the min.
    stats.accomodate("2016-06-27 14:04:29", 1L);
    Timestamp earlier = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 29).getMillis());
    Assert.assertEquals(later, stats.getMaxTimestamp());
    Assert.assertEquals(earlier, stats.getMinTimestamp());
}
Also used : ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) Timestamp(java.sql.Timestamp) DateTime(org.joda.time.DateTime) TimestampColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics) Test(org.junit.Test) ProfilerTest(com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest)

Aggregations

TimestampColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics)5 ProfilerConfiguration (com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration)4 ProfilerTest (com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest)4 Test (org.junit.Test)4 Timestamp (java.sql.Timestamp)2 DateTime (org.joda.time.DateTime)2 BigDecimalColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.BigDecimalColumnStatistics)1 BooleanColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.BooleanColumnStatistics)1 ByteColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.ByteColumnStatistics)1 DateColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.DateColumnStatistics)1 DoubleColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.DoubleColumnStatistics)1 FloatColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.FloatColumnStatistics)1 IntegerColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.IntegerColumnStatistics)1 LongColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.LongColumnStatistics)1 ShortColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.ShortColumnStatistics)1 StandardColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.StandardColumnStatistics)1 StringColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.StringColumnStatistics)1 UnsupportedColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.UnsupportedColumnStatistics)1 OutputRow (com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)1 DataType (org.apache.spark.sql.types.DataType)1