Use of com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics in project kylo by Teradata.
From the class TimestampColumnCase2Test, method writeStatistics.
/**
 * Verify writing statistics.
 */
@Test
public void writeStatistics() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    // Test when empty
    TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    List<OutputRow> rows = stats.getStatistics();
    Assert.assertEquals(12, rows.size());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_DATATYPE, metricValue=TimestampType]", rows.get(0).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_NULLABLE, metricValue=true]", rows.get(1).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_METADATA, metricValue={}]", rows.get(2).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=NULL_COUNT, metricValue=0]", rows.get(3).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=TOTAL_COUNT, metricValue=0]", rows.get(4).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=UNIQUE_COUNT, metricValue=0]", rows.get(5).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_NULL_VALUES, metricValue=0]", rows.get(6).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_UNIQUE_VALUES, metricValue=0]", rows.get(7).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_DUPLICATE_VALUES, metricValue=0]", rows.get(8).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=TOP_N_VALUES, metricValue=]", rows.get(9).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=MAX_TIMESTAMP, metricValue=]", rows.get(10).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=MIN_TIMESTAMP, metricValue=]", rows.get(11).toString());
    // Test with multiple values
    stats.accomodate("", 1L);
    stats.accomodate("2016-06-27 14:04:29", 1L);
    stats.accomodate("2016-06-27 14:04:30", 1L);
    stats.accomodate("2016-06-27 14:04:31", 1L);
    stats.accomodate(null, 1L);
    rows = stats.getStatistics();
    Assert.assertEquals(12, rows.size());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_DATATYPE, metricValue=TimestampType]", rows.get(0).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_NULLABLE, metricValue=true]", rows.get(1).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_METADATA, metricValue={}]", rows.get(2).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=NULL_COUNT, metricValue=1]", rows.get(3).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=TOTAL_COUNT, metricValue=5]", rows.get(4).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=UNIQUE_COUNT, metricValue=5]", rows.get(5).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_NULL_VALUES, metricValue=20]", rows.get(6).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_UNIQUE_VALUES, metricValue=100]", rows.get(7).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_DUPLICATE_VALUES, metricValue=0]", rows.get(8).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=TOP_N_VALUES, metricValue=1^A^A1^B2^A2016-06-27 14:04:29^A1^B3^A2016-06-27 14:04:30^A1^B]", rows.get(9).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=MAX_TIMESTAMP, metricValue=2016-06-27 14:04:31.0]", rows.get(10).toString());
    Assert.assertEquals("OutputRow [columnName=ts, metricType=MIN_TIMESTAMP, metricValue=2016-06-27 14:04:29.0]", rows.get(11).toString());
}
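The ^A and ^B sequences in the TOP_N_VALUES assertion above denote the control characters \u0001 and \u0002, which delimit the fields (rank, value, count) and the entries of the serialized top-N string. A minimal decoder sketch, assuming exactly those delimiters; the TopNDecoder class is hypothetical and not part of Kylo:

import java.util.ArrayList;
import java.util.List;

class TopNDecoder {
    // Splits a string such as "1\u0001\u00011\u00022\u0001<value>\u00011\u0002..."
    // into "rank: 'value' x count" lines. \u0001 (^A) separates fields;
    // \u0002 (^B) terminates each entry.
    static List<String> parse(String serialized) {
        List<String> entries = new ArrayList<>();
        for (String entry : serialized.split("\u0002")) {
            String[] fields = entry.split("\u0001", -1); // keep empty values, e.g. rank 1 above
            if (fields.length >= 3) {
                entries.add(fields[0] + ": '" + fields[1] + "' x " + fields[2]);
            }
        }
        return entries;
    }
}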
Use of com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics in project kylo by Teradata.
From the class TimestampColumnCase2Test, method getVerboseStatistics.
/**
 * Verify the statistics string.
 */
@Test
public void getVerboseStatistics() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    // Test when empty
    TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    String expected = "{\nColumnInfo [name=ts, datatype=timestamp, nullable=true, metadata={}]\n"
                      + "CommonStatistics [nullCount=0, totalCount=0, uniqueCount=0, percNullValues=0, percUniqueValues=0, percDuplicateValues=0]\n"
                      + "Top 3 values [\n]\n"
                      + "TimestampColumnStatistics [maxTimestamp=, minTimestamp=]\n}";
    Assert.assertEquals(expected, stats.getVerboseStatistics());
    // Test with multiple values
    stats.accomodate("", 1L);
    stats.accomodate("2016-06-27 14:04:29", 1L);
    stats.accomodate("2016-06-27 14:04:30", 1L);
    stats.accomodate("2016-06-27 14:04:31", 1L);
    stats.accomodate(null, 1L);
    expected = "{\nColumnInfo [name=ts, datatype=timestamp, nullable=true, metadata={}]\n"
               + "CommonStatistics [nullCount=1, totalCount=5, uniqueCount=5, percNullValues=20, percUniqueValues=100, percDuplicateValues=0]\n"
               + "Top 3 values [\n1^A^A1^B2^A2016-06-27 14:04:29^A1^B3^A2016-06-27 14:04:30^A1^B]\n"
               + "TimestampColumnStatistics [maxTimestamp=2016-06-27 14:04:31.0, minTimestamp=2016-06-27 14:04:29.0]\n}";
    Assert.assertEquals(expected, stats.getVerboseStatistics());
}
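Note how the percentages in both tests follow from the five accommodated values: one null out of five gives percNullValues = 1/5 = 20, and five distinct entries (the empty string, the three timestamps, and null) give percUniqueValues = 5/5 = 100, leaving percDuplicateValues = 0.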
Use of com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics in project kylo by Teradata.
From the class TimestampColumnCase2Test, method combine.
/**
 * Verify combining statistics.
 */
@Test
public void combine() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    // Test when 'this' is empty
    TimestampColumnStatistics other = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    other.accomodate("2016-06-27 14:04:30", 1L);
    stats.combine(other);
    Timestamp ts1 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 30).getMillis());
    Assert.assertEquals(ts1, stats.getMaxTimestamp());
    Assert.assertEquals(ts1, stats.getMinTimestamp());
    // Test when 'other' is empty
    other = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    stats.combine(other);
    Assert.assertEquals(ts1, stats.getMaxTimestamp());
    Assert.assertEquals(ts1, stats.getMinTimestamp());
    // Test when 'other' has a later timestamp
    other.accomodate("2016-06-27 14:04:31", 1L);
    stats.combine(other);
    Timestamp ts2 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 31).getMillis());
    Assert.assertEquals(ts2, stats.getMaxTimestamp());
    Assert.assertEquals(ts1, stats.getMinTimestamp());
    // Test when 'other' has an earlier timestamp
    other.accomodate("2016-06-27 14:04:29", 1L);
    stats.combine(other);
    Timestamp ts3 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 29).getMillis());
    Assert.assertEquals(ts2, stats.getMaxTimestamp());
    Assert.assertEquals(ts3, stats.getMinTimestamp());
}
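These assertions pin down how combine merges min/max: a null min or max means "no timestamp seen yet" and never overrides a real value. A minimal sketch of just that merge logic, assuming maxTimestamp and minTimestamp fields of type java.sql.Timestamp; Kylo's actual combine also merges counts and top-N state:

// Sketch of the min/max portion of a combine; not Kylo's full method.
// Assumes fields that stay null until a timestamp has been seen.
void combineMinMax(TimestampColumnStatistics other) {
    if (other.getMaxTimestamp() != null && (maxTimestamp == null || other.getMaxTimestamp().after(maxTimestamp))) {
        maxTimestamp = other.getMaxTimestamp();
    }
    if (other.getMinTimestamp() != null && (minTimestamp == null || other.getMinTimestamp().before(minTimestamp))) {
        minTimestamp = other.getMinTimestamp();
    }
}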
Use of com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics in project kylo by Teradata.
From the class StandardStatisticsModel, method add.
/**
 * Include a column value in the calculation of profile statistics for the column.
 *
 * @param columnIndex numeric index of the column (0-based)
 * @param columnValue value in the column
 * @param columnCount number of times the value occurs in the column
 * @param columnField schema information of the column
 */
public void add(Integer columnIndex, Object columnValue, Long columnCount, StructField columnField) {
    StandardColumnStatistics newColumnStatistics;
    DataType columnDataType = columnField.dataType();
    switch (columnDataType.simpleString()) {
        /*
         * Hive datatype: TINYINT
         * SparkSQL datatype: tinyint
         * Java datatype: Byte
         */
        case "tinyint":
            newColumnStatistics = new ByteColumnStatistics(columnField, profilerConfiguration);
            break;
        /*
         * Hive datatype: SMALLINT
         * SparkSQL datatype: smallint
         * Java datatype: Short
         */
        case "smallint":
            newColumnStatistics = new ShortColumnStatistics(columnField, profilerConfiguration);
            break;
        /*
         * Hive datatype: INT
         * SparkSQL datatype: int
         * Java datatype: Int
         */
        case "int":
            newColumnStatistics = new IntegerColumnStatistics(columnField, profilerConfiguration);
            break;
        /*
         * Hive datatype: BIGINT
         * SparkSQL datatype: bigint
         * Java datatype: Long
         */
        case "bigint":
            newColumnStatistics = new LongColumnStatistics(columnField, profilerConfiguration);
            break;
        /*
         * Hive datatype: FLOAT
         * SparkSQL datatype: float
         * Java datatype: Float
         */
        case "float":
            newColumnStatistics = new FloatColumnStatistics(columnField, profilerConfiguration);
            break;
        /*
         * Hive datatype: DOUBLE
         * SparkSQL datatype: double
         * Java datatype: Double
         */
        case "double":
            newColumnStatistics = new DoubleColumnStatistics(columnField, profilerConfiguration);
            break;
        /*
         * Hive datatypes: STRING, VARCHAR
         * SparkSQL datatype: string
         * Java datatype: String
         */
        case "string":
            newColumnStatistics = new StringColumnStatistics(columnField, profilerConfiguration);
            break;
        /*
         * Hive datatype: BOOLEAN
         * SparkSQL datatype: boolean
         * Java datatype: Boolean
         */
        case "boolean":
            newColumnStatistics = new BooleanColumnStatistics(columnField, profilerConfiguration);
            break;
        /*
         * Hive datatype: DATE
         * SparkSQL datatype: date
         * Java datatype: java.sql.Date
         */
        case "date":
            newColumnStatistics = new DateColumnStatistics(columnField, profilerConfiguration);
            break;
        /*
         * Hive datatype: TIMESTAMP
         * SparkSQL datatype: timestamp
         * Java datatype: java.sql.Timestamp
         */
        case "timestamp":
            newColumnStatistics = new TimestampColumnStatistics(columnField, profilerConfiguration);
            break;
        default:
            /*
             * Hive datatype: DECIMAL
             * SparkSQL datatype: decimal
             * Java datatype: java.math.BigDecimal
             *
             * Handle the decimal type here, since it comes with scale and precision, e.g. decimal(7,5).
             */
            String decimalTypeRegex = "decimal\\S+";
            if (columnDataType.simpleString().matches(decimalTypeRegex)) {
                newColumnStatistics = new BigDecimalColumnStatistics(columnField, profilerConfiguration);
            } else {
                /*
                 * Hive datatypes: CHAR, BINARY, ARRAY, MAP, STRUCT, UNIONTYPE
                 */
                if (log.isWarnEnabled()) {
                    log.warn("[PROFILER-INFO] Unsupported data type: {}", columnDataType.simpleString());
                }
                newColumnStatistics = new UnsupportedColumnStatistics(columnField, profilerConfiguration);
            }
    }
    if (!columnStatisticsMap.containsKey(columnIndex)) {
        columnStatisticsMap.put(columnIndex, newColumnStatistics);
    }
    StandardColumnStatistics currentColumnStatistics = columnStatisticsMap.get(columnIndex);
    currentColumnStatistics.accomodate(columnValue, columnCount);
}
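A hedged usage sketch of add: feed one pre-aggregated (value, count) pair per column of a Spark schema into the model. The column names, the values, and the constructor call on StandardStatisticsModel are illustrative assumptions, not code from Kylo:

// Illustrative only: the schema, the values, and the constructor are assumed.
StructField[] fields = {
    DataTypes.createStructField("id", DataTypes.IntegerType, true),
    DataTypes.createStructField("ts", DataTypes.TimestampType, true)
};
StandardStatisticsModel model = new StandardStatisticsModel(new ProfilerConfiguration()); // hypothetical constructor
Object[] values = {42, "2016-06-27 14:04:29"};
for (int i = 0; i < fields.length; i++) {
    model.add(i, values[i], 1L, fields[i]); // each value seen once in its column
}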
Use of com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics in project kylo by Teradata.
From the class TimestampColumnCase2Test, method accomodate.
/**
 * Verify accommodating column values.
 */
@Test
public void accomodate() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    // Test with a null value
    TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    stats.accomodate(null, 1L);
    Assert.assertNull(stats.getMaxTimestamp());
    Assert.assertNull(stats.getMinTimestamp());
    // Test with uninitialized max & min
    stats.accomodate("2016-06-27 14:04:30", 1L);
    Timestamp ts1 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 30).getMillis());
    Assert.assertEquals(ts1, stats.getMaxTimestamp());
    Assert.assertEquals(ts1, stats.getMinTimestamp());
    // Test with a later timestamp
    stats.accomodate("2016-06-27 14:04:31", 1L);
    Timestamp ts2 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 31).getMillis());
    Assert.assertEquals(ts2, stats.getMaxTimestamp());
    Assert.assertEquals(ts1, stats.getMinTimestamp());
    // Test with an earlier timestamp
    stats.accomodate("2016-06-27 14:04:29", 1L);
    Timestamp ts3 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 29).getMillis());
    Assert.assertEquals(ts2, stats.getMaxTimestamp());
    Assert.assertEquals(ts3, stats.getMinTimestamp());
}
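A minimal sketch of the min/max behavior these tests imply, assuming java.sql.Timestamp.valueOf parsing. The writeStatistics test above shows that null and the empty string leave min/max untouched; Kylo's real accomodate also updates counts and top-N state:

// Sketch: update min/max only when the value parses as a timestamp.
void accomodateMinMax(Object columnValue) {
    if (columnValue == null) {
        return;
    }
    try {
        java.sql.Timestamp ts = java.sql.Timestamp.valueOf(columnValue.toString());
        if (maxTimestamp == null || ts.after(maxTimestamp)) maxTimestamp = ts;
        if (minTimestamp == null || ts.before(minTimestamp)) minTimestamp = ts;
    } catch (IllegalArgumentException e) {
        // Non-timestamp strings such as "" are ignored for min/max.
    }
}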