Use of com.thinkbiganalytics.spark.dataprofiler.columns.IntegerColumnStatistics in project kylo by Teradata.
The add method of the StandardStatisticsModel class:
/**
 * Include a column value in the calculation of profile statistics for the column
 *
 * @param columnIndex numeric index of the column (0-based)
 * @param columnValue value in the column
 * @param columnCount number of times the value occurs in the column
 * @param columnField schema information for the column
 */
public void add(Integer columnIndex, Object columnValue, Long columnCount, StructField columnField) {
    StandardColumnStatistics newColumnStatistics;
    DataType columnDataType = columnField.dataType();

    switch (columnDataType.simpleString()) {
        /*
         * Hive datatype: TINYINT
         * SparkSQL datatype: tinyint
         * Java datatype: Byte
         */
        case "tinyint":
            newColumnStatistics = new ByteColumnStatistics(columnField, profilerConfiguration);
            break;

        /*
         * Hive datatype: SMALLINT
         * SparkSQL datatype: smallint
         * Java datatype: Short
         */
        case "smallint":
            newColumnStatistics = new ShortColumnStatistics(columnField, profilerConfiguration);
            break;

        /*
         * Hive datatype: INT
         * SparkSQL datatype: int
         * Java datatype: Integer
         */
        case "int":
            newColumnStatistics = new IntegerColumnStatistics(columnField, profilerConfiguration);
            break;

        /*
         * Hive datatype: BIGINT
         * SparkSQL datatype: bigint
         * Java datatype: Long
         */
        case "bigint":
            newColumnStatistics = new LongColumnStatistics(columnField, profilerConfiguration);
            break;

        /*
         * Hive datatype: FLOAT
         * SparkSQL datatype: float
         * Java datatype: Float
         */
        case "float":
            newColumnStatistics = new FloatColumnStatistics(columnField, profilerConfiguration);
            break;

        /*
         * Hive datatype: DOUBLE
         * SparkSQL datatype: double
         * Java datatype: Double
         */
        case "double":
            newColumnStatistics = new DoubleColumnStatistics(columnField, profilerConfiguration);
            break;

        /*
         * Hive datatypes: STRING, VARCHAR
         * SparkSQL datatype: string
         * Java datatype: String
         */
        case "string":
            newColumnStatistics = new StringColumnStatistics(columnField, profilerConfiguration);
            break;

        /*
         * Hive datatype: BOOLEAN
         * SparkSQL datatype: boolean
         * Java datatype: Boolean
         */
        case "boolean":
            newColumnStatistics = new BooleanColumnStatistics(columnField, profilerConfiguration);
            break;

        /*
         * Hive datatype: DATE
         * SparkSQL datatype: date
         * Java datatype: java.sql.Date
         */
        case "date":
            newColumnStatistics = new DateColumnStatistics(columnField, profilerConfiguration);
            break;

        /*
         * Hive datatype: TIMESTAMP
         * SparkSQL datatype: timestamp
         * Java datatype: java.sql.Timestamp
         */
        case "timestamp":
            newColumnStatistics = new TimestampColumnStatistics(columnField, profilerConfiguration);
            break;

        default:
            /*
             * Hive datatype: DECIMAL
             * SparkSQL datatype: decimal
             * Java datatype: java.math.BigDecimal
             *
             * Handle the decimal type here since its simpleString carries precision and scale, e.g. decimal(7,5)
             */
            String decimalTypeRegex = "decimal\\S+";
            if (columnDataType.simpleString().matches(decimalTypeRegex)) {
                newColumnStatistics = new BigDecimalColumnStatistics(columnField, profilerConfiguration);
            } else {
                /*
                 * Hive datatypes: CHAR, BINARY, ARRAY, MAP, STRUCT, UNIONTYPE
                 */
                if (log.isWarnEnabled()) {
                    log.warn("[PROFILER-INFO] Unsupported data type: {}", columnDataType.simpleString());
                }
                newColumnStatistics = new UnsupportedColumnStatistics(columnField, profilerConfiguration);
            }
    }

    if (!columnStatisticsMap.containsKey(columnIndex)) {
        columnStatisticsMap.put(columnIndex, newColumnStatistics);
    }
    StandardColumnStatistics currentColumnStatistics = columnStatisticsMap.get(columnIndex);
    currentColumnStatistics.accomodate(columnValue, columnCount);
}
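For context, a minimal usage sketch showing how one (columnIndex, value, count) observation could be fed into the model. The driver class, method, and field name are hypothetical; only the add signature above comes from the snippet, and the model is taken as a parameter because the StandardStatisticsModel constructor is not shown here.

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;

public class ProfileOneColumnSketch {

    // Hypothetical driver; how the model itself is constructed in kylo is not
    // shown in the snippet above, so it is passed in here.
    static void profileSample(StandardStatisticsModel model) {
        // Schema for column 0: a nullable int column named "age".
        StructField ageField = DataTypes.createStructField("age", DataTypes.IntegerType, true);

        // The value 42 occurred 3 times in column 0. Because the type's
        // simpleString is "int", the first call creates an
        // IntegerColumnStatistics for the column and accumulates into it.
        model.add(0, 42, 3L, ageField);
    }
}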
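The regex in the default branch is needed because Spark's simpleString for a decimal type embeds precision and scale, so a literal case label "decimal" would not match. A standalone check of that Spark behavior (class name hypothetical):

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.DecimalType;

public class DecimalSimpleStringCheck {

    public static void main(String[] args) {
        DecimalType decimalType = DataTypes.createDecimalType(7, 5);

        // simpleString() renders as "decimal(7,5)", which matches the
        // profiler's regex but not a plain string comparison with "decimal".
        String simple = decimalType.simpleString();
        System.out.println(simple);                         // decimal(7,5)
        System.out.println(simple.matches("decimal\\S+"));  // true
    }
}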