Search in sources :

Example 1 with BooleanColumnStatistics

Use of com.thinkbiganalytics.spark.dataprofiler.columns.BooleanColumnStatistics in the project kylo by Teradata.

From the class StandardStatisticsModel, method add.

/**
 * Include a column value in the calculation of profile statistics for the column.
 *
 * <p>The first time a column index is seen, a statistics object matching the column's data type
 * is created and registered in {@code columnStatisticsMap}. Every call then hands the value and
 * its count to the statistics object registered for that index.
 *
 * @param columnIndex numeric index of column (0-based)
 * @param columnValue value in column
 * @param columnCount number of times value is found in column
 * @param columnField schema information of the column
 */
public void add(Integer columnIndex, Object columnValue, Long columnCount, StructField columnField) {
    // Build a statistics object only for a column that has not been registered yet. Building one
    // on every call would allocate a throwaway object per value and, for unsupported types,
    // repeat the warning below for every value instead of once per column.
    if (!columnStatisticsMap.containsKey(columnIndex)) {
        columnStatisticsMap.put(columnIndex, createColumnStatistics(columnField));
    }
    StandardColumnStatistics currentColumnStatistics = columnStatisticsMap.get(columnIndex);
    currentColumnStatistics.accomodate(columnValue, columnCount);
}

/**
 * Create the statistics object that matches the SparkSQL data type of the given column.
 *
 * <p>Dispatch is done on {@code DataType.simpleString()}. Types that are neither listed in the
 * switch nor a decimal get an {@link UnsupportedColumnStatistics} and a warning is logged.
 *
 * @param columnField schema information of the column
 * @return a new statistics object for the column's data type
 */
private StandardColumnStatistics createColumnStatistics(StructField columnField) {
    String typeName = columnField.dataType().simpleString();
    switch (typeName) {
        // Hive TINYINT -> SparkSQL tinyint -> Java Byte
        case "tinyint":
            return new ByteColumnStatistics(columnField, profilerConfiguration);
        // Hive SMALLINT -> SparkSQL smallint -> Java Short
        case "smallint":
            return new ShortColumnStatistics(columnField, profilerConfiguration);
        // Hive INT -> SparkSQL int -> Java Int
        case "int":
            return new IntegerColumnStatistics(columnField, profilerConfiguration);
        // Hive BIGINT -> SparkSQL bigint -> Java Long
        case "bigint":
            return new LongColumnStatistics(columnField, profilerConfiguration);
        // Hive FLOAT -> SparkSQL float -> Java Float
        case "float":
            return new FloatColumnStatistics(columnField, profilerConfiguration);
        // Hive DOUBLE -> SparkSQL double -> Java Double
        case "double":
            return new DoubleColumnStatistics(columnField, profilerConfiguration);
        // Hive STRING, VARCHAR -> SparkSQL string -> Java String
        case "string":
            return new StringColumnStatistics(columnField, profilerConfiguration);
        // Hive BOOLEAN -> SparkSQL boolean -> Java Boolean
        case "boolean":
            return new BooleanColumnStatistics(columnField, profilerConfiguration);
        // Hive DATE -> SparkSQL date -> java.sql.Date
        case "date":
            return new DateColumnStatistics(columnField, profilerConfiguration);
        // Hive TIMESTAMP -> SparkSQL timestamp -> java.sql.Timestamp
        case "timestamp":
            return new TimestampColumnStatistics(columnField, profilerConfiguration);
        default:
            /*
             * Hive DECIMAL -> SparkSQL decimal -> java.math.BigDecimal
             *
             * Handled here rather than as a case label because the type name carries
             * precision and scale, e.g. decimal(7,5).
             */
            if (typeName.matches("decimal\\S+")) {
                return new BigDecimalColumnStatistics(columnField, profilerConfiguration);
            }
            // Remaining Hive datatypes: CHAR, BINARY, ARRAY, MAP, STRUCT, UNIONTYPE
            if (log.isWarnEnabled()) {
                log.warn("[PROFILER-INFO] Unsupported data type: {}", typeName);
            }
            return new UnsupportedColumnStatistics(columnField, profilerConfiguration);
    }
}
Also used : DateColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.DateColumnStatistics) StandardColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.StandardColumnStatistics) FloatColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.FloatColumnStatistics) IntegerColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.IntegerColumnStatistics) UnsupportedColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.UnsupportedColumnStatistics) BigDecimalColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.BigDecimalColumnStatistics) TimestampColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics) ShortColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.ShortColumnStatistics) StringColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.StringColumnStatistics) BooleanColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.BooleanColumnStatistics) LongColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.LongColumnStatistics) DoubleColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.DoubleColumnStatistics) DataType(org.apache.spark.sql.types.DataType) ByteColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.ByteColumnStatistics)

Aggregations

BigDecimalColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.BigDecimalColumnStatistics)1 BooleanColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.BooleanColumnStatistics)1 ByteColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.ByteColumnStatistics)1 DateColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.DateColumnStatistics)1 DoubleColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.DoubleColumnStatistics)1 FloatColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.FloatColumnStatistics)1 IntegerColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.IntegerColumnStatistics)1 LongColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.LongColumnStatistics)1 ShortColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.ShortColumnStatistics)1 StandardColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.StandardColumnStatistics)1 StringColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.StringColumnStatistics)1 TimestampColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics)1 UnsupportedColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.UnsupportedColumnStatistics)1 DataType (org.apache.spark.sql.types.DataType)1