Search in sources:

Example 1 with ProfilerConfiguration

Use of com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration in the project kylo by Teradata.

From the class ProfilerTest, method setUp:

@Before
@SuppressWarnings("unchecked")
public void setUp() {
    // Build the shared test data set and profile it exactly once; the result
    // is cached in columnStatsMap and reused by every test method.
    if (columnStatsMap == null) {
        // Schema covering every column type the profiler must handle
        // (integral, floating point, string, date, timestamp, boolean,
        // decimal) so each statistics implementation gets exercised.
        StructField[] schemaFields = new StructField[15];
        schemaFields[0] = DataTypes.createStructField("id", DataTypes.IntegerType, true);
        schemaFields[1] = DataTypes.createStructField("firstname", DataTypes.StringType, true);
        schemaFields[2] = DataTypes.createStructField("lastname", DataTypes.StringType, true);
        schemaFields[3] = DataTypes.createStructField("age", DataTypes.IntegerType, true);
        schemaFields[4] = DataTypes.createStructField("description", DataTypes.StringType, true);
        schemaFields[5] = DataTypes.createStructField("height", DataTypes.DoubleType, true);
        schemaFields[6] = DataTypes.createStructField("joindate", DataTypes.DateType, true);
        schemaFields[7] = DataTypes.createStructField("lifemember", DataTypes.BooleanType, true);
        schemaFields[8] = DataTypes.createStructField("lastlogin", DataTypes.TimestampType, true);
        schemaFields[9] = DataTypes.createStructField("phash", DataTypes.LongType, true);
        schemaFields[10] = DataTypes.createStructField("weight", DataTypes.FloatType, true);
        schemaFields[11] = DataTypes.createStructField("credits", DataTypes.ShortType, true);
        schemaFields[12] = DataTypes.createStructField("ccode", DataTypes.ByteType, true);
        schemaFields[13] = DataTypes.createStructField("score", DataTypes.createDecimalType(7, 5), true);
        schemaFields[14] = DataTypes.createStructField("favoritepet", DataTypes.StringType, true);
        StructType schema = DataTypes.createStructType(schemaFields);
        // Ten rows containing deliberate nulls, empty strings and duplicate
        // values so null/empty/unique/duplicate counting is exercised.
        List<Row> rows = new ArrayList<>();
        rows.add(RowFactory.create(1, "Jon", "Wright", 14, "Jon::Wright", 5.85d, Date.valueOf("2010-05-04"), Boolean.TRUE, Timestamp.valueOf("2008-05-06 23:10:10"), 1456890911L, 40.2f, (short) 100, (byte) 99, new BigDecimal(String.valueOf(1.567)), "Cat"));
        rows.add(RowFactory.create(2, "Jon", "Hudson", null, "Jon::Hudson", 5.85d, Date.valueOf("1990-10-25"), null, Timestamp.valueOf("2011-01-08 11:25:45"), 7638962135L, 110.5f, (short) 100, (byte) 99, new BigDecimal(String.valueOf(8.223)), "alligator"));
        rows.add(RowFactory.create(3, "Rachael", "Hu", 40, "Rachael::Hu", 6.22d, Date.valueOf("1990-10-25"), Boolean.TRUE, Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 160.7f, (short) 1400, (byte) 99, new BigDecimal(String.valueOf(1.567)), "Alpaca"));
        rows.add(RowFactory.create(4, EMPTY_STRING, EMPTY_STRING, 40, null, null, Date.valueOf("1956-11-12"), Boolean.TRUE, Timestamp.valueOf("2008-05-06 23:10:10"), 2988626110L, null, null, (byte) 99, null, "Cat"));
        rows.add(RowFactory.create(5, "Rachael", EMPTY_STRING, 22, "Rachael::", 5.85d, Date.valueOf("2005-12-24"), Boolean.FALSE, Timestamp.valueOf("2008-05-06 23:10:10"), 8260467621L, 160.7f, (short) 100, null, new BigDecimal(String.valueOf(4.343)), "Zebra"));
        rows.add(RowFactory.create(6, "Elizabeth", "Taylor", 40, "Elizabeth::Taylor", 5.85d, Date.valueOf("2011-08-08"), null, Timestamp.valueOf("2016-01-14 14:20:20"), 8732866249L, null, (short) 1400, null, new BigDecimal(String.valueOf(4.343)), "ZEBRA"));
        rows.add(RowFactory.create(7, "Jon", "Taylor", 18, "Jon::Taylor", null, Date.valueOf("2011-08-08"), Boolean.TRUE, Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 110.5f, (short) 500, (byte) 40, new BigDecimal(String.valueOf(4.343)), null));
        rows.add(RowFactory.create(8, "Rachael", EMPTY_STRING, 22, "Rachael::", 4.37d, Date.valueOf("2011-08-08"), Boolean.FALSE, Timestamp.valueOf("2008-05-06 23:10:10"), 8782348100L, null, null, null, null, "albatross"));
        rows.add(RowFactory.create(9, EMPTY_STRING, "Edmundson Jr", 11, "::Edmundson Jr", 4.88d, Date.valueOf("2007-06-07"), Boolean.FALSE, Timestamp.valueOf("2007-03-16 08:24:37"), null, 155.3f, (short) 0, (byte) 99, new BigDecimal(String.valueOf(1.567)), EMPTY_STRING));
        rows.add(RowFactory.create(10, "Jon", EMPTY_STRING, 65, "Jon::", null, Date.valueOf("1975-04-04"), Boolean.TRUE, Timestamp.valueOf("2007-03-16 08:24:31"), null, 180.6f, (short) 5000, (byte) 2, new BigDecimal(String.valueOf(4.343)), "Cat"));
        final JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sqlContext.sparkContext());
        JavaRDD<Row> dataRDD = javaSparkContext.parallelize(rows);
        DataSet dataDF = scs.toDataSet(sqlContext.createDataFrame(dataRDD, schema));
        /* Enable to debug contents of test data */
        /*
            for (Row r: dataRDD.collect()) {
                System.out.println(r.toString());
            }
            */
        StatisticsModel statsModel = profiler.profile(dataDF, new ProfilerConfiguration());
        // Route the cast through raw Map (the unchecked conversion covered by
        // @SuppressWarnings above) and use the type-witnessed emptyMap()
        // instead of the raw Collections.EMPTY_MAP constant.
        columnStatsMap = (statsModel != null)
                         ? (Map<Integer, StandardColumnStatistics>) (Map) statsModel.getColumnStatisticsMap()
                         : Collections.<Integer, StandardColumnStatistics>emptyMap();
    }
}
Also used : StatisticsModel(com.thinkbiganalytics.spark.dataprofiler.StatisticsModel) StructType(org.apache.spark.sql.types.StructType) DataSet(com.thinkbiganalytics.spark.DataSet) ArrayList(java.util.ArrayList) BigDecimal(java.math.BigDecimal) StructField(org.apache.spark.sql.types.StructField) ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Map(java.util.Map) Before(org.junit.Before)

Example 2 with ProfilerConfiguration

Use of com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration in the project kylo by Teradata.

From the class TimestampColumnCase2Test, method writeStatistics:

/**
 * Verify that writing statistics produces the expected output rows, both for an
 * empty statistics object and after several values have been accommodated.
 */
@Test
public void writeStatistics() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    TimestampColumnStatistics columnStats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    // Before any values are accommodated, every metric is empty or zero.
    final String[] expectedWhenEmpty = {
        "OutputRow [columnName=ts, metricType=COLUMN_DATATYPE, metricValue=TimestampType]",
        "OutputRow [columnName=ts, metricType=COLUMN_NULLABLE, metricValue=true]",
        "OutputRow [columnName=ts, metricType=COLUMN_METADATA, metricValue={}]",
        "OutputRow [columnName=ts, metricType=NULL_COUNT, metricValue=0]",
        "OutputRow [columnName=ts, metricType=TOTAL_COUNT, metricValue=0]",
        "OutputRow [columnName=ts, metricType=UNIQUE_COUNT, metricValue=0]",
        "OutputRow [columnName=ts, metricType=PERC_NULL_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=PERC_UNIQUE_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=PERC_DUPLICATE_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=TOP_N_VALUES, metricValue=]",
        "OutputRow [columnName=ts, metricType=MAX_TIMESTAMP, metricValue=]",
        "OutputRow [columnName=ts, metricType=MIN_TIMESTAMP, metricValue=]"
    };
    List<OutputRow> actualRows = columnStats.getStatistics();
    Assert.assertEquals(expectedWhenEmpty.length, actualRows.size());
    for (int i = 0; i < expectedWhenEmpty.length; i++) {
        Assert.assertEquals(expectedWhenEmpty[i], actualRows.get(i).toString());
    }
    // Feed in one empty string, three distinct timestamps and one null.
    columnStats.accomodate("", 1L);
    columnStats.accomodate("2016-06-27 14:04:29", 1L);
    columnStats.accomodate("2016-06-27 14:04:30", 1L);
    columnStats.accomodate("2016-06-27 14:04:31", 1L);
    columnStats.accomodate(null, 1L);
    // Counts, percentages, top-N and min/max now reflect the five values.
    final String[] expectedWhenPopulated = {
        "OutputRow [columnName=ts, metricType=COLUMN_DATATYPE, metricValue=TimestampType]",
        "OutputRow [columnName=ts, metricType=COLUMN_NULLABLE, metricValue=true]",
        "OutputRow [columnName=ts, metricType=COLUMN_METADATA, metricValue={}]",
        "OutputRow [columnName=ts, metricType=NULL_COUNT, metricValue=1]",
        "OutputRow [columnName=ts, metricType=TOTAL_COUNT, metricValue=5]",
        "OutputRow [columnName=ts, metricType=UNIQUE_COUNT, metricValue=5]",
        "OutputRow [columnName=ts, metricType=PERC_NULL_VALUES, metricValue=20]",
        "OutputRow [columnName=ts, metricType=PERC_UNIQUE_VALUES, metricValue=100]",
        "OutputRow [columnName=ts, metricType=PERC_DUPLICATE_VALUES, metricValue=0]",
        "OutputRow [columnName=ts, metricType=TOP_N_VALUES, metricValue=1^A^A1^B2^A2016-06-27 14:04:29^A1^B3^A2016-06-27 14:04:30^A1^B]",
        "OutputRow [columnName=ts, metricType=MAX_TIMESTAMP, metricValue=2016-06-27 14:04:31.0]",
        "OutputRow [columnName=ts, metricType=MIN_TIMESTAMP, metricValue=2016-06-27 14:04:29.0]"
    };
    actualRows = columnStats.getStatistics();
    Assert.assertEquals(expectedWhenPopulated.length, actualRows.size());
    for (int i = 0; i < expectedWhenPopulated.length; i++) {
        Assert.assertEquals(expectedWhenPopulated[i], actualRows.get(i).toString());
    }
}
Also used : ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow) TimestampColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics) Test(org.junit.Test) ProfilerTest(com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest)

Example 3 with ProfilerConfiguration

Use of com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration in the project kylo by Teradata.

From the class TimestampColumnCase2Test, method getVerboseStatistics:

/**
 * Verify the verbose statistics string, both for an empty statistics object
 * and after several values have been accommodated.
 */
@Test
public void getVerboseStatistics() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    TimestampColumnStatistics columnStats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    // With no values accommodated, the string reports zeroes and blank bounds.
    final String expectedWhenEmpty =
        "{\nColumnInfo [name=ts, datatype=timestamp, nullable=true, metadata={}]\n"
        + "CommonStatistics [nullCount=0, totalCount=0, uniqueCount=0, percNullValues=0, percUniqueValues=0, percDuplicateValues=0]\n"
        + "Top 3 values [\n]\n"
        + "TimestampColumnStatistics [maxTimestamp=, minTimestamp=]\n}";
    Assert.assertEquals(expectedWhenEmpty, columnStats.getVerboseStatistics());
    // Feed in one empty string, three distinct timestamps and one null.
    columnStats.accomodate("", 1L);
    columnStats.accomodate("2016-06-27 14:04:29", 1L);
    columnStats.accomodate("2016-06-27 14:04:30", 1L);
    columnStats.accomodate("2016-06-27 14:04:31", 1L);
    columnStats.accomodate(null, 1L);
    // Counts, top-3 list and min/max now reflect the five accommodated values.
    final String expectedWhenPopulated =
        "{\nColumnInfo [name=ts, datatype=timestamp, nullable=true, metadata={}]\n"
        + "CommonStatistics [nullCount=1, totalCount=5, uniqueCount=5, percNullValues=20, percUniqueValues=100, percDuplicateValues=0]\n"
        + "Top 3 values [\n1^A^A1^B2^A2016-06-27 14:04:29^A1^B3^A2016-06-27 14:04:30^A1^B]\n"
        + "TimestampColumnStatistics [maxTimestamp=2016-06-27 14:04:31.0, minTimestamp=2016-06-27 14:04:29.0]\n}";
    Assert.assertEquals(expectedWhenPopulated, columnStats.getVerboseStatistics());
}
Also used : ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) TimestampColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics) Test(org.junit.Test) ProfilerTest(com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest)

Example 4 with ProfilerConfiguration

Use of com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration in the project kylo by Teradata.

From the class TimestampColumnCase2Test, method combine:

/**
 * Verify combining statistics: the merged min/max timestamps must track the
 * earliest and latest values seen across both statistics objects.
 */
@Test
public void combine() {
    final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration();
    // Combining a populated 'other' into an empty 'this' adopts other's bounds.
    TimestampColumnStatistics otherStats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    TimestampColumnStatistics targetStats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    otherStats.accomodate("2016-06-27 14:04:30", 1L);
    targetStats.combine(otherStats);
    Timestamp middle = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 30).getMillis());
    Assert.assertEquals(middle, targetStats.getMaxTimestamp());
    Assert.assertEquals(middle, targetStats.getMinTimestamp());
    // Combining an empty 'other' leaves the existing bounds untouched.
    otherStats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration);
    targetStats.combine(otherStats);
    Assert.assertEquals(middle, targetStats.getMaxTimestamp());
    Assert.assertEquals(middle, targetStats.getMinTimestamp());
    // A later timestamp in 'other' raises only the maximum.
    otherStats.accomodate("2016-06-27 14:04:31", 1L);
    targetStats.combine(otherStats);
    Timestamp later = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 31).getMillis());
    Assert.assertEquals(later, targetStats.getMaxTimestamp());
    Assert.assertEquals(middle, targetStats.getMinTimestamp());
    // An earlier timestamp in 'other' lowers only the minimum.
    otherStats.accomodate("2016-06-27 14:04:29", 1L);
    targetStats.combine(otherStats);
    Timestamp earlier = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 29).getMillis());
    Assert.assertEquals(later, targetStats.getMaxTimestamp());
    Assert.assertEquals(earlier, targetStats.getMinTimestamp());
}
Also used : ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) Timestamp(java.sql.Timestamp) DateTime(org.joda.time.DateTime) TimestampColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics) Test(org.junit.Test) ProfilerTest(com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest)

Example 5 with ProfilerConfiguration

Use of com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration in the project kylo by Teradata.

From the class ProfileStage, method apply:

@Nonnull
@Override
public TransformResult apply(@Nullable final TransformResult result) {
    Preconditions.checkNotNull(result);
    // Profile the data set attached to the incoming transform result.
    final StatisticsModel dataStats = profiler.profile(result.getDataSet(), new ProfilerConfiguration());
    if (dataStats == null) {
        // Nothing was produced; pass the result through unchanged.
        return result;
    }
    // Start from any profile rows already on the result, then append the
    // statistics produced for each profiled column.
    final List<OutputRow> profile = (result.getProfile() != null)
                                    ? new ArrayList<>(result.getProfile())
                                    : new ArrayList<OutputRow>(dataStats.getColumnStatisticsMap().size());
    for (final ColumnStatistics columnStats : dataStats.getColumnStatisticsMap().values()) {
        profile.addAll(columnStats.getStatistics());
    }
    result.setProfile(profile);
    return result;
}
Also used : ColumnStatistics(com.thinkbiganalytics.spark.dataprofiler.ColumnStatistics) StatisticsModel(com.thinkbiganalytics.spark.dataprofiler.StatisticsModel) ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow) Nonnull(javax.annotation.Nonnull)

Aggregations

ProfilerConfiguration (com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration)6 TimestampColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics)4 ProfilerTest (com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest)4 Test (org.junit.Test)4 StatisticsModel (com.thinkbiganalytics.spark.dataprofiler.StatisticsModel)2 OutputRow (com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)2 Timestamp (java.sql.Timestamp)2 DateTime (org.joda.time.DateTime)2 DataSet (com.thinkbiganalytics.spark.DataSet)1 ColumnStatistics (com.thinkbiganalytics.spark.dataprofiler.ColumnStatistics)1 BigDecimal (java.math.BigDecimal)1 ArrayList (java.util.ArrayList)1 Map (java.util.Map)1 Nonnull (javax.annotation.Nonnull)1 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)1 Row (org.apache.spark.sql.Row)1 StructField (org.apache.spark.sql.types.StructField)1 StructType (org.apache.spark.sql.types.StructType)1 Before (org.junit.Before)1