Search in sources :

Example 1 with StringColumnSummary

use of org.apache.flink.api.java.summarize.StringColumnSummary in project flink by apache.

the class DataSetUtilsITCase method testSummarize.

@Test
public void testSummarize() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    List<Tuple8<Short, Integer, Long, Float, Double, String, Boolean, DoubleValue>> data = new ArrayList<>();
    data.add(new Tuple8<>((short) 1, 1, 100L, 0.1f, 1.012376, "hello", false, new DoubleValue(50.0)));
    data.add(new Tuple8<>((short) 2, 2, 1000L, 0.2f, 2.003453, "hello", true, new DoubleValue(50.0)));
    data.add(new Tuple8<>((short) 4, 10, 10000L, 0.2f, 75.00005, "null", true, new DoubleValue(50.0)));
    data.add(new Tuple8<>((short) 10, 4, 100L, 0.9f, 79.5, "", true, new DoubleValue(50.0)));
    data.add(new Tuple8<>((short) 5, 5, 1000L, 0.2f, 10.0000001, "a", false, new DoubleValue(50.0)));
    data.add(new Tuple8<>((short) 6, 6, 10L, 0.1f, 0.0000000000023, "", true, new DoubleValue(100.0)));
    data.add(new Tuple8<>((short) 7, 7, 1L, 0.2f, Double.POSITIVE_INFINITY, "abcdefghijklmnop", true, new DoubleValue(100.0)));
    data.add(new Tuple8<>((short) 8, 8, -100L, 0.001f, Double.NaN, "abcdefghi", true, new DoubleValue(100.0)));
    Collections.shuffle(data);
    DataSet<Tuple8<Short, Integer, Long, Float, Double, String, Boolean, DoubleValue>> ds = env.fromCollection(data);
    // call method under test
    Tuple results = DataSetUtils.summarize(ds);
    Assert.assertEquals(8, results.getArity());
    NumericColumnSummary<Short> col0Summary = results.getField(0);
    Assert.assertEquals(8, col0Summary.getNonMissingCount());
    Assert.assertEquals(1, col0Summary.getMin().shortValue());
    Assert.assertEquals(10, col0Summary.getMax().shortValue());
    Assert.assertEquals(5.375, col0Summary.getMean().doubleValue(), 0.0);
    NumericColumnSummary<Integer> col1Summary = results.getField(1);
    Assert.assertEquals(1, col1Summary.getMin().intValue());
    Assert.assertEquals(10, col1Summary.getMax().intValue());
    Assert.assertEquals(5.375, col1Summary.getMean().doubleValue(), 0.0);
    NumericColumnSummary<Long> col2Summary = results.getField(2);
    Assert.assertEquals(-100L, col2Summary.getMin().longValue());
    Assert.assertEquals(10000L, col2Summary.getMax().longValue());
    NumericColumnSummary<Float> col3Summary = results.getField(3);
    Assert.assertEquals(8, col3Summary.getTotalCount());
    Assert.assertEquals(0.001000, col3Summary.getMin().doubleValue(), 0.0000001);
    Assert.assertEquals(0.89999999, col3Summary.getMax().doubleValue(), 0.0000001);
    Assert.assertEquals(0.2376249988883501, col3Summary.getMean().doubleValue(), 0.000000000001);
    Assert.assertEquals(0.0768965488108089, col3Summary.getVariance().doubleValue(), 0.00000001);
    Assert.assertEquals(0.27730226975415995, col3Summary.getStandardDeviation().doubleValue(), 0.000000000001);
    NumericColumnSummary<Double> col4Summary = results.getField(4);
    Assert.assertEquals(6, col4Summary.getNonMissingCount());
    Assert.assertEquals(2, col4Summary.getMissingCount());
    Assert.assertEquals(0.0000000000023, col4Summary.getMin().doubleValue(), 0.0);
    Assert.assertEquals(79.5, col4Summary.getMax().doubleValue(), 0.000000000001);
    StringColumnSummary col5Summary = results.getField(5);
    Assert.assertEquals(8, col5Summary.getTotalCount());
    Assert.assertEquals(0, col5Summary.getNullCount());
    Assert.assertEquals(8, col5Summary.getNonNullCount());
    Assert.assertEquals(2, col5Summary.getEmptyCount());
    Assert.assertEquals(0, col5Summary.getMinLength().intValue());
    Assert.assertEquals(16, col5Summary.getMaxLength().intValue());
    Assert.assertEquals(5.0, col5Summary.getMeanLength().doubleValue(), 0.0001);
    BooleanColumnSummary col6Summary = results.getField(6);
    Assert.assertEquals(8, col6Summary.getTotalCount());
    Assert.assertEquals(2, col6Summary.getFalseCount());
    Assert.assertEquals(6, col6Summary.getTrueCount());
    Assert.assertEquals(0, col6Summary.getNullCount());
    NumericColumnSummary<Double> col7Summary = results.getField(7);
    Assert.assertEquals(100.0, col7Summary.getMax().doubleValue(), 0.00001);
    Assert.assertEquals(50.0, col7Summary.getMin().doubleValue(), 0.00001);
}
Also used : ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) ArrayList(java.util.ArrayList) StringColumnSummary(org.apache.flink.api.java.summarize.StringColumnSummary) BooleanColumnSummary(org.apache.flink.api.java.summarize.BooleanColumnSummary) Tuple8(org.apache.flink.api.java.tuple.Tuple8) DoubleValue(org.apache.flink.types.DoubleValue) Tuple(org.apache.flink.api.java.tuple.Tuple) Test(org.junit.Test)

Example 2 with StringColumnSummary

use of org.apache.flink.api.java.summarize.StringColumnSummary in project flink by apache.

the class StringSummaryAggregatorTest method testAllNullStrings.

@Test
public void testAllNullStrings() {
    StringColumnSummary summary = summarize(null, null, null, null);
    Assert.assertEquals(4, summary.getTotalCount());
    Assert.assertEquals(4, summary.getNullCount());
    Assert.assertEquals(0, summary.getNonNullCount());
    Assert.assertEquals(0, summary.getEmptyCount());
    Assert.assertNull(summary.getMinLength());
    Assert.assertNull(summary.getMaxLength());
    Assert.assertNull(summary.getMeanLength());
}
Also used : StringColumnSummary(org.apache.flink.api.java.summarize.StringColumnSummary) Test(org.junit.Test)

Example 3 with StringColumnSummary

use of org.apache.flink.api.java.summarize.StringColumnSummary in project flink by apache.

the class StringSummaryAggregatorTest method testMixedGroup.

@Test
public void testMixedGroup() {
    StringColumnSummary summary = summarize("abc", "", null, "  ", "defghi", "foo", null, null, "", " ");
    Assert.assertEquals(10, summary.getTotalCount());
    Assert.assertEquals(3, summary.getNullCount());
    Assert.assertEquals(7, summary.getNonNullCount());
    Assert.assertEquals(2, summary.getEmptyCount());
    Assert.assertEquals(0, summary.getMinLength().intValue());
    Assert.assertEquals(6, summary.getMaxLength().intValue());
    Assert.assertEquals(2.142857, summary.getMeanLength().doubleValue(), 0.001);
}
Also used : StringColumnSummary(org.apache.flink.api.java.summarize.StringColumnSummary) Test(org.junit.Test)

Example 4 with StringColumnSummary

use of org.apache.flink.api.java.summarize.StringColumnSummary in project flink by apache.

the class StringSummaryAggregatorTest method testAllWithValues.

@Test
public void testAllWithValues() {
    StringColumnSummary summary = summarize("cat", "hat", "dog", "frog");
    Assert.assertEquals(4, summary.getTotalCount());
    Assert.assertEquals(0, summary.getNullCount());
    Assert.assertEquals(4, summary.getNonNullCount());
    Assert.assertEquals(0, summary.getEmptyCount());
    Assert.assertEquals(3, summary.getMinLength().intValue());
    Assert.assertEquals(4, summary.getMaxLength().intValue());
    Assert.assertEquals(3.25, summary.getMeanLength().doubleValue(), 0.0);
}
Also used : StringColumnSummary(org.apache.flink.api.java.summarize.StringColumnSummary) Test(org.junit.Test)

Aggregations

StringColumnSummary (org.apache.flink.api.java.summarize.StringColumnSummary)4 Test (org.junit.Test)4 ArrayList (java.util.ArrayList)1 ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment)1 BooleanColumnSummary (org.apache.flink.api.java.summarize.BooleanColumnSummary)1 Tuple (org.apache.flink.api.java.tuple.Tuple)1 Tuple8 (org.apache.flink.api.java.tuple.Tuple8)1 DoubleValue (org.apache.flink.types.DoubleValue)1