Example usage of org.apache.flink.api.java.tuple.Tuple in the Apache Flink project — class DataSetUtilsITCase, method testSummarize.
@Test
public void testSummarize() throws Exception {
    // Summarizing a DataSet of Tuple8 rows must produce one column summary per field,
    // with per-type statistics (numeric, string, boolean) that are independent of row order.
    final ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
    List<Tuple8<Short, Integer, Long, Float, Double, String, Boolean, DoubleValue>> inputRows = new ArrayList<>();
    inputRows.add(new Tuple8<>((short) 1, 1, 100L, 0.1f, 1.012376, "hello", false, new DoubleValue(50.0)));
    inputRows.add(new Tuple8<>((short) 2, 2, 1000L, 0.2f, 2.003453, "hello", true, new DoubleValue(50.0)));
    inputRows.add(new Tuple8<>((short) 4, 10, 10000L, 0.2f, 75.00005, "null", true, new DoubleValue(50.0)));
    inputRows.add(new Tuple8<>((short) 10, 4, 100L, 0.9f, 79.5, "", true, new DoubleValue(50.0)));
    inputRows.add(new Tuple8<>((short) 5, 5, 1000L, 0.2f, 10.0000001, "a", false, new DoubleValue(50.0)));
    inputRows.add(new Tuple8<>((short) 6, 6, 10L, 0.1f, 0.0000000000023, "", true, new DoubleValue(100.0)));
    inputRows.add(new Tuple8<>((short) 7, 7, 1L, 0.2f, Double.POSITIVE_INFINITY, "abcdefghijklmnop", true, new DoubleValue(100.0)));
    inputRows.add(new Tuple8<>((short) 8, 8, -100L, 0.001f, Double.NaN, "abcdefghi", true, new DoubleValue(100.0)));
    // shuffle so the summary cannot accidentally depend on insertion order
    Collections.shuffle(inputRows);
    DataSet<Tuple8<Short, Integer, Long, Float, Double, String, Boolean, DoubleValue>> input = environment.fromCollection(inputRows);
    // call method under test
    Tuple summary = DataSetUtils.summarize(input);
    Assert.assertEquals(8, summary.getArity());
    // field 0: Short column
    NumericColumnSummary<Short> shortStats = summary.getField(0);
    Assert.assertEquals(8, shortStats.getNonMissingCount());
    Assert.assertEquals(1, shortStats.getMin().shortValue());
    Assert.assertEquals(10, shortStats.getMax().shortValue());
    Assert.assertEquals(5.375, shortStats.getMean().doubleValue(), 0.0);
    // field 1: Integer column
    NumericColumnSummary<Integer> intStats = summary.getField(1);
    Assert.assertEquals(1, intStats.getMin().intValue());
    Assert.assertEquals(10, intStats.getMax().intValue());
    Assert.assertEquals(5.375, intStats.getMean().doubleValue(), 0.0);
    // field 2: Long column
    NumericColumnSummary<Long> longStats = summary.getField(2);
    Assert.assertEquals(-100L, longStats.getMin().longValue());
    Assert.assertEquals(10000L, longStats.getMax().longValue());
    // field 3: Float column — also checks variance and standard deviation
    NumericColumnSummary<Float> floatStats = summary.getField(3);
    Assert.assertEquals(8, floatStats.getTotalCount());
    Assert.assertEquals(0.001000, floatStats.getMin().doubleValue(), 0.0000001);
    Assert.assertEquals(0.89999999, floatStats.getMax().doubleValue(), 0.0000001);
    Assert.assertEquals(0.2376249988883501, floatStats.getMean().doubleValue(), 0.000000000001);
    Assert.assertEquals(0.0768965488108089, floatStats.getVariance().doubleValue(), 0.00000001);
    Assert.assertEquals(0.27730226975415995, floatStats.getStandardDeviation().doubleValue(), 0.000000000001);
    // field 4: Double column — NaN and +Infinity count as "missing" values
    NumericColumnSummary<Double> doubleStats = summary.getField(4);
    Assert.assertEquals(6, doubleStats.getNonMissingCount());
    Assert.assertEquals(2, doubleStats.getMissingCount());
    Assert.assertEquals(0.0000000000023, doubleStats.getMin().doubleValue(), 0.0);
    Assert.assertEquals(79.5, doubleStats.getMax().doubleValue(), 0.000000000001);
    // field 5: String column — counts nulls/empties and tracks length statistics
    StringColumnSummary stringStats = summary.getField(5);
    Assert.assertEquals(8, stringStats.getTotalCount());
    Assert.assertEquals(0, stringStats.getNullCount());
    Assert.assertEquals(8, stringStats.getNonNullCount());
    Assert.assertEquals(2, stringStats.getEmptyCount());
    Assert.assertEquals(0, stringStats.getMinLength().intValue());
    Assert.assertEquals(16, stringStats.getMaxLength().intValue());
    Assert.assertEquals(5.0, stringStats.getMeanLength().doubleValue(), 0.0001);
    // field 6: Boolean column
    BooleanColumnSummary booleanStats = summary.getField(6);
    Assert.assertEquals(8, booleanStats.getTotalCount());
    Assert.assertEquals(2, booleanStats.getFalseCount());
    Assert.assertEquals(6, booleanStats.getTrueCount());
    Assert.assertEquals(0, booleanStats.getNullCount());
    // field 7: DoubleValue column is summarized as a Double numeric column
    NumericColumnSummary<Double> doubleValueStats = summary.getField(7);
    Assert.assertEquals(100.0, doubleValueStats.getMax().doubleValue(), 0.00001);
    Assert.assertEquals(50.0, doubleValueStats.getMin().doubleValue(), 0.00001);
}
Example usage of org.apache.flink.api.java.tuple.Tuple in the Apache Flink project — class AggregationFunctionTest, method groupSumIntegerTest.
@Test
public void groupSumIntegerTest() throws Exception {
    // Build the expected rolling aggregates per key, where the key is i % 3.
    List<Tuple2<Integer, Integer>> expectedSums = new ArrayList<>();
    List<Tuple2<Integer, Integer>> expectedMins = new ArrayList<>();
    List<Tuple2<Integer, Integer>> expectedMaxs = new ArrayList<>();
    // one running sum per key (keys are 0, 1, 2)
    int[] runningSums = new int[3];
    for (int i = 0; i < 9; i++) {
        int key = i % 3;
        int runningSum = runningSums[key] += i;
        expectedSums.add(new Tuple2<>(key, runningSum));
        // per key, the minimum seen is the first element of the group, which equals the key
        expectedMins.add(new Tuple2<>(key, key));
        // per key, the maximum seen so far is always the current element
        expectedMaxs.add(new Tuple2<>(key, i));
    }
    // some necessary boiler plate
    TypeInformation<Tuple2<Integer, Integer>> tupleType = TypeExtractor.getForObject(new Tuple2<>(0, 0));
    ExecutionConfig execConfig = new ExecutionConfig();
    KeySelector<Tuple2<Integer, Integer>, Tuple> groupingSelector = KeySelectorUtil.getSelectorForKeys(new Keys.ExpressionKeys<>(new int[] { 0 }, tupleType), tupleType, execConfig);
    TypeInformation<Tuple> groupingKeyType = TypeExtractor.getKeySelectorTypes(groupingSelector, tupleType);
    // aggregations under test: SUM, MIN and MAX on field 1
    ReduceFunction<Tuple2<Integer, Integer>> sum = new SumAggregator<>(1, tupleType, execConfig);
    ReduceFunction<Tuple2<Integer, Integer>> min = new ComparableAggregator<>(1, tupleType, AggregationType.MIN, execConfig);
    ReduceFunction<Tuple2<Integer, Integer>> max = new ComparableAggregator<>(1, tupleType, AggregationType.MAX, execConfig);
    List<Tuple2<Integer, Integer>> actualSums = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(sum, tupleType.createSerializer(execConfig)), getInputList(), groupingSelector, groupingKeyType);
    List<Tuple2<Integer, Integer>> actualMins = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(min, tupleType.createSerializer(execConfig)), getInputList(), groupingSelector, groupingKeyType);
    List<Tuple2<Integer, Integer>> actualMaxs = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(max, tupleType.createSerializer(execConfig)), getInputList(), groupingSelector, groupingKeyType);
    assertEquals(expectedSums, actualSums);
    assertEquals(expectedMins, actualMins);
    assertEquals(expectedMaxs, actualMaxs);
}
Example usage of org.apache.flink.api.java.tuple.Tuple in the Apache Flink project — class AggregationFunctionTest, method pojoMinMaxByTest.
@Test
public void pojoMinMaxByTest() throws Exception {
    // Pojos are grouped on field 0, aggregated on field 1.
    // MINBY/MAXBY keep the whole element; the "first" flag decides whether ties
    // keep the earlier (true) or the later (false) element.
    List<MyPojo3> maxByFirstExpected = ImmutableList.of(
            new MyPojo3(0, 0), new MyPojo3(1, 1), new MyPojo3(2, 2),
            new MyPojo3(2, 2), new MyPojo3(2, 2), new MyPojo3(2, 2),
            new MyPojo3(2, 2), new MyPojo3(2, 2), new MyPojo3(2, 2));
    List<MyPojo3> maxByLastExpected = ImmutableList.of(
            new MyPojo3(0, 0), new MyPojo3(1, 1), new MyPojo3(2, 2),
            new MyPojo3(2, 2), new MyPojo3(2, 2), new MyPojo3(2, 5),
            new MyPojo3(2, 5), new MyPojo3(2, 5), new MyPojo3(2, 8));
    List<MyPojo3> minByFirstExpected = ImmutableList.of(
            new MyPojo3(0, 0), new MyPojo3(0, 0), new MyPojo3(0, 0),
            new MyPojo3(0, 0), new MyPojo3(0, 0), new MyPojo3(0, 0),
            new MyPojo3(0, 0), new MyPojo3(0, 0), new MyPojo3(0, 0));
    List<MyPojo3> minByLastExpected = ImmutableList.of(
            new MyPojo3(0, 0), new MyPojo3(0, 0), new MyPojo3(0, 0),
            new MyPojo3(0, 3), new MyPojo3(0, 3), new MyPojo3(0, 3),
            new MyPojo3(0, 6), new MyPojo3(0, 6), new MyPojo3(0, 6));
    // some necessary boiler plate
    TypeInformation<MyPojo3> pojoType = TypeExtractor.getForObject(new MyPojo3(0, 0));
    ExecutionConfig execConfig = new ExecutionConfig();
    KeySelector<MyPojo3, Tuple> groupingSelector = KeySelectorUtil.getSelectorForKeys(new Keys.ExpressionKeys<>(new String[] { "f0" }, pojoType), pojoType, execConfig);
    TypeInformation<Tuple> groupingKeyType = TypeExtractor.getKeySelectorTypes(groupingSelector, pojoType);
    // aggregations under test
    ReduceFunction<MyPojo3> maxByFirst = new ComparableAggregator<>("f1", pojoType, AggregationType.MAXBY, true, execConfig);
    ReduceFunction<MyPojo3> maxByLast = new ComparableAggregator<>("f1", pojoType, AggregationType.MAXBY, false, execConfig);
    ReduceFunction<MyPojo3> minByFirst = new ComparableAggregator<>("f1", pojoType, AggregationType.MINBY, true, execConfig);
    ReduceFunction<MyPojo3> minByLast = new ComparableAggregator<>("f1", pojoType, AggregationType.MINBY, false, execConfig);
    // run each aggregation through the mocked keyed stream, then compare
    List<MyPojo3> maxByFirstActual = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(maxByFirst, pojoType.createSerializer(execConfig)), getInputByPojoList(), groupingSelector, groupingKeyType);
    List<MyPojo3> maxByLastActual = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(maxByLast, pojoType.createSerializer(execConfig)), getInputByPojoList(), groupingSelector, groupingKeyType);
    List<MyPojo3> minByLastActual = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(minByLast, pojoType.createSerializer(execConfig)), getInputByPojoList(), groupingSelector, groupingKeyType);
    List<MyPojo3> minByFirstActual = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(minByFirst, pojoType.createSerializer(execConfig)), getInputByPojoList(), groupingSelector, groupingKeyType);
    assertEquals(maxByFirstExpected, maxByFirstActual);
    assertEquals(maxByLastExpected, maxByLastActual);
    assertEquals(minByLastExpected, minByLastActual);
    assertEquals(minByFirstExpected, minByFirstActual);
}
Example usage of org.apache.flink.api.java.tuple.Tuple in the Apache Flink project — class AggregationFunctionTest, method pojoGroupSumIntegerTest.
@Test
public void pojoGroupSumIntegerTest() throws Exception {
    // Build the expected rolling aggregates per key, where the key is i % 3.
    List<MyPojo> expectedSums = new ArrayList<>();
    List<MyPojo> expectedMins = new ArrayList<>();
    List<MyPojo> expectedMaxs = new ArrayList<>();
    // one running sum per key (keys are 0, 1, 2)
    int[] runningSums = new int[3];
    for (int i = 0; i < 9; i++) {
        int key = i % 3;
        int runningSum = runningSums[key] += i;
        expectedSums.add(new MyPojo(key, runningSum));
        // per key, the minimum seen is the first element of the group, which equals the key
        expectedMins.add(new MyPojo(key, key));
        // per key, the maximum seen so far is always the current element
        expectedMaxs.add(new MyPojo(key, i));
    }
    // some necessary boiler plate
    TypeInformation<MyPojo> pojoType = TypeExtractor.getForObject(new MyPojo(0, 0));
    ExecutionConfig execConfig = new ExecutionConfig();
    KeySelector<MyPojo, Tuple> groupingSelector = KeySelectorUtil.getSelectorForKeys(new Keys.ExpressionKeys<>(new String[] { "f0" }, pojoType), pojoType, execConfig);
    TypeInformation<Tuple> groupingKeyType = TypeExtractor.getKeySelectorTypes(groupingSelector, pojoType);
    // aggregations under test: SUM, MIN and MAX on pojo field "f1"
    ReduceFunction<MyPojo> sum = new SumAggregator<>("f1", pojoType, execConfig);
    ReduceFunction<MyPojo> min = new ComparableAggregator<>("f1", pojoType, AggregationType.MIN, false, execConfig);
    ReduceFunction<MyPojo> max = new ComparableAggregator<>("f1", pojoType, AggregationType.MAX, false, execConfig);
    List<MyPojo> actualSums = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(sum, pojoType.createSerializer(execConfig)), getInputPojoList(), groupingSelector, groupingKeyType);
    List<MyPojo> actualMins = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(min, pojoType.createSerializer(execConfig)), getInputPojoList(), groupingSelector, groupingKeyType);
    List<MyPojo> actualMaxs = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(max, pojoType.createSerializer(execConfig)), getInputPojoList(), groupingSelector, groupingKeyType);
    assertEquals(expectedSums, actualSums);
    assertEquals(expectedMins, actualMins);
    assertEquals(expectedMaxs, actualMaxs);
}
Example usage of org.apache.flink.api.java.tuple.Tuple in the Apache Flink project — class AggregationFunctionTest, method minMaxByTest.
@Test
public void minMaxByTest() throws Exception {
    // Tuples are grouped on field 0, aggregated on field 1.
    // MINBY/MAXBY keep the whole tuple; the "first" flag decides whether ties
    // keep the earlier (true) or the later (false) tuple.
    List<Tuple3<Integer, Integer, Integer>> maxByFirstExpected = ImmutableList.of(
            Tuple3.of(0, 0, 0), Tuple3.of(0, 1, 1), Tuple3.of(0, 2, 2),
            Tuple3.of(0, 2, 2), Tuple3.of(0, 2, 2), Tuple3.of(0, 2, 2),
            Tuple3.of(0, 2, 2), Tuple3.of(0, 2, 2), Tuple3.of(0, 2, 2));
    List<Tuple3<Integer, Integer, Integer>> maxByLastExpected = ImmutableList.of(
            Tuple3.of(0, 0, 0), Tuple3.of(0, 1, 1), Tuple3.of(0, 2, 2),
            Tuple3.of(0, 2, 2), Tuple3.of(0, 2, 2), Tuple3.of(0, 2, 5),
            Tuple3.of(0, 2, 5), Tuple3.of(0, 2, 5), Tuple3.of(0, 2, 8));
    List<Tuple3<Integer, Integer, Integer>> minByFirstExpected = ImmutableList.of(
            Tuple3.of(0, 0, 0), Tuple3.of(0, 0, 0), Tuple3.of(0, 0, 0),
            Tuple3.of(0, 0, 0), Tuple3.of(0, 0, 0), Tuple3.of(0, 0, 0),
            Tuple3.of(0, 0, 0), Tuple3.of(0, 0, 0), Tuple3.of(0, 0, 0));
    List<Tuple3<Integer, Integer, Integer>> minByLastExpected = ImmutableList.of(
            Tuple3.of(0, 0, 0), Tuple3.of(0, 0, 0), Tuple3.of(0, 0, 0),
            Tuple3.of(0, 0, 3), Tuple3.of(0, 0, 3), Tuple3.of(0, 0, 3),
            Tuple3.of(0, 0, 6), Tuple3.of(0, 0, 6), Tuple3.of(0, 0, 6));
    // some necessary boiler plate
    TypeInformation<Tuple3<Integer, Integer, Integer>> tupleType = TypeExtractor.getForObject(Tuple3.of(0, 0, 0));
    ExecutionConfig execConfig = new ExecutionConfig();
    KeySelector<Tuple3<Integer, Integer, Integer>, Tuple> groupingSelector = KeySelectorUtil.getSelectorForKeys(new Keys.ExpressionKeys<>(new int[] { 0 }, tupleType), tupleType, execConfig);
    TypeInformation<Tuple> groupingKeyType = TypeExtractor.getKeySelectorTypes(groupingSelector, tupleType);
    // aggregations under test
    ReduceFunction<Tuple3<Integer, Integer, Integer>> maxByFirst = new ComparableAggregator<>(1, tupleType, AggregationType.MAXBY, true, execConfig);
    ReduceFunction<Tuple3<Integer, Integer, Integer>> maxByLast = new ComparableAggregator<>(1, tupleType, AggregationType.MAXBY, false, execConfig);
    ReduceFunction<Tuple3<Integer, Integer, Integer>> minByFirst = new ComparableAggregator<>(1, tupleType, AggregationType.MINBY, true, execConfig);
    ReduceFunction<Tuple3<Integer, Integer, Integer>> minByLast = new ComparableAggregator<>(1, tupleType, AggregationType.MINBY, false, execConfig);
    // run each aggregation through the mocked keyed stream, then compare
    List<Tuple3<Integer, Integer, Integer>> maxByFirstActual = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(maxByFirst, tupleType.createSerializer(execConfig)), getInputByList(), groupingSelector, groupingKeyType);
    List<Tuple3<Integer, Integer, Integer>> maxByLastActual = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(maxByLast, tupleType.createSerializer(execConfig)), getInputByList(), groupingSelector, groupingKeyType);
    List<Tuple3<Integer, Integer, Integer>> minByLastActual = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(minByLast, tupleType.createSerializer(execConfig)), getInputByList(), groupingSelector, groupingKeyType);
    List<Tuple3<Integer, Integer, Integer>> minByFirstActual = MockContext.createAndExecuteForKeyedStream(new StreamGroupedReduce<>(minByFirst, tupleType.createSerializer(execConfig)), getInputByList(), groupingSelector, groupingKeyType);
    assertEquals(maxByFirstExpected, maxByFirstActual);
    assertEquals(maxByLastExpected, maxByLastActual);
    assertEquals(minByLastExpected, minByLastActual);
    assertEquals(minByFirstExpected, minByFirstActual);
}
Related topic: Aggregations.