use of org.apache.flink.api.common.functions.MapPartitionFunction in project flink by apache.
the class PartitionMapOperatorTest method testMapPartitionWithRuntimeContext.
@Test
public void testMapPartitionWithRuntimeContext() {
try {
final String taskName = "Test Task";
final AtomicBoolean opened = new AtomicBoolean();
final AtomicBoolean closed = new AtomicBoolean();
final MapPartitionFunction<String, Integer> parser = new RichMapPartitionFunction<String, Integer>() {
@Override
public void open(Configuration parameters) throws Exception {
opened.set(true);
RuntimeContext ctx = getRuntimeContext();
assertEquals(0, ctx.getIndexOfThisSubtask());
assertEquals(1, ctx.getNumberOfParallelSubtasks());
assertEquals(taskName, ctx.getTaskName());
}
@Override
public void mapPartition(Iterable<String> values, Collector<Integer> out) {
for (String s : values) {
out.collect(Integer.parseInt(s));
}
}
@Override
public void close() throws Exception {
closed.set(true);
}
};
MapPartitionOperatorBase<String, Integer, MapPartitionFunction<String, Integer>> op = new MapPartitionOperatorBase<String, Integer, MapPartitionFunction<String, Integer>>(parser, new UnaryOperatorInformation<String, Integer>(BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO), taskName);
List<String> input = new ArrayList<String>(asList("1", "2", "3", "4", "5", "6"));
final TaskInfo taskInfo = new TaskInfo(taskName, 1, 0, 1, 0);
ExecutionConfig executionConfig = new ExecutionConfig();
executionConfig.disableObjectReuse();
List<Integer> resultMutableSafe = op.executeOnCollections(input, new RuntimeUDFContext(taskInfo, null, executionConfig, new HashMap<String, Future<Path>>(), new HashMap<String, Accumulator<?, ?>>(), new UnregisteredMetricsGroup()), executionConfig);
executionConfig.enableObjectReuse();
List<Integer> resultRegular = op.executeOnCollections(input, new RuntimeUDFContext(taskInfo, null, executionConfig, new HashMap<String, Future<Path>>(), new HashMap<String, Accumulator<?, ?>>(), new UnregisteredMetricsGroup()), executionConfig);
assertEquals(asList(1, 2, 3, 4, 5, 6), resultMutableSafe);
assertEquals(asList(1, 2, 3, 4, 5, 6), resultRegular);
assertTrue(opened.get());
assertTrue(closed.get());
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
use of org.apache.flink.api.common.functions.MapPartitionFunction in project flink by apache.
the class DataSetUtils method summarize.
// --------------------------------------------------------------------------------------------
// Summarize
// --------------------------------------------------------------------------------------------
/**
* Summarize a DataSet of Tuples by collecting single pass statistics for all columns
*
* Example usage:
* <pre>
* {@code
* Dataset<Tuple3<Double, String, Boolean>> input = // [...]
* Tuple3<NumericColumnSummary,StringColumnSummary, BooleanColumnSummary> summary = DataSetUtils.summarize(input)
*
* summary.f0.getStandardDeviation()
* summary.f1.getMaxLength()
* }
* </pre>
* @return the summary as a Tuple the same width as input rows
*/
public static <R extends Tuple, T extends Tuple> R summarize(DataSet<T> input) throws Exception {
if (!input.getType().isTupleType()) {
throw new IllegalArgumentException("summarize() is only implemented for DataSet's of Tuples");
}
final TupleTypeInfoBase<?> inType = (TupleTypeInfoBase<?>) input.getType();
DataSet<TupleSummaryAggregator<R>> result = input.mapPartition(new MapPartitionFunction<T, TupleSummaryAggregator<R>>() {
@Override
public void mapPartition(Iterable<T> values, Collector<TupleSummaryAggregator<R>> out) throws Exception {
TupleSummaryAggregator<R> aggregator = SummaryAggregatorFactory.create(inType);
for (Tuple value : values) {
aggregator.aggregate(value);
}
out.collect(aggregator);
}
}).reduce(new ReduceFunction<TupleSummaryAggregator<R>>() {
@Override
public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> agg1, TupleSummaryAggregator<R> agg2) throws Exception {
agg1.combine(agg2);
return agg1;
}
});
return result.collect().get(0).result();
}
Aggregations