Use of org.apache.flink.api.common.functions.MapFunction in project flink by apache.
From class MapOperatorTest, method testMapWithRuntimeContext:
@Test
public void testMapWithRuntimeContext() {
    try {
        final String taskName = "Test Task";
        final AtomicBoolean opened = new AtomicBoolean();
        final AtomicBoolean closed = new AtomicBoolean();

        final MapFunction<String, Integer> parser = new RichMapFunction<String, Integer>() {
            @Override
            public void open(Configuration parameters) throws Exception {
                opened.set(true);
                RuntimeContext ctx = getRuntimeContext();
                assertEquals(0, ctx.getIndexOfThisSubtask());
                assertEquals(1, ctx.getNumberOfParallelSubtasks());
                assertEquals(taskName, ctx.getTaskName());
            }

            @Override
            public Integer map(String value) {
                return Integer.parseInt(value);
            }

            @Override
            public void close() throws Exception {
                closed.set(true);
            }
        };

        MapOperatorBase<String, Integer, MapFunction<String, Integer>> op =
                new MapOperatorBase<String, Integer, MapFunction<String, Integer>>(
                        parser,
                        new UnaryOperatorInformation<String, Integer>(
                                BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO),
                        taskName);

        List<String> input = new ArrayList<String>(asList("1", "2", "3", "4", "5", "6"));
        final HashMap<String, Accumulator<?, ?>> accumulatorMap = new HashMap<String, Accumulator<?, ?>>();
        final HashMap<String, Future<Path>> cpTasks = new HashMap<>();
        final TaskInfo taskInfo = new TaskInfo(taskName, 1, 0, 1, 0);

        ExecutionConfig executionConfig = new ExecutionConfig();

        executionConfig.disableObjectReuse();
        List<Integer> resultMutableSafe = op.executeOnCollections(
                input,
                new RuntimeUDFContext(taskInfo, null, executionConfig, cpTasks, accumulatorMap,
                        new UnregisteredMetricsGroup()),
                executionConfig);

        executionConfig.enableObjectReuse();
        List<Integer> resultRegular = op.executeOnCollections(
                input,
                new RuntimeUDFContext(taskInfo, null, executionConfig, cpTasks, accumulatorMap,
                        new UnregisteredMetricsGroup()),
                executionConfig);

        assertEquals(asList(1, 2, 3, 4, 5, 6), resultMutableSafe);
        assertEquals(asList(1, 2, 3, 4, 5, 6), resultRegular);
        assertTrue(opened.get());
        assertTrue(closed.get());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
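The test above exercises the full rich-function lifecycle: open() runs once per parallel subtask before any record, map() runs once per record, and close() runs once at shutdown. As a minimal sketch of the same pattern outside the test harness (ParsingMapper is an illustrative name, not a class from the Flink codebase):

import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;

// Minimal sketch: parse strings to ints, with lifecycle hooks.
public class ParsingMapper extends RichMapFunction<String, Integer> {

    @Override
    public void open(Configuration parameters) {
        // Runs once per parallel subtask before the first map() call;
        // a good place to acquire resources or inspect getRuntimeContext().
    }

    @Override
    public Integer map(String value) {
        // Runs once per input record.
        return Integer.parseInt(value);
    }

    @Override
    public void close() {
        // Runs once after the last record has been processed.
    }
}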
Use of org.apache.flink.api.common.functions.MapFunction in project flink by apache.
From class MusicProfiles, method main:
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    /**
     * Read the user-song-play triplets.
     */
    DataSet<Tuple3<String, String, Integer>> triplets = getUserSongTripletsData(env);

    /**
     * Read the mismatches dataset and extract the songIDs
     */
    DataSet<Tuple1<String>> mismatches = getMismatchesData(env).map(new ExtractMismatchSongIds());

    /**
     * Filter out the mismatches from the triplets dataset
     */
    DataSet<Tuple3<String, String, Integer>> validTriplets = triplets
            .coGroup(mismatches).where(1).equalTo(0)
            .with(new FilterOutMismatches());

    /**
     * Create a user -> song weighted bipartite graph where the edge weights
     * correspond to play counts
     */
    Graph<String, NullValue, Integer> userSongGraph = Graph.fromTupleDataSet(validTriplets, env);

    /**
     * Get the top track (most listened) for each user
     */
    DataSet<Tuple2<String, String>> usersWithTopTrack = userSongGraph
            .groupReduceOnEdges(new GetTopSongPerUser(), EdgeDirection.OUT)
            .filter(new FilterSongNodes());

    if (fileOutput) {
        usersWithTopTrack.writeAsCsv(topTracksOutputPath, "\n", "\t");
    } else {
        usersWithTopTrack.print();
    }

    /**
     * Create a user-user similarity graph, based on common songs, i.e. two
     * users that listen to the same song are connected. For each song, we
     * create an edge between each pair of its in-neighbors.
     */
    DataSet<Edge<String, NullValue>> similarUsers = userSongGraph.getEdges()
            // keep only edges whose play count exceeds the threshold
            .filter(new FilterFunction<Edge<String, Integer>>() {
                @Override
                public boolean filter(Edge<String, Integer> edge) {
                    return (edge.getValue() > playcountThreshold);
                }
            })
            .groupBy(1)
            .reduceGroup(new CreateSimilarUserEdges())
            .distinct();

    Graph<String, Long, NullValue> similarUsersGraph = Graph.fromDataSet(similarUsers,
            new MapFunction<String, Long>() {
                @Override
                public Long map(String value) {
                    return 1L;
                }
            }, env).getUndirected();

    /**
     * Detect user communities using the label propagation library method
     */
    // Initialize each vertex with a unique numeric label and run the label propagation algorithm
    DataSet<Tuple2<String, Long>> idsWithInitialLabels = DataSetUtils
            .zipWithUniqueId(similarUsersGraph.getVertexIds())
            .map(new MapFunction<Tuple2<Long, String>, Tuple2<String, Long>>() {
                @Override
                public Tuple2<String, Long> map(Tuple2<Long, String> tuple2) throws Exception {
                    return new Tuple2<String, Long>(tuple2.f1, tuple2.f0);
                }
            });

    DataSet<Vertex<String, Long>> verticesWithCommunity = similarUsersGraph
            .joinWithVertices(idsWithInitialLabels, new VertexJoinFunction<Long, Long>() {
                @Override
                public Long vertexJoin(Long vertexValue, Long inputValue) {
                    return inputValue;
                }
            })
            .run(new LabelPropagation<String, Long, NullValue>(maxIterations));

    if (fileOutput) {
        verticesWithCommunity.writeAsCsv(communitiesOutputPath, "\n", "\t");
        // since file sinks are lazy, we trigger the execution explicitly
        env.execute();
    } else {
        verticesWithCommunity.print();
    }
}
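The helper classes (ExtractMismatchSongIds, FilterOutMismatches, GetTopSongPerUser, and so on) are not shown here. Purely as an illustration of the MapFunction shape this pipeline requires, a hypothetical ExtractMismatchSongIds could look like the following; the input format and parsing logic are assumptions, not the actual Flink example code:

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple1;

// Hypothetical sketch: project a raw mismatch record down to the
// Tuple1<String> of song IDs consumed by the coGroup above. The input
// layout (song ID as the second whitespace-separated token) is assumed.
public static final class ExtractMismatchSongIds implements MapFunction<String, Tuple1<String>> {

    @Override
    public Tuple1<String> map(String value) {
        String[] tokens = value.split("\\s+");
        return new Tuple1<String>(tokens[1]);
    }
}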
Use of org.apache.flink.api.common.functions.MapFunction in project flink by apache.
From class DataStreamTest, method testUnion:
/**
 * Tests union functionality. This ensures that self-unions and unions of streams
 * with differing parallelism work.
 *
 * @throws Exception
 */
@Test
public void testUnion() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataStream<Long> input1 = env.generateSequence(0, 0).map(new MapFunction<Long, Long>() {
        @Override
        public Long map(Long value) throws Exception {
            return null;
        }
    });

    DataStream<Long> selfUnion = input1.union(input1).map(new MapFunction<Long, Long>() {
        @Override
        public Long map(Long value) throws Exception {
            return null;
        }
    });

    DataStream<Long> input6 = env.generateSequence(0, 0).map(new MapFunction<Long, Long>() {
        @Override
        public Long map(Long value) throws Exception {
            return null;
        }
    });

    DataStream<Long> selfUnionDifferentPartition = input6.broadcast().union(input6).map(new MapFunction<Long, Long>() {
        @Override
        public Long map(Long value) throws Exception {
            return null;
        }
    });

    DataStream<Long> input2 = env.generateSequence(0, 0).map(new MapFunction<Long, Long>() {
        @Override
        public Long map(Long value) throws Exception {
            return null;
        }
    }).setParallelism(4);

    DataStream<Long> input3 = env.generateSequence(0, 0).map(new MapFunction<Long, Long>() {
        @Override
        public Long map(Long value) throws Exception {
            return null;
        }
    }).setParallelism(2);

    DataStream<Long> unionDifferingParallelism = input2.union(input3).map(new MapFunction<Long, Long>() {
        @Override
        public Long map(Long value) throws Exception {
            return null;
        }
    }).setParallelism(4);

    DataStream<Long> input4 = env.generateSequence(0, 0).map(new MapFunction<Long, Long>() {
        @Override
        public Long map(Long value) throws Exception {
            return null;
        }
    }).setParallelism(2);

    DataStream<Long> input5 = env.generateSequence(0, 0).map(new MapFunction<Long, Long>() {
        @Override
        public Long map(Long value) throws Exception {
            return null;
        }
    }).setParallelism(4);

    DataStream<Long> unionDifferingPartitioning = input4.broadcast().union(input5).map(new MapFunction<Long, Long>() {
        @Override
        public Long map(Long value) throws Exception {
            return null;
        }
    }).setParallelism(4);

    StreamGraph streamGraph = env.getStreamGraph();

    // verify self union
    assertTrue(streamGraph.getStreamNode(selfUnion.getId()).getInEdges().size() == 2);
    for (StreamEdge edge : streamGraph.getStreamNode(selfUnion.getId()).getInEdges()) {
        assertTrue(edge.getPartitioner() instanceof ForwardPartitioner);
    }

    // verify self union with different partitioners
    assertTrue(streamGraph.getStreamNode(selfUnionDifferentPartition.getId()).getInEdges().size() == 2);
    boolean hasForward = false;
    boolean hasBroadcast = false;
    for (StreamEdge edge : streamGraph.getStreamNode(selfUnionDifferentPartition.getId()).getInEdges()) {
        if (edge.getPartitioner() instanceof ForwardPartitioner) {
            hasForward = true;
        }
        if (edge.getPartitioner() instanceof BroadcastPartitioner) {
            hasBroadcast = true;
        }
    }
    assertTrue(hasForward && hasBroadcast);

    // verify union of streams with differing parallelism
    assertTrue(streamGraph.getStreamNode(unionDifferingParallelism.getId()).getInEdges().size() == 2);
    for (StreamEdge edge : streamGraph.getStreamNode(unionDifferingParallelism.getId()).getInEdges()) {
        if (edge.getSourceId() == input2.getId()) {
            assertTrue(edge.getPartitioner() instanceof ForwardPartitioner);
        } else if (edge.getSourceId() == input3.getId()) {
            assertTrue(edge.getPartitioner() instanceof RebalancePartitioner);
        } else {
            fail("Wrong input edge.");
        }
    }

    // verify union of streams with differing partitionings
    assertTrue(streamGraph.getStreamNode(unionDifferingPartitioning.getId()).getInEdges().size() == 2);
    for (StreamEdge edge : streamGraph.getStreamNode(unionDifferingPartitioning.getId()).getInEdges()) {
        if (edge.getSourceId() == input4.getId()) {
            assertTrue(edge.getPartitioner() instanceof BroadcastPartitioner);
        } else if (edge.getSourceId() == input5.getId()) {
            assertTrue(edge.getPartitioner() instanceof ForwardPartitioner);
        } else {
            fail("Wrong input edge.");
        }
    }
}
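Every mapper in this test is the same do-nothing pass-through, present only so the union has a distinct downstream operator to attach edges to. In Java 8 such a mapper can also be written as a lambda; note that Flink may then need an explicit returns(...) hint, because type erasure hides the lambda's output type. A sketch, not part of the original test (Types is org.apache.flink.api.common.typeinfo.Types):

// Equivalent pass-through mapper as a lambda. The returns(...) hint
// compensates for type erasure when Flink cannot infer the output type.
DataStream<Long> input1 = env.generateSequence(0, 0)
        .map((Long value) -> value)
        .returns(Types.LONG);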
Use of org.apache.flink.api.common.functions.MapFunction in project flink by apache.
From class DataStreamTest, method testParallelism:
/**
 * Tests whether parallelism gets set.
 */
@Test
public void testParallelism() {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    DataStreamSource<Tuple2<Long, Long>> src = env.fromElements(new Tuple2<>(0L, 0L));
    env.setParallelism(10);

    SingleOutputStreamOperator<Long> map = src.map(new MapFunction<Tuple2<Long, Long>, Long>() {
        @Override
        public Long map(Tuple2<Long, Long> value) throws Exception {
            return null;
        }
    }).name("MyMap");

    DataStream<Long> windowed = map
            .windowAll(GlobalWindows.create())
            .trigger(PurgingTrigger.of(CountTrigger.of(10)))
            .fold(0L, new FoldFunction<Long, Long>() {
                @Override
                public Long fold(Long accumulator, Long value) throws Exception {
                    return null;
                }
            });

    windowed.addSink(new DiscardingSink<Long>());

    DataStreamSink<Long> sink = map.addSink(new SinkFunction<Long>() {
        private static final long serialVersionUID = 1L;

        @Override
        public void invoke(Long value) throws Exception {
        }
    });

    assertEquals(1, env.getStreamGraph().getStreamNode(src.getId()).getParallelism());
    assertEquals(10, env.getStreamGraph().getStreamNode(map.getId()).getParallelism());
    assertEquals(1, env.getStreamGraph().getStreamNode(windowed.getId()).getParallelism());
    assertEquals(10, env.getStreamGraph().getStreamNode(sink.getTransformation().getId()).getParallelism());

    env.setParallelism(7);

    // Some parts, such as windowing, rely on the fact that previous operators have a parallelism
    // set when instantiating the Discretizer. This would break if we dynamically changed
    // the parallelism of operations when changing the setting on the Execution Environment.
    assertEquals(1, env.getStreamGraph().getStreamNode(src.getId()).getParallelism());
    assertEquals(10, env.getStreamGraph().getStreamNode(map.getId()).getParallelism());
    assertEquals(1, env.getStreamGraph().getStreamNode(windowed.getId()).getParallelism());
    assertEquals(10, env.getStreamGraph().getStreamNode(sink.getTransformation().getId()).getParallelism());

    try {
        src.setParallelism(3);
        fail();
    } catch (IllegalArgumentException success) {
        // expected: the parallelism of a non-parallel source cannot be changed
    }

    DataStreamSource<Long> parallelSource = env.generateSequence(0, 0);
    parallelSource.addSink(new DiscardingSink<Long>());
    assertEquals(7, env.getStreamGraph().getStreamNode(parallelSource.getId()).getParallelism());

    parallelSource.setParallelism(3);
    assertEquals(3, env.getStreamGraph().getStreamNode(parallelSource.getId()).getParallelism());

    map.setParallelism(2);
    assertEquals(2, env.getStreamGraph().getStreamNode(map.getId()).getParallelism());

    sink.setParallelism(4);
    assertEquals(4, env.getStreamGraph().getStreamNode(sink.getTransformation().getId()).getParallelism());
}
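The invariant this test pins down is precedence: an operator's parallelism is fixed when it is created (inheriting the environment default at that moment) or explicitly set, and later changes to the environment default do not retroactively rewrite it. A minimal sketch of that precedence (variable names are illustrative):

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(10);

// Inherits the environment default (10) at creation time.
SingleOutputStreamOperator<Long> doubled = env.generateSequence(0, 100)
        .map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) {
                return value * 2;
            }
        });

// An explicit operator-level setting overrides the environment default.
doubled.setParallelism(2);

This also explains why src.setParallelism(3) above must throw: fromElements creates a non-parallel source whose parallelism is fixed at 1.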
Use of org.apache.flink.api.common.functions.MapFunction in project flink by apache.
From class DataStreamTest, method testNaming:
/**
 * Tests {@link SingleOutputStreamOperator#name(String)} functionality.
 *
 * @throws Exception
 */
@Test
public void testNaming() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    DataStream<Long> dataStream1 = env.generateSequence(0, 0).name("testSource1")
            .map(new MapFunction<Long, Long>() {
                @Override
                public Long map(Long value) throws Exception {
                    return null;
                }
            }).name("testMap");

    DataStream<Long> dataStream2 = env.generateSequence(0, 0).name("testSource2")
            .map(new MapFunction<Long, Long>() {
                @Override
                public Long map(Long value) throws Exception {
                    return null;
                }
            }).name("testMap");

    dataStream1.connect(dataStream2)
            .flatMap(new CoFlatMapFunction<Long, Long, Long>() {
                @Override
                public void flatMap1(Long value, Collector<Long> out) throws Exception {
                }

                @Override
                public void flatMap2(Long value, Collector<Long> out) throws Exception {
                }
            }).name("testCoFlatMap")
            .windowAll(GlobalWindows.create())
            .trigger(PurgingTrigger.of(CountTrigger.of(10)))
            .fold(0L, new FoldFunction<Long, Long>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Long fold(Long accumulator, Long value) throws Exception {
                    return null;
                }
            }).name("testWindowFold")
            .print();

    // test functionality through the operator names in the execution plan
    String plan = env.getExecutionPlan();
    assertTrue(plan.contains("testSource1"));
    assertTrue(plan.contains("testSource2"));
    assertTrue(plan.contains("testMap"));
    // both mappers share the name "testMap", so this repeats the previous check
    assertTrue(plan.contains("testMap"));
    assertTrue(plan.contains("testCoFlatMap"));
    assertTrue(plan.contains("testWindowFold"));
}
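name(String) only changes how an operator is labeled in logs, the web UI, and the JSON execution plan; it does not change the program's semantics. A quick way to see the effect in a standalone job (a sketch, not part of the test):

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

env.generateSequence(0, 10)
        .name("numbers")
        .map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) {
                return value + 1;
            }
        })
        .name("increment")
        .print();

// The returned JSON plan contains the operator names "numbers" and "increment".
System.out.println(env.getExecutionPlan());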