Use of org.apache.flink.api.common.functions.FilterFunction in project flink by apache.
The class DataStreamTest, method operatorTest.
@Test
public void operatorTest() {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    DataStreamSource<Long> src = env.generateSequence(0, 0);

    MapFunction<Long, Integer> mapFunction = new MapFunction<Long, Integer>() {
        @Override
        public Integer map(Long value) throws Exception {
            return null;
        }
    };
    DataStream<Integer> map = src.map(mapFunction);
    map.addSink(new DiscardingSink<Integer>());
    assertEquals(mapFunction, getFunctionForDataStream(map));

    FlatMapFunction<Long, Integer> flatMapFunction = new FlatMapFunction<Long, Integer>() {
        private static final long serialVersionUID = 1L;

        @Override
        public void flatMap(Long value, Collector<Integer> out) throws Exception {
        }
    };
    DataStream<Integer> flatMap = src.flatMap(flatMapFunction);
    flatMap.addSink(new DiscardingSink<Integer>());
    assertEquals(flatMapFunction, getFunctionForDataStream(flatMap));

    FilterFunction<Integer> filterFunction = new FilterFunction<Integer>() {
        @Override
        public boolean filter(Integer value) throws Exception {
            return false;
        }
    };
    DataStream<Integer> unionFilter = map.union(flatMap).filter(filterFunction);
    unionFilter.addSink(new DiscardingSink<Integer>());
    assertEquals(filterFunction, getFunctionForDataStream(unionFilter));

    try {
        env.getStreamGraph().getStreamEdges(map.getId(), unionFilter.getId());
    } catch (RuntimeException e) {
        fail(e.getMessage());
    }

    try {
        env.getStreamGraph().getStreamEdges(flatMap.getId(), unionFilter.getId());
    } catch (RuntimeException e) {
        fail(e.getMessage());
    }

    OutputSelector<Integer> outputSelector = new OutputSelector<Integer>() {
        @Override
        public Iterable<String> select(Integer value) {
            return null;
        }
    };
    SplitStream<Integer> split = unionFilter.split(outputSelector);
    split.select("dummy").addSink(new DiscardingSink<Integer>());
    List<OutputSelector<?>> outputSelectors = env.getStreamGraph().getStreamNode(unionFilter.getId()).getOutputSelectors();
    assertEquals(1, outputSelectors.size());
    assertEquals(outputSelector, outputSelectors.get(0));

    DataStream<Integer> select = split.select("a");
    DataStreamSink<Integer> sink = select.print();
    StreamEdge splitEdge = env.getStreamGraph().getStreamEdges(unionFilter.getId(), sink.getTransformation().getId()).get(0);
    assertEquals("a", splitEdge.getSelectedNames().get(0));

    ConnectedStreams<Integer, Integer> connect = map.connect(flatMap);
    CoMapFunction<Integer, Integer, String> coMapper = new CoMapFunction<Integer, Integer, String>() {
        private static final long serialVersionUID = 1L;

        @Override
        public String map1(Integer value) {
            return null;
        }

        @Override
        public String map2(Integer value) {
            return null;
        }
    };
    DataStream<String> coMap = connect.map(coMapper);
    coMap.addSink(new DiscardingSink<String>());
    assertEquals(coMapper, getFunctionForDataStream(coMap));

    try {
        env.getStreamGraph().getStreamEdges(map.getId(), coMap.getId());
    } catch (RuntimeException e) {
        fail(e.getMessage());
    }

    try {
        env.getStreamGraph().getStreamEdges(flatMap.getId(), coMap.getId());
    } catch (RuntimeException e) {
        fail(e.getMessage());
    }
}
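The helper getFunctionForDataStream is not part of the snippet. As a rough, hypothetical sketch (not the test's actual code), such a helper can unwrap the user function from the operator that the StreamGraph registered for the stream's id:

private static Function getFunctionForDataStream(DataStream<?> dataStream) {
    // Hypothetical helper: look up the stream's node in the StreamGraph and,
    // assuming the operator is an AbstractUdfStreamOperator, return the
    // MapFunction/FlatMapFunction/FilterFunction it wraps.
    StreamGraph streamGraph = dataStream.getExecutionEnvironment().getStreamGraph();
    AbstractUdfStreamOperator<?, ?> operator =
        (AbstractUdfStreamOperator<?, ?>) streamGraph.getStreamNode(dataStream.getId()).getOperator();
    return operator.getUserFunction();
}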
Use of org.apache.flink.api.common.functions.FilterFunction in project flink by apache.
The class FilterLambda1, method main.
public static void main(String[] args) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.fromElements("Please filter", "the words", "but not this");

    FilterFunction<String> filter = (v) -> WordFilter.filter(v);

    DataSet<String> output = input.filter(filter);
    // print() is an eager sink on DataSets: it triggers execution itself,
    // so no separate env.execute() call is needed (one issued afterwards
    // would fail because no new sinks have been defined).
    output.print();
}
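Because FilterFunction has a single abstract method, the lambda above admits two equivalent, more compact spellings. A small sketch using the example's own WordFilter helper:

// The same filter as a method reference and as an inline lambda.
FilterFunction<String> filterRef = WordFilter::filter;
DataSet<String> output2 = input.filter(v -> WordFilter.filter(v));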
Use of org.apache.flink.api.common.functions.FilterFunction in project flink by apache.
The class TPCHQuery3, method main.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);

    if (!params.has("lineitem") && !params.has("customer") && !params.has("orders")) {
        System.err.println(" This program expects data from the TPC-H benchmark as input data.");
        System.err.println(" Due to legal restrictions, we cannot ship generated data.");
        System.err.println(" You can find the TPC-H data generator at http://www.tpc.org/tpch/.");
        System.err.println(" Usage: TPCHQuery3 --lineitem <path> --customer <path> --orders <path> [--output <path>]");
        return;
    }

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().setGlobalJobParameters(params);

    // get input data (each data set is read from the path given by the parameter of the same name)
    DataSet<Lineitem> lineitems = getLineitemDataSet(env, params.get("lineitem"));
    DataSet<Order> orders = getOrdersDataSet(env, params.get("orders"));
    DataSet<Customer> customers = getCustomerDataSet(env, params.get("customer"));

    // Filter market segment "AUTOMOBILE"
    customers = customers.filter(new FilterFunction<Customer>() {
        @Override
        public boolean filter(Customer c) {
            return c.getMktsegment().equals("AUTOMOBILE");
        }
    });

    // Filter all Orders with o_orderdate < 12.03.1995
    orders = orders.filter(new FilterFunction<Order>() {
        private final DateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        private final Date date = format.parse("1995-03-12");

        @Override
        public boolean filter(Order o) throws ParseException {
            return format.parse(o.getOrderdate()).before(date);
        }
    });

    // Filter all Lineitems with l_shipdate > 12.03.1995
    lineitems = lineitems.filter(new FilterFunction<Lineitem>() {
        private final DateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        private final Date date = format.parse("1995-03-12");

        @Override
        public boolean filter(Lineitem l) throws ParseException {
            return format.parse(l.getShipdate()).after(date);
        }
    });

    // Join customers with orders and package them into a ShippingPriorityItem
    DataSet<ShippingPriorityItem> customerWithOrders = customers.join(orders).where(0).equalTo(1)
        .with(new JoinFunction<Customer, Order, ShippingPriorityItem>() {
            @Override
            public ShippingPriorityItem join(Customer c, Order o) {
                return new ShippingPriorityItem(o.getOrderKey(), 0.0, o.getOrderdate(), o.getShippriority());
            }
        });

    // Join the last join result with Lineitems
    DataSet<ShippingPriorityItem> result = customerWithOrders.join(lineitems).where(0).equalTo(0)
        .with(new JoinFunction<ShippingPriorityItem, Lineitem, ShippingPriorityItem>() {
            @Override
            public ShippingPriorityItem join(ShippingPriorityItem i, Lineitem l) {
                i.setRevenue(l.getExtendedprice() * (1 - l.getDiscount()));
                return i;
            }
        })
        .groupBy(0, 2, 3)
        .aggregate(Aggregations.SUM, 1);

    // emit result
    if (params.has("output")) {
        result.writeAsCsv(params.get("output"), "\n", "|");
        // execute program
        env.execute("TPCH Query 3 Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
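The two date filters parse their cutoff in a field initializer of the anonymous class; this compiles because an anonymous class's implicit constructor may throw the checked ParseException, which main in turn declares. A common alternative, sketched here under the assumption that the Order POJO is unchanged, is a RichFilterFunction that builds its (non-thread-safe) SimpleDateFormat in open(), once per parallel task:

// Sketch of the same order-date filter as a RichFilterFunction. The
// transient fields are rebuilt in open() on each worker, so the
// SimpleDateFormat never has to be serialized with the function.
public static class OrderDateFilter extends RichFilterFunction<Order> {
    private transient DateFormat format;
    private transient Date cutoff;

    @Override
    public void open(Configuration parameters) throws Exception {
        format = new SimpleDateFormat("yyyy-MM-dd");
        cutoff = format.parse("1995-03-12");
    }

    @Override
    public boolean filter(Order o) throws Exception {
        return format.parse(o.getOrderdate()).before(cutoff);
    }
}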
Use of org.apache.flink.api.common.functions.FilterFunction in project flink by apache.
The class TPCHQuery10, method main.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    if (!params.has("customer") && !params.has("orders") && !params.has("lineitem") && !params.has("nation")) {
        System.err.println(" This program expects data from the TPC-H benchmark as input data.");
        System.err.println(" Due to legal restrictions, we cannot ship generated data.");
        System.err.println(" You can find the TPC-H data generator at http://www.tpc.org/tpch/.");
        System.err.println(" Usage: TPCHQuery10 --customer <path> --orders <path> --lineitem <path> --nation <path> [--output <path>]");
        return;
    }

    // get customer data set: (custkey, name, address, nationkey, acctbal)
    DataSet<Tuple5<Integer, String, String, Integer, Double>> customers = getCustomerDataSet(env, params.get("customer"));

    // get orders data set: (orderkey, custkey, orderdate)
    DataSet<Tuple3<Integer, Integer, String>> orders = getOrdersDataSet(env, params.get("orders"));

    // get lineitem data set: (orderkey, extendedprice, discount, returnflag)
    DataSet<Tuple4<Integer, Double, Double, String>> lineitems = getLineitemDataSet(env, params.get("lineitem"));

    // get nation data set: (nationkey, name)
    DataSet<Tuple2<Integer, String>> nations = getNationsDataSet(env, params.get("nation"));

    // orders filtered by year: (orderkey, custkey)
    DataSet<Tuple2<Integer, Integer>> ordersFilteredByYear =
        orders.filter(new FilterFunction<Tuple3<Integer, Integer, String>>() {
            @Override
            public boolean filter(Tuple3<Integer, Integer, String> o) {
                return Integer.parseInt(o.f2.substring(0, 4)) > 1990;
            }
        })
        .project(0, 1);

    // lineitems filtered by flag, with revenue per item: (orderkey, revenue)
    DataSet<Tuple2<Integer, Double>> lineitemsFilteredByFlag =
        lineitems.filter(new FilterFunction<Tuple4<Integer, Double, Double, String>>() {
            @Override
            public boolean filter(Tuple4<Integer, Double, Double, String> l) {
                return l.f3.equals("R");
            }
        })
        .map(new MapFunction<Tuple4<Integer, Double, Double, String>, Tuple2<Integer, Double>>() {
            @Override
            public Tuple2<Integer, Double> map(Tuple4<Integer, Double, Double, String> l) {
                // revenue per item = l_extendedprice * (1 - l_discount)
                return new Tuple2<Integer, Double>(l.f0, l.f1 * (1 - l.f2));
            }
        });

    // join orders with lineitems and sum the revenue per customer: (custkey, revenue)
    DataSet<Tuple2<Integer, Double>> revenueByCustomer =
        ordersFilteredByYear.joinWithHuge(lineitemsFilteredByFlag)
            .where(0).equalTo(0)
            .projectFirst(1).projectSecond(1);
    revenueByCustomer = revenueByCustomer.groupBy(0).aggregate(Aggregations.SUM, 1);

    // join customer with nation: (custkey, name, address, nationname, acctbal)
    DataSet<Tuple5<Integer, String, String, String, Double>> customerWithNation =
        customers.joinWithTiny(nations)
            .where(3).equalTo(0)
            .projectFirst(0, 1, 2).projectSecond(1).projectFirst(4);

    // join customer (with nation) with revenue: (custkey, name, address, nationname, acctbal, revenue)
    DataSet<Tuple6<Integer, String, String, String, Double, Double>> result =
        customerWithNation.join(revenueByCustomer)
            .where(0).equalTo(0)
            .projectFirst(0, 1, 2, 3, 4).projectSecond(1);

    // emit result
    if (params.has("output")) {
        result.writeAsCsv(params.get("output"), "\n", "|");
        // execute program
        env.execute("TPCH Query 10 Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
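joinWithHuge and joinWithTiny are optimizer size hints. They can also be written as explicit join strategies; a sketch, assuming joinWithTiny corresponds to JoinHint.BROADCAST_HASH_SECOND and joinWithHuge to JoinHint.BROADCAST_HASH_FIRST as in the DataSet API:

// Sketch: the two joins above with explicit JoinHint constants instead of
// the joinWithTiny/joinWithHuge convenience methods.
customers.join(nations, JoinHint.BROADCAST_HASH_SECOND)
    .where(3).equalTo(0)
    .projectFirst(0, 1, 2).projectSecond(1).projectFirst(4);
ordersFilteredByYear.join(lineitemsFilteredByFlag, JoinHint.BROADCAST_HASH_FIRST)
    .where(0).equalTo(0)
    .projectFirst(1).projectSecond(1);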
Use of org.apache.flink.api.common.functions.FilterFunction in project flink by apache.
The class MusicProfiles, method main.
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    /**
     * Read the user-song-play triplets.
     */
    DataSet<Tuple3<String, String, Integer>> triplets = getUserSongTripletsData(env);

    /**
     * Read the mismatches dataset and extract the songIDs.
     */
    DataSet<Tuple1<String>> mismatches = getMismatchesData(env).map(new ExtractMismatchSongIds());

    /**
     * Filter out the mismatches from the triplets dataset.
     */
    DataSet<Tuple3<String, String, Integer>> validTriplets =
        triplets.coGroup(mismatches).where(1).equalTo(0).with(new FilterOutMismatches());

    /**
     * Create a user -> song weighted bipartite graph where the edge weights
     * correspond to play counts.
     */
    Graph<String, NullValue, Integer> userSongGraph = Graph.fromTupleDataSet(validTriplets, env);

    /**
     * Get the top track (most listened to) for each user.
     */
    DataSet<Tuple2<String, String>> usersWithTopTrack =
        userSongGraph.groupReduceOnEdges(new GetTopSongPerUser(), EdgeDirection.OUT).filter(new FilterSongNodes());

    if (fileOutput) {
        usersWithTopTrack.writeAsCsv(topTracksOutputPath, "\n", "\t");
    } else {
        usersWithTopTrack.print();
    }

    /**
     * Create a user-user similarity graph based on common songs: two users
     * that listen to the same song are connected. For each song, we create
     * an edge between each pair of its in-neighbors.
     */
    DataSet<Edge<String, NullValue>> similarUsers = userSongGraph.getEdges()
        .filter(new FilterFunction<Edge<String, Integer>>() {
            @Override
            public boolean filter(Edge<String, Integer> edge) {
                return (edge.getValue() > playcountThreshold);
            }
        })
        .groupBy(1)
        .reduceGroup(new CreateSimilarUserEdges())
        .distinct();

    Graph<String, Long, NullValue> similarUsersGraph = Graph.fromDataSet(similarUsers, new MapFunction<String, Long>() {
        @Override
        public Long map(String value) {
            return 1L;
        }
    }, env).getUndirected();

    /**
     * Detect user communities using the label propagation library method.
     */
    // Initialize each vertex with a unique numeric label and run the label propagation algorithm
    DataSet<Tuple2<String, Long>> idsWithInitialLabels = DataSetUtils.zipWithUniqueId(similarUsersGraph.getVertexIds())
        .map(new MapFunction<Tuple2<Long, String>, Tuple2<String, Long>>() {
            @Override
            public Tuple2<String, Long> map(Tuple2<Long, String> tuple2) throws Exception {
                return new Tuple2<String, Long>(tuple2.f1, tuple2.f0);
            }
        });

    DataSet<Vertex<String, Long>> verticesWithCommunity = similarUsersGraph
        .joinWithVertices(idsWithInitialLabels, new VertexJoinFunction<Long, Long>() {
            @Override
            public Long vertexJoin(Long vertexValue, Long inputValue) {
                return inputValue;
            }
        })
        .run(new LabelPropagation<String, Long, NullValue>(maxIterations));

    if (fileOutput) {
        verticesWithCommunity.writeAsCsv(communitiesOutputPath, "\n", "\t");
        // since file sinks are lazy, we trigger the execution explicitly
        env.execute();
    } else {
        verticesWithCommunity.print();
    }
}
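The playcount filter above works on the edge DataSet directly. Gelly's Graph also exposes filterOnEdges, which applies the same FilterFunction while keeping the graph's vertex set intact; a minimal sketch against the userSongGraph from this example:

// Sketch: the same playcount threshold via Graph#filterOnEdges, which
// returns a filtered Graph rather than a plain DataSet of edges.
Graph<String, NullValue, Integer> frequentlyPlayed =
    userSongGraph.filterOnEdges(new FilterFunction<Edge<String, Integer>>() {
        @Override
        public boolean filter(Edge<String, Integer> edge) {
            return edge.getValue() > playcountThreshold;
        }
    });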