Use of org.apache.flink.api.java.DataSet in project flink by apache.
The class OperatorTranslation, method translateTwoInputOperator.
private <I1, I2, O> org.apache.flink.api.common.operators.Operator<O> translateTwoInputOperator(TwoInputOperator<?, ?, ?, ?> op) {
    @SuppressWarnings("unchecked")
    TwoInputOperator<I1, I2, O, ?> typedOp = (TwoInputOperator<I1, I2, O, ?>) op;
    @SuppressWarnings("unchecked")
    DataSet<I1> typedInput1 = (DataSet<I1>) op.getInput1();
    @SuppressWarnings("unchecked")
    DataSet<I2> typedInput2 = (DataSet<I2>) op.getInput2();

    Operator<I1> input1 = translate(typedInput1);
    Operator<I2> input2 = translate(typedInput2);

    org.apache.flink.api.common.operators.Operator<O> dataFlowOp = typedOp.translateToDataFlow(input1, input2);

    if (op instanceof UdfOperator<?>) {
        @SuppressWarnings("unchecked")
        TwoInputUdfOperator<I1, I2, O, ?> udfOp = (TwoInputUdfOperator<I1, I2, O, ?>) op;

        // set configuration parameters
        Configuration opParams = udfOp.getParameters();
        if (opParams != null) {
            dataFlowOp.getParameters().addAll(opParams);
        }

        if (dataFlowOp instanceof org.apache.flink.api.common.operators.DualInputOperator) {
            org.apache.flink.api.common.operators.DualInputOperator<?, ?, O, ?> binaryOp =
                (org.apache.flink.api.common.operators.DualInputOperator<?, ?, O, ?>) dataFlowOp;
            // set the semantic properties
            binaryOp.setSemanticProperties(udfOp.getSemanticProperties());
        }
    }

    return dataFlowOp;
}
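For context, the parameters copied by this translation are the ones a user attaches to a two-input UDF operator, for example a join. The following is a minimal, self-contained sketch of that API path; the class name, configuration key, and join function are hypothetical and only illustrate how a Configuration set via withParameters() becomes the value returned by udfOp.getParameters() above.

import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;

public class TwoInputParametersSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Tuple2<Long, String>> left = env.fromElements(new Tuple2<>(1L, "a"), new Tuple2<>(2L, "b"));
        DataSet<Tuple2<Long, Integer>> right = env.fromElements(new Tuple2<>(1L, 7), new Tuple2<>(2L, 9));

        // hypothetical configuration key, used only to illustrate parameter forwarding
        Configuration joinParameters = new Configuration();
        joinParameters.setInteger("example.join.flag", 42);

        DataSet<String> joined = left.join(right)
            .where(0)
            .equalTo(0)
            .with(new JoinFunction<Tuple2<Long, String>, Tuple2<Long, Integer>, String>() {
                @Override
                public String join(Tuple2<Long, String> l, Tuple2<Long, Integer> r) {
                    return l.f1 + ":" + r.f1;
                }
            })
            // the join is a TwoInputUdfOperator; this Configuration is what
            // udfOp.getParameters() returns during plan translation
            .withParameters(joinParameters);

        joined.print();
    }
}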
Use of org.apache.flink.api.java.DataSet in project flink by apache.
The class WordCount, method main.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }

    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // get input data
    DataSet<String> text = getTextDataSet(env);

    DataSet<Tuple2<String, Integer>> counts =
        // normalize and split each line
        text.map(line -> line.toLowerCase().split("\\W+"))
            .flatMap((String[] tokens, Collector<Tuple2<String, Integer>> out) -> {
                // emit the pairs with non-zero-length words
                Arrays.stream(tokens).filter(t -> t.length() > 0).forEach(t -> out.collect(new Tuple2<>(t, 1)));
            })
            .groupBy(0)
            .sum(1);

    // emit result
    if (fileOutput) {
        counts.writeAsCsv(outputPath, "\n", " ");
    } else {
        counts.print();
    }

    // execute program
    env.execute("WordCount Example");
}
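The main method above relies on the static fileOutput flag and on the parseParameters and getTextDataSet helpers defined elsewhere in the example class, which are not shown in this excerpt. A plausible sketch of those helpers, assuming the usual pattern of the bundled Flink examples (fall back to built-in data when no paths are given), looks like this; the default sentences are placeholders, not the example's actual data.

private static boolean fileOutput = false;
private static String textPath;
private static String outputPath;

private static boolean parseParameters(String[] args) {
    if (args.length > 0) {
        fileOutput = true;
        if (args.length == 2) {
            textPath = args[0];
            outputPath = args[1];
            return true;
        }
        System.err.println("Usage: WordCount <text path> <result path>");
        return false;
    }
    System.out.println("Executing WordCount example with built-in default data.");
    System.out.println("  Usage: WordCount <text path> <result path>");
    return true;
}

private static DataSet<String> getTextDataSet(ExecutionEnvironment env) {
    if (fileOutput) {
        // read the text file from the given input path
        return env.readTextFile(textPath);
    }
    // use a small built-in default data set (placeholder lines)
    return env.fromElements(
        "To be, or not to be, that is the question",
        "Whether 'tis nobler in the mind to suffer");
}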
Use of org.apache.flink.api.java.DataSet in project flink by apache.
The class TriangleListing, method main.
public static void main(String[] args) throws Exception {
    // Set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableObjectReuse();

    ParameterTool parameters = ParameterTool.fromArgs(args);
    env.getConfig().setGlobalJobParameters(parameters);

    if (!parameters.has("directed")) {
        throw new ProgramParametrizationException(getUsage("must declare execution mode as '--directed true' or '--directed false'"));
    }
    boolean directedAlgorithm = parameters.getBoolean("directed");

    int little_parallelism = parameters.getInt("little_parallelism", PARALLELISM_DEFAULT);
    boolean triadic_census = parameters.getBoolean("triadic_census", DEFAULT_TRIADIC_CENSUS);

    GraphAnalytic tc = null;
    DataSet tl;

    switch (parameters.get("input", "")) {
        case "csv": {
            String lineDelimiter = StringEscapeUtils.unescapeJava(
                parameters.get("input_line_delimiter", CsvOutputFormat.DEFAULT_LINE_DELIMITER));
            String fieldDelimiter = StringEscapeUtils.unescapeJava(
                parameters.get("input_field_delimiter", CsvOutputFormat.DEFAULT_FIELD_DELIMITER));

            GraphCsvReader reader = Graph.fromCsvReader(parameters.getRequired("input_filename"), env)
                .ignoreCommentsEdges("#")
                .lineDelimiterEdges(lineDelimiter)
                .fieldDelimiterEdges(fieldDelimiter);

            switch (parameters.get("type", "")) {
                case "integer": {
                    Graph<LongValue, NullValue, NullValue> graph = reader.keyType(LongValue.class);

                    if (directedAlgorithm) {
                        if (parameters.getBoolean("simplify", false)) {
                            graph = graph.run(new org.apache.flink.graph.asm.simple.directed.Simplify<LongValue, NullValue, NullValue>()
                                .setParallelism(little_parallelism));
                        }
                        if (triadic_census) {
                            tc = graph.run(new org.apache.flink.graph.library.clustering.directed.TriadicCensus<LongValue, NullValue, NullValue>()
                                .setLittleParallelism(little_parallelism));
                        }
                        tl = graph.run(new org.apache.flink.graph.library.clustering.directed.TriangleListing<LongValue, NullValue, NullValue>()
                            .setLittleParallelism(little_parallelism));
                    } else {
                        if (parameters.getBoolean("simplify", false)) {
                            graph = graph.run(new org.apache.flink.graph.asm.simple.undirected.Simplify<LongValue, NullValue, NullValue>(false)
                                .setParallelism(little_parallelism));
                        }
                        if (triadic_census) {
                            tc = graph.run(new org.apache.flink.graph.library.clustering.undirected.TriadicCensus<LongValue, NullValue, NullValue>()
                                .setLittleParallelism(little_parallelism));
                        }
                        tl = graph.run(new org.apache.flink.graph.library.clustering.undirected.TriangleListing<LongValue, NullValue, NullValue>()
                            .setLittleParallelism(little_parallelism));
                    }
                }
                break;

                case "string": {
                    Graph<StringValue, NullValue, NullValue> graph = reader.keyType(StringValue.class);

                    if (directedAlgorithm) {
                        if (parameters.getBoolean("simplify", false)) {
                            graph = graph.run(new org.apache.flink.graph.asm.simple.directed.Simplify<StringValue, NullValue, NullValue>()
                                .setParallelism(little_parallelism));
                        }
                        if (triadic_census) {
                            tc = graph.run(new org.apache.flink.graph.library.clustering.directed.TriadicCensus<StringValue, NullValue, NullValue>()
                                .setLittleParallelism(little_parallelism));
                        }
                        tl = graph.run(new org.apache.flink.graph.library.clustering.directed.TriangleListing<StringValue, NullValue, NullValue>()
                            .setLittleParallelism(little_parallelism));
                    } else {
                        if (parameters.getBoolean("simplify", false)) {
                            graph = graph.run(new org.apache.flink.graph.asm.simple.undirected.Simplify<StringValue, NullValue, NullValue>(false)
                                .setParallelism(little_parallelism));
                        }
                        if (triadic_census) {
                            tc = graph.run(new org.apache.flink.graph.library.clustering.undirected.TriadicCensus<StringValue, NullValue, NullValue>()
                                .setLittleParallelism(little_parallelism));
                        }
                        tl = graph.run(new org.apache.flink.graph.library.clustering.undirected.TriangleListing<StringValue, NullValue, NullValue>()
                            .setLittleParallelism(little_parallelism));
                    }
                }
                break;

                default:
                    throw new ProgramParametrizationException(getUsage("invalid CSV type"));
            }
        }
        break;

        case "rmat": {
            int scale = parameters.getInt("scale", DEFAULT_SCALE);
            int edgeFactor = parameters.getInt("edge_factor", DEFAULT_EDGE_FACTOR);

            RandomGenerableFactory<JDKRandomGenerator> rnd = new JDKRandomGeneratorFactory();

            long vertexCount = 1L << scale;
            long edgeCount = vertexCount * edgeFactor;

            Graph<LongValue, NullValue, NullValue> graph = new RMatGraph<>(env, rnd, vertexCount, edgeCount).generate();

            if (directedAlgorithm) {
                if (scale > 32) {
                    Graph<LongValue, NullValue, NullValue> simpleGraph = graph
                        .run(new org.apache.flink.graph.asm.simple.directed.Simplify<LongValue, NullValue, NullValue>()
                            .setParallelism(little_parallelism));
                    if (triadic_census) {
                        tc = simpleGraph.run(new org.apache.flink.graph.library.clustering.directed.TriadicCensus<LongValue, NullValue, NullValue>()
                            .setLittleParallelism(little_parallelism));
                    }
                    tl = simpleGraph.run(new org.apache.flink.graph.library.clustering.directed.TriangleListing<LongValue, NullValue, NullValue>()
                        .setLittleParallelism(little_parallelism));
                } else {
                    // for smaller scales, translate IDs to 32-bit integers before simplifying,
                    // mirroring the undirected branch below
                    Graph<IntValue, NullValue, NullValue> simpleGraph = graph
                        .run(new TranslateGraphIds<LongValue, IntValue, NullValue, NullValue>(new LongValueToUnsignedIntValue())
                            .setParallelism(little_parallelism))
                        .run(new org.apache.flink.graph.asm.simple.directed.Simplify<IntValue, NullValue, NullValue>()
                            .setParallelism(little_parallelism));
                    if (triadic_census) {
                        tc = simpleGraph.run(new org.apache.flink.graph.library.clustering.directed.TriadicCensus<IntValue, NullValue, NullValue>()
                            .setLittleParallelism(little_parallelism));
                    }
                    tl = simpleGraph.run(new org.apache.flink.graph.library.clustering.directed.TriangleListing<IntValue, NullValue, NullValue>()
                        .setLittleParallelism(little_parallelism));
                }
            } else {
                boolean clipAndFlip = parameters.getBoolean("clip_and_flip", DEFAULT_CLIP_AND_FLIP);

                if (scale > 32) {
                    Graph<LongValue, NullValue, NullValue> simpleGraph = graph
                        .run(new org.apache.flink.graph.asm.simple.undirected.Simplify<LongValue, NullValue, NullValue>(clipAndFlip)
                            .setParallelism(little_parallelism));
                    if (triadic_census) {
                        tc = simpleGraph.run(new org.apache.flink.graph.library.clustering.undirected.TriadicCensus<LongValue, NullValue, NullValue>()
                            .setLittleParallelism(little_parallelism));
                    }
                    tl = simpleGraph.run(new org.apache.flink.graph.library.clustering.undirected.TriangleListing<LongValue, NullValue, NullValue>()
                        .setLittleParallelism(little_parallelism));
                } else {
                    Graph<IntValue, NullValue, NullValue> simpleGraph = graph
                        .run(new TranslateGraphIds<LongValue, IntValue, NullValue, NullValue>(new LongValueToUnsignedIntValue())
                            .setParallelism(little_parallelism))
                        .run(new org.apache.flink.graph.asm.simple.undirected.Simplify<IntValue, NullValue, NullValue>(clipAndFlip)
                            .setParallelism(little_parallelism));
                    if (triadic_census) {
                        tc = simpleGraph.run(new org.apache.flink.graph.library.clustering.undirected.TriadicCensus<IntValue, NullValue, NullValue>()
                            .setLittleParallelism(little_parallelism));
                    }
                    tl = simpleGraph.run(new org.apache.flink.graph.library.clustering.undirected.TriangleListing<IntValue, NullValue, NullValue>()
                        .setLittleParallelism(little_parallelism));
                }
            }
        }
        break;

        default:
            throw new ProgramParametrizationException(getUsage("invalid input type"));
    }

    switch (parameters.get("output", "")) {
        case "print":
            System.out.println();
            if (directedAlgorithm) {
                for (Object e : tl.collect()) {
                    org.apache.flink.graph.library.clustering.directed.TriangleListing.Result result =
                        (org.apache.flink.graph.library.clustering.directed.TriangleListing.Result) e;
                    System.out.println(result.toPrintableString());
                }
            } else {
                tl.print();
            }
            break;

        case "hash":
            System.out.println();
            System.out.println(DataSetUtils.checksumHashCode(tl));
            break;

        case "csv":
            String filename = parameters.getRequired("output_filename");
            String lineDelimiter = StringEscapeUtils.unescapeJava(
                parameters.get("output_line_delimiter", CsvOutputFormat.DEFAULT_LINE_DELIMITER));
            String fieldDelimiter = StringEscapeUtils.unescapeJava(
                parameters.get("output_field_delimiter", CsvOutputFormat.DEFAULT_FIELD_DELIMITER));
            tl.writeAsCsv(filename, lineDelimiter, fieldDelimiter);
            env.execute();
            break;

        default:
            throw new ProgramParametrizationException(getUsage("invalid output type"));
    }

    if (tc != null) {
        System.out.print("Triadic census:\n ");
        System.out.println(tc.getResult().toString().replace(";", "\n "));
    }

    JobExecutionResult result = env.getLastJobExecutionResult();

    NumberFormat nf = NumberFormat.getInstance();
    System.out.println();
    System.out.println("Execution runtime: " + nf.format(result.getNetRuntime()) + " ms");
}
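The driver above mostly deals with parameter parsing and input selection; the core library call it wraps is graph.run(new TriangleListing<...>()). The following is a minimal, self-contained sketch of that call on a tiny hard-coded graph, assuming the Gelly undirected TriangleListing algorithm with LongValue vertex IDs; the class name is hypothetical.

import java.util.Arrays;

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.graph.Edge;
import org.apache.flink.graph.Graph;
import org.apache.flink.types.LongValue;
import org.apache.flink.types.NullValue;

public class TriangleListingSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // three undirected edges forming a single triangle: 1-2, 2-3, 1-3
        DataSet<Edge<LongValue, NullValue>> edges = env.fromCollection(Arrays.asList(
            new Edge<>(new LongValue(1), new LongValue(2), NullValue.getInstance()),
            new Edge<>(new LongValue(2), new LongValue(3), NullValue.getInstance()),
            new Edge<>(new LongValue(1), new LongValue(3), NullValue.getInstance())));

        Graph<LongValue, NullValue, NullValue> graph = Graph.fromDataSet(edges, env);

        // list all triangles; printing the result DataSet triggers execution
        DataSet tl = graph.run(
            new org.apache.flink.graph.library.clustering.undirected.TriangleListing<LongValue, NullValue, NullValue>());

        tl.print();
    }
}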
Use of org.apache.flink.api.java.DataSet in project flink by apache.
The class PregelCompilerTest, method testPregelCompiler.
@SuppressWarnings("serial")
@Test
public void testPregelCompiler() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(DEFAULT_PARALLELISM);
        // compose test program
        {
            DataSet<Vertex<Long, Long>> initialVertices = env
                .fromElements(new Tuple2<>(1L, 1L), new Tuple2<>(2L, 2L))
                .map(new Tuple2ToVertexMap<Long, Long>());

            DataSet<Edge<Long, NullValue>> edges = env
                .fromElements(new Tuple2<>(1L, 2L))
                .map(new MapFunction<Tuple2<Long, Long>, Edge<Long, NullValue>>() {

                    public Edge<Long, NullValue> map(Tuple2<Long, Long> edge) {
                        return new Edge<>(edge.f0, edge.f1, NullValue.getInstance());
                    }
                });

            Graph<Long, Long, NullValue> graph = Graph.fromDataSet(initialVertices, edges, env);

            DataSet<Vertex<Long, Long>> result = graph.runVertexCentricIteration(new CCCompute(), null, 100).getVertices();

            result.output(new DiscardingOutputFormat<Vertex<Long, Long>>());
        }

        Plan p = env.createProgramPlan("Pregel Connected Components");
        OptimizedPlan op = compileNoStats(p);

        // check the sink
        SinkPlanNode sink = op.getDataSinks().iterator().next();
        assertEquals(ShipStrategyType.FORWARD, sink.getInput().getShipStrategy());
        assertEquals(DEFAULT_PARALLELISM, sink.getParallelism());

        // check the iteration
        WorksetIterationPlanNode iteration = (WorksetIterationPlanNode) sink.getInput().getSource();
        assertEquals(DEFAULT_PARALLELISM, iteration.getParallelism());

        // check the solution set delta
        PlanNode ssDelta = iteration.getSolutionSetDeltaPlanNode();
        assertTrue(ssDelta instanceof SingleInputPlanNode);

        SingleInputPlanNode ssFlatMap = (SingleInputPlanNode) ((SingleInputPlanNode) (ssDelta)).getInput().getSource();
        assertEquals(DEFAULT_PARALLELISM, ssFlatMap.getParallelism());
        assertEquals(ShipStrategyType.FORWARD, ssFlatMap.getInput().getShipStrategy());

        // check the computation coGroup
        DualInputPlanNode computationCoGroup = (DualInputPlanNode) (ssFlatMap.getInput().getSource());
        assertEquals(DEFAULT_PARALLELISM, computationCoGroup.getParallelism());
        assertEquals(ShipStrategyType.FORWARD, computationCoGroup.getInput1().getShipStrategy());
        assertEquals(ShipStrategyType.PARTITION_HASH, computationCoGroup.getInput2().getShipStrategy());
        assertTrue(computationCoGroup.getInput2().getTempMode().isCached());
        assertEquals(new FieldList(0), computationCoGroup.getInput2().getShipStrategyKeys());

        // check that the initial partitioning is pushed out of the loop
        assertEquals(ShipStrategyType.PARTITION_HASH, iteration.getInput1().getShipStrategy());
        assertEquals(new FieldList(0), iteration.getInput1().getShipStrategyKeys());
    } catch (Exception e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
        fail(e.getMessage());
    }
}
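The CCCompute function passed to runVertexCentricIteration above is an inner class of the test and is not shown in this excerpt. A minimal sketch of a connected-components ComputeFunction with that shape, assuming the Gelly Pregel API (org.apache.flink.graph.pregel), follows; the test's own implementation may differ in detail.

import org.apache.flink.graph.Vertex;
import org.apache.flink.graph.pregel.ComputeFunction;
import org.apache.flink.graph.pregel.MessageIterator;
import org.apache.flink.types.NullValue;

@SuppressWarnings("serial")
public final class CCCompute extends ComputeFunction<Long, Long, NullValue, Long> {

    @Override
    public void compute(Vertex<Long, Long> vertex, MessageIterator<Long> messages) {
        // take the minimum component ID among the current value and all incoming messages
        long currentComponent = vertex.getValue();
        for (Long msg : messages) {
            currentComponent = Math.min(currentComponent, msg);
        }
        // propagate only in the first superstep or when the value improved
        if ((getSuperstepNumber() == 1) || (currentComponent < vertex.getValue())) {
            setNewVertexValue(currentComponent);
            sendMessageToAllNeighbors(currentComponent);
        }
    }
}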
Use of org.apache.flink.api.java.DataSet in project flink by apache.
The class PregelCompilerTest, method testPregelWithCombiner.
@SuppressWarnings("serial")
@Test
public void testPregelWithCombiner() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(DEFAULT_PARALLELISM);
        // compose test program
        {
            DataSet<Vertex<Long, Long>> initialVertices = env
                .fromElements(new Tuple2<>(1L, 1L), new Tuple2<>(2L, 2L))
                .map(new Tuple2ToVertexMap<Long, Long>());

            DataSet<Edge<Long, NullValue>> edges = env
                .fromElements(new Tuple2<>(1L, 2L))
                .map(new MapFunction<Tuple2<Long, Long>, Edge<Long, NullValue>>() {

                    public Edge<Long, NullValue> map(Tuple2<Long, Long> edge) {
                        return new Edge<>(edge.f0, edge.f1, NullValue.getInstance());
                    }
                });

            Graph<Long, Long, NullValue> graph = Graph.fromDataSet(initialVertices, edges, env);

            DataSet<Vertex<Long, Long>> result = graph.runVertexCentricIteration(new CCCompute(), new CCCombiner(), 100).getVertices();

            result.output(new DiscardingOutputFormat<Vertex<Long, Long>>());
        }

        Plan p = env.createProgramPlan("Pregel Connected Components");
        OptimizedPlan op = compileNoStats(p);

        // check the sink
        SinkPlanNode sink = op.getDataSinks().iterator().next();
        assertEquals(ShipStrategyType.FORWARD, sink.getInput().getShipStrategy());
        assertEquals(DEFAULT_PARALLELISM, sink.getParallelism());

        // check the iteration
        WorksetIterationPlanNode iteration = (WorksetIterationPlanNode) sink.getInput().getSource();
        assertEquals(DEFAULT_PARALLELISM, iteration.getParallelism());

        // check the combiner
        SingleInputPlanNode combiner = (SingleInputPlanNode) iteration.getInput2().getSource();
        assertEquals(ShipStrategyType.FORWARD, combiner.getInput().getShipStrategy());

        // check the solution set delta
        PlanNode ssDelta = iteration.getSolutionSetDeltaPlanNode();
        assertTrue(ssDelta instanceof SingleInputPlanNode);

        SingleInputPlanNode ssFlatMap = (SingleInputPlanNode) ((SingleInputPlanNode) (ssDelta)).getInput().getSource();
        assertEquals(DEFAULT_PARALLELISM, ssFlatMap.getParallelism());
        assertEquals(ShipStrategyType.FORWARD, ssFlatMap.getInput().getShipStrategy());

        // check the computation coGroup
        DualInputPlanNode computationCoGroup = (DualInputPlanNode) (ssFlatMap.getInput().getSource());
        assertEquals(DEFAULT_PARALLELISM, computationCoGroup.getParallelism());
        assertEquals(ShipStrategyType.FORWARD, computationCoGroup.getInput1().getShipStrategy());
        assertEquals(ShipStrategyType.PARTITION_HASH, computationCoGroup.getInput2().getShipStrategy());
        assertTrue(computationCoGroup.getInput2().getTempMode().isCached());
        assertEquals(new FieldList(0), computationCoGroup.getInput2().getShipStrategyKeys());

        // check that the initial partitioning is pushed out of the loop
        assertEquals(ShipStrategyType.PARTITION_HASH, iteration.getInput1().getShipStrategy());
        assertEquals(new FieldList(0), iteration.getInput1().getShipStrategyKeys());
    } catch (Exception e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
        fail(e.getMessage());
    }
}
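The CCCombiner passed as the second argument to runVertexCentricIteration is likewise defined inside the test and not shown here. A minimal sketch of a min-combiner with the expected shape, assuming the Gelly Pregel MessageCombiner API, follows; the test's own implementation may differ.

import org.apache.flink.graph.pregel.MessageCombiner;
import org.apache.flink.graph.pregel.MessageIterator;

@SuppressWarnings("serial")
public final class CCCombiner extends MessageCombiner<Long, Long> {

    @Override
    public void combineMessages(MessageIterator<Long> messages) {
        // forward only the smallest component ID to each target vertex
        long minMessage = Long.MAX_VALUE;
        for (Long msg : messages) {
            minMessage = Math.min(minMessage, msg);
        }
        sendCombinedMessage(minMessage);
    }
}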