Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The main method of the ConnectedComponents class.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String... args) throws Exception {
// Checking input parameters
final ParameterTool params = ParameterTool.fromArgs(args);
// set up execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
final int maxIterations = params.getInt("iterations", 10);
// make parameters available in the web interface
env.getConfig().setGlobalJobParameters(params);
// read vertex and edge data
DataSet<Long> vertices = getVertexDataSet(env, params);
DataSet<Tuple2<Long, Long>> edges = getEdgeDataSet(env, params).flatMap(new UndirectEdge());
// assign the initial components (equal to the vertex id)
DataSet<Tuple2<Long, Long>> verticesWithInitialId = vertices.map(new DuplicateValue<Long>());
// open a delta iteration
DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration =
        verticesWithInitialId.iterateDelta(verticesWithInitialId, maxIterations, 0);
// apply the step logic: join with the edges, select the minimum neighbor, update if the component of the candidate is smaller
DataSet<Tuple2<Long, Long>> changes = iteration.getWorkset()
        .join(edges).where(0).equalTo(0)
        .with(new NeighborWithComponentIDJoin())
        .groupBy(0).aggregate(Aggregations.MIN, 1)
        .join(iteration.getSolutionSet()).where(0).equalTo(0)
        .with(new ComponentIdFilter());
// close the delta iteration (delta and new workset are identical)
DataSet<Tuple2<Long, Long>> result = iteration.closeWith(changes, changes);
// emit result
if (params.has("output")) {
    result.writeAsCsv(params.get("output"), "\n", " ");
    // execute program
    env.execute("Connected Components Example");
} else {
    System.out.println("Printing result to stdout. Use --output to specify output path.");
    result.print();
}
}
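The snippet relies on helper classes that the listing does not show. A minimal sketch of what DuplicateValue and UndirectEdge could look like, assuming the standard DataSet API (the class names come from the snippet; the bodies are an assumption, not necessarily the project's exact code):

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

// Assumed helper: turns each vertex id into an initial (id, component-id) pair.
public static final class DuplicateValue<T> implements MapFunction<T, Tuple2<T, T>> {

    @Override
    public Tuple2<T, T> map(T vertex) {
        return new Tuple2<>(vertex, vertex);
    }
}

// Assumed helper: emits each edge in both directions so the graph is treated as undirected.
public static final class UndirectEdge implements FlatMapFunction<Tuple2<Long, Long>, Tuple2<Long, Long>> {

    @Override
    public void flatMap(Tuple2<Long, Long> edge, Collector<Tuple2<Long, Long>> out) {
        out.collect(edge);
        out.collect(new Tuple2<>(edge.f1, edge.f0));
    }
}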
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The main method of the WebLogAnalysis class.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
final ParameterTool params = ParameterTool.fromArgs(args);
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
env.getConfig().setGlobalJobParameters(params);
// get input data
DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env, params);
DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env, params);
DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env, params);
// Retain documents with keywords
DataSet<Tuple1<String>> filterDocs = documents.filter(new FilterDocByKeyWords()).project(0);
// Filter ranks by minimum rank
DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks.filter(new FilterByRank());
// Filter visits by visit date
DataSet<Tuple1<String>> filterVisits = visits.filter(new FilterVisitsByDate()).project(0);
// Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords
DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks = filterDocs.join(filterRanks)
        .where(0).equalTo(1)
        .projectSecond(0, 1, 2);
// Anti-join urls with visits, i.e., retain all URLs which have NOT been visited in a certain time
DataSet<Tuple3<Integer, String, Integer>> result = joinDocsRanks.coGroup(filterVisits)
        .where(1).equalTo(0)
        .with(new AntiJoinVisits());
// emit result
if (params.has("output")) {
    result.writeAsCsv(params.get("output"), "\n", "|");
    // execute program
    env.execute("WebLogAnalysis Example");
} else {
    System.out.println("Printing result to stdout. Use --output to specify output path.");
    result.print();
}
}
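AntiJoinVisits is referenced but not shown. Since the comment describes an anti-join, a plausible sketch is a CoGroupFunction that forwards a ranked URL only when the visits side is empty (the body is an assumption, not the project's actual code):

import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.util.Collector;

// Assumed implementation: emit a ranked URL only if it has no matching visit.
public static final class AntiJoinVisits implements
        CoGroupFunction<Tuple3<Integer, String, Integer>, Tuple1<String>, Tuple3<Integer, String, Integer>> {

    @Override
    public void coGroup(Iterable<Tuple3<Integer, String, Integer>> ranks,
            Iterable<Tuple1<String>> visits,
            Collector<Tuple3<Integer, String, Integer>> out) {
        // if there is no visit for this URL, forward all of its rank records
        if (!visits.iterator().hasNext()) {
            for (Tuple3<Integer, String, Integer> rank : ranks) {
                out.collect(rank);
            }
        }
    }
}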
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The main method of the WordCountPojo class.
public static void main(String[] args) throws Exception {
final ParameterTool params = ParameterTool.fromArgs(args);
// set up the execution environment
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// make parameters available in the web interface
env.getConfig().setGlobalJobParameters(params);
// get input data
DataSet<String> text;
if (params.has("input")) {
// read the text file from given input path
text = env.readTextFile(params.get("input"));
} else {
// get default test text data
System.out.println("Executing WordCount example with default input data set.");
System.out.println("Use --input to specify file input.");
text = WordCountData.getDefaultTextLineDataSet(env);
}
// split up the lines into Word objects (with frequency = 1),
// group by the "word" field, and sum the frequencies
DataSet<Word> counts = text.flatMap(new Tokenizer())
        .groupBy("word")
        .reduce(new ReduceFunction<Word>() {

            @Override
            public Word reduce(Word value1, Word value2) throws Exception {
                return new Word(value1.word, value1.frequency + value2.frequency);
            }
        });
if (params.has("output")) {
    counts.writeAsText(params.get("output"), WriteMode.OVERWRITE);
    // execute program
    env.execute("WordCount-Pojo Example");
} else {
    System.out.println("Printing result to stdout. Use --output to specify output path.");
    counts.print();
}
}
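The Word POJO and the Tokenizer are referenced but not shown. A minimal sketch, assuming Flink's POJO rules (public fields and a public no-argument constructor, so that groupBy("word") can address the field by name); the bodies are assumptions:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.util.Collector;

// Assumed POJO: public fields and a no-arg constructor make it a valid Flink POJO.
public static class Word {

    public String word;
    public int frequency;

    public Word() {}

    public Word(String word, int frequency) {
        this.word = word;
        this.frequency = frequency;
    }

    @Override
    public String toString() {
        return "(" + word + ", " + frequency + ")";
    }
}

// Assumed tokenizer: splits lines into lowercase words, each with an initial frequency of 1.
public static final class Tokenizer implements FlatMapFunction<String, Word> {

    @Override
    public void flatMap(String value, Collector<Word> out) {
        for (String token : value.toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Word(token, 1));
            }
        }
    }
}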
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The main method of the AsyncIOExample class.
public static void main(String[] args) throws Exception {
// obtain execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// parse parameters
final ParameterTool params = ParameterTool.fromArgs(args);
final String statePath;
final String cpMode;
final int maxCount;
final long sleepFactor;
final float failRatio;
final String mode;
final int taskNum;
final String timeType;
final long shutdownWaitTS;
final long timeout;
try {
    // check the configuration for the job
    statePath = params.get("fsStatePath", null);
    cpMode = params.get("checkpointMode", "exactly_once");
    maxCount = params.getInt("maxCount", 100000);
    sleepFactor = params.getLong("sleepFactor", 100);
    failRatio = params.getFloat("failRatio", 0.001f);
    mode = params.get("waitMode", "ordered");
    taskNum = params.getInt("waitOperatorParallelism", 1);
    timeType = params.get("eventType", "EventTime");
    shutdownWaitTS = params.getLong("shutdownWaitTS", 20000);
    timeout = params.getLong("timeout", 10000L);
} catch (Exception e) {
    printUsage();
    throw e;
}
StringBuilder configStringBuilder = new StringBuilder();
final String lineSeparator = System.getProperty("line.separator");
configStringBuilder
        .append("Job configuration").append(lineSeparator)
        .append("FS state path=").append(statePath).append(lineSeparator)
        .append("Checkpoint mode=").append(cpMode).append(lineSeparator)
        .append("Max count of input from source=").append(maxCount).append(lineSeparator)
        .append("Sleep factor=").append(sleepFactor).append(lineSeparator)
        .append("Fail ratio=").append(failRatio).append(lineSeparator)
        .append("Waiting mode=").append(mode).append(lineSeparator)
        .append("Parallelism for async wait operator=").append(taskNum).append(lineSeparator)
        .append("Event type=").append(timeType).append(lineSeparator)
        .append("Shutdown wait timestamp=").append(shutdownWaitTS);
LOG.info(configStringBuilder.toString());
if (statePath != null) {
    // set up the filesystem state backend
    env.setStateBackend(new FsStateBackend(statePath));
}
// set the checkpointing mode
if (EXACTLY_ONCE_MODE.equals(cpMode)) {
    env.enableCheckpointing(1000L, CheckpointingMode.EXACTLY_ONCE);
} else {
    env.enableCheckpointing(1000L, CheckpointingMode.AT_LEAST_ONCE);
}
// choose the time characteristic (event time enables watermarks)
if (EVENT_TIME.equals(timeType)) {
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
} else if (INGESTION_TIME.equals(timeType)) {
    env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
}
// create the input stream of integers
DataStream<Integer> inputStream = env.addSource(new SimpleSource(maxCount));
// create the async function, which *waits* for a while to simulate async I/O
AsyncFunction<Integer, String> function = new SampleAsyncFunction(sleepFactor, failRatio, shutdownWaitTS);
// add async operator to streaming job
DataStream<String> result;
if (ORDERED.equals(mode)) {
    result = AsyncDataStream.orderedWait(inputStream, function, timeout, TimeUnit.MILLISECONDS, 20)
            .setParallelism(taskNum);
} else {
    result = AsyncDataStream.unorderedWait(inputStream, function, timeout, TimeUnit.MILLISECONDS, 20)
            .setParallelism(taskNum);
}
// count results per key: map each value to (key, 1) and sum the counts
result.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {

    private static final long serialVersionUID = -938116068682344455L;

    @Override
    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
        out.collect(new Tuple2<>(value, 1));
    }
}).keyBy(0).sum(1).print();
// execute the program
env.execute("Async IO Example");
}
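SampleAsyncFunction is not part of the listing. A minimal sketch of an AsyncFunction with the same shape, assuming the ResultFuture-based API (Flink 1.5+); the thread pool and the way latency and keys are produced here are illustrative assumptions, not the example's actual logic:

import java.util.Collections;
import java.util.concurrent.CompletableFuture;

import org.apache.flink.streaming.api.functions.async.AsyncFunction;
import org.apache.flink.streaming.api.functions.async.ResultFuture;

// Assumed sketch: completes each request asynchronously after a simulated delay.
public static class SampleAsyncFunctionSketch implements AsyncFunction<Integer, String> {

    private final long sleepFactor;

    public SampleAsyncFunctionSketch(long sleepFactor) {
        this.sleepFactor = sleepFactor;
    }

    @Override
    public void asyncInvoke(Integer input, ResultFuture<String> resultFuture) {
        // run the "external" lookup on a separate thread so the operator is not blocked
        CompletableFuture.runAsync(() -> {
            try {
                Thread.sleep(sleepFactor); // simulate async I/O latency
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
            resultFuture.complete(Collections.singletonList("key-" + (input % 10)));
        });
    }
}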
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The main method of the WindowJoin class.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
// parse the parameters
final ParameterTool params = ParameterTool.fromArgs(args);
final long windowSize = params.getLong("windowSize", 2000);
final long rate = params.getLong("rate", 3L);
System.out.println("Using windowSize=" + windowSize + ", data rate=" + rate);
System.out.println("To customize example, use: WindowJoin [--windowSize <window-size-in-millis>] [--rate <elements-per-second>]");
// obtain execution environment, run this example in "ingestion time"
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
// make parameters available in the web interface
env.getConfig().setGlobalJobParameters(params);
// create the data sources for both grades and salaries
DataStream<Tuple2<String, Integer>> grades = GradeSource.getSource(env, rate);
DataStream<Tuple2<String, Integer>> salaries = SalarySource.getSource(env, rate);
// run the actual window join program
// for testability, this functionality is in a separate method.
DataStream<Tuple3<String, Integer, Integer>> joinedStream = runWindowJoin(grades, salaries, windowSize);
// print the results with a single thread, rather than in parallel
joinedStream.print().setParallelism(1);
// execute program
env.execute("Windowed Join Example");
}
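As the comment notes, runWindowJoin lives in a separate method for testability. A sketch of what it might contain, assuming a tumbling window keyed on the name field (the window assigner and the key selector are assumptions consistent with the ingestion-time setup above, which assigns timestamps automatically):

import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

public static DataStream<Tuple3<String, Integer, Integer>> runWindowJoin(
        DataStream<Tuple2<String, Integer>> grades,
        DataStream<Tuple2<String, Integer>> salaries,
        long windowSize) {
    // join grades and salaries on the name field within tumbling windows
    return grades.join(salaries)
            .where(new NameKeySelector())
            .equalTo(new NameKeySelector())
            .window(TumblingEventTimeWindows.of(Time.milliseconds(windowSize)))
            .apply(new JoinFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, Tuple3<String, Integer, Integer>>() {

                @Override
                public Tuple3<String, Integer, Integer> join(Tuple2<String, Integer> grade, Tuple2<String, Integer> salary) {
                    return new Tuple3<>(grade.f0, grade.f1, salary.f1);
                }
            });
}

// Assumed key selector: extracts the name field from each tuple.
private static class NameKeySelector implements KeySelector<Tuple2<String, Integer>, String> {

    @Override
    public String getKey(Tuple2<String, Integer> value) {
        return value.f0;
    }
}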