
Example 11 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in the Apache Flink project.

From the class ConnectedComponents, method main:

// *************************************************************************
//     PROGRAM
// *************************************************************************
public static void main(String... args) throws Exception {
    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final int maxIterations = params.getInt("iterations", 10);
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // read vertex and edge data
    DataSet<Long> vertices = getVertexDataSet(env, params);
    DataSet<Tuple2<Long, Long>> edges = getEdgeDataSet(env, params).flatMap(new UndirectEdge());
    // assign the initial components (equal to the vertex id)
    DataSet<Tuple2<Long, Long>> verticesWithInitialId = vertices.map(new DuplicateValue<Long>());
    // open a delta iteration
    DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration = verticesWithInitialId.iterateDelta(verticesWithInitialId, maxIterations, 0);
    // apply the step logic: join with the edges, select the minimum neighbor, update if the component of the candidate is smaller
    DataSet<Tuple2<Long, Long>> changes = iteration.getWorkset()
            .join(edges).where(0).equalTo(0).with(new NeighborWithComponentIDJoin())
            .groupBy(0).aggregate(Aggregations.MIN, 1)
            .join(iteration.getSolutionSet()).where(0).equalTo(0).with(new ComponentIdFilter());
    // close the delta iteration (delta and new workset are identical)
    DataSet<Tuple2<Long, Long>> result = iteration.closeWith(changes, changes);
    // emit result
    if (params.has("output")) {
        result.writeAsCsv(params.get("output"), "\n", " ");
        // execute program
        env.execute("Connected Components Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Tuple2 (org.apache.flink.api.java.tuple.Tuple2)
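
The user functions referenced above (UndirectEdge, DuplicateValue, NeighborWithComponentIDJoin, ComponentIdFilter) are not part of this excerpt. A minimal sketch of the first two, assuming the semantics stated in the comments: each vertex starts in its own component, and directed input edges are made undirected by emitting both directions.

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

// Turns a vertex ID into an (id, component) pair, with component = id.
final class DuplicateValue<T> implements MapFunction<T, Tuple2<T, T>> {

    @Override
    public Tuple2<T, T> map(T vertex) {
        return new Tuple2<>(vertex, vertex);
    }
}

// Emits every input edge in both directions, making the graph undirected.
final class UndirectEdge implements FlatMapFunction<Tuple2<Long, Long>, Tuple2<Long, Long>> {

    @Override
    public void flatMap(Tuple2<Long, Long> edge, Collector<Tuple2<Long, Long>> out) {
        out.collect(edge);
        out.collect(new Tuple2<>(edge.f1, edge.f0));
    }
}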

Example 12 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in the Apache Flink project.

From the class WebLogAnalysis, method main:

// *************************************************************************
//     PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().setGlobalJobParameters(params);
    // get input data
    DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env, params);
    DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env, params);
    DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env, params);
    // Retain documents with keywords
    DataSet<Tuple1<String>> filterDocs = documents.filter(new FilterDocByKeyWords()).project(0);
    // Filter ranks by minimum rank
    DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks.filter(new FilterByRank());
    // Filter visits by visit date
    DataSet<Tuple1<String>> filterVisits = visits.filter(new FilterVisitsByDate()).project(0);
    // Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords
    DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks = filterDocs.join(filterRanks).where(0).equalTo(1).projectSecond(0, 1, 2);
    // Anti-join URLs with visits, i.e., retain all URLs that have NOT been visited within a certain time period
    DataSet<Tuple3<Integer, String, Integer>> result = joinDocsRanks.coGroup(filterVisits).where(1).equalTo(0).with(new AntiJoinVisits());
    // emit result
    if (params.has("output")) {
        result.writeAsCsv(params.get("output"), "\n", "|");
        // execute program
        env.execute("WebLogAnalysis Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Tuple1 (org.apache.flink.api.java.tuple.Tuple1), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), Tuple3 (org.apache.flink.api.java.tuple.Tuple3)
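
The filter functions are not shown in this excerpt. A minimal sketch of FilterByRank, assuming the rank tuples are laid out as (rank, url, avgDuration) as implied by the join fields above; the threshold is illustrative, and FilterDocByKeyWords and FilterVisitsByDate follow the same FilterFunction pattern.

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.java.tuple.Tuple3;

// Keeps only pages whose rank (field 0) exceeds a minimum rank.
final class FilterByRank implements FilterFunction<Tuple3<Integer, String, Integer>> {

    private static final int MIN_RANK = 40; // illustrative threshold

    @Override
    public boolean filter(Tuple3<Integer, String, Integer> rank) {
        return rank.f0 > MIN_RANK;
    }
}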

Example 13 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in the Apache Flink project.

From the class WordCountPojo, method main:

public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // get input data
    DataSet<String> text;
    if (params.has("input")) {
        // read the text file from given input path
        text = env.readTextFile(params.get("input"));
    } else {
        // get default test text data
        System.out.println("Executing WordCount example with default input data set.");
        System.out.println("Use --input to specify file input.");
        text = WordCountData.getDefaultTextLineDataSet(env);
    }
    // split up the lines into Word objects (with frequency = 1),
    // group by the "word" field, and sum the frequencies
    DataSet<Word> counts = text.flatMap(new Tokenizer()).groupBy("word").reduce(new ReduceFunction<Word>() {

        @Override
        public Word reduce(Word value1, Word value2) throws Exception {
            return new Word(value1.word, value1.frequency + value2.frequency);
        }
    });
    if (params.has("output")) {
        counts.writeAsText(params.get("output"), WriteMode.OVERWRITE);
        // execute program
        env.execute("WordCount-Pojo Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        counts.print();
    }
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment)
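
The Word POJO and the Tokenizer are not included in this excerpt. A minimal sketch consistent with their use above: groupBy("word") requires Word to be a valid Flink POJO (public no-arg constructor plus public fields or getters/setters); the tokenization pattern is an assumption.

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.util.Collector;

// A Flink POJO: public fields and a no-arg constructor allow
// grouping by field name ("word") without a key selector.
public class Word {

    public String word;
    public int frequency;

    public Word() {}

    public Word(String word, int frequency) {
        this.word = word;
        this.frequency = frequency;
    }

    @Override
    public String toString() {
        return word + " " + frequency;
    }
}

// Splits each line into lower-case tokens and emits one Word per token.
final class Tokenizer implements FlatMapFunction<String, Word> {

    @Override
    public void flatMap(String line, Collector<Word> out) {
        for (String token : line.toLowerCase().split("\\W+")) {
            if (!token.isEmpty()) {
                out.collect(new Word(token, 1));
            }
        }
    }
}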

Example 14 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in the Apache Flink project.

From the class AsyncIOExample, method main:

public static void main(String[] args) throws Exception {
    // obtain execution environment
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // parse parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    final String statePath;
    final String cpMode;
    final int maxCount;
    final long sleepFactor;
    final float failRatio;
    final String mode;
    final int taskNum;
    final String timeType;
    final long shutdownWaitTS;
    final long timeout;
    try {
        // check the configuration for the job
        statePath = params.get("fsStatePath", null);
        cpMode = params.get("checkpointMode", "exactly_once");
        maxCount = params.getInt("maxCount", 100000);
        sleepFactor = params.getLong("sleepFactor", 100);
        failRatio = params.getFloat("failRatio", 0.001f);
        mode = params.get("waitMode", "ordered");
        taskNum = params.getInt("waitOperatorParallelism", 1);
        timeType = params.get("eventType", "EventTime");
        shutdownWaitTS = params.getLong("shutdownWaitTS", 20000);
        timeout = params.getLong("timeout", 10000L);
    } catch (Exception e) {
        printUsage();
        throw e;
    }
    StringBuilder configStringBuilder = new StringBuilder();
    final String lineSeparator = System.getProperty("line.separator");
    configStringBuilder
            .append("Job configuration").append(lineSeparator)
            .append("FS state path=").append(statePath).append(lineSeparator)
            .append("Checkpoint mode=").append(cpMode).append(lineSeparator)
            .append("Max count of input from source=").append(maxCount).append(lineSeparator)
            .append("Sleep factor=").append(sleepFactor).append(lineSeparator)
            .append("Fail ratio=").append(failRatio).append(lineSeparator)
            .append("Waiting mode=").append(mode).append(lineSeparator)
            .append("Parallelism for async wait operator=").append(taskNum).append(lineSeparator)
            .append("Event type=").append(timeType).append(lineSeparator)
            .append("Shutdown wait timestamp=").append(shutdownWaitTS);
    LOG.info(configStringBuilder.toString());
    if (statePath != null) {
        // setup state and checkpoint mode
        env.setStateBackend(new FsStateBackend(statePath));
    }
    if (EXACTLY_ONCE_MODE.equals(cpMode)) {
        env.enableCheckpointing(1000L, CheckpointingMode.EXACTLY_ONCE);
    } else {
        env.enableCheckpointing(1000L, CheckpointingMode.AT_LEAST_ONCE);
    }
    // set the time characteristic, which determines whether watermarks are generated
    if (EVENT_TIME.equals(timeType)) {
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    } else if (INGESTION_TIME.equals(timeType)) {
        env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
    }
    // create the input stream of integers
    DataStream<Integer> inputStream = env.addSource(new SimpleSource(maxCount));
    // create the async function, which will *wait* for a while to simulate asynchronous I/O
    AsyncFunction<Integer, String> function = new SampleAsyncFunction(sleepFactor, failRatio, shutdownWaitTS);
    // add async operator to streaming job
    DataStream<String> result;
    if (ORDERED.equals(mode)) {
        result = AsyncDataStream.orderedWait(inputStream, function, timeout, TimeUnit.MILLISECONDS, 20).setParallelism(taskNum);
    } else {
        result = AsyncDataStream.unorderedWait(inputStream, function, timeout, TimeUnit.MILLISECONDS, 20).setParallelism(taskNum);
    }
    // count the occurrences of each result string by keying on it and summing
    result.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {

        private static final long serialVersionUID = -938116068682344455L;

        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
            out.collect(new Tuple2<>(value, 1));
        }
    }).keyBy(0).sum(1).print();
    // execute the program
    env.execute("Async IO Example");
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), FlatMapFunction (org.apache.flink.api.common.functions.FlatMapFunction), Collector (org.apache.flink.util.Collector), AsyncCollector (org.apache.flink.streaming.api.functions.async.collector.AsyncCollector), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment), FsStateBackend (org.apache.flink.runtime.state.filesystem.FsStateBackend)
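
SampleAsyncFunction is not shown here. A minimal sketch of an async function against the AsyncCollector API this example imports (the pre-ResultFuture API); the class name, the thread pool, and the simulated latency are assumptions standing in for a real asynchronous client.

import java.util.Collections;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.apache.flink.streaming.api.functions.async.collector.AsyncCollector;

class SleepyAsyncFunction extends RichAsyncFunction<Integer, String> {

    private final long sleepMillis;
    private transient ExecutorService executor;

    SleepyAsyncFunction(long sleepMillis) {
        this.sleepMillis = sleepMillis;
    }

    @Override
    public void open(Configuration parameters) {
        executor = Executors.newFixedThreadPool(4);
    }

    @Override
    public void close() {
        executor.shutdown();
    }

    @Override
    public void asyncInvoke(final Integer input, final AsyncCollector<String> collector) {
        // hand the request to a separate thread so the operator itself never blocks
        executor.submit(new Runnable() {
            @Override
            public void run() {
                try {
                    Thread.sleep(sleepMillis); // simulated I/O latency
                    collector.collect(Collections.singletonList("key-" + (input % 10)));
                } catch (InterruptedException e) {
                    collector.collect(e);
                }
            }
        });
    }
}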

Example 15 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in the Apache Flink project.

From the class WindowJoin, method main:

// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    // parse the parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    final long windowSize = params.getLong("windowSize", 2000);
    final long rate = params.getLong("rate", 3L);
    System.out.println("Using windowSize=" + windowSize + ", data rate=" + rate);
    System.out.println("To customize example, use: WindowJoin [--windowSize <window-size-in-millis>] [--rate <elements-per-second>]");
    // obtain execution environment, run this example in "ingestion time"
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // create the data sources for both grades and salaries
    DataStream<Tuple2<String, Integer>> grades = GradeSource.getSource(env, rate);
    DataStream<Tuple2<String, Integer>> salaries = SalarySource.getSource(env, rate);
    // run the actual window join program
    // for testability, this functionality is in a separate method.
    DataStream<Tuple3<String, Integer, Integer>> joinedStream = runWindowJoin(grades, salaries, windowSize);
    // print the results with a single thread, rather than in parallel
    joinedStream.print().setParallelism(1);
    // execute program
    env.execute("Windowed Join Example");
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), Tuple3 (org.apache.flink.api.java.tuple.Tuple3), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
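
runWindowJoin is factored into a separate method, as the comment notes. A sketch consistent with the call above, assuming the join is keyed on the name field (f0) over tumbling windows of windowSize milliseconds; because the job runs in ingestion time, the event-time window assigner operates on ingestion timestamps.

import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

public static DataStream<Tuple3<String, Integer, Integer>> runWindowJoin(
        DataStream<Tuple2<String, Integer>> grades,
        DataStream<Tuple2<String, Integer>> salaries,
        long windowSize) {
    return grades.join(salaries)
            // key both streams by the person's name (field 0)
            .where(new NameKeySelector())
            .equalTo(new NameKeySelector())
            // join pairs of elements that fall into the same tumbling window
            .window(TumblingEventTimeWindows.of(Time.milliseconds(windowSize)))
            .apply(new JoinFunction<Tuple2<String, Integer>, Tuple2<String, Integer>,
                    Tuple3<String, Integer, Integer>>() {
                @Override
                public Tuple3<String, Integer, Integer> join(
                        Tuple2<String, Integer> grade, Tuple2<String, Integer> salary) {
                    return new Tuple3<>(grade.f0, grade.f1, salary.f1);
                }
            });
}

private static class NameKeySelector implements KeySelector<Tuple2<String, Integer>, String> {
    @Override
    public String getKey(Tuple2<String, Integer> value) {
        return value.f0;
    }
}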

Aggregations

ParameterTool (org.apache.flink.api.java.utils.ParameterTool): 43 usages
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 19 usages
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 19 usages
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 15 usages
JobExecutionResult (org.apache.flink.api.common.JobExecutionResult): 7 usages
NumberFormat (java.text.NumberFormat): 6 usages
Properties (java.util.Properties): 6 usages
ProgramParametrizationException (org.apache.flink.client.program.ProgramParametrizationException): 6 usages
JDKRandomGeneratorFactory (org.apache.flink.graph.generator.random.JDKRandomGeneratorFactory): 6 usages
LongValue (org.apache.flink.types.LongValue): 6 usages
NullValue (org.apache.flink.types.NullValue): 6 usages
Graph (org.apache.flink.graph.Graph): 5 usages
GraphCsvReader (org.apache.flink.graph.GraphCsvReader): 5 usages
LongValueToUnsignedIntValue (org.apache.flink.graph.asm.translate.translators.LongValueToUnsignedIntValue): 5 usages
RMatGraph (org.apache.flink.graph.generator.RMatGraph): 5 usages
RandomGenerableFactory (org.apache.flink.graph.generator.random.RandomGenerableFactory): 5 usages
SimpleStringSchema (org.apache.flink.streaming.util.serialization.SimpleStringSchema): 5 usages
IntValue (org.apache.flink.types.IntValue): 5 usages
StringValue (org.apache.flink.types.StringValue): 4 usages
DataSet (org.apache.flink.api.java.DataSet): 3 usages