Example 31 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

The class LinearRegression, method main.

// *************************************************************************
//     PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final int iterations = params.getInt("iterations", 10);
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // get the input data: read from a CSV file if --input is given, otherwise use default elements
    DataSet<Data> data;
    if (params.has("input")) {
        // read data from CSV file
        data = env.readCsvFile(params.get("input"))
                .fieldDelimiter(" ")
                .includeFields(true, true)
                .pojoType(Data.class);
    } else {
        System.out.println("Executing LinearRegression example with default input data set.");
        System.out.println("Use --input to specify file input.");
        data = LinearRegressionData.getDefaultDataDataSet(env);
    }
    // get the parameters from elements
    DataSet<Params> parameters = LinearRegressionData.getDefaultParamsDataSet(env);
    // set number of bulk iterations for SGD linear regression
    IterativeDataSet<Params> loop = parameters.iterate(iterations);
    // one SGD step per sample, summed and averaged into updated parameters
    DataSet<Params> new_parameters = data
            .map(new SubUpdate()).withBroadcastSet(loop, "parameters")
            .reduce(new UpdateAccumulator())
            .map(new Update());
    // feed new parameters back into next iteration
    DataSet<Params> result = loop.closeWith(new_parameters);
    // emit result
    if (params.has("output")) {
        result.writeAsText(params.get("output"));
        // execute program
        env.execute("Linear Regression example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), LinearRegressionData (org.apache.flink.examples.java.ml.util.LinearRegressionData)
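
Every example on this page repeats the same ParameterTool handshake: parse the arguments, read typed values with defaults, and publish the parameters to the job. A minimal self-contained sketch of just that pattern (the flag names --input and --iterations are placeholders, not taken from any example above):

import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.utils.ParameterTool;

public class ParameterToolBasics {

    public static void main(String[] args) throws Exception {
        // parse "--key value" pairs from the command line
        final ParameterTool params = ParameterTool.fromArgs(args);

        // typed access with a fallback when the flag is absent
        final int iterations = params.getInt("iterations", 10);
        System.out.println("iterations = " + iterations);

        // presence check for optional flags
        if (params.has("input")) {
            System.out.println("reading from " + params.get("input"));
        }

        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // ParameterTool extends GlobalJobParameters, so it can be shown in the
        // web interface and read back from rich functions
        env.getConfig().setGlobalJobParameters(params);
    }
}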

Example 32 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

The class EmptyFieldsCountAccumulator, method main.

public static void main(final String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // get the data set
    final DataSet<StringTriple> file = getDataSet(env, params);
    // filter lines with empty fields
    final DataSet<StringTriple> filteredLines = file.filter(new EmptyFieldFilter());
    // Here, we could do further processing with the filtered lines...
    JobExecutionResult result;
    // output the filtered lines
    if (params.has("output")) {
        filteredLines.writeAsCsv(params.get("output"));
        // execute program
        result = env.execute("Accumulator example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        filteredLines.print();
        result = env.getLastJobExecutionResult();
    }
    // get the accumulator result via its registration key
    final List<Integer> emptyFields = result.getAccumulatorResult(EMPTY_FIELD_ACCUMULATOR);
    System.out.format("Number of detected empty fields per column: %s\n", emptyFields);
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), JobExecutionResult (org.apache.flink.api.common.JobExecutionResult), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment)
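
The example reads the accumulator result but does not show where it is registered; that happens inside EmptyFieldFilter. A hedged sketch of the registration pattern, using a plain IntCounter instead of the example's per-column counter (the class and key names here are illustrative, not from the Flink source):

import org.apache.flink.api.common.accumulators.IntCounter;
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.configuration.Configuration;

public class CountingFilter extends RichFilterFunction<String> {

    // hypothetical registration key, analogous to EMPTY_FIELD_ACCUMULATOR
    public static final String REJECTED_RECORDS = "rejected-records";

    private final IntCounter rejected = new IntCounter();

    @Override
    public void open(Configuration parameters) {
        // register once per parallel task; the JobManager merges the partial counts
        getRuntimeContext().addAccumulator(REJECTED_RECORDS, rejected);
    }

    @Override
    public boolean filter(String value) {
        boolean keep = !value.isEmpty();
        if (!keep) {
            rejected.add(1);
        }
        return keep;
    }
}

After the job finishes, result.getAccumulatorResult(REJECTED_RECORDS) returns the merged count, which is exactly how the example reads EMPTY_FIELD_ACCUMULATOR above.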

Example 33 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

The class TPCHQuery10, method main.

// *************************************************************************
//     PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (!params.has("customer") && !params.has("orders") && !params.has("lineitem") && !params.has("nation")) {
        System.err.println("  This program expects data from the TPC-H benchmark as input data.");
        System.err.println("  Due to legal restrictions, we can not ship generated data.");
        System.err.println("  You can find the TPC-H data generator at http://www.tpc.org/tpch/.");
        System.err.println("  Usage: TPCHQuery10 --customer <path> --orders <path> --lineitem <path> --nation <path> [--output <path>]");
        return;
    }
    // get customer data set: (custkey, name, address, nationkey, acctbal) 
    DataSet<Tuple5<Integer, String, String, Integer, Double>> customers = getCustomerDataSet(env, params.get("customer"));
    // get orders data set: (orderkey, custkey, orderdate)
    DataSet<Tuple3<Integer, Integer, String>> orders = getOrdersDataSet(env, params.get("orders"));
    // get lineitem data set: (orderkey, extendedprice, discount, returnflag)
    DataSet<Tuple4<Integer, Double, Double, String>> lineitems = getLineitemDataSet(env, params.get("lineitem"));
    // get nation data set: (nationkey, name)
    DataSet<Tuple2<Integer, String>> nations = getNationsDataSet(env, params.get("nation"));
    // orders filtered by year: (orderkey, custkey)
    DataSet<Tuple2<Integer, Integer>> ordersFilteredByYear = orders
            // filter by year
            .filter(new FilterFunction<Tuple3<Integer, Integer, String>>() {

                @Override
                public boolean filter(Tuple3<Integer, Integer, String> o) {
                    return Integer.parseInt(o.f2.substring(0, 4)) > 1990;
                }
            })
            .project(0, 1);
    // lineitems filtered by flag: (orderkey, revenue)
    DataSet<Tuple2<Integer, Double>> lineitemsFilteredByFlag = lineitems
            // filter by return flag
            .filter(new FilterFunction<Tuple4<Integer, Double, Double, String>>() {

                @Override
                public boolean filter(Tuple4<Integer, Double, Double, String> l) {
                    return l.f3.equals("R");
                }
            })
            // compute revenue per item: l_extendedprice * (1 - l_discount)
            .map(new MapFunction<Tuple4<Integer, Double, Double, String>, Tuple2<Integer, Double>>() {

                @Override
                public Tuple2<Integer, Double> map(Tuple4<Integer, Double, Double, String> l) {
                    return new Tuple2<Integer, Double>(l.f0, l.f1 * (1 - l.f2));
                }
            });
    // join orders with lineitems: (custkey, revenue)
    DataSet<Tuple2<Integer, Double>> revenueByCustomer = ordersFilteredByYear
            .joinWithHuge(lineitemsFilteredByFlag)
            .where(0).equalTo(0)
            .projectFirst(1).projectSecond(1);
    // sum up the revenue per customer
    revenueByCustomer = revenueByCustomer.groupBy(0).aggregate(Aggregations.SUM, 1);
    // join customer with nation (custkey, name, address, nationname, acctbal)
    DataSet<Tuple5<Integer, String, String, String, Double>> customerWithNation = customers
            .joinWithTiny(nations)
            .where(3).equalTo(0)
            .projectFirst(0, 1, 2).projectSecond(1).projectFirst(4);
    // join customer (with nation) with revenue (custkey, name, address, nationname, acctbal, revenue)
    DataSet<Tuple6<Integer, String, String, String, Double, Double>> result = customerWithNation
            .join(revenueByCustomer)
            .where(0).equalTo(0)
            .projectFirst(0, 1, 2, 3, 4).projectSecond(1);
    // emit result
    if (params.has("output")) {
        result.writeAsCsv(params.get("output"), "\n", "|");
        // execute program
        env.execute("TPCH Query 10 Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), FilterFunction (org.apache.flink.api.common.functions.FilterFunction), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), Tuple3 (org.apache.flink.api.java.tuple.Tuple3), Tuple4 (org.apache.flink.api.java.tuple.Tuple4), Tuple5 (org.apache.flink.api.java.tuple.Tuple5), Tuple6 (org.apache.flink.api.java.tuple.Tuple6)
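
joinWithTiny and joinWithHuge are size hints: they tell the optimizer which input is small enough to broadcast. The same effect can be requested explicitly with a JoinHint; a minimal runnable sketch under that assumption (the data and field positions are made up for illustration):

import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;

public class JoinHintSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Tuple2<Integer, String>> big = env.fromElements(
                new Tuple2<>(1, "a"), new Tuple2<>(2, "b"));
        DataSet<Tuple2<Integer, String>> tiny = env.fromElements(new Tuple2<>(1, "x"));

        // big.joinWithTiny(tiny) is shorthand for broadcasting the second input;
        // the explicit hint form spells out the chosen strategy
        big.join(tiny, JoinHint.BROADCAST_HASH_SECOND)
                .where(0).equalTo(0)
                .projectFirst(0, 1).projectSecond(1)
                .print();
    }
}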

Example 34 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

The class IncrementalLearningSkeleton, method main.

// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    DataStream<Integer> trainingData = env.addSource(new FiniteTrainingDataSource());
    DataStream<Integer> newData = env.addSource(new FiniteNewDataSource());
    // build a new model on every 5 seconds of training data
    DataStream<Double[]> model = trainingData
            .assignTimestampsAndWatermarks(new LinearTimestamp())
            .timeWindowAll(Time.of(5000, TimeUnit.MILLISECONDS))
            .apply(new PartialModelBuilder());
    // use partial model for newData
    DataStream<Integer> prediction = newData.connect(model).map(new Predictor());
    // emit result
    if (params.has("output")) {
        prediction.writeAsText(params.get("output"));
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        prediction.print();
    }
    // execute program
    env.execute("Streaming Incremental Learning");
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
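
connect pairs the two streams so a single function sees both: the Predictor above is a CoMapFunction whose map1 receives new data and whose map2 receives freshly built models. A hedged sketch of that shape (the prediction logic is a placeholder, not the example's actual model):

import org.apache.flink.streaming.api.functions.co.CoMapFunction;

public class PredictorSketch implements CoMapFunction<Integer, Double[], Integer> {

    // latest partial model; note that a plain field is not checkpointed state
    private Double[] model = new Double[] {0.0};

    @Override
    public Integer map1(Integer dataPoint) {
        // placeholder: pass the data point through; a real predictor would score it against model
        return dataPoint;
    }

    @Override
    public Integer map2(Double[] newModel) {
        // keep the newest model for subsequent predictions
        model = newModel;
        return 0; // marker value; a real job might filter these out downstream
    }
}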

Example 35 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

The class SideOutputExample, method main.

public static void main(String[] args) throws Exception {
    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up the execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // get input data
    DataStream<String> text;
    if (params.has("input")) {
        // read the text file from given input path
        text = env.readTextFile(params.get("input"));
    } else {
        System.out.println("Executing WordCount example with default input data set.");
        System.out.println("Use --input to specify file input.");
        // get default test text data
        text = env.fromElements(WordCountData.WORDS);
    }
    SingleOutputStreamOperator<Tuple2<String, Integer>> tokenized = text.keyBy(new KeySelector<String, Integer>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Integer getKey(String value) throws Exception {
            return 0;
        }
    }).process(new Tokenizer());
    DataStream<String> rejectedWords = tokenized.getSideOutput(rejectedWordsTag).map(new MapFunction<String, String>() {

        private static final long serialVersionUID = 1L;

        @Override
        public String map(String value) throws Exception {
            return "rejected: " + value;
        }
    });
    DataStream<Tuple2<String, Integer>> counts = tokenized.keyBy(0).window(TumblingEventTimeWindows.of(Time.seconds(5))).sum(1);
    // emit result
    if (params.has("output")) {
        counts.writeAsText(params.get("output"));
        rejectedWords.writeAsText(params.get("rejected-words-output"));
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        counts.print();
        rejectedWords.print();
    }
    // execute program
    env.execute("Streaming WordCount SideOutput");
}
Also used: ParameterTool (org.apache.flink.api.java.utils.ParameterTool), KeySelector (org.apache.flink.api.java.functions.KeySelector), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
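
The pieces this example references but does not show are the OutputTag and the Tokenizer that writes to it. A hedged sketch of how such a pair can be wired up (the rejection rule here is a placeholder, not the example's actual logic):

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

public class SideOutputSketch {

    // anonymous subclass so Flink can capture the side output's element type
    static final OutputTag<String> rejectedWordsTag = new OutputTag<String>("rejected") {};

    public static class Tokenizer
            extends KeyedProcessFunction<Integer, String, Tuple2<String, Integer>> {

        @Override
        public void processElement(
                String value, Context ctx, Collector<Tuple2<String, Integer>> out) {
            for (String token : value.toLowerCase().split("\\W+")) {
                if (token.length() > 5) {
                    // placeholder rule: long tokens go to the side output
                    ctx.output(rejectedWordsTag, token);
                } else if (token.length() > 0) {
                    out.collect(new Tuple2<>(token, 1));
                }
            }
        }
    }
}

Downstream, tokenized.getSideOutput(rejectedWordsTag) retrieves the rejected stream, exactly as the example does above.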

Aggregations

ParameterTool (org.apache.flink.api.java.utils.ParameterTool): 43
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 19
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 19
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 15
JobExecutionResult (org.apache.flink.api.common.JobExecutionResult): 7
NumberFormat (java.text.NumberFormat): 6
Properties (java.util.Properties): 6
ProgramParametrizationException (org.apache.flink.client.program.ProgramParametrizationException): 6
JDKRandomGeneratorFactory (org.apache.flink.graph.generator.random.JDKRandomGeneratorFactory): 6
LongValue (org.apache.flink.types.LongValue): 6
NullValue (org.apache.flink.types.NullValue): 6
Graph (org.apache.flink.graph.Graph): 5
GraphCsvReader (org.apache.flink.graph.GraphCsvReader): 5
LongValueToUnsignedIntValue (org.apache.flink.graph.asm.translate.translators.LongValueToUnsignedIntValue): 5
RMatGraph (org.apache.flink.graph.generator.RMatGraph): 5
RandomGenerableFactory (org.apache.flink.graph.generator.random.RandomGenerableFactory): 5
SimpleStringSchema (org.apache.flink.streaming.util.serialization.SimpleStringSchema): 5
IntValue (org.apache.flink.types.IntValue): 5
StringValue (org.apache.flink.types.StringValue): 4
DataSet (org.apache.flink.api.java.DataSet): 3