Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache: the class LinearRegression, method main.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final int iterations = params.getInt("iterations", 10);
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // get input x data from elements
    DataSet<Data> data;
    if (params.has("input")) {
        // read data from CSV file
        data = env.readCsvFile(params.get("input"))
                .fieldDelimiter(" ")
                .includeFields(true, true)
                .pojoType(Data.class);
    } else {
        System.out.println("Executing LinearRegression example with default input data set.");
        System.out.println("Use --input to specify file input.");
        data = LinearRegressionData.getDefaultDataDataSet(env);
    }
    // get the parameters from elements
    DataSet<Params> parameters = LinearRegressionData.getDefaultParamsDataSet(env);
    // set number of bulk iterations for SGD linear Regression
    IterativeDataSet<Params> loop = parameters.iterate(iterations);
    DataSet<Params> new_parameters = data
            .map(new SubUpdate()).withBroadcastSet(loop, "parameters")
            .reduce(new UpdateAccumulator())
            .map(new Update());
    // feed new parameters back into next iteration
    DataSet<Params> result = loop.closeWith(new_parameters);
    // emit result
    if (params.has("output")) {
        result.writeAsText(params.get("output"));
        // execute program
        env.execute("Linear Regression example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
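ParameterTool is the common thread in all of these examples: it turns "--key value" arguments into a typed lookup. A minimal, self-contained sketch of the calls used above (the argument values in the comment are illustrative assumptions, not part of the Flink example):

import org.apache.flink.api.java.utils.ParameterTool;

public class ParameterToolSketch {

    public static void main(String[] args) {
        // e.g. args = {"--input", "data.csv", "--iterations", "20"}
        ParameterTool params = ParameterTool.fromArgs(args);

        // optional parameter with a default, as in the LinearRegression example
        int iterations = params.getInt("iterations", 10);

        // presence check before reading an optional path
        String input = params.has("input") ? params.get("input") : "<built-in data set>";

        System.out.println("iterations=" + iterations + ", input=" + input);
    }
}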
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache: the class EmptyFieldsCountAccumulator, method main.
public static void main(final String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // get the data set
    final DataSet<StringTriple> file = getDataSet(env, params);
    // filter lines with empty fields
    final DataSet<StringTriple> filteredLines = file.filter(new EmptyFieldFilter());
    // Here, we could do further processing with the filtered lines...
    JobExecutionResult result;
    // output the filtered lines
    if (params.has("output")) {
        filteredLines.writeAsCsv(params.get("output"));
        // execute program
        result = env.execute("Accumulator example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        filteredLines.print();
        result = env.getLastJobExecutionResult();
    }
    // get the accumulator result via its registration key
    final List<Integer> emptyFields = result.getAccumulatorResult(EMPTY_FIELD_ACCUMULATOR);
    System.out.format("Number of detected empty fields per column: %s\n", emptyFields);
}
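The registration side of the accumulator lives in EmptyFieldFilter, which is not shown here. A hedged sketch of that pattern, using one IntCounter per column as a simplified stand-in for whatever accumulator type the real example registers under EMPTY_FIELD_ACCUMULATOR (the field count, tuple type, and emptiness check are assumptions):

import org.apache.flink.api.common.accumulators.IntCounter;
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;

// keeps records whose fields are all non-empty; counts empty fields per column
public class EmptyFieldFilterSketch extends RichFilterFunction<Tuple3<String, String, String>> {

    private final IntCounter[] emptyCounts = new IntCounter[3];

    @Override
    public void open(Configuration parameters) {
        for (int i = 0; i < emptyCounts.length; i++) {
            emptyCounts[i] = new IntCounter();
            // registration key; retrieved later via JobExecutionResult#getAccumulatorResult
            getRuntimeContext().addAccumulator("empty-fields-col-" + i, emptyCounts[i]);
        }
    }

    @Override
    public boolean filter(Tuple3<String, String, String> record) {
        boolean allFieldsSet = true;
        for (int i = 0; i < record.getArity(); i++) {
            String field = record.getField(i);
            if (field == null || field.trim().isEmpty()) {
                emptyCounts[i].add(1);
                allFieldsSet = false;
            }
        }
        return allFieldsSet;
    }
}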
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache: the class TPCHQuery10, method main.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (!params.has("customer") && !params.has("orders") && !params.has("lineitem") && !params.has("nation")) {
        System.err.println(" This program expects data from the TPC-H benchmark as input data.");
        System.err.println(" Due to legal restrictions, we can not ship generated data.");
        System.err.println(" You can find the TPC-H data generator at http://www.tpc.org/tpch/.");
        System.err.println(" Usage: TPCHQuery10 --customer <path> --orders <path> --lineitem <path> --nation <path> [--output <path>]");
        return;
    }
    // get customer data set: (custkey, name, address, nationkey, acctbal)
    DataSet<Tuple5<Integer, String, String, Integer, Double>> customers = getCustomerDataSet(env, params.get("customer"));
    // get orders data set: (orderkey, custkey, orderdate)
    DataSet<Tuple3<Integer, Integer, String>> orders = getOrdersDataSet(env, params.get("orders"));
    // get lineitem data set: (orderkey, extendedprice, discount, returnflag)
    DataSet<Tuple4<Integer, Double, Double, String>> lineitems = getLineitemDataSet(env, params.get("lineitem"));
    // get nation data set: (nationkey, name)
    DataSet<Tuple2<Integer, String>> nations = getNationsDataSet(env, params.get("nation"));
    // orders filtered by year: (orderkey, custkey)
    DataSet<Tuple2<Integer, Integer>> ordersFilteredByYear =
            // filter by year
            orders.filter(new FilterFunction<Tuple3<Integer, Integer, String>>() {

                @Override
                public boolean filter(Tuple3<Integer, Integer, String> o) {
                    return Integer.parseInt(o.f2.substring(0, 4)) > 1990;
                }
            }).project(0, 1);
    // lineitems filtered by flag: (orderkey, revenue)
    DataSet<Tuple2<Integer, Double>> lineitemsFilteredByFlag =
            // filter by flag
            lineitems.filter(new FilterFunction<Tuple4<Integer, Double, Double, String>>() {

                @Override
                public boolean filter(Tuple4<Integer, Double, Double, String> l) {
                    return l.f3.equals("R");
                }
            }).map(new MapFunction<Tuple4<Integer, Double, Double, String>, Tuple2<Integer, Double>>() {

                @Override
                public Tuple2<Integer, Double> map(Tuple4<Integer, Double, Double, String> l) {
                    // revenue per item = l_extendedprice * (1 - l_discount)
                    return new Tuple2<Integer, Double>(l.f0, l.f1 * (1 - l.f2));
                }
            });
    // join orders with lineitems: (custkey, revenue)
    DataSet<Tuple2<Integer, Double>> revenueByCustomer = ordersFilteredByYear
            .joinWithHuge(lineitemsFilteredByFlag)
            .where(0).equalTo(0)
            .projectFirst(1).projectSecond(1);
    revenueByCustomer = revenueByCustomer.groupBy(0).aggregate(Aggregations.SUM, 1);
    // join customer with nation (custkey, name, address, nationname, acctbal)
    DataSet<Tuple5<Integer, String, String, String, Double>> customerWithNation = customers
            .joinWithTiny(nations)
            .where(3).equalTo(0)
            .projectFirst(0, 1, 2).projectSecond(1).projectFirst(4);
    // join customer (with nation) with revenue (custkey, name, address, nationname, acctbal, revenue)
    DataSet<Tuple6<Integer, String, String, String, Double, Double>> result = customerWithNation
            .join(revenueByCustomer)
            .where(0).equalTo(0)
            .projectFirst(0, 1, 2, 3, 4).projectSecond(1);
    // emit result
    if (params.has("output")) {
        result.writeAsCsv(params.get("output"), "\n", "|");
        // execute program
        env.execute("TPCH Query 10 Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
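The getCustomerDataSet, getOrdersDataSet, getLineitemDataSet and getNationsDataSet helpers are not reproduced above. A hedged sketch of one of them, assuming pipe-delimited TPC-H files and the same imports as the query class; the includeFields mask below is an assumption chosen to project the customer file onto (custkey, name, address, nationkey, acctbal) and may differ from the actual helper:

private static DataSet<Tuple5<Integer, String, String, Integer, Double>> getCustomerDataSet(
        ExecutionEnvironment env, String customerPath) {
    return env.readCsvFile(customerPath)
            .fieldDelimiter("|")
            // mask over the TPC-H customer columns; '1' keeps a column, '0' drops it
            .includeFields("11110100")
            .types(Integer.class, String.class, String.class, Integer.class, Double.class);
}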
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache: the class IncrementalLearningSkeleton, method main.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    DataStream<Integer> trainingData = env.addSource(new FiniteTrainingDataSource());
    DataStream<Integer> newData = env.addSource(new FiniteNewDataSource());
    // build new model on every second of new data
    DataStream<Double[]> model = trainingData
            .assignTimestampsAndWatermarks(new LinearTimestamp())
            .timeWindowAll(Time.of(5000, TimeUnit.MILLISECONDS))
            .apply(new PartialModelBuilder());
    // use partial model for newData
    DataStream<Integer> prediction = newData.connect(model).map(new Predictor());
    // emit result
    if (params.has("output")) {
        prediction.writeAsText(params.get("output"));
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        prediction.print();
    }
    // execute program
    env.execute("Streaming Incremental Learning");
}
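newData.connect(model) produces a ConnectedStreams, so Predictor has to handle both inputs. A hedged sketch of such a co-map function (the prediction logic and the marker value returned from map2 are placeholders, not the real example's implementation):

import org.apache.flink.streaming.api.functions.co.CoMapFunction;

// consumes data points on input 1 and partial models on input 2
public class PredictorSketch implements CoMapFunction<Integer, Double[], Integer> {

    private Double[] partialModel = new Double[] { 1.0 };

    @Override
    public Integer map1(Integer value) {
        // "predict" with the most recent partial model (placeholder logic)
        return (int) (value * partialModel[0]);
    }

    @Override
    public Integer map2(Double[] model) {
        // a new partial model arrives: keep it and emit a marker value
        partialModel = model;
        return 0;
    }
}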
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache: the class SideOutputExample, method main.
public static void main(String[] args) throws Exception {
    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up the execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // get input data
    DataStream<String> text;
    if (params.has("input")) {
        // read the text file from given input path
        text = env.readTextFile(params.get("input"));
    } else {
        System.out.println("Executing WordCount example with default input data set.");
        System.out.println("Use --input to specify file input.");
        // get default test text data
        text = env.fromElements(WordCountData.WORDS);
    }
    SingleOutputStreamOperator<Tuple2<String, Integer>> tokenized = text
            .keyBy(new KeySelector<String, Integer>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Integer getKey(String value) throws Exception {
                    return 0;
                }
            })
            .process(new Tokenizer());
    DataStream<String> rejectedWords = tokenized
            .getSideOutput(rejectedWordsTag)
            .map(new MapFunction<String, String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public String map(String value) throws Exception {
                    return "rejected: " + value;
                }
            });
    DataStream<Tuple2<String, Integer>> counts = tokenized
            .keyBy(0)
            .window(TumblingEventTimeWindows.of(Time.seconds(5)))
            .sum(1);
    // emit result
    if (params.has("output")) {
        counts.writeAsText(params.get("output"));
        rejectedWords.writeAsText(params.get("rejected-words-output"));
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        counts.print();
        rejectedWords.print();
    }
    // execute program
    env.execute("Streaming WordCount SideOutput");
}
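rejectedWordsTag and Tokenizer are defined outside the main method shown above. A hedged sketch of how a side output like that is typically wired up (the tag name and the rejection rule are assumptions, not the verbatim Flink example):

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

public class SideOutputSketch {

    // anonymous subclass so the element type survives erasure
    static final OutputTag<String> rejectedWordsTag = new OutputTag<String>("rejected") {};

    public static class Tokenizer extends ProcessFunction<String, Tuple2<String, Integer>> {

        private static final long serialVersionUID = 1L;

        @Override
        public void processElement(String value, Context ctx, Collector<Tuple2<String, Integer>> out) {
            for (String token : value.toLowerCase().split("\\W+")) {
                if (token.length() > 5) {
                    // long tokens are routed to the side output
                    ctx.output(rejectedWordsTag, token);
                } else if (token.length() > 0) {
                    out.collect(new Tuple2<>(token, 1));
                }
            }
        }
    }
}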