Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The main method of the DistCp class.
public static void main(String[] args) throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    ParameterTool params = ParameterTool.fromArgs(args);
    if (!params.has("input") || !params.has("output")) {
        System.err.println("Usage: --input <path> --output <path> [--parallelism <n>]");
        return;
    }
    final Path sourcePath = new Path(params.get("input"));
    final Path targetPath = new Path(params.get("output"));
    if (!isLocal(env) && !(isOnDistributedFS(sourcePath) && isOnDistributedFS(targetPath))) {
        System.out.println("In distributed mode only HDFS input/output paths are supported");
        return;
    }
    final int parallelism = params.getInt("parallelism", 10);
    if (parallelism <= 0) {
        System.err.println("Parallelism should be greater than 0");
        return;
    }
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    env.setParallelism(parallelism);

    long startTime = System.currentTimeMillis();
    LOGGER.info("Initializing copy tasks");
    List<FileCopyTask> tasks = getCopyTasks(sourcePath);
    LOGGER.info("Copy task initialization took " + (System.currentTimeMillis() - startTime) + "ms");

    DataSet<FileCopyTask> inputTasks = new DataSource<>(
            env,
            new FileCopyTaskInputFormat(tasks),
            new GenericTypeInfo<>(FileCopyTask.class),
            "fileCopyTasks");

    FlatMapOperator<FileCopyTask, Object> res = inputTasks.flatMap(
            new RichFlatMapFunction<FileCopyTask, Object>() {

                private static final long serialVersionUID = 1109254230243989929L;

                private LongCounter fileCounter;
                private LongCounter bytesCounter;

                @Override
                public void open(Configuration parameters) throws Exception {
                    bytesCounter = getRuntimeContext().getLongCounter(BYTES_COPIED_CNT_NAME);
                    fileCounter = getRuntimeContext().getLongCounter(FILES_COPIED_CNT_NAME);
                }

                @Override
                public void flatMap(FileCopyTask task, Collector<Object> out) throws Exception {
                    LOGGER.info("Processing task: " + task);
                    Path outPath = new Path(targetPath, task.getRelativePath());
                    FileSystem targetFs = targetPath.getFileSystem();
                    // create parent folders in case of a local FS
                    if (!targetFs.isDistributedFS()) {
                        // deal with cases like file:///tmp or just /tmp
                        File outFile = outPath.toUri().isAbsolute()
                                ? new File(outPath.toUri())
                                : new File(outPath.toString());
                        File parentFile = outFile.getParentFile();
                        if (!parentFile.mkdirs() && !parentFile.exists()) {
                            throw new RuntimeException(
                                    "Cannot create local file system directories: " + parentFile);
                        }
                    }
                    FSDataOutputStream outputStream = null;
                    FSDataInputStream inputStream = null;
                    try {
                        outputStream = targetFs.create(outPath, true);
                        inputStream = task.getPath().getFileSystem().open(task.getPath());
                        int bytes = IOUtils.copy(inputStream, outputStream);
                        bytesCounter.add(bytes);
                    } finally {
                        IOUtils.closeQuietly(inputStream);
                        IOUtils.closeQuietly(outputStream);
                    }
                    fileCounter.add(1L);
                }
            });

    // no data sinks are needed, therefore just print an empty result
    res.print();

    Map<String, Object> accumulators = env.getLastJobExecutionResult().getAllAccumulatorResults();
    LOGGER.info("== COUNTERS ==");
    for (Map.Entry<String, Object> e : accumulators.entrySet()) {
        LOGGER.info(e.getKey() + ": " + e.getValue());
    }
}
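The method above wires ParameterTool into a full batch job. As a complement, here is a minimal, self-contained sketch (not from the Flink sources; the class name ParameterToolSketch is made up) of just the argument-handling pattern it uses: parse the args once, validate required keys, and fall back to defaults for optional ones.

import org.apache.flink.api.java.utils.ParameterTool;

public class ParameterToolSketch {

    public static void main(String[] args) {
        // e.g. args = {"--input", "/tmp/in", "--output", "/tmp/out", "--parallelism", "4"}
        ParameterTool params = ParameterTool.fromArgs(args);

        if (!params.has("input") || !params.has("output")) {
            System.err.println("Usage: --input <path> --output <path> [--parallelism <n>]");
            return;
        }

        String input = params.get("input");                  // required, validated above
        String output = params.get("output");                // required, validated above
        int parallelism = params.getInt("parallelism", 10);  // optional, defaults to 10

        System.out.printf("input=%s output=%s parallelism=%d%n", input, output, parallelism);
    }
}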
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The main method of the EnumTriangles class.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    // check input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);

    // read input data
    DataSet<Edge> edges;
    if (params.has("edges")) {
        edges = env.readCsvFile(params.get("edges"))
                .fieldDelimiter(" ")
                .includeFields(true, true)
                .types(Integer.class, Integer.class)
                .map(new TupleEdgeConverter());
    } else {
        System.out.println("Executing EnumTriangles example with default edges data set.");
        System.out.println("Use --edges to specify file input.");
        edges = EnumTrianglesData.getDefaultEdgeDataSet(env);
    }

    // project edges by vertex id
    DataSet<Edge> edgesById = edges.map(new EdgeByIdProjector());

    DataSet<Triad> triangles = edgesById
            .groupBy(Edge.V1)
            .sortGroup(Edge.V2, Order.ASCENDING)
            .reduceGroup(new TriadBuilder())
            .join(edgesById)
            .where(Triad.V2, Triad.V3)
            .equalTo(Edge.V1, Edge.V2)
            .with(new TriadFilter());

    // emit result
    if (params.has("output")) {
        triangles.writeAsCsv(params.get("output"), "\n", ",");
        // execute program
        env.execute("Basic Triangle Enumeration Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        triangles.print();
    }
}
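Besides showing up in the web interface, parameters registered via env.getConfig().setGlobalJobParameters(params), as in the example above, can be read back inside any rich user function through the runtime context. A minimal sketch of that pattern (the class name GlobalParamsSketch and the key "prefix" are illustrative):

import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.utils.ParameterTool;

public class GlobalParamsSketch extends RichMapFunction<String, String> {

    @Override
    public String map(String value) throws Exception {
        // retrieve the ParameterTool instance registered with setGlobalJobParameters(...)
        ParameterTool params =
                (ParameterTool) getRuntimeContext().getExecutionConfig().getGlobalJobParameters();
        // optional key with a default, same accessor style as in the examples
        String prefix = params.get("prefix", ">");
        return prefix + " " + value;
    }
}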
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The main method of the PageRank class.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    ParameterTool params = ParameterTool.fromArgs(args);
    final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
    final int maxIterations = params.getInt("iterations", 10);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make the parameters available to the web ui
    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Long> pagesInput = getPagesDataSet(env, params);
    DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);

    // assign initial rank to pages
    DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput.map(new RankAssigner(1.0d / numPages));

    // build adjacency list from link input
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
            linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);

    DataSet<Tuple2<Long, Double>> newRanks = iteration
            .join(adjacencyListInput).where(0).equalTo(0)
            .flatMap(new JoinVertexWithEdgesMatch())
            .groupBy(0).aggregate(SUM, 1)
            .map(new Dampener(DAMPENING_FACTOR, numPages));

    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
            newRanks,
            newRanks.join(iteration).where(0).equalTo(0).filter(new EpsilonFilter()));

    // emit result
    if (params.has("output")) {
        finalPageRanks.writeAsCsv(params.get("output"), "\n", " ");
        // execute program
        env.execute("Basic Page Rank Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        finalPageRanks.print();
    }
}
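The Dampener map at the end of each iteration applies the standard PageRank damping: each summed rank r becomes d * r + (1 - d) / numPages for damping factor d. A hedged sketch of that step (the class name DampenerSketch and its field names are illustrative, not necessarily the exact Flink source):

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;

public class DampenerSketch implements MapFunction<Tuple2<Long, Double>, Tuple2<Long, Double>> {

    private static final double DAMPENING_FACTOR = 0.85;
    private final double randomJump;

    public DampenerSketch(long numPages) {
        // probability mass of a random jump, spread uniformly over all pages
        this.randomJump = (1 - DAMPENING_FACTOR) / numPages;
    }

    @Override
    public Tuple2<Long, Double> map(Tuple2<Long, Double> value) {
        // rank' = d * rank + (1 - d) / numPages
        value.f1 = value.f1 * DAMPENING_FACTOR + randomJump;
        return value;
    }
}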
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The main method of the TransitiveClosureNaive class.
public static void main(String... args) throws Exception {
    // check input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);

    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);

    final int maxIterations = params.getInt("iterations", 10);

    DataSet<Tuple2<Long, Long>> edges;
    if (params.has("edges")) {
        edges = env.readCsvFile(params.get("edges"))
                .fieldDelimiter(" ")
                .types(Long.class, Long.class);
    } else {
        System.out.println("Executing TransitiveClosureNaive example with default edges data set.");
        System.out.println("Use --edges to specify file input.");
        edges = ConnectedComponentsData.getDefaultEdgeDataSet(env);
    }

    IterativeDataSet<Tuple2<Long, Long>> paths = edges.iterate(maxIterations);

    DataSet<Tuple2<Long, Long>> nextPaths = paths
            .join(edges)
            .where(1)
            .equalTo(0)
            .with(new JoinFunction<Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple2<Long, Long>>() {
                /**
                 * left: Path (z,x) - x is reachable by z
                 * right: Edge (x,y) - edge x-->y exists
                 * out: Path (z,y) - y is reachable by z
                 */
                @Override
                public Tuple2<Long, Long> join(Tuple2<Long, Long> left, Tuple2<Long, Long> right) throws Exception {
                    return new Tuple2<Long, Long>(left.f0, right.f1);
                }
            })
            .withForwardedFieldsFirst("0")
            .withForwardedFieldsSecond("1")
            .union(paths)
            .groupBy(0, 1)
            .reduceGroup(new GroupReduceFunction<Tuple2<Long, Long>, Tuple2<Long, Long>>() {
                @Override
                public void reduce(Iterable<Tuple2<Long, Long>> values, Collector<Tuple2<Long, Long>> out) throws Exception {
                    out.collect(values.iterator().next());
                }
            })
            .withForwardedFields("0;1");

    DataSet<Tuple2<Long, Long>> newPaths = paths
            .coGroup(nextPaths)
            .where(0)
            .equalTo(0)
            .with(new CoGroupFunction<Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple2<Long, Long>>() {
                Set<Tuple2<Long, Long>> prevSet = new HashSet<Tuple2<Long, Long>>();

                @Override
                public void coGroup(Iterable<Tuple2<Long, Long>> prevPaths, Iterable<Tuple2<Long, Long>> nextPaths, Collector<Tuple2<Long, Long>> out) throws Exception {
                    for (Tuple2<Long, Long> prev : prevPaths) {
                        prevSet.add(prev);
                    }
                    for (Tuple2<Long, Long> next : nextPaths) {
                        if (!prevSet.contains(next)) {
                            out.collect(next);
                        }
                    }
                }
            })
            .withForwardedFieldsFirst("0")
            .withForwardedFieldsSecond("0");

    DataSet<Tuple2<Long, Long>> transitiveClosure = paths.closeWith(nextPaths, newPaths);

    // emit result
    if (params.has("output")) {
        transitiveClosure.writeAsCsv(params.get("output"), "\n", " ");
        // execute program explicitly, because file sinks are lazy
        env.execute("Transitive Closure Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        transitiveClosure.print();
    }
}
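The two-argument closeWith(nextPaths, newPaths) above uses newPaths as a termination criterion: the loop stops as soon as that data set is empty, i.e. when no new paths were discovered, even if maxIterations has not been reached. A minimal, self-contained sketch of this iterate/closeWith pattern (the counting logic is made up for illustration):

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.IterativeDataSet;

public class IterateSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        IterativeDataSet<Long> loop = env.fromElements(0L).iterate(100); // upper bound
        DataSet<Long> stepped = loop.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long x) {
                return x + 1;
            }
        });
        // termination criterion: keep iterating while any element is still below 10
        DataSet<Long> notConverged = stepped.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long x) {
                return x < 10;
            }
        });
        DataSet<Long> result = loop.closeWith(stepped, notConverged);

        result.print(); // prints 10: the loop stops after 10 steps, well before 100
    }
}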
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The main method of the LinearRegression class.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    final int iterations = params.getInt("iterations", 10);

    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Data> data;
    if (params.has("input")) {
        // read data from CSV file
        data = env.readCsvFile(params.get("input"))
                .fieldDelimiter(" ")
                .includeFields(true, true)
                .pojoType(Data.class);
    } else {
        System.out.println("Executing LinearRegression example with default input data set.");
        System.out.println("Use --input to specify file input.");
        data = LinearRegressionData.getDefaultDataDataSet(env);
    }

    // get the default model parameters from elements
    DataSet<Params> parameters = LinearRegressionData.getDefaultParamsDataSet(env);

    // set number of bulk iterations for SGD linear regression
    IterativeDataSet<Params> loop = parameters.iterate(iterations);

    DataSet<Params> newParameters = data
            .map(new SubUpdate()).withBroadcastSet(loop, "parameters")
            .reduce(new UpdateAccumulator())
            .map(new Update());

    // feed new parameters back into next iteration
    DataSet<Params> result = loop.closeWith(newParameters);

    // emit result
    if (params.has("output")) {
        result.writeAsText(params.get("output"));
        // execute program
        env.execute("Linear Regression example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
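The call data.map(new SubUpdate()).withBroadcastSet(loop, "parameters") above makes the current model parameters available to every parallel instance of the map function, which reads them back via getRuntimeContext().getBroadcastVariable("parameters"), typically in open(). A minimal, self-contained sketch of the broadcast-set pattern with made-up data (the class name BroadcastSketch and the key "factor" are illustrative):

import java.util.List;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.configuration.Configuration;

public class BroadcastSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Double> points = env.fromElements(1.0, 2.0, 3.0);
        DataSet<Double> factor = env.fromElements(10.0); // plays the role of the parameters set

        DataSet<Double> scaled = points
                .map(new RichMapFunction<Double, Double>() {
                    private double f;

                    @Override
                    public void open(Configuration conf) throws Exception {
                        // the name must match the one given to withBroadcastSet(...)
                        List<Double> bc = getRuntimeContext().getBroadcastVariable("factor");
                        f = bc.get(0);
                    }

                    @Override
                    public Double map(Double x) {
                        return x * f;
                    }
                })
                .withBroadcastSet(factor, "factor");

        scaled.print(); // 10.0, 20.0, 30.0 (order may vary)
    }
}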