
Example 26 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

From the class DistCp, method main.

public static void main(String[] args) throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    ParameterTool params = ParameterTool.fromArgs(args);
    if (!params.has("input") || !params.has("output")) {
        System.err.println("Usage: --input <path> --output <path> [--parallelism <n>]");
        return;
    }
    final Path sourcePath = new Path(params.get("input"));
    final Path targetPath = new Path(params.get("output"));
    if (!isLocal(env) && !(isOnDistributedFS(sourcePath) && isOnDistributedFS(targetPath))) {
        System.out.println("In a distributed mode only HDFS input/output paths are supported");
        return;
    }
    final int parallelism = params.getInt("parallelism", 10);
    if (parallelism <= 0) {
        System.err.println("Parallelism should be greater than 0");
        return;
    }
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    env.setParallelism(parallelism);
    long startTime = System.currentTimeMillis();
    LOGGER.info("Initializing copy tasks");
    List<FileCopyTask> tasks = getCopyTasks(sourcePath);
    LOGGER.info("Copy task initialization took " + (System.currentTimeMillis() - startTime) + "ms");
    DataSet<FileCopyTask> inputTasks = new DataSource<>(env, new FileCopyTaskInputFormat(tasks), new GenericTypeInfo<>(FileCopyTask.class), "fileCopyTasks");
    FlatMapOperator<FileCopyTask, Object> res = inputTasks.flatMap(new RichFlatMapFunction<FileCopyTask, Object>() {

        private static final long serialVersionUID = 1109254230243989929L;

        private LongCounter fileCounter;

        private LongCounter bytesCounter;

        @Override
        public void open(Configuration parameters) throws Exception {
            bytesCounter = getRuntimeContext().getLongCounter(BYTES_COPIED_CNT_NAME);
            fileCounter = getRuntimeContext().getLongCounter(FILES_COPIED_CNT_NAME);
        }

        @Override
        public void flatMap(FileCopyTask task, Collector<Object> out) throws Exception {
            LOGGER.info("Processing task: " + task);
            Path outPath = new Path(targetPath, task.getRelativePath());
            FileSystem targetFs = targetPath.getFileSystem();
            // creating parent folders in case of a local FS
            if (!targetFs.isDistributedFS()) {
                //dealing with cases like file:///tmp or just /tmp
                File outFile = outPath.toUri().isAbsolute() ? new File(outPath.toUri()) : new File(outPath.toString());
                File parentFile = outFile.getParentFile();
                if (!parentFile.mkdirs() && !parentFile.exists()) {
                    throw new RuntimeException("Cannot create local file system directories: " + parentFile);
                }
            }
            FSDataOutputStream outputStream = null;
            FSDataInputStream inputStream = null;
            try {
                outputStream = targetFs.create(outPath, true);
                inputStream = task.getPath().getFileSystem().open(task.getPath());
                int bytes = IOUtils.copy(inputStream, outputStream);
                bytesCounter.add(bytes);
            } finally {
                IOUtils.closeQuietly(inputStream);
                IOUtils.closeQuietly(outputStream);
            }
            fileCounter.add(1L);
        }
    });
    // no data sinks are needed, therefore just printing an empty result
    res.print();
    Map<String, Object> accumulators = env.getLastJobExecutionResult().getAllAccumulatorResults();
    LOGGER.info("== COUNTERS ==");
    for (Map.Entry<String, Object> e : accumulators.entrySet()) {
        LOGGER.info(e.getKey() + ": " + e.getValue());
    }
}
Also used: ParameterTool(org.apache.flink.api.java.utils.ParameterTool) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Configuration(org.apache.flink.configuration.Configuration) LongCounter(org.apache.flink.api.common.accumulators.LongCounter) FileSystem(org.apache.flink.core.fs.FileSystem) FSDataOutputStream(org.apache.flink.core.fs.FSDataOutputStream) Path(org.apache.flink.core.fs.Path) IOException(java.io.IOException) DataSource(org.apache.flink.api.java.operators.DataSource) FSDataInputStream(org.apache.flink.core.fs.FSDataInputStream) File(java.io.File) Map(java.util.Map)
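
A minimal sketch of the argument handling this example relies on; the class name ParameterToolArgsDemo and the HDFS paths below are hypothetical, chosen only to illustrate how ParameterTool.fromArgs turns --key value pairs into lookups:

import org.apache.flink.api.java.utils.ParameterTool;

public class ParameterToolArgsDemo {

    public static void main(String[] args) {
        // Hypothetical arguments, in the form the DistCp job above expects:
        String[] demoArgs = { "--input", "hdfs:///data/src", "--output", "hdfs:///data/dst", "--parallelism", "20" };
        ParameterTool params = ParameterTool.fromArgs(demoArgs);
        System.out.println(params.has("input"));              // true: the flag was supplied
        System.out.println(params.get("output"));             // hdfs:///data/dst
        System.out.println(params.getInt("parallelism", 10)); // 20; the default 10 applies only when the flag is absent
    }
}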

Example 27 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

From the class EnumTriangles, method main.

// *************************************************************************
//     PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // read input data
    DataSet<Edge> edges;
    if (params.has("edges")) {
        edges = env.readCsvFile(params.get("edges")).fieldDelimiter(" ").includeFields(true, true).types(Integer.class, Integer.class).map(new TupleEdgeConverter());
    } else {
        System.out.println("Executing EnumTriangles example with default edges data set.");
        System.out.println("Use --edges to specify file input.");
        edges = EnumTrianglesData.getDefaultEdgeDataSet(env);
    }
    // project edges by vertex id
    DataSet<Edge> edgesById = edges.map(new EdgeByIdProjector());
    DataSet<Triad> triangles = edgesById.groupBy(Edge.V1).sortGroup(Edge.V2, Order.ASCENDING).reduceGroup(new TriadBuilder()).join(edgesById).where(Triad.V2, Triad.V3).equalTo(Edge.V1, Edge.V2).with(new TriadFilter());
    // emit result
    if (params.has("output")) {
        triangles.writeAsCsv(params.get("output"), "\n", ",");
        // execute program
        env.execute("Basic Triangle Enumeration Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        triangles.print();
    }
}
Also used: ParameterTool(org.apache.flink.api.java.utils.ParameterTool) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Triad(org.apache.flink.examples.java.graph.util.EnumTrianglesDataTypes.Triad) Edge(org.apache.flink.examples.java.graph.util.EnumTrianglesDataTypes.Edge)
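
Every example in this series calls env.getConfig().setGlobalJobParameters(params). Besides exposing the parameters in the web interface, this makes them readable from inside any rich function. A minimal sketch with a hypothetical RichMapFunction (EdgeSourceTagger is illustrative, not part of the example above):

import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.utils.ParameterTool;

public class EdgeSourceTagger extends RichMapFunction<Integer, String> {

    @Override
    public String map(Integer value) throws Exception {
        // The cast is safe here because ParameterTool extends ExecutionConfig.GlobalJobParameters
        // and setGlobalJobParameters was called with a ParameterTool instance:
        ParameterTool params = (ParameterTool) getRuntimeContext().getExecutionConfig().getGlobalJobParameters();
        return value + " (edges: " + params.get("edges", "built-in data set") + ")";
    }
}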

Example 28 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

From the class PageRank, method main.

// *************************************************************************
//     PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    ParameterTool params = ParameterTool.fromArgs(args);
    final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
    final int maxIterations = params.getInt("iterations", 10);
    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // make the parameters available to the web ui
    env.getConfig().setGlobalJobParameters(params);
    // get input data
    DataSet<Long> pagesInput = getPagesDataSet(env, params);
    DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);
    // assign initial rank to pages
    DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput.map(new RankAssigner((1.0d / numPages)));
    // build adjacency list from link input
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput = linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());
    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);
    DataSet<Tuple2<Long, Double>> newRanks = iteration.join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch()).groupBy(0).aggregate(SUM, 1).map(new Dampener(DAMPENING_FACTOR, numPages));
    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(newRanks, newRanks.join(iteration).where(0).equalTo(0).filter(new EpsilonFilter()));
    // emit result
    if (params.has("output")) {
        finalPageRanks.writeAsCsv(params.get("output"), "\n", " ");
        // execute program
        env.execute("Basic Page Rank Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        finalPageRanks.print();
    }
}
Also used: ParameterTool(org.apache.flink.api.java.utils.ParameterTool) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Tuple2(org.apache.flink.api.java.tuple.Tuple2)
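
PageRank reads both of its numeric settings with fallback defaults (numPages and iterations). For parameters that must be supplied, ParameterTool also offers getRequired, which fails fast instead of returning null; a small sketch (the class name RequiredVsDefaultDemo is hypothetical, the flag names mirror the examples above):

import org.apache.flink.api.java.utils.ParameterTool;

public class RequiredVsDefaultDemo {

    public static void main(String[] args) {
        ParameterTool params = ParameterTool.fromArgs(args);
        // Falls back to 10 when --iterations is absent, as in the PageRank example:
        int iterations = params.getInt("iterations", 10);
        // Throws a RuntimeException when --output is absent, instead of returning null:
        String output = params.getRequired("output");
        System.out.println(iterations + " iterations, writing to " + output);
    }
}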

Example 29 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

From the class TransitiveClosureNaive, method main.

public static void main(String... args) throws Exception {
    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    final int maxIterations = params.getInt("iterations", 10);
    DataSet<Tuple2<Long, Long>> edges;
    if (params.has("edges")) {
        edges = env.readCsvFile(params.get("edges")).fieldDelimiter(" ").types(Long.class, Long.class);
    } else {
        System.out.println("Executing TransitiveClosureNaive example with default edges data set.");
        System.out.println("Use --edges to specify file input.");
        edges = ConnectedComponentsData.getDefaultEdgeDataSet(env);
    }
    IterativeDataSet<Tuple2<Long, Long>> paths = edges.iterate(maxIterations);
    DataSet<Tuple2<Long, Long>> nextPaths = paths.join(edges).where(1).equalTo(0).with(new JoinFunction<Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple2<Long, Long>>() {

        @Override
        public Tuple2<Long, Long> join(Tuple2<Long, Long> left, Tuple2<Long, Long> right) throws Exception {
            // left:  Path (z,x) - x is reachable from z
            // right: Edge (x,y) - the edge x-->y exists
            // out:   Path (z,y) - y is reachable from z
            return new Tuple2<Long, Long>(left.f0, right.f1);
        }
    }).withForwardedFieldsFirst("0").withForwardedFieldsSecond("1").union(paths).groupBy(0, 1).reduceGroup(new GroupReduceFunction<Tuple2<Long, Long>, Tuple2<Long, Long>>() {

        @Override
        public void reduce(Iterable<Tuple2<Long, Long>> values, Collector<Tuple2<Long, Long>> out) throws Exception {
            out.collect(values.iterator().next());
        }
    }).withForwardedFields("0;1");
    DataSet<Tuple2<Long, Long>> newPaths = paths.coGroup(nextPaths).where(0).equalTo(0).with(new CoGroupFunction<Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple2<Long, Long>>() {

        Set<Tuple2<Long, Long>> prevSet = new HashSet<Tuple2<Long, Long>>();

        @Override
        public void coGroup(Iterable<Tuple2<Long, Long>> prevPaths, Iterable<Tuple2<Long, Long>> nextPaths, Collector<Tuple2<Long, Long>> out) throws Exception {
            for (Tuple2<Long, Long> prev : prevPaths) {
                prevSet.add(prev);
            }
            for (Tuple2<Long, Long> next : nextPaths) {
                if (!prevSet.contains(next)) {
                    out.collect(next);
                }
            }
        }
    }).withForwardedFieldsFirst("0").withForwardedFieldsSecond("0");
    DataSet<Tuple2<Long, Long>> transitiveClosure = paths.closeWith(nextPaths, newPaths);
    // emit result
    if (params.has("output")) {
        transitiveClosure.writeAsCsv(params.get("output"), "\n", " ");
        // execute program explicitly, because file sinks are lazy
        env.execute("Transitive Closure Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        transitiveClosure.print();
    }
}
Also used: ParameterTool(org.apache.flink.api.java.utils.ParameterTool) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) GroupReduceFunction(org.apache.flink.api.common.functions.GroupReduceFunction) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Collector(org.apache.flink.util.Collector) HashSet(java.util.HashSet)
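
All of these examples parse command-line flags only, but ParameterTool can also read other sources and merge them. A sketch combining a properties file with command-line overrides; the path /path/to/job.properties and the class name MergedParametersDemo are hypothetical:

import org.apache.flink.api.java.utils.ParameterTool;

public class MergedParametersDemo {

    public static void main(String[] args) throws Exception {
        // Hypothetical properties file holding defaults, e.g. a line "iterations=10":
        ParameterTool fileParams = ParameterTool.fromPropertiesFile("/path/to/job.properties");
        // mergeWith gives precedence to its argument, so command-line flags win over file defaults:
        ParameterTool params = fileParams.mergeWith(ParameterTool.fromArgs(args));
        System.out.println(params.getInt("iterations", 10));
    }
}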

Example 30 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

From the class LinearRegression, method main.

// *************************************************************************
//     PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final int iterations = params.getInt("iterations", 10);
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // get input x data from elements
    DataSet<Data> data;
    if (params.has("input")) {
        // read data from CSV file
        data = env.readCsvFile(params.get("input")).fieldDelimiter(" ").includeFields(true, true).pojoType(Data.class);
    } else {
        System.out.println("Executing LinearRegression example with default input data set.");
        System.out.println("Use --input to specify file input.");
        data = LinearRegressionData.getDefaultDataDataSet(env);
    }
    // get the parameters from elements
    DataSet<Params> parameters = LinearRegressionData.getDefaultParamsDataSet(env);
    // set number of bulk iterations for SGD linear Regression
    IterativeDataSet<Params> loop = parameters.iterate(iterations);
    DataSet<Params> new_parameters = data.map(new SubUpdate()).withBroadcastSet(loop, "parameters").reduce(new UpdateAccumulator()).map(new Update());
    // feed new parameters back into next iteration
    DataSet<Params> result = loop.closeWith(new_parameters);
    // emit result
    if (params.has("output")) {
        result.writeAsText(params.get("output"));
        // execute program
        env.execute("Linear Regression example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
Also used: ParameterTool(org.apache.flink.api.java.utils.ParameterTool) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) LinearRegressionData(org.apache.flink.examples.java.ml.util.LinearRegressionData)
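
All five examples end with the same branch: file sinks such as writeAsText and writeAsCsv are lazy and only register the sink, so env.execute() is needed to run the program, whereas print() triggers execution by itself. A compact sketch of that shared pattern (the class EmitPattern and the DataSet<String> parameter are illustrative):

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.utils.ParameterTool;

public class EmitPattern {

    public static void emit(ExecutionEnvironment env, ParameterTool params, DataSet<String> result) throws Exception {
        if (params.has("output")) {
            result.writeAsText(params.get("output")); // lazy: only registers the file sink
            env.execute("Example job");               // required to actually run the program
        } else {
            result.print();                           // print() executes the job itself
        }
    }
}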

Aggregations

ParameterTool (org.apache.flink.api.java.utils.ParameterTool) 43
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment) 19
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) 19
Tuple2 (org.apache.flink.api.java.tuple.Tuple2) 15
JobExecutionResult (org.apache.flink.api.common.JobExecutionResult) 7
NumberFormat (java.text.NumberFormat) 6
Properties (java.util.Properties) 6
ProgramParametrizationException (org.apache.flink.client.program.ProgramParametrizationException) 6
JDKRandomGeneratorFactory (org.apache.flink.graph.generator.random.JDKRandomGeneratorFactory) 6
LongValue (org.apache.flink.types.LongValue) 6
NullValue (org.apache.flink.types.NullValue) 6
Graph (org.apache.flink.graph.Graph) 5
GraphCsvReader (org.apache.flink.graph.GraphCsvReader) 5
LongValueToUnsignedIntValue (org.apache.flink.graph.asm.translate.translators.LongValueToUnsignedIntValue) 5
RMatGraph (org.apache.flink.graph.generator.RMatGraph) 5
RandomGenerableFactory (org.apache.flink.graph.generator.random.RandomGenerableFactory) 5
SimpleStringSchema (org.apache.flink.streaming.util.serialization.SimpleStringSchema) 5
IntValue (org.apache.flink.types.IntValue) 5
StringValue (org.apache.flink.types.StringValue) 4
DataSet (org.apache.flink.api.java.DataSet) 3
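
Taken together, the examples above share one skeleton: parse arguments, register them as global job parameters, and branch on --output when emitting results. A minimal, self-contained sketch of that pattern (the job name and the fromElements stand-in data are illustrative):

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.utils.ParameterTool;

public class ParameterToolSkeleton {

    public static void main(String[] args) throws Exception {
        final ParameterTool params = ParameterTool.fromArgs(args);
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Make the parameters visible in the web interface and to rich functions:
        env.getConfig().setGlobalJobParameters(params);
        // Stand-in data set; the real jobs read input chosen via params, as shown above:
        DataSet<String> data = env.fromElements("a", "b", "c");
        if (params.has("output")) {
            data.writeAsText(params.get("output"));
            env.execute("ParameterTool skeleton");
        } else {
            data.print();
        }
    }
}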