
Example 21 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

The class WordCount, method main:

// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up the execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // get input data
    DataStream<String> text;
    if (params.has("input")) {
        // read the text file from given input path
        text = env.readTextFile(params.get("input"));
    } else {
        System.out.println("Executing WordCount example with default input data set.");
        System.out.println("Use --input to specify file input.");
        // get default test text data
        text = env.fromElements(WordCountData.WORDS);
    }
    // split up the lines into pairs (2-tuples) containing: (word, 1)
    DataStream<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).keyBy(0).sum(1);
    // emit result
    if (params.has("output")) {
        counts.writeAsText(params.get("output"));
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        counts.print();
    }
    // execute program
    env.execute("Streaming WordCount");
}
Also used : ParameterTool(org.apache.flink.api.java.utils.ParameterTool) Tuple2(org.apache.flink.api.java.tuple.Tuple2) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
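
The pattern above is the standard ParameterTool flow: parse the arguments once, then branch on optional flags. A minimal, self-contained sketch of just those calls; the flag names mirror the example, and the fallback value is illustrative:

import org.apache.flink.api.java.utils.ParameterTool;

public class ParameterToolBasicsSketch {
    public static void main(String[] args) {
        // parse "--key value" pairs from the command line
        ParameterTool params = ParameterTool.fromArgs(args);
        // optional flag: check presence before reading
        if (params.has("input")) {
            System.out.println("input = " + params.get("input"));
        }
        // optional flag with a fallback value (the fallback here is illustrative)
        String output = params.get("output", "/tmp/wordcount-out");
        System.out.println("output = " + output);
    }
}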

Example 22 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

The class ManualConsumerProducerTest, method main:

public static void main(String[] args) throws Exception {
    ParameterTool pt = ParameterTool.fromArgs(args);
    StreamExecutionEnvironment see = StreamExecutionEnvironment.getExecutionEnvironment();
    see.setParallelism(4);
    DataStream<String> simpleStringStream = see.addSource(new ProduceIntoKinesis.EventsGenerator());
    Properties kinesisProducerConfig = new Properties();
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_REGION, pt.getRequired("region"));
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_ACCESS_KEY_ID, pt.getRequired("accessKey"));
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_SECRET_ACCESS_KEY, pt.getRequired("secretKey"));
    FlinkKinesisProducer<String> kinesis = new FlinkKinesisProducer<>(new KinesisSerializationSchema<String>() {

        @Override
        public ByteBuffer serialize(String element) {
            return ByteBuffer.wrap(element.getBytes(ConfigConstants.DEFAULT_CHARSET));
        }

        // every 10th element goes into a different stream
        @Override
        public String getTargetStream(String element) {
            if (element.split("-")[0].endsWith("0")) {
                return "flink-test-2";
            }
            // send to default stream
            return null;
        }
    }, kinesisProducerConfig);
    kinesis.setFailOnError(true);
    kinesis.setDefaultStream("test-flink");
    kinesis.setDefaultPartition("0");
    kinesis.setCustomPartitioner(new KinesisPartitioner<String>() {

        @Override
        public String getPartitionId(String element) {
            int l = element.length();
            return element.substring(l - 1, l);
        }
    });
    simpleStringStream.addSink(kinesis);
    // consuming topology
    Properties consumerProps = new Properties();
    consumerProps.setProperty(ConsumerConfigConstants.AWS_ACCESS_KEY_ID, pt.getRequired("accessKey"));
    consumerProps.setProperty(ConsumerConfigConstants.AWS_SECRET_ACCESS_KEY, pt.getRequired("secretKey"));
    consumerProps.setProperty(ConsumerConfigConstants.AWS_REGION, pt.getRequired("region"));
    DataStream<String> consuming = see.addSource(new FlinkKinesisConsumer<>("test-flink", new SimpleStringSchema(), consumerProps));
    // validate consumed records for correctness
    consuming.flatMap(new FlatMapFunction<String, String>() {

        @Override
        public void flatMap(String value, Collector<String> out) throws Exception {
            String[] parts = value.split("-");
            try {
                long l = Long.parseLong(parts[0]);
                if (l < 0) {
                    throw new RuntimeException("Negative");
                }
            } catch (NumberFormatException nfe) {
                throw new RuntimeException("First part of '" + value + "' is not a valid numeric type");
            }
            if (parts[1].length() != 12) {
                throw new RuntimeException("Second part of '" + value + "' doesn't have 12 characters");
            }
        }
    });
    consuming.print();
    see.execute();
}
Also used : ParameterTool(org.apache.flink.api.java.utils.ParameterTool) Properties(java.util.Properties) ByteBuffer(java.nio.ByteBuffer) FlinkKinesisProducer(org.apache.flink.streaming.connectors.kinesis.FlinkKinesisProducer) SimpleStringSchema(org.apache.flink.streaming.util.serialization.SimpleStringSchema) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) ProduceIntoKinesis(org.apache.flink.streaming.connectors.kinesis.examples.ProduceIntoKinesis)
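
The test reads its AWS credentials with getRequired, which aborts immediately when a flag is missing. A hedged sketch of that fail-fast behavior, using the same flag names:

import org.apache.flink.api.java.utils.ParameterTool;

public class RequiredFlagsSketch {
    public static void main(String[] args) {
        // e.g. --region us-east-1 --accessKey AKIA... --secretKey ...
        ParameterTool pt = ParameterTool.fromArgs(args);
        // getRequired(...) throws a RuntimeException when the flag is absent,
        // so misconfigured runs fail here rather than deep inside an AWS call
        String region = pt.getRequired("region");
        String accessKey = pt.getRequired("accessKey");
        String secretKey = pt.getRequired("secretKey");
        System.out.println("Configured for region " + region
                + " with access key of length " + accessKey.length()
                + " and secret of length " + secretKey.length());
    }
}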

Example 23 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

The class ManualExactlyOnceWithStreamReshardingTest, method main:

public static void main(String[] args) throws Exception {
    final ParameterTool pt = ParameterTool.fromArgs(args);
    LOG.info("Starting exactly once with stream resharding test");
    final String streamName = "flink-test-" + UUID.randomUUID().toString();
    final String accessKey = pt.getRequired("accessKey");
    final String secretKey = pt.getRequired("secretKey");
    final String region = pt.getRequired("region");
    final Properties configProps = new Properties();
    configProps.setProperty(ConsumerConfigConstants.AWS_ACCESS_KEY_ID, accessKey);
    configProps.setProperty(ConsumerConfigConstants.AWS_SECRET_ACCESS_KEY, secretKey);
    configProps.setProperty(ConsumerConfigConstants.AWS_REGION, region);
    configProps.setProperty(ConsumerConfigConstants.SHARD_DISCOVERY_INTERVAL_MILLIS, "0");
    final AmazonKinesisClient client = AWSUtil.createKinesisClient(configProps);
    // the stream is first created with 1 shard
    client.createStream(streamName, 1);
    // wait until stream has been created
    DescribeStreamResult status = client.describeStream(streamName);
    LOG.info("status {}", status);
    while (!status.getStreamDescription().getStreamStatus().equals("ACTIVE")) {
        status = client.describeStream(streamName);
        LOG.info("Status of stream {}", status);
        Thread.sleep(1000);
    }
    final Configuration flinkConfig = new Configuration();
    flinkConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
    flinkConfig.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 8);
    flinkConfig.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 16);
    flinkConfig.setString(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_DELAY, "0 s");
    LocalFlinkMiniCluster flink = new LocalFlinkMiniCluster(flinkConfig, false);
    flink.start();
    final int flinkPort = flink.getLeaderRPCPort();
    try {
        // we have to use a manual generator here instead of the FlinkKinesisProducer
        // because the FlinkKinesisProducer currently has a problem where records will be resent to a shard
        // when resharding happens; this affects the consumer exactly-once validation test and will never pass
        final AtomicReference<Throwable> producerError = new AtomicReference<>();
        Runnable manualGenerate = new Runnable() {

            @Override
            public void run() {
                AmazonKinesisClient client = AWSUtil.createKinesisClient(configProps);
                int count = 0;
                final int batchSize = 30;
                while (true) {
                    try {
                        Thread.sleep(10);
                        Set<PutRecordsRequestEntry> batch = new HashSet<>();
                        for (int i = count; i < count + batchSize; i++) {
                            if (i >= TOTAL_EVENT_COUNT) {
                                break;
                            }
                            batch.add(new PutRecordsRequestEntry().withData(ByteBuffer.wrap(((i) + "-" + RandomStringUtils.randomAlphabetic(12)).getBytes(ConfigConstants.DEFAULT_CHARSET))).withPartitionKey(UUID.randomUUID().toString()));
                        }
                        count += batchSize;
                        PutRecordsResult result = client.putRecords(new PutRecordsRequest().withStreamName(streamName).withRecords(batch));
                        // if any record in the batch failed to be put, surface it as a
                        // producer error and let this test fail
                        if (result.getFailedRecordCount() > 0) {
                            producerError.set(new RuntimeException("The producer has failed records in one of the put batch attempts."));
                            break;
                        }
                        if (count >= TOTAL_EVENT_COUNT) {
                            break;
                        }
                    } catch (Exception e) {
                        producerError.set(e);
                    }
                }
            }
        };
        Thread producerThread = new Thread(manualGenerate);
        producerThread.start();
        final AtomicReference<Throwable> consumerError = new AtomicReference<>();
        Thread consumerThread = ExactlyOnceValidatingConsumerThread.create(TOTAL_EVENT_COUNT, 10000, 2, 500, 500, accessKey, secretKey, region, streamName, consumerError, flinkPort, flinkConfig);
        consumerThread.start();
        // reshard the Kinesis stream while the producer / and consumers are running
        Runnable splitShard = new Runnable() {

            @Override
            public void run() {
                try {
                    // first, split shard in the middle of the hash range
                    Thread.sleep(5000);
                    LOG.info("Splitting shard ...");
                    client.splitShard(streamName, KinesisShardIdGenerator.generateFromShardOrder(0), "170141183460469231731687303715884105727");
                    // wait until the split shard operation finishes updating ...
                    DescribeStreamResult status;
                    Random rand = new Random();
                    do {
                        status = null;
                        while (status == null) {
                            // retry until we get status
                            try {
                                status = client.describeStream(streamName);
                            } catch (LimitExceededException lee) {
                                LOG.warn("LimitExceededException while describing stream ... retrying ...");
                                Thread.sleep(rand.nextInt(1200));
                            }
                        }
                    } while (!status.getStreamDescription().getStreamStatus().equals("ACTIVE"));
                    // then merge again
                    Thread.sleep(7000);
                    LOG.info("Merging shards ...");
                    client.mergeShards(streamName, KinesisShardIdGenerator.generateFromShardOrder(1), KinesisShardIdGenerator.generateFromShardOrder(2));
                } catch (InterruptedException iex) {
                    // interrupted while waiting; the test is shutting down, nothing to do
                }
            }
        };
        Thread splitShardThread = new Thread(splitShard);
        splitShardThread.start();
        boolean deadlinePassed = false;
        // wait at most for five minutes
        long deadline = System.currentTimeMillis() + (1000 * 5 * 60);
        // wait until both producer and consumer finishes, or an unexpected error is thrown
        while ((consumerThread.isAlive() || producerThread.isAlive()) && (producerError.get() == null && consumerError.get() == null)) {
            Thread.sleep(1000);
            if (System.currentTimeMillis() >= deadline) {
                LOG.warn("Deadline passed");
                deadlinePassed = true;
                // enough waiting
                break;
            }
        }
        if (producerThread.isAlive()) {
            producerThread.interrupt();
        }
        if (consumerThread.isAlive()) {
            consumerThread.interrupt();
        }
        if (producerError.get() != null) {
            LOG.info("+++ TEST failed! +++");
            throw new RuntimeException("Producer failed", producerError.get());
        }
        if (consumerError.get() != null) {
            LOG.info("+++ TEST failed! +++");
            throw new RuntimeException("Consumer failed", consumerError.get());
        }
        if (!deadlinePassed) {
            LOG.info("+++ TEST passed! +++");
        } else {
            LOG.info("+++ TEST failed! +++");
        }
    } finally {
        client.deleteStream(streamName);
        client.shutdown();
        // stopping flink
        flink.stop();
    }
}
Also used : ParameterTool(org.apache.flink.api.java.utils.ParameterTool) PutRecordsRequestEntry(com.amazonaws.services.kinesis.model.PutRecordsRequestEntry) Configuration(org.apache.flink.configuration.Configuration) Properties(java.util.Properties) LocalFlinkMiniCluster(org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster) PutRecordsResult(com.amazonaws.services.kinesis.model.PutRecordsResult) Random(java.util.Random) PutRecordsRequest(com.amazonaws.services.kinesis.model.PutRecordsRequest) HashSet(java.util.HashSet) AmazonKinesisClient(com.amazonaws.services.kinesis.AmazonKinesisClient) AtomicReference(java.util.concurrent.atomic.AtomicReference) LimitExceededException(com.amazonaws.services.kinesis.model.LimitExceededException) ExactlyOnceValidatingConsumerThread(org.apache.flink.streaming.connectors.kinesis.testutils.ExactlyOnceValidatingConsumerThread) LimitExceededException(com.amazonaws.services.kinesis.model.LimitExceededException) DescribeStreamResult(com.amazonaws.services.kinesis.model.DescribeStreamResult)
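
Both worker threads in this test hand failures back to the main thread through an AtomicReference<Throwable>, which the main loop polls alongside a deadline. A stripped-down sketch of that coordination pattern, with the Kinesis and Flink specifics removed:

import java.util.concurrent.atomic.AtomicReference;

public class WorkerErrorSketch {
    public static void main(String[] args) throws Exception {
        final AtomicReference<Throwable> workerError = new AtomicReference<>();
        Thread worker = new Thread(() -> {
            try {
                Thread.sleep(500);                       // simulated work
                throw new RuntimeException("simulated failure");
            } catch (Throwable t) {
                workerError.set(t);                      // hand the error to the main thread
            }
        });
        worker.start();
        // bounded wait, analogous to the test's five-minute deadline
        long deadline = System.currentTimeMillis() + 5_000;
        while (worker.isAlive() && workerError.get() == null) {
            Thread.sleep(100);
            if (System.currentTimeMillis() >= deadline) {
                worker.interrupt();                      // enough waiting
                break;
            }
        }
        if (workerError.get() != null) {
            throw new RuntimeException("Worker failed", workerError.get());
        }
    }
}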

Example 24 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

The class ManualProducerTest, method main:

public static void main(String[] args) throws Exception {
    ParameterTool pt = ParameterTool.fromArgs(args);
    StreamExecutionEnvironment see = StreamExecutionEnvironment.getExecutionEnvironment();
    see.setParallelism(4);
    DataStream<String> simpleStringStream = see.addSource(new ProduceIntoKinesis.EventsGenerator());
    Properties kinesisProducerConfig = new Properties();
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_REGION, pt.getRequired("region"));
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_ACCESS_KEY_ID, pt.getRequired("accessKey"));
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_SECRET_ACCESS_KEY, pt.getRequired("secretKey"));
    FlinkKinesisProducer<String> kinesis = new FlinkKinesisProducer<>(new KinesisSerializationSchema<String>() {

        @Override
        public ByteBuffer serialize(String element) {
            return ByteBuffer.wrap(element.getBytes(ConfigConstants.DEFAULT_CHARSET));
        }

        // every 10th element goes into a different stream
        @Override
        public String getTargetStream(String element) {
            if (element.split("-")[0].endsWith("0")) {
                return "flink-test-2";
            }
            // send to default stream
            return null;
        }
    }, kinesisProducerConfig);
    kinesis.setFailOnError(true);
    kinesis.setDefaultStream("test-flink");
    kinesis.setDefaultPartition("0");
    kinesis.setCustomPartitioner(new KinesisPartitioner<String>() {

        @Override
        public String getPartitionId(String element) {
            int l = element.length();
            return element.substring(l - 1, l);
        }
    });
    simpleStringStream.addSink(kinesis);
    see.execute();
}
Also used : ParameterTool(org.apache.flink.api.java.utils.ParameterTool) Properties(java.util.Properties) ByteBuffer(java.nio.ByteBuffer) FlinkKinesisProducer(org.apache.flink.streaming.connectors.kinesis.FlinkKinesisProducer) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) ProduceIntoKinesis(org.apache.flink.streaming.connectors.kinesis.examples.ProduceIntoKinesis)
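
The custom KinesisPartitioner above keys each record by its last character, so records sharing a final character land on the same partition. A standalone illustration of that mapping; the sample records are invented, in the "<counter>-<12 letters>" shape the generator emits:

public class PartitionIdSketch {
    static String getPartitionId(String element) {
        int l = element.length();
        return element.substring(l - 1, l);  // the last character becomes the partition key
    }

    public static void main(String[] args) {
        for (String record : new String[] {"17-abcdefghijkl", "20-mnopqrstuvwx"}) {
            System.out.println(record + " -> partition " + getPartitionId(record));
        }
    }
}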

Example 25 with ParameterTool

Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.

The class KMeansDataGenerator, method main:

/**
	 * Main method to generate data for the {@link KMeans} example program.
	 * <p>
	 * The generator creates two files:
	 * <ul>
	 * <li><code>&lt; output-path &gt;/points</code> for the data points
	 * <li><code>&lt; output-path &gt;/centers</code> for the cluster centers
	 * </ul> 
	 * 
	 * @param args 
	 * <ol>
	 * <li>Int: Number of data points
	 * <li>Int: Number of cluster centers
	 * <li><b>Optional</b> String: Output path, default value is {tmp.dir}
	 * <li><b>Optional</b> Double: Standard deviation of data points
	 * <li><b>Optional</b> Double: Value range of cluster centers
	 * <li><b>Optional</b> Long: Random seed
	 * </ol>
	 *
	 * @throws IOException
	 */
public static void main(String[] args) throws IOException {
    // check parameter count
    if (args.length < 2) {
        System.out.println("KMeansDataGenerator -points <num> -k <num clusters> [-output <output-path>] [-stddev <relative stddev>] [-range <centroid range>] [-seed <seed>]");
        System.exit(1);
    }
    // parse parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    final int numDataPoints = params.getInt("points");
    final int k = params.getInt("k");
    final String outDir = params.get("output", System.getProperty("java.io.tmpdir"));
    final double stddev = params.getDouble("stddev", RELATIVE_STDDEV);
    final double range = params.getDouble("range", DEFAULT_VALUE_RANGE);
    final long firstSeed = params.getLong("seed", DEFAULT_SEED);
    final double absoluteStdDev = stddev * range;
    final Random random = new Random(firstSeed);
    // the means around which data points are distributed
    final double[][] means = uniformRandomCenters(random, k, DIMENSIONALITY, range);
    // write the points out
    BufferedWriter pointsOut = null;
    try {
        pointsOut = new BufferedWriter(new FileWriter(new File(outDir + "/" + POINTS_FILE)));
        StringBuilder buffer = new StringBuilder();
        double[] point = new double[DIMENSIONALITY];
        int nextCentroid = 0;
        for (int i = 1; i <= numDataPoints; i++) {
            // generate a point for the current centroid
            double[] centroid = means[nextCentroid];
            for (int d = 0; d < DIMENSIONALITY; d++) {
                point[d] = (random.nextGaussian() * absoluteStdDev) + centroid[d];
            }
            writePoint(point, buffer, pointsOut);
            nextCentroid = (nextCentroid + 1) % k;
        }
    } finally {
        if (pointsOut != null) {
            pointsOut.close();
        }
    }
    // write the uniformly distributed centers to a file
    BufferedWriter centersOut = null;
    try {
        centersOut = new BufferedWriter(new FileWriter(new File(outDir + "/" + CENTERS_FILE)));
        StringBuilder buffer = new StringBuilder();
        double[][] centers = uniformRandomCenters(random, k, DIMENSIONALITY, range);
        for (int i = 0; i < k; i++) {
            writeCenter(i + 1, centers[i], buffer, centersOut);
        }
    } finally {
        if (centersOut != null) {
            centersOut.close();
        }
    }
    System.out.println("Wrote " + numDataPoints + " data points to " + outDir + "/" + POINTS_FILE);
    System.out.println("Wrote " + k + " cluster centers to " + outDir + "/" + CENTERS_FILE);
}
Also used : ParameterTool(org.apache.flink.api.java.utils.ParameterTool) Random(java.util.Random) FileWriter(java.io.FileWriter) File(java.io.File) BufferedWriter(java.io.BufferedWriter)
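
The generator relies on ParameterTool's typed accessors with fallbacks for its optional flags. A minimal sketch of those overloads; the default values here are illustrative, not the generator's actual constants:

import org.apache.flink.api.java.utils.ParameterTool;

public class TypedAccessorsSketch {
    public static void main(String[] args) {
        ParameterTool params = ParameterTool.fromArgs(args);
        // getInt(key) without a default throws if the flag is missing,
        // which is why the generator checks args.length and prints usage first
        int points = params.getInt("points");
        int k = params.getInt("k");
        // typed accessors with fallbacks, as the generator uses for its optional flags
        String outDir = params.get("output", System.getProperty("java.io.tmpdir"));
        double stddev = params.getDouble("stddev", 0.08);  // illustrative default
        long seed = params.getLong("seed", 42L);           // illustrative default
        System.out.printf("points=%d k=%d outDir=%s stddev=%.2f seed=%d%n",
                points, k, outDir, stddev, seed);
    }
}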

Aggregations

ParameterTool (org.apache.flink.api.java.utils.ParameterTool): 43
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 19
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 19
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 15
JobExecutionResult (org.apache.flink.api.common.JobExecutionResult): 7
NumberFormat (java.text.NumberFormat): 6
Properties (java.util.Properties): 6
ProgramParametrizationException (org.apache.flink.client.program.ProgramParametrizationException): 6
JDKRandomGeneratorFactory (org.apache.flink.graph.generator.random.JDKRandomGeneratorFactory): 6
LongValue (org.apache.flink.types.LongValue): 6
NullValue (org.apache.flink.types.NullValue): 6
Graph (org.apache.flink.graph.Graph): 5
GraphCsvReader (org.apache.flink.graph.GraphCsvReader): 5
LongValueToUnsignedIntValue (org.apache.flink.graph.asm.translate.translators.LongValueToUnsignedIntValue): 5
RMatGraph (org.apache.flink.graph.generator.RMatGraph): 5
RandomGenerableFactory (org.apache.flink.graph.generator.random.RandomGenerableFactory): 5
SimpleStringSchema (org.apache.flink.streaming.util.serialization.SimpleStringSchema): 5
IntValue (org.apache.flink.types.IntValue): 5
StringValue (org.apache.flink.types.StringValue): 4
DataSet (org.apache.flink.api.java.DataSet): 3