Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The class WordCount, method main.
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up the execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    // get input data
    DataStream<String> text;
    if (params.has("input")) {
        // read the text file from given input path
        text = env.readTextFile(params.get("input"));
    } else {
        System.out.println("Executing WordCount example with default input data set.");
        System.out.println("Use --input to specify file input.");
        // get default test text data
        text = env.fromElements(WordCountData.WORDS);
    }
    // split up the lines in pairs (2-tuples) containing: (word,1)
    DataStream<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).keyBy(0).sum(1);
    // emit result
    if (params.has("output")) {
        counts.writeAsText(params.get("output"));
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        counts.print();
    }
    // execute program
    env.execute("Streaming WordCount");
}
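The Tokenizer used in the flatMap call above is not part of this snippet. A minimal sketch in the spirit of the standard Flink WordCount tokenizer (not necessarily the exact class from the repository) could look like this:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

// Splits each line into lowercase words and emits one (word, 1) tuple per word.
public static final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {
    @Override
    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
        for (String token : value.toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Tuple2<>(token, 1));
            }
        }
    }
}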
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The class ManualConsumerProducerTest, method main.
public static void main(String[] args) throws Exception {
    ParameterTool pt = ParameterTool.fromArgs(args);
    StreamExecutionEnvironment see = StreamExecutionEnvironment.getExecutionEnvironment();
    see.setParallelism(4);
    DataStream<String> simpleStringStream = see.addSource(new ProduceIntoKinesis.EventsGenerator());
    Properties kinesisProducerConfig = new Properties();
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_REGION, pt.getRequired("region"));
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_ACCESS_KEY_ID, pt.getRequired("accessKey"));
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_SECRET_ACCESS_KEY, pt.getRequired("secretKey"));
    FlinkKinesisProducer<String> kinesis = new FlinkKinesisProducer<>(new KinesisSerializationSchema<String>() {
        @Override
        public ByteBuffer serialize(String element) {
            return ByteBuffer.wrap(element.getBytes(ConfigConstants.DEFAULT_CHARSET));
        }

        // every 10th element goes into a different stream
        @Override
        public String getTargetStream(String element) {
            if (element.split("-")[0].endsWith("0")) {
                return "flink-test-2";
            }
            // send to default stream
            return null;
        }
    }, kinesisProducerConfig);
    kinesis.setFailOnError(true);
    kinesis.setDefaultStream("test-flink");
    kinesis.setDefaultPartition("0");
    kinesis.setCustomPartitioner(new KinesisPartitioner<String>() {
        @Override
        public String getPartitionId(String element) {
            int l = element.length();
            return element.substring(l - 1, l);
        }
    });
    simpleStringStream.addSink(kinesis);
    // consuming topology
    Properties consumerProps = new Properties();
    consumerProps.setProperty(ConsumerConfigConstants.AWS_ACCESS_KEY_ID, pt.getRequired("accessKey"));
    consumerProps.setProperty(ConsumerConfigConstants.AWS_SECRET_ACCESS_KEY, pt.getRequired("secretKey"));
    consumerProps.setProperty(ConsumerConfigConstants.AWS_REGION, pt.getRequired("region"));
    DataStream<String> consuming = see.addSource(new FlinkKinesisConsumer<>("test-flink", new SimpleStringSchema(), consumerProps));
    // validate consumed records for correctness
    consuming.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public void flatMap(String value, Collector<String> out) throws Exception {
            String[] parts = value.split("-");
            try {
                long l = Long.parseLong(parts[0]);
                if (l < 0) {
                    throw new RuntimeException("Negative");
                }
            } catch (NumberFormatException nfe) {
                throw new RuntimeException("First part of '" + value + "' is not a valid numeric type");
            }
            if (parts[1].length() != 12) {
                throw new RuntimeException("Second part of '" + value + "' doesn't have 12 characters");
            }
        }
    });
    consuming.print();
    see.execute();
}
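The three credentials are read with getRequired, which fails fast when a flag is missing. A short sketch of the ParameterTool calls this test relies on (the argument values are placeholders, not real credentials):

// ParameterTool.fromArgs expects "--key value" or "-key value" pairs
String[] exampleArgs = {"--region", "eu-central-1", "--accessKey", "<key>", "--secretKey", "<secret>"};
ParameterTool pt = ParameterTool.fromArgs(exampleArgs);

pt.getRequired("region");        // "eu-central-1"
pt.has("accessKey");             // true
pt.get("missing", "fallback");   // "fallback", since the flag is absent
// pt.getRequired("missing");    // would throw a RuntimeException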
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The class ManualExactlyOnceWithStreamReshardingTest, method main.
public static void main(String[] args) throws Exception {
    final ParameterTool pt = ParameterTool.fromArgs(args);
    LOG.info("Starting exactly once with stream resharding test");
    final String streamName = "flink-test-" + UUID.randomUUID().toString();
    final String accessKey = pt.getRequired("accessKey");
    final String secretKey = pt.getRequired("secretKey");
    final String region = pt.getRequired("region");
    final Properties configProps = new Properties();
    configProps.setProperty(ConsumerConfigConstants.AWS_ACCESS_KEY_ID, accessKey);
    configProps.setProperty(ConsumerConfigConstants.AWS_SECRET_ACCESS_KEY, secretKey);
    configProps.setProperty(ConsumerConfigConstants.AWS_REGION, region);
    configProps.setProperty(ConsumerConfigConstants.SHARD_DISCOVERY_INTERVAL_MILLIS, "0");
    final AmazonKinesisClient client = AWSUtil.createKinesisClient(configProps);
    // the stream is first created with 1 shard
    client.createStream(streamName, 1);
    // wait until stream has been created
    DescribeStreamResult status = client.describeStream(streamName);
    LOG.info("status {}", status);
    while (!status.getStreamDescription().getStreamStatus().equals("ACTIVE")) {
        status = client.describeStream(streamName);
        LOG.info("Status of stream {}", status);
        Thread.sleep(1000);
    }
    final Configuration flinkConfig = new Configuration();
    flinkConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
    flinkConfig.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 8);
    flinkConfig.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 16);
    flinkConfig.setString(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_DELAY, "0 s");
    LocalFlinkMiniCluster flink = new LocalFlinkMiniCluster(flinkConfig, false);
    flink.start();
    final int flinkPort = flink.getLeaderRPCPort();
    try {
        // we have to use a manual generator here instead of the FlinkKinesisProducer
        // because the FlinkKinesisProducer currently has a problem where records will be resent to a shard
        // when resharding happens; this affects the consumer exactly-once validation test and will never pass
        final AtomicReference<Throwable> producerError = new AtomicReference<>();
        Runnable manualGenerate = new Runnable() {
            @Override
            public void run() {
                AmazonKinesisClient client = AWSUtil.createKinesisClient(configProps);
                int count = 0;
                final int batchSize = 30;
                while (true) {
                    try {
                        Thread.sleep(10);
                        Set<PutRecordsRequestEntry> batch = new HashSet<>();
                        for (int i = count; i < count + batchSize; i++) {
                            if (i >= TOTAL_EVENT_COUNT) {
                                break;
                            }
                            batch.add(new PutRecordsRequestEntry()
                                    .withData(ByteBuffer.wrap(((i) + "-" + RandomStringUtils.randomAlphabetic(12)).getBytes(ConfigConstants.DEFAULT_CHARSET)))
                                    .withPartitionKey(UUID.randomUUID().toString()));
                        }
                        count += batchSize;
                        PutRecordsResult result = client.putRecords(new PutRecordsRequest().withStreamName(streamName).withRecords(batch));
                        // if any records failed in this put batch, record the error and let this test fail
                        if (result.getFailedRecordCount() > 0) {
                            producerError.set(new RuntimeException("The producer has failed records in one of the put batch attempts."));
                            break;
                        }
                        if (count >= TOTAL_EVENT_COUNT) {
                            break;
                        }
                    } catch (Exception e) {
                        producerError.set(e);
                    }
                }
            }
        };
        Thread producerThread = new Thread(manualGenerate);
        producerThread.start();
        final AtomicReference<Throwable> consumerError = new AtomicReference<>();
        Thread consumerThread = ExactlyOnceValidatingConsumerThread.create(TOTAL_EVENT_COUNT, 10000, 2, 500, 500, accessKey, secretKey, region, streamName, consumerError, flinkPort, flinkConfig);
        consumerThread.start();
        // reshard the Kinesis stream while the producer and consumers are running
        Runnable splitShard = new Runnable() {
            @Override
            public void run() {
                try {
                    // first, split shard in the middle of the hash range
                    Thread.sleep(5000);
                    LOG.info("Splitting shard ...");
                    client.splitShard(streamName, KinesisShardIdGenerator.generateFromShardOrder(0), "170141183460469231731687303715884105727");
                    // wait until the split shard operation finishes updating ...
                    DescribeStreamResult status;
                    Random rand = new Random();
                    do {
                        status = null;
                        while (status == null) {
                            // retry until we get status
                            try {
                                status = client.describeStream(streamName);
                            } catch (LimitExceededException lee) {
                                LOG.warn("LimitExceededException while describing stream ... retrying ...");
                                Thread.sleep(rand.nextInt(1200));
                            }
                        }
                    } while (!status.getStreamDescription().getStreamStatus().equals("ACTIVE"));
                    // then merge again
                    Thread.sleep(7000);
                    LOG.info("Merging shards ...");
                    client.mergeShards(streamName, KinesisShardIdGenerator.generateFromShardOrder(1), KinesisShardIdGenerator.generateFromShardOrder(2));
                } catch (InterruptedException iex) {
                    //
                }
            }
        };
        Thread splitShardThread = new Thread(splitShard);
        splitShardThread.start();
        boolean deadlinePassed = false;
        // wait for at most five minutes
        long deadline = System.currentTimeMillis() + (1000 * 5 * 60);
        // wait until both the producer and the consumer finish, or an unexpected error is thrown
        while ((consumerThread.isAlive() || producerThread.isAlive()) && (producerError.get() == null && consumerError.get() == null)) {
            Thread.sleep(1000);
            if (System.currentTimeMillis() >= deadline) {
                LOG.warn("Deadline passed");
                deadlinePassed = true;
                // enough waiting
                break;
            }
        }
        if (producerThread.isAlive()) {
            producerThread.interrupt();
        }
        if (consumerThread.isAlive()) {
            consumerThread.interrupt();
        }
        if (producerError.get() != null) {
            LOG.info("+++ TEST failed! +++");
            throw new RuntimeException("Producer failed", producerError.get());
        }
        if (consumerError.get() != null) {
            LOG.info("+++ TEST failed! +++");
            throw new RuntimeException("Consumer failed", consumerError.get());
        }
        if (!deadlinePassed) {
            LOG.info("+++ TEST passed! +++");
        } else {
            LOG.info("+++ TEST failed! +++");
        }
    } finally {
        client.deleteStream(streamName);
        client.shutdown();
        // stopping flink
        flink.stop();
    }
}
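Passing AWS credentials as command-line flags is fine for a manual test, but ParameterTool can also read them from a properties file and merge several sources. A small sketch, assuming a hypothetical file /path/to/aws.properties that contains accessKey, secretKey and region entries:

// values from fromArgs override the file entries for duplicate keys
ParameterTool fromFile = ParameterTool.fromPropertiesFile("/path/to/aws.properties");
ParameterTool merged = fromFile.mergeWith(ParameterTool.fromArgs(args));
final String accessKey = merged.getRequired("accessKey");
final String secretKey = merged.getRequired("secretKey");
final String region = merged.getRequired("region");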
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The class ManualProducerTest, method main.
public static void main(String[] args) throws Exception {
    ParameterTool pt = ParameterTool.fromArgs(args);
    StreamExecutionEnvironment see = StreamExecutionEnvironment.getExecutionEnvironment();
    see.setParallelism(4);
    DataStream<String> simpleStringStream = see.addSource(new ProduceIntoKinesis.EventsGenerator());
    Properties kinesisProducerConfig = new Properties();
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_REGION, pt.getRequired("region"));
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_ACCESS_KEY_ID, pt.getRequired("accessKey"));
    kinesisProducerConfig.setProperty(ProducerConfigConstants.AWS_SECRET_ACCESS_KEY, pt.getRequired("secretKey"));
    FlinkKinesisProducer<String> kinesis = new FlinkKinesisProducer<>(new KinesisSerializationSchema<String>() {
        @Override
        public ByteBuffer serialize(String element) {
            return ByteBuffer.wrap(element.getBytes(ConfigConstants.DEFAULT_CHARSET));
        }

        // every 10th element goes into a different stream
        @Override
        public String getTargetStream(String element) {
            if (element.split("-")[0].endsWith("0")) {
                return "flink-test-2";
            }
            // send to default stream
            return null;
        }
    }, kinesisProducerConfig);
    kinesis.setFailOnError(true);
    kinesis.setDefaultStream("test-flink");
    kinesis.setDefaultPartition("0");
    kinesis.setCustomPartitioner(new KinesisPartitioner<String>() {
        @Override
        public String getPartitionId(String element) {
            int l = element.length();
            return element.substring(l - 1, l);
        }
    });
    simpleStringStream.addSink(kinesis);
    see.execute();
}
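Unlike the WordCount example above, this test never calls env.getConfig().setGlobalJobParameters(params). When that call is made, the ParameterTool becomes available inside rich functions as the global job parameters; a brief sketch with a hypothetical RichMapFunction:

import org.apache.flink.api.common.functions.RichMapFunction;

// only works if setGlobalJobParameters(params) was called on the environment
public static class RegionTagger extends RichMapFunction<String, String> {
    @Override
    public String map(String value) {
        ParameterTool params = (ParameterTool)
                getRuntimeContext().getExecutionConfig().getGlobalJobParameters();
        return params.get("region", "unknown") + ": " + value;
    }
}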
Use of org.apache.flink.api.java.utils.ParameterTool in project flink by apache.
The class KMeansDataGenerator, method main.
/**
 * Main method to generate data for the {@link KMeans} example program.
 * <p>
 * The generator creates two files:
 * <ul>
 * <li><code><output-path>/points</code> for the data points
 * <li><code><output-path>/centers</code> for the cluster centers
 * </ul>
 *
 * @param args
 * <ol>
 * <li>Int: Number of data points
 * <li>Int: Number of cluster centers
 * <li><b>Optional</b> String: Output path, default value is {tmp.dir}
 * <li><b>Optional</b> Double: Standard deviation of data points
 * <li><b>Optional</b> Double: Value range of cluster centers
 * <li><b>Optional</b> Long: Random seed
 * </ol>
 *
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    // check parameter count
    if (args.length < 2) {
        System.out.println("KMeansDataGenerator -points <num> -k <num clusters> [-output <output-path>] [-stddev <relative stddev>] [-range <centroid range>] [-seed <seed>]");
        System.exit(1);
    }
    // parse parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    final int numDataPoints = params.getInt("points");
    final int k = params.getInt("k");
    final String outDir = params.get("output", System.getProperty("java.io.tmpdir"));
    final double stddev = params.getDouble("stddev", RELATIVE_STDDEV);
    final double range = params.getDouble("range", DEFAULT_VALUE_RANGE);
    final long firstSeed = params.getLong("seed", DEFAULT_SEED);
    final double absoluteStdDev = stddev * range;
    final Random random = new Random(firstSeed);
    // the means around which data points are distributed
    final double[][] means = uniformRandomCenters(random, k, DIMENSIONALITY, range);
    // write the points out
    BufferedWriter pointsOut = null;
    try {
        pointsOut = new BufferedWriter(new FileWriter(new File(outDir + "/" + POINTS_FILE)));
        StringBuilder buffer = new StringBuilder();
        double[] point = new double[DIMENSIONALITY];
        int nextCentroid = 0;
        for (int i = 1; i <= numDataPoints; i++) {
            // generate a point for the current centroid
            double[] centroid = means[nextCentroid];
            for (int d = 0; d < DIMENSIONALITY; d++) {
                point[d] = (random.nextGaussian() * absoluteStdDev) + centroid[d];
            }
            writePoint(point, buffer, pointsOut);
            nextCentroid = (nextCentroid + 1) % k;
        }
    } finally {
        if (pointsOut != null) {
            pointsOut.close();
        }
    }
    // write the uniformly distributed centers to a file
    BufferedWriter centersOut = null;
    try {
        centersOut = new BufferedWriter(new FileWriter(new File(outDir + "/" + CENTERS_FILE)));
        StringBuilder buffer = new StringBuilder();
        double[][] centers = uniformRandomCenters(random, k, DIMENSIONALITY, range);
        for (int i = 0; i < k; i++) {
            writeCenter(i + 1, centers[i], buffer, centersOut);
        }
    } finally {
        if (centersOut != null) {
            centersOut.close();
        }
    }
    System.out.println("Wrote " + numDataPoints + " data points to " + outDir + "/" + POINTS_FILE);
    System.out.println("Wrote " + k + " cluster centers to " + outDir + "/" + CENTERS_FILE);
}
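Since the generator parses its flags with ParameterTool.fromArgs, it can also be driven programmatically. A sketch based on the usage string printed above, wrapped in a hypothetical helper method and using placeholder values:

// equivalent to: KMeansDataGenerator -points 1000 -k 5 -output /tmp/kmeans -stddev 0.1 -seed 42
public static void generateSampleData() throws IOException {
    KMeansDataGenerator.main(new String[] {
        "-points", "1000",
        "-k", "5",
        "-output", "/tmp/kmeans",
        "-stddev", "0.1",
        "-seed", "42"
    });
}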