Use of org.apache.flink.api.java.tuple.Tuple2 in project flink by apache.
The class RollingSinkITCase, method testUserDefinedConfiguration.
/**
 * This tests that a user-defined HDFS configuration is forwarded to the sink and its writer.
 *
 * @throws Exception
 */
@Test
public void testUserDefinedConfiguration() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/string-non-rolling-with-config";

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    DataStream<Tuple2<Integer, String>> source = env
        .addSource(new TestSourceFunction(NUM_ELEMENTS))
        .broadcast()
        .filter(new OddEvenFilter());

    // Pass a user-defined Hadoop configuration to the sink; the writer below
    // verifies that the custom buffer size actually reaches the file system.
    Configuration conf = new Configuration();
    conf.set("io.file.buffer.size", "40960");

    RollingSink<String> sink = new RollingSink<String>(outPath)
        .setFSConfig(conf)
        .setWriter(new StreamWriterWithConfigCheck<String>("io.file.buffer.size", "40960"))
        .setBucketer(new NonRollingBucketer())
        .setPartPrefix("part")
        .setPendingPrefix("")
        .setPendingSuffix("");

    source.map(new MapFunction<Tuple2<Integer, String>, String>() {
        private static final long serialVersionUID = 1L;

        @Override
        public String map(Tuple2<Integer, String> value) throws Exception {
            return value.f1;
        }
    }).addSink(sink);

    env.execute("RollingSink with configuration Test");

    // Subtask 0 wrote the even-numbered messages, subtask 1 the odd-numbered ones.
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));
    BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
    for (int i = 0; i < NUM_ELEMENTS; i += 2) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
    }
    inStream.close();

    inStream = dfs.open(new Path(outPath + "/part-1-0"));
    br = new BufferedReader(new InputStreamReader(inStream));
    for (int i = 1; i < NUM_ELEMENTS; i += 2) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
    }
    inStream.close();
}
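The test relies on helper classes defined elsewhere in RollingSinkITCase (TestSourceFunction, OddEvenFilter, StreamWriterWithConfigCheck) that are not shown here. As a hedged illustration only, a filter like OddEvenFilter could plausibly look like the sketch below: after broadcast(), every parallel subtask receives all elements, and each subtask keeps only the elements whose index matches its own parity, which is what makes part-0-0 hold the even-numbered messages and part-1-0 the odd-numbered ones. This sketch is an assumption, not the original helper.

import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.java.tuple.Tuple2;

// Hypothetical sketch of an odd/even filter: each of the two subtasks keeps only
// the elements whose index parity matches its own subtask index.
public static class OddEvenFilter extends RichFilterFunction<Tuple2<Integer, String>> {

    private static final long serialVersionUID = 1L;

    @Override
    public boolean filter(Tuple2<Integer, String> value) throws Exception {
        // Keep the element only if its index parity matches this subtask's index (0 or 1).
        return value.f0 % 2 == getRuntimeContext().getIndexOfThisSubtask();
    }
}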
Use of org.apache.flink.api.java.tuple.Tuple2 in project flink by apache.
The class BucketingSinkTest, method testNonRollingSequenceFileWithoutCompressionWriter.
/**
* This tests {@link SequenceFileWriter}
* with non-rolling output and without compression.
*/
@Test
public void testNonRollingSequenceFileWithoutCompressionWriter() throws Exception {
    final String outPath = hdfsURI + "/seq-no-comp-non-rolling-out";
    final int numElements = 20;

    BucketingSink<Tuple2<IntWritable, Text>> sink = new BucketingSink<Tuple2<IntWritable, Text>>(outPath)
        .setWriter(new SequenceFileWriter<IntWritable, Text>())
        .setBucketer(new BasePathBucketer<Tuple2<IntWritable, Text>>())
        .setPartPrefix(PART_PREFIX)
        .setPendingPrefix("")
        .setPendingSuffix("");

    // The SequenceFileWriter needs the input type information to create Writable serializers.
    sink.setInputType(TypeInformation.of(new TypeHint<Tuple2<IntWritable, Text>>() {}), new ExecutionConfig());

    OneInputStreamOperatorTestHarness<Tuple2<IntWritable, Text>, Object> testHarness = createTestSink(sink, 1, 0);
    testHarness.setProcessingTime(0L);
    testHarness.setup();
    testHarness.open();

    for (int i = 0; i < numElements; i++) {
        testHarness.processElement(new StreamRecord<>(Tuple2.of(new IntWritable(i), new Text("message #" + Integer.toString(i)))));
    }

    testHarness.close();

    // Read the single part file back as a SequenceFile and verify the key/value pairs.
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));
    SequenceFile.Reader reader = new SequenceFile.Reader(inStream, 1000, 0, 100000, new Configuration());

    IntWritable intWritable = new IntWritable();
    Text txt = new Text();
    for (int i = 0; i < numElements; i++) {
        reader.next(intWritable, txt);
        Assert.assertEquals(i, intWritable.get());
        Assert.assertEquals("message #" + i, txt.toString());
    }

    reader.close();
    inStream.close();
}
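The writer above is created with the no-argument SequenceFileWriter constructor, so the part files are written without compression. For comparison, a compressed variant could be configured roughly as follows; this is a sketch under the assumption that Hadoop's default codec ("Default") is available in the test's Hadoop configuration, and only the setWriter(...) call differs from the sink construction above.

// Sketch: same BucketingSink as in the test, but with a block-compressed SequenceFileWriter.
BucketingSink<Tuple2<IntWritable, Text>> compressedSink =
    new BucketingSink<Tuple2<IntWritable, Text>>(outPath)
        .setWriter(new SequenceFileWriter<IntWritable, Text>("Default", SequenceFile.CompressionType.BLOCK))
        .setBucketer(new BasePathBucketer<Tuple2<IntWritable, Text>>())
        .setPartPrefix(PART_PREFIX)
        .setPendingPrefix("")
        .setPendingSuffix("");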
Use of org.apache.flink.api.java.tuple.Tuple2 in project flink by apache.
The class KafkaConsumerTestBase, method readSequence.
/**
 * Variant of {@link KafkaConsumerTestBase#readSequence(StreamExecutionEnvironment, StartupMode, Map, Properties, String, Map)}
 * that expects the same start offset and the same value count for all partitions of a single Kafka topic.
 */
protected void readSequence(
        final StreamExecutionEnvironment env,
        final StartupMode startupMode,
        final Map<KafkaTopicPartition, Long> specificStartupOffsets,
        final Properties cc,
        final int sourceParallelism,
        final String topicName,
        final int valuesCount,
        final int startFrom) throws Exception {

    HashMap<Integer, Tuple2<Integer, Integer>> partitionsToValuesCountAndStartOffset = new HashMap<>();
    for (int i = 0; i < sourceParallelism; i++) {
        partitionsToValuesCountAndStartOffset.put(i, new Tuple2<>(valuesCount, startFrom));
    }
    readSequence(env, startupMode, specificStartupOffsets, cc, topicName, partitionsToValuesCountAndStartOffset);
}
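For illustration, a hypothetical call site inside a KafkaConsumerTestBase subclass might use this convenience variant as follows to read 100 values per partition, starting at offset 0, from a three-partition topic; the topic name is a placeholder and standardProps is the property set provided by the test base.

// Hypothetical usage of the uniform readSequence variant shown above.
StreamExecutionEnvironment readEnv = StreamExecutionEnvironment.getExecutionEnvironment();
readSequence(
    readEnv,
    StartupMode.EARLIEST,   // start from the earliest offsets
    null,                   // no specific startup offsets needed for EARLIEST
    standardProps,          // connection properties from the test base
    3,                      // source parallelism == number of partitions
    "test-topic",           // placeholder topic name
    100,                    // values expected per partition
    0);                     // common start offset for all partitions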
Use of org.apache.flink.api.java.tuple.Tuple2 in project flink by apache.
The class KafkaConsumerTestBase, method runStartFromKafkaCommitOffsets.
/**
 * This test first writes 300 records to each partition of a test topic, reads the first 150 records
 * so that some offsets are committed to Kafka, and then starts the consumer again to read the
 * remaining records from the committed offsets.
 * The test ensures that whatever offsets were committed to Kafka, the consumer correctly picks them
 * up and starts at the correct position.
 */
public void runStartFromKafkaCommitOffsets() throws Exception {
    final int parallelism = 3;
    final int recordsInEachPartition = 300;

    final String topicName = writeSequence("testStartFromKafkaCommitOffsetsTopic", recordsInEachPartition, parallelism, 1);

    KafkaTestEnvironment.KafkaOffsetHandler kafkaOffsetHandler = kafkaServer.createOffsetHandler();

    Long o1;
    Long o2;
    Long o3;
    int attempt = 0;

    // make sure that o1, o2, o3 are not all null before proceeding
    do {
        attempt++;
        LOG.info("Attempt " + attempt + " to read records and commit some offsets to Kafka");

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
        env.getConfig().disableSysoutLogging();
        env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
        env.setParallelism(parallelism);
        // fast checkpoints to make sure we commit some offsets
        env.enableCheckpointing(20);

        env
            .addSource(kafkaServer.getConsumer(topicName, new SimpleStringSchema(), standardProps))
            .map(new ThrottledMapper<String>(50))
            .map(new MapFunction<String, Object>() {
                int count = 0;

                @Override
                public Object map(String value) throws Exception {
                    count++;
                    if (count == 150) {
                        throw new SuccessException();
                    }
                    return null;
                }
            })
            .addSink(new DiscardingSink<>());

        tryExecute(env, "Read some records to commit offsets to Kafka");

        o1 = kafkaOffsetHandler.getCommittedOffset(topicName, 0);
        o2 = kafkaOffsetHandler.getCommittedOffset(topicName, 1);
        o3 = kafkaOffsetHandler.getCommittedOffset(topicName, 2);
    } while (o1 == null && o2 == null && o3 == null && attempt < 3);

    if (o1 == null && o2 == null && o3 == null) {
        throw new RuntimeException("No offsets have been committed after 3 attempts");
    }

    LOG.info("Got final committed offsets from Kafka o1={}, o2={}, o3={}", o1, o2, o3);

    final StreamExecutionEnvironment env2 = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env2.getConfig().disableSysoutLogging();
    env2.getConfig().setRestartStrategy(RestartStrategies.noRestart());
    env2.setParallelism(parallelism);

    // whatever offsets were committed for each partition, the consumer should pick
    // them up and start from the correct position so that the remaining records are all read
    HashMap<Integer, Tuple2<Integer, Integer>> partitionsToValuesCountAndStartOffset = new HashMap<>();
    partitionsToValuesCountAndStartOffset.put(0, new Tuple2<>(
        (o1 != null) ? (int) (recordsInEachPartition - o1) : recordsInEachPartition,
        (o1 != null) ? o1.intValue() : 0));
    partitionsToValuesCountAndStartOffset.put(1, new Tuple2<>(
        (o2 != null) ? (int) (recordsInEachPartition - o2) : recordsInEachPartition,
        (o2 != null) ? o2.intValue() : 0));
    partitionsToValuesCountAndStartOffset.put(2, new Tuple2<>(
        (o3 != null) ? (int) (recordsInEachPartition - o3) : recordsInEachPartition,
        (o3 != null) ? o3.intValue() : 0));

    readSequence(env2, StartupMode.GROUP_OFFSETS, null, standardProps, topicName, partitionsToValuesCountAndStartOffset);

    kafkaOffsetHandler.close();
    deleteTestTopic(topicName);
}
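The three put(...) calls above encode the same rule: if a partition has a committed offset, the consumer is expected to resume there and read only the remaining records; if nothing was committed, the whole partition is expected. A small hypothetical helper (not part of KafkaConsumerTestBase) that captures this rule for one partition:

// Hypothetical helper mirroring the ternary expressions above: returns
// (expected value count, expected start offset) for one partition.
private static Tuple2<Integer, Integer> expectedCountAndStart(Long committedOffset, int recordsInEachPartition) {
    if (committedOffset != null) {
        // Resume at the committed offset and read whatever is left in the partition.
        return new Tuple2<>((int) (recordsInEachPartition - committedOffset), committedOffset.intValue());
    }
    // Nothing committed: the consumer starts from the group default and the whole partition is expected.
    return new Tuple2<>(recordsInEachPartition, 0);
}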
Use of org.apache.flink.api.java.tuple.Tuple2 in project flink by apache.
The class KafkaConsumerTestBase, method readSequence.
// ------------------------------------------------------------------------
//  Reading / writing test data sets
// ------------------------------------------------------------------------
/**
 * Runs a job using the provided environment to read a sequence of records from a single Kafka topic.
 * The method allows the expected starting offset and the total read value count to be specified
 * individually for each partition. The job is considered successful only if every partition's read
 * results match its expected start offset and value count.
 */
protected void readSequence(
        final StreamExecutionEnvironment env,
        final StartupMode startupMode,
        final Map<KafkaTopicPartition, Long> specificStartupOffsets,
        final Properties cc,
        final String topicName,
        final Map<Integer, Tuple2<Integer, Integer>> partitionsToValuesCountAndStartOffset) throws Exception {

    final int sourceParallelism = partitionsToValuesCountAndStartOffset.keySet().size();

    // total number of values expected across all partitions
    int finalCountTmp = 0;
    for (Map.Entry<Integer, Tuple2<Integer, Integer>> valuesCountAndStartOffset : partitionsToValuesCountAndStartOffset.entrySet()) {
        finalCountTmp += valuesCountAndStartOffset.getValue().f0;
    }
    final int finalCount = finalCountTmp;

    final TypeInformation<Tuple2<Integer, Integer>> intIntTupleType = TypeInfoParser.parse("Tuple2<Integer, Integer>");
    final TypeInformationSerializationSchema<Tuple2<Integer, Integer>> deser =
        new TypeInformationSerializationSchema<>(intIntTupleType, env.getConfig());

    // create the consumer
    cc.putAll(secureProps);
    FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> consumer = kafkaServer.getConsumer(topicName, deser, cc);
    switch (startupMode) {
        case EARLIEST:
            consumer.setStartFromEarliest();
            break;
        case LATEST:
            consumer.setStartFromLatest();
            break;
        case SPECIFIC_OFFSETS:
            consumer.setStartFromSpecificOffsets(specificStartupOffsets);
            break;
        case GROUP_OFFSETS:
            consumer.setStartFromGroupOffsets();
            break;
    }

    DataStream<Tuple2<Integer, Integer>> source = env
        .addSource(consumer).setParallelism(sourceParallelism)
        .map(new ThrottledMapper<Tuple2<Integer, Integer>>(20)).setParallelism(sourceParallelism);

    // verify data
    source.flatMap(new RichFlatMapFunction<Tuple2<Integer, Integer>, Integer>() {

        private HashMap<Integer, BitSet> partitionsToValueCheck;
        private int count = 0;

        @Override
        public void open(Configuration parameters) throws Exception {
            partitionsToValueCheck = new HashMap<>();
            for (Integer partition : partitionsToValuesCountAndStartOffset.keySet()) {
                partitionsToValueCheck.put(partition, new BitSet());
            }
        }

        @Override
        public void flatMap(Tuple2<Integer, Integer> value, Collector<Integer> out) throws Exception {
            int partition = value.f0;
            int val = value.f1;

            BitSet bitSet = partitionsToValueCheck.get(partition);
            if (bitSet == null) {
                throw new RuntimeException("Got a record from an unknown partition");
            } else {
                // mark the value relative to the partition's expected start offset
                bitSet.set(val - partitionsToValuesCountAndStartOffset.get(partition).f1);
            }

            count++;
            LOG.info("Received message {}, total {} messages", value, count);

            // verify if we've seen everything
            if (count == finalCount) {
                for (Map.Entry<Integer, BitSet> partitionsToValueCheck : this.partitionsToValueCheck.entrySet()) {
                    BitSet check = partitionsToValueCheck.getValue();
                    int expectedValueCount = partitionsToValuesCountAndStartOffset.get(partitionsToValueCheck.getKey()).f0;

                    if (check.cardinality() != expectedValueCount) {
                        throw new RuntimeException("Expected cardinality to be " + expectedValueCount +
                            ", but was " + check.cardinality());
                    } else if (check.nextClearBit(0) != expectedValueCount) {
                        throw new RuntimeException("Expected next clear bit to be " + expectedValueCount +
                            ", but was " + check.nextClearBit(0));
                    }
                }

                // test has passed
                throw new SuccessException();
            }
        }
    }).setParallelism(1);

    tryExecute(env, "Read data from Kafka");

    LOG.info("Successfully read sequence for verification");
}
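As a final illustration, the SPECIFIC_OFFSETS branch of the switch above could be exercised from a test roughly as follows; the topic name, offsets, and expected counts are illustrative assumptions, not values used by the original tests.

// Hypothetical call site for StartupMode.SPECIFIC_OFFSETS: partition 0 starts at
// offset 10, partition 1 at offset 0, with matching per-partition expectations.
Map<KafkaTopicPartition, Long> specificOffsets = new HashMap<>();
specificOffsets.put(new KafkaTopicPartition("test-topic", 0), 10L);
specificOffsets.put(new KafkaTopicPartition("test-topic", 1), 0L);

Map<Integer, Tuple2<Integer, Integer>> expected = new HashMap<>();
expected.put(0, new Tuple2<>(90, 10));    // 90 values expected, starting at offset 10
expected.put(1, new Tuple2<>(100, 0));    // 100 values expected, starting at offset 0

readSequence(
    StreamExecutionEnvironment.getExecutionEnvironment(),
    StartupMode.SPECIFIC_OFFSETS,
    specificOffsets,
    standardProps,
    "test-topic",
    expected);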