Example 71 with Tuple2

use of org.apache.flink.api.java.tuple.Tuple2 in project flink by apache.

the class RollingSinkITCase method testUserDefinedConfiguration.

/**
 * Tests writing with a user-defined HDFS configuration.
 *
 * @throws Exception
 */
@Test
public void testUserDefinedConfiguration() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/string-non-rolling-with-config";
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);
    DataStream<Tuple2<Integer, String>> source = env.addSource(new TestSourceFunction(NUM_ELEMENTS)).broadcast().filter(new OddEvenFilter());
    Configuration conf = new Configuration();
    conf.set("io.file.buffer.size", "40960");
    RollingSink<String> sink = new RollingSink<String>(outPath)
        .setFSConfig(conf)
        .setWriter(new StreamWriterWithConfigCheck<String>("io.file.buffer.size", "40960"))
        .setBucketer(new NonRollingBucketer())
        .setPartPrefix("part")
        .setPendingPrefix("")
        .setPendingSuffix("");
    source.map(new MapFunction<Tuple2<Integer, String>, String>() {

        private static final long serialVersionUID = 1L;

        @Override
        public String map(Tuple2<Integer, String> value) throws Exception {
            return value.f1;
        }
    }).addSink(sink);
    env.execute("RollingSink with configuration Test");
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));
    BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
    for (int i = 0; i < NUM_ELEMENTS; i += 2) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
    }
    inStream.close();
    inStream = dfs.open(new Path(outPath + "/part-1-0"));
    br = new BufferedReader(new InputStreamReader(inStream));
    for (int i = 1; i < NUM_ELEMENTS; i += 2) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
    }
    inStream.close();
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), InputStreamReader (java.io.InputStreamReader), MapFunction (org.apache.flink.api.common.functions.MapFunction), RichFlatMapFunction (org.apache.flink.api.common.functions.RichFlatMapFunction), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), BufferedReader (java.io.BufferedReader), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment), Test (org.junit.Test)
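
Outside the test harness, the same configuration pattern applies in a regular job: build a Hadoop Configuration, set the client options you need, and hand it to the sink via setFSConfig before attaching it. A minimal sketch, assuming the flink-connector-filesystem module on the classpath; the HDFS address and output path are placeholders:

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.fs.RollingSink;
import org.apache.flink.streaming.connectors.fs.StringWriter;
import org.apache.hadoop.conf.Configuration;

public class RollingSinkConfigExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStream<String> lines = env.fromElements("a", "b", "c");

        // Custom HDFS client settings, forwarded to the FileSystem the sink opens.
        Configuration hadoopConf = new Configuration();
        hadoopConf.set("io.file.buffer.size", "40960");

        // "hdfs://namenode:9000/out" is a placeholder base path.
        RollingSink<String> sink = new RollingSink<String>("hdfs://namenode:9000/out")
            .setFSConfig(hadoopConf)
            .setWriter(new StringWriter<String>());

        lines.addSink(sink);
        env.execute("RollingSink with custom HDFS configuration");
    }
}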

Example 72 with Tuple2

use of org.apache.flink.api.java.tuple.Tuple2 in project flink by apache.

the class BucketingSinkTest method testNonRollingSequenceFileWithoutCompressionWriter.

/**
 * Tests {@link SequenceFileWriter} with non-rolling output and without compression.
 */
@Test
public void testNonRollingSequenceFileWithoutCompressionWriter() throws Exception {
    final String outPath = hdfsURI + "/seq-no-comp-non-rolling-out";
    final int numElements = 20;
    BucketingSink<Tuple2<IntWritable, Text>> sink = new BucketingSink<Tuple2<IntWritable, Text>>(outPath)
        .setWriter(new SequenceFileWriter<IntWritable, Text>())
        .setBucketer(new BasePathBucketer<Tuple2<IntWritable, Text>>())
        .setPartPrefix(PART_PREFIX)
        .setPendingPrefix("")
        .setPendingSuffix("");
    sink.setInputType(TypeInformation.of(new TypeHint<Tuple2<IntWritable, Text>>() {
    }), new ExecutionConfig());
    OneInputStreamOperatorTestHarness<Tuple2<IntWritable, Text>, Object> testHarness = createTestSink(sink, 1, 0);
    testHarness.setProcessingTime(0L);
    testHarness.setup();
    testHarness.open();
    for (int i = 0; i < numElements; i++) {
        testHarness.processElement(new StreamRecord<>(Tuple2.of(new IntWritable(i), new Text("message #" + Integer.toString(i)))));
    }
    testHarness.close();
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));
    SequenceFile.Reader reader = new SequenceFile.Reader(inStream, 1000, 0, 100000, new Configuration());
    IntWritable intWritable = new IntWritable();
    Text txt = new Text();
    for (int i = 0; i < numElements; i++) {
        reader.next(intWritable, txt);
        Assert.assertEquals(i, intWritable.get());
        Assert.assertEquals("message #" + i, txt.toString());
    }
    reader.close();
    inStream.close();
}
Also used: Path (org.apache.hadoop.fs.Path), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint), Configuration (org.apache.hadoop.conf.Configuration), InputStreamReader (java.io.InputStreamReader), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), BufferedReader (java.io.BufferedReader), Text (org.apache.hadoop.io.Text), ExecutionConfig (org.apache.flink.api.common.ExecutionConfig), SequenceFile (org.apache.hadoop.io.SequenceFile), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
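
The one non-obvious step above is setInputType: SequenceFileWriter derives its key/value Writable classes from the record type, so the sink must learn that type before the job runs. A minimal sketch of the same wiring outside the harness, assuming flink-connector-filesystem and hadoop-common on the classpath (the class name and base path are placeholders):

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.connectors.fs.SequenceFileWriter;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class SequenceFileSinkExample {

    public static BucketingSink<Tuple2<IntWritable, Text>> buildSink(String basePath) {
        // A BucketingSink that writes uncompressed Hadoop SequenceFiles.
        BucketingSink<Tuple2<IntWritable, Text>> sink =
            new BucketingSink<Tuple2<IntWritable, Text>>(basePath)
                .setWriter(new SequenceFileWriter<IntWritable, Text>());

        // DataStream.addSink() normally forwards the input type to sinks that
        // implement InputTypeConfigurable; the test calls setInputType() by
        // hand because it drives the bare operator through a test harness.
        sink.setInputType(
            TypeInformation.of(new TypeHint<Tuple2<IntWritable, Text>>() {}),
            new ExecutionConfig());
        return sink;
    }
}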

Example 73 with Tuple2

use of org.apache.flink.api.java.tuple.Tuple2 in project flink by apache.

the class KafkaConsumerTestBase method readSequence.

/**
 * Variant of {@link KafkaConsumerTestBase#readSequence(StreamExecutionEnvironment, StartupMode, Map, Properties, String, Map)}
 * that expects the same start offset and the same value count for all partitions of a single Kafka topic.
 */
protected void readSequence(
        final StreamExecutionEnvironment env,
        final StartupMode startupMode,
        final Map<KafkaTopicPartition, Long> specificStartupOffsets,
        final Properties cc,
        final int sourceParallelism,
        final String topicName,
        final int valuesCount,
        final int startFrom) throws Exception {
    HashMap<Integer, Tuple2<Integer, Integer>> partitionsToValuesCountAndStartOffset = new HashMap<>();
    for (int i = 0; i < sourceParallelism; i++) {
        partitionsToValuesCountAndStartOffset.put(i, new Tuple2<>(valuesCount, startFrom));
    }
    readSequence(env, startupMode, specificStartupOffsets, cc, topicName, partitionsToValuesCountAndStartOffset);
}
Also used: HashMap (java.util.HashMap), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint)
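
A call site then supplies the shared values once instead of building the map itself. A hypothetical invocation, assuming a prepared environment, a three-partition topic named "my-topic", and the test base's standardProps:

// Expect 100 values per partition across 3 partitions, all starting at
// offset 0, with the consumer positioned at the earliest available offset.
readSequence(env, StartupMode.EARLIEST, null, standardProps, 3, "my-topic", 100, 0);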

Example 74 with Tuple2

use of org.apache.flink.api.java.tuple.Tuple2 in project flink by apache.

the class KafkaConsumerTestBase method runStartFromKafkaCommitOffsets.

/**
 * This test first writes a total of 300 records to a test topic, reads the first 150 so that some offsets are
 * committed to Kafka, and then starts the consumer again to read the remaining records from the committed offsets.
 * The test ensures that whatever offsets were committed to Kafka, the consumer correctly picks them up
 * and starts at the correct position.
 */
public void runStartFromKafkaCommitOffsets() throws Exception {
    final int parallelism = 3;
    final int recordsInEachPartition = 300;
    final String topicName = writeSequence("testStartFromKafkaCommitOffsetsTopic", recordsInEachPartition, parallelism, 1);
    KafkaTestEnvironment.KafkaOffsetHandler kafkaOffsetHandler = kafkaServer.createOffsetHandler();
    Long o1;
    Long o2;
    Long o3;
    int attempt = 0;
    // make sure that o1, o2, o3 are not all null before proceeding
    do {
        attempt++;
        LOG.info("Attempt " + attempt + " to read records and commit some offsets to Kafka");
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
        env.getConfig().disableSysoutLogging();
        env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
        env.setParallelism(parallelism);
        // fast checkpoints to make sure we commit some offsets
        env.enableCheckpointing(20);
        env.addSource(kafkaServer.getConsumer(topicName, new SimpleStringSchema(), standardProps))
            .map(new ThrottledMapper<String>(50))
            .map(new MapFunction<String, Object>() {

            int count = 0;

            @Override
            public Object map(String value) throws Exception {
                count++;
                if (count == 150) {
                    throw new SuccessException();
                }
                return null;
            }
        }).addSink(new DiscardingSink<>());
        tryExecute(env, "Read some records to commit offsets to Kafka");
        o1 = kafkaOffsetHandler.getCommittedOffset(topicName, 0);
        o2 = kafkaOffsetHandler.getCommittedOffset(topicName, 1);
        o3 = kafkaOffsetHandler.getCommittedOffset(topicName, 2);
    } while (o1 == null && o2 == null && o3 == null && attempt < 3);
    if (o1 == null && o2 == null && o3 == null) {
        throw new RuntimeException("No offsets have been committed after 3 attempts");
    }
    LOG.info("Got final committed offsets from Kafka o1={}, o2={}, o3={}", o1, o2, o3);
    final StreamExecutionEnvironment env2 = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env2.getConfig().disableSysoutLogging();
    env2.getConfig().setRestartStrategy(RestartStrategies.noRestart());
    env2.setParallelism(parallelism);
    // whatever offsets were committed for each partition, the consumer should pick
    // them up and start from the correct position so that the remaining records are all read
    HashMap<Integer, Tuple2<Integer, Integer>> partitionsToValuesCountAndStartOffset = new HashMap<>();
    partitionsToValuesCountAndStartOffset.put(0, new Tuple2<>(
        (o1 != null) ? (int) (recordsInEachPartition - o1) : recordsInEachPartition,
        (o1 != null) ? o1.intValue() : 0));
    partitionsToValuesCountAndStartOffset.put(1, new Tuple2<>(
        (o2 != null) ? (int) (recordsInEachPartition - o2) : recordsInEachPartition,
        (o2 != null) ? o2.intValue() : 0));
    partitionsToValuesCountAndStartOffset.put(2, new Tuple2<>(
        (o3 != null) ? (int) (recordsInEachPartition - o3) : recordsInEachPartition,
        (o3 != null) ? o3.intValue() : 0));
    readSequence(env2, StartupMode.GROUP_OFFSETS, null, standardProps, topicName, partitionsToValuesCountAndStartOffset);
    kafkaOffsetHandler.close();
    deleteTestTopic(topicName);
}
Also used: HashMap (java.util.HashMap), MapFunction (org.apache.flink.api.common.functions.MapFunction), FlatMapFunction (org.apache.flink.api.common.functions.FlatMapFunction), RichFlatMapFunction (org.apache.flink.api.common.functions.RichFlatMapFunction), RichMapFunction (org.apache.flink.api.common.functions.RichMapFunction), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), SimpleStringSchema (org.apache.flink.streaming.util.serialization.SimpleStringSchema), SuccessException (org.apache.flink.test.util.SuccessException), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
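
The three put(...) calls all encode the same rule: if an offset was committed for a partition, expect only the records after it, starting there; if not, GROUP_OFFSETS falls back to the default and the whole partition is re-read from 0. A hypothetical helper (not in the original test) that makes the rule explicit:

// Maps a partition's committed offset, which may be null, to the
// (expected value count, expected start offset) pair readSequence() consumes.
private static Tuple2<Integer, Integer> expectedRange(Long committedOffset, int recordsInPartition) {
    if (committedOffset != null) {
        // Resume at the committed offset; only the records after it remain.
        return new Tuple2<>((int) (recordsInPartition - committedOffset), committedOffset.intValue());
    }
    // Nothing committed: the whole partition is read from offset 0.
    return new Tuple2<>(recordsInPartition, 0);
}

// Usage: partitionsToValuesCountAndStartOffset.put(0, expectedRange(o1, recordsInEachPartition));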

Example 75 with Tuple2

use of org.apache.flink.api.java.tuple.Tuple2 in project flink by apache.

the class KafkaConsumerTestBase method readSequence.

// ------------------------------------------------------------------------
//  Reading / writing test data sets
// ------------------------------------------------------------------------
/**
 * Runs a job using the provided environment to read a sequence of records from a single Kafka topic.
 * The expected starting offset and total value count can be specified individually for each partition.
 * The job is considered successful only if every partition's read results match its start offset and value count criteria.
 */
protected void readSequence(
        final StreamExecutionEnvironment env,
        final StartupMode startupMode,
        final Map<KafkaTopicPartition, Long> specificStartupOffsets,
        final Properties cc,
        final String topicName,
        final Map<Integer, Tuple2<Integer, Integer>> partitionsToValuesCountAndStartOffset) throws Exception {
    final int sourceParallelism = partitionsToValuesCountAndStartOffset.size();
    int finalCountTmp = 0;
    for (Map.Entry<Integer, Tuple2<Integer, Integer>> valuesCountAndStartOffset : partitionsToValuesCountAndStartOffset.entrySet()) {
        finalCountTmp += valuesCountAndStartOffset.getValue().f0;
    }
    final int finalCount = finalCountTmp;
    final TypeInformation<Tuple2<Integer, Integer>> intIntTupleType = TypeInfoParser.parse("Tuple2<Integer, Integer>");
    final TypeInformationSerializationSchema<Tuple2<Integer, Integer>> deser = new TypeInformationSerializationSchema<>(intIntTupleType, env.getConfig());
    // create the consumer
    cc.putAll(secureProps);
    FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> consumer = kafkaServer.getConsumer(topicName, deser, cc);
    switch(startupMode) {
        case EARLIEST:
            consumer.setStartFromEarliest();
            break;
        case LATEST:
            consumer.setStartFromLatest();
            break;
        case SPECIFIC_OFFSETS:
            consumer.setStartFromSpecificOffsets(specificStartupOffsets);
            break;
        case GROUP_OFFSETS:
            consumer.setStartFromGroupOffsets();
            break;
    }
    DataStream<Tuple2<Integer, Integer>> source = env
        .addSource(consumer).setParallelism(sourceParallelism)
        .map(new ThrottledMapper<Tuple2<Integer, Integer>>(20)).setParallelism(sourceParallelism);
    // verify data
    source.flatMap(new RichFlatMapFunction<Tuple2<Integer, Integer>, Integer>() {

        private HashMap<Integer, BitSet> partitionsToValueCheck;

        private int count = 0;

        @Override
        public void open(Configuration parameters) throws Exception {
            partitionsToValueCheck = new HashMap<>();
            for (Integer partition : partitionsToValuesCountAndStartOffset.keySet()) {
                partitionsToValueCheck.put(partition, new BitSet());
            }
        }

        @Override
        public void flatMap(Tuple2<Integer, Integer> value, Collector<Integer> out) throws Exception {
            int partition = value.f0;
            int val = value.f1;
            BitSet bitSet = partitionsToValueCheck.get(partition);
            if (bitSet == null) {
                throw new RuntimeException("Got a record from an unknown partition");
            } else {
                bitSet.set(val - partitionsToValuesCountAndStartOffset.get(partition).f1);
            }
            count++;
            LOG.info("Received message {}, total {} messages", value, count);
            // verify if we've seen everything
            if (count == finalCount) {
                for (Map.Entry<Integer, BitSet> partitionsToValueCheck : this.partitionsToValueCheck.entrySet()) {
                    BitSet check = partitionsToValueCheck.getValue();
                    int expectedValueCount = partitionsToValuesCountAndStartOffset.get(partitionsToValueCheck.getKey()).f0;
                    if (check.cardinality() != expectedValueCount) {
                        throw new RuntimeException("Expected cardinality to be " + expectedValueCount + ", but was " + check.cardinality());
                    } else if (check.nextClearBit(0) != expectedValueCount) {
                        throw new RuntimeException("Expected next clear bit to be " + expectedValueCount + ", but was " + check.cardinality());
                    }
                }
                // test has passed
                throw new SuccessException();
            }
        }
    }).setParallelism(1);
    tryExecute(env, "Read data from Kafka");
    LOG.info("Successfully read sequence for verification");
}
Also used: Configuration (org.apache.flink.configuration.Configuration), HashMap (java.util.HashMap), BitSet (java.util.BitSet), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint), ThrottledMapper (org.apache.flink.streaming.connectors.kafka.testutils.ThrottledMapper), TypeInformationSerializationSchema (org.apache.flink.streaming.util.serialization.TypeInformationSerializationSchema), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), RichFlatMapFunction (org.apache.flink.api.common.functions.RichFlatMapFunction), Collector (org.apache.flink.util.Collector), SuccessException (org.apache.flink.test.util.SuccessException), Map (java.util.Map)
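
The verification idiom is worth isolating: keep one BitSet per partition, set the bit at (value - startOffset) for each record, then require both full cardinality and no clear bit below the expected count. A self-contained sketch of the same check with hypothetical values:

import java.util.BitSet;

public class RangeCheckSketch {
    public static void main(String[] args) {
        int startOffset = 5;
        int expectedCount = 10;
        BitSet seen = new BitSet();

        // Simulate receiving the values 5..14 exactly once each.
        for (int value = startOffset; value < startOffset + expectedCount; value++) {
            seen.set(value - startOffset);
        }

        // cardinality() is below expectedCount if any value was never seen or
        // arrived only as a duplicate of another. nextClearBit(0) additionally
        // proves there is no hole below expectedCount that an out-of-range
        // value could mask by padding the cardinality.
        if (seen.cardinality() != expectedCount) {
            throw new RuntimeException("expected " + expectedCount + " distinct values, saw " + seen.cardinality());
        }
        if (seen.nextClearBit(0) != expectedCount) {
            throw new RuntimeException("gap at value " + (startOffset + seen.nextClearBit(0)));
        }
        System.out.println("all " + expectedCount + " values accounted for");
    }
}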

Aggregations

Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 1159
Test (org.junit.Test): 871
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 486
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 266
Tuple3 (org.apache.flink.api.java.tuple.Tuple3): 195
TimeWindow (org.apache.flink.streaming.api.windowing.windows.TimeWindow): 137
ArrayList (java.util.ArrayList): 136
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig): 103
Plan (org.apache.flink.api.common.Plan): 103
TypeHint (org.apache.flink.api.common.typeinfo.TypeHint): 103
OptimizedPlan (org.apache.flink.optimizer.plan.OptimizedPlan): 99
Configuration (org.apache.flink.configuration.Configuration): 87
List (java.util.List): 82
IOException (java.io.IOException): 79
OneInputTransformation (org.apache.flink.streaming.api.transformations.OneInputTransformation): 77
ListStateDescriptor (org.apache.flink.api.common.state.ListStateDescriptor): 74
HashMap (java.util.HashMap): 72
SinkPlanNode (org.apache.flink.optimizer.plan.SinkPlanNode): 66
Collection (java.util.Collection): 61
ConcurrentLinkedQueue (java.util.concurrent.ConcurrentLinkedQueue): 60