Examples with DataSet - org.apache.flink.api.java.DataSet

Example 51 with DataSet

Use of org.apache.flink.api.java.DataSet in the Apache Flink project.

From the class JobGraphGeneratorTest, method testGeneratingJobGraphWithUnconsumedResultPartition.

/**
 * Builds a source -> map pipeline with two sinks attached to the map output: one writing through
 * a {@link BlockingShuffleOutputFormat} and one through a {@link DiscardingOutputFormat}.
 *
 * <p>After compilation the test expects exactly three job vertices, and expects the second
 * vertex in topological order to produce two data sets, one of which carries the ID handed to
 * the blocking shuffle format and has type {@code BLOCKING_PERSISTENT}.
 */
@Test
public void testGeneratingJobGraphWithUnconsumedResultPartition() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    AbstractID intermediateDataSetID = new AbstractID();

    // Single-element source (parallelism 1) feeding an identity mapper (parallelism 3).
    DataSet<Tuple2<Long, Long>> mapped =
            env.fromElements(new Tuple2<>(1L, 2L))
                    .setParallelism(1)
                    .map(new IdentityMapper<>())
                    .setParallelism(3);

    // This output branch will be excluded.
    mapped.output(BlockingShuffleOutputFormat.createOutputFormat(intermediateDataSetID))
            .setParallelism(1);
    // This is the normal output branch.
    mapped.output(new DiscardingOutputFormat<>()).setParallelism(1);

    JobGraph jobGraph = compileJob(env);
    Assert.assertEquals(3, jobGraph.getVerticesSortedTopologicallyFromSources().size());

    JobVertex mapVertex = jobGraph.getVerticesSortedTopologicallyFromSources().get(1);
    Assert.assertThat(mapVertex, Matchers.instanceOf(JobVertex.class));

    // There are two produced results; one of them must be the BLOCKING_PERSISTENT one that
    // carries the ID passed to the blocking shuffle output format.
    Assert.assertEquals(2, mapVertex.getProducedDataSets().size());
    IntermediateDataSetID expectedId = new IntermediateDataSetID(intermediateDataSetID);
    boolean hasPersistentResult =
            mapVertex.getProducedDataSets().stream()
                    .anyMatch(
                            produced ->
                                    produced.getId().equals(expectedId)
                                            && produced.getResultType()
                                                    == ResultPartitionType.BLOCKING_PERSISTENT);
    Assert.assertTrue(hasPersistentResult);
}

Also used : CoreMatchers.is(org.hamcrest.CoreMatchers.is) LongSumAggregator(org.apache.flink.api.common.aggregators.LongSumAggregator) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) Tuple2(org.apache.flink.api.java.tuple.Tuple2) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) ResultPartitionType(org.apache.flink.runtime.io.network.partition.ResultPartitionType) HashMap(java.util.HashMap) JobType(org.apache.flink.runtime.jobgraph.JobType) MapFunction(org.apache.flink.api.common.functions.MapFunction) DataSink(org.apache.flink.api.java.operators.DataSink) Assert.assertThat(org.junit.Assert.assertThat) DataSet(org.apache.flink.api.java.DataSet) JobGraphUtils(org.apache.flink.runtime.jobgraph.JobGraphUtils) DeltaIteration(org.apache.flink.api.java.operators.DeltaIteration) ResourceSpec(org.apache.flink.api.common.operators.ResourceSpec) Map(java.util.Map) Plan(org.apache.flink.api.common.Plan) Optimizer(org.apache.flink.optimizer.Optimizer) BlockingShuffleOutputFormat(org.apache.flink.api.java.io.BlockingShuffleOutputFormat) IdentityMapper(org.apache.flink.optimizer.testfunctions.IdentityMapper) Method(java.lang.reflect.Method) Path(java.nio.file.Path) OptimizedPlan(org.apache.flink.optimizer.plan.OptimizedPlan) DiscardingOutputFormat(org.apache.flink.api.java.io.DiscardingOutputFormat) Files(java.nio.file.Files) AbstractID(org.apache.flink.util.AbstractID) Assert.assertNotNull(org.junit.Assert.assertNotNull) IterativeDataSet(org.apache.flink.api.java.operators.IterativeDataSet) Configuration(org.apache.flink.configuration.Configuration) Matchers(org.hamcrest.Matchers) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) IOException(java.io.IOException) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) DistributedCache(org.apache.flink.api.common.cache.DistributedCache) Operator(org.apache.flink.api.java.operators.Operator) FilterFunction(org.apache.flink.api.common.functions.FilterFunction) 
JobID(org.apache.flink.api.common.JobID) Rule(org.junit.Rule) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Assert.assertFalse(org.junit.Assert.assertFalse) Assert(org.junit.Assert) TemporaryFolder(org.junit.rules.TemporaryFolder) Assert.assertEquals(org.junit.Assert.assertEquals) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) IdentityMapper(org.apache.flink.optimizer.testfunctions.IdentityMapper) Tuple2(org.apache.flink.api.java.tuple.Tuple2) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) AbstractID(org.apache.flink.util.AbstractID) DiscardingOutputFormat(org.apache.flink.api.java.io.DiscardingOutputFormat) Test(org.junit.Test)

Example 52 with DataSet

Use of org.apache.flink.api.java.DataSet in the Apache Flink project.

From the class UnionReplacementTest, method testUnionForwardOutput.

/**
 * Verifies that the connection leaving a Union node uses the FORWARD ship strategy. See
 * FLINK-9031 for the bug report.
 *
 * <p>The issue is quite hard to reproduce because the plan choice seems to depend on the
 * enumeration order due to the lack of plan costs. This test is a smaller variant of the job
 * that was reported to fail.
 *
 * <pre>{@code
 *       /-\           /- PreFilter1 -\-/- Union - PostFilter1 - Reducer1 -\
 * Src -<   >- Union -<                X                                    >- Union - Out
 *       \-/           \- PreFilter2 -/-\- Union - PostFilter2 - Reducer2 -/
 * }</pre>
 */
@Test
public void testUnionForwardOutput() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    // Source unioned with itself, then fanned out into two pre-filters.
    DataSet<Tuple2<Long, Long>> source = env.fromElements(new Tuple2<>(0L, 0L));
    DataSet<Tuple2<Long, Long>> selfUnion = source.union(source).map(new IdentityMapper<>());
    DataSet<Tuple2<Long, Long>> preFilter1 = selfUnion.filter(t -> true).name("preFilter1");
    DataSet<Tuple2<Long, Long>> preFilter2 = selfUnion.filter(t -> true).name("preFilter2");

    // Each branch unions both pre-filters again, filters, and group-reduces on a different key.
    DataSet<Tuple2<Long, Long>> postFilter1 =
            preFilter1.union(preFilter2).filter(t -> true).name("postFilter1");
    DataSet<Tuple2<Long, Long>> reducer1 =
            postFilter1.groupBy(0).reduceGroup(new IdentityGroupReducer<>()).name("reducer1");

    DataSet<Tuple2<Long, Long>> postFilter2 =
            preFilter1.union(preFilter2).filter(t -> true).name("postFilter2");
    DataSet<Tuple2<Long, Long>> reducer2 =
            postFilter2.groupBy(1).reduceGroup(new IdentityGroupReducer<>()).name("reducer2");

    reducer1.union(reducer2).output(new DiscardingOutputFormat<>());

    // -----------------------------------------------------------------------------------------
    // Verify optimized plan
    // -----------------------------------------------------------------------------------------
    OptimizedPlan optimizedPlan = compileNoStats(env.createProgramPlan());
    OptimizerPlanNodeResolver resolver = getOptimizerPlanNodeResolver(optimizedPlan);

    // The post-filters are the direct consumers of the two inner unions.
    SingleInputPlanNode firstUnionConsumer = resolver.getNode("postFilter1");
    SingleInputPlanNode secondUnionConsumer = resolver.getNode("postFilter2");

    assertEquals(ShipStrategyType.FORWARD, firstUnionConsumer.getInput().getShipStrategy());
    assertEquals(ShipStrategyType.FORWARD, secondUnionConsumer.getInput().getShipStrategy());
}

Also used : Ordering(org.apache.flink.api.common.operators.Ordering) JoinOperatorBase(org.apache.flink.api.common.operators.base.JoinOperatorBase) Tuple2(org.apache.flink.api.java.tuple.Tuple2) ShipStrategyType(org.apache.flink.runtime.operators.shipping.ShipStrategyType) DataSet(org.apache.flink.api.java.DataSet) CompilerTestBase(org.apache.flink.optimizer.util.CompilerTestBase) IdentityGroupReducer(org.apache.flink.optimizer.testfunctions.IdentityGroupReducer) Plan(org.apache.flink.api.common.Plan) IdentityMapper(org.apache.flink.optimizer.testfunctions.IdentityMapper) OptimizedPlan(org.apache.flink.optimizer.plan.OptimizedPlan) SourcePlanNode(org.apache.flink.optimizer.plan.SourcePlanNode) DiscardingOutputFormat(org.apache.flink.api.java.io.DiscardingOutputFormat) Test(org.junit.Test) Channel(org.apache.flink.optimizer.plan.Channel) FieldList(org.apache.flink.api.common.operators.util.FieldList) DualInputPlanNode(org.apache.flink.optimizer.plan.DualInputPlanNode) DriverStrategy(org.apache.flink.runtime.operators.DriverStrategy) List(java.util.List) JobGraphGenerator(org.apache.flink.optimizer.plantranslate.JobGraphGenerator) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) SingleInputPlanNode(org.apache.flink.optimizer.plan.SingleInputPlanNode) Assert(org.junit.Assert) Order(org.apache.flink.api.common.operators.Order) PartitioningProperty(org.apache.flink.optimizer.dataproperties.PartitioningProperty) NAryUnionPlanNode(org.apache.flink.optimizer.plan.NAryUnionPlanNode) SingleInputPlanNode(org.apache.flink.optimizer.plan.SingleInputPlanNode) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Tuple2(org.apache.flink.api.java.tuple.Tuple2) IdentityGroupReducer(org.apache.flink.optimizer.testfunctions.IdentityGroupReducer) OptimizedPlan(org.apache.flink.optimizer.plan.OptimizedPlan) Test(org.junit.Test)

Example 53 with DataSet

Use of org.apache.flink.api.java.DataSet in the Apache Flink project.

From the class AvroTypeExtractionTest, method testSerializeWithAvro.

/**
 * Reads {@link User} records through an {@link AvroInputFormat} with forced Avro serialization
 * enabled, replaces each record's type map with the single entry {@code "hehe" -> 12}, and
 * writes the records as text to {@code resultPath}.
 *
 * <p>The expected textual output is stored in {@code expected}, which is declared outside this
 * method. NOTE(review): presumably it is compared against the written result in a teardown
 * hook; confirm against the enclosing class.
 */
@Test
public void testSerializeWithAvro() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableForceAvro();

    AvroInputFormat<User> avroFormat =
            new AvroInputFormat<>(new Path(inFile.getAbsoluteFile().toURI()), User.class);

    // Overwrite the type map of every record so that the expected output is deterministic.
    MapFunction<User, User> withFixedTypeMap =
            user -> {
                Map<CharSequence, Long> typeMap = new HashMap<>(1);
                typeMap.put("hehe", 12L);
                user.setTypeMap(typeMap);
                return user;
            };

    DataSet<User> usersDS = env.createInput(avroFormat).map(withFixedTypeMap);
    usersDS.writeAsText(resultPath);
    env.execute("Simple Avro read job");

    expected =
            "{\"name\": \"Alyssa\", \"favorite_number\": 256, \"favorite_color\": null,"
                    + " \"type_long_test\": null, \"type_double_test\": 123.45, \"type_null_test\": null,"
                    + " \"type_bool_test\": true, \"type_array_string\": [\"ELEMENT 1\", \"ELEMENT 2\"],"
                    + " \"type_array_boolean\": [true, false], \"type_nullable_array\": null, \"type_enum\": \"GREEN\","
                    + " \"type_map\": {\"hehe\": 12}, \"type_fixed\": null, \"type_union\": null,"
                    + " \"type_nested\": {\"num\": 239, \"street\": \"Baker Street\", \"city\": \"London\","
                    + " \"state\": \"London\", \"zip\": \"NW1 6XE\"},"
                    + " \"type_bytes\": \"\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\", "
                    + "\"type_date\": 2014-03-01, \"type_time_millis\": 12:12:12, \"type_time_micros\": 00:00:00.123456, "
                    + "\"type_timestamp_millis\": 2014-03-01T12:12:12.321Z, "
                    + "\"type_timestamp_micros\": 1970-01-01T00:00:00.123456Z, \"type_decimal_bytes\": \"\\u0007Ð\", "
                    + "\"type_decimal_fixed\": [7, -48]}\n"
                    + "{\"name\": \"Charlie\", \"favorite_number\": null, "
                    + "\"favorite_color\": \"blue\", \"type_long_test\": 1337, \"type_double_test\": 1.337, "
                    + "\"type_null_test\": null, \"type_bool_test\": false, \"type_array_string\": [], "
                    + "\"type_array_boolean\": [], \"type_nullable_array\": null, \"type_enum\": \"RED\", "
                    + "\"type_map\": {\"hehe\": 12}, \"type_fixed\": null, \"type_union\": null, "
                    + "\"type_nested\": {\"num\": 239, \"street\": \"Baker Street\", \"city\": \"London\", \"state\": \"London\", "
                    + "\"zip\": \"NW1 6XE\"}, "
                    + "\"type_bytes\": \"\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\\u0000\", "
                    + "\"type_date\": 2014-03-01, \"type_time_millis\": 12:12:12, \"type_time_micros\": 00:00:00.123456, "
                    + "\"type_timestamp_millis\": 2014-03-01T12:12:12.321Z, "
                    + "\"type_timestamp_micros\": 1970-01-01T00:00:00.123456Z, \"type_decimal_bytes\": \"\\u0007Ð\", "
                    + "\"type_decimal_fixed\": [7, -48]}\n";
}

Also used : Path(org.apache.flink.core.fs.Path) Arrays(java.util.Arrays) Tuple2(org.apache.flink.api.java.tuple.Tuple2) RunWith(org.junit.runner.RunWith) HashMap(java.util.HashMap) MultipleProgramsTestBase(org.apache.flink.test.util.MultipleProgramsTestBase) MapFunction(org.apache.flink.api.common.functions.MapFunction) AvroRecordInputFormatTest(org.apache.flink.formats.avro.AvroRecordInputFormatTest) DataSet(org.apache.flink.api.java.DataSet) Path(org.apache.flink.core.fs.Path) After(org.junit.After) Map(java.util.Map) Parameterized(org.junit.runners.Parameterized) Before(org.junit.Before) Types(org.apache.flink.api.common.typeinfo.Types) AvroInputFormat(org.apache.flink.formats.avro.AvroInputFormat) Fixed16(org.apache.flink.formats.avro.generated.Fixed16) KeySelector(org.apache.flink.api.java.functions.KeySelector) GroupReduceFunction(org.apache.flink.api.common.functions.GroupReduceFunction) Test(org.junit.Test) File(java.io.File) Rule(org.junit.Rule) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) User(org.apache.flink.formats.avro.generated.User) Assert(org.junit.Assert) TemporaryFolder(org.junit.rules.TemporaryFolder) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) User(org.apache.flink.formats.avro.generated.User) AvroInputFormat(org.apache.flink.formats.avro.AvroInputFormat) HashMap(java.util.HashMap) Map(java.util.Map) AvroRecordInputFormatTest(org.apache.flink.formats.avro.AvroRecordInputFormatTest) Test(org.junit.Test)

Example 54 with DataSet

Use of org.apache.flink.api.java.DataSet in the Apache Flink project.

From the class AvroTypeExtractionTest, method testWithAvroGenericSer.

/**
 * Reads {@link User} records through an {@link AvroInputFormat} with forced Avro serialization
 * enabled, groups them by the string form of their name, and emits a {@code (name, 1)} tuple
 * for every record in each group. The tuples are written as text to {@code resultPath}.
 *
 * <p>The expected textual output is stored in {@code expected}, which is declared outside this
 * method.
 */
@Test
public void testWithAvroGenericSer() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableForceAvro();

    AvroInputFormat<User> avroFormat =
            new AvroInputFormat<>(new Path(inFile.getAbsoluteFile().toURI()), User.class);
    DataSet<User> usersDS = env.createInput(avroFormat);

    // Key on the name rendered as a String.
    KeySelector<User, String> byName = user -> String.valueOf(user.getName());

    // Emit one (name, 1) tuple per record of the group.
    GroupReduceFunction<User, Tuple2<String, Integer>> onePerUser =
            (group, collector) -> {
                for (User user : group) {
                    collector.collect(new Tuple2<>(user.getName().toString(), 1));
                }
            };

    DataSet<Tuple2<String, Integer>> res =
            usersDS.groupBy(byName)
                    .reduceGroup(onePerUser)
                    .returns(Types.TUPLE(Types.STRING, Types.INT));
    res.writeAsText(resultPath);
    env.execute("Avro Key selection");

    expected = "(Charlie,1)\n(Alyssa,1)\n";
}

Also used : Path(org.apache.flink.core.fs.Path) Arrays(java.util.Arrays) Tuple2(org.apache.flink.api.java.tuple.Tuple2) RunWith(org.junit.runner.RunWith) HashMap(java.util.HashMap) MultipleProgramsTestBase(org.apache.flink.test.util.MultipleProgramsTestBase) MapFunction(org.apache.flink.api.common.functions.MapFunction) AvroRecordInputFormatTest(org.apache.flink.formats.avro.AvroRecordInputFormatTest) DataSet(org.apache.flink.api.java.DataSet) Path(org.apache.flink.core.fs.Path) After(org.junit.After) Map(java.util.Map) Parameterized(org.junit.runners.Parameterized) Before(org.junit.Before) Types(org.apache.flink.api.common.typeinfo.Types) AvroInputFormat(org.apache.flink.formats.avro.AvroInputFormat) Fixed16(org.apache.flink.formats.avro.generated.Fixed16) KeySelector(org.apache.flink.api.java.functions.KeySelector) GroupReduceFunction(org.apache.flink.api.common.functions.GroupReduceFunction) Test(org.junit.Test) File(java.io.File) Rule(org.junit.Rule) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) User(org.apache.flink.formats.avro.generated.User) Assert(org.junit.Assert) TemporaryFolder(org.junit.rules.TemporaryFolder) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) User(org.apache.flink.formats.avro.generated.User) GroupReduceFunction(org.apache.flink.api.common.functions.GroupReduceFunction) AvroInputFormat(org.apache.flink.formats.avro.AvroInputFormat) Tuple2(org.apache.flink.api.java.tuple.Tuple2) AvroRecordInputFormatTest(org.apache.flink.formats.avro.AvroRecordInputFormatTest) Test(org.junit.Test)

Example 55 with DataSet

Use of org.apache.flink.api.java.DataSet in the Apache Flink project.

From the class UnionTranslationTest, method translateUnion3SortedGroup.

/**
 * Translates a program that unions three data sets and then applies a key-selector based
 * {@code groupBy} / {@code sortGroup} / {@code reduceGroup}, and inspects the resulting {@link
 * Plan}.
 *
 * <p>The test expects a map operator (the key mapper) in front of every union input, expects
 * those mappers to have parallelism 2, 3 and -1 respectively, and expects both unions to have
 * {@link ExecutionConfig#PARALLELISM_DEFAULT}.
 *
 * <p>NOTE(review): the second argument of {@code getSourceDataSet} is presumably the source
 * parallelism; confirm against the helper, which is defined outside this method.
 */
@Test
public void translateUnion3SortedGroup() {
    try {
        final int parallelism = 4;
        ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(parallelism);
        DataSet<Tuple3<Double, StringValue, LongValue>> dataset1 = getSourceDataSet(env, 2);
        DataSet<Tuple3<Double, StringValue, LongValue>> dataset2 = getSourceDataSet(env, 3);
        DataSet<Tuple3<Double, StringValue, LongValue>> dataset3 = getSourceDataSet(env, -1);
        dataset1.union(dataset2).union(dataset3).groupBy((KeySelector<Tuple3<Double, StringValue, LongValue>, String>) value -> "").sortGroup((KeySelector<Tuple3<Double, StringValue, LongValue>, String>) value -> "", Order.ASCENDING).reduceGroup((GroupReduceFunction<Tuple3<Double, StringValue, LongValue>, String>) (values, out) -> {
        }).returns(String.class).output(new DiscardingOutputFormat<>());
        Plan p = env.createProgramPlan();
        // The plan should look like the following one.
        //
        // DataSet1(2) - MapOperator(2)-+
        //                              |- Union(-1) -+
        // DataSet2(3) - MapOperator(3)-+             |- Union(-1) - SingleInputOperator - Sink
        //                                            |
        //             DataSet3(-1) - MapOperator(-1)-+
        GenericDataSinkBase<?> sink = p.getDataSinks().iterator().next();
        Union secondUnionOperator = (Union) ((SingleInputOperator) sink.getInput()).getInput();
        // The first input of the second union should be the first union.
        Union firstUnionOperator = (Union) secondUnionOperator.getFirstInput();
        // The key mapper should be added to the second input stream of the second union.
        assertTrue(secondUnionOperator.getSecondInput() instanceof MapOperatorBase<?, ?, ?>);
        // The key mappers should be added to both of the two input streams for the first union.
        assertTrue(firstUnionOperator.getFirstInput() instanceof MapOperatorBase<?, ?, ?>);
        assertTrue(firstUnionOperator.getSecondInput() instanceof MapOperatorBase<?, ?, ?>);
        // The parallelisms of the key mappers should be equal to those of their inputs.
        // JUnit's contract is assertEquals(expected, actual); keeping that order makes a
        // failure message report the right value as "expected".
        assertEquals(2, firstUnionOperator.getFirstInput().getParallelism());
        assertEquals(3, firstUnionOperator.getSecondInput().getParallelism());
        assertEquals(-1, secondUnionOperator.getSecondInput().getParallelism());
        // The union should always have the default parallelism.
        assertEquals(ExecutionConfig.PARALLELISM_DEFAULT, secondUnionOperator.getParallelism());
        assertEquals(ExecutionConfig.PARALLELISM_DEFAULT, firstUnionOperator.getParallelism());
    } catch (Exception e) {
        // Assertion failures are Errors and pass through; only unexpected exceptions land here.
        System.err.println(e.getMessage());
        e.printStackTrace();
        fail("Test caused an error: " + e.getMessage());
    }
}

Also used : KeySelector(org.apache.flink.api.java.functions.KeySelector) Tuple3(org.apache.flink.api.java.tuple.Tuple3) DiscardingOutputFormat(org.apache.flink.api.java.io.DiscardingOutputFormat) LongValue(org.apache.flink.types.LongValue) GroupReduceFunction(org.apache.flink.api.common.functions.GroupReduceFunction) MapOperatorBase(org.apache.flink.api.common.operators.base.MapOperatorBase) Union(org.apache.flink.api.common.operators.Union) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) SingleInputOperator(org.apache.flink.api.common.operators.SingleInputOperator) DataSet(org.apache.flink.api.java.DataSet) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) StringValue(org.apache.flink.types.StringValue) GenericDataSinkBase(org.apache.flink.api.common.operators.GenericDataSinkBase) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Plan(org.apache.flink.api.common.Plan) Assert.fail(org.junit.Assert.fail) Order(org.apache.flink.api.common.operators.Order) Assert.assertEquals(org.junit.Assert.assertEquals) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) KeySelector(org.apache.flink.api.java.functions.KeySelector) Plan(org.apache.flink.api.common.Plan) Union(org.apache.flink.api.common.operators.Union) Tuple3(org.apache.flink.api.java.tuple.Tuple3) LongValue(org.apache.flink.types.LongValue) StringValue(org.apache.flink.types.StringValue) Test(org.junit.Test)

Aggregations

DataSet (org.apache.flink.api.java.DataSet)56 ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment)31 Test (org.junit.Test)24 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)17 DiscardingOutputFormat (org.apache.flink.api.java.io.DiscardingOutputFormat)11 Plan (org.apache.flink.api.common.Plan)10 Types (org.apache.flink.api.common.typeinfo.Types)10 Tuple3 (org.apache.flink.api.java.tuple.Tuple3)10 Assert (org.junit.Assert)10 Arrays (java.util.Arrays)9 Rule (org.junit.Rule)9 List (java.util.List)8 MapFunction (org.apache.flink.api.common.functions.MapFunction)8 Configuration (org.apache.flink.configuration.Configuration)7 Graph (org.apache.flink.graph.Graph)7 NullValue (org.apache.flink.types.NullValue)7 ArrayList (java.util.ArrayList)6 GroupReduceFunction (org.apache.flink.api.common.functions.GroupReduceFunction)6 KeySelector (org.apache.flink.api.java.functions.KeySelector)6 PythonMapPartition (org.apache.flink.python.api.functions.PythonMapPartition)6