
Example 61 with FileInputSplit

Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache, from the class ReplicatingDataSourceTest, method checkCrossWithReplicatedSourceInput.

/**
 * Tests a cross program with a replicated data source.
 */
@Test
public void checkCrossWithReplicatedSourceInput() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);
    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
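    // wrap the CSV input format in a ReplicatingInputFormat so that every parallel source task reads the complete input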
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif = new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));
    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);
    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1.cross(source2).writeAsText("/some/newpath");
    Plan plan = env.createProgramPlan();
    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);
    // check the optimized plan:
    // the cross should use the FORWARD ship strategy on both inputs
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode crossNode = (DualInputPlanNode) sinkNode.getPredecessor();
    ShipStrategyType crossIn1 = crossNode.getInput1().getShipStrategy();
    ShipStrategyType crossIn2 = crossNode.getInput2().getShipStrategy();
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, crossIn1);
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, crossIn2);
}
Also used: Path (org.apache.flink.core.fs.Path), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Plan (org.apache.flink.api.common.Plan), OptimizedPlan (org.apache.flink.optimizer.plan.OptimizedPlan), ShipStrategyType (org.apache.flink.runtime.operators.shipping.ShipStrategyType), DualInputPlanNode (org.apache.flink.optimizer.plan.DualInputPlanNode), ReplicatingInputFormat (org.apache.flink.api.common.io.ReplicatingInputFormat), FileInputSplit (org.apache.flink.core.fs.FileInputSplit), Tuple1 (org.apache.flink.api.java.tuple.Tuple1), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), SinkPlanNode (org.apache.flink.optimizer.plan.SinkPlanNode), Test (org.junit.Test)
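
The test relies on scaffolding that is not shown in this excerpt: DEFAULT_PARALLELISM and compileNoStats(...) come from the test's base class (presumably Flink's CompilerTestBase). Below is a minimal sketch of that scaffolding, assuming the plan is compiled through org.apache.flink.optimizer.Optimizer; the class name CompilerTestScaffold and the parallelism value are illustrative, not taken from the excerpt.

import org.apache.flink.api.common.Plan;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.optimizer.DataStatistics;
import org.apache.flink.optimizer.Optimizer;
import org.apache.flink.optimizer.costs.DefaultCostEstimator;
import org.apache.flink.optimizer.plan.OptimizedPlan;

// Hypothetical stand-in for the base class used by ReplicatingDataSourceTest.
abstract class CompilerTestScaffold {

    // Assumed value; the real constant lives in the actual base class.
    protected static final int DEFAULT_PARALLELISM = 8;

    // Compiles the plan without data statistics, so the optimizer decides
    // ship strategies purely from plan properties such as replicated inputs.
    protected OptimizedPlan compileNoStats(Plan plan) {
        Optimizer optimizer =
                new Optimizer(new DataStatistics(), new DefaultCostEstimator(), new Configuration());
        return optimizer.compile(plan);
    }
}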

Example 62 with FileInputSplit

Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache, from the class ReplicatingDataSourceTest, method checkJoinWithReplicatedSourceInputBehindMapPartition.

/**
 * Tests a join program with a replicated data source behind a map-partition operator.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindMapPartition() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);
    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif = new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));
    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);
    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1.mapPartition(new IdPMap()).join(source2).where("*").equalTo("*").writeAsText("/some/newpath");
    Plan plan = env.createProgramPlan();
    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);
    // check the optimized plan:
    // the join should use the FORWARD ship strategy on both inputs
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();
    ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
    ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
Also used: Path (org.apache.flink.core.fs.Path), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Plan (org.apache.flink.api.common.Plan), OptimizedPlan (org.apache.flink.optimizer.plan.OptimizedPlan), ShipStrategyType (org.apache.flink.runtime.operators.shipping.ShipStrategyType), DualInputPlanNode (org.apache.flink.optimizer.plan.DualInputPlanNode), ReplicatingInputFormat (org.apache.flink.api.common.io.ReplicatingInputFormat), FileInputSplit (org.apache.flink.core.fs.FileInputSplit), Tuple1 (org.apache.flink.api.java.tuple.Tuple1), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), SinkPlanNode (org.apache.flink.optimizer.plan.SinkPlanNode), Test (org.junit.Test)
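
The IdPMap helper referenced above is not part of this excerpt. Given the test's intent (an identity map-partition in front of the join that should not break the FORWARD strategy), a plausible minimal sketch is an identity MapPartitionFunction; the implementation below is an assumption, not the class from ReplicatingDataSourceTest.

import org.apache.flink.api.common.functions.MapPartitionFunction;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.util.Collector;

// Assumed identity implementation: forwards every record of the partition unchanged.
class IdPMap implements MapPartitionFunction<Tuple1<String>, Tuple1<String>> {
    @Override
    public void mapPartition(Iterable<Tuple1<String>> values, Collector<Tuple1<String>> out) {
        for (Tuple1<String> value : values) {
            out.collect(value);
        }
    }
}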

Example 63 with FileInputSplit

Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache, from the class ReplicatingDataSourceTest, method checkJoinWithReplicatedSourceInput.

/**
 * Tests a join program with a replicated data source.
 */
@Test
public void checkJoinWithReplicatedSourceInput() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);
    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif = new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));
    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);
    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1.join(source2).where("*").equalTo("*").writeAsText("/some/newpath");
    Plan plan = env.createProgramPlan();
    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);
    // check the optimized plan:
    // the join should use the FORWARD ship strategy on both inputs
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();
    ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
    ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
Also used: Path (org.apache.flink.core.fs.Path), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Plan (org.apache.flink.api.common.Plan), OptimizedPlan (org.apache.flink.optimizer.plan.OptimizedPlan), ShipStrategyType (org.apache.flink.runtime.operators.shipping.ShipStrategyType), DualInputPlanNode (org.apache.flink.optimizer.plan.DualInputPlanNode), ReplicatingInputFormat (org.apache.flink.api.common.io.ReplicatingInputFormat), FileInputSplit (org.apache.flink.core.fs.FileInputSplit), Tuple1 (org.apache.flink.api.java.tuple.Tuple1), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), SinkPlanNode (org.apache.flink.optimizer.plan.SinkPlanNode), Test (org.junit.Test)
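
The "*" key expressions select all fields of the tuple as the join key. For the single-field Tuple1 records used here, an equivalent formulation joins on field position 0; this variant is illustrative and not taken from the test:

source1.join(source2).where(0).equalTo(0).writeAsText("/some/newpath");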

Example 64 with FileInputSplit

Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache, from the class ReplicatingDataSourceTest, method checkCrossWithReplicatedSourceInputBehindMap.

/**
 * Tests a cross program with a replicated data source behind a map and a filter.
 */
@Test
public void checkCrossWithReplicatedSourceInputBehindMap() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);
    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif = new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));
    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);
    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1.map(new IdMap()).filter(new NoFilter()).cross(source2).writeAsText("/some/newpath");
    Plan plan = env.createProgramPlan();
    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);
    // check the optimized plan:
    // the cross should use the FORWARD ship strategy on both inputs
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode crossNode = (DualInputPlanNode) sinkNode.getPredecessor();
    ShipStrategyType crossIn1 = crossNode.getInput1().getShipStrategy();
    ShipStrategyType crossIn2 = crossNode.getInput2().getShipStrategy();
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, crossIn1);
    Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, crossIn2);
}
Also used: Path (org.apache.flink.core.fs.Path), ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment), Plan (org.apache.flink.api.common.Plan), OptimizedPlan (org.apache.flink.optimizer.plan.OptimizedPlan), ShipStrategyType (org.apache.flink.runtime.operators.shipping.ShipStrategyType), DualInputPlanNode (org.apache.flink.optimizer.plan.DualInputPlanNode), ReplicatingInputFormat (org.apache.flink.api.common.io.ReplicatingInputFormat), FileInputSplit (org.apache.flink.core.fs.FileInputSplit), Tuple1 (org.apache.flink.api.java.tuple.Tuple1), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), SinkPlanNode (org.apache.flink.optimizer.plan.SinkPlanNode), Test (org.junit.Test)
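
The IdMap and NoFilter helpers referenced above are not part of this excerpt. Since the test only needs operators that pass records through unchanged (so the replicated source still feeds the cross directly), plausible minimal sketches are an identity MapFunction and an accept-everything FilterFunction; both implementations below are assumptions, not the classes from ReplicatingDataSourceTest.

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple1;

// Assumed identity map: returns each record unchanged.
class IdMap implements MapFunction<Tuple1<String>, Tuple1<String>> {
    @Override
    public Tuple1<String> map(Tuple1<String> value) {
        return value;
    }
}

// Assumed pass-through filter: keeps every record.
class NoFilter implements FilterFunction<Tuple1<String>> {
    @Override
    public boolean filter(Tuple1<String> value) {
        return true;
    }
}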

Example 65 with FileInputSplit

Use of org.apache.flink.core.fs.FileInputSplit in project flink by apache, from the class MockInputSplitProvider, method addInputSplits.

/**
 * Generates a set of input splits from an input path.
 *
 * @param path the path of the local file to generate the input splits from
 * @param noSplits the number of input splits to be generated from the given input file
 */
public void addInputSplits(final String path, final int noSplits) {
    final InputSplit[] tmp = new InputSplit[noSplits];
    final String[] hosts = { "localhost" };
    final String localPath;
    try {
        localPath = new URI(path).getPath();
    } catch (URISyntaxException e) {
        throw new IllegalArgumentException("Path URI cannot be transformed to a local path.", e);
    }
    final File inFile = new File(localPath);
    final long splitLength = inFile.length() / noSplits;
    long pos = 0;
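    // all splits except the last one cover exactly splitLength bytes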
    for (int i = 0; i < noSplits - 1; i++) {
        tmp[i] = new FileInputSplit(i, new Path(path), pos, splitLength, hosts);
        pos += splitLength;
    }
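    // the last split takes the remaining bytes, absorbing the rounding loss of the integer division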
    tmp[noSplits - 1] = new FileInputSplit(noSplits - 1, new Path(path), pos, inFile.length() - pos, hosts);
    this.inputSplits = tmp;
}
Also used: Path (org.apache.flink.core.fs.Path), FileInputSplit (org.apache.flink.core.fs.FileInputSplit), InputSplit (org.apache.flink.core.io.InputSplit), URI (java.net.URI), URISyntaxException (java.net.URISyntaxException), File (java.io.File)
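
For context, a FileInputSplit only describes a byte range of a file; the consumer is expected to open the file and read exactly that range. The sketch below shows one way to do that with Flink's FileSystem API; the class and method names (SplitReader, readSplit) are illustrative, and the sketch assumes the split fits into a single in-memory buffer.

import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.core.fs.FileSystem;

class SplitReader {

    // Reads the byte range described by the given split into a buffer.
    static byte[] readSplit(FileInputSplit split) throws Exception {
        FileSystem fs = split.getPath().getFileSystem();
        byte[] buffer = new byte[(int) split.getLength()];
        try (FSDataInputStream in = fs.open(split.getPath())) {
            // jump to the first byte of the split
            in.seek(split.getStart());
            int read = 0;
            while (read < buffer.length) {
                int n = in.read(buffer, read, buffer.length - read);
                if (n < 0) {
                    break; // end of file reached before the split was filled
                }
                read += n;
            }
        }
        return buffer;
    }
}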

Aggregations

FileInputSplit (org.apache.flink.core.fs.FileInputSplit): 140
Test (org.junit.Test): 119
Configuration (org.apache.flink.configuration.Configuration): 93
Path (org.apache.flink.core.fs.Path): 59
IOException (java.io.IOException): 45
File (java.io.File): 36
FileOutputStream (java.io.FileOutputStream): 23
TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation): 20
Row (org.apache.flink.types.Row): 20
OutputStreamWriter (java.io.OutputStreamWriter): 18
ParseException (org.apache.flink.api.common.io.ParseException): 17
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 17
DoubleValue (org.apache.flink.types.DoubleValue): 17
IntValue (org.apache.flink.types.IntValue): 17
LongValue (org.apache.flink.types.LongValue): 17
StringValue (org.apache.flink.types.StringValue): 17
Value (org.apache.flink.types.Value): 17
Plan (org.apache.flink.api.common.Plan): 12
ReplicatingInputFormat (org.apache.flink.api.common.io.ReplicatingInputFormat): 12
Tuple1 (org.apache.flink.api.java.tuple.Tuple1): 12