Use of org.apache.flink.streaming.api.functions.source.ParallelSourceFunction in project flink by apache.
The class StreamExecutionEnvironment, method addSource.
/**
 * Adds a data source with custom type information, thus opening a
 * {@link DataStream}. Only in very special cases does the user need to
 * supply type information explicitly. Otherwise use
 * {@link #addSource(org.apache.flink.streaming.api.functions.source.SourceFunction)}.
 *
 * @param function
 *            the user defined function
 * @param sourceName
 *            name of the data source
 * @param typeInfo
 *            the user defined type information for the stream
 * @param <OUT>
 *            type of the returned stream
 * @return the data stream constructed
 */
@SuppressWarnings("unchecked")
public <OUT> DataStreamSource<OUT> addSource(
        SourceFunction<OUT> function, String sourceName, TypeInformation<OUT> typeInfo) {

    if (typeInfo == null) {
        if (function instanceof ResultTypeQueryable) {
            typeInfo = ((ResultTypeQueryable<OUT>) function).getProducedType();
        } else {
            try {
                typeInfo = TypeExtractor.createTypeInfo(
                        SourceFunction.class, function.getClass(), 0, null, null);
            } catch (final InvalidTypesException e) {
                typeInfo = (TypeInformation<OUT>) new MissingTypeInfo(sourceName, e);
            }
        }
    }

    boolean isParallel = function instanceof ParallelSourceFunction;

    clean(function);

    StreamSource<OUT, ?> sourceOperator;
    if (function instanceof StoppableFunction) {
        sourceOperator = new StoppableStreamSource<>(cast2StoppableSourceFunction(function));
    } else {
        sourceOperator = new StreamSource<>(function);
    }

    return new DataStreamSource<>(this, typeInfo, sourceOperator, isParallel, sourceName);
}
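For reference, a minimal caller-side sketch (not taken from the Flink sources; source logic, names, and the usual flink-streaming imports are assumed) of how this overload is typically used with a ParallelSourceFunction and explicit type information, so the TypeExtractor fallback above is never exercised:

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

DataStreamSource<Long> numbers = env.addSource(
        new ParallelSourceFunction<Long>() {
            private volatile boolean running = true;

            @Override
            public void run(SourceContext<Long> ctx) throws Exception {
                long i = 0;
                while (running) {
                    // emit under the checkpoint lock, as required by the legacy source contract
                    synchronized (ctx.getCheckpointLock()) {
                        ctx.collect(i++);
                    }
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        },
        "parallel-counter",             // sourceName
        BasicTypeInfo.LONG_TYPE_INFO);  // explicit TypeInformation<Long>

// Allowed only because the function is a ParallelSourceFunction (isParallel == true above).
numbers.setParallelism(4);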
Use of org.apache.flink.streaming.api.functions.source.ParallelSourceFunction in project flink by apache.
The class CommonExecTableSourceScan, method createSourceFunctionTransformation.
/**
 * Adapted from {@link StreamExecutionEnvironment#addSource(SourceFunction, String,
* TypeInformation)} but with custom {@link Boundedness}.
*/
protected Transformation<RowData> createSourceFunctionTransformation(
        StreamExecutionEnvironment env,
        SourceFunction<RowData> function,
        boolean isBounded,
        String operatorName,
        TypeInformation<RowData> outputTypeInfo) {

    env.clean(function);

    final int parallelism;
    if (function instanceof ParallelSourceFunction) {
        parallelism = env.getParallelism();
    } else {
        parallelism = 1;
    }

    final Boundedness boundedness;
    if (isBounded) {
        boundedness = Boundedness.BOUNDED;
    } else {
        boundedness = Boundedness.CONTINUOUS_UNBOUNDED;
    }

    final StreamSource<RowData, ?> sourceOperator = new StreamSource<>(function, !isBounded);
    return new LegacySourceTransformation<>(
            operatorName, sourceOperator, outputTypeInfo, parallelism, boundedness);
}
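A hypothetical call site from a subclass of CommonExecTableSourceScan might look like the sketch below; the source function, row type, and names are assumptions, not code from the Flink planner.

Transformation<RowData> transformation =
        createSourceFunctionTransformation(
                env,
                sourceFunction,                 // assumed SourceFunction<RowData>, possibly a ParallelSourceFunction
                true,                           // bounded: !isBounded disables progressive watermark emission in StreamSource
                "MyLegacyTableSource",          // operator name shown in the web UI
                InternalTypeInfo.of(rowType));  // assumed RowType describing the produced rows

Compared to env.addSource, the visible differences are the explicit Boundedness carried by the LegacySourceTransformation and the suppression of progressive watermarks for bounded input.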
Use of org.apache.flink.streaming.api.functions.source.ParallelSourceFunction in project flink by apache.
The class StreamingJobGraphGeneratorTest, method createJobGraphForManagedMemoryFractionTest.
private JobGraph createJobGraphForManagedMemoryFractionTest(
        final List<ResourceSpec> resourceSpecs,
        final List<Map<ManagedMemoryUseCase, Integer>> operatorScopeUseCaseWeights,
        final List<Set<ManagedMemoryUseCase>> slotScopeUseCases)
        throws Exception {

    final Method opMethod = getSetResourcesMethodAndSetAccessible(SingleOutputStreamOperator.class);

    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    final DataStream<Integer> source =
            env.addSource(
                    new ParallelSourceFunction<Integer>() {
                        @Override
                        public void run(SourceContext<Integer> ctx) {}

                        @Override
                        public void cancel() {}
                    });
    opMethod.invoke(source, resourceSpecs.get(0));

    // CHAIN(source -> map1) in default slot sharing group
    final DataStream<Integer> map1 = source.map((MapFunction<Integer, Integer>) value -> value);
    opMethod.invoke(map1, resourceSpecs.get(1));

    // CHAIN(map2) in default slot sharing group
    final DataStream<Integer> map2 =
            map1.rebalance().map((MapFunction<Integer, Integer>) value -> value);
    opMethod.invoke(map2, resourceSpecs.get(2));

    // CHAIN(map3) in test slot sharing group
    final DataStream<Integer> map3 = map2.rebalance().map(value -> value).slotSharingGroup("test");
    opMethod.invoke(map3, resourceSpecs.get(3));

    declareManagedMemoryUseCaseForTranformation(
            source.getTransformation(), operatorScopeUseCaseWeights.get(0), slotScopeUseCases.get(0));
    declareManagedMemoryUseCaseForTranformation(
            map1.getTransformation(), operatorScopeUseCaseWeights.get(1), slotScopeUseCases.get(1));
    declareManagedMemoryUseCaseForTranformation(
            map2.getTransformation(), operatorScopeUseCaseWeights.get(2), slotScopeUseCases.get(2));
    declareManagedMemoryUseCaseForTranformation(
            map3.getTransformation(), operatorScopeUseCaseWeights.get(3), slotScopeUseCases.get(3));

    return StreamingJobGraphGenerator.createJobGraph(env.getStreamGraph());
}
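The helper declareManagedMemoryUseCaseForTranformation is a private test method whose body is not shown here; a plausible sketch of what it does, using Flink's public Transformation API, is:

// Sketch only: the real helper body is not part of this excerpt.
private void declareManagedMemoryUseCaseForTranformation(
        Transformation<?> transformation,
        Map<ManagedMemoryUseCase, Integer> operatorScopeUseCaseWeights,
        Set<ManagedMemoryUseCase> slotScopeUseCases) {
    // operator-scoped use cases carry a weight used to split the managed memory fraction
    for (Map.Entry<ManagedMemoryUseCase, Integer> entry : operatorScopeUseCaseWeights.entrySet()) {
        transformation.declareManagedMemoryUseCaseAtOperatorScope(entry.getKey(), entry.getValue());
    }
    // slot-scoped use cases are declared without a weight
    for (ManagedMemoryUseCase useCase : slotScopeUseCases) {
        transformation.declareManagedMemoryUseCaseAtSlotScope(useCase);
    }
}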
Use of org.apache.flink.streaming.api.functions.source.ParallelSourceFunction in project flink by apache.
The class StreamingJobGraphGeneratorTest, method testResourcesForIteration.
/**
 * Verifies that resources are merged correctly for chained operators (covering mid-chain and
 * iteration cases) when generating the job graph.
 */
@Test
public void testResourcesForIteration() throws Exception {
    ResourceSpec resource1 = ResourceSpec.newBuilder(0.1, 100).build();
    ResourceSpec resource2 = ResourceSpec.newBuilder(0.2, 200).build();
    ResourceSpec resource3 = ResourceSpec.newBuilder(0.3, 300).build();
    ResourceSpec resource4 = ResourceSpec.newBuilder(0.4, 400).build();
    ResourceSpec resource5 = ResourceSpec.newBuilder(0.5, 500).build();

    Method opMethod = getSetResourcesMethodAndSetAccessible(SingleOutputStreamOperator.class);
    Method sinkMethod = getSetResourcesMethodAndSetAccessible(DataStreamSink.class);

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    DataStream<Integer> source =
            env.addSource(
                            new ParallelSourceFunction<Integer>() {
                                @Override
                                public void run(SourceContext<Integer> ctx) throws Exception {}

                                @Override
                                public void cancel() {}
                            })
                    .name("test_source");
    opMethod.invoke(source, resource1);

    IterativeStream<Integer> iteration = source.iterate(3000);
    opMethod.invoke(iteration, resource2);

    DataStream<Integer> flatMap =
            iteration
                    .flatMap(
                            new FlatMapFunction<Integer, Integer>() {
                                @Override
                                public void flatMap(Integer value, Collector<Integer> out)
                                        throws Exception {
                                    out.collect(value);
                                }
                            })
                    .name("test_flatMap");
    opMethod.invoke(flatMap, resource3);

    // CHAIN(flatMap -> filter)
    DataStream<Integer> increment =
            flatMap.filter(
                            new FilterFunction<Integer>() {
                                @Override
                                public boolean filter(Integer value) throws Exception {
                                    return false;
                                }
                            })
                    .name("test_filter");
    opMethod.invoke(increment, resource4);

    DataStreamSink<Integer> sink =
            iteration
                    .closeWith(increment)
                    .addSink(
                            new SinkFunction<Integer>() {
                                @Override
                                public void invoke(Integer value) throws Exception {}
                            })
                    .disableChaining()
                    .name("test_sink");
    sinkMethod.invoke(sink, resource5);

    JobGraph jobGraph = StreamingJobGraphGenerator.createJobGraph(env.getStreamGraph());

    for (JobVertex jobVertex : jobGraph.getVertices()) {
        if (jobVertex.getName().contains("test_source")) {
            assertTrue(jobVertex.getMinResources().equals(resource1));
        } else if (jobVertex.getName().contains("Iteration_Source")) {
            assertTrue(jobVertex.getPreferredResources().equals(resource2));
        } else if (jobVertex.getName().contains("test_flatMap")) {
            assertTrue(jobVertex.getMinResources().equals(resource3.merge(resource4)));
        } else if (jobVertex.getName().contains("Iteration_Tail")) {
            assertTrue(jobVertex.getPreferredResources().equals(ResourceSpec.DEFAULT));
        } else if (jobVertex.getName().contains("test_sink")) {
            assertTrue(jobVertex.getMinResources().equals(resource5));
        }
    }
}
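For the chained flatMap -> filter vertex the test expects the merged spec of both operators. A small illustration of how ResourceSpec.merge combines them, assuming the (cpu cores, task heap MB) builder overload used in the test above:

ResourceSpec resource3 = ResourceSpec.newBuilder(0.3, 300).build(); // 0.3 cores, 300 MB task heap
ResourceSpec resource4 = ResourceSpec.newBuilder(0.4, 400).build(); // 0.4 cores, 400 MB task heap
ResourceSpec chained = resource3.merge(resource4);                  // summed: 0.7 cores, 700 MB task heap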
Use of org.apache.flink.streaming.api.functions.source.ParallelSourceFunction in project flink by apache.
The class RescalePartitionerTest, method testExecutionGraphGeneration.
@Test
public void testExecutionGraphGeneration() throws Exception {
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    // get input data
    DataStream<String> text =
            env.addSource(
                            new ParallelSourceFunction<String>() {
                                private static final long serialVersionUID = 7772338606389180774L;

                                @Override
                                public void run(SourceContext<String> ctx) throws Exception {}

                                @Override
                                public void cancel() {}
                            })
                    .setParallelism(2);

    DataStream<Tuple2<String, Integer>> counts =
            text.rescale()
                    .flatMap(
                            new FlatMapFunction<String, Tuple2<String, Integer>>() {
                                private static final long serialVersionUID = -5255930322161596829L;

                                @Override
                                public void flatMap(String value, Collector<Tuple2<String, Integer>> out)
                                        throws Exception {}
                            });

    counts.rescale().print().setParallelism(2);

    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    List<JobVertex> jobVertices = jobGraph.getVerticesSortedTopologicallyFromSources();

    JobVertex sourceVertex = jobVertices.get(0);
    JobVertex mapVertex = jobVertices.get(1);
    JobVertex sinkVertex = jobVertices.get(2);

    assertEquals(2, sourceVertex.getParallelism());
    assertEquals(4, mapVertex.getParallelism());
    assertEquals(2, sinkVertex.getParallelism());

    ExecutionGraph eg =
            TestingDefaultExecutionGraphBuilder.newBuilder()
                    .setVertexParallelismStore(
                            SchedulerBase.computeVertexParallelismStore(jobGraph))
                    .build();
    try {
        eg.attachJobGraph(jobVertices);
    } catch (JobException e) {
        e.printStackTrace();
        fail("Building ExecutionGraph failed: " + e.getMessage());
    }

    ExecutionJobVertex execSourceVertex = eg.getJobVertex(sourceVertex.getID());
    ExecutionJobVertex execMapVertex = eg.getJobVertex(mapVertex.getID());
    ExecutionJobVertex execSinkVertex = eg.getJobVertex(sinkVertex.getID());

    assertEquals(0, execSourceVertex.getInputs().size());
    assertEquals(1, execMapVertex.getInputs().size());
    assertEquals(4, execMapVertex.getParallelism());

    ExecutionVertex[] mapTaskVertices = execMapVertex.getTaskVertices();

    // verify that we have each parallel input partition exactly twice, i.e. that one source
    // sends to two unique mappers
    Map<Integer, Integer> mapInputPartitionCounts = new HashMap<>();
    for (ExecutionVertex mapTaskVertex : mapTaskVertices) {
        assertEquals(1, mapTaskVertex.getNumberOfInputs());
        assertEquals(1, mapTaskVertex.getConsumedPartitionGroup(0).size());

        IntermediateResultPartitionID consumedPartitionId =
                mapTaskVertex.getConsumedPartitionGroup(0).getFirst();
        assertEquals(
                sourceVertex.getID(),
                mapTaskVertex
                        .getExecutionGraphAccessor()
                        .getResultPartitionOrThrow(consumedPartitionId)
                        .getProducer()
                        .getJobvertexId());

        int inputPartition = consumedPartitionId.getPartitionNumber();
        if (!mapInputPartitionCounts.containsKey(inputPartition)) {
            mapInputPartitionCounts.put(inputPartition, 1);
        } else {
            mapInputPartitionCounts.put(inputPartition, mapInputPartitionCounts.get(inputPartition) + 1);
        }
    }

    assertEquals(2, mapInputPartitionCounts.size());
    for (int count : mapInputPartitionCounts.values()) {
        assertEquals(2, count);
    }

    assertEquals(1, execSinkVertex.getInputs().size());
    assertEquals(2, execSinkVertex.getParallelism());

    ExecutionVertex[] sinkTaskVertices = execSinkVertex.getTaskVertices();
    InternalExecutionGraphAccessor executionGraphAccessor = execSinkVertex.getGraph();

    // verify each sink instance has two inputs from the map and that each map subpartition
    // only occurs in one unique input edge
    Set<Integer> mapSubpartitions = new HashSet<>();
    for (ExecutionVertex sinkTaskVertex : sinkTaskVertices) {
        assertEquals(1, sinkTaskVertex.getNumberOfInputs());
        assertEquals(2, sinkTaskVertex.getConsumedPartitionGroup(0).size());
        for (IntermediateResultPartitionID consumedPartitionId :
                sinkTaskVertex.getConsumedPartitionGroup(0)) {
            IntermediateResultPartition consumedPartition =
                    executionGraphAccessor.getResultPartitionOrThrow(consumedPartitionId);
            assertEquals(mapVertex.getID(), consumedPartition.getProducer().getJobvertexId());

            int partitionNumber = consumedPartition.getPartitionNumber();
            assertFalse(mapSubpartitions.contains(partitionNumber));
            mapSubpartitions.add(partitionNumber);
        }
    }

    assertEquals(4, mapSubpartitions.size());
}
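As a compact illustration of the rescale behavior these assertions verify: with upstream parallelism 2 and downstream parallelism 4, each upstream subtask feeds exactly two fixed downstream subtasks instead of all of them. The sketch below is not part of the test and uses the built-in fromSequence source rather than a hand-written ParallelSourceFunction to keep it short; names are illustrative.

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(4);

env.fromSequence(0, 1_000)                           // parallel source
        .setParallelism(2)
        .rescale()                                   // 2 -> 4: each source subtask serves 2 map subtasks
        .map((MapFunction<Long, Long>) value -> value * 2)
        .rescale()                                   // 4 -> 2: pairs of map subtasks share a sink subtask
        .print()
        .setParallelism(2);

env.execute("rescale-example");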