Use of com.datatorrent.api.Partitioner.Partition in project apex-malhar by apache.
Class AbstractFileInputOperatorTest, method testPartitioningStateTransferInterrupted.
/**
 * Test for dynamic partitioning.
 * - Create 4 files with 3 records each.
 * - Create a single partition, and read some records, populating pending files in the operator.
 * - Split it into two operators.
 * - Try to emit the remaining records.
 */
@Test
public void testPartitioningStateTransferInterrupted() throws Exception {
  LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
  oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
  oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
  oper.setScanIntervalMillis(0);
  oper.setEmitBatchSize(2);
  LineByLineFileInputOperator initialState = new Kryo().copy(oper);
  // Create 4 files with 3 records each.
  Path path = new Path(new File(testMeta.dir).getAbsolutePath());
  FileContext.getLocalFSFileContext().delete(path, true);
  int file;
  for (file = 0; file < 4; file++) {
    FileUtils.write(new File(testMeta.dir, "partition00" + file), "a\nb\nc\n");
  }
  CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
  @SuppressWarnings({ "unchecked", "rawtypes" })
  CollectorTestSink<Object> sink = (CollectorTestSink) queryResults;
  oper.output.setSink(sink);
  int wid = 0;
  // Read some records
  oper.setup(testMeta.context);
  for (int i = 0; i < 5; i++) {
    oper.beginWindow(wid);
    oper.emitTuples();
    oper.endWindow();
    wid++;
  }
  Assert.assertEquals("Partial tuples read ", 6, sink.collectedTuples.size());
  Assert.assertEquals(1, initialState.getCurrentPartitions());
  initialState.setPartitionCount(2);
  StatsListener.Response rsp = initialState.processStats(null);
  Assert.assertEquals(true, rsp.repartitionRequired);
  // Create partitions of the operator.
  List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
  partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
  // Incremental capacity is controlled via the partitionCount property.
  Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = initialState.definePartitions(partitions, new PartitioningContextImpl(null, 0));
  Assert.assertEquals(2, newPartitions.size());
  Assert.assertEquals(1, initialState.getCurrentPartitions());
  Map<Integer, Partition<AbstractFileInputOperator<String>>> m = Maps.newHashMap();
  for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
    m.put(m.size(), p);
  }
  initialState.partitioned(m);
  Assert.assertEquals(2, initialState.getCurrentPartitions());
  /* Collect all operators in a list */
  List<AbstractFileInputOperator<String>> opers = Lists.newArrayList();
  for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
    LineByLineFileInputOperator oi = (LineByLineFileInputOperator) p.getPartitionedInstance();
    oi.setup(testMeta.context);
    oi.output.setSink(sink);
    opers.add(oi);
  }
  sink.clear();
  for (int i = 0; i < 10; i++) {
    for (AbstractFileInputOperator<String> o : opers) {
      o.beginWindow(wid);
      o.emitTuples();
      o.endWindow();
    }
    wid++;
  }
  Assert.assertEquals("Remaining tuples read ", 6, sink.collectedTuples.size());
}
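The test drives the Partitioner contract directly, since AbstractFileInputOperator acts as its own partitioner. For orientation, here is a minimal stand-alone sketch of the same two-method contract; the class name, fixed-count policy, and Kryo cloning are our illustration, not code from this page:

// Minimal Partitioner sketch (hypothetical): clone a template operator into a fixed
// number of partitions. A real partitioner would also redistribute state, as
// AbstractFileInputOperator does with its pending/processed file lists.
public static class FixedCountPartitioner implements Partitioner<LineByLineFileInputOperator> {
  private final int count;

  public FixedCountPartitioner(int count) {
    this.count = count;
  }

  @Override
  public Collection<Partition<LineByLineFileInputOperator>> definePartitions(Collection<Partition<LineByLineFileInputOperator>> partitions, PartitioningContext context) {
    // Use one existing instance as the template for every new partition.
    LineByLineFileInputOperator template = partitions.iterator().next().getPartitionedInstance();
    List<Partition<LineByLineFileInputOperator>> result = Lists.newArrayList();
    for (int i = 0; i < count; i++) {
      result.add(new DefaultPartition<LineByLineFileInputOperator>(new Kryo().copy(template)));
    }
    return result;
  }

  @Override
  public void partitioned(Map<Integer, Partition<LineByLineFileInputOperator>> partitions) {
    // Invoked once the engine has deployed the new physical operators; nothing to do here.
  }
}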
Use of com.datatorrent.api.Partitioner.Partition in project apex-malhar by apache.
Class AbstractFileInputOperatorTest, method testWindowDataManagerPartitioning.
@Test
public void testWindowDataManagerPartitioning() throws Exception {
  LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
  oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
  oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
  oper.setWindowDataManager(new FSWindowDataManager());
  oper.operatorId = 7;
  Path path = new Path(new File(testMeta.dir).getAbsolutePath());
  FileContext.getLocalFSFileContext().delete(path, true);
  for (int file = 0; file < 4; file++) {
    FileUtils.write(new File(testMeta.dir, "partition00" + file), "");
  }
  List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
  partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
  Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = oper.definePartitions(partitions, new PartitioningContextImpl(null, 2));
  Assert.assertEquals(2, newPartitions.size());
  Assert.assertEquals(1, oper.getCurrentPartitions());
  List<FSWindowDataManager> storageManagers = Lists.newLinkedList();
  for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
    storageManagers.add((FSWindowDataManager) p.getPartitionedInstance().getWindowDataManager());
  }
  Assert.assertEquals("count of storage managers", 2, storageManagers.size());
  int countOfDeleteManagers = 0;
  FSWindowDataManager deleteManager = null;
  for (FSWindowDataManager storageManager : storageManagers) {
    if (storageManager.getDeletedOperators() != null) {
      countOfDeleteManagers++;
      deleteManager = storageManager;
    }
  }
  Assert.assertEquals("count of delete managers", 1, countOfDeleteManagers);
  Assert.assertNotNull("deleted operators manager", deleteManager);
  Assert.assertEquals("deleted operators", Sets.newHashSet(7), deleteManager.getDeletedOperators());
}
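What the assertions establish: the single FSWindowDataManager fans out into one manager per new partition, and exactly one of them records operator id 7 as deleted, so the retired operator's saved window state can later be reclaimed. A sketch of that fan-out, assuming WindowDataManager exposes a partition(newCount, removedOperatorIds) method; only getDeletedOperators() is confirmed by the test itself:

// Hypothetical fan-out mirroring what definePartitions does internally.
Set<Integer> removedOperatorIds = Sets.newHashSet(7); // id of the retiring physical operator
List<WindowDataManager> managers = oper.getWindowDataManager().partition(2, removedOperatorIds);
for (WindowDataManager manager : managers) {
  // Each operator clone receives one manager; the removed id travels with one of them,
  // which is why exactly one getDeletedOperators() above is non-null.
  LineByLineFileInputOperator clone = new Kryo().copy(oper);
  clone.setWindowDataManager(manager);
}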
Use of com.datatorrent.api.Partitioner.Partition in project apex-malhar by apache.
Class AbstractFileInputOperatorTest, method testPartitioning.
@Test
public void testPartitioning() throws Exception {
  LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
  oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
  oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
  Path path = new Path(new File(testMeta.dir).getAbsolutePath());
  FileContext.getLocalFSFileContext().delete(path, true);
  for (int file = 0; file < 4; file++) {
    FileUtils.write(new File(testMeta.dir, "partition00" + file), "");
  }
  List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
  partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
  Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = oper.definePartitions(partitions, new PartitioningContextImpl(null, 2));
  Assert.assertEquals(2, newPartitions.size());
  // partitioned() wasn't called
  Assert.assertEquals(1, oper.getCurrentPartitions());
  for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
    Assert.assertNotSame(oper, p.getPartitionedInstance());
    Assert.assertNotSame(oper.getScanner(), p.getPartitionedInstance().getScanner());
    Set<String> consumed = Sets.newHashSet();
    LinkedHashSet<Path> files = p.getPartitionedInstance().getScanner().scan(FileSystem.getLocal(new Configuration(false)), path, consumed);
Assert.assertEquals("partition " + files, 3, files.size());
  }
}
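The per-partition file counts follow from the default DirectoryScanner, which routes each file by hashing its path and comparing the result against the scanner's slot. A sketch of that rule as we infer it from this test (names are illustrative; the real logic lives in AbstractFileInputOperator.DirectoryScanner):

// Illustrative version of the default partition filter: a scanner accepts a file
// iff the file's path hash lands on the scanner's partition slot.
boolean acceptsFile(String filePathStr, int partitionIndex, int partitionCount) {
  if (partitionCount > 1) {
    int mod = filePathStr.hashCode() % partitionCount;
    if (mod < 0) {
      mod += partitionCount; // Java's % keeps the dividend's sign
    }
    return mod == partitionIndex;
  }
  return true;
}

Since partition000 through partition003 differ only in their final digit, consecutive names produce String hash codes that differ by exactly one, so the four files split evenly, two to each partition, which is what the assertion checks.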
Use of com.datatorrent.api.Partitioner.Partition in project apex-malhar by apache.
Class AbstractFileInputOperatorTest, method testWithCustomScanner.
/**
 * Partition the operator in 2.
 * Create ten files with the index of the file at the start, i.e. 1_file, 2_file, etc.
 * The scanner returns this index from its getPartition method.
 * Each partition should read 5 files, as the file indices run from 0 to 9 (inclusive).
 * @throws Exception
 */
@Test
public void testWithCustomScanner() throws Exception {
  LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
  oper.setScanner(new MyScanner());
  oper.getScanner().setFilePatternRegexp(".*partition_([\\d]*)");
  oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
  Random rand = new Random();
  Path path = new Path(new File(testMeta.dir).getAbsolutePath());
  FileContext.getLocalFSFileContext().delete(path, true);
  for (int file = 0; file < 10; file++) {
    FileUtils.write(new File(testMeta.dir, file + "_partition_00" + rand.nextInt(100)), "");
  }
  List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
  partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
  Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = oper.definePartitions(partitions, new PartitioningContextImpl(null, 2));
  Assert.assertEquals(2, newPartitions.size());
  // partitioned() wasn't called
  Assert.assertEquals(1, oper.getCurrentPartitions());
  for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
    Assert.assertNotSame(oper, p.getPartitionedInstance());
    Assert.assertNotSame(oper.getScanner(), p.getPartitionedInstance().getScanner());
    Set<String> consumed = Sets.newHashSet();
    LinkedHashSet<Path> files = p.getPartitionedInstance().getScanner().scan(FileSystem.getLocal(new Configuration(false)), path, consumed);
Assert.assertEquals("partition " + files, 6, files.size());
  }
}
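MyScanner itself is not shown on this page. Per the javadoc above, it overrides getPartition to return the leading file index instead of a hash; a plausible reconstruction (treat it as a sketch, the real class lives elsewhere in the test file):

// Hypothetical reconstruction of MyScanner: partition files by their integer name prefix.
public static class MyScanner extends AbstractFileInputOperator.DirectoryScanner {
  @Override
  protected int getPartition(String filePathStr) {
    String name = filePathStr.substring(filePathStr.lastIndexOf('/') + 1); // e.g. "7_partition_0042"
    return Integer.parseInt(name.substring(0, name.indexOf('_')));
  }
}

With indices 0 through 9 and two partitions, the modulo split sends five files to each scanner, matching the count asserted above.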
Use of com.datatorrent.api.Partitioner.Partition in project apex-core by apache.
Class PhysicalPlan, method redoPartitions.
private void redoPartitions(PMapping currentMapping, String note) {
  Partitioner<Operator> partitioner = getPartitioner(currentMapping);
  if (partitioner == null) {
    LOG.warn("No partitioner for {}", currentMapping.logicalOperator);
    return;
  }
  RepartitionContext mainPC = new RepartitionContext(partitioner, currentMapping, 0);
  if (mainPC.newPartitions.isEmpty()) {
    LOG.warn("Empty partition list after repartition: {}", currentMapping.logicalOperator);
    return;
  }
  int memoryPerPartition = currentMapping.logicalOperator.getValue(OperatorContext.MEMORY_MB);
  for (Map.Entry<OutputPortMeta, StreamMeta> stream : currentMapping.logicalOperator.getOutputStreams().entrySet()) {
    if (stream.getValue().getLocality() != Locality.THREAD_LOCAL && stream.getValue().getLocality() != Locality.CONTAINER_LOCAL) {
      memoryPerPartition += stream.getKey().getValue(PortContext.BUFFER_MEMORY_MB);
    }
  }
  for (OperatorMeta pp : currentMapping.parallelPartitions) {
    for (Map.Entry<OutputPortMeta, StreamMeta> stream : pp.getOutputStreams().entrySet()) {
      if (stream.getValue().getLocality() != Locality.THREAD_LOCAL && stream.getValue().getLocality() != Locality.CONTAINER_LOCAL) {
        memoryPerPartition += stream.getKey().getValue(PortContext.BUFFER_MEMORY_MB);
      }
    }
    memoryPerPartition += pp.getValue(OperatorContext.MEMORY_MB);
  }
  int requiredMemoryMB = (mainPC.newPartitions.size() - mainPC.currentPartitions.size()) * memoryPerPartition;
  if (requiredMemoryMB > availableMemoryMB) {
    LOG.warn("Insufficient headroom for repartitioning: available {}m required {}m", availableMemoryMB, requiredMemoryMB);
    return;
  }
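  // Worked example with illustrative numbers (not from the source): growing from 2 to 4
  // partitions where the operator has MEMORY_MB = 1024 and one non-local output port
  // with BUFFER_MEMORY_MB = 512 requires (4 - 2) * (1024 + 512) = 3072 MB of headroom.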
  List<Partition<Operator>> addedPartitions = new ArrayList<>();
  // determine modifications of partition set, identify affected operator instance(s)
  for (Partition<Operator> newPartition : mainPC.newPartitions) {
    PTOperator op = mainPC.currentPartitionMap.remove(newPartition);
    if (op == null) {
      addedPartitions.add(newPartition);
    } else {
      // check whether mapping was changed
      for (DefaultPartition<Operator> pi : mainPC.currentPartitions) {
        if (pi == newPartition && pi.isModified()) {
          // existing partition changed (operator or partition keys)
          // remove/add to update subscribers and state
          mainPC.currentPartitionMap.put(newPartition, op);
          addedPartitions.add(newPartition);
        }
      }
    }
  }
  // remaining entries represent deprecated partitions
  this.undeployOpers.addAll(mainPC.currentPartitionMap.values());
  // downstream dependencies require redeploy, resolve prior to modifying plan
  Set<PTOperator> deps = this.getDependents(mainPC.currentPartitionMap.values());
  this.undeployOpers.addAll(deps);
  // dependencies need redeploy too, except operators that are themselves being removed
  this.deployOpers.addAll(deps);
  // process parallel partitions before removing operators from the plan
  LinkedHashMap<PMapping, RepartitionContext> partitionContexts = Maps.newLinkedHashMap();
  Stack<OperatorMeta> parallelPartitions = new Stack<>();
  parallelPartitions.addAll(currentMapping.parallelPartitions);
  pendingLoop:
  while (!parallelPartitions.isEmpty()) {
    OperatorMeta ppMeta = parallelPartitions.pop();
    for (StreamMeta s : ppMeta.getInputStreams().values()) {
      if (currentMapping.parallelPartitions.contains(s.getSource().getOperatorMeta()) && parallelPartitions.contains(s.getSource().getOperatorMeta())) {
        parallelPartitions.push(ppMeta);
        parallelPartitions.remove(s.getSource().getOperatorMeta());
        parallelPartitions.push(s.getSource().getOperatorMeta());
        continue pendingLoop;
      }
    }
    LOG.debug("Processing parallel partition {}", ppMeta);
    PMapping ppm = this.logicalToPTOperator.get(ppMeta);
    Partitioner<Operator> ppp = getPartitioner(ppm);
    if (ppp == null) {
      partitionContexts.put(ppm, null);
    } else {
      RepartitionContext pc = new RepartitionContext(ppp, ppm, mainPC.newPartitions.size());
      if (pc.newPartitions == null) {
        throw new IllegalStateException("Partitioner returns null for parallel partition " + ppm.logicalOperator);
      }
      partitionContexts.put(ppm, pc);
    }
  }
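  // Note: the stack manipulation above amounts to a dependency-ordered (topological)
  // visit, ensuring an upstream parallel partition is processed before its downstream ones.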
  // plan updates start here, after all changes were identified
  // remove obsolete operators first; any freed resources
  // can subsequently be used for new/modified partitions
  List<PTOperator> copyPartitions = Lists.newArrayList(currentMapping.partitions);
  // remove deprecated partitions from plan
  for (PTOperator p : mainPC.currentPartitionMap.values()) {
    copyPartitions.remove(p);
    removePartition(p, currentMapping);
    mainPC.operatorIdToPartition.remove(p.getId());
  }
  currentMapping.partitions = copyPartitions;
  // add new operators
  for (Partition<Operator> newPartition : addedPartitions) {
    PTOperator p = addPTOperator(currentMapping, newPartition, mainPC.minCheckpoint);
    mainPC.operatorIdToPartition.put(p.getId(), newPartition);
  }
  // process parallel partition changes
  for (Map.Entry<PMapping, RepartitionContext> e : partitionContexts.entrySet()) {
    if (e.getValue() == null) {
      // no partitioner; add the required number of operators
      for (int i = 0; i < addedPartitions.size(); i++) {
        LOG.debug("Automatically adding to parallel partition {}", e.getKey());
        // set activation windowId to conform to upstream checkpoints
        addPTOperator(e.getKey(), null, mainPC.minCheckpoint);
      }
    } else {
      RepartitionContext pc = e.getValue();
      // track previous parallel partition mapping
      Map<Partition<Operator>, Partition<Operator>> prevMapping = Maps.newHashMap();
      for (int i = 0; i < mainPC.currentPartitions.size(); i++) {
        prevMapping.put(pc.currentPartitions.get(i), mainPC.currentPartitions.get(i));
      }
      // determine which new partitions match upstream; the remainder are treated as new operators
      Map<Partition<Operator>, Partition<Operator>> newMapping = Maps.newHashMap();
      Iterator<Partition<Operator>> itMain = mainPC.newPartitions.iterator();
      Iterator<Partition<Operator>> itParallel = pc.newPartitions.iterator();
      while (itMain.hasNext() && itParallel.hasNext()) {
        newMapping.put(itParallel.next(), itMain.next());
      }
      for (Partition<Operator> newPartition : pc.newPartitions) {
        PTOperator op = pc.currentPartitionMap.remove(newPartition);
        if (op == null) {
          pc.addedPartitions.add(newPartition);
        } else if (prevMapping.get(newPartition) != newMapping.get(newPartition)) {
          // upstream partitions don't match; remove/add to replace with a new operator
          pc.currentPartitionMap.put(newPartition, op);
          pc.addedPartitions.add(newPartition);
        } else {
          // check whether mapping was changed - based on DefaultPartition implementation
          for (DefaultPartition<Operator> pi : pc.currentPartitions) {
            if (pi == newPartition && pi.isModified()) {
              // existing partition changed (operator or partition keys)
              // remove/add to update subscribers and state
              mainPC.currentPartitionMap.put(newPartition, op);
              pc.addedPartitions.add(newPartition);
            }
          }
        }
      }
      if (!pc.currentPartitionMap.isEmpty()) {
        // remove obsolete partitions
        List<PTOperator> cowPartitions = Lists.newArrayList(e.getKey().partitions);
        for (PTOperator p : pc.currentPartitionMap.values()) {
          cowPartitions.remove(p);
          removePartition(p, e.getKey());
          pc.operatorIdToPartition.remove(p.getId());
        }
        e.getKey().partitions = cowPartitions;
      }
      // add new partitions
      for (Partition<Operator> newPartition : pc.addedPartitions) {
        PTOperator oper = addPTOperator(e.getKey(), newPartition, mainPC.minCheckpoint);
        pc.operatorIdToPartition.put(oper.getId(), newPartition);
      }
      getPartitioner(e.getKey()).partitioned(pc.operatorIdToPartition);
    }
  }
  updateStreamMappings(currentMapping);
  for (PMapping pp : partitionContexts.keySet()) {
    updateStreamMappings(pp);
  }
  deployChanges();
  if (mainPC.currentPartitions.size() != mainPC.newPartitions.size()) {
    StramEvent ev = new StramEvent.PartitionEvent(currentMapping.logicalOperator.getName(), mainPC.currentPartitions.size(), mainPC.newPartitions.size());
    ev.setReason(note);
    this.ctx.recordEventAsync(ev);
  }
  partitioner.partitioned(mainPC.operatorIdToPartition);
}
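For context on how redoPartitions is reached: the engine invokes it when a StatsListener response sets repartitionRequired, the same flag the malhar test above obtains from processStats. A minimal listener sketch follows; the class name, threshold, and throughput policy are illustrative assumptions, not apex-core code:

// Illustrative StatsListener: request scale-out when the throughput moving average
// drops below a threshold. The engine reacts to repartitionRequired by running the
// repartitioning flow shown in redoPartitions above.
public class ThroughputRepartitionListener implements StatsListener {
  private static final long MIN_TUPLES_PER_SECOND = 1000; // assumed threshold

  @Override
  public Response processStats(BatchedOperatorStats stats) {
    Response response = new Response();
    response.repartitionRequired = stats.getTuplesProcessedPSMA() < MIN_TUPLES_PER_SECOND;
    return response;
  }
}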