Search in sources :

Example 6 with Partition

use of com.datatorrent.api.Partitioner.Partition in project apex-malhar by apache.

the class AbstractFileInputOperatorTest method testPartitioningStateTransferInterrupted.

/**
 * Test for testing dynamic partitioning.
 * - Create 4 file with 3 records each.
 * - Create a single partition, and read some records, populating pending files in operator.
 * - Split it in two operators
 * - Try to emit the remaining records.
 */
@Test
public void testPartitioningStateTransferInterrupted() throws Exception {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
    oper.setScanIntervalMillis(0);
    oper.setEmitBatchSize(2);
    LineByLineFileInputOperator initialState = new Kryo().copy(oper);
    // Create 4 files with 3 records each.
    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    int file;
    for (file = 0; file < 4; file++) {
        FileUtils.write(new File(testMeta.dir, "partition00" + file), "a\nb\nc\n");
    }
    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({ "unchecked", "rawtypes" }) CollectorTestSink<Object> sink = (CollectorTestSink) queryResults;
    oper.output.setSink(sink);
    int wid = 0;
    // Read some records
    oper.setup(testMeta.context);
    for (int i = 0; i < 5; i++) {
        oper.beginWindow(wid);
        oper.emitTuples();
        oper.endWindow();
        wid++;
    }
    Assert.assertEquals("Partial tuples read ", 6, sink.collectedTuples.size());
    Assert.assertEquals(1, initialState.getCurrentPartitions());
    initialState.setPartitionCount(2);
    StatsListener.Response rsp = initialState.processStats(null);
    Assert.assertEquals(true, rsp.repartitionRequired);
    // Create partitions of the operator.
    List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
    partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    // incremental capacity controlled partitionCount property
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = initialState.definePartitions(partitions, new PartitioningContextImpl(null, 0));
    Assert.assertEquals(2, newPartitions.size());
    Assert.assertEquals(1, initialState.getCurrentPartitions());
    Map<Integer, Partition<AbstractFileInputOperator<String>>> m = Maps.newHashMap();
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
        m.put(m.size(), p);
    }
    initialState.partitioned(m);
    Assert.assertEquals(2, initialState.getCurrentPartitions());
    /* Collect all operators in a list */
    List<AbstractFileInputOperator<String>> opers = Lists.newArrayList();
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
        LineByLineFileInputOperator oi = (LineByLineFileInputOperator) p.getPartitionedInstance();
        oi.setup(testMeta.context);
        oi.output.setSink(sink);
        opers.add(oi);
    }
    sink.clear();
    for (int i = 0; i < 10; i++) {
        for (AbstractFileInputOperator<String> o : opers) {
            o.beginWindow(wid);
            o.emitTuples();
            o.endWindow();
        }
        wid++;
    }
    Assert.assertEquals("Remaining tuples read ", 6, sink.collectedTuples.size());
}
Also used : Path(org.apache.hadoop.fs.Path) Partition(com.datatorrent.api.Partitioner.Partition) DefaultPartition(com.datatorrent.api.DefaultPartition) StatsListener(com.datatorrent.api.StatsListener) PartitioningContextImpl(org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) Kryo(com.esotericsoftware.kryo.Kryo) CollectorTestSink(org.apache.apex.malhar.lib.testbench.CollectorTestSink) Test(org.junit.Test)

Example 7 with Partition

use of com.datatorrent.api.Partitioner.Partition in project apex-malhar by apache.

the class AbstractFileInputOperatorTest method testWindowDataManagerPartitioning.

@Test
public void testWindowDataManagerPartitioning() throws Exception {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
    oper.setWindowDataManager(new FSWindowDataManager());
    oper.operatorId = 7;
    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    for (int file = 0; file < 4; file++) {
        FileUtils.write(new File(testMeta.dir, "partition00" + file), "");
    }
    List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
    partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = oper.definePartitions(partitions, new PartitioningContextImpl(null, 2));
    Assert.assertEquals(2, newPartitions.size());
    Assert.assertEquals(1, oper.getCurrentPartitions());
    List<FSWindowDataManager> storageManagers = Lists.newLinkedList();
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
        storageManagers.add((FSWindowDataManager) p.getPartitionedInstance().getWindowDataManager());
    }
    Assert.assertEquals("count of storage managers", 2, storageManagers.size());
    int countOfDeleteManagers = 0;
    FSWindowDataManager deleteManager = null;
    for (FSWindowDataManager storageManager : storageManagers) {
        if (storageManager.getDeletedOperators() != null) {
            countOfDeleteManagers++;
            deleteManager = storageManager;
        }
    }
    Assert.assertEquals("count of delete managers", 1, countOfDeleteManagers);
    Assert.assertNotNull("deleted operators manager", deleteManager);
    Assert.assertEquals("deleted operators", Sets.newHashSet(7), deleteManager.getDeletedOperators());
}
Also used : Path(org.apache.hadoop.fs.Path) Partition(com.datatorrent.api.Partitioner.Partition) DefaultPartition(com.datatorrent.api.DefaultPartition) FSWindowDataManager(org.apache.apex.malhar.lib.wal.FSWindowDataManager) PartitioningContextImpl(org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) Test(org.junit.Test)

Example 8 with Partition

use of com.datatorrent.api.Partitioner.Partition in project apex-malhar by apache.

the class AbstractFileInputOperatorTest method testPartitioning.

@Test
public void testPartitioning() throws Exception {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    for (int file = 0; file < 4; file++) {
        FileUtils.write(new File(testMeta.dir, "partition00" + file), "");
    }
    List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
    partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = oper.definePartitions(partitions, new PartitioningContextImpl(null, 2));
    Assert.assertEquals(2, newPartitions.size());
    // partitioned() wasn't called
    Assert.assertEquals(1, oper.getCurrentPartitions());
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
        Assert.assertNotSame(oper, p.getPartitionedInstance());
        Assert.assertNotSame(oper.getScanner(), p.getPartitionedInstance().getScanner());
        Set<String> consumed = Sets.newHashSet();
        LinkedHashSet<Path> files = p.getPartitionedInstance().getScanner().scan(FileSystem.getLocal(new Configuration(false)), path, consumed);
        Assert.assertEquals("partition " + files, 3, files.size());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Partition(com.datatorrent.api.Partitioner.Partition) DefaultPartition(com.datatorrent.api.DefaultPartition) Configuration(org.apache.hadoop.conf.Configuration) PartitioningContextImpl(org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) Test(org.junit.Test)

Example 9 with Partition

use of com.datatorrent.api.Partitioner.Partition in project apex-malhar by apache.

the class AbstractFileInputOperatorTest method testWithCustomScanner.

/**
 * Partition the operator in 2
 * create ten files with index of the file at the start, i.e 1_file, 2_file .. etc.
 * The scanner returns this index from getPartition method.
 * each partition should read 5 files as file index are from 0 to 9 (including 0 and 9).
 * @throws Exception
 */
@Test
public void testWithCustomScanner() throws Exception {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.setScanner(new MyScanner());
    oper.getScanner().setFilePatternRegexp(".*partition_([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
    Random rand = new Random();
    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    for (int file = 0; file < 10; file++) {
        FileUtils.write(new File(testMeta.dir, file + "_partition_00" + rand.nextInt(100)), "");
    }
    List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
    partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = oper.definePartitions(partitions, new PartitioningContextImpl(null, 2));
    Assert.assertEquals(2, newPartitions.size());
    // partitioned() wasn't called
    Assert.assertEquals(1, oper.getCurrentPartitions());
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
        Assert.assertNotSame(oper, p.getPartitionedInstance());
        Assert.assertNotSame(oper.getScanner(), p.getPartitionedInstance().getScanner());
        Set<String> consumed = Sets.newHashSet();
        LinkedHashSet<Path> files = p.getPartitionedInstance().getScanner().scan(FileSystem.getLocal(new Configuration(false)), path, consumed);
        Assert.assertEquals("partition " + files, 6, files.size());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Partition(com.datatorrent.api.Partitioner.Partition) DefaultPartition(com.datatorrent.api.DefaultPartition) Configuration(org.apache.hadoop.conf.Configuration) Random(java.util.Random) PartitioningContextImpl(org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) Test(org.junit.Test)

Example 10 with Partition

use of com.datatorrent.api.Partitioner.Partition in project apex-core by apache.

the class PhysicalPlan method redoPartitions.

private void redoPartitions(PMapping currentMapping, String note) {
    Partitioner<Operator> partitioner = getPartitioner(currentMapping);
    if (partitioner == null) {
        LOG.warn("No partitioner for {}", currentMapping.logicalOperator);
        return;
    }
    RepartitionContext mainPC = new RepartitionContext(partitioner, currentMapping, 0);
    if (mainPC.newPartitions.isEmpty()) {
        LOG.warn("Empty partition list after repartition: {}", currentMapping.logicalOperator);
        return;
    }
    int memoryPerPartition = currentMapping.logicalOperator.getValue(OperatorContext.MEMORY_MB);
    for (Map.Entry<OutputPortMeta, StreamMeta> stream : currentMapping.logicalOperator.getOutputStreams().entrySet()) {
        if (stream.getValue().getLocality() != Locality.THREAD_LOCAL && stream.getValue().getLocality() != Locality.CONTAINER_LOCAL) {
            memoryPerPartition += stream.getKey().getValue(PortContext.BUFFER_MEMORY_MB);
        }
    }
    for (OperatorMeta pp : currentMapping.parallelPartitions) {
        for (Map.Entry<OutputPortMeta, StreamMeta> stream : pp.getOutputStreams().entrySet()) {
            if (stream.getValue().getLocality() != Locality.THREAD_LOCAL && stream.getValue().getLocality() != Locality.CONTAINER_LOCAL) {
                memoryPerPartition += stream.getKey().getValue(PortContext.BUFFER_MEMORY_MB);
            }
        }
        memoryPerPartition += pp.getValue(OperatorContext.MEMORY_MB);
    }
    int requiredMemoryMB = (mainPC.newPartitions.size() - mainPC.currentPartitions.size()) * memoryPerPartition;
    if (requiredMemoryMB > availableMemoryMB) {
        LOG.warn("Insufficient headroom for repartitioning: available {}m required {}m", availableMemoryMB, requiredMemoryMB);
        return;
    }
    List<Partition<Operator>> addedPartitions = new ArrayList<>();
    // determine modifications of partition set, identify affected operator instance(s)
    for (Partition<Operator> newPartition : mainPC.newPartitions) {
        PTOperator op = mainPC.currentPartitionMap.remove(newPartition);
        if (op == null) {
            addedPartitions.add(newPartition);
        } else {
            // check whether mapping was changed
            for (DefaultPartition<Operator> pi : mainPC.currentPartitions) {
                if (pi == newPartition && pi.isModified()) {
                    // existing partition changed (operator or partition keys)
                    // remove/add to update subscribers and state
                    mainPC.currentPartitionMap.put(newPartition, op);
                    addedPartitions.add(newPartition);
                }
            }
        }
    }
    // remaining entries represent deprecated partitions
    this.undeployOpers.addAll(mainPC.currentPartitionMap.values());
    // downstream dependencies require redeploy, resolve prior to modifying plan
    Set<PTOperator> deps = this.getDependents(mainPC.currentPartitionMap.values());
    this.undeployOpers.addAll(deps);
    // dependencies need redeploy, except operators excluded in remove
    this.deployOpers.addAll(deps);
    // process parallel partitions before removing operators from the plan
    LinkedHashMap<PMapping, RepartitionContext> partitionContexts = Maps.newLinkedHashMap();
    Stack<OperatorMeta> parallelPartitions = new Stack<>();
    parallelPartitions.addAll(currentMapping.parallelPartitions);
    pendingLoop: while (!parallelPartitions.isEmpty()) {
        OperatorMeta ppMeta = parallelPartitions.pop();
        for (StreamMeta s : ppMeta.getInputStreams().values()) {
            if (currentMapping.parallelPartitions.contains(s.getSource().getOperatorMeta()) && parallelPartitions.contains(s.getSource().getOperatorMeta())) {
                parallelPartitions.push(ppMeta);
                parallelPartitions.remove(s.getSource().getOperatorMeta());
                parallelPartitions.push(s.getSource().getOperatorMeta());
                continue pendingLoop;
            }
        }
        LOG.debug("Processing parallel partition {}", ppMeta);
        PMapping ppm = this.logicalToPTOperator.get(ppMeta);
        Partitioner<Operator> ppp = getPartitioner(ppm);
        if (ppp == null) {
            partitionContexts.put(ppm, null);
        } else {
            RepartitionContext pc = new RepartitionContext(ppp, ppm, mainPC.newPartitions.size());
            if (pc.newPartitions == null) {
                throw new IllegalStateException("Partitioner returns null for parallel partition " + ppm.logicalOperator);
            }
            partitionContexts.put(ppm, pc);
        }
    }
    // plan updates start here, after all changes were identified
    // remove obsolete operators first, any freed resources
    // can subsequently be used for new/modified partitions
    List<PTOperator> copyPartitions = Lists.newArrayList(currentMapping.partitions);
    // remove deprecated partitions from plan
    for (PTOperator p : mainPC.currentPartitionMap.values()) {
        copyPartitions.remove(p);
        removePartition(p, currentMapping);
        mainPC.operatorIdToPartition.remove(p.getId());
    }
    currentMapping.partitions = copyPartitions;
    // add new operators
    for (Partition<Operator> newPartition : addedPartitions) {
        PTOperator p = addPTOperator(currentMapping, newPartition, mainPC.minCheckpoint);
        mainPC.operatorIdToPartition.put(p.getId(), newPartition);
    }
    // process parallel partition changes
    for (Map.Entry<PMapping, RepartitionContext> e : partitionContexts.entrySet()) {
        if (e.getValue() == null) {
            // no partitioner, add required operators
            for (int i = 0; i < addedPartitions.size(); i++) {
                LOG.debug("Automatically adding to parallel partition {}", e.getKey());
                // set activation windowId to confirm to upstream checkpoints
                addPTOperator(e.getKey(), null, mainPC.minCheckpoint);
            }
        } else {
            RepartitionContext pc = e.getValue();
            // track previous parallel partition mapping
            Map<Partition<Operator>, Partition<Operator>> prevMapping = Maps.newHashMap();
            for (int i = 0; i < mainPC.currentPartitions.size(); i++) {
                prevMapping.put(pc.currentPartitions.get(i), mainPC.currentPartitions.get(i));
            }
            // determine which new partitions match upstream, remaining to be treated as new operator
            Map<Partition<Operator>, Partition<Operator>> newMapping = Maps.newHashMap();
            Iterator<Partition<Operator>> itMain = mainPC.newPartitions.iterator();
            Iterator<Partition<Operator>> itParallel = pc.newPartitions.iterator();
            while (itMain.hasNext() && itParallel.hasNext()) {
                newMapping.put(itParallel.next(), itMain.next());
            }
            for (Partition<Operator> newPartition : pc.newPartitions) {
                PTOperator op = pc.currentPartitionMap.remove(newPartition);
                if (op == null) {
                    pc.addedPartitions.add(newPartition);
                } else if (prevMapping.get(newPartition) != newMapping.get(newPartition)) {
                    // upstream partitions don't match, remove/add to replace with new operator
                    pc.currentPartitionMap.put(newPartition, op);
                    pc.addedPartitions.add(newPartition);
                } else {
                    // check whether mapping was changed - based on DefaultPartition implementation
                    for (DefaultPartition<Operator> pi : pc.currentPartitions) {
                        if (pi == newPartition && pi.isModified()) {
                            // existing partition changed (operator or partition keys)
                            // remove/add to update subscribers and state
                            mainPC.currentPartitionMap.put(newPartition, op);
                            pc.addedPartitions.add(newPartition);
                        }
                    }
                }
            }
            if (!pc.currentPartitionMap.isEmpty()) {
                // remove obsolete partitions
                List<PTOperator> cowPartitions = Lists.newArrayList(e.getKey().partitions);
                for (PTOperator p : pc.currentPartitionMap.values()) {
                    cowPartitions.remove(p);
                    removePartition(p, e.getKey());
                    pc.operatorIdToPartition.remove(p.getId());
                }
                e.getKey().partitions = cowPartitions;
            }
            // add new partitions
            for (Partition<Operator> newPartition : pc.addedPartitions) {
                PTOperator oper = addPTOperator(e.getKey(), newPartition, mainPC.minCheckpoint);
                pc.operatorIdToPartition.put(oper.getId(), newPartition);
            }
            getPartitioner(e.getKey()).partitioned(pc.operatorIdToPartition);
        }
    }
    updateStreamMappings(currentMapping);
    for (PMapping pp : partitionContexts.keySet()) {
        updateStreamMappings(pp);
    }
    deployChanges();
    if (mainPC.currentPartitions.size() != mainPC.newPartitions.size()) {
        StramEvent ev = new StramEvent.PartitionEvent(currentMapping.logicalOperator.getName(), mainPC.currentPartitions.size(), mainPC.newPartitions.size());
        ev.setReason(note);
        this.ctx.recordEventAsync(ev);
    }
    partitioner.partitioned(mainPC.operatorIdToPartition);
}
Also used : Operator(com.datatorrent.api.Operator) StramEvent(com.datatorrent.stram.api.StramEvent) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) ArrayList(java.util.ArrayList) StreamMeta(com.datatorrent.stram.plan.logical.LogicalPlan.StreamMeta) OutputPortMeta(com.datatorrent.stram.plan.logical.LogicalPlan.OutputPortMeta) Partitioner(com.datatorrent.api.Partitioner) Partition(com.datatorrent.api.Partitioner.Partition) DefaultPartition(com.datatorrent.api.DefaultPartition) OperatorMeta(com.datatorrent.stram.plan.logical.LogicalPlan.OperatorMeta) Checkpoint(com.datatorrent.stram.api.Checkpoint) Stack(java.util.Stack) DefaultPartition(com.datatorrent.api.DefaultPartition) Map(java.util.Map) HashMap(java.util.HashMap) ConcurrentMap(java.util.concurrent.ConcurrentMap) LinkedHashMap(java.util.LinkedHashMap)

Aggregations

Partition (com.datatorrent.api.Partitioner.Partition)11 DefaultPartition (com.datatorrent.api.DefaultPartition)9 Test (org.junit.Test)8 PartitioningContextImpl (org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl)7 File (java.io.File)6 LineByLineFileInputOperator (org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator)6 Path (org.apache.hadoop.fs.Path)6 StatsListener (com.datatorrent.api.StatsListener)4 ArrayList (java.util.ArrayList)4 CollectorTestSink (org.apache.apex.malhar.lib.testbench.CollectorTestSink)4 Operator (com.datatorrent.api.Operator)3 Checkpoint (com.datatorrent.stram.api.Checkpoint)3 Kryo (com.esotericsoftware.kryo.Kryo)3 Partitioner (com.datatorrent.api.Partitioner)2 PartitionKeys (com.datatorrent.api.Partitioner.PartitionKeys)2 HashMap (java.util.HashMap)2 CopyOnWriteArrayList (java.util.concurrent.CopyOnWriteArrayList)2 Configuration (org.apache.hadoop.conf.Configuration)2 CouchbaseClient (com.couchbase.client.CouchbaseClient)1 CouchbaseConnectionFactoryBuilder (com.couchbase.client.CouchbaseConnectionFactoryBuilder)1