Search in sources :

Example 1 with RestorableIterator

use of edu.iu.dsc.tws.comms.shuffle.RestorableIterator in project twister2 by DSC-SPIDAL.

the class SortJoinUtils method outerJoin.

/**
 * Performs a streaming (disk based) sort-merge <em>outer</em> join over two iterators.
 *
 * <p>Tuples that compare equal are emitted as a {@link JoinedTuple} carrying both values. A
 * tuple without a partner is emitted with {@code null} in place of the missing value, but only
 * when {@code outerJoinType} includes its side ({@code includeLeft()} / {@code includeRight()}).
 * This also applies to the tuples still pending on one side once the other side is exhausted.
 *
 * <p>NOTE(review): unlike {@code innerJoin}, no restore points are created here. When both
 * sides hold several tuples that compare equal, only the pairs (l_i, r_1) and (l_1, r_j) are
 * produced, not the full cross product - confirm whether callers rely on unique keys.
 *
 * @param leftIt left relation; assumed to be ordered consistently with {@code comparator}
 * @param rightIt right relation; assumed to be ordered consistently with {@code comparator}
 * @param comparator orders two tuples (presumably by key - verify against KeyComparatorWrapper)
 * @param outerJoinType decides which unmatched side(s) are emitted
 * @return iterator over the joined tuples; its {@code next()} returns {@code null} once
 *     {@code hasNext()} is {@code false}
 */
public static Iterator<JoinedTuple> outerJoin(RestorableIterator<Tuple<?, ?>> leftIt, RestorableIterator<Tuple<?, ?>> rightIt, KeyComparatorWrapper comparator, CommunicationContext.JoinType outerJoinType) {
    return new Iterator<JoinedTuple>() {

        // tuple handed out by the next call to next(); null means the join is finished
        private JoinedTuple nextJoinTuple;

        private Tuple currentLeft;

        private Tuple currentRight;

        // a tuple that was already pulled from its iterator but has not been processed yet
        private Tuple backedUpLeft;

        private Tuple backedUpRight;

        // flags to mark the required side of iteration
        private boolean shouldDoLeftIterations = false;

        private boolean shouldDoRightIterations = false;

        /**
         * Continues a match run on the left side: pairs the next left tuple with the fixed
         * {@code currentRight}. A left tuple that does not match is parked in
         * {@code backedUpLeft}. When the run ends, the right side iteration is scheduled.
         */
        private JoinedTuple doLeftIteration() {
            if (!shouldDoLeftIterations) {
                return null;
            }
            JoinedTuple jtFromLeftIt = null;
            if (leftIt.hasNext()) {
                Tuple l = leftIt.next();
                if (comparator.compare(l, this.currentRight) == 0) {
                    jtFromLeftIt = new JoinedTuple<>(l.getKey(), l.getValue(), this.currentRight.getValue());
                } else {
                    this.backedUpLeft = l;
                }
            }
            // end of the left run (nothing produced): hand over to the right side iteration
            if (jtFromLeftIt == null) {
                this.shouldDoLeftIterations = false;
                this.shouldDoRightIterations = true;
            }
            return jtFromLeftIt;
        }

        /**
         * Continues a match run on the right side: pairs the next right tuple with the fixed
         * {@code currentLeft}. A right tuple that does not match is parked in
         * {@code backedUpRight}.
         */
        private JoinedTuple doRightIteration() {
            if (!shouldDoRightIterations) {
                return null;
            }
            JoinedTuple jtFromRightIt = null;
            if (rightIt.hasNext()) {
                Tuple r = rightIt.next();
                if (comparator.compare(this.currentLeft, r) == 0) {
                    jtFromRightIt = new JoinedTuple<>(r.getKey(), this.currentLeft.getValue(), r.getValue());
                } else {
                    this.backedUpRight = r;
                }
            }
            // end of the right run (nothing produced): the merge loop takes over again
            if (jtFromRightIt == null) {
                this.shouldDoRightIterations = false;
            }
            return jtFromRightIt;
        }

        /**
         * Emits the next tuple that is left over on one side after the other side ran dry.
         * Must only be called when the merge loop has stopped without producing a tuple, i.e.
         * when at least one side is exhausted, so whatever is pending has no partner.
         */
        private JoinedTuple nextUnmatchedTuple() {
            if (outerJoinType.includeLeft() && (this.backedUpLeft != null || leftIt.hasNext())) {
                Tuple left = this.backedUpLeft != null ? this.backedUpLeft : leftIt.next();
                this.backedUpLeft = null;
                return new JoinedTuple<>(left.getKey(), left.getValue(), null);
            }
            if (outerJoinType.includeRight() && (this.backedUpRight != null || rightIt.hasNext())) {
                Tuple right = this.backedUpRight != null ? this.backedUpRight : rightIt.next();
                this.backedUpRight = null;
                return new JoinedTuple<>(right.getKey(), null, right.getValue());
            }
            return null;
        }

        private void makeNextJoinTuple() {
            nextJoinTuple = this.doLeftIteration();
            if (nextJoinTuple == null) {
                nextJoinTuple = this.doRightIteration();
            }
            while (nextJoinTuple == null && (this.backedUpLeft != null || leftIt.hasNext()) && (this.backedUpRight != null || rightIt.hasNext())) {
                this.currentLeft = this.backedUpLeft != null ? this.backedUpLeft : leftIt.next();
                // we used the backup, so setting to null
                this.backedUpLeft = null;
                this.currentRight = this.backedUpRight != null ? this.backedUpRight : rightIt.next();
                this.backedUpRight = null;
                // still we don't need left or right iterations at this point
                this.shouldDoLeftIterations = false;
                this.shouldDoRightIterations = false;
                // compare once and branch on the result
                int order = comparator.compare(this.currentLeft, this.currentRight);
                if (order == 0) {
                    this.nextJoinTuple = new JoinedTuple<>(this.currentLeft.getKey(), this.currentLeft.getValue(), this.currentRight.getValue());
                    // schedule to run the left iteration next.
                    // Left iteration at the end will schedule right iteration
                    this.shouldDoLeftIterations = true;
                    break;
                } else if (order < 0) {
                    // left tuple has no partner: emit it if requested, then advance left only
                    if (outerJoinType.includeLeft()) {
                        this.nextJoinTuple = new JoinedTuple<>(this.currentLeft.getKey(), this.currentLeft.getValue(), null);
                    }
                    if (leftIt.hasNext()) {
                        this.backedUpLeft = leftIt.next();
                    }
                    this.backedUpRight = this.currentRight;
                } else {
                    // right tuple has no partner: emit it if requested, then advance right only
                    if (outerJoinType.includeRight()) {
                        this.nextJoinTuple = new JoinedTuple<>(this.currentRight.getKey(), null, this.currentRight.getValue());
                    }
                    if (rightIt.hasNext()) {
                        this.backedUpRight = rightIt.next();
                    }
                    this.backedUpLeft = this.currentLeft;
                }
            }
            // The loop above stops as soon as one side is exhausted. Whatever is still pending
            // on the other side cannot match any more and has to be emitted as unmatched.
            if (nextJoinTuple == null) {
                nextJoinTuple = this.nextUnmatchedTuple();
            }
        }

        {
            // start by creating the first join tuple
            this.makeNextJoinTuple();
        }

        @Override
        public boolean hasNext() {
            return nextJoinTuple != null;
        }

        @Override
        public JoinedTuple next() {
            JoinedTuple current = nextJoinTuple;
            this.makeNextJoinTuple();
            return current;
        }
    };
}
Also used : RestorableIterator(edu.iu.dsc.tws.comms.shuffle.RestorableIterator) Iterator(java.util.Iterator) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Example 2 with RestorableIterator

use of edu.iu.dsc.tws.comms.shuffle.RestorableIterator in project twister2 by DSC-SPIDAL.

the class SortJoinUtils method innerJoin.

/**
 * Performs a streaming (disk based) sort-merge inner join over two iterators.
 *
 * <p>Only tuples that compare equal under {@code comparator} are emitted. When a match is found
 * the left side is iterated first while the right tuple is kept fixed, then the right side is
 * iterated while the left tuple is kept fixed. Restore points on both iterators are used to go
 * back after each of those runs, so the remaining pairs of the same run are visited by later
 * rounds of the merge loop.
 *
 * @param leftIt left relation; assumed to be ordered consistently with {@code comparator}
 * @param rightIt right relation; assumed to be ordered consistently with {@code comparator}
 * @param comparator orders two tuples (presumably by key - verify against KeyComparatorWrapper)
 * @return iterator over the joined tuples; its {@code next()} returns {@code null} once
 *     {@code hasNext()} is {@code false}
 */
public static Iterator<JoinedTuple> innerJoin(RestorableIterator<Tuple<?, ?>> leftIt, RestorableIterator<Tuple<?, ?>> rightIt, KeyComparatorWrapper comparator) {
    return new Iterator<JoinedTuple>() {

        // tuple handed out by the next call to next(); null means the join is finished
        private JoinedTuple nextJoinTuple;

        private Tuple currentLeft;

        private Tuple currentRight;

        // backup variables hold a Tuple temporarily when .next() had to be called
        // once during the join operation before creating an iterator restore point.
        private Tuple backedUpLeft;

        private Tuple backedUpRight;

        // flags to mark the required side of iteration
        private boolean shouldDoLeftIterations = false;

        private boolean shouldDoRightIterations = false;

        // keeps the no of iterations done on each side of the relationship while keeping the
        // other side constant
        private int leftIterations = 0;

        private int rightIterations = 0;

        /**
         * Continues a match run on the left side: pairs the next left tuple with the fixed
         * {@code currentRight}. The first tuple read in a run is remembered in
         * {@code backedUpLeft}; if it matches, a restore point is created right after it.
         */
        private JoinedTuple doLeftIteration() {
            if (!shouldDoLeftIterations) {
                return null;
            }
            JoinedTuple jtFromLeftIt = null;
            if (leftIt.hasNext()) {
                Tuple l = leftIt.next();
                // first tuple of the run becomes the starting left tuple of the next merge round
                if (this.leftIterations == 0) {
                    this.backedUpLeft = l;
                }
                if (comparator.compare(l, this.currentRight) == 0) {
                    if (this.leftIterations == 0) {
                        leftIt.createRestorePoint();
                    }
                    this.leftIterations++;
                    jtFromLeftIt = new JoinedTuple<>(l.getKey(), l.getValue(), this.currentRight.getValue());
                }
            }
            /*
         if this is the end of left iteration(jtFromLeftIt == null), configure the right iterations
         to run next and restore left iterator
         (restore() presumably rewinds to the position at createRestorePoint() - confirm
         against RestorableIterator)
        */
            if (jtFromLeftIt == null) {
                this.leftIterations = 0;
                this.shouldDoLeftIterations = false;
                this.shouldDoRightIterations = true;
                if (leftIt.hasRestorePoint()) {
                    leftIt.restore();
                    leftIt.clearRestorePoint();
                }
            }
            return jtFromLeftIt;
        }

        /**
         * Continues a match run on the right side: pairs the next right tuple with the fixed
         * {@code currentLeft}. The first tuple read in a run is remembered in
         * {@code backedUpRight}; if it matches, a restore point is created right after it.
         */
        private JoinedTuple doRightIteration() {
            if (!shouldDoRightIterations) {
                return null;
            }
            JoinedTuple jtFromRightIt = null;
            if (rightIt.hasNext()) {
                Tuple l = rightIt.next();
                // first tuple of the run becomes the starting right tuple of the next merge round
                if (this.rightIterations == 0) {
                    this.backedUpRight = l;
                }
                if (comparator.compare(this.currentLeft, l) == 0) {
                    if (this.rightIterations == 0) {
                        rightIt.createRestorePoint();
                    }
                    this.rightIterations++;
                    jtFromRightIt = new JoinedTuple<>(l.getKey(), this.currentLeft.getValue(), l.getValue());
                }
            }
            /*
         if this is the end of right iteration(jtFromRightIt == null), stop the right iterations
         and restore the right iterator; the merge loop takes over from the backed up tuples
        */
            if (jtFromRightIt == null) {
                this.rightIterations = 0;
                this.shouldDoRightIterations = false;
                if (rightIt.hasRestorePoint()) {
                    rightIt.restore();
                    rightIt.clearRestorePoint();
                }
            }
            return jtFromRightIt;
        }

        /**
         * Computes the tuple returned by the next call to {@code next()}: first an ongoing left
         * run, then an ongoing right run, and finally the merge loop that advances whichever
         * side compares smaller until a match is found or one side is exhausted.
         */
        private void makeNextJoinTuple() {
            nextJoinTuple = this.doLeftIteration();
            if (nextJoinTuple == null) {
                nextJoinTuple = this.doRightIteration();
            }
            while (nextJoinTuple == null && (this.backedUpLeft != null || leftIt.hasNext()) && (this.backedUpRight != null || rightIt.hasNext())) {
                this.currentLeft = this.backedUpLeft != null ? this.backedUpLeft : leftIt.next();
                // we used the backup, so setting to null
                this.backedUpLeft = null;
                this.currentRight = this.backedUpRight != null ? this.backedUpRight : rightIt.next();
                this.backedUpRight = null;
                // still we don't need left or right iterations at this point
                this.shouldDoLeftIterations = false;
                this.shouldDoRightIterations = false;
                if (comparator.compare(this.currentLeft, this.currentRight) == 0) {
                    this.nextJoinTuple = new JoinedTuple<>(this.currentLeft.getKey(), this.currentLeft.getValue(), this.currentRight.getValue());
                    // schedule to run the left iteration next.
                    // Left iteration at the end will schedule right iteration
                    this.shouldDoLeftIterations = true;
                    break;
                } else if (comparator.compare(this.currentLeft, this.currentRight) < 0) {
                    // left is behind: advance left only and keep the current right for the next round
                    if (leftIt.hasNext()) {
                        this.backedUpLeft = leftIt.next();
                    }
                    this.backedUpRight = this.currentRight;
                } else {
                    // right is behind: advance right only and keep the current left for the next round
                    if (rightIt.hasNext()) {
                        this.backedUpRight = rightIt.next();
                    }
                    this.backedUpLeft = this.currentLeft;
                }
            }
        }

        {
            // start by creating the first join tuple
            this.makeNextJoinTuple();
        }

        @Override
        public boolean hasNext() {
            return nextJoinTuple != null;
        }

        @Override
        public JoinedTuple next() {
            JoinedTuple current = nextJoinTuple;
            this.makeNextJoinTuple();
            return current;
        }
    };
}
Also used : RestorableIterator(edu.iu.dsc.tws.comms.shuffle.RestorableIterator) Iterator(java.util.Iterator) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Example 3 with RestorableIterator

use of edu.iu.dsc.tws.comms.shuffle.RestorableIterator in project twister2 by DSC-SPIDAL.

the class SortJoinUtilsTest method innerJoinWithDiskBasedListComparision.

/**
 * This test compares the results of in memory and disk based inner joins.
 * Purpose is to verify the accuracy of disk based inner join.
 *
 * <p>Both sides hold 100 random tuples with keys in [0, 10). Only the keys of the joined tuples
 * and the total number of results are compared, in key order.
 */
@Test
public void innerJoinWithDiskBasedListComparision() {
    List<Tuple> left = new ArrayList<>();
    List<Tuple> right = new ArrayList<>();
    Random random = new Random();
    for (int i = 0; i < 100; i++) {
        left.add(Tuple.of(random.nextInt(10), random.nextInt()));
        right.add(Tuple.of(random.nextInt(10), random.nextInt()));
    }
    FSKeyedSortedMerger2 fsk1 = new FSKeyedSortedMerger2(10, 100, "/tmp", "op-1-" + UUID.randomUUID().toString(), MessageTypes.INTEGER, MessageTypes.INTEGER, (Comparator<Integer>) Integer::compare, 0, false, 1);
    for (Tuple tuple : left) {
        byte[] data = MessageTypes.INTEGER.getDataPacker().packToByteArray((Integer) tuple.getValue());
        fsk1.add(tuple.getKey(), data, data.length);
        fsk1.run();
    }
    FSKeyedSortedMerger2 fsk2 = new FSKeyedSortedMerger2(10, 100, "/tmp", "op-2-" + UUID.randomUUID().toString(), MessageTypes.INTEGER, MessageTypes.INTEGER, (Comparator<Integer>) Integer::compare, 0, false, 1);
    for (Tuple tuple : right) {
        byte[] data = MessageTypes.INTEGER.getDataPacker().packToByteArray((Integer) tuple.getValue());
        fsk2.add(tuple.getKey(), data, data.length);
        fsk2.run();
    }
    CommonThreadPool.init(Config.newBuilder().build());
    fsk1.switchToReading();
    fsk2.switchToReading();
    // result under test: disk based join
    Iterator iterator = SortJoinUtils.joinWithCache((RestorableIterator) fsk1.readIterator(), (RestorableIterator) fsk2.readIterator(), new KeyComparatorWrapper((Comparator<Integer>) Integer::compare), CommunicationContext.JoinType.INNER, Config.newBuilder().build());
    // reference result: in-memory join, sorted by key so both results can be walked in parallel
    List<Object> objects = SortJoinUtils.innerJoin(left, right, new KeyComparatorWrapper(Comparator.naturalOrder()));
    objects.sort(Comparator.comparingInt(o -> (Integer) ((JoinedTuple) o).getKey()));
    int i = 0;
    while (iterator.hasNext()) {
        JoinedTuple nextFromIt = (JoinedTuple) iterator.next();
        // fail with a readable message instead of an IndexOutOfBoundsException
        Assert.assertTrue("disk based join produced more tuples than the in-memory join", i < objects.size());
        JoinedTuple nextFromList = (JoinedTuple) objects.get(i++);
        // JUnit expects (expected, actual): the in-memory result is the reference
        Assert.assertEquals(nextFromList.getKey(), nextFromIt.getKey());
    }
    Assert.assertEquals(objects.size(), i);
}
Also used : CommunicationContext(edu.iu.dsc.tws.api.comms.CommunicationContext) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) FSKeyedSortedMerger2(edu.iu.dsc.tws.comms.shuffle.FSKeyedSortedMerger2) RestorableIterator(edu.iu.dsc.tws.comms.shuffle.RestorableIterator) Iterator(java.util.Iterator) CommonThreadPool(edu.iu.dsc.tws.api.util.CommonThreadPool) Random(java.util.Random) Test(org.junit.Test) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Config(edu.iu.dsc.tws.api.config.Config) UUID(java.util.UUID) MessageTypes(edu.iu.dsc.tws.api.comms.messaging.types.MessageTypes) Logger(java.util.logging.Logger) ArrayList(java.util.ArrayList) List(java.util.List) Comparator(java.util.Comparator) Assert(org.junit.Assert) FSKeyedSortedMerger2(edu.iu.dsc.tws.comms.shuffle.FSKeyedSortedMerger2) ArrayList(java.util.ArrayList) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Comparator(java.util.Comparator) Random(java.util.Random) RestorableIterator(edu.iu.dsc.tws.comms.shuffle.RestorableIterator) Iterator(java.util.Iterator) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Test(org.junit.Test)

Example 4 with RestorableIterator

use of edu.iu.dsc.tws.comms.shuffle.RestorableIterator in project twister2 by DSC-SPIDAL.

the class SortJoinUtilsTest method leftOuterJoinComparision.

/**
 * This test compares the results of in memory and disk based left outer joins.
 * Purpose is to verify the accuracy of disk based left outer join.
 *
 * <p>Both sides hold 100 random tuples with keys in [0, 10). Only the keys of the joined tuples
 * and the total number of results are compared, in key order.
 */
@Test
public void leftOuterJoinComparision() {
    List<Tuple> left = new ArrayList<>();
    List<Tuple> right = new ArrayList<>();
    Random random = new Random();
    for (int i = 0; i < 100; i++) {
        left.add(Tuple.of(random.nextInt(10), random.nextInt()));
        right.add(Tuple.of(random.nextInt(10), random.nextInt()));
    }
    FSKeyedSortedMerger2 fsk1 = new FSKeyedSortedMerger2(10, 100, "/tmp", "op-1-" + UUID.randomUUID().toString(), MessageTypes.INTEGER, MessageTypes.INTEGER, (Comparator<Integer>) Integer::compare, 0, false, 1);
    for (Tuple tuple : left) {
        byte[] data = MessageTypes.INTEGER.getDataPacker().packToByteArray((Integer) tuple.getValue());
        fsk1.add(tuple.getKey(), data, data.length);
        fsk1.run();
    }
    FSKeyedSortedMerger2 fsk2 = new FSKeyedSortedMerger2(10, 100, "/tmp", "op-2-" + UUID.randomUUID().toString(), MessageTypes.INTEGER, MessageTypes.INTEGER, (Comparator<Integer>) Integer::compare, 0, false, 1);
    for (Tuple tuple : right) {
        byte[] data = MessageTypes.INTEGER.getDataPacker().packToByteArray((Integer) tuple.getValue());
        fsk2.add(tuple.getKey(), data, data.length);
        fsk2.run();
    }
    CommonThreadPool.init(Config.newBuilder().build());
    fsk1.switchToReading();
    fsk2.switchToReading();
    // result under test: disk based left outer join
    Iterator iterator = SortJoinUtils.leftOuterJoin((RestorableIterator) fsk1.readIterator(), (RestorableIterator) fsk2.readIterator(), new KeyComparatorWrapper((Comparator<Integer>) Integer::compare));
    // reference result: in-memory join, sorted by key so both results can be walked in parallel
    List<Object> objects = SortJoinUtils.leftOuterJoin(left, right, new KeyComparatorWrapper(Comparator.naturalOrder()));
    objects.sort(Comparator.comparingInt(o -> (Integer) ((JoinedTuple) o).getKey()));
    int i = 0;
    while (iterator.hasNext()) {
        JoinedTuple nextFromIt = (JoinedTuple) iterator.next();
        // fail with a readable message instead of an IndexOutOfBoundsException
        Assert.assertTrue("disk based join produced more tuples than the in-memory join", i < objects.size());
        JoinedTuple nextFromList = (JoinedTuple) objects.get(i++);
        // JUnit expects (expected, actual): the in-memory result is the reference
        Assert.assertEquals(nextFromList.getKey(), nextFromIt.getKey());
    }
    Assert.assertEquals(objects.size(), i);
}
Also used : CommunicationContext(edu.iu.dsc.tws.api.comms.CommunicationContext) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) FSKeyedSortedMerger2(edu.iu.dsc.tws.comms.shuffle.FSKeyedSortedMerger2) RestorableIterator(edu.iu.dsc.tws.comms.shuffle.RestorableIterator) Iterator(java.util.Iterator) CommonThreadPool(edu.iu.dsc.tws.api.util.CommonThreadPool) Random(java.util.Random) Test(org.junit.Test) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Config(edu.iu.dsc.tws.api.config.Config) UUID(java.util.UUID) MessageTypes(edu.iu.dsc.tws.api.comms.messaging.types.MessageTypes) Logger(java.util.logging.Logger) ArrayList(java.util.ArrayList) List(java.util.List) Comparator(java.util.Comparator) Assert(org.junit.Assert) FSKeyedSortedMerger2(edu.iu.dsc.tws.comms.shuffle.FSKeyedSortedMerger2) ArrayList(java.util.ArrayList) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Comparator(java.util.Comparator) Random(java.util.Random) RestorableIterator(edu.iu.dsc.tws.comms.shuffle.RestorableIterator) Iterator(java.util.Iterator) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Test(org.junit.Test)

Example 5 with RestorableIterator

use of edu.iu.dsc.tws.comms.shuffle.RestorableIterator in project twister2 by DSC-SPIDAL.

the class SortJoinUtils method joinWithCache.

/**
 * This method avoids having to scan back and forth over the files by reading the data iterators
 * once and backing them up into a {@link DiskBasedList}, which has a memory buffer.
 *
 * <p>The inputs are consumed chunk by chunk: a run of left tuples is copied into one list, the
 * right tuples that do not compare greater than the last left tuple of that run are copied into
 * another, and the two lists are joined with {@code join(...)}. When a chunk's join is
 * exhausted the next chunk is loaded.
 *
 * <p>NOTE(review): every call registers a JVM shutdown hook that is never removed; the hook
 * keeps all lists created by this iterator reachable until the JVM exits.
 *
 * @param leftIt left relation; assumed to be ordered consistently with {@code comparator}
 * @param rightIt right relation; assumed to be ordered consistently with {@code comparator}
 * @param comparator orders two tuples (presumably by key - verify against KeyComparatorWrapper)
 * @param joinType join type handed to {@code join(...)} for every chunk
 * @param config used to size the chunks and to create the {@link DiskBasedList}s
 * @return iterator over the joined tuples of all chunks
 */
public static Iterator<JoinedTuple> joinWithCache(RestorableIterator<Tuple<?, ?>> leftIt, RestorableIterator<Tuple<?, ?>> rightIt, KeyComparatorWrapper comparator, CommunicationContext.JoinType joinType, Config config) {
    LOG.info("Performing join with cache....");
    return new Iterator<JoinedTuple>() {

        // lists of already processed chunks, kept so that the shutdown hook can clear them
        private final List<DiskBasedList> oldLists = new ArrayList<>();

        private DiskBasedList leftList;

        private DiskBasedList rightList;

        // if we had to call next() to check the next tuple, these variables can be used to keep them
        private Tuple leftBackup;

        private Tuple rightBackup;

        // join over the chunk that is currently loaded
        private Iterator<JoinedTuple> localJoinIterator;

        /**
         * Advances two iterators by reading onto memory
         *
         * @return true if advance() should be called again
         */
        private boolean advance() {
            if (this.leftList != null) {
                this.leftList.dispose();
                this.oldLists.add(this.leftList);
            }
            if (this.rightList != null) {
                this.rightList.dispose();
                this.oldLists.add(this.rightList);
            }
            long maxRecordsInMemory = CommunicationContext.getShuffleMaxRecordsInMemory(config) / 2;
            // the previous lists stay referenced from oldLists until the shutdown hook clears them
            this.leftList = new DiskBasedList(config, MessageTypes.OBJECT);
            this.rightList = new DiskBasedList(config, MessageTypes.OBJECT);
            // last left tuple accepted into this chunk; upper bound for the right side
            Tuple currentTuple = null;
            // read from left iterator
            while (leftIt.hasNext() || this.leftBackup != null) {
                Tuple<?, ?> nextLeft = this.leftBackup != null ? this.leftBackup : leftIt.next();
                // we used the backup
                this.leftBackup = null;
                if (currentTuple == null) {
                    currentTuple = nextLeft;
                }
                // compare once and branch on the result
                int order = comparator.compare(currentTuple, nextLeft);
                if (order == 0) {
                    // tuples comparing equal always stay in the same chunk, even past the size limit
                    this.leftList.add(nextLeft);
                } else if (order < 0 && this.leftList.size() < maxRecordsInMemory) {
                    currentTuple = nextLeft;
                    this.leftList.add(nextLeft);
                } else {
                    // chunk is full (or input is out of order): keep the tuple for the next chunk
                    this.leftBackup = nextLeft;
                    break;
                }
            }
            // read from right iterator
            while (rightIt.hasNext() || this.rightBackup != null) {
                Tuple<?, ?> nextRight = this.rightBackup != null ? this.rightBackup : rightIt.next();
                this.rightBackup = null;
                // left side contributed nothing: the first right tuple defines the chunk
                if (currentTuple == null) {
                    currentTuple = nextRight;
                }
                if (comparator.compare(currentTuple, nextRight) >= 0) {
                    this.rightList.add(nextRight);
                } else {
                    this.rightBackup = nextRight;
                    break;
                }
            }
            this.localJoinIterator = join(new ListBasedRestorableIterator(this.leftList), new ListBasedRestorableIterator(this.rightList), comparator, joinType);
            // if this chunk produced no join results but there is still data in the
            // data iterators, let's advance() again
            return !this.localJoinIterator.hasNext() && (leftBackup != null || rightBackup != null || leftIt.hasNext() || rightIt.hasNext());
        }

        // loads chunks until one yields at least one joined tuple or both inputs are drained
        private void callAdvanceIt() {
            boolean shouldCall = true;
            while (shouldCall) {
                shouldCall = this.advance();
            }
        }

        {
            this.callAdvanceIt();
            // add a shutdown hook to cleanup
            Runtime.getRuntime().addShutdownHook(new Thread() {

                // The work belongs in run(): the JVM start()s each hook and then joins it.
                // Overriding start() instead would execute the cleanup synchronously inside
                // the JVM's hook-launching call.
                @Override
                public void run() {
                    LOG.info("Cleaning up disk based caches used for join...");
                    for (DiskBasedList oldList : oldLists) {
                        oldList.clear();
                    }
                }
            });
        }

        @Override
        public boolean hasNext() {
            return this.localJoinIterator != null && this.localJoinIterator.hasNext();
        }

        @Override
        public JoinedTuple next() {
            JoinedTuple next = this.localJoinIterator.next();
            // current chunk is drained: load the next one so that hasNext() stays accurate
            if (!this.localJoinIterator.hasNext()) {
                this.callAdvanceIt();
            }
            return next;
        }
    };
}
Also used : RestorableIterator(edu.iu.dsc.tws.comms.shuffle.RestorableIterator) Iterator(java.util.Iterator) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) List(java.util.List) ArrayList(java.util.ArrayList) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Aggregations

RestorableIterator (edu.iu.dsc.tws.comms.shuffle.RestorableIterator)9 JoinedTuple (edu.iu.dsc.tws.api.comms.structs.JoinedTuple)8 Tuple (edu.iu.dsc.tws.api.comms.structs.Tuple)8 Iterator (java.util.Iterator)8 ArrayList (java.util.ArrayList)6 List (java.util.List)6 CommunicationContext (edu.iu.dsc.tws.api.comms.CommunicationContext)5 MessageTypes (edu.iu.dsc.tws.api.comms.messaging.types.MessageTypes)5 Config (edu.iu.dsc.tws.api.config.Config)5 CommonThreadPool (edu.iu.dsc.tws.api.util.CommonThreadPool)5 FSKeyedSortedMerger2 (edu.iu.dsc.tws.comms.shuffle.FSKeyedSortedMerger2)5 Comparator (java.util.Comparator)5 Random (java.util.Random)5 UUID (java.util.UUID)5 Logger (java.util.logging.Logger)5 Assert (org.junit.Assert)5 Test (org.junit.Test)5