Search in sources :

Example 11 with JoinedTuple

use of edu.iu.dsc.tws.api.comms.structs.JoinedTuple in project twister2 by DSC-SPIDAL.

the class SortJoinUtils method joinWithCache.

/**
 * Joins two tuple iterators while avoiding scanning back and forth over the underlying files:
 * each source iterator is read only once, and the tuples read are backed up into a pair of
 * {@link DiskBasedList}s (which have a memory buffer). Each pair of lists is then joined
 * locally and the result is served through the returned iterator.
 *
 * NOTE(review): the windowing logic in {@code advance()} appears to assume that both source
 * iterators yield tuples in the order defined by {@code comparator} - confirm against callers.
 *
 * @param leftIt iterator over the left relation
 * @param rightIt iterator over the right relation
 * @param comparator comparator used to compare tuples of both relations
 * @param joinType join type handed to the local {@code join(...)} call
 * @param config configuration used to size the windows and to create the disk based lists
 * @return an iterator over the joined tuples
 */
public static Iterator<JoinedTuple> joinWithCache(RestorableIterator<Tuple<?, ?>> leftIt, RestorableIterator<Tuple<?, ?>> rightIt, KeyComparatorWrapper comparator, CommunicationContext.JoinType joinType, Config config) {
    LOG.info("Performing join with cache....");
    return new Iterator<JoinedTuple>() {

        // every list that has been replaced by advance(); cleared by the shutdown hook
        // NOTE(review): this list only grows while the iterator is alive
        private final List<DiskBasedList> oldLists = new ArrayList<>();

        // current window of the left relation
        private DiskBasedList leftList;

        // current window of the right relation
        private DiskBasedList rightList;

        // if we had to call next() to inspect a tuple that does not belong to the current
        // window, it is parked here and consumed first by the following advance()
        private Tuple leftBackup;

        private Tuple rightBackup;

        // iterator over the join result of the current pair of windows
        private Iterator<JoinedTuple> localJoinIterator;

        /**
         * Advances the two source iterators by reading the next window of each into a
         * fresh {@link DiskBasedList} and joining the two windows.
         *
         * @return true if advance() should be called again, i.e. the current windows
         * produced no joined tuples but there is still unread input
         */
        private boolean advance() {
            if (this.leftList != null) {
                this.leftList.dispose();
                this.oldLists.add(this.leftList);
            }
            if (this.rightList != null) {
                this.rightList.dispose();
                this.oldLists.add(this.rightList);
            }
            // the budget is split evenly between the two sides
            long maxRecordsInMemory = CommunicationContext.getShuffleMaxRecordsInMemory(config) / 2;
            // the previous lists have been disposed and are kept in oldLists, so that the
            // shutdown hook can clear them
            this.leftList = new DiskBasedList(config, MessageTypes.OBJECT);
            this.rightList = new DiskBasedList(config, MessageTypes.OBJECT);
            Tuple currentTuple = null;
            // read from left iterator
            while (leftIt.hasNext() || this.leftBackup != null) {
                Tuple<?, ?> nextLeft = this.leftBackup != null ? this.leftBackup : leftIt.next();
                // we used the backup
                this.leftBackup = null;
                if (currentTuple == null) {
                    currentTuple = nextLeft;
                }
                // compare once and reuse the result for both branches
                int order = comparator.compare(currentTuple, nextLeft);
                if (order == 0) {
                    // same key as the current tuple: always kept in this window, regardless
                    // of the size limit
                    this.leftList.add(nextLeft);
                } else if (order < 0 && this.leftList.size() < maxRecordsInMemory) {
                    // a greater key, and there is still room in this window
                    currentTuple = nextLeft;
                    this.leftList.add(nextLeft);
                } else {
                    // does not belong to this window, keep it for the next advance()
                    this.leftBackup = nextLeft;
                    break;
                }
            }
            // read from right iterator: take everything that is not greater than the last
            // tuple accepted from the left side (or than the first right tuple, when nothing
            // was read from the left side)
            while (rightIt.hasNext() || this.rightBackup != null) {
                Tuple<?, ?> nextRight = this.rightBackup != null ? this.rightBackup : rightIt.next();
                this.rightBackup = null;
                if (currentTuple == null) {
                    currentTuple = nextRight;
                }
                if (comparator.compare(currentTuple, nextRight) >= 0) {
                    this.rightList.add(nextRight);
                } else {
                    this.rightBackup = nextRight;
                    break;
                }
            }
            this.localJoinIterator = join(new ListBasedRestorableIterator(this.leftList), new ListBasedRestorableIterator(this.rightList), comparator, joinType);
            // if the current windows produced nothing but there is still data in the
            // data iterators, let's advance() again
            return !this.localJoinIterator.hasNext() && (leftBackup != null || rightBackup != null || leftIt.hasNext() || rightIt.hasNext());
        }

        /**
         * Calls advance() until it reports that no further call is required.
         */
        private void callAdvanceIt() {
            boolean shouldCall = true;
            while (shouldCall) {
                shouldCall = this.advance();
            }
        }

        {
            this.callAdvanceIt();
            // add a shutdown hook to cleanup
            // NOTE(review): one hook is registered per joinWithCache() call and it is never
            // removed; only the lists in oldLists are cleared, not the current pair
            Runtime.getRuntime().addShutdownHook(new Thread() {

                // the cleanup belongs in run(): a shutdown hook is an unstarted thread that
                // the JVM starts on exit, so start() must keep its normal behaviour
                @Override
                public void run() {
                    LOG.info("Cleaning up disk based caches used for join...");
                    for (DiskBasedList oldList : oldLists) {
                        oldList.clear();
                    }
                }
            });
        }

        @Override
        public boolean hasNext() {
            return this.localJoinIterator != null && this.localJoinIterator.hasNext();
        }

        @Override
        public JoinedTuple next() {
            JoinedTuple next = this.localJoinIterator.next();
            // eagerly load the next windows, so that hasNext() stays a simple delegation
            if (!this.localJoinIterator.hasNext()) {
                this.callAdvanceIt();
            }
            return next;
        }
    };
}
Also used : RestorableIterator(edu.iu.dsc.tws.comms.shuffle.RestorableIterator) Iterator(java.util.Iterator) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) List(java.util.List) ArrayList(java.util.ArrayList) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Example 12 with JoinedTuple

use of edu.iu.dsc.tws.api.comms.structs.JoinedTuple in project twister2 by DSC-SPIDAL.

the class SortJoinUtils method innerJoin.

/**
 * Computes the inner join of two relations on the tuple key.
 * <p>
 * Both input lists are sorted in place with the given comparator and are then merged.
 * For every pair of matching tuples a {@link JoinedTuple} carrying the left key, the left
 * value and the right value is appended to the result.
 *
 * @param leftRelation left relation, sorted in place by this method
 * @param rightRelation right relation, sorted in place by this method
 * @param comparator comparator used both for sorting and for matching tuples
 * @return the joined relation
 */
public static List<Object> innerJoin(List<Tuple> leftRelation, List<Tuple> rightRelation, KeyComparatorWrapper comparator) {
    leftRelation.sort(comparator);
    rightRelation.sort(comparator);
    final int leftSize = leftRelation.size();
    final int rightSize = rightRelation.size();
    List<Object> joined = new ArrayList<>();
    int leftPos = 0;
    int rightPos = 0;
    while (leftPos < leftSize && rightPos < rightSize) {
        Tuple leftTuple = leftRelation.get(leftPos);
        Tuple rightTuple = rightRelation.get(rightPos);
        int order = comparator.compare(leftTuple, rightTuple);
        if (order < 0) {
            // left side is behind, catch up
            leftPos++;
            continue;
        }
        if (order > 0) {
            // right side is behind, catch up
            rightPos++;
            continue;
        }
        // the two current tuples match
        joined.add(new JoinedTuple<>(leftTuple.getKey(), leftTuple.getValue(), rightTuple.getValue()));
        // pair the current right tuple with the following left tuples of the same key
        for (int i = leftPos + 1; i < leftSize; i++) {
            Tuple otherLeft = leftRelation.get(i);
            if (comparator.compare(otherLeft, rightTuple) != 0) {
                break;
            }
            joined.add(new JoinedTuple<>(otherLeft.getKey(), otherLeft.getValue(), rightTuple.getValue()));
        }
        // pair the current left tuple with the following right tuples of the same key
        for (int i = rightPos + 1; i < rightSize; i++) {
            Tuple otherRight = rightRelation.get(i);
            if (comparator.compare(leftTuple, otherRight) != 0) {
                break;
            }
            joined.add(new JoinedTuple<>(leftTuple.getKey(), leftTuple.getValue(), otherRight.getValue()));
        }
        // the remaining combinations are produced when the next pair is visited
        leftPos++;
        rightPos++;
    }
    return joined;
}
Also used : ArrayList(java.util.ArrayList) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Example 13 with JoinedTuple

use of edu.iu.dsc.tws.api.comms.structs.JoinedTuple in project twister2 by DSC-SPIDAL.

the class HashJoinUtilsTest method innerJoinDiskTest.

/**
 * Writes the same set of integer keys, in independently shuffled order, into two
 * {@link FSKeyedMerger}s and checks that the inner join of both yields every key, with
 * left value 1 and right value 2.
 */
@Test
public void innerJoinDiskTest() {
    final int tupleCount = 1000;
    List<Integer> leftKeys = new ArrayList<>();
    List<Integer> rightKeys = new ArrayList<>();
    for (int key = 0; key < tupleCount; key++) {
        leftKeys.add(key);
        rightKeys.add(key);
    }
    Collections.shuffle(leftKeys);
    Collections.shuffle(rightKeys);

    FSKeyedMerger leftMerger = new FSKeyedMerger(0, 0, "/tmp", "op-left", MessageTypes.INTEGER, MessageTypes.INTEGER);
    FSKeyedMerger rightMerger = new FSKeyedMerger(0, 0, "/tmp", "op-right", MessageTypes.INTEGER, MessageTypes.INTEGER);

    // serialized values: every left tuple carries 1, every right tuple carries 2
    byte[] leftValue = ByteBuffer.allocate(4).putInt(1).array();
    byte[] rightValue = ByteBuffer.allocate(4).putInt(2).array();

    int position = 0;
    while (position < tupleCount) {
        leftMerger.add(leftKeys.get(position), leftValue, Integer.BYTES);
        rightMerger.add(rightKeys.get(position), rightValue, Integer.BYTES);
        leftMerger.run();
        rightMerger.run();
        position++;
    }
    leftMerger.switchToReading();
    rightMerger.switchToReading();

    ResettableIterator leftIt = leftMerger.readIterator();
    ResettableIterator rightIt = rightMerger.readIterator();
    Iterator<JoinedTuple> joined = HashJoinUtils.innerJoin(leftIt, rightIt, MessageTypes.INTEGER);

    // distinct keys seen in the join output
    Set<Integer> seenKeys = new HashSet<>();
    while (joined.hasNext()) {
        JoinedTuple tuple = joined.next();
        Assert.assertEquals(1, tuple.getLeftValue());
        Assert.assertEquals(2, tuple.getRightValue());
        seenKeys.add((Integer) tuple.getKey());
    }
    Assert.assertEquals(tupleCount, seenKeys.size());

    leftMerger.clean();
    rightMerger.clean();
}
Also used : FSKeyedMerger(edu.iu.dsc.tws.comms.shuffle.FSKeyedMerger) ArrayList(java.util.ArrayList) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) ResettableIterator(edu.iu.dsc.tws.comms.shuffle.ResettableIterator) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 14 with JoinedTuple

use of edu.iu.dsc.tws.api.comms.structs.JoinedTuple in project twister2 by DSC-SPIDAL.

the class HashJoinUtilsTest method rightJoinDiskTest.

/**
 * Writes all keys to the left {@link FSKeyedMerger} and a random subset of them to the right
 * one, then checks that the right join yields every right key with right value 2, and left
 * value 1 wherever the key is known on the left side.
 * <p>
 * NOTE(review): the left side holds every key, so with this data the {@code assertNull}
 * branch looks unreachable - confirm whether the subset was meant to be on the left side.
 */
@Test
public void rightJoinDiskTest() {
    final int tupleCount = 1000;
    Random random = new Random(System.currentTimeMillis());
    List<Integer> leftKeys = new ArrayList<>();
    List<Integer> rightKeys = new ArrayList<>();
    for (int key = 0; key < tupleCount; key++) {
        leftKeys.add(key);
        // roughly half of the keys also go to the right side
        if (random.nextBoolean()) {
            rightKeys.add(key);
        }
    }
    Collections.shuffle(leftKeys);
    Collections.shuffle(rightKeys);

    FSKeyedMerger leftMerger = new FSKeyedMerger(0, 0, "/tmp", "op-left", MessageTypes.INTEGER, MessageTypes.INTEGER);
    FSKeyedMerger rightMerger = new FSKeyedMerger(0, 0, "/tmp", "op-right", MessageTypes.INTEGER, MessageTypes.INTEGER);

    // serialized values: every left tuple carries 1, every right tuple carries 2
    byte[] leftValue = ByteBuffer.allocate(4).putInt(1).array();
    byte[] rightValue = ByteBuffer.allocate(4).putInt(2).array();

    for (Integer leftKey : leftKeys) {
        leftMerger.add(leftKey, leftValue, Integer.BYTES);
        leftMerger.run();
    }
    for (Integer rightKey : rightKeys) {
        rightMerger.add(rightKey, rightValue, Integer.BYTES);
        rightMerger.run();
    }
    leftMerger.switchToReading();
    rightMerger.switchToReading();

    ResettableIterator leftIt = leftMerger.readIterator();
    ResettableIterator rightIt = rightMerger.readIterator();
    Iterator<JoinedTuple> joined = HashJoinUtils.rightJoin(leftIt, rightIt, MessageTypes.INTEGER);

    // distinct keys seen in the join output
    Set<Integer> seenKeys = new HashSet<>();
    Set<Integer> knownLeftKeys = new HashSet<>(leftKeys);
    while (joined.hasNext()) {
        JoinedTuple tuple = joined.next();
        Assert.assertEquals(2, tuple.getRightValue());
        if (!knownLeftKeys.contains(tuple.getKey())) {
            // no left partner for this key
            Assert.assertNull(tuple.getLeftValue());
        } else {
            Assert.assertEquals(1, tuple.getLeftValue());
        }
        seenKeys.add((Integer) tuple.getKey());
    }
    Assert.assertEquals(rightKeys.size(), seenKeys.size());

    leftMerger.clean();
    rightMerger.clean();
}
Also used : Random(java.util.Random) FSKeyedMerger(edu.iu.dsc.tws.comms.shuffle.FSKeyedMerger) ArrayList(java.util.ArrayList) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) ResettableIterator(edu.iu.dsc.tws.comms.shuffle.ResettableIterator) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 15 with JoinedTuple

use of edu.iu.dsc.tws.api.comms.structs.JoinedTuple in project twister2 by DSC-SPIDAL.

the class SortJoinUtilsTest method getFullOuterJoined.

/**
 * Builds the list of joined tuples used as the full outer join fixture: five tuples with
 * both values set, one with only a left value (and a null key) and one with only a
 * right value.
 *
 * @return a new mutable list holding the seven joined tuples
 */
private List<Object> getFullOuterJoined() {
    List<Object> fullOuterJoined = new ArrayList<>();
    // tuples present on both sides
    fullOuterJoined.add(new JoinedTuple(34, "Robinson", "Clerical"));
    fullOuterJoined.add(new JoinedTuple(33, "Jones", "Engineering"));
    fullOuterJoined.add(new JoinedTuple(34, "Smith", "Clerical"));
    // left value only
    fullOuterJoined.add(new JoinedTuple(null, "Williams", null));
    fullOuterJoined.add(new JoinedTuple(33, "Heisenberg", "Engineering"));
    fullOuterJoined.add(new JoinedTuple(31, "Rafferty", "Sales"));
    // right value only
    fullOuterJoined.add(new JoinedTuple(35, null, "Marketing"));
    return fullOuterJoined;
}
Also used : ArrayList(java.util.ArrayList) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Aggregations

JoinedTuple (edu.iu.dsc.tws.api.comms.structs.JoinedTuple)22 ArrayList (java.util.ArrayList)20 Tuple (edu.iu.dsc.tws.api.comms.structs.Tuple)11 Iterator (java.util.Iterator)9 RestorableIterator (edu.iu.dsc.tws.comms.shuffle.RestorableIterator)8 Test (org.junit.Test)8 List (java.util.List)7 Random (java.util.Random)7 CommunicationContext (edu.iu.dsc.tws.api.comms.CommunicationContext)6 Logger (java.util.logging.Logger)6 MessageTypes (edu.iu.dsc.tws.api.comms.messaging.types.MessageTypes)5 Config (edu.iu.dsc.tws.api.config.Config)5 CommonThreadPool (edu.iu.dsc.tws.api.util.CommonThreadPool)5 FSKeyedSortedMerger2 (edu.iu.dsc.tws.comms.shuffle.FSKeyedSortedMerger2)5 Comparator (java.util.Comparator)5 UUID (java.util.UUID)5 Assert (org.junit.Assert)5 ResettableIterator (edu.iu.dsc.tws.comms.shuffle.ResettableIterator)4 FSKeyedMerger (edu.iu.dsc.tws.comms.shuffle.FSKeyedMerger)3 HashSet (java.util.HashSet)3