Search in sources :

Example 66 with Tuple

use of edu.iu.dsc.tws.api.comms.structs.Tuple in project twister2 by DSC-SPIDAL.

the class HashJoinUtils method leftOuterJoin.

/**
 * In-memory hash based left outer join.
 * <p>
 * The right relation is grouped by key into a hash table, then every left tuple is probed
 * against it. A left tuple with no matching right tuple is still emitted, with a
 * {@code null} right value.
 *
 * @param leftRelation tuples of the left relation; each one yields at least one output
 * @param rightRelation tuples of the right relation; used to build the hash table
 * @param messageType passed to the {@code THashMap} constructor
 * @return the joined tuples, in the iteration order of {@code leftRelation}
 */
public static List<Object> leftOuterJoin(List<Tuple> leftRelation, List<Tuple> rightRelation, MessageType messageType) {
    final List<Object> result = new ArrayList<>();
    final Map<Object, List<Tuple>> rightByKey = new THashMap<>(messageType);

    // build phase: bucket the right relation by key
    for (Tuple candidate : rightRelation) {
        rightByKey.computeIfAbsent(candidate.getKey(), k -> new ArrayList<>()).add(candidate);
    }

    // probe phase: look up every left tuple
    for (Tuple probe : leftRelation) {
        final List<Tuple> matches = rightByKey.getOrDefault(probe.getKey(), Collections.emptyList());
        if (matches.isEmpty()) {
            // no partner on the right side, keep the left tuple with a null right value
            result.add(JoinedTuple.of(probe.getKey(), probe.getValue(), null));
            continue;
        }
        for (Tuple match : matches) {
            result.add(JoinedTuple.of(probe.getKey(), probe.getValue(), match.getValue()));
        }
    }
    return result;
}
Also used : CommunicationContext(edu.iu.dsc.tws.api.comms.CommunicationContext) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) List(java.util.List) Iterator(java.util.Iterator) ResettableIterator(edu.iu.dsc.tws.comms.shuffle.ResettableIterator) MessageType(edu.iu.dsc.tws.api.comms.messaging.types.MessageType) Map(java.util.Map) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Logger(java.util.logging.Logger) Collections(java.util.Collections) Twister2RuntimeException(edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException) ArrayList(java.util.ArrayList) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Example 67 with Tuple

use of edu.iu.dsc.tws.api.comms.structs.Tuple in project twister2 by DSC-SPIDAL.

the class HashJoinUtils method join.

/**
 * Disk based hash join producing a lazy iterator of {@link JoinedTuple}s.
 * <p>
 * One relation (the "hashing" relation) is loaded into an in-memory hash table for as long as
 * the free heap stays above 10% of the total heap; the other relation (the "probing"
 * relation) is then streamed against that table. If the hashing relation did not fit in one
 * go, the table is rebuilt from the remaining hashing tuples and the probing relation is
 * reset and scanned again.
 * <p>
 * For {@code JoinType.LEFT} the right relation is hashed and the left one probed; for every
 * other join type the left relation is hashed and the right one probed. Unmatched probing
 * tuples are emitted with a {@code null} partner only for {@code LEFT} and {@code RIGHT};
 * for any other join type they are dropped.
 * <p>
 * NOTE(review): when the hashing relation needs more than one pass, an unmatched probing
 * tuple appears to be emitted with a {@code null} partner once per pass for LEFT/RIGHT joins,
 * even if it matches in a different pass. Confirm whether callers rely on single-pass hashing.
 *
 * @param leftIt resettable iterator over the left relation
 * @param rightIt resettable iterator over the right relation
 * @param joinType join type; selects the hashing/probing sides and the unmatched handling
 * @param keyType passed to the {@code THashMap} constructor
 * @return iterator over the joined tuples
 * @throws Twister2RuntimeException if no hashing tuple could be loaded although the
 * hashing relation still has data (not enough free memory)
 */
public static Iterator<JoinedTuple> join(ResettableIterator<Tuple<?, ?>> leftIt, ResettableIterator<Tuple<?, ?>> rightIt, CommunicationContext.JoinType joinType, MessageType keyType) {
    // choosing hashing and probing relations
    // if inner join:
    // hashing = left
    // probing = right
    // if left join:
    // hashing = right
    // probing = left
    // if right join:
    // hashing = left
    // probing = right
    final ResettableIterator<Tuple<?, ?>> hashingRelation = joinType.equals(CommunicationContext.JoinType.LEFT) ? rightIt : leftIt;
    final ResettableIterator<Tuple<?, ?>> probingRelation = joinType.equals(CommunicationContext.JoinType.LEFT) ? leftIt : rightIt;
    // set the memory limits based on the heap allocation
    final double lowerMemoryBound = Runtime.getRuntime().totalMemory() * 0.1;
    return new Iterator<JoinedTuple>() {

        private boolean hashingDone;

        private Map<Object, List> keyHash = new THashMap<>(keyType);

        // always keep the nextJoinTuple in memory. hasNext() will use this field
        private JoinedTuple nextJoinTuple;

        // when iterating over the probing relation, current element
        // (which has been returned by next()) will be kept in memory since it should be combined
        // with all the tuples in leftListForCurrentKey. But this has to be done on demand, on next()
        // call of joined iterator.
        private Tuple<?, ?> currentProbingTuple;

        // list of values from the hashing relation
        // that match the key of currentProbingTuple
        private List leftListForCurrentKey;

        // keeping the index of leftListForCurrentKey
        private int leftListIndex = 0;

        // This initializer block MUST stay below every field declaration. Java runs field
        // initializers and initializer blocks in textual order, so if the block preceded
        // "leftListIndex = 0", the cursor advanced by the priming doProbing() call would be
        // reset afterwards and the first joined tuple would be emitted twice.
        {
            // initially do hashing & probing
            doHashing();
            doProbing();
        }

        /**
         * This method will perform following actions in order
         * <ol>
         *   <li>Clear existing HashMap</li>
         *   <li>Create HashMap from the hashingRelation till it hit the memory limits</li>
         *   <li>Determine whether the hashingRelation is fully consumed</li>
         * </ol>
         */
        private void doHashing() {
            this.keyHash.clear();
            // building the hash, as long as memory permits
            while (Runtime.getRuntime().freeMemory() > lowerMemoryBound && hashingRelation.hasNext()) {
                Tuple<?, ?> nextLeft = hashingRelation.next();
                keyHash.computeIfAbsent(nextLeft.getKey(), k -> new ArrayList()).add(nextLeft.getValue());
            }
            // determine whether hashRelation is fully consumed
            hashingDone = !hashingRelation.hasNext();
            if (!hashingDone && this.keyHash.isEmpty()) {
                // problem!. We have cleared the old hash, yet there's no free memory available to proceed
                throw new Twister2RuntimeException("Couldn't progress due to memory limitations." + "Available free memory : " + Runtime.getRuntime().freeMemory() + ", Expected free memory : " + lowerMemoryBound);
            }
        }

        /**
         * This method should be guaranteed to create a {@link JoinedTuple}. If a tuple can't be
         * created, caller should determine that before calling this method.
         * Additionally, this method should clear everything if everything related to
         * currentProbingTuple is processed.
         */
        private void progressProbing() {
            Object key = this.currentProbingTuple.getKey();
            // we have interchanged original iterators based on the join type.
            // that should be taken into consideration when creating the JoinedTuple
            Object left = joinType.equals(CommunicationContext.JoinType.LEFT) ? this.currentProbingTuple.getValue() : leftListForCurrentKey.get(leftListIndex);
            Object right = joinType.equals(CommunicationContext.JoinType.LEFT) ? leftListForCurrentKey.get(leftListIndex) : this.currentProbingTuple.getValue();
            this.nextJoinTuple = JoinedTuple.of(key, left, right);
            leftListIndex++;
            // if end of the list has reached, reset everything!
            if (leftListIndex == leftListForCurrentKey.size()) {
                currentProbingTuple = null;
                leftListForCurrentKey = null;
                leftListIndex = 0;
            }
        }

        /**
         * This method iterates through the probing relation until the next
         * {@link JoinedTuple} is available or the join is exhausted.
         */
        private void doProbing() {
            // if there is a non null nextJoinTuple, no need of proceeding
            while (this.nextJoinTuple == null) {
                // a null currentProbingTuple means we are not in the middle of
                // combining a probing tuple with its hashed list
                if (this.currentProbingTuple == null) {
                    if (probingRelation.hasNext()) {
                        this.currentProbingTuple = probingRelation.next();
                        this.leftListForCurrentKey = this.keyHash.get(currentProbingTuple.getKey());
                        if (this.leftListForCurrentKey == null) {
                            // handle left and right joins here
                            if (joinType.equals(CommunicationContext.JoinType.LEFT)) {
                                this.nextJoinTuple = JoinedTuple.of(currentProbingTuple.getKey(), currentProbingTuple.getValue(), null);
                            } else if (joinType.equals(CommunicationContext.JoinType.RIGHT)) {
                                this.nextJoinTuple = JoinedTuple.of(currentProbingTuple.getKey(), null, currentProbingTuple.getValue());
                            }
                            // any join : We are done with currentProbingTuple
                            this.currentProbingTuple = null;
                        } else {
                            progressProbing();
                        }
                    } else {
                        // probing iterator has reached to an end for current HashMap.
                        if (!hashingDone) {
                            // clear current hash and reset the probing iterator
                            doHashing();
                            probingRelation.reset();
                        } else {
                            // end of join operation. Yay!
                            break;
                        }
                    }
                } else {
                    progressProbing();
                }
            }
        }

        @Override
        public boolean hasNext() {
            return this.nextJoinTuple != null;
        }

        @Override
        public JoinedTuple next() {
            if (!hasNext()) {
                throw new Twister2RuntimeException("Join operation has reached to an end. " + "Use hasNext() to check the status.");
            }
            JoinedTuple currentJoinTuple = nextJoinTuple;
            nextJoinTuple = null;
            // create the next JoinTuple before returning
            doProbing();
            return currentJoinTuple;
        }
    };
}
Also used : CommunicationContext(edu.iu.dsc.tws.api.comms.CommunicationContext) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) List(java.util.List) Iterator(java.util.Iterator) ResettableIterator(edu.iu.dsc.tws.comms.shuffle.ResettableIterator) MessageType(edu.iu.dsc.tws.api.comms.messaging.types.MessageType) Map(java.util.Map) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Logger(java.util.logging.Logger) Collections(java.util.Collections) Twister2RuntimeException(edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException) ArrayList(java.util.ArrayList) Twister2RuntimeException(edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) ArrayList(java.util.ArrayList) Iterator(java.util.Iterator) ResettableIterator(edu.iu.dsc.tws.comms.shuffle.ResettableIterator) List(java.util.List) ArrayList(java.util.ArrayList) Map(java.util.Map) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Example 68 with Tuple

use of edu.iu.dsc.tws.api.comms.structs.Tuple in project twister2 by DSC-SPIDAL.

the class SortJoinUtils method outerJoin.

/**
 * Sort-merge outer join of the left and right relation using the tuple key.
 * <p>
 * Both input lists are sorted in place with {@code comparator}. Tuples with equal keys are
 * joined pairwise (every left tuple of the key with every right tuple of the key). A tuple
 * without a partner on the other side is emitted with a {@code null} partner value when
 * {@code outerJoinType.includeLeft()} (for left tuples) or
 * {@code outerJoinType.includeRight()} (for right tuples) returns {@code true}.
 *
 * @param leftRelation left relation; sorted in place
 * @param rightRelation right relation; sorted in place
 * @param comparator comparator used both for sorting and for matching keys
 * @param outerJoinType decides which unmatched side(s) are kept
 * @return the joined relation
 */
private static List<Object> outerJoin(List<Tuple> leftRelation, List<Tuple> rightRelation, KeyComparatorWrapper comparator, CommunicationContext.JoinType outerJoinType) {
    int leftIndex = 0;
    int rightIndex = 0;
    leftRelation.sort(comparator);
    rightRelation.sort(comparator);
    List<Object> outPut = new ArrayList<>();
    while (leftIndex < leftRelation.size() && rightIndex < rightRelation.size()) {
        Tuple left = leftRelation.get(leftIndex);
        Tuple right = rightRelation.get(rightIndex);
        int order = comparator.compare(left, right);
        if (order == 0) {
            // find the end (exclusive) of the run of left tuples sharing this key
            int leftEnd = leftIndex + 1;
            while (leftEnd < leftRelation.size() && comparator.compare(leftRelation.get(leftEnd), right) == 0) {
                leftEnd++;
            }
            // find the end (exclusive) of the run of right tuples sharing this key
            int rightEnd = rightIndex + 1;
            while (rightEnd < rightRelation.size() && comparator.compare(left, rightRelation.get(rightEnd)) == 0) {
                rightEnd++;
            }
            // emit the full cross product of both runs. Joining only against the first tuple
            // of each run would lose every pair where neither side is the first of its run.
            for (int r = rightIndex; r < rightEnd; r++) {
                Tuple matchedRight = rightRelation.get(r);
                for (int l = leftIndex; l < leftEnd; l++) {
                    Tuple matchedLeft = leftRelation.get(l);
                    outPut.add(new JoinedTuple<>(matchedLeft.getKey(), matchedLeft.getValue(), matchedRight.getValue()));
                }
            }
            // both runs are fully consumed; skipping them keeps already matched tuples
            // from being reported as unmatched below
            leftIndex = leftEnd;
            rightIndex = rightEnd;
        } else if (order < 0) {
            // left key has no partner on the right side
            if (outerJoinType.includeLeft()) {
                outPut.add(new JoinedTuple<>(left.getKey(), left.getValue(), null));
            }
            leftIndex++;
        } else {
            // right key has no partner on the left side
            if (outerJoinType.includeRight()) {
                outPut.add(new JoinedTuple<>(right.getKey(), null, right.getValue()));
            }
            rightIndex++;
        }
    }
    // whatever remains on either side has no partner
    while (leftIndex < leftRelation.size() && outerJoinType.includeLeft()) {
        Tuple left = leftRelation.get(leftIndex);
        outPut.add(new JoinedTuple<>(left.getKey(), left.getValue(), null));
        leftIndex++;
    }
    while (rightIndex < rightRelation.size() && outerJoinType.includeRight()) {
        Tuple right = rightRelation.get(rightIndex);
        outPut.add(new JoinedTuple<>(right.getKey(), null, right.getValue()));
        rightIndex++;
    }
    return outPut;
}
Also used : ArrayList(java.util.ArrayList) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Example 69 with Tuple

use of edu.iu.dsc.tws.api.comms.structs.Tuple in project twister2 by DSC-SPIDAL.

the class SortJoinUtils method joinWithCache.

/**
 * This method avoids having to scan back and forth over the files by reading the data
 * iterators once and backing them up into {@link DiskBasedList}s, which have a memory buffer.
 * <p>
 * The input is consumed in batches: each batch is loaded into a left and a right
 * {@link DiskBasedList}, joined with the iterator based {@code join} overload, and the
 * resulting tuples are served before the next batch is loaded.
 *
 * @param leftIt iterator over the left relation
 * @param rightIt iterator over the right relation
 * @param comparator comparator used to compare tuples by key
 * @param joinType join type handed to the per-batch join
 * @param config configuration used for the disk based lists and the in-memory record limit
 * @return iterator over the joined tuples
 */
public static Iterator<JoinedTuple> joinWithCache(RestorableIterator<Tuple<?, ?>> leftIt, RestorableIterator<Tuple<?, ?>> rightIt, KeyComparatorWrapper comparator, CommunicationContext.JoinType joinType, Config config) {
    LOG.info("Performing join with cache....");
    return new Iterator<JoinedTuple>() {

        // lists of previous batches; kept so that the shutdown hook can clear them
        private final List<DiskBasedList> oldLists = new ArrayList<>();

        private DiskBasedList leftList;

        private DiskBasedList rightList;

        // if we had to call next() to check the next tuple, these variables can be used to keep them
        private Tuple leftBackup;

        private Tuple rightBackup;

        private Iterator<JoinedTuple> localJoinIterator;

        /**
         * Advances two iterators by reading onto memory
         *
         * @return true if advance() should be called again
         */
        private boolean advance() {
            if (this.leftList != null) {
                this.leftList.dispose();
                this.oldLists.add(this.leftList);
            }
            if (this.rightList != null) {
                this.rightList.dispose();
                this.oldLists.add(this.rightList);
            }
            long maxRecordsInMemory = CommunicationContext.getShuffleMaxRecordsInMemory(config) / 2;
            // previous lists have been disposed; they stay referenced from oldLists
            // so the shutdown hook can clear them
            this.leftList = new DiskBasedList(config, MessageTypes.OBJECT);
            this.rightList = new DiskBasedList(config, MessageTypes.OBJECT);
            Tuple currentTuple = null;
            // read from left iterator
            while (leftIt.hasNext() || this.leftBackup != null) {
                Tuple<?, ?> nextLeft = this.leftBackup != null ? this.leftBackup : leftIt.next();
                // we used the backup
                this.leftBackup = null;
                if (currentTuple == null) {
                    currentTuple = nextLeft;
                }
                if (comparator.compare(currentTuple, nextLeft) == 0) {
                    // same key as the current one: always keep key groups together
                    this.leftList.add(nextLeft);
                } else if (comparator.compare(currentTuple, nextLeft) < 0 && this.leftList.size() < maxRecordsInMemory) {
                    // a greater key is accepted only while the record limit is not reached
                    currentTuple = nextLeft;
                    this.leftList.add(nextLeft);
                } else {
                    // keep the tuple for the next batch
                    this.leftBackup = nextLeft;
                    break;
                }
            }
            // read from right iterator
            while (rightIt.hasNext() || this.rightBackup != null) {
                Tuple<?, ?> nextRight = this.rightBackup != null ? this.rightBackup : rightIt.next();
                this.rightBackup = null;
                if (currentTuple == null) {
                    currentTuple = nextRight;
                }
                if (comparator.compare(currentTuple, nextRight) >= 0) {
                    // right tuples up to (and including) the last accepted key belong to this batch
                    this.rightList.add(nextRight);
                } else {
                    this.rightBackup = nextRight;
                    break;
                }
            }
            this.localJoinIterator = join(new ListBasedRestorableIterator(this.leftList), new ListBasedRestorableIterator(this.rightList), comparator, joinType);
            // if the current batch produced no join results but there is still data left in
            // the backups or the data iterators, let's advance() again
            return !this.localJoinIterator.hasNext() && (leftBackup != null || rightBackup != null || leftIt.hasNext() || rightIt.hasNext());
        }

        private void callAdvanceIt() {
            boolean shouldCall = true;
            while (shouldCall) {
                shouldCall = this.advance();
            }
        }

        {
            this.callAdvanceIt();
            // add a shutdown hook to cleanup
            Runtime.getRuntime().addShutdownHook(new Thread() {

                // the hook's work belongs in run(): a shutdown hook is an unstarted thread
                // that the JVM starts itself, so start() must not be overridden
                @Override
                public void run() {
                    LOG.info("Cleaning up disk based caches used for join...");
                    for (DiskBasedList oldList : oldLists) {
                        oldList.clear();
                    }
                }
            });
        }

        @Override
        public boolean hasNext() {
            return this.localJoinIterator != null && this.localJoinIterator.hasNext();
        }

        @Override
        public JoinedTuple next() {
            JoinedTuple next = this.localJoinIterator.next();
            // load the next batch eagerly so that hasNext() stays accurate
            if (!this.localJoinIterator.hasNext()) {
                this.callAdvanceIt();
            }
            return next;
        }
    };
}
Also used : RestorableIterator(edu.iu.dsc.tws.comms.shuffle.RestorableIterator) Iterator(java.util.Iterator) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple) List(java.util.List) ArrayList(java.util.ArrayList) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Example 70 with Tuple

use of edu.iu.dsc.tws.api.comms.structs.Tuple in project twister2 by DSC-SPIDAL.

the class SortJoinUtils method innerJoin.

/**
 * Sort-merge inner join of two relations on the tuple key.
 * <p>
 * Both input lists are sorted in place with {@code comparator} and then merged. Only keys
 * present on both sides produce output.
 *
 * @param leftRelation left relation; sorted in place
 * @param rightRelation right relation; sorted in place
 * @param comparator comparator used both for sorting and for matching keys
 * @return the joined relation
 */
public static List<Object> innerJoin(List<Tuple> leftRelation, List<Tuple> rightRelation, KeyComparatorWrapper comparator) {
    leftRelation.sort(comparator);
    rightRelation.sort(comparator);
    final List<Object> joined = new ArrayList<>();
    int leftPos = 0;
    int rightPos = 0;
    while (leftPos < leftRelation.size() && rightPos < rightRelation.size()) {
        final Tuple leftHead = leftRelation.get(leftPos);
        final Tuple rightHead = rightRelation.get(rightPos);
        if (comparator.compare(leftHead, rightHead) == 0) {
            joined.add(new JoinedTuple<>(leftHead.getKey(), leftHead.getValue(), rightHead.getValue()));
            // pair the following left tuples of the same key with the current right tuple
            for (int i = leftPos + 1; i < leftRelation.size(); i++) {
                final Tuple laterLeft = leftRelation.get(i);
                if (comparator.compare(laterLeft, rightHead) != 0) {
                    break;
                }
                joined.add(new JoinedTuple<>(laterLeft.getKey(), laterLeft.getValue(), rightHead.getValue()));
            }
            // pair the current left tuple with the following right tuples of the same key
            for (int i = rightPos + 1; i < rightRelation.size(); i++) {
                final Tuple laterRight = rightRelation.get(i);
                if (comparator.compare(leftHead, laterRight) != 0) {
                    break;
                }
                joined.add(new JoinedTuple<>(leftHead.getKey(), leftHead.getValue(), laterRight.getValue()));
            }
            // step both sides by one only; the remaining pairs of this key are
            // produced by the following iterations
            leftPos++;
            rightPos++;
            continue;
        }
        // keys differ: move the side holding the smaller key forward
        if (comparator.compare(leftHead, rightHead) < 0) {
            leftPos++;
        } else {
            rightPos++;
        }
    }
    return joined;
}
Also used : ArrayList(java.util.ArrayList) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple) JoinedTuple(edu.iu.dsc.tws.api.comms.structs.JoinedTuple)

Aggregations

Tuple (edu.iu.dsc.tws.api.comms.structs.Tuple)98 Iterator (java.util.Iterator)38 List (java.util.List)35 Logger (java.util.logging.Logger)34 ArrayList (java.util.ArrayList)29 Config (edu.iu.dsc.tws.api.config.Config)27 WorkerEnvironment (edu.iu.dsc.tws.api.resource.WorkerEnvironment)24 Test (org.junit.Test)24 BatchEnvironment (edu.iu.dsc.tws.tset.env.BatchEnvironment)18 InMessage (edu.iu.dsc.tws.comms.dfw.InMessage)17 HashMap (java.util.HashMap)16 TSetEnvironment (edu.iu.dsc.tws.tset.env.TSetEnvironment)15 JobConfig (edu.iu.dsc.tws.api.JobConfig)14 MessageTypes (edu.iu.dsc.tws.api.comms.messaging.types.MessageTypes)14 JoinedTuple (edu.iu.dsc.tws.api.comms.structs.JoinedTuple)14 ResourceAllocator (edu.iu.dsc.tws.rsched.core.ResourceAllocator)14 SourceTSet (edu.iu.dsc.tws.tset.sets.batch.SourceTSet)13 CommunicationContext (edu.iu.dsc.tws.api.comms.CommunicationContext)11 MessageType (edu.iu.dsc.tws.api.comms.messaging.types.MessageType)11 Comparator (java.util.Comparator)11