use of edu.iu.dsc.tws.api.comms.structs.Tuple in project twister2 by DSC-SPIDAL.
the class HashJoinUtils method leftOuterJoin.
/**
 * In-memory left outer join of two relations on the tuple key.
 * <p>
 * The right relation is indexed by key first; every left tuple is then paired with each
 * right tuple sharing its key. A left tuple without any match is emitted once with a
 * {@code null} right value.
 *
 * @param leftRelation tuples of the left relation
 * @param rightRelation tuples of the right relation
 * @param messageType type handed to the {@link THashMap} that indexes the right relation
 * @return the joined tuples, in left relation order
 */
public static List<Object> leftOuterJoin(List<Tuple> leftRelation, List<Tuple> rightRelation, MessageType messageType) {
  // build phase: group the right relation by key
  Map<Object, List<Tuple>> rightByKey = new THashMap<>(messageType);
  for (Tuple right : rightRelation) {
    List<Tuple> bucket = rightByKey.computeIfAbsent(right.getKey(), k -> new ArrayList<>());
    bucket.add(right);
  }

  // probe phase: walk the left relation and look up its matches
  List<Object> result = new ArrayList<>();
  for (Tuple left : leftRelation) {
    List<Tuple> matches = rightByKey.getOrDefault(left.getKey(), Collections.emptyList());
    if (matches.isEmpty()) {
      // unmatched left tuple: keep it, padded with a null right side
      result.add(JoinedTuple.of(left.getKey(), left.getValue(), null));
      continue;
    }
    for (Tuple match : matches) {
      result.add(JoinedTuple.of(left.getKey(), left.getValue(), match.getValue()));
    }
  }
  return result;
}
use of edu.iu.dsc.tws.api.comms.structs.Tuple in project twister2 by DSC-SPIDAL.
the class HashJoinUtils method join.
/**
 * Hash join over two resettable iterators, producing the joined tuples lazily.
 * <p>
 * One relation (the <em>hashing</em> relation) is loaded into a hash map in chunks that fit
 * the available memory; the other (the <em>probing</em> relation) is scanned once per chunk
 * and reset between chunks.
 * <ul>
 * <li>{@code LEFT}: hashing = right, probing = left; unmatched probing tuples are emitted
 * with a {@code null} right value</li>
 * <li>{@code RIGHT}: hashing = left, probing = right; unmatched probing tuples are emitted
 * with a {@code null} left value</li>
 * <li>any other join type: hashing = left, probing = right; unmatched tuples are dropped</li>
 * </ul>
 * NOTE(review): when the hashing relation needs more than one chunk, a LEFT/RIGHT probing
 * tuple that has no match in the <em>current</em> chunk is emitted with a null side on every
 * pass, even if a later chunk contains its key. Confirm whether callers rely on single-chunk
 * inputs for outer joins.
 *
 * @param leftIt iterator over the left relation
 * @param rightIt iterator over the right relation
 * @param joinType join type; only {@code LEFT} and {@code RIGHT} get special treatment
 * @param keyType type handed to the {@link THashMap} holding the hashed keys
 * @return an iterator over the joined tuples
 * @throws Twister2RuntimeException if not a single tuple can be hashed because free memory
 * is at or below the lower bound while the hashing relation still has data
 */
public static Iterator<JoinedTuple> join(ResettableIterator<Tuple<?, ?>> leftIt, ResettableIterator<Tuple<?, ?>> rightIt, CommunicationContext.JoinType joinType, MessageType keyType) {
  // choosing hashing and probing relations
  // if inner join:
  //    hashing = left
  //    probing = right
  // if left join:
  //    hashing = right
  //    probing = left
  // if right join:
  //    hashing = left
  //    probing = right
  final ResettableIterator<Tuple<?, ?>> hashingRelation = joinType.equals(CommunicationContext.JoinType.LEFT) ? rightIt : leftIt;
  final ResettableIterator<Tuple<?, ?>> probingRelation = joinType.equals(CommunicationContext.JoinType.LEFT) ? leftIt : rightIt;

  // hashing stops once free memory drops to 10% of the currently allocated heap
  final double lowerMemoryBound = Runtime.getRuntime().totalMemory() * 0.1;

  return new Iterator<JoinedTuple>() {

    // true once the hashing relation has been fully consumed
    private boolean hashingDone;

    // key -> values of the hashing relation for the current chunk
    private Map<Object, List> keyHash = new THashMap<>(keyType);

    // always keep the nextJoinTuple in memory. hasNext() will use this field
    private JoinedTuple nextJoinTuple;

    // the probing tuple currently being combined with the values in
    // leftListForCurrentKey; kept across next() calls so the combinations are
    // produced on demand
    private Tuple<?, ?> currentProbingTuple;

    // values from the hashing relation that match currentProbingTuple
    private List leftListForCurrentKey;

    // position of the next value to combine inside leftListForCurrentKey
    private int leftListIndex = 0;

    // Prime the iterator. This block is deliberately placed AFTER every field
    // declaration: Java runs field initializers and instance initializer blocks in
    // textual order, so priming before "leftListIndex = 0" would let that initializer
    // wipe out the progress made by doProbing() and duplicate the first joined tuple.
    {
      // initially do hashing & probing
      doHashing();
      doProbing();
    }

    /**
     * This method will perform following actions in order
     * <ol>
     * <li>Clear existing HashMap</li>
     * <li>Create HashMap from the hashingRelation till it hit the memory limits</li>
     * <li>Determine whether the hashingRelation is fully consumed</li>
     * </ol>
     */
    private void doHashing() {
      this.keyHash.clear();

      // building the hash, as long as memory permits
      while (Runtime.getRuntime().freeMemory() > lowerMemoryBound && hashingRelation.hasNext()) {
        Tuple<?, ?> nextLeft = hashingRelation.next();
        keyHash.computeIfAbsent(nextLeft.getKey(), k -> new ArrayList()).add(nextLeft.getValue());
      }

      // determine whether hashRelation is fully consumed
      hashingDone = !hashingRelation.hasNext();

      if (!hashingDone && this.keyHash.isEmpty()) {
        // problem!. We have cleared the old hash, yet there's no free memory available to proceed
        throw new Twister2RuntimeException("Couldn't progress due to memory limitations." + "Available free memory : " + Runtime.getRuntime().freeMemory() + ", Expected free memory : " + lowerMemoryBound);
      }
    }

    /**
     * This method should be guaranteed to create a {@link JoinedTuple}. If a tuple can't be
     * created, caller should determine that before calling this method.
     * Additionally, this method should clear everything if everything related to
     * currentProbingTuple is processed.
     */
    private void progressProbing() {
      Object key = this.currentProbingTuple.getKey();

      // we have interchanged original iterators based on the join type.
      // that should be taken into consideration when creating the JoinedTuple
      Object left = joinType.equals(CommunicationContext.JoinType.LEFT) ? this.currentProbingTuple.getValue() : leftListForCurrentKey.get(leftListIndex);
      Object right = joinType.equals(CommunicationContext.JoinType.LEFT) ? leftListForCurrentKey.get(leftListIndex) : this.currentProbingTuple.getValue();

      this.nextJoinTuple = JoinedTuple.of(key, left, right);

      leftListIndex++;

      // if end of the list has reached, reset everything!
      if (leftListIndex == leftListForCurrentKey.size()) {
        currentProbingTuple = null;
        leftListForCurrentKey = null;
        leftListIndex = 0;
      }
    }

    /**
     * Advances through the probing relation until the next {@link JoinedTuple} is ready in
     * nextJoinTuple, or the join is exhausted (nextJoinTuple stays null).
     */
    private void doProbing() {
      // if there is a non null nextJoinTuple, no need of proceeding
      while (this.nextJoinTuple == null) {
        if (this.currentProbingTuple == null) {
          // nothing half-combined: fetch the next probing tuple
          if (probingRelation.hasNext()) {
            this.currentProbingTuple = probingRelation.next();
            this.leftListForCurrentKey = this.keyHash.get(currentProbingTuple.getKey());
            if (this.leftListForCurrentKey == null) {
              // no match in the current hash: handle left and right joins here
              if (joinType.equals(CommunicationContext.JoinType.LEFT)) {
                this.nextJoinTuple = JoinedTuple.of(currentProbingTuple.getKey(), currentProbingTuple.getValue(), null);
              } else if (joinType.equals(CommunicationContext.JoinType.RIGHT)) {
                this.nextJoinTuple = JoinedTuple.of(currentProbingTuple.getKey(), null, currentProbingTuple.getValue());
              }
              // any join : We are done with currentProbingTuple
              this.currentProbingTuple = null;
            } else {
              progressProbing();
            }
          } else {
            // probing iterator has reached to an end for current HashMap.
            if (!hashingDone) {
              // clear current hash, load the next chunk and rescan the probing relation
              doHashing();
              probingRelation.reset();
            } else {
              // end of join operation. Yay!
              break;
            }
          }
        } else {
          // still in the middle of combining the hashed list with the current probing tuple
          progressProbing();
        }
      }
    }

    @Override
    public boolean hasNext() {
      return this.nextJoinTuple != null;
    }

    @Override
    public JoinedTuple next() {
      if (!hasNext()) {
        throw new Twister2RuntimeException("Join operation has reached to an end. " + "Use hasNext() to check the status.");
      }
      JoinedTuple currentJoinTuple = nextJoinTuple;
      nextJoinTuple = null;

      // create the next JoinTuple before returning
      doProbing();
      return currentJoinTuple;
    }
  };
}
use of edu.iu.dsc.tws.api.comms.structs.Tuple in project twister2 by DSC-SPIDAL.
the class SortJoinUtils method outerJoin.
/**
 * Sort-merge outer join of the left and right relation using the tuple key.
 * <p>
 * Both input lists are sorted <em>in place</em> with the given comparator. Tuples with equal
 * keys are combined as a full cross product (every left tuple of the key with every right
 * tuple of the key). Tuples without a partner are emitted with a {@code null} on the missing
 * side, but only if the join type asks for that side via {@code includeLeft()} /
 * {@code includeRight()}.
 *
 * @param leftRelation left relation; sorted in place
 * @param rightRelation right relation; sorted in place
 * @param comparator key comparator used for both sorting and matching
 * @param outerJoinType decides whether unmatched left and/or right tuples are kept
 * @return the joined relation
 */
private static List<Object> outerJoin(List<Tuple> leftRelation, List<Tuple> rightRelation, KeyComparatorWrapper comparator, CommunicationContext.JoinType outerJoinType) {
  int leftIndex = 0;
  int rightIndex = 0;

  leftRelation.sort(comparator);
  rightRelation.sort(comparator);

  List<Object> outPut = new ArrayList<>();
  while (leftIndex < leftRelation.size() && rightIndex < rightRelation.size()) {
    Tuple left = leftRelation.get(leftIndex);
    Tuple right = rightRelation.get(rightIndex);

    int order = comparator.compare(left, right);
    if (order == 0) {
      // find the end (exclusive) of the run of left tuples sharing this key
      int leftEnd = leftIndex + 1;
      while (leftEnd < leftRelation.size() && comparator.compare(leftRelation.get(leftEnd), right) == 0) {
        leftEnd++;
      }

      // find the end (exclusive) of the run of right tuples sharing this key
      int rightEnd = rightIndex + 1;
      while (rightEnd < rightRelation.size() && comparator.compare(left, rightRelation.get(rightEnd)) == 0) {
        rightEnd++;
      }

      // emit the complete cross product of the two runs; pairing only the first
      // tuple of each run with the other run would drop (m - 1) * (n - 1) results
      for (int r = rightIndex; r < rightEnd; r++) {
        Tuple matchedRight = rightRelation.get(r);
        for (int l = leftIndex; l < leftEnd; l++) {
          Tuple matchedLeft = leftRelation.get(l);
          outPut.add(new JoinedTuple<>(matchedLeft.getKey(), matchedLeft.getValue(), matchedRight.getValue()));
        }
      }

      // both runs are fully consumed
      leftIndex = leftEnd;
      rightIndex = rightEnd;
    } else if (order < 0) {
      // left key has no partner on the right
      if (outerJoinType.includeLeft()) {
        outPut.add(new JoinedTuple<>(left.getKey(), left.getValue(), null));
      }
      leftIndex++;
    } else {
      // right key has no partner on the left
      if (outerJoinType.includeRight()) {
        outPut.add(new JoinedTuple<>(right.getKey(), null, right.getValue()));
      }
      rightIndex++;
    }
  }

  // whatever remains on either side is unmatched by construction
  while (leftIndex < leftRelation.size() && outerJoinType.includeLeft()) {
    Tuple left = leftRelation.get(leftIndex);
    outPut.add(new JoinedTuple<>(left.getKey(), left.getValue(), null));
    leftIndex++;
  }

  while (rightIndex < rightRelation.size() && outerJoinType.includeRight()) {
    Tuple right = rightRelation.get(rightIndex);
    outPut.add(new JoinedTuple<>(right.getKey(), null, right.getValue()));
    rightIndex++;
  }

  return outPut;
}
use of edu.iu.dsc.tws.api.comms.structs.Tuple in project twister2 by DSC-SPIDAL.
the class SortJoinUtils method joinWithCache.
/**
 * This method avoids having to scan back and forth over the files by reading the data
 * iterators once and backing them up into a {@link DiskBasedList}.
 * <p>
 * The inputs are consumed in rounds. Each round copies a range of keys from both iterators
 * into a fresh pair of lists, joins that pair with the sibling {@code join(...)} overload and
 * serves the result; when the local result is exhausted the next round is loaded.
 * NOTE(review): the round logic compares consecutive tuples with the comparator, so it
 * presumably expects both iterators to be sorted by key — confirm against callers.
 *
 * @param leftIt iterator over the left relation
 * @param rightIt iterator over the right relation
 * @param comparator key comparator
 * @param joinType join type forwarded to the per-round join
 * @param config configuration used to size and create the backing lists
 * @return an iterator over the joined tuples
 */
public static Iterator<JoinedTuple> joinWithCache(RestorableIterator<Tuple<?, ?>> leftIt, RestorableIterator<Tuple<?, ?>> rightIt, KeyComparatorWrapper comparator, CommunicationContext.JoinType joinType, Config config) {
  LOG.info("Performing join with cache....");
  return new Iterator<JoinedTuple>() {

    // lists of finished rounds, kept so the shutdown hook can clear them
    private final List<DiskBasedList> oldLists = new ArrayList<>();

    private DiskBasedList leftList;
    private DiskBasedList rightList;

    // if we had to call next() to check the next tuple, these variables keep the tuple
    // that was read but belongs to the following round
    private Tuple leftBackup;
    private Tuple rightBackup;

    private Iterator<JoinedTuple> localJoinIterator;

    /**
     * Advances two iterators by reading onto memory
     *
     * @return true if advance() should be called again
     */
    private boolean advance() {
      // retire the lists of the previous round
      if (this.leftList != null) {
        this.leftList.dispose();
        this.oldLists.add(this.leftList);
      }

      if (this.rightList != null) {
        this.rightList.dispose();
        this.oldLists.add(this.rightList);
      }

      // each side gets half of the configured record budget
      long maxRecordsInMemory = CommunicationContext.getShuffleMaxRecordsInMemory(config) / 2;

      this.leftList = new DiskBasedList(config, MessageTypes.OBJECT);
      this.rightList = new DiskBasedList(config, MessageTypes.OBJECT);

      // the largest key accepted into this round so far
      Tuple currentTuple = null;

      // read from left iterator
      while (leftIt.hasNext() || this.leftBackup != null) {
        Tuple<?, ?> nextLeft = this.leftBackup != null ? this.leftBackup : leftIt.next();
        // we used the backup
        this.leftBackup = null;
        if (currentTuple == null) {
          currentTuple = nextLeft;
        }

        if (comparator.compare(currentTuple, nextLeft) == 0) {
          // same key: always accepted, a key is never split across rounds
          this.leftList.add(nextLeft);
        } else if (comparator.compare(currentTuple, nextLeft) < 0 && this.leftList.size() < maxRecordsInMemory) {
          // a larger key is accepted only while the budget allows
          currentTuple = nextLeft;
          this.leftList.add(nextLeft);
        } else {
          // keep the tuple for the next round
          this.leftBackup = nextLeft;
          break;
        }
      }

      // read from right iterator: everything up to and including the current key
      while (rightIt.hasNext() || this.rightBackup != null) {
        Tuple<?, ?> nextRight = this.rightBackup != null ? this.rightBackup : rightIt.next();
        this.rightBackup = null;
        if (currentTuple == null) {
          // nothing was read from the left in this round
          currentTuple = nextRight;
        }

        if (comparator.compare(currentTuple, nextRight) >= 0) {
          this.rightList.add(nextRight);
        } else {
          this.rightBackup = nextRight;
          break;
        }
      }

      this.localJoinIterator = join(new ListBasedRestorableIterator(this.leftList), new ListBasedRestorableIterator(this.rightList), comparator, joinType);

      // if this round produced nothing but there is still data left in the
      // data iterators, let's advance() again
      return !this.localJoinIterator.hasNext() && (leftBackup != null || rightBackup != null || leftIt.hasNext() || rightIt.hasNext());
    }

    // keeps loading rounds until one yields a result or the inputs are exhausted
    private void callAdvanceIt() {
      boolean shouldCall = true;
      while (shouldCall) {
        shouldCall = this.advance();
      }
    }

    {
      this.callAdvanceIt();
      // add a shutdown hook to cleanup.
      // The hook overrides run(), not start(): the JVM starts the registered hook thread
      // and the work belongs in the thread body.
      Runtime.getRuntime().addShutdownHook(new Thread() {
        @Override
        public void run() {
          LOG.info("Cleaning up disk based caches used for join...");
          for (DiskBasedList oldList : oldLists) {
            oldList.clear();
          }
        }
      });
    }

    @Override
    public boolean hasNext() {
      return this.localJoinIterator != null && this.localJoinIterator.hasNext();
    }

    @Override
    public JoinedTuple next() {
      JoinedTuple next = this.localJoinIterator.next();
      // load the following round eagerly so hasNext() stays accurate
      if (!this.localJoinIterator.hasNext()) {
        this.callAdvanceIt();
      }
      return next;
    }
  };
}
use of edu.iu.dsc.tws.api.comms.structs.Tuple in project twister2 by DSC-SPIDAL.
the class SortJoinUtils method innerJoin.
/**
 * Sort-merge inner join of two relations on the tuple key.
 * <p>
 * Both lists are sorted in place with the supplied comparator and then merged. On a key
 * match the current pair is emitted, followed by the current right tuple combined with the
 * remaining left tuples of that key, then the current left tuple combined with the
 * remaining right tuples of that key. Both cursors then move forward by one, so the
 * remaining combinations of the key are produced by the following iterations.
 *
 * @param leftRelation left relation; sorted in place
 * @param rightRelation right relation; sorted in place
 * @param comparator key comparator used for sorting and matching
 * @return the joined relation
 */
public static List<Object> innerJoin(List<Tuple> leftRelation, List<Tuple> rightRelation, KeyComparatorWrapper comparator) {
  leftRelation.sort(comparator);
  rightRelation.sort(comparator);

  final int leftSize = leftRelation.size();
  final int rightSize = rightRelation.size();
  final List<Object> joined = new ArrayList<>();

  int leftCursor = 0;
  int rightCursor = 0;
  while (leftCursor < leftSize && rightCursor < rightSize) {
    Tuple left = leftRelation.get(leftCursor);
    Tuple right = rightRelation.get(rightCursor);

    int order = comparator.compare(left, right);
    if (order < 0) {
      // left key is behind: skip it
      leftCursor++;
      continue;
    }
    if (order > 0) {
      // right key is behind: skip it
      rightCursor++;
      continue;
    }

    // keys match: the pair under the cursors
    joined.add(new JoinedTuple<>(left.getKey(), left.getValue(), right.getValue()));

    // current right tuple with the following left tuples of the same key
    for (int i = leftCursor + 1; i < leftSize; i++) {
      Tuple laterLeft = leftRelation.get(i);
      if (comparator.compare(laterLeft, right) != 0) {
        break;
      }
      joined.add(new JoinedTuple<>(laterLeft.getKey(), laterLeft.getValue(), right.getValue()));
    }

    // current left tuple with the following right tuples of the same key
    for (int j = rightCursor + 1; j < rightSize; j++) {
      Tuple laterRight = rightRelation.get(j);
      if (comparator.compare(left, laterRight) != 0) {
        break;
      }
      joined.add(new JoinedTuple<>(left.getKey(), left.getValue(), laterRight.getValue()));
    }

    leftCursor++;
    rightCursor++;
  }
  return joined;
}
Aggregations