Use of edu.iu.dsc.tws.api.comms.structs.JoinedTuple in project twister2 by DSC-SPIDAL.
Class SortJoinUtils, method joinWithCache.
/**
 * Joins two key-sorted tuple streams while reading each input iterator only once.
 * Instead of scanning back and forth over the underlying files, consecutive tuples are copied
 * into a pair of {@link DiskBasedList}s (which have a memory buffer) and the regular
 * {@code join(...)} is run over those lists, one batch at a time.
 *
 * @param leftIt iterator over the left relation
 * @param rightIt iterator over the right relation
 * @param comparator comparator used to compare the keys of two tuples
 * @param joinType type of join to perform on each cached batch
 * @param config configuration, used to size the batches and to create the disk based lists
 * @return an iterator over the joined tuples
 */
public static Iterator<JoinedTuple> joinWithCache(RestorableIterator<Tuple<?, ?>> leftIt, RestorableIterator<Tuple<?, ?>> rightIt, KeyComparatorWrapper comparator, CommunicationContext.JoinType joinType, Config config) {
  LOG.info("Performing join with cache....");
  return new Iterator<JoinedTuple>() {
    // lists of already processed batches, kept so the shutdown hook can clear them
    private final List<DiskBasedList> oldLists = new ArrayList<>();
    private DiskBasedList leftList;
    private DiskBasedList rightList;
    // if we had to call next() to peek at the following tuple, these variables hold that tuple
    // until the next batch is built
    private Tuple leftBackup;
    private Tuple rightBackup;
    private Iterator<JoinedTuple> localJoinIterator;

    /**
     * Advances the two iterators by reading the next batch of tuples into the lists and
     * preparing the join iterator over that batch.
     *
     * @return true if advance() should be called again
     */
    private boolean advance() {
      if (this.leftList != null) {
        this.leftList.dispose();
        this.oldLists.add(this.leftList);
      }
      if (this.rightList != null) {
        this.rightList.dispose();
        this.oldLists.add(this.rightList);
      }
      long maxRecordsInMemory = CommunicationContext.getShuffleMaxRecordsInMemory(config) / 2;
      // previous lists have been disposed; they stay referenced from oldLists so that
      // the shutdown hook can clear them
      this.leftList = new DiskBasedList(config, MessageTypes.OBJECT);
      this.rightList = new DiskBasedList(config, MessageTypes.OBJECT);
      Tuple currentTuple = null;
      // read from left iterator
      while (leftIt.hasNext() || this.leftBackup != null) {
        Tuple<?, ?> nextLeft = this.leftBackup != null ? this.leftBackup : leftIt.next();
        // we used the backup
        this.leftBackup = null;
        if (currentTuple == null) {
          currentTuple = nextLeft;
        }
        // compare once and reuse the result for both branches
        int order = comparator.compare(currentTuple, nextLeft);
        if (order == 0) {
          // same key as the current one: always keep the whole key group together
          this.leftList.add(nextLeft);
        } else if (order < 0 && this.leftList.size() < maxRecordsInMemory) {
          // a larger key and the batch still has room: move on to the new key
          currentTuple = nextLeft;
          this.leftList.add(nextLeft);
        } else {
          // batch is full (or the key is out of order): keep the tuple for the next batch
          this.leftBackup = nextLeft;
          break;
        }
      }
      // read from right iterator: take every tuple whose key is not greater than the last
      // key accepted from the left side
      while (rightIt.hasNext() || this.rightBackup != null) {
        Tuple<?, ?> nextRight = this.rightBackup != null ? this.rightBackup : rightIt.next();
        // we used the backup
        this.rightBackup = null;
        if (currentTuple == null) {
          // nothing was read from the left side, so the first right tuple sets the key
          currentTuple = nextRight;
        }
        if (comparator.compare(currentTuple, nextRight) >= 0) {
          this.rightList.add(nextRight);
        } else {
          this.rightBackup = nextRight;
          break;
        }
      }
      this.localJoinIterator = join(new ListBasedRestorableIterator(this.leftList), new ListBasedRestorableIterator(this.rightList), comparator, joinType);
      // if this batch produced no joined tuples but there is still data left in the
      // data iterators, let's advance() again
      return !this.localJoinIterator.hasNext() && (leftBackup != null || rightBackup != null || leftIt.hasNext() || rightIt.hasNext());
    }

    /**
     * Calls advance() until a batch yields joined tuples or the inputs are exhausted.
     */
    private void callAdvanceIt() {
      boolean shouldCall = true;
      while (shouldCall) {
        shouldCall = this.advance();
      }
    }

    {
      this.callAdvanceIt();
      // add a shutdown hook to cleanup
      Runtime.getRuntime().addShutdownHook(new Thread() {
        // the cleanup belongs in run(): the JVM starts the registered hook thread at shutdown,
        // overriding start() would prevent the thread from ever being started
        @Override
        public void run() {
          LOG.info("Cleaning up disk based caches used for join...");
          for (DiskBasedList oldList : oldLists) {
            oldList.clear();
          }
        }
      });
    }

    @Override
    public boolean hasNext() {
      return this.localJoinIterator != null && this.localJoinIterator.hasNext();
    }

    @Override
    public JoinedTuple next() {
      JoinedTuple next = this.localJoinIterator.next();
      // eagerly load the next batch so that hasNext() stays accurate
      if (!this.localJoinIterator.hasNext()) {
        this.callAdvanceIt();
      }
      return next;
    }
  };
}
Use of edu.iu.dsc.tws.api.comms.structs.JoinedTuple in project twister2 by DSC-SPIDAL.
Class SortJoinUtils, method innerJoin.
/**
 * Computes the inner join of two relations on the tuple key with a sort-merge pass.
 * Both input lists are sorted in place using the given comparator before merging.
 *
 * @param leftRelation left relation
 * @param rightRelation right relation
 * @param comparator comparator used both for sorting and for matching keys
 * @return the joined relation, as a list of {@link JoinedTuple}s
 */
public static List<Object> innerJoin(List<Tuple> leftRelation, List<Tuple> rightRelation, KeyComparatorWrapper comparator) {
  leftRelation.sort(comparator);
  rightRelation.sort(comparator);
  final List<Object> joined = new ArrayList<>();
  int leftPos = 0;
  int rightPos = 0;
  while (leftPos < leftRelation.size() && rightPos < rightRelation.size()) {
    final Tuple leftTuple = leftRelation.get(leftPos);
    final Tuple rightTuple = rightRelation.get(rightPos);
    final int order = comparator.compare(leftTuple, rightTuple);
    if (order < 0) {
      // left key is smaller, no partner for it on the right
      leftPos++;
      continue;
    }
    if (order > 0) {
      // right key is smaller, no partner for it on the left
      rightPos++;
      continue;
    }
    // keys match: emit the pair at the current positions
    joined.add(new JoinedTuple<>(leftTuple.getKey(), leftTuple.getValue(), rightTuple.getValue()));
    // pair the current right tuple with the following left tuples that share the key
    for (int i = leftPos + 1; i < leftRelation.size(); i++) {
      final Tuple candidate = leftRelation.get(i);
      if (comparator.compare(candidate, rightTuple) != 0) {
        break;
      }
      joined.add(new JoinedTuple<>(candidate.getKey(), candidate.getValue(), rightTuple.getValue()));
    }
    // pair the current left tuple with the following right tuples that share the key
    for (int j = rightPos + 1; j < rightRelation.size(); j++) {
      final Tuple candidate = rightRelation.get(j);
      if (comparator.compare(leftTuple, candidate) != 0) {
        break;
      }
      joined.add(new JoinedTuple<>(leftTuple.getKey(), leftTuple.getValue(), candidate.getValue()));
    }
    // both current tuples are fully handled; remaining combinations of this key
    // are produced by the following iterations
    leftPos++;
    rightPos++;
  }
  return joined;
}
Use of edu.iu.dsc.tws.api.comms.structs.JoinedTuple in project twister2 by DSC-SPIDAL.
Class HashJoinUtilsTest, method innerJoinDiskTest.
/**
 * Inner joins two disk backed relations that hold the same 1000 integer keys (in different
 * shuffled orders) and checks that every key is joined with left value 1 and right value 2.
 */
@Test
public void innerJoinDiskTest() {
  int noOfTuples = 1000;
  List<Integer> keys1 = new ArrayList<>();
  List<Integer> keys2 = new ArrayList<>();
  for (int i = 0; i < noOfTuples; i++) {
    keys1.add(i);
    keys2.add(i);
  }
  Collections.shuffle(keys1);
  Collections.shuffle(keys2);
  FSKeyedMerger fsMerger1 = new FSKeyedMerger(0, 0, "/tmp", "op-left", MessageTypes.INTEGER, MessageTypes.INTEGER);
  FSKeyedMerger fsMerger2 = new FSKeyedMerger(0, 0, "/tmp", "op-right", MessageTypes.INTEGER, MessageTypes.INTEGER);
  try {
    // serialized values: every left tuple carries the int 1, every right tuple the int 2
    byte[] leftValueBytes = ByteBuffer.wrap(new byte[4]).putInt(1).array();
    byte[] rightValueBytes = ByteBuffer.wrap(new byte[4]).putInt(2).array();
    for (int i = 0; i < noOfTuples; i++) {
      fsMerger1.add(keys1.get(i), leftValueBytes, Integer.BYTES);
      fsMerger2.add(keys2.get(i), rightValueBytes, Integer.BYTES);
      fsMerger1.run();
      fsMerger2.run();
    }
    fsMerger1.switchToReading();
    fsMerger2.switchToReading();
    ResettableIterator it1 = fsMerger1.readIterator();
    ResettableIterator it2 = fsMerger2.readIterator();
    Iterator<JoinedTuple> iterator = HashJoinUtils.innerJoin(it1, it2, MessageTypes.INTEGER);
    Set<Integer> keysReceived = new HashSet<>();
    while (iterator.hasNext()) {
      JoinedTuple joinedTuple = iterator.next();
      Assert.assertEquals(1, joinedTuple.getLeftValue());
      Assert.assertEquals(2, joinedTuple.getRightValue());
      keysReceived.add((Integer) joinedTuple.getKey());
    }
    Assert.assertEquals(noOfTuples, keysReceived.size());
  } finally {
    // clean up both mergers even when an assertion above fails
    try {
      fsMerger1.clean();
    } finally {
      fsMerger2.clean();
    }
  }
}
Use of edu.iu.dsc.tws.api.comms.structs.JoinedTuple in project twister2 by DSC-SPIDAL.
Class HashJoinUtilsTest, method rightJoinDiskTest.
/**
 * Right joins two disk backed relations: the left relation holds all 1000 integer keys, the
 * right relation a random subset of them. Checks that every right key shows up with right
 * value 2, and with left value 1 whenever the key exists on the left side.
 */
@Test
public void rightJoinDiskTest() {
  int noOfTuples = 1000;
  Random random = new Random(System.currentTimeMillis());
  List<Integer> keys1 = new ArrayList<>();
  List<Integer> keys2 = new ArrayList<>();
  for (int i = 0; i < noOfTuples; i++) {
    keys1.add(i);
    // the right relation only gets a random subset of the keys
    if (random.nextBoolean()) {
      keys2.add(i);
    }
  }
  Collections.shuffle(keys1);
  Collections.shuffle(keys2);
  FSKeyedMerger fsMerger1 = new FSKeyedMerger(0, 0, "/tmp", "op-left", MessageTypes.INTEGER, MessageTypes.INTEGER);
  FSKeyedMerger fsMerger2 = new FSKeyedMerger(0, 0, "/tmp", "op-right", MessageTypes.INTEGER, MessageTypes.INTEGER);
  try {
    // serialized values: every left tuple carries the int 1, every right tuple the int 2
    byte[] leftValueBytes = ByteBuffer.wrap(new byte[4]).putInt(1).array();
    byte[] rightValueBytes = ByteBuffer.wrap(new byte[4]).putInt(2).array();
    for (int i = 0; i < keys1.size(); i++) {
      fsMerger1.add(keys1.get(i), leftValueBytes, Integer.BYTES);
      fsMerger1.run();
    }
    for (int i = 0; i < keys2.size(); i++) {
      fsMerger2.add(keys2.get(i), rightValueBytes, Integer.BYTES);
      fsMerger2.run();
    }
    fsMerger1.switchToReading();
    fsMerger2.switchToReading();
    ResettableIterator it1 = fsMerger1.readIterator();
    ResettableIterator it2 = fsMerger2.readIterator();
    Iterator<JoinedTuple> iterator = HashJoinUtils.rightJoin(it1, it2, MessageTypes.INTEGER);
    Set<Integer> keysReceived = new HashSet<>();
    Set<Integer> leftKeyLookup = new HashSet<>(keys1);
    while (iterator.hasNext()) {
      JoinedTuple joinedTuple = iterator.next();
      Assert.assertEquals(2, joinedTuple.getRightValue());
      if (leftKeyLookup.contains(joinedTuple.getKey())) {
        Assert.assertEquals(1, joinedTuple.getLeftValue());
      } else {
        Assert.assertNull(joinedTuple.getLeftValue());
      }
      keysReceived.add((Integer) joinedTuple.getKey());
    }
    Assert.assertEquals(keys2.size(), keysReceived.size());
  } finally {
    // clean up both mergers even when an assertion above fails
    try {
      fsMerger1.clean();
    } finally {
      fsMerger2.clean();
    }
  }
}
Use of edu.iu.dsc.tws.api.comms.structs.JoinedTuple in project twister2 by DSC-SPIDAL.
Class SortJoinUtilsTest, method getFullOuterJoined.
/**
 * Builds the fixture of joined tuples used as the expected result of a full outer join.
 *
 * @return a new mutable list holding the seven expected {@link JoinedTuple}s
 */
private List<Object> getFullOuterJoined() {
  final List<Object> fullOuterJoined = new ArrayList<>();
  fullOuterJoined.add(new JoinedTuple(34, "Robinson", "Clerical"));
  fullOuterJoined.add(new JoinedTuple(33, "Jones", "Engineering"));
  fullOuterJoined.add(new JoinedTuple(34, "Smith", "Clerical"));
  // row with a null key and no right value
  fullOuterJoined.add(new JoinedTuple(null, "Williams", null));
  fullOuterJoined.add(new JoinedTuple(33, "Heisenberg", "Engineering"));
  fullOuterJoined.add(new JoinedTuple(31, "Rafferty", "Sales"));
  // row with no left value
  fullOuterJoined.add(new JoinedTuple(35, null, "Marketing"));
  return fullOuterJoined;
}
Aggregations