use of edu.iu.dsc.tws.comms.shuffle.RestorableIterator in project twister2 by DSC-SPIDAL.
the class SortJoinUtils method outerJoin.
/**
 * This util can be used to perform disk based outer join operations.
 *
 * <p>The merge below only ever advances the side holding the smaller key, so both iterators
 * are assumed to deliver their tuples in {@code comparator} order. A tuple whose key has no
 * partner on the other side is emitted with {@code null} in place of the missing value, but
 * only for the side(s) enabled by {@code outerJoinType}. This also covers the tuples that
 * remain on one side after the other iterator has run dry.
 *
 * <p>NOTE(review): no restore points are used here, so within a run of equal keys every left
 * tuple is paired with the first right tuple, and every further right tuple is paired with the
 * first left tuple only. Confirm whether callers guarantee unique keys on at least one side.
 *
 * @param leftIt tuples of the left relation
 * @param rightIt tuples of the right relation
 * @param comparator ordering used to decide whether a left and a right tuple match
 * @param outerJoinType selects which unmatched side(s) are emitted
 * @return an iterator that produces the joined tuples lazily; its {@code next()} returns
 * {@code null} once {@code hasNext()} is {@code false}
 */
public static Iterator<JoinedTuple> outerJoin(RestorableIterator<Tuple<?, ?>> leftIt, RestorableIterator<Tuple<?, ?>> rightIt, KeyComparatorWrapper comparator, CommunicationContext.JoinType outerJoinType) {
  return new Iterator<JoinedTuple>() {

    // the tuple that the next call to next() will hand out, null when exhausted
    private JoinedTuple nextJoinTuple;

    // the pair that produced the most recent match in the merge loop
    private Tuple currentLeft;
    private Tuple currentRight;

    // backup variables hold a tuple that has been read from an iterator
    // but has not been matched or emitted yet
    private Tuple backedUpLeft;
    private Tuple backedUpRight;

    // flags to mark the required side of iteration
    private boolean shouldDoLeftIterations = false;
    private boolean shouldDoRightIterations = false;

    /**
     * Pairs the next left tuple with the current right tuple while their keys are equal.
     *
     * @return the joined tuple, or null when the left run has ended or is not scheduled
     */
    private JoinedTuple doLeftIteration() {
      if (!shouldDoLeftIterations) {
        return null;
      }
      JoinedTuple jtFromLeftIt = null;
      if (leftIt.hasNext()) {
        Tuple l = leftIt.next();
        if (comparator.compare(l, this.currentRight) == 0) {
          jtFromLeftIt = new JoinedTuple<>(l.getKey(), l.getValue(), this.currentRight.getValue());
        } else {
          // not part of this run, keep it for the merge loop
          this.backedUpLeft = l;
        }
      }
      // end of the left run: hand over to the right iterations
      if (jtFromLeftIt == null) {
        this.shouldDoLeftIterations = false;
        this.shouldDoRightIterations = true;
      }
      return jtFromLeftIt;
    }

    /**
     * Pairs the current left tuple with the next right tuple while their keys are equal.
     *
     * @return the joined tuple, or null when the right run has ended or is not scheduled
     */
    private JoinedTuple doRightIteration() {
      if (!shouldDoRightIterations) {
        return null;
      }
      JoinedTuple jtFromRightIt = null;
      if (rightIt.hasNext()) {
        Tuple r = rightIt.next();
        if (comparator.compare(this.currentLeft, r) == 0) {
          jtFromRightIt = new JoinedTuple<>(r.getKey(), this.currentLeft.getValue(), r.getValue());
        } else {
          // not part of this run, keep it for the merge loop
          this.backedUpRight = r;
        }
      }
      // end of the right run: the merge loop takes over again
      if (jtFromRightIt == null) {
        this.shouldDoRightIterations = false;
      }
      return jtFromRightIt;
    }

    /**
     * Emits one leftover tuple of a side that is included in the join. Only called once the
     * merge loop has stopped without a result, i.e. when at least one side is exhausted, so
     * whatever remains on the other side can no longer find a partner.
     *
     * @return the joined tuple with null for the missing side, or null if nothing is left
     */
    private JoinedTuple drainUnmatched() {
      if (outerJoinType.includeLeft() && (this.backedUpLeft != null || leftIt.hasNext())) {
        Tuple left = this.backedUpLeft != null ? this.backedUpLeft : leftIt.next();
        this.backedUpLeft = null;
        return new JoinedTuple<>(left.getKey(), left.getValue(), null);
      }
      if (outerJoinType.includeRight() && (this.backedUpRight != null || rightIt.hasNext())) {
        Tuple right = this.backedUpRight != null ? this.backedUpRight : rightIt.next();
        this.backedUpRight = null;
        return new JoinedTuple<>(right.getKey(), null, right.getValue());
      }
      return null;
    }

    private void makeNextJoinTuple() {
      nextJoinTuple = this.doLeftIteration();
      if (nextJoinTuple == null) {
        nextJoinTuple = this.doRightIteration();
      }
      while (nextJoinTuple == null && (this.backedUpLeft != null || leftIt.hasNext()) && (this.backedUpRight != null || rightIt.hasNext())) {
        this.currentLeft = this.backedUpLeft != null ? this.backedUpLeft : leftIt.next();
        // we used the backup, so setting to null
        this.backedUpLeft = null;
        this.currentRight = this.backedUpRight != null ? this.backedUpRight : rightIt.next();
        this.backedUpRight = null;
        // still we don't need left or right iterations at this point
        this.shouldDoLeftIterations = false;
        this.shouldDoRightIterations = false;
        int order = comparator.compare(this.currentLeft, this.currentRight);
        if (order == 0) {
          this.nextJoinTuple = new JoinedTuple<>(this.currentLeft.getKey(), this.currentLeft.getValue(), this.currentRight.getValue());
          // schedule to run the left iteration next.
          // Left iteration at the end will schedule right iteration
          this.shouldDoLeftIterations = true;
          break;
        } else if (order < 0) {
          // left key is smaller: it has no partner on the right
          if (outerJoinType.includeLeft()) {
            this.nextJoinTuple = new JoinedTuple<>(this.currentLeft.getKey(), this.currentLeft.getValue(), null);
          }
          if (leftIt.hasNext()) {
            this.backedUpLeft = leftIt.next();
          }
          this.backedUpRight = this.currentRight;
        } else {
          // right key is smaller: it has no partner on the left
          if (outerJoinType.includeRight()) {
            this.nextJoinTuple = new JoinedTuple<>(this.currentRight.getKey(), null, this.currentRight.getValue());
          }
          if (rightIt.hasNext()) {
            this.backedUpRight = rightIt.next();
          }
          this.backedUpLeft = this.currentLeft;
        }
      }
      // one side has run dry; what remains on the other side is unmatched by definition
      if (nextJoinTuple == null) {
        nextJoinTuple = this.drainUnmatched();
      }
    }

    {
      // start by creating the first join tuple
      this.makeNextJoinTuple();
    }

    @Override
    public boolean hasNext() {
      return nextJoinTuple != null;
    }

    @Override
    public JoinedTuple next() {
      JoinedTuple current = nextJoinTuple;
      this.makeNextJoinTuple();
      return current;
    }
  };
}
use of edu.iu.dsc.tws.comms.shuffle.RestorableIterator in project twister2 by DSC-SPIDAL.
the class SortJoinUtils method innerJoin.
/**
 * This util can be used to perform disk based inner join operations.
 *
 * <p>The merge below only ever advances the side holding the smaller key, so both iterators
 * are assumed to deliver their tuples in {@code comparator} order. For a run of equal keys
 * the left iterator is scanned against the current right tuple, then the right iterator is
 * scanned against the current left tuple; restore points rewind each iterator afterwards so
 * the remaining pairs of the run are produced by the following rounds.
 *
 * @param leftIt tuples of the left relation
 * @param rightIt tuples of the right relation
 * @param comparator ordering used to decide whether a left and a right tuple match
 * @return an iterator that produces the joined tuples lazily; its {@code next()} returns
 * {@code null} once {@code hasNext()} is {@code false}
 */
public static Iterator<JoinedTuple> innerJoin(RestorableIterator<Tuple<?, ?>> leftIt, RestorableIterator<Tuple<?, ?>> rightIt, KeyComparatorWrapper comparator) {
  return new Iterator<JoinedTuple>() {

    // the tuple that the next call to next() will hand out, null when exhausted
    private JoinedTuple nextJoinTuple;

    // the pair that produced the most recent match in the merge loop
    private Tuple currentLeft;
    private Tuple currentRight;

    // backup variables will hold a Tuple temporary if had to call .next()
    // once during the join operation before creating a iterator restore point.
    private Tuple backedUpLeft;
    private Tuple backedUpRight;

    // flags to mark the required side of iteration
    private boolean shouldDoLeftIterations = false;
    private boolean shouldDoRightIterations = false;

    // keeps the no of iterations done on each side of the relationship while keeping the
    // other side constant
    private int leftIterations = 0;
    private int rightIterations = 0;

    /**
     * Pairs the next left tuple with the current right tuple while their keys are equal.
     *
     * @return the joined tuple, or null when the left run has ended or is not scheduled
     */
    private JoinedTuple doLeftIteration() {
      if (!shouldDoLeftIterations) {
        return null;
      }
      JoinedTuple jtFromLeftIt = null;
      if (leftIt.hasNext()) {
        Tuple l = leftIt.next();
        // the first tuple read in a run becomes the next left candidate of the merge loop
        if (this.leftIterations == 0) {
          this.backedUpLeft = l;
        }
        if (comparator.compare(l, this.currentRight) == 0) {
          // NOTE(review): presumably restore() rewinds to just behind this first tuple,
          // confirm against RestorableIterator
          if (this.leftIterations == 0) {
            leftIt.createRestorePoint();
          }
          this.leftIterations++;
          jtFromLeftIt = new JoinedTuple<>(l.getKey(), l.getValue(), this.currentRight.getValue());
        }
      }
      // end of the left run: schedule the right iterations and rewind the left iterator
      if (jtFromLeftIt == null) {
        this.leftIterations = 0;
        this.shouldDoLeftIterations = false;
        this.shouldDoRightIterations = true;
        if (leftIt.hasRestorePoint()) {
          leftIt.restore();
          leftIt.clearRestorePoint();
        }
      }
      return jtFromLeftIt;
    }

    /**
     * Pairs the current left tuple with the next right tuple while their keys are equal.
     *
     * @return the joined tuple, or null when the right run has ended or is not scheduled
     */
    private JoinedTuple doRightIteration() {
      if (!shouldDoRightIterations) {
        return null;
      }
      JoinedTuple jtFromRightIt = null;
      if (rightIt.hasNext()) {
        Tuple r = rightIt.next();
        // the first tuple read in a run becomes the next right candidate of the merge loop
        if (this.rightIterations == 0) {
          this.backedUpRight = r;
        }
        if (comparator.compare(this.currentLeft, r) == 0) {
          if (this.rightIterations == 0) {
            rightIt.createRestorePoint();
          }
          this.rightIterations++;
          jtFromRightIt = new JoinedTuple<>(r.getKey(), this.currentLeft.getValue(), r.getValue());
        }
      }
      // end of the right run: stop the right iterations and rewind the right iterator,
      // the merge loop takes over again
      if (jtFromRightIt == null) {
        this.rightIterations = 0;
        this.shouldDoRightIterations = false;
        if (rightIt.hasRestorePoint()) {
          rightIt.restore();
          rightIt.clearRestorePoint();
        }
      }
      return jtFromRightIt;
    }

    private void makeNextJoinTuple() {
      nextJoinTuple = this.doLeftIteration();
      if (nextJoinTuple == null) {
        nextJoinTuple = this.doRightIteration();
      }
      while (nextJoinTuple == null && (this.backedUpLeft != null || leftIt.hasNext()) && (this.backedUpRight != null || rightIt.hasNext())) {
        this.currentLeft = this.backedUpLeft != null ? this.backedUpLeft : leftIt.next();
        // we used the backup, so setting to null
        this.backedUpLeft = null;
        this.currentRight = this.backedUpRight != null ? this.backedUpRight : rightIt.next();
        this.backedUpRight = null;
        // still we don't need left or right iterations at this point
        this.shouldDoLeftIterations = false;
        this.shouldDoRightIterations = false;
        // compare once and branch on the sign
        int order = comparator.compare(this.currentLeft, this.currentRight);
        if (order == 0) {
          this.nextJoinTuple = new JoinedTuple<>(this.currentLeft.getKey(), this.currentLeft.getValue(), this.currentRight.getValue());
          // schedule to run the left iteration next.
          // Left iteration at the end will schedule right iteration
          this.shouldDoLeftIterations = true;
          break;
        } else if (order < 0) {
          // left key is smaller: move the left side forward, keep the right tuple
          if (leftIt.hasNext()) {
            this.backedUpLeft = leftIt.next();
          }
          this.backedUpRight = this.currentRight;
        } else {
          // right key is smaller: move the right side forward, keep the left tuple
          if (rightIt.hasNext()) {
            this.backedUpRight = rightIt.next();
          }
          this.backedUpLeft = this.currentLeft;
        }
      }
    }

    {
      // start by creating the first join tuple
      this.makeNextJoinTuple();
    }

    @Override
    public boolean hasNext() {
      return nextJoinTuple != null;
    }

    @Override
    public JoinedTuple next() {
      JoinedTuple current = nextJoinTuple;
      this.makeNextJoinTuple();
      return current;
    }
  };
}
use of edu.iu.dsc.tws.comms.shuffle.RestorableIterator in project twister2 by DSC-SPIDAL.
the class SortJoinUtilsTest method innerJoinWithDiskBasedListComparision.
/**
 * This test compares the results of in memory and disk based inner joins.
 * Purpose is to verify the accuracy of disk based inner join.
 * Only the keys and the number of joined tuples are compared. The seed of the random input
 * is part of every failure message so that a failing run can be replayed.
 */
@Test
public void innerJoinWithDiskBasedListComparision() {
  List<Tuple> left = new ArrayList<>();
  List<Tuple> right = new ArrayList<>();
  long seed = System.nanoTime();
  Random random = new Random(seed);
  for (int i = 0; i < 100; i++) {
    left.add(Tuple.of(random.nextInt(10), random.nextInt()));
    right.add(Tuple.of(random.nextInt(10), random.nextInt()));
  }
  // write the left relation through the disk based sorted merger
  FSKeyedSortedMerger2 fsk1 = new FSKeyedSortedMerger2(10, 100, "/tmp", "op-1-" + UUID.randomUUID().toString(), MessageTypes.INTEGER, MessageTypes.INTEGER, (Comparator<Integer>) Integer::compare, 0, false, 1);
  for (Tuple tuple : left) {
    byte[] data = MessageTypes.INTEGER.getDataPacker().packToByteArray((Integer) tuple.getValue());
    fsk1.add(tuple.getKey(), data, data.length);
    fsk1.run();
  }
  // write the right relation through a second merger
  FSKeyedSortedMerger2 fsk2 = new FSKeyedSortedMerger2(10, 100, "/tmp", "op-2-" + UUID.randomUUID().toString(), MessageTypes.INTEGER, MessageTypes.INTEGER, (Comparator<Integer>) Integer::compare, 0, false, 1);
  for (Tuple tuple : right) {
    byte[] data = MessageTypes.INTEGER.getDataPacker().packToByteArray((Integer) tuple.getValue());
    fsk2.add(tuple.getKey(), data, data.length);
    fsk2.run();
  }
  CommonThreadPool.init(Config.newBuilder().build());
  fsk1.switchToReading();
  fsk2.switchToReading();
  // disk based join under test
  Iterator iterator = SortJoinUtils.joinWithCache((RestorableIterator) fsk1.readIterator(), (RestorableIterator) fsk2.readIterator(), new KeyComparatorWrapper((Comparator<Integer>) Integer::compare), CommunicationContext.JoinType.INNER, Config.newBuilder().build());
  // in memory join used as the reference, ordered by key like the disk based result
  List<Object> objects = SortJoinUtils.innerJoin(left, right, new KeyComparatorWrapper(Comparator.naturalOrder()));
  objects.sort(Comparator.comparingInt(o -> (Integer) ((JoinedTuple) o).getKey()));
  int i = 0;
  while (iterator.hasNext()) {
    JoinedTuple nextFromIt = (JoinedTuple) iterator.next();
    // fail with a readable message instead of an IndexOutOfBoundsException
    Assert.assertTrue("disk based join produced more than the " + objects.size() + " expected tuples, seed=" + seed, i < objects.size());
    JoinedTuple nextFromList = (JoinedTuple) objects.get(i);
    Assert.assertEquals("key mismatch at index " + i + ", seed=" + seed, nextFromList.getKey(), nextFromIt.getKey());
    i++;
  }
  Assert.assertEquals("number of joined tuples, seed=" + seed, objects.size(), i);
}
use of edu.iu.dsc.tws.comms.shuffle.RestorableIterator in project twister2 by DSC-SPIDAL.
the class SortJoinUtilsTest method leftOuterJoinComparision.
/**
 * This test compares the results of in memory and disk based left outer joins.
 * Purpose is to verify the accuracy of disk based left outer join.
 * Only the keys and the number of joined tuples are compared. The seed of the random input
 * is part of every failure message so that a failing run can be replayed.
 */
@Test
public void leftOuterJoinComparision() {
  List<Tuple> left = new ArrayList<>();
  List<Tuple> right = new ArrayList<>();
  long seed = System.nanoTime();
  Random random = new Random(seed);
  for (int i = 0; i < 100; i++) {
    left.add(Tuple.of(random.nextInt(10), random.nextInt()));
    right.add(Tuple.of(random.nextInt(10), random.nextInt()));
  }
  // write the left relation through the disk based sorted merger
  FSKeyedSortedMerger2 fsk1 = new FSKeyedSortedMerger2(10, 100, "/tmp", "op-1-" + UUID.randomUUID().toString(), MessageTypes.INTEGER, MessageTypes.INTEGER, (Comparator<Integer>) Integer::compare, 0, false, 1);
  for (Tuple tuple : left) {
    byte[] data = MessageTypes.INTEGER.getDataPacker().packToByteArray((Integer) tuple.getValue());
    fsk1.add(tuple.getKey(), data, data.length);
    fsk1.run();
  }
  // write the right relation through a second merger
  FSKeyedSortedMerger2 fsk2 = new FSKeyedSortedMerger2(10, 100, "/tmp", "op-2-" + UUID.randomUUID().toString(), MessageTypes.INTEGER, MessageTypes.INTEGER, (Comparator<Integer>) Integer::compare, 0, false, 1);
  for (Tuple tuple : right) {
    byte[] data = MessageTypes.INTEGER.getDataPacker().packToByteArray((Integer) tuple.getValue());
    fsk2.add(tuple.getKey(), data, data.length);
    fsk2.run();
  }
  CommonThreadPool.init(Config.newBuilder().build());
  fsk1.switchToReading();
  fsk2.switchToReading();
  // disk based join under test
  Iterator iterator = SortJoinUtils.leftOuterJoin((RestorableIterator) fsk1.readIterator(), (RestorableIterator) fsk2.readIterator(), new KeyComparatorWrapper((Comparator<Integer>) Integer::compare));
  // in memory join used as the reference, ordered by key like the disk based result
  List<Object> objects = SortJoinUtils.leftOuterJoin(left, right, new KeyComparatorWrapper(Comparator.naturalOrder()));
  objects.sort(Comparator.comparingInt(o -> (Integer) ((JoinedTuple) o).getKey()));
  int i = 0;
  while (iterator.hasNext()) {
    JoinedTuple nextFromIt = (JoinedTuple) iterator.next();
    // fail with a readable message instead of an IndexOutOfBoundsException
    Assert.assertTrue("disk based join produced more than the " + objects.size() + " expected tuples, seed=" + seed, i < objects.size());
    JoinedTuple nextFromList = (JoinedTuple) objects.get(i);
    Assert.assertEquals("key mismatch at index " + i + ", seed=" + seed, nextFromList.getKey(), nextFromIt.getKey());
    i++;
  }
  Assert.assertEquals("number of joined tuples, seed=" + seed, objects.size(), i);
}
use of edu.iu.dsc.tws.comms.shuffle.RestorableIterator in project twister2 by DSC-SPIDAL.
the class SortJoinUtils method joinWithCache.
/**
 * This method avoids having to scan back and forth over the files by reading the data
 * iterators once and backing the tuples up into {@link DiskBasedList}s, which have a memory
 * buffer. The input is consumed in chunks; each chunk is joined with {@code join(...)} and the
 * returned iterator moves on to the next chunk whenever the current one is used up.
 *
 * <p>A JVM shutdown hook is registered per call to clear the lists of the chunks that have
 * already been replaced.
 *
 * @param leftIt tuples of the left relation
 * @param rightIt tuples of the right relation
 * @param comparator ordering used to group and match tuples
 * @param joinType type of join handed over to {@code join(...)}
 * @param config source of the in-memory record limit and of the list configuration
 * @return an iterator over the joined tuples of all chunks
 */
public static Iterator<JoinedTuple> joinWithCache(RestorableIterator<Tuple<?, ?>> leftIt, RestorableIterator<Tuple<?, ?>> rightIt, KeyComparatorWrapper comparator, CommunicationContext.JoinType joinType, Config config) {
  LOG.info("Performing join with cache....");
  return new Iterator<JoinedTuple>() {

    // lists of finished chunks, kept so that the shutdown hook can clear them
    private final List<DiskBasedList> oldLists = new ArrayList<>();

    private DiskBasedList leftList;
    private DiskBasedList rightList;

    // if we had to keep next() to check the next tuple, these variables can be used to keep them
    private Tuple leftBackup;
    private Tuple rightBackup;

    // join over the chunk that is currently loaded
    private Iterator<JoinedTuple> localJoinIterator;

    /**
     * Advances two iterators by reading onto memory
     *
     * @return true if advance() should be called again
     */
    private boolean advance() {
      if (this.leftList != null) {
        this.leftList.dispose();
        this.oldLists.add(this.leftList);
      }
      if (this.rightList != null) {
        this.rightList.dispose();
        this.oldLists.add(this.rightList);
      }
      // half of the configured limit is used as the cap for the left chunk
      long maxRecordsInMemory = CommunicationContext.getShuffleMaxRecordsInMemory(config) / 2;
      this.leftList = new DiskBasedList(config, MessageTypes.OBJECT);
      this.rightList = new DiskBasedList(config, MessageTypes.OBJECT);
      // tuple carrying the largest key accepted into the left chunk so far
      Tuple currentTuple = null;
      // read from left iterator
      while (leftIt.hasNext() || this.leftBackup != null) {
        Tuple<?, ?> nextLeft = this.leftBackup != null ? this.leftBackup : leftIt.next();
        // we used the backup
        this.leftBackup = null;
        if (currentTuple == null) {
          currentTuple = nextLeft;
        }
        int order = comparator.compare(currentTuple, nextLeft);
        if (order == 0) {
          // same key as before: always accepted, a key is never split over two chunks
          this.leftList.add(nextLeft);
        } else if (order < 0 && this.leftList.size() < maxRecordsInMemory) {
          // a larger key is only started while the chunk is below the limit
          currentTuple = nextLeft;
          this.leftList.add(nextLeft);
        } else {
          this.leftBackup = nextLeft;
          break;
        }
      }
      // read from right iterator, everything up to the last key of the left chunk
      while (rightIt.hasNext() || this.rightBackup != null) {
        Tuple<?, ?> nextRight = this.rightBackup != null ? this.rightBackup : rightIt.next();
        this.rightBackup = null;
        if (currentTuple == null) {
          currentTuple = nextRight;
        }
        if (comparator.compare(currentTuple, nextRight) >= 0) {
          this.rightList.add(nextRight);
        } else {
          this.rightBackup = nextRight;
          break;
        }
      }
      this.localJoinIterator = join(new ListBasedRestorableIterator(this.leftList), new ListBasedRestorableIterator(this.rightList), comparator, joinType);
      // if this chunk produced no joined tuples but there is more input in the
      // data iterators, let's advance() again
      return !this.localJoinIterator.hasNext() && (leftBackup != null || rightBackup != null || leftIt.hasNext() || rightIt.hasNext());
    }

    // loads chunks until one yields joined tuples or the input is used up
    private void callAdvanceIt() {
      boolean shouldCall = true;
      while (shouldCall) {
        shouldCall = this.advance();
      }
    }

    {
      this.callAdvanceIt();
      // add a shutdown hook to cleanup
      // the work lives in run() so that the JVM starts it as a regular hook thread
      Runtime.getRuntime().addShutdownHook(new Thread() {
        @Override
        public void run() {
          LOG.info("Cleaning up disk based caches used for join...");
          for (DiskBasedList oldList : oldLists) {
            oldList.clear();
          }
        }
      });
    }

    @Override
    public boolean hasNext() {
      return this.localJoinIterator != null && this.localJoinIterator.hasNext();
    }

    @Override
    public JoinedTuple next() {
      JoinedTuple next = this.localJoinIterator.next();
      // current chunk used up: load the next one before the caller asks hasNext()
      if (!this.localJoinIterator.hasNext()) {
        this.callAdvanceIt();
      }
      return next;
    }
  };
}
Aggregations