Use of org.apache.sysml.runtime.instructions.spark.data.RowMatrixBlock in the project incubator-systemml by Apache.
Class RDDSortUtils, method sortDataByValMemSort.
/**
 * Reorders the rows of {@code data} according to the sort order of {@code val},
 * where the order-by column is gathered into a single local block, sorted in
 * memory, and the resulting index vector is broadcast for the row shuffle.
 *
 * @param val order-by column as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param data data to reorder as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param asc if true, sort ascending (handed negated to the sort-index function)
 * @param rlen number of rows (narrowed to {@code int} when collecting {@code val})
 * @param clen number of columns (not read by this method)
 * @param brlen number of rows in a block
 * @param bclen number of columns in a block
 * @param sec spark execution context, used to obtain the spark context for the broadcast
 * @param r_op reorg operator (not read by this method)
 * @return row-shuffled data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByValMemSort(JavaPairRDD<MatrixIndexes, MatrixBlock> val, JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, int brlen, int bclen, SparkExecutionContext sec, ReorgOperator r_op) throws DMLRuntimeException {
    // gather the order-by column into one local (rlen x 1) block
    MatrixBlock orderByCol = SparkExecutionContext.toMatrixBlock(val, (int) rlen, 1, brlen, bclen, -1);

    // local sort with index return: entry at target position holds the 1-based source row
    ReorgOperator sortOp = new ReorgOperator(SortIndex.getSortIndexFnObject(1, !asc, true));
    MatrixBlock srcIxAtTarget = (MatrixBlock) orderByCol.reorgOperations(sortOp, new MatrixBlock(), -1, -1, -1);

    // invert the permutation: entry at source row holds the 1-based target position
    final int numRows = srcIxAtTarget.getNumRows();
    MatrixBlock targetIxAtSrc = new MatrixBlock(numRows, 1, false);
    for (int targetRow = 0; targetRow < numRows; targetRow++) {
        int srcRow = (int) srcIxAtTarget.quickGetValue(targetRow, 0) - 1;
        targetIxAtSrc.quickSetValue(srcRow, 0, targetRow + 1);
    }

    // partition the inverted index vector and ship it to the executors
    PartitionedBlock<MatrixBlock> partitionedIx = new PartitionedBlock<MatrixBlock>(targetIxAtSrc, brlen, bclen);
    Broadcast<PartitionedBlock<MatrixBlock>> ixBroadcast = sec.getSparkContext().broadcast(partitionedIx);

    // shuffle the data rows using the broadcast index vector, then merge rows per output block
    JavaPairRDD<MatrixIndexes, RowMatrixBlock> shuffledRows = data.mapPartitionsToPair(new ShuffleMatrixBlockRowsInMemFunction(rlen, brlen, ixBroadcast));
    return RDDAggregateUtils.mergeRowsByKey(shuffledRows);
}
Use of org.apache.sysml.runtime.instructions.spark.data.RowMatrixBlock in the project incubator-systemml by Apache.
Class RDDSortUtils, method sortDataByVal.
/**
 * Reorders the rows of {@code data} according to the sort order of {@code val}
 * using a distributed sort: values are paired with their indexes, sorted by key,
 * numbered via {@code zipWithIndex}, converted back into binary blocks,
 * replicated across column blocks, and joined with {@code data} for the row shuffle.
 *
 * @param val order-by column as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param data data to reorder as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param asc sort direction flag handed to the {@code IndexComparator}
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows in a block
 * @param bclen number of columns in a block
 * @return row-shuffled data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByVal(JavaPairRDD<MatrixIndexes, MatrixBlock> val, JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, int brlen, int bclen) {
    // pair every value of the order-by column with its index
    JavaPairRDD<ValueIndexPair, Double> valueIndexPairs = val.flatMapToPair(new ExtractDoubleValuesWithIndexFunction(brlen));

    // size the sort by HDFS block size
    // NOTE(review): the factor 16 presumably models bytes per entry (value + index) - confirm
    long blockSizeBytes = InfrastructureAnalyzer.getHDFSBlockSize();
    int sortPartitions = (int) Math.ceil(((double) rlen * 16) / blockSizeBytes);

    // distributed sort (yields a sorted range per partition); only the keys are kept
    JavaRDD<ValueIndexPair> sortedPairs = valueIndexPairs
        .sortByKey(new IndexComparator(asc), true, sortPartitions)
        .keys();

    // number of column blocks the index vector has to be replicated for
    long colBlockReplicas = (long) Math.ceil((double) clen / bclen);

    // attach the sorted position to each entry, re-key via ExtractIndexFunction,
    // sort by that key and assemble binary blocks
    JavaPairRDD<ValueIndexPair, Long> rankedPairs = sortedPairs.zipWithIndex();
    JavaPairRDD<MatrixIndexes, MatrixBlock> partialIxBlocks = rankedPairs
        .mapToPair(new ExtractIndexFunction())
        .sortByKey()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction4(rlen, brlen));
    JavaPairRDD<MatrixIndexes, MatrixBlock> ixBlocks = RDDAggregateUtils.mergeByKey(partialIxBlocks, false);

    // replicate the index blocks for all column blocks
    JavaPairRDD<MatrixIndexes, MatrixBlock> replicatedIxBlocks = ixBlocks.flatMapToPair(new ReplicateVectorFunction(false, colBlockReplicas));

    // join data with the index blocks, shuffle rows, and merge rows per output block
    JavaPairRDD<MatrixIndexes, RowMatrixBlock> shuffledRows = data
        .join(replicatedIxBlocks)
        .mapPartitionsToPair(new ShuffleMatrixBlockRowsFunction(rlen, brlen));
    return RDDAggregateUtils.mergeRowsByKey(shuffledRows);
}
Aggregations