Search in sources :

Example 66 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class RDDSortUtils method sortDataByValMemSort.

/**
 * Sorts the rows of {@code data} by a single order-by column, doing the sort itself in memory.
 * <p>
 * The order-by column {@code val} is collected into one local matrix block, sorted locally with
 * index return, inverted into a "target index per source row" vector, and that vector is then
 * broadcast and used to shuffle the rows of {@code data}.
 * <p>
 * Note: {@code clen} and {@code r_op} are not read by this method body.
 *
 * @param val order-by column as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param data data to be reordered as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param asc if true, sort ascending
 * @param rlen number of rows (narrowed to {@code int} when collecting {@code val})
 * @param clen number of columns
 * @param brlen number of rows in a block
 * @param bclen number of columns in a block
 * @param sec spark execution context (provides the spark context used for the broadcast)
 * @param r_op reorg operator
 * @return reordered data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByValMemSort(JavaPairRDD<MatrixIndexes, MatrixBlock> val, JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, int brlen, int bclen, SparkExecutionContext sec, ReorgOperator r_op) {
    // pull the order-by column into a single local (rlen x 1) block
    MatrixBlock localVals = SparkExecutionContext.toMatrixBlock(val, (int) rlen, 1, brlen, bclen, -1);

    // local sort with index return: entry i holds the source index that lands at target position i
    ReorgOperator sortOp = new ReorgOperator(new SortIndex(1, !asc, true));
    MatrixBlock srcAtTarget = (MatrixBlock) localVals.reorgOperations(sortOp, new MatrixBlock(), -1, -1, -1);

    // invert the permutation: entry (source - 1) receives the target position i + 1
    // (the stored indices are treated as 1-based, hence the -1 / +1 adjustments)
    int numRows = srcAtTarget.getNumRows();
    MatrixBlock targetAtSrc = new MatrixBlock(numRows, 1, false);
    for (int i = 0; i < numRows; i++) {
        int srcIx = (int) srcAtTarget.quickGetValue(i, 0);
        targetAtSrc.quickSetValue(srcIx - 1, 0, i + 1);
    }

    // partition the index vector and broadcast it
    PartitionedBlock<MatrixBlock> partitionedIx = new PartitionedBlock<>(targetAtSrc, brlen, bclen);
    Broadcast<PartitionedBlock<MatrixBlock>> bcastIx = sec.getSparkContext().broadcast(partitionedIx);

    // shuffle the data rows according to the broadcast index vector, then merge rows per key
    JavaPairRDD<MatrixIndexes, RowMatrixBlock> shuffledRows = data.mapPartitionsToPair(new ShuffleMatrixBlockRowsInMemFunction(rlen, brlen, bcastIx));
    return RDDAggregateUtils.mergeRowsByKey(shuffledRows);
}
Also used : PartitionedBlock(org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) RowMatrixBlock(org.apache.sysml.runtime.instructions.spark.data.RowMatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) SortIndex(org.apache.sysml.runtime.functionobjects.SortIndex) ReorgOperator(org.apache.sysml.runtime.matrix.operators.ReorgOperator)

Example 67 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class RDDSortUtils method sortByVals.

/**
 * Sorts the rows extracted from the input blocks and re-assembles them into binary blocks.
 * <p>
 * Note: the matrix characteristics used to pick the number of partitions are built with
 * {@code brlen} for both block dimensions, since this method takes no separate column block size.
 *
 * @param in input as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows in a block
 * @return sorted output as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortByVals(JavaPairRDD<MatrixIndexes, MatrixBlock> in, long rlen, long clen, int brlen) {
    // break the input blocks apart via ExtractRowsFunction
    JavaRDD<MatrixBlock> rowBlocks = in.values().flatMap(new ExtractRowsFunction());

    // sort ascending by the keys produced by CreateDoubleKeysFunction
    MatrixCharacteristics mc = new MatrixCharacteristics(rlen, clen, brlen, brlen);
    int parts = SparkUtils.getNumPreferredPartitions(mc, in);
    JavaRDD<MatrixBlock> sortedRows = rowBlocks.sortBy(new CreateDoubleKeysFunction(), true, parts);

    // attach positions, convert back to binary blocks, and merge blocks sharing a key
    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = sortedRows.zipWithIndex().mapPartitionsToPair(new ConvertToBinaryBlockFunction5(rlen, brlen));
    return RDDAggregateUtils.mergeByKey(blocks, false);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) RowMatrixBlock(org.apache.sysml.runtime.instructions.spark.data.RowMatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 68 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class ReaderBinaryBlock method readBinaryBlockMatrixBlocksFromHDFS.

/**
 * Reads all binary blocks found under {@code path} and appends a copy of each one to {@code dest}.
 * <p>
 * Every sequence file returned by {@code IOUtilFunctions.getSequenceFilePaths} is read in turn.
 * Each block is bound-checked against the overall matrix dimensions before it is added.
 *
 * @param path input path (resolved to 1..N sequence files)
 * @param job job configuration used to open the sequence file readers
 * @param fs file system used to resolve the sequence file paths
 * @param dest collection receiving the (index, block) pairs; key and value are copied per entry
 * @param rlen number of rows of the overall matrix
 * @param clen number of columns of the overall matrix
 * @param brlen number of rows in a block
 * @param bclen number of columns in a block
 * @throws IOException if reading fails or a block lies outside the overall matrix range
 */
private static void readBinaryBlockMatrixBlocksFromHDFS(Path path, JobConf job, FileSystem fs, Collection<IndexedMatrixValue> dest, long rlen, long clen, int brlen, int bclen) throws IOException {
    // key/value are reused across reader.next calls; copies are made before adding to dest
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();
    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) {
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    }
    // 1..N files
    for (Path lpath : IOUtilFunctions.getSequenceFilePaths(fs, path)) {
        // directly read from sequence files (individual partfiles)
        SequenceFile.Reader reader = new SequenceFile.Reader(job, SequenceFile.Reader.file(lpath));
        try {
            while (reader.next(key, value)) {
                // offsets are computed in long arithmetic: a 32-bit product of block index and
                // block size can wrap around and slip past the bound check below
                long row_offset = (key.getRowIndex() - 1) * brlen;
                long col_offset = (key.getColumnIndex() - 1) * bclen;
                int rows = value.getNumRows();
                int cols = value.getNumColumns();
                // bound check per block
                if (row_offset + rows < 0 || row_offset + rows > rlen || col_offset + cols < 0 || col_offset + cols > clen) {
                    throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
                }
                // copy block to result
                dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value)));
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) SequenceFile(org.apache.hadoop.io.SequenceFile) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) IOException(java.io.IOException) IndexedMatrixValue(org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue)

Example 69 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class TestUtils method writeBinaryTestMatrixBlocks.

/**
 * <p>
 * Writes a matrix to a file using the binary blocks format.
 * </p>
 * <p>
 * The matrix is cut into blocks of at most {@code rowsInBlock} x {@code colsInBlock} cells;
 * each block is appended to a sequence file under its block indexes, which start at 1.
 * An {@code IOException} is reported through {@code fail}.
 * </p>
 *
 * @param file
 *            file name
 * @param matrix
 *            matrix
 * @param rowsInBlock
 *            rows in block
 * @param colsInBlock
 *            columns in block
 * @param sparseFormat
 *            sparse format
 */
@SuppressWarnings("deprecation")
public static void writeBinaryTestMatrixBlocks(String file, double[][] matrix, int rowsInBlock, int colsInBlock, boolean sparseFormat) {
    SequenceFile.Writer writer = null;
    try {
        Path path = new Path(file);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, conf);
        writer = new SequenceFile.Writer(fs, conf, path, MatrixIndexes.class, MatrixBlock.class);
        // the index object is reused; it is overwritten before every append
        MatrixIndexes index = new MatrixIndexes();
        for (int i = 0; i < matrix.length; i += rowsInBlock) {
            // last row-block may be smaller than rowsInBlock
            int rows = Math.min(rowsInBlock, (matrix.length - i));
            for (int j = 0; j < matrix[i].length; j += colsInBlock) {
                // last column-block may be smaller than colsInBlock
                int cols = Math.min(colsInBlock, (matrix[i].length - j));
                index.setIndexes(((i / rowsInBlock) + 1), ((j / colsInBlock) + 1));
                // a fresh block per tile; no placeholder block is allocated up front
                MatrixBlock value = new MatrixBlock(rows, cols, sparseFormat);
                for (int k = 0; k < rows; k++) {
                    for (int l = 0; l < cols; l++) {
                        value.setValue(k, l, matrix[i + k][j + l]);
                    }
                }
                writer.append(index, value);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
        fail("unable to write test matrix: " + e.getMessage());
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) SequenceFile(org.apache.hadoop.io.SequenceFile) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) FileSystem(org.apache.hadoop.fs.FileSystem) IOException(java.io.IOException)

Example 70 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class TestUtils method writeBinaryTestMatrixCells.

/**
 * <p>
 * Writes a matrix to a file using the binary cells format.
 * </p>
 * <p>
 * Only cells whose value is not equal to zero are written; each is appended under its
 * (row + 1, column + 1) indexes. An {@code IOException} is reported through {@code fail}.
 * </p>
 *
 * @param file
 *            file name
 * @param matrix
 *            matrix
 */
@SuppressWarnings("deprecation")
public static void writeBinaryTestMatrixCells(String file, double[][] matrix) {
    try {
        SequenceFile.Writer out = null;
        try {
            Path target = new Path(file);
            FileSystem fileSystem = IOUtilFunctions.getFileSystem(target, conf);
            out = new SequenceFile.Writer(fileSystem, conf, target, MatrixIndexes.class, MatrixCell.class);
            // key and cell objects are reused; both are overwritten before every append
            MatrixIndexes cellIx = new MatrixIndexes();
            MatrixCell cell = new MatrixCell();
            for (int r = 0; r < matrix.length; r++) {
                double[] row = matrix[r];
                for (int c = 0; c < row.length; c++) {
                    double v = row[c];
                    // skip zero-valued cells
                    if (v == 0) {
                        continue;
                    }
                    cellIx.setIndexes((r + 1), (c + 1));
                    cell.setValue(v);
                    out.append(cellIx, cell);
                }
            }
        } finally {
            // close before the outer catch handles any failure
            IOUtilFunctions.closeSilently(out);
        }
    } catch (IOException e) {
        e.printStackTrace();
        fail("unable to write test matrix: " + e.getMessage());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SequenceFile(org.apache.hadoop.io.SequenceFile) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) FileSystem(org.apache.hadoop.fs.FileSystem) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) IOException(java.io.IOException)

Aggregations

MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)165 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)142 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)70 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)48 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)41 Path (org.apache.hadoop.fs.Path)24 SequenceFile (org.apache.hadoop.io.SequenceFile)23 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)22 ArrayList (java.util.ArrayList)21 IOException (java.io.IOException)20 FileSystem (org.apache.hadoop.fs.FileSystem)20 MatrixCell (org.apache.sysml.runtime.matrix.data.MatrixCell)19 Tuple2 (scala.Tuple2)19 IndexedMatrixValue (org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue)17 JobConf (org.apache.hadoop.mapred.JobConf)14 MatrixValue (org.apache.sysml.runtime.matrix.data.MatrixValue)11 CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock)10 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)10 File (java.io.File)9 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)9