Example 31 with MatrixCell

use of org.apache.sysml.runtime.matrix.data.MatrixCell in project systemml by apache.

the class TestUtils method writeBinaryTestMatrixCells.

/**
 * <p>
 * Writes a matrix to a file using the binary cells format.
 * </p>
 *
 * @param file
 *            file name
 * @param matrix
 *            matrix
 */
@SuppressWarnings("deprecation")
public static void writeBinaryTestMatrixCells(String file, double[][] matrix) {
    try {
        SequenceFile.Writer writer = null;
        try {
            Path path = new Path(file);
            FileSystem fs = IOUtilFunctions.getFileSystem(path, conf);
            writer = new SequenceFile.Writer(fs, conf, path, MatrixIndexes.class, MatrixCell.class);
            MatrixIndexes index = new MatrixIndexes();
            MatrixCell value = new MatrixCell();
            for (int i = 0; i < matrix.length; i++) {
                for (int j = 0; j < matrix[i].length; j++) {
                    if (matrix[i][j] != 0) {
                        index.setIndexes((i + 1), (j + 1));
                        value.setValue(matrix[i][j]);
                        writer.append(index, value);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(writer);
        }
    } catch (IOException e) {
        e.printStackTrace();
        fail("unable to write test matrix: " + e.getMessage());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SequenceFile(org.apache.hadoop.io.SequenceFile) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) FileSystem(org.apache.hadoop.fs.FileSystem) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) IOException(java.io.IOException)
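A usage sketch (hypothetical test code, not from the original source): writing a small matrix to a scratch path. Note that only non-zero cells are emitted, with 1-based indexes, and that the helper relies on the static Hadoop configuration already present in TestUtils.

// Hypothetical test usage; the scratch path is illustrative. Only the two
// non-zero cells at (1,1) and (2,2) end up in the sequence file.
double[][] m = new double[][] { { 1.0, 0.0 }, { 0.0, 2.5 } };
TestUtils.writeBinaryTestMatrixCells("scratch_space/binarycells/m", m);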

Example 32 with MatrixCell

use of org.apache.sysml.runtime.matrix.data.MatrixCell in project systemml by apache.

the class SparkExecutionContext method toMatrixBlock.

/**
 * Utility method for creating a single matrix block out of a binary cell RDD.
 * Note that this collect call might trigger execution of any pending transformations.
 *
 * @param rdd JavaPairRDD for matrix block
 * @param rlen number of rows
 * @param clen number of columns
 * @param nnz number of non-zeros
 * @return matrix block
 */
public static MatrixBlock toMatrixBlock(JavaPairRDD<MatrixIndexes, MatrixCell> rdd, int rlen, int clen, long nnz) {
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    MatrixBlock out = null;
    // determine target sparse/dense representation
    long lnnz = (nnz >= 0) ? nnz : (long) rlen * clen;
    boolean sparse = MatrixBlock.evalSparseFormatInMemory(rlen, clen, lnnz);
    // create output matrix block (w/ lazy allocation)
    out = new MatrixBlock(rlen, clen, sparse);
    List<Tuple2<MatrixIndexes, MatrixCell>> list = rdd.collect();
    // copy blocks one-at-a-time into output matrix block
    for (Tuple2<MatrixIndexes, MatrixCell> keyval : list) {
        // unpack index-block pair
        MatrixIndexes ix = keyval._1();
        MatrixCell cell = keyval._2();
        // append cell to dense/sparse target in order to avoid shifting for sparse
        // note: this append requires a final sort of sparse rows
        out.appendValue((int) ix.getRowIndex() - 1, (int) ix.getColumnIndex() - 1, cell.getValue());
    }
    // post-processing output matrix
    if (sparse)
        out.sortSparseRows();
    out.recomputeNonZeros();
    out.examSparsity();
    if (DMLScript.STATISTICS) {
        Statistics.accSparkCollectTime(System.nanoTime() - t0);
        Statistics.incSparkCollectCount(1);
    }
    return out;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CompressedMatrixBlock(org.apache.sysml.runtime.compress.CompressedMatrixBlock) Tuple2(scala.Tuple2) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell)
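A usage sketch (an illustration, not from the original source): build a tiny binary-cell RDD locally and collect it into a single 2x2 block. The JavaSparkContext sc and the cell values are assumptions.

// Hypothetical usage: assemble two non-zero cells of a 2x2 matrix as a
// binary-cell RDD and collect them into one MatrixBlock.
MatrixCell c1 = new MatrixCell();
c1.setValue(7.0);
MatrixCell c2 = new MatrixCell();
c2.setValue(3.0);
List<Tuple2<MatrixIndexes, MatrixCell>> cells = Arrays.asList(
    new Tuple2<>(new MatrixIndexes(1, 1), c1),
    new Tuple2<>(new MatrixIndexes(2, 2), c2));
// assumes an existing JavaSparkContext sc
JavaPairRDD<MatrixIndexes, MatrixCell> rdd = sc.parallelizePairs(cells);
// passing nnz=2 lets evalSparseFormatInMemory pick the target representation up front
MatrixBlock mb = SparkExecutionContext.toMatrixBlock(rdd, 2, 2, 2);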

Example 33 with MatrixCell

use of org.apache.sysml.runtime.matrix.data.MatrixCell in project systemml by apache.

the class DataPartitionerLocal method partitionBinaryCell.

@SuppressWarnings("deprecation")
private void partitionBinaryCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen, int brlen, int bclen) {
    long row = -1;
    long col = -1;
    try {
        // STEP 1: read matrix from HDFS and write blocks to local staging area
        // check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        // prepare sequence file reader, and write to local staging area
        LinkedList<Cell> buffer = new LinkedList<>();
        MatrixIndexes key = new MatrixIndexes();
        MatrixCell value = new MatrixCell();
        for (Path lpath : IOUtilFunctions.getSequenceFilePaths(fs, path)) {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
            try {
                while (reader.next(key, value)) {
                    row = key.getRowIndex();
                    col = key.getColumnIndex();
                    Cell tmp = new Cell(row, col, value.getValue());
                    buffer.addLast(tmp);
                    // periodic flush
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }
                // final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
        // STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerBinaryCell(job, fnameNew, fnameStaging, fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads) t.join();
        } else {
            for (String pdir : fnamesPartitions) writeBinaryCellSequenceFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        // post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + row + "," + col + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].", e);
        } else
            throw new DMLRuntimeException("Unable to partition binary cell matrix.", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) RecordReader(org.apache.hadoop.mapred.RecordReader) LinkedList(java.util.LinkedList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) JobConf(org.apache.hadoop.mapred.JobConf) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell) File(java.io.File)
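The ceil-based index chunking in STEP 2 is easy to misread, so here is a worked sketch with illustrative numbers (10 staging directories, 4 workers, both assumed): each thread takes ceil(10/4) = 3 directories, and clamping end keeps the last range inside the array.

// Worked sketch of the start/end computation above (illustrative values only).
int numDirs = 10, par = 4;
int len = Math.min(numDirs, par);                     // 4 worker threads
int chunk = (int) Math.ceil((double) numDirs / len);  // 3 directories per thread
for (int i = 0; i < len; i++) {
    int start = i * chunk;                                 // 0, 3, 6, 9
    int end = Math.min((i + 1) * chunk - 1, numDirs - 1); // 2, 5, 8, 9
    System.out.println("thread " + i + " -> [" + start + "," + end + "]");
}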

Example 34 with MatrixCell

use of org.apache.sysml.runtime.matrix.data.MatrixCell in project systemml by apache.

the class ReblockSPInstruction method processMatrixReblockInstruction.

@SuppressWarnings("unchecked")
protected void processMatrixReblockInstruction(SparkExecutionContext sec, InputInfo iinfo) {
    MatrixObject mo = sec.getMatrixObject(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (iinfo == InputInfo.TextCellInputInfo || iinfo == InputInfo.MatrixMarketInputInfo) {
        // get the input textcell rdd
        JavaPairRDD<LongWritable, Text> lines = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
        // convert textcell to binary block
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDConverterUtils.textCellToBinaryBlock(sec.getSparkContext(), lines, mcOut, outputEmptyBlocks);
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else if (iinfo == InputInfo.CSVInputInfo) {
        // HACK ALERT: Until we introduce the rewrite to insert csvrblock for non-persistent reads
        // throw new DMLRuntimeException("CSVInputInfo is not supported for ReblockSPInstruction");
        CSVReblockSPInstruction csvInstruction = null;
        boolean hasHeader = false;
        String delim = ",";
        boolean fill = false;
        double fillValue = 0;
        // instanceof already implies non-null, so no separate null check is needed
        if (mo.getFileFormatProperties() instanceof CSVFileFormatProperties) {
            CSVFileFormatProperties props = (CSVFileFormatProperties) mo.getFileFormatProperties();
            hasHeader = props.hasHeader();
            delim = props.getDelim();
            fill = props.isFill();
            fillValue = props.getFillValue();
        }
        csvInstruction = new CSVReblockSPInstruction(null, input1, output, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), hasHeader, delim, fill, fillValue, "csvrblk", instString);
        csvInstruction.processInstruction(sec);
        return;
    } else if (iinfo == InputInfo.BinaryCellInputInfo) {
        JavaPairRDD<MatrixIndexes, MatrixCell> binaryCells = (JavaPairRDD<MatrixIndexes, MatrixCell>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), binaryCells, mcOut, outputEmptyBlocks);
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else if (iinfo == InputInfo.BinaryBlockInputInfo) {
        // BINARY BLOCK <- BINARY BLOCK (different sizes)
        JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
        // the reblock is shuffle-free if block sizes are aligned, i.e., every output
        // block can be assembled from the cells of a single input block
        boolean shuffleFreeReblock = mc.dimsKnown() && mcOut.dimsKnown()
            && (mc.getRows() < mcOut.getRowsPerBlock() || mc.getRowsPerBlock() % mcOut.getRowsPerBlock() == 0)
            && (mc.getCols() < mcOut.getColsPerBlock() || mc.getColsPerBlock() % mcOut.getColsPerBlock() == 0);
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.flatMapToPair(new ExtractBlockForBinaryReblock(mc, mcOut));
        if (!shuffleFreeReblock)
            out = RDDAggregateUtils.mergeByKey(out, false);
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else {
        throw new DMLRuntimeException("The given InputInfo is not implemented " + "for ReblockSPInstruction:" + InputInfo.inputInfoToString(iinfo));
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Text(org.apache.hadoop.io.Text) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) ExtractBlockForBinaryReblock(org.apache.sysml.runtime.instructions.spark.functions.ExtractBlockForBinaryReblock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) LongWritable(org.apache.hadoop.io.LongWritable)
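For the BinaryCellInputInfo branch specifically, a stand-alone sketch of the cell-to-block conversion; the context sc, the input RDD binaryCells, and the 10000x10000 dimensions are assumptions:

// Hypothetical stand-alone conversion: binary cells -> 1000x1000 binary blocks,
// emitting empty blocks (assumes a JavaSparkContext sc and an existing cell RDD).
MatrixCharacteristics mcOut = new MatrixCharacteristics(10000, 10000, 1000, 1000);
JavaPairRDD<MatrixIndexes, MatrixBlock> blocks =
    RDDConverterUtils.binaryCellToBinaryBlock(sc, binaryCells, mcOut, true);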

Example 35 with MatrixCell

use of org.apache.sysml.runtime.matrix.data.MatrixCell in project systemml by apache.

the class CopyBinaryCellFunction method call.

@Override
public Tuple2<MatrixIndexes, MatrixCell> call(Tuple2<MatrixIndexes, MatrixCell> arg0) throws Exception {
    MatrixIndexes ix = new MatrixIndexes(arg0._1());
    MatrixCell cell = new MatrixCell();
    cell.copy(arg0._2());
    return new Tuple2<>(ix, cell);
}
Also used : MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Tuple2(scala.Tuple2) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell)
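Typical usage is a straight mapToPair over a binary-cell RDD (the input RDD name in is an assumption); the deep copy matters because Hadoop readers may reuse key and value objects across records.

// Hypothetical usage: deep-copy every (index, cell) pair so downstream
// operators do not alias reader-reused objects.
JavaPairRDD<MatrixIndexes, MatrixCell> copied =
    in.mapToPair(new CopyBinaryCellFunction());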

Aggregations

MatrixCell (org.apache.sysml.runtime.matrix.data.MatrixCell): 35 uses
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 33 uses
SequenceFile (org.apache.hadoop.io.SequenceFile): 21 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 19 uses
Path (org.apache.hadoop.fs.Path): 17 uses
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 17 uses
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 13 uses
IOException (java.io.IOException): 12 uses
JobConf (org.apache.hadoop.mapred.JobConf): 11 uses
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 11 uses
BufferedWriter (java.io.BufferedWriter): 7 uses
File (java.io.File): 7 uses
OutputStreamWriter (java.io.OutputStreamWriter): 7 uses
ArrayList (java.util.ArrayList): 6 uses
RecordReader (org.apache.hadoop.mapred.RecordReader): 6 uses
Cell (org.apache.sysml.runtime.controlprogram.parfor.util.Cell): 6 uses
IJV (org.apache.sysml.runtime.matrix.data.IJV): 5 uses
LinkedList (java.util.LinkedList): 4 uses
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 4 uses
CTableMap (org.apache.sysml.runtime.matrix.data.CTableMap): 4 uses