Example 26 with MatrixCell

use of org.apache.sysml.runtime.matrix.data.MatrixCell in project incubator-systemml by apache.

the class ReaderBinaryCell method readBinaryCellMatrixFromHDFS.

@SuppressWarnings("deprecation")
private static void readBinaryCellMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    MatrixIndexes key = new MatrixIndexes();
    MatrixCell value = new MatrixCell();
    int row = -1;
    int col = -1;
    try {
        // iterate over the 1..N part files of the sequence file directory
        for (Path lpath : IOUtilFunctions.getSequenceFilePaths(fs, path)) {
            // directly read from sequence files (individual partfiles)
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
            try {
                if (sparse) {
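                        // SPARSE target: append cells; sparse rows are sorted after all files are read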
                    while (reader.next(key, value)) {
                        row = (int) key.getRowIndex() - 1;
                        col = (int) key.getColumnIndex() - 1;
                        double lvalue = value.getValue();
                        dest.appendValue(row, col, lvalue);
                    }
                } else {
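                        // DENSE target: identical cell-wise append into the dense block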
                    while (reader.next(key, value)) {
                        row = (int) key.getRowIndex() - 1;
                        col = (int) key.getColumnIndex() - 1;
                        double lvalue = value.getValue();
                        dest.appendValue(row, col, lvalue);
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
        if (sparse)
            dest.sortSparseRows();
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new IOException("Unable to read matrix in binary cell format.", ex);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SequenceFile(org.apache.hadoop.io.SequenceFile) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
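
For context, a minimal sketch of how a binary-cell file could be produced for this reader to consume: 1-based MatrixIndexes/MatrixCell pairs appended to a SequenceFile. The output path is hypothetical, and the deprecated createWriter variant is used to match the deprecated reader API above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.sysml.runtime.matrix.data.MatrixCell;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;

public class BinaryCellWriteSketch {
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // hypothetical output path
        Path path = new Path("/tmp/matrix.bincell");
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, MatrixIndexes.class, MatrixCell.class);
        try {
            MatrixIndexes key = new MatrixIndexes();
            MatrixCell value = new MatrixCell();
            // 1-based indexes, as readBinaryCellMatrixFromHDFS expects
            key.setIndexes(1, 1);
            value.setValue(3.14);
            writer.append(key, value);
        } finally {
            writer.close();
        }
    }
}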

Example 27 with MatrixCell

use of org.apache.sysml.runtime.matrix.data.MatrixCell in project incubator-systemml by apache.

the class GMRCtableBuffer method flushBuffer.

public void flushBuffer(Reporter reporter) throws RuntimeException {
    try {
        if (_mapBuffer != null) {
            MatrixIndexes key = null; // reassigned with a new MatrixIndexes per output entry below
            MatrixCell value = new MatrixCell();
            for (Entry<Byte, CTableMap> ctable : _mapBuffer.entrySet()) {
                ArrayList<Integer> resultIDs = ReduceBase.getOutputIndexes(ctable.getKey(), _resultIndexes);
                CTableMap resultMap = ctable.getValue();
                // maintain result dims and nonzeros
                for (Integer i : resultIDs) {
                    _resultNonZeros[i] += resultMap.size();
                    if (_resultDimsUnknown[i] == (byte) 1) {
                        _resultMaxRowDims[i] = Math.max(resultMap.getMaxRow(), _resultMaxRowDims[i]);
                        _resultMaxColDims[i] = Math.max(resultMap.getMaxColumn(), _resultMaxColDims[i]);
                    }
                }
                // output result data
                Iterator<ADoubleEntry> iter = resultMap.getIterator();
                while (iter.hasNext()) {
                    ADoubleEntry e = iter.next();
                    key = new MatrixIndexes(e.getKey1(), e.getKey2());
                    value.setValue(e.value);
                    for (Integer i : resultIDs) _collector.collectOutput(key, value, i, reporter);
                }
            }
        } else if (_blockBuffer != null) {
            MatrixIndexes key = new MatrixIndexes(1, 1);
            for (Entry<Byte, MatrixBlock> ctable : _blockBuffer.entrySet()) {
                ArrayList<Integer> resultIDs = ReduceBase.getOutputIndexes(ctable.getKey(), _resultIndexes);
                MatrixBlock outBlock = ctable.getValue();
                outBlock.recomputeNonZeros();
                // TODO: change hard coding of 1000
                int brlen = 1000, bclen = 1000;
                int rlen = outBlock.getNumRows();
                int clen = outBlock.getNumColumns();
                // final output matrix is smaller than a single block
                if (rlen <= brlen && clen <= bclen) {
                    key = new MatrixIndexes(1, 1);
                    for (Integer i : resultIDs) {
                        _collector.collectOutput(key, outBlock, i, reporter);
                        _resultNonZeros[i] += outBlock.getNonZeros();
                    }
                } else {
                    // the following code is similar to DataConverter.writeBinaryBlockMatrixToHDFS
                    // initialize blocks for reuse (at most 4 different blocks required)
                    MatrixBlock[] blocks = MatrixWriter.createMatrixBlocksForReuse(rlen, clen, brlen, bclen, true, outBlock.getNonZeros());
                    // create and write subblocks of matrix
                    for (int blockRow = 0; blockRow < (int) Math.ceil(rlen / (double) brlen); blockRow++) {
                        for (int blockCol = 0; blockCol < (int) Math.ceil(clen / (double) bclen); blockCol++) {
                            int maxRow = (blockRow * brlen + brlen < rlen) ? brlen : rlen - blockRow * brlen;
                            int maxCol = (blockCol * bclen + bclen < clen) ? bclen : clen - blockCol * bclen;
                            int row_offset = blockRow * brlen;
                            int col_offset = blockCol * bclen;
                            // get reuse matrix block
                            MatrixBlock block = MatrixWriter.getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);
                            // copy submatrix to block
                            outBlock.slice(row_offset, row_offset + maxRow - 1, col_offset, col_offset + maxCol - 1, block);
                            // TODO: skip empty "block"
                            // append block to sequence file
                            key.setIndexes(blockRow + 1, blockCol + 1);
                            for (Integer i : resultIDs) {
                                _collector.collectOutput(key, block, i, reporter);
                                _resultNonZeros[i] += block.getNonZeros();
                            }
                            // reset block for later reuse
                            block.reset();
                        }
                    }
                }
            }
        } else {
            throw new DMLRuntimeException("Unexpected: both ctable buffers are null.");
        }
    } catch (Exception ex) {
        throw new RuntimeException("Failed to flush ctable buffer.", ex);
    }
    // remove existing partial ctables
    if (_mapBuffer != null)
        _mapBuffer.clear();
    else
        _blockBuffer.clear();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ADoubleEntry(org.apache.sysml.runtime.util.LongLongDoubleHashMap.ADoubleEntry) ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) Entry(java.util.Map.Entry) CTableMap(org.apache.sysml.runtime.matrix.data.CTableMap) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell)
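
The sub-blocking arithmetic above (a ceil-divided block grid with shrunken boundary blocks) is easiest to see in isolation. A standalone sketch with hypothetical dimensions; Math.min is equivalent to the ternary used in flushBuffer:

public class BlockingSketch {
    public static void main(String[] args) {
        // hypothetical matrix dimensions
        int rlen = 2500, clen = 1300;
        // block dimensions, as hard-coded in flushBuffer
        int brlen = 1000, bclen = 1000;
        for (int blockRow = 0; blockRow < (int) Math.ceil(rlen / (double) brlen); blockRow++)
            for (int blockCol = 0; blockCol < (int) Math.ceil(clen / (double) bclen); blockCol++) {
                // boundary blocks cover only the remaining rows/columns
                int maxRow = Math.min(brlen, rlen - blockRow * brlen);
                int maxCol = Math.min(bclen, clen - blockCol * bclen);
                System.out.printf("block (%d,%d): %d x %d%n", blockRow + 1, blockCol + 1, maxRow, maxCol);
            }
        // prints a 3 x 2 block grid; the last block row is 500 rows tall,
        // the last block column 300 columns wide
    }
}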

Example 28 with MatrixCell

use of org.apache.sysml.runtime.matrix.data.MatrixCell in project incubator-systemml by apache.

the class SamplingSortMRInputFormat method writePartitionFile.

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. Reads up to SAMPLE_SIZE keys (default 1,000) from up to 10
 * locations in the input, sorts them, and picks N-1 keys to generate N
 * equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @return index of the first split point that is >= 0, or partitions-1 if all split points are negative
 * @throws IOException if something goes wrong
 * @throws InstantiationException if InstantiationException occurs
 * @throws IllegalAccessException if IllegalAccessException occurs
 */
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile) throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();
    Class<? extends WritableComparable> targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS, WritableComparable.class);
    // get input converter information; the converter indicates whether the
    // matrix value in this mapper is a matrix cell or a matrix block
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; i++) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: " + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }
    // empty input files
    if (totalcount == 0)
        sampler.addValue(new DoubleWritable(0));
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    // note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = null;
    int index0 = -1;
    try {
        writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
        NullWritable nullValue = NullWritable.get();
        int i = 0;
        boolean lessthan0 = true;
        for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
            writer.append(splitValue, nullValue);
            if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
                index0 = i;
                lessthan0 = false;
            }
            i++;
        }
        if (lessthan0)
            index0 = partitions - 1;
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
    return index0;
}
Also used : SequenceFileRecordReader(org.apache.hadoop.mapred.SequenceFileRecordReader) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) DoubleWritable(org.apache.hadoop.io.DoubleWritable) IOException(java.io.IOException) SequenceFile(org.apache.hadoop.io.SequenceFile) WritableComparable(org.apache.hadoop.io.WritableComparable) FileSystem(org.apache.hadoop.fs.FileSystem) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Converter(org.apache.sysml.runtime.matrix.data.Converter) InputSplit(org.apache.hadoop.mapred.InputSplit) Pair(org.apache.sysml.runtime.matrix.data.Pair)
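
To illustrate what the partition file is for, here is a minimal sketch, not SystemML's actual partitioner, of how N-1 sorted split points map a key to one of N reduce partitions via binary search; partitionFor is a hypothetical helper:

import java.util.Arrays;

public class SplitPointSketch {
    static int partitionFor(double key, double[] splitPoints) {
        int pos = Arrays.binarySearch(splitPoints, key);
        // binarySearch returns -(insertionPoint) - 1 when the key is absent
        return (pos >= 0) ? pos : -pos - 1;
    }
    public static void main(String[] args) {
        // 3 hypothetical split points induce 4 partitions
        double[] splitPoints = { -1.0, 0.0, 2.5 };
        System.out.println(partitionFor(-5.0, splitPoints)); // 0
        System.out.println(partitionFor(1.0, splitPoints));  // 2
        System.out.println(partitionFor(9.9, splitPoints));  // 3
    }
}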

Example 29 with MatrixCell

use of org.apache.sysml.runtime.matrix.data.MatrixCell in project systemml by apache.

the class PairWritableCell method readFields.

@Override
public void readFields(DataInput in) throws IOException {
    indexes = new MatrixIndexes();
    indexes.readFields(in);
    cell = new MatrixCell();
    cell.readFields(in);
}
Also used : MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell)
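
The write side is a one-to-one mirror. A sketch of the corresponding write method (the actual PairWritableCell class defines its own; DataOutput and IOException come from java.io), serializing the indexes first and the cell second to match the read order above:

@Override
public void write(DataOutput out) throws IOException {
    // order must match readFields: indexes, then cell
    indexes.write(out);
    cell.write(out);
}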

Example 30 with MatrixCell

use of org.apache.sysml.runtime.matrix.data.MatrixCell in project systemml by apache.

the class SamplingSortMRInputFormat method writePartitionFile.

(The javadoc, method body, and imports are identical to Example 28 above; the method is unchanged between the incubator-systemml and systemml projects.)

Aggregations

MatrixCell (org.apache.sysml.runtime.matrix.data.MatrixCell): 35 usages
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 33 usages
SequenceFile (org.apache.hadoop.io.SequenceFile): 21 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 19 usages
Path (org.apache.hadoop.fs.Path): 17 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 17 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 13 usages
IOException (java.io.IOException): 12 usages
JobConf (org.apache.hadoop.mapred.JobConf): 11 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 11 usages
BufferedWriter (java.io.BufferedWriter): 7 usages
File (java.io.File): 7 usages
OutputStreamWriter (java.io.OutputStreamWriter): 7 usages
ArrayList (java.util.ArrayList): 6 usages
RecordReader (org.apache.hadoop.mapred.RecordReader): 6 usages
Cell (org.apache.sysml.runtime.controlprogram.parfor.util.Cell): 6 usages
IJV (org.apache.sysml.runtime.matrix.data.IJV): 5 usages
LinkedList (java.util.LinkedList): 4 usages
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 4 usages
CTableMap (org.apache.sysml.runtime.matrix.data.CTableMap): 4 usages