Example 1 with FastStringTokenizer

Use of org.apache.sysml.runtime.util.FastStringTokenizer in project incubator-systemml by apache.

From class ResultMergeLocalFile, method createTextCellStagingFile:

private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LinkedList<Cell> buffer = new LinkedList<>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    // long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    // NOTE MB: Originally, we used long row, col, but this reproducibly led to JIT compilation
    // errors at runtime (observed under Windows, Intel x86-64, IBM JDK 64bit/32bit).
    // It works fine with int row, col, but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
    // we simply propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0).
    FastStringTokenizer st = new FastStringTokenizer(' ');
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                // reset tokenizer
                st.reset(value.toString());
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());
                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                // periodic flush
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }
            // final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) LinkedList(java.util.LinkedList) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell)
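
The core pattern in this example is line-oriented: a single FastStringTokenizer is created once with a space delimiter and then reset for every "row col value" line, which avoids per-line tokenizer allocations. The following standalone sketch (not project code; the input values are made up) reduces that pattern to its essentials, using only the tokenizer calls that appear above:

import org.apache.sysml.runtime.util.FastStringTokenizer;

public class TextCellParseSketch {
    public static void main(String[] args) {
        // space-delimited "row col value" lines, as in the text cell format read above
        String[] lines = { "1 1 7.0", "2 3 0.5", "10 10 -1.25" };
        // one tokenizer instance, reset per line (same reuse pattern as createTextCellStagingFile)
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (String line : lines) {
            st.reset(line);
            long row = st.nextLong();
            long col = st.nextLong();
            double val = Double.parseDouble(st.nextToken());
            System.out.println(row + "," + col + " -> " + val);
        }
    }
}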

Example 2 with FastStringTokenizer

Use of org.apache.sysml.runtime.util.FastStringTokenizer in project incubator-systemml by apache.

From class StagingFileUtils, method readCellListFromLocal:

public static LinkedList<Cell> readCellListFromLocal(String fname) throws IOException {
    FileInputStream fis = new FileInputStream(fname);
    BufferedReader in = new BufferedReader(new InputStreamReader(fis));
    LinkedList<Cell> buffer = new LinkedList<>();
    try {
        String value = null;
        FastStringTokenizer st = new FastStringTokenizer(' ');
        while ((value = in.readLine()) != null) {
            // reset tokenizer
            st.reset(value);
            long row = st.nextLong();
            long col = st.nextLong();
            double lvalue = st.nextDouble();
            Cell c = new Cell(row, col, lvalue);
            buffer.addLast(c);
        }
    } finally {
        IOUtilFunctions.closeSilently(in);
    }
    return buffer;
}
Also used : FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) FileInputStream(java.io.FileInputStream) LinkedList(java.util.LinkedList)
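
For illustration, the staging files consumed by readCellListFromLocal are plain text with one space-separated "row col value" triple per line. A minimal round-trip sketch follows; it is not project code, the file contents are made up, and it assumes StagingFileUtils sits alongside Cell in org.apache.sysml.runtime.controlprogram.parfor.util (the package is not shown above):

import java.io.File;
import java.io.PrintWriter;
import java.util.LinkedList;
import org.apache.sysml.runtime.controlprogram.parfor.util.Cell;
import org.apache.sysml.runtime.controlprogram.parfor.util.StagingFileUtils;

public class ReadCellListSketch {
    public static void main(String[] args) throws Exception {
        // write a tiny staging file in the "row col value" layout parsed above
        File tmp = File.createTempFile("cells", ".txt");
        try (PrintWriter pw = new PrintWriter(tmp)) {
            pw.println("1 1 7.0");
            pw.println("2 3 0.5");
        }
        // read it back via the method shown in this example
        LinkedList<Cell> cells = StagingFileUtils.readCellListFromLocal(tmp.getAbsolutePath());
        System.out.println("read " + cells.size() + " cells");
        tmp.delete();
    }
}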

Example 3 with FastStringTokenizer

Use of org.apache.sysml.runtime.util.FastStringTokenizer in project incubator-systemml by apache.

From class DataPartitionerLocal, method partitionTextCell:

private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen, int brlen, int bclen) {
    long row = -1;
    long col = -1;
    try {
        // STEP 1: read matrix from HDFS and write blocks to local staging area
        // check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        LinkedList<Cell> buffer = new LinkedList<>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    // reset tokenizer
                    st.reset(value.toString());
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);
                    buffer.addLast(tmp);
                    // periodic flush
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }
                // final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
        // STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging, fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads) t.join();
        } else {
            for (String pdir : fnamesPartitions) writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        // post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) LinkedList(java.util.LinkedList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File)
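
The thread fan-out in STEP 2 assigns each worker a contiguous slice of the staging partition directories via a ceiling division. A small standalone sketch of just that index arithmetic (hypothetical names and counts, not project code):

public class ChunkRangeSketch {
    public static void main(String[] args) {
        int numFiles = 10;   // e.g., fnamesPartitions.length
        int par = 4;         // e.g., _par
        int len = Math.min(numFiles, par);
        for (int i = 0; i < len; i++) {
            // same ceiling-based slicing as in partitionTextCell
            int chunk = (int) Math.ceil((double) numFiles / len);
            int start = i * chunk;
            int end = Math.min((i + 1) * chunk - 1, numFiles - 1);
            System.out.println("worker " + i + " handles files [" + start + "," + end + "]");
        }
    }
}

Note that when the file count is not a multiple of the thread count, the last slice can come out empty (start > end), for example 5 files over 4 threads; presumably the worker simply iterates from start to end and therefore skips such a slice.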

Example 4 with FastStringTokenizer

Use of org.apache.sysml.runtime.util.FastStringTokenizer in project incubator-systemml by apache.

From class FrameReaderTextCell, method readTextCellFrameFromInputSplit:

protected static void readTextCellFrameFromInputSplit(InputSplit split, TextInputFormat informat, JobConf job, FrameBlock dest) throws IOException {
    ValueType[] schema = dest.getSchema();
    int rlen = dest.getNumRows();
    int clen = dest.getNumColumns();
    // create record reader
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
    LongWritable key = new LongWritable();
    Text value = new Text();
    FastStringTokenizer st = new FastStringTokenizer(' ');
    int row = -1;
    int col = -1;
    try {
        while (reader.next(key, value)) {
            // reinit tokenizer
            st.reset(value.toString());
            row = st.nextInt() - 1;
            col = st.nextInt() - 1;
            if (row == -3)
                dest.getColumnMetadata(col).setMvValue(st.nextToken());
            else if (row == -2)
                dest.getColumnMetadata(col).setNumDistinct(st.nextLong());
            else
                dest.set(row, col, UtilFunctions.stringToObject(schema[col], st.nextToken()));
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Frame cell [" + (row + 1) + "," + (col + 1) + "] " + "out of overall frame range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new IOException("Unable to read frame in text cell format.", ex);
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
}
Also used : FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) ValueType(org.apache.sysml.parser.Expression.ValueType) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
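
The two metadata branches follow from the -1 shift applied to the 1-based file indices: a file row of -2 maps to -3 and sets the column's missing-value metadata, and a file row of -1 maps to -2 and sets the distinct count. This reading is inferred from the code above rather than from a format specification. A standalone sketch that replays just this dispatch on made-up lines (assuming nextInt accepts the negative indices these branches test for):

import org.apache.sysml.runtime.util.FastStringTokenizer;

public class FrameTextCellDispatchSketch {
    public static void main(String[] args) {
        // made-up lines: two metadata rows (negative indices) and one data cell
        String[] lines = { "-2 1 NA", "-1 1 42", "3 1 foo" };
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (String line : lines) {
            st.reset(line);
            int row = st.nextInt() - 1; // 1-based file index -> 0-based
            int col = st.nextInt() - 1;
            String tok = st.nextToken();
            if (row == -3)
                System.out.println("col " + col + ": missing-value metadata = " + tok);
            else if (row == -2)
                System.out.println("col " + col + ": #distinct metadata = " + tok);
            else
                System.out.println("cell [" + row + "," + col + "] = " + tok);
        }
    }
}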

Example 5 with FastStringTokenizer

Use of org.apache.sysml.runtime.util.FastStringTokenizer in project incubator-systemml by apache.

From class FrameReaderTextCell, method readRawTextCellFrameFromInputStream:

protected static void readRawTextCellFrameFromInputStream(InputStream is, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    // create buffered reader
    BufferedReader br = new BufferedReader(new InputStreamReader(is));
    String value = null;
    FastStringTokenizer st = new FastStringTokenizer(' ');
    int row = -1;
    int col = -1;
    try {
        while ((value = br.readLine()) != null) {
            // reinit tokenizer
            st.reset(value);
            row = st.nextInt() - 1;
            col = st.nextInt() - 1;
            if (row == -3)
                dest.getColumnMetadata(col).setMvValue(st.nextToken());
            else if (row == -2)
                dest.getColumnMetadata(col).setNumDistinct(st.nextLong());
            else
                dest.set(row, col, UtilFunctions.stringToObject(schema[col], st.nextToken()));
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Frame cell [" + (row + 1) + "," + (col + 1) + "] " + "out of overall frame range [1:" + rlen + ",1:" + clen + "].", ex);
        } else {
            throw new IOException("Unable to read frame in raw text cell format.", ex);
        }
    } finally {
        IOUtilFunctions.closeSilently(br);
    }
}
Also used : FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

FastStringTokenizer (org.apache.sysml.runtime.util.FastStringTokenizer): 20
IOException (java.io.IOException): 10
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 10
BufferedReader (java.io.BufferedReader): 8
InputStreamReader (java.io.InputStreamReader): 8
LongWritable (org.apache.hadoop.io.LongWritable): 8
Text (org.apache.hadoop.io.Text): 8
LinkedList (java.util.LinkedList): 6
InputSplit (org.apache.hadoop.mapred.InputSplit): 6
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 6
DenseBlock (org.apache.sysml.runtime.matrix.data.DenseBlock): 6
FileInputStream (java.io.FileInputStream): 4
HashMap (java.util.HashMap): 4
Path (org.apache.hadoop.fs.Path): 4
JobConf (org.apache.hadoop.mapred.JobConf): 4
Cell (org.apache.sysml.runtime.controlprogram.parfor.util.Cell): 4
MatrixCell (org.apache.sysml.runtime.matrix.data.MatrixCell): 4
File (java.io.File): 2
SequenceFile (org.apache.hadoop.io.SequenceFile): 2
ValueType (org.apache.sysml.parser.Expression.ValueType): 2