Example 16 with FastStringTokenizer

Use of org.apache.sysml.runtime.util.FastStringTokenizer in project systemml by apache.

The class StagingFileUtils, method readCellListFromLocal.

public static LinkedList<Cell> readCellListFromLocal(String fname) throws IOException {
    FileInputStream fis = new FileInputStream(fname);
    BufferedReader in = new BufferedReader(new InputStreamReader(fis));
    LinkedList<Cell> buffer = new LinkedList<>();
    try {
        String value = null;
        FastStringTokenizer st = new FastStringTokenizer(' ');
        while ((value = in.readLine()) != null) {
            // reset tokenizer
            st.reset(value);
            long row = st.nextLong();
            long col = st.nextLong();
            double lvalue = st.nextDouble();
            Cell c = new Cell(row, col, lvalue);
            buffer.addLast(c);
        }
    } finally {
        IOUtilFunctions.closeSilently(in);
    }
    return buffer;
}
Also used : FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) FileInputStream(java.io.FileInputStream) LinkedList(java.util.LinkedList)
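
For orientation, here is a minimal standalone sketch of the tokenizer calls used above. The class name TokenizerDemo and the sample line are made up for illustration; only the FastStringTokenizer calls (reset, nextLong, nextDouble) are taken from the example itself.

import org.apache.sysml.runtime.util.FastStringTokenizer;

public class TokenizerDemo {
    public static void main(String[] args) {
        // one reusable tokenizer, reset once per line (same pattern as readCellListFromLocal)
        FastStringTokenizer st = new FastStringTokenizer(' ');
        st.reset("7 3 4.25"); // hypothetical space-separated "row col value" line
        long row = st.nextLong();        // 7
        long col = st.nextLong();        // 3
        double lvalue = st.nextDouble(); // 4.25
        System.out.println(row + "," + col + "=" + lvalue);
    }
}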

Example 17 with FastStringTokenizer

Use of org.apache.sysml.runtime.util.FastStringTokenizer in project systemml by apache.

The class DataPartitionerLocal, method partitionTextCell.

private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen, int brlen, int bclen) {
    long row = -1;
    long col = -1;
    try {
        // STEP 1: read matrix from HDFS and write blocks to local staging area
        // check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        LinkedList<Cell> buffer = new LinkedList<>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    // reset tokenizer
                    st.reset(value.toString());
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);
                    buffer.addLast(tmp);
                    // periodic flush
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }
                // final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
        // STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging, fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads) t.join();
        } else {
            for (String pdir : fnamesPartitions) writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        // post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) LinkedList(java.util.LinkedList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File)
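
The parallel branch of STEP 2 hands each worker thread a contiguous slice of the staging directories. A small sketch of that index arithmetic with made-up numbers (plain Java, no SystemML types):

public class ChunkAssignmentDemo {
    public static void main(String[] args) {
        int numFiles = 10; // stands in for fnamesPartitions.length
        int len = 4;       // stands in for Math.min(fnamesPartitions.length, _par)
        for (int i = 0; i < len; i++) {
            int start = i * (int) Math.ceil(((double) numFiles) / len);
            int end = (i + 1) * (int) Math.ceil(((double) numFiles) / len) - 1;
            end = Math.min(end, numFiles - 1);
            // prints [0..2], [3..5], [6..8], [9..9] for these numbers
            System.out.println("worker " + i + " handles files [" + start + ".." + end + "]");
        }
    }
}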

Example 18 with FastStringTokenizer

Use of org.apache.sysml.runtime.util.FastStringTokenizer in project systemml by apache.

The class ReaderTextCell, method readTextCellMatrixFromHDFS.

private static void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;
    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                if (sparse) { // SPARSE<-value
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }
                    dest.sortSparseRows();
                } else { // DENSE<-value
                    DenseBlock a = dest.getDenseBlock();
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        a.set(row, col, lvalue);
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen)
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        else
            throw new IOException("Unable to read matrix in text cell format.", ex);
    }
}
Also used : Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DenseBlock(org.apache.sysml.runtime.matrix.data.DenseBlock) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit)
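
The readers above track the last parsed indices outside the try block so that the catch can tell an out-of-range cell apart from a genuine parse or I/O failure. A tiny sketch of that post-mortem predicate, with made-up values and the 0-based indexing of this example:

public class PostMortemCheckDemo {
    public static void main(String[] args) {
        long rlen = 100, clen = 100;    // declared matrix dimensions
        int row = 150 - 1, col = 3 - 1; // last cell parsed before the failure (0-based)
        boolean outOfRange = row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen;
        System.out.println(outOfRange
            ? "Matrix cell [" + (row + 1) + "," + (col + 1) + "] out of range [1:" + rlen + ",1:" + clen + "]"
            : "failure not caused by an out-of-range cell");
    }
}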

Example 19 with FastStringTokenizer

Use of org.apache.sysml.runtime.util.FastStringTokenizer in project systemml by apache.

The class FrameReaderTextCell, method readTextCellFrameFromInputSplit.

protected static void readTextCellFrameFromInputSplit(InputSplit split, TextInputFormat informat, JobConf job, FrameBlock dest) throws IOException {
    ValueType[] schema = dest.getSchema();
    int rlen = dest.getNumRows();
    int clen = dest.getNumColumns();
    // create record reader
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
    LongWritable key = new LongWritable();
    Text value = new Text();
    FastStringTokenizer st = new FastStringTokenizer(' ');
    int row = -1;
    int col = -1;
    try {
        while (reader.next(key, value)) {
            // reinit tokenizer
            st.reset(value.toString());
            row = st.nextInt() - 1;
            col = st.nextInt() - 1;
            if (row == -3)
                dest.getColumnMetadata(col).setMvValue(st.nextToken());
            else if (row == -2)
                dest.getColumnMetadata(col).setNumDistinct(st.nextLong());
            else
                dest.set(row, col, UtilFunctions.stringToObject(schema[col], st.nextToken()));
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Frame cell [" + (row + 1) + "," + (col + 1) + "] " + "out of overall frame range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new IOException("Unable to read frame in text cell format.", ex);
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
}
Also used : FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) ValueType(org.apache.sysml.parser.Expression.ValueType) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException)
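
The branches above imply that, before the -1 shift, a row index of -2 carries a column's missing-value token and a row index of -1 carries its distinct-value count, while non-negative indices are regular cells. A hedged sketch of that dispatch on made-up input lines, without a FrameBlock:

import org.apache.sysml.runtime.util.FastStringTokenizer;

public class FrameCellDispatchDemo {
    public static void main(String[] args) {
        // made-up sample lines: two metadata rows for column 1, then a regular cell
        String[] lines = { "-2 1 NA", "-1 1 42", "3 1 hello" };
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (String line : lines) {
            st.reset(line);
            int row = st.nextInt() - 1;
            int col = st.nextInt() - 1;
            if (row == -3)
                System.out.println("column " + col + ": missing-value token = " + st.nextToken());
            else if (row == -2)
                System.out.println("column " + col + ": #distinct = " + st.nextLong());
            else
                System.out.println("cell [" + row + "," + col + "] = " + st.nextToken());
        }
    }
}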

Example 20 with FastStringTokenizer

Use of org.apache.sysml.runtime.util.FastStringTokenizer in project systemml by apache.

The class FrameReaderTextCell, method readRawTextCellFrameFromInputStream.

protected static void readRawTextCellFrameFromInputStream(InputStream is, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    // create buffered reader
    BufferedReader br = new BufferedReader(new InputStreamReader(is));
    String value = null;
    FastStringTokenizer st = new FastStringTokenizer(' ');
    int row = -1;
    int col = -1;
    try {
        while ((value = br.readLine()) != null) {
            // reinit tokenizer
            st.reset(value);
            row = st.nextInt() - 1;
            col = st.nextInt() - 1;
            if (row == -3)
                dest.getColumnMetadata(col).setMvValue(st.nextToken());
            else if (row == -2)
                dest.getColumnMetadata(col).setNumDistinct(st.nextLong());
            else
                dest.set(row, col, UtilFunctions.stringToObject(schema[col], st.nextToken()));
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Frame cell [" + (row + 1) + "," + (col + 1) + "] " + "out of overall frame range [1:" + rlen + ",1:" + clen + "].", ex);
        } else {
            throw new IOException("Unable to read frame in raw text cell format.", ex);
        }
    } finally {
        IOUtilFunctions.closeSilently(br);
    }
}
Also used : FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException)
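
Unlike Example 19, this variant reads from a plain InputStream, so it can be fed in-memory data as well as files. A minimal sketch of driving such a line-reading loop over an in-memory stream, shown here with try-with-resources instead of the closeSilently idiom used above (sample data and class name are made up; no SystemML types):

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

public class RawTextCellStreamDemo {
    public static void main(String[] args) throws IOException {
        // made-up raw text-cell content: two cells in "row col value" form
        String data = "1 1 abc\n2 1 def\n";
        InputStream is = new ByteArrayInputStream(data.getBytes(StandardCharsets.UTF_8));
        try (BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
            String line;
            while ((line = br.readLine()) != null)
                System.out.println(line); // a real reader would tokenize the line here
        } // stream closed automatically, even if an exception is thrown
    }
}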

Aggregations

FastStringTokenizer (org.apache.sysml.runtime.util.FastStringTokenizer): 20
IOException (java.io.IOException): 10
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 10
BufferedReader (java.io.BufferedReader): 8
InputStreamReader (java.io.InputStreamReader): 8
LongWritable (org.apache.hadoop.io.LongWritable): 8
Text (org.apache.hadoop.io.Text): 8
LinkedList (java.util.LinkedList): 6
InputSplit (org.apache.hadoop.mapred.InputSplit): 6
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 6
DenseBlock (org.apache.sysml.runtime.matrix.data.DenseBlock): 6
FileInputStream (java.io.FileInputStream): 4
HashMap (java.util.HashMap): 4
Path (org.apache.hadoop.fs.Path): 4
JobConf (org.apache.hadoop.mapred.JobConf): 4
Cell (org.apache.sysml.runtime.controlprogram.parfor.util.Cell): 4
MatrixCell (org.apache.sysml.runtime.matrix.data.MatrixCell): 4
File (java.io.File): 2
SequenceFile (org.apache.hadoop.io.SequenceFile): 2
ValueType (org.apache.sysml.parser.Expression.ValueType): 2