
Example 71 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.

The class RemoteParForMR, method readResultFile.

/**
 * The result file contains a hierarchy of workerID-resultvar pairs (incl. filenames),
 * and we deduplicate on the workerID. Without JVM reuse, each task refers to a unique
 * workerID, so we will not find any duplicates. With JVM reuse, however, each slot
 * refers to a workerID, and there are duplicate filenames due to partial aggregation
 * and overwriting of fname (the RemoteParWorkerMapper ensures uniqueness of those
 * files independent of the runtime implementation).
 *
 * @param job job configuration
 * @param fname file name
 * @return array of local variable maps
 * @throws IOException if an IOException occurs
 */
@SuppressWarnings("deprecation")
public static LocalVariableMap[] readResultFile(JobConf job, String fname) throws IOException {
    HashMap<Long, LocalVariableMap> tmp = new HashMap<>();
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    // workerID
    LongWritable key = new LongWritable();
    // serialized var header (incl filename)
    Text value = new Text();
    int countAll = 0;
    for (Path lpath : IOUtilFunctions.getSequenceFilePaths(fs, path)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
        try {
            while (reader.next(key, value)) {
                if (!tmp.containsKey(key.get()))
                    tmp.put(key.get(), new LocalVariableMap());
                Object[] dat = ProgramConverter.parseDataObject(value.toString());
                tmp.get(key.get()).put((String) dat[0], (Data) dat[1]);
                countAll++;
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
    LOG.debug("Num remote worker results (before deduplication): " + countAll);
    LOG.debug("Num remote worker results: " + tmp.size());
    // create return array
    return tmp.values().toArray(new LocalVariableMap[0]);
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) Text(org.apache.hadoop.io.Text) SequenceFile(org.apache.hadoop.io.SequenceFile) LocalVariableMap(org.apache.sysml.runtime.controlprogram.LocalVariableMap) FileSystem(org.apache.hadoop.fs.FileSystem) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) LongWritable(org.apache.hadoop.io.LongWritable)
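
For context, a minimal caller sketch follows; the result-file path and the print loop are illustrative assumptions, not part of the SystemML sources.

// hedged usage sketch: read and inspect merged worker results
JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
// hypothetical result-file location, for illustration only
String fname = "hdfs:///tmp/systemml/parfor-results";
LocalVariableMap[] results = RemoteParForMR.readResultFile(job, fname);
// one LocalVariableMap per distinct workerID after deduplication
for (LocalVariableMap vars : results)
    System.out.println(vars);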

Example 72 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.

The class ResultMergeLocalFile, method createTextCellStagingFile.

private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LinkedList<Cell> buffer = new LinkedList<>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    // long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    // NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors at runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col, but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
    // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                // reset tokenizer
                st.reset(value.toString());
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());
                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                // periodic flush
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }
            // final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) LinkedList(java.util.LinkedList) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell)
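
For reference, each input line consumed above is a text-cell triple of the form "row col value" with 1-based indices. The following JDK-only sketch mirrors the per-line parsing (java.util.StringTokenizer stands in for the SystemML-internal FastStringTokenizer):

import java.util.StringTokenizer;

public class TextCellParseDemo {
    public static void main(String[] args) {
        // three cells in text-cell format: 1-based row, 1-based col, value
        String[] lines = { "1 1 7.0", "2 3 0.5", "4 2 -1.25" };
        for (String line : lines) {
            StringTokenizer st = new StringTokenizer(line, " ");
            long row = Long.parseLong(st.nextToken());
            long col = Long.parseLong(st.nextToken());
            double val = Double.parseDouble(st.nextToken());
            System.out.println("cell (" + row + "," + col + ") = " + val);
        }
    }
}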

Example 73 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.

The class DataPartitionerLocal, method partitionTextCell.

private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen, int brlen, int bclen) {
    long row = -1;
    long col = -1;
    try {
        // STEP 1: read matrix from HDFS and write blocks to local staging area
        // check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        LinkedList<Cell> buffer = new LinkedList<>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    // reset tokenizer
                    st.reset(value.toString());
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);
                    buffer.addLast(tmp);
                    // periodic flush
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }
                // final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
        // STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging, fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads) t.join();
        } else {
            for (String pdir : fnamesPartitions) writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        // post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) LinkedList(java.util.LinkedList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File)
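
In step 2, the parallel branch hands each worker thread a contiguous range of staging directories using ceiling-based chunking. A standalone sketch of that index arithmetic, with made-up sizes:

// chunking demo: distribute n items over len workers in contiguous ranges
public class ChunkingDemo {
    public static void main(String[] args) {
        int n = 10;  // e.g., number of staging directories
        int len = 3; // e.g., number of worker threads
        int chunk = (int) Math.ceil((double) n / len);
        for (int i = 0; i < len; i++) {
            int start = i * chunk;
            int end = Math.min((i + 1) * chunk - 1, n - 1);
            System.out.println("worker " + i + " handles [" + start + "," + end + "]");
        }
        // prints [0,3], [4,7], [8,9]
    }
}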

Example 74 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.

The class FrameReaderBinaryBlock, method readBinaryBlockFrameFromSequenceFile.

@SuppressWarnings({ "deprecation" })
protected static void readBinaryBlockFrameFromSequenceFile(Path path, JobConf job, FileSystem fs, FrameBlock dest) throws IOException, DMLRuntimeException {
    int rlen = dest.getNumRows();
    int clen = dest.getNumColumns();
    // directly read from sequence files (individual partfiles)
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
    LongWritable key = new LongWritable(-1L);
    FrameBlock value = new FrameBlock();
    try {
        while (reader.next(key, value)) {
            int row_offset = (int) (key.get() - 1);
            int rows = value.getNumRows();
            int cols = value.getNumColumns();
            // empty block, ignore it
            if (rows == 0 || cols == 0)
                continue;
            // bound check per block
            if (row_offset + rows < 0 || row_offset + rows > rlen) {
                throw new IOException("Frame block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + ":" + "] " + "out of overall frame range [1:" + rlen + ",1:" + clen + "].");
            }
            // copy block into target frame, incl meta on first
            dest.copy(row_offset, row_offset + rows - 1, 0, cols - 1, value);
            if (row_offset == 0)
                dest.setColumnMetadata(value.getColumnMetadata());
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
}
Also used : SequenceFile(org.apache.hadoop.io.SequenceFile) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) LongWritable(org.apache.hadoop.io.LongWritable) IOException(java.io.IOException)
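
The sequence-file keys read here are 1-based row offsets, hence the (key.get() - 1) adjustment above. Below is a hedged sketch of the matching writer side, using the analogous deprecated SequenceFile.Writer constructor; the empty block is a placeholder, and SystemML's actual writing logic lives in FrameWriterBinaryBlock.

// minimal writer sketch, assuming fs, job, and path as in the reader above
SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, LongWritable.class, FrameBlock.class);
try {
    // placeholder block; real code appends populated frame blocks
    FrameBlock block = new FrameBlock();
    // key = 1-based row index of the block's first row
    writer.append(new LongWritable(1), block);
} finally {
    IOUtilFunctions.closeSilently(writer);
}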

Example 75 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.

The class FrameReaderTextCSV, method readCSVFrameFromInputSplit.

protected final int readCSVFrameFromInputSplit(InputSplit split, InputFormat<LongWritable, Text> informat, JobConf job, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen, int rl, boolean first) throws IOException {
    boolean hasHeader = _props.hasHeader();
    boolean isFill = _props.isFill();
    double dfillValue = _props.getFillValue();
    String sfillValue = String.valueOf(_props.getFillValue());
    String delim = _props.getDelim();
    // create record reader
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = rl;
    int col = -1;
    // handle header if existing
    if (first && hasHeader) {
        // read header
        reader.next(key, value);
        dest.setColumnNames(value.toString().split(delim));
    }
    // Read the data
    boolean emptyValuesFound = false;
    try {
        // foreach line
        while (reader.next(key, value)) {
            String cellStr = value.toString().trim();
            emptyValuesFound = false;
            col = 0;
            String[] parts = IOUtilFunctions.splitCSV(cellStr, delim);
            // parse frame meta data (missing values / num distinct)
            if (parts[0].equals(TfUtils.TXMTD_MVPREFIX) || parts[0].equals(TfUtils.TXMTD_NDPREFIX)) {
                if (parts[0].equals(TfUtils.TXMTD_MVPREFIX))
                    for (int j = 0; j < dest.getNumColumns(); j++) dest.getColumnMetadata(j).setMvValue(parts[j + 1]);
                else if (parts[0].equals(TfUtils.TXMTD_NDPREFIX))
                    for (int j = 0; j < dest.getNumColumns(); j++) dest.getColumnMetadata(j).setNumDistinct(Long.parseLong(parts[j + 1]));
                continue;
            }
            // foreach cell
            for (String part : parts) {
                part = part.trim();
                if (part.isEmpty()) {
                    if (isFill && dfillValue != 0)
                        dest.set(row, col, UtilFunctions.stringToObject(schema[col], sfillValue));
                    emptyValuesFound = true;
                } else {
                    dest.set(row, col, UtilFunctions.stringToObject(schema[col], part));
                }
                col++;
            }
            // sanity checks for empty values and number of columns
            IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, isFill, emptyValuesFound);
            IOUtilFunctions.checkAndRaiseErrorCSVNumColumns("", cellStr, parts, clen);
            row++;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    return row;
}
Also used : Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable)
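
Empty fields are either filled with the configured default or reported by checkAndRaiseErrorCSVEmptyField. A JDK-only sketch of the fill substitution follows; the fill value is an assumption, and note that SystemML's IOUtilFunctions.splitCSV handles quoting cases that a plain String.split does not:

import java.util.Arrays;

public class CsvFillDemo {
    public static void main(String[] args) {
        String delim = ",";
        String sfillValue = "0";  // assumed fill value
        String line = "1.0,,3.5"; // middle field is empty
        // limit -1 preserves trailing empty fields
        String[] parts = line.split(delim, -1);
        for (int col = 0; col < parts.length; col++) {
            if (parts[col].trim().isEmpty())
                parts[col] = sfillValue; // substitute fill value
        }
        System.out.println(Arrays.toString(parts)); // [1.0, 0, 3.5]
    }
}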

Aggregations

LongWritable (org.apache.hadoop.io.LongWritable): 445
Text (org.apache.hadoop.io.Text): 220
Test (org.junit.Test): 171
IntWritable (org.apache.hadoop.io.IntWritable): 102
Path (org.apache.hadoop.fs.Path): 99
BytesWritable (org.apache.hadoop.io.BytesWritable): 70
FloatWritable (org.apache.hadoop.io.FloatWritable): 68
Configuration (org.apache.hadoop.conf.Configuration): 62
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 62
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 60
ArrayList (java.util.ArrayList): 59
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 57
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 53
IOException (java.io.IOException): 49
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 48
SequenceFile (org.apache.hadoop.io.SequenceFile): 42
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable): 40
FileSystem (org.apache.hadoop.fs.FileSystem): 37
JobConf (org.apache.hadoop.mapred.JobConf): 37
DeferredObject (org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject): 35