
Example 81 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class FrameReaderTextCell, method readTextCellFrameFromHDFS:

protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    if (fs.isDirectory(path)) {
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        for (InputSplit split : splits) readTextCellFrameFromInputSplit(split, informat, job, dest);
    } else {
        readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
    }
}
Also used: TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), InputSplit (org.apache.hadoop.mapred.InputSplit)
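
A minimal, self-contained sketch of the split-reading pattern all of these examples share: register the input path on a JobConf, configure a TextInputFormat, ask it for splits, and drain each split through a RecordReader. The class name and the line-printing body are illustrative, not SystemML code.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class SplitReadSketch {

    public static void readAllLines(String fname) throws IOException {
        JobConf job = new JobConf();
        FileInputFormat.addInputPath(job, new Path(fname));
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        // numSplits is only a hint; the format may return more splits
        InputSplit[] splits = informat.getSplits(job, 1);
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                // key is the byte offset of the line, value the line itself
                while (reader.next(key, value)) System.out.println(value.toString());
            } finally {
                reader.close();
            }
        }
    }
}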

Example 82 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class ReaderTextCSVParallel, method readMatrixFromHDFS:

@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz) throws IOException, DMLRuntimeException {
    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, _numThreads);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // check existence and non-empty file
    checkValidInputFile(fs, path);
    // allocate output matrix block
    // First Read Pass (count rows/cols, determine offsets, allocate matrix block)
    MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits, path, job, _props.hasHeader(), _props.getDelim(), estnnz);
    rlen = ret.getNumRows();
    clen = ret.getNumColumns();
    // Second Read Pass (read, parse strings, append to matrix block)
    readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen, _props.hasHeader(), _props.getDelim(), _props.isFill(), _props.getFillValue());
    // post-processing (representation-specific, change of sparse/dense block representation)
    // - no sorting required for CSV because it is read in sorted order per row
    // - nnz explicitly maintained in parallel for the individual splits
    ret.examSparsity();
    // sanity check for parallel row count (since determined internally)
    if (rlen >= 0 && rlen != ret.getNumRows())
        throw new DMLRuntimeException("Read matrix inconsistent with given meta data: " + "expected nrow=" + rlen + ", real nrow=" + ret.getNumRows());
    return ret;
}
Also used: Path (org.apache.hadoop.fs.Path), MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), FileSystem (org.apache.hadoop.fs.FileSystem), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)
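
The IOUtilFunctions.sortInputSplits call above matters because row offsets are later assigned per split, so splits must be visited in file order. The helper below is a hypothetical re-implementation of such a sort, not SystemML's actual code; it assumes the splits are FileSplits, which is what TextInputFormat produces.

import java.util.Arrays;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;

public class SplitSort {

    // Order splits by file path, then by byte offset within the file,
    // so that split i always precedes split i+1 in the input.
    public static InputSplit[] sortByOffset(InputSplit[] splits) {
        Arrays.sort(splits, (a, b) -> {
            FileSplit fa = (FileSplit) a, fb = (FileSplit) b;
            int c = fa.getPath().toString().compareTo(fb.getPath().toString());
            return (c != 0) ? c : Long.compare(fa.getStart(), fb.getStart());
        });
        return splits;
    }
}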

Example 83 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class ReaderTextCSVParallel, method computeCSVSizeAndCreateOutputMatrixBlock:

private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path, JobConf job, boolean hasHeader, String delim, long estnnz) throws IOException, DMLRuntimeException {
    int nrow = 0;
    int ncol = 0;
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    // determine the number of columns from the first (non-header) row
    LongWritable key = new LongWritable();
    Text oneLine = new Text();
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL);
    try {
        if (reader.next(key, oneLine)) {
            String cellStr = oneLine.toString().trim();
            ncol = StringUtils.countMatches(cellStr, delim) + 1;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    // count rows in parallel per split
    try {
        ExecutorService pool = CommonThreadPool.get(_numThreads);
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            tasks.add(new CountRowsTask(split, informat, job, hasHeader));
            hasHeader = false;
        }
        pool.invokeAll(tasks);
        pool.shutdown();
        // collect row counts for offset computation
        // fail early if not all tasks completed successfully
        _offsets = new SplitOffsetInfos(tasks.size());
        for (CountRowsTask rt : tasks) {
            if (!rt.getReturnCode())
                throw new IOException("Count task for csv input failed: " + rt.getErrMsg());
            _offsets.setOffsetPerSplit(tasks.indexOf(rt), nrow);
            _offsets.setLenghtPerSplit(tasks.indexOf(rt), rt.getRowCount());
            nrow = nrow + rt.getRowCount();
        }
    } catch (Exception e) {
        throw new IOException("Threadpool Error " + e.getMessage(), e);
    }
    // allocate target matrix block based on given size;
    // need to allocate sparse as well since lock-free insert into target
    long estnnz2 = (estnnz < 0) ? (long) nrow * ncol : estnnz;
    return createOutputMatrixBlock(nrow, ncol, nrow, ncol, estnnz2, true, true);
}
Also used: ArrayList (java.util.ArrayList), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), ExecutorService (java.util.concurrent.ExecutorService), LongWritable (org.apache.hadoop.io.LongWritable), InputSplit (org.apache.hadoop.mapred.InputSplit)
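
The offset bookkeeping in this example is a count-then-prefix-sum: tasks count rows per split in parallel, and a sequential pass turns the counts into starting row offsets. The sketch below isolates that idea, with plain string arrays standing in for InputSplits; OffsetSketch and its names are illustrative, not SystemML APIs.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class OffsetSketch {

    public static int[] computeOffsets(List<String[]> chunks, int numThreads) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        List<Callable<Integer>> tasks = new ArrayList<>();
        for (final String[] chunk : chunks)
            // stand-in for a per-split row count task
            tasks.add(() -> chunk.length);
        List<Future<Integer>> counts = pool.invokeAll(tasks);
        pool.shutdown();
        // sequential prefix sum: chunk i starts at the total row count so far
        int[] offsets = new int[chunks.size()];
        int nrow = 0;
        for (int i = 0; i < counts.size(); i++) {
            offsets[i] = nrow;
            nrow += counts.get(i).get();
        }
        return offsets;
    }
}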

Example 84 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class ReaderTextCell, method readTextCellMatrixFromHDFS:

private static void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;
    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                if (sparse) { // SPARSE<-value
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }
                    dest.sortSparseRows();
                } else { // DENSE<-value
                    DenseBlock a = dest.getDenseBlock();
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        a.set(row, col, lvalue);
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen)
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        else
            throw new IOException("Unable to read matrix in text cell format.", ex);
    }
}
Also used: Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), DenseBlock (org.apache.sysml.runtime.matrix.data.DenseBlock), FastStringTokenizer (org.apache.sysml.runtime.util.FastStringTokenizer), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), LongWritable (org.apache.hadoop.io.LongWritable), InputSplit (org.apache.hadoop.mapred.InputSplit)
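
Each text-cell line has the form "row col value" with 1-based indices, which is why the reader subtracts 1 after every nextInt(). The stand-in parser below performs the same conversion with String.split; FastStringTokenizer exists precisely to avoid the per-line allocations this simpler version incurs. TextCellParse is an illustrative class, not part of SystemML.

public class TextCellParse {

    // Parse one "i j v" line and write it into a dense 2D array.
    public static void parseInto(String line, double[][] dense) {
        String[] parts = line.trim().split("\\s+");
        int row = Integer.parseInt(parts[0]) - 1; // indices are 1-based on disk
        int col = Integer.parseInt(parts[1]) - 1;
        double val = Double.parseDouble(parts[2]);
        dense[row][col] = val;
    }
}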

Example 85 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class ReaderTextCellParallel, method readTextCellMatrixFromHDFS:

private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket) throws IOException {
    int par = _numThreads;
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    // check for min file size for matrix market (adjust num splits if necessary)
    if (_isMMFile) {
        long len = MapReduceTool.getFilesizeOnHDFS(path);
        par = (len < MIN_FILESIZE_MM) ? 1 : par;
    }
    try {
        // create read tasks for all splits
        ExecutorService pool = CommonThreadPool.get(par);
        InputSplit[] splits = informat.getSplits(job, par);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            ReadTask t = new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket);
            tasks.add(t);
        }
        // wait until all tasks have been executed
        List<Future<Long>> rt = pool.invokeAll(tasks);
        // check for exceptions and aggregate nnz
        long lnnz = 0;
        for (Future<Long> task : rt) lnnz += task.get();
        // post-processing
        dest.setNonZeros(lnnz);
        if (dest.isInSparseFormat())
            sortSparseRowsParallel(dest, rlen, _numThreads, pool);
        pool.shutdown();
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
Also used: ArrayList (java.util.ArrayList), IOException (java.io.IOException), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), ExecutorService (java.util.concurrent.ExecutorService), Future (java.util.concurrent.Future), InputSplit (org.apache.hadoop.mapred.InputSplit)
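
The error handling here relies on an ExecutorService idiom: invokeAll blocks until every task finishes, and any exception a task threw resurfaces from Future.get(), so the nnz-summing loop doubles as the failure check. The sketch below isolates that idiom; NnzAggregate is an illustrative name, not a SystemML API.

import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class NnzAggregate {

    // Run all tasks, then sum their results; a failed task rethrows
    // its exception from Future.get(), aborting the aggregation.
    public static long sum(List<Callable<Long>> tasks, int par) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(par);
        try {
            long total = 0;
            for (Future<Long> f : pool.invokeAll(tasks)) total += f.get();
            return total;
        } finally {
            pool.shutdown();
        }
    }
}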

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 161
Path (org.apache.hadoop.fs.Path): 57
JobConf (org.apache.hadoop.mapred.JobConf): 56
Test (org.junit.Test): 49
IOException (java.io.IOException): 47
ArrayList (java.util.ArrayList): 29
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27
FileSplit (org.apache.hadoop.mapred.FileSplit): 24
FileSystem (org.apache.hadoop.fs.FileSystem): 21
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 21
InputFormat (org.apache.hadoop.mapred.InputFormat): 19
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19
NullWritable (org.apache.hadoop.io.NullWritable): 18
Text (org.apache.hadoop.io.Text): 18
Configuration (org.apache.hadoop.conf.Configuration): 14
LongWritable (org.apache.hadoop.io.LongWritable): 11
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10
Properties (java.util.Properties): 9
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 9
HashMap (java.util.HashMap): 8