Example 41 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.

the class DataPartitionerLocal method partitionTextCell.

private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen, int brlen, int bclen) {
    long row = -1;
    long col = -1;
    try {
        // STEP 1: read matrix from HDFS and write blocks to local staging area
        // check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        LinkedList<Cell> buffer = new LinkedList<>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    // reset tokenizer
                    st.reset(value.toString());
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);
                    buffer.addLast(tmp);
                    // periodic flush
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }
                // final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
        // STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging, fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads) t.join();
        } else {
            for (String pdir : fnamesPartitions) writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        // post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) LinkedList(java.util.LinkedList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File)
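
Examples 41 through 45 all share the same mapred-era skeleton: register the input path on a JobConf, configure the TextInputFormat, ask for input splits, then drain one RecordReader per split. The following is a minimal self-contained sketch of that skeleton, not SystemML code; the input path is a placeholder and the plain reader.close() stands in for SystemML's IOUtilFunctions.closeSilently(reader).

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatSkeleton {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        // placeholder path; the SystemML examples receive it as a method argument
        FileInputFormat.addInputPath(job, new Path("hdfs:///tmp/matrix.textcell"));
        TextInputFormat informat = new TextInputFormat();
        // configure() must run before getSplits(); it pulls settings from the JobConf
        informat.configure(job);
        // the numSplits argument is only a hint
        InputSplit[] splits = informat.getSplits(job, 1);
        // key is the byte offset of the line, value is the line content
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader =
                informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value))
                    System.out.println(value);
            } finally {
                reader.close();
            }
        }
    }
}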

Example 42 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.

the class ReaderTextCSVParallel method readMatrixFromHDFS.

@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz) throws IOException, DMLRuntimeException {
    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, _numThreads);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // check existence and non-empty file
    checkValidInputFile(fs, path);
    // allocate output matrix block
    // First Read Pass (count rows/cols, determine offsets, allocate matrix block)
    MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits, path, job, _props.hasHeader(), _props.getDelim(), estnnz);
    rlen = ret.getNumRows();
    clen = ret.getNumColumns();
    // Second Read Pass (read, parse strings, append to matrix block)
    readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen, _props.hasHeader(), _props.getDelim(), _props.isFill(), _props.getFillValue());
    // post-processing (representation-specific, change of sparse/dense block representation)
    // - no sorting required for CSV because it is read in sorted order per row
    // - nnz explicitly maintained in parallel for the individual splits
    ret.examSparsity();
    // sanity check for parallel row count (since determined internally)
    if (rlen >= 0 && rlen != ret.getNumRows())
        throw new DMLRuntimeException("Read matrix inconsistent with given meta data: " + "expected nrow=" + rlen + ", real nrow=" + ret.getNumRows());
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 43 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.

the class ReaderTextCSVParallel method computeCSVSizeAndCreateOutputMatrixBlock.

private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path, JobConf job, boolean hasHeader, String delim, long estnnz) throws IOException, DMLRuntimeException {
    int nrow = 0;
    int ncol = 0;
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    // determine the number of columns from the first line (delimiter occurrences + 1)
    LongWritable key = new LongWritable();
    Text oneLine = new Text();
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL);
    try {
        if (reader.next(key, oneLine)) {
            String cellStr = oneLine.toString().trim();
            ncol = StringUtils.countMatches(cellStr, delim) + 1;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    // count rows in parallel per split
    try {
        ExecutorService pool = CommonThreadPool.get(_numThreads);
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            tasks.add(new CountRowsTask(split, informat, job, hasHeader));
            hasHeader = false;
        }
        pool.invokeAll(tasks);
        pool.shutdown();
        // collect row counts for offset computation
        // early error notify in case not all tasks successful
        _offsets = new SplitOffsetInfos(tasks.size());
        for (CountRowsTask rt : tasks) {
            if (!rt.getReturnCode())
                throw new IOException("Count task for csv input failed: " + rt.getErrMsg());
            _offsets.setOffsetPerSplit(tasks.indexOf(rt), nrow);
            _offsets.setLenghtPerSplit(tasks.indexOf(rt), rt.getRowCount());
            nrow = nrow + rt.getRowCount();
        }
    } catch (Exception e) {
        throw new IOException("Threadpool Error " + e.getMessage(), e);
    }
    // allocate target matrix block based on given size;
    // need to allocate sparse as well since lock-free insert into target
    long estnnz2 = (estnnz < 0) ? (long) nrow * ncol : estnnz;
    return createOutputMatrixBlock(nrow, ncol, nrow, ncol, estnnz2, true, true);
}
Also used : ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ExecutorService(java.util.concurrent.ExecutorService) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit)
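
The per-split counting that computeCSVSizeAndCreateOutputMatrixBlock delegates to CountRowsTask is a standard Callable pattern. Below is a hedged sketch of such a task; unlike the real CountRowsTask, which stores its count in a field and exposes it via getRowCount(), this illustrative version returns the count directly so it can be aggregated through Future<Long>.

import java.io.IOException;
import java.util.concurrent.Callable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

// illustrative stand-in for CountRowsTask, not the SystemML class
class RowCountSketchTask implements Callable<Long> {
    private final InputSplit split;
    private final TextInputFormat informat;
    private final JobConf job;
    private final boolean skipHeader;

    RowCountSketchTask(InputSplit split, TextInputFormat informat, JobConf job, boolean skipHeader) {
        this.split = split;
        this.informat = informat;
        this.job = job;
        this.skipHeader = skipHeader;
    }

    @Override
    public Long call() throws IOException {
        long rows = 0;
        LongWritable key = new LongWritable();
        Text value = new Text();
        RecordReader<LongWritable, Text> reader =
            informat.getRecordReader(split, job, Reporter.NULL);
        try {
            // only the split holding the start of the file skips the header,
            // which is why the loop above flips hasHeader to false after the first task
            if (skipHeader)
                reader.next(key, value);
            while (reader.next(key, value))
                rows++;
        } finally {
            reader.close();
        }
        return rows;
    }
}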

Example 44 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.

the class ReaderTextCell method readTextCellMatrixFromHDFS.

private static void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;
    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                if (sparse) { // SPARSE<-value
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }
                    dest.sortSparseRows();
                } else { // DENSE<-value
                    DenseBlock a = dest.getDenseBlock();
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        a.set(row, col, lvalue);
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen)
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        else
            throw new IOException("Unable to read matrix in text cell format.", ex);
    }
}
Also used : Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DenseBlock(org.apache.sysml.runtime.matrix.data.DenseBlock) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit)
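
Each text-cell record is a space-separated triple: a 1-based row index, a 1-based column index, and a value, which readTextCellMatrixFromHDFS shifts to 0-based indices before insertion. A minimal sketch of that parsing step, using java.util.StringTokenizer as a standard-library stand-in for SystemML's FastStringTokenizer:

import java.util.StringTokenizer;

public class TextCellParseSketch {
    public static void main(String[] args) {
        // example record: 1-based row and column indices, then the value
        String line = "3 7 4.25";
        StringTokenizer st = new StringTokenizer(line, " ");
        int row = Integer.parseInt(st.nextToken()) - 1; // convert to 0-based
        int col = Integer.parseInt(st.nextToken()) - 1;
        double val = Double.parseDouble(st.nextToken());
        System.out.printf("cell (%d,%d) = %f%n", row, col, val);
    }
}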

Example 45 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.

the class ReaderTextCellParallel method readTextCellMatrixFromHDFS.

private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket) throws IOException {
    int par = _numThreads;
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    // check for min file size for matrix market (adjust num splits if necessary)
    if (_isMMFile) {
        long len = MapReduceTool.getFilesizeOnHDFS(path);
        par = (len < MIN_FILESIZE_MM) ? 1 : par;
    }
    try {
        // create read tasks for all splits
        ExecutorService pool = CommonThreadPool.get(par);
        InputSplit[] splits = informat.getSplits(job, par);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            ReadTask t = new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket);
            tasks.add(t);
        }
        // wait until all tasks have been executed
        List<Future<Long>> rt = pool.invokeAll(tasks);
        // check for exceptions and aggregate nnz
        long lnnz = 0;
        for (Future<Long> task : rt) lnnz += task.get();
        // post-processing
        dest.setNonZeros(lnnz);
        if (dest.isInSparseFormat())
            sortSparseRowsParallel(dest, rlen, _numThreads, pool);
        pool.shutdown();
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
Also used : ArrayList(java.util.ArrayList) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) InputSplit(org.apache.hadoop.mapred.InputSplit)
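
The aggregation over Future<Long> results in readTextCellMatrixFromHDFS is the standard invokeAll pattern. A minimal sketch under two assumptions: a plain fixed thread pool replaces SystemML's CommonThreadPool, and shutdown moves into a finally block so the pool is released even when a task fails (the example above only shuts down on success).

import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class NnzAggregationSketch {
    // sums per-task non-zero counts; task construction is elided
    public static long aggregate(List<Callable<Long>> tasks, int par) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(par);
        try {
            long lnnz = 0;
            // invokeAll blocks until every task has finished
            for (Future<Long> f : pool.invokeAll(tasks))
                lnnz += f.get(); // get() rethrows any exception from the task
            return lnnz;
        } finally {
            pool.shutdown();
        }
    }
}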

Aggregations

TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 49 uses
InputSplit (org.apache.hadoop.mapred.InputSplit): 39 uses
IOException (java.io.IOException): 26 uses
Path (org.apache.hadoop.fs.Path): 25 uses
JobConf (org.apache.hadoop.mapred.JobConf): 24 uses
LongWritable (org.apache.hadoop.io.LongWritable): 19 uses
Text (org.apache.hadoop.io.Text): 19 uses
ArrayList (java.util.ArrayList): 16 uses
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 14 uses
ExecutorService (java.util.concurrent.ExecutorService): 12 uses
Future (java.util.concurrent.Future): 8 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 8 uses
FastStringTokenizer (org.apache.sysml.runtime.util.FastStringTokenizer): 6 uses
Configuration (org.apache.hadoop.conf.Configuration): 4 uses
Pair (org.apache.sysml.runtime.matrix.data.Pair): 4 uses
LinkedList (java.util.LinkedList): 3 uses
Properties (java.util.Properties): 3 uses
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 3 uses
HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat): 3 uses
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 3 uses