Search in sources :

Example 31 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.

the class FrameReaderTextCSV method readCSVFrameFromHDFS.

/**
 * Reads a CSV frame into {@code dest} by visiting every input split of the job in sorted order.
 *
 * <p>The splits are requested from a {@link TextInputFormat} configured with {@code job} and
 * sorted via {@code IOUtilFunctions.sortInputSplits}. Each split is then handed to
 * {@code readCSVFrameFromInputSplit}; the value returned for one split is passed on as the
 * row-position argument of the next call, and only the first split is flagged as "first".
 *
 * @param path   input path (not referenced directly in this method)
 * @param job    job configuration used to configure the input format and compute the splits
 * @param fs     file system handle (not referenced directly in this method)
 * @param dest   frame block that receives the parsed data
 * @param schema value types forwarded to the per-split reader
 * @param names  column names forwarded to the per-split reader
 * @param rlen   number of rows forwarded to the per-split reader
 * @param clen   number of columns forwarded to the per-split reader
 * @throws IOException if computing the splits or reading a split fails
 */
protected void readCSVFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = IOUtilFunctions.sortInputSplits(informat.getSplits(job, 1));

    // running row position: the result of one split read seeds the next one
    int rowOffset = 0;
    int splitIndex = 0;
    for (InputSplit split : splits) {
        boolean firstSplit = (splitIndex == 0);
        rowOffset = readCSVFrameFromInputSplit(split, informat, job, dest, schema, names, rlen, clen, rowOffset, firstSplit);
        splitIndex++;
    }
}
Also used : TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 32 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.

the class FrameReaderTextCSV method computeCSVSize.

/**
 * Determines the dimensions of a CSV input by scanning all of its input splits sequentially.
 *
 * <p>The column count is delegated to {@code IOUtilFunctions.countNumColumnsCSV}. The row count
 * is obtained by reading every record of every split, skipping one header record in the first
 * split when {@code _props.hasHeader()} is set, and not counting records that start with
 * {@code TfUtils.TXMTD_MVPREFIX} or {@code TfUtils.TXMTD_NDPREFIX}.
 *
 * @param path input path (not referenced directly in this method)
 * @param job  job configuration used to configure the input format and compute the splits
 * @param fs   file system handle (not referenced directly in this method)
 * @return pair of (number of rows, number of columns)
 * @throws IOException if computing the splits or reading a record fails
 */
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = IOUtilFunctions.sortInputSplits(informat.getSplits(job, 1));

    // number of columns
    final int numCols = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());

    // number of rows, accumulated over all splits
    int numRows = 0;
    for (int s = 0; s < splits.length; s++) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[s], job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text line = new Text();
        try {
            // the header, if any, is consumed from the first split without being counted
            if (s == 0 && _props.hasHeader()) {
                reader.next(key, line);
            }
            // every remaining record counts as a row unless it is a meta data line
            while (reader.next(key, line)) {
                String row = line.toString();
                boolean metaLine = row.startsWith(TfUtils.TXMTD_MVPREFIX) || row.startsWith(TfUtils.TXMTD_NDPREFIX);
                if (!metaLine) {
                    numRows++;
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
    return new Pair<>(numRows, numCols);
}
Also used : TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit) Pair(org.apache.sysml.runtime.matrix.data.Pair)

Example 33 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.

the class FrameReaderTextCSVParallel method computeCSVSize.

/**
 * Determines the dimensions of a CSV input, counting the rows of all input splits in parallel.
 *
 * <p>The column count is delegated to {@code IOUtilFunctions.countNumColumnsCSV}. For the row
 * count, one {@code CountRowsTask} per split is submitted to a thread pool and the per-split
 * results are summed up.
 *
 * <p>The splits are sorted before the tasks are created so that index 0 reliably denotes the
 * first split of the input; only that task is created with the "first split" flag, which
 * presumably controls header skipping inside {@code CountRowsTask}.
 *
 * @param path input path (not referenced directly in this method)
 * @param job  job configuration used to configure the input format and compute the splits
 * @param fs   file system handle (not referenced directly in this method)
 * @return pair of (number of rows, number of columns)
 * @throws IOException if computing the splits fails, if any counting task fails, or if the
 *                     calling thread is interrupted while waiting for the tasks
 */
@Override
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, numThreads);
    // establish a deterministic split order; without it "i == 0" below need not be the
    // split that starts at the beginning of the input
    splits = IOUtilFunctions.sortInputSplits(splits);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    ExecutorService pool = CommonThreadPool.get(numThreads);
    try {
        ArrayList<CountRowsTask> tasks = new ArrayList<>(splits.length);
        for (int i = 0; i < splits.length; i++) {
            tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i == 0));
        }
        List<Future<Long>> cret = pool.invokeAll(tasks);
        for (Future<Long> count : cret) {
            nrow += count.get().intValue();
        }
    } catch (InterruptedException e) {
        // keep the interrupt visible to callers further up the stack
        Thread.currentThread().interrupt();
        throw new IOException("Failed parallel read of text csv input.", e);
    } catch (Exception e) {
        throw new IOException("Failed parallel read of text csv input.", e);
    } finally {
        pool.shutdown();
    }
    return new Pair<>(nrow, ncol);
}
Also used : ArrayList(java.util.ArrayList) IOException(java.io.IOException) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) InputSplit(org.apache.hadoop.mapred.InputSplit) Pair(org.apache.sysml.runtime.matrix.data.Pair)

Example 34 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.

the class FrameReaderTextCell method readTextCellFrameFromHDFS.

/**
 * Reads a text-cell frame into {@code dest}, choosing the read path by the kind of {@code path}.
 *
 * <p>If {@code path} is not a directory, the work is delegated to
 * {@code readRawTextCellFrameFromHDFS}. Otherwise the path is registered as an input path of
 * {@code job}, the input splits are computed through a {@link TextInputFormat}, and each split
 * is read with {@code readTextCellFrameFromInputSplit}.
 *
 * @param path   input file or directory
 * @param job    job configuration; receives {@code path} as input path in the directory case
 * @param fs     file system used to test whether {@code path} is a directory
 * @param dest   frame block that receives the parsed data
 * @param schema value types, forwarded only to the raw (non-directory) reader
 * @param names  column names, forwarded only to the raw (non-directory) reader
 * @param rlen   number of rows, forwarded only to the raw (non-directory) reader
 * @param clen   number of columns, forwarded only to the raw (non-directory) reader
 * @throws IOException if the directory test, the split computation or a read fails
 */
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    // non-directory input: use the raw reader and finish
    if (!fs.isDirectory(path)) {
        readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
        return;
    }

    // directory input: read split by split through the Hadoop input format
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    for (int i = 0; i < splits.length; i++) {
        readTextCellFrameFromInputSplit(splits[i], informat, job, dest);
    }
}
Also used : TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 35 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.

the class ReaderTextCSVParallel method readMatrixFromHDFS.

/**
 * Reads a CSV matrix from HDFS in two passes over the input splits.
 *
 * <p>The first pass ({@code computeCSVSizeAndCreateOutputMatrixBlock}) determines the dimensions
 * and allocates the output block; the second pass ({@code readCSVMatrixFromHDFS}) parses the
 * data into it. The dimensions used for reading are always the ones found in the first pass.
 * The caller-supplied {@code rlen} is only used for a final consistency check, and the
 * caller-supplied {@code clen} is not checked.
 *
 * @param fname  name of the input file or directory
 * @param rlen   expected number of rows; a negative value disables the consistency check
 * @param clen   number of columns given by the caller (replaced by the detected value)
 * @param brlen  forwarded to the second read pass
 * @param bclen  forwarded to the second read pass
 * @param estnnz estimated number of non-zeros, forwarded to the output block allocation
 * @return the matrix block populated from the CSV input
 * @throws IOException         if file access or reading fails
 * @throws DMLRuntimeException if a non-negative {@code rlen} was given and differs from the
 *                             number of rows actually found in the input
 */
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz) throws IOException, DMLRuntimeException {
    // keep the row count given by the caller; rlen itself is replaced by the detected
    // value below, which would otherwise make the final sanity check compare a value
    // with itself
    final long expectedRows = rlen;
    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, _numThreads);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // check existence and non-empty file
    checkValidInputFile(fs, path);
    // allocate output matrix block
    // First Read Pass (count rows/cols, determine offsets, allocate matrix block)
    MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits, path, job, _props.hasHeader(), _props.getDelim(), estnnz);
    rlen = ret.getNumRows();
    clen = ret.getNumColumns();
    // Second Read Pass (read, parse strings, append to matrix block)
    readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen, _props.hasHeader(), _props.getDelim(), _props.isFill(), _props.getFillValue());
    // post-processing (representation-specific, change of sparse/dense block representation)
    // - no sorting required for CSV because it is read in sorted order per row
    // - nnz explicitly maintained in parallel for the individual splits
    ret.examSparsity();
    // sanity check for parallel row count (since determined internally)
    if (expectedRows >= 0 && expectedRows != ret.getNumRows()) {
        throw new DMLRuntimeException("Read matrix inconsistent with given meta data: " + "expected nrow=" + expectedRows + ", real nrow=" + ret.getNumRows());
    }
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

TextInputFormat (org.apache.hadoop.mapred.TextInputFormat) 49, InputSplit (org.apache.hadoop.mapred.InputSplit) 39, IOException (java.io.IOException) 26, Path (org.apache.hadoop.fs.Path) 25, JobConf (org.apache.hadoop.mapred.JobConf) 24, LongWritable (org.apache.hadoop.io.LongWritable) 19, Text (org.apache.hadoop.io.Text) 19, ArrayList (java.util.ArrayList) 16, DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) 14, ExecutorService (java.util.concurrent.ExecutorService) 12, Future (java.util.concurrent.Future) 8, FileSystem (org.apache.hadoop.fs.FileSystem) 8, FastStringTokenizer (org.apache.sysml.runtime.util.FastStringTokenizer) 6, Configuration (org.apache.hadoop.conf.Configuration) 4, Pair (org.apache.sysml.runtime.matrix.data.Pair) 4, LinkedList (java.util.LinkedList) 3, Properties (java.util.Properties) 3, ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment) 3, HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat) 3, Tuple2 (org.apache.flink.api.java.tuple.Tuple2) 3