Example 41 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class FrameReaderTextCSV, method readFrameFromInputStream.

@Override
public FrameBlock readFrameFromInputStream(InputStream is, ValueType[] schema, String[] names, long rlen, long clen) throws IOException, DMLRuntimeException {
    // allocate output frame block
    ValueType[] lschema = createOutputSchema(schema, clen);
    String[] lnames = createOutputNames(names, clen);
    FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen);
    // core read (sequential/parallel)
    InputStreamInputFormat informat = new InputStreamInputFormat(is);
    InputSplit split = informat.getSplits(null, 1)[0];
    readCSVFrameFromInputSplit(split, informat, null, ret, schema, names, rlen, clen, 0, true);
    return ret;
}
Also used: InputStreamInputFormat (org.apache.sysml.runtime.util.InputStreamInputFormat), ValueType (org.apache.sysml.parser.Expression.ValueType), FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock), InputSplit (org.apache.hadoop.mapred.InputSplit)
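
For context, a minimal sketch of how this reader might be invoked. The file name, schema, and dimensions below are illustrative assumptions, and the reader is assumed to be constructed with default CSV format properties:

import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.io.FrameReaderTextCSV;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;
import org.apache.sysml.runtime.matrix.data.FrameBlock;

public class ReadCsvFrameSketch {
    public static void main(String[] args) throws Exception {
        // default CSV properties, assumed sufficient for this sketch
        FrameReaderTextCSV reader = new FrameReaderTextCSV(new CSVFileFormatProperties());
        ValueType[] schema = { ValueType.STRING, ValueType.DOUBLE };
        String[] names = { "id", "score" };
        // "data.csv" is a hypothetical 100x2 csv file without header
        try (InputStream is = new FileInputStream("data.csv")) {
            FrameBlock fb = reader.readFrameFromInputStream(is, schema, names, 100, 2);
            System.out.println("rows read: " + fb.getNumRows());
        }
    }
}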

Example 42 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class FrameReaderTextCSVParallel, method readCSVFrameFromHDFS.

@Override
protected void readCSVFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, numThreads);
    splits = IOUtilFunctions.sortInputSplits(splits);
    try {
        ExecutorService pool = CommonThreadPool.get(Math.min(numThreads, splits.length));
        // compute num rows per split
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (int i = 0; i < splits.length; i++) tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i == 0));
        List<Future<Long>> cret = pool.invokeAll(tasks);
        // compute row offset per split via cumsum on row counts
        long offset = 0;
        List<Long> offsets = new ArrayList<>();
        for (Future<Long> count : cret) {
            offsets.add(offset);
            offset += count.get();
        }
        // read individual splits
        ArrayList<ReadRowsTask> tasks2 = new ArrayList<>();
        for (int i = 0; i < splits.length; i++) tasks2.add(new ReadRowsTask(splits[i], informat, job, dest, offsets.get(i).intValue(), i == 0));
        List<Future<Object>> rret = pool.invokeAll(tasks2);
        pool.shutdown();
        // error handling
        for (Future<Object> read : rret) read.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel read of text csv input.", e);
    }
}
Also used: ArrayList (java.util.ArrayList), IOException (java.io.IOException), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), ExecutorService (java.util.concurrent.ExecutorService), Future (java.util.concurrent.Future), InputSplit (org.apache.hadoop.mapred.InputSplit)
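
The two-pass structure above, counting rows per split and then turning those counts into write offsets via a cumulative sum, is a generic pattern. A self-contained sketch of just that pattern, with plain Callables standing in for CountRowsTask:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class OffsetCumsumSketch {
    public static void main(String[] args) throws Exception {
        // pretend per-split row counts produced by a first parallel pass
        long[] splitRows = { 120, 80, 200 };
        ExecutorService pool = Executors.newFixedThreadPool(splitRows.length);
        List<Callable<Long>> countTasks = new ArrayList<>();
        for (long rows : splitRows)
            countTasks.add(() -> rows); // stands in for CountRowsTask
        List<Future<Long>> counts = pool.invokeAll(countTasks);
        pool.shutdown();
        // cumsum: each split's write offset is the sum of all prior counts
        long offset = 0;
        List<Long> offsets = new ArrayList<>();
        for (Future<Long> c : counts) {
            offsets.add(offset);
            offset += c.get();
        }
        System.out.println(offsets); // prints [0, 120, 200]
    }
}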

Example 43 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class FrameReaderTextCellParallel, method readTextCellFrameFromHDFS.

@Override
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    try {
        // create read tasks for all splits
        ExecutorService pool = CommonThreadPool.get(numThreads);
        InputSplit[] splits = informat.getSplits(job, numThreads);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) tasks.add(new ReadTask(split, informat, job, dest));
        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // check for exceptions
        for (Future<Object> task : rt) task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel read of text cell input.", e);
    }
}
Also used: ArrayList (java.util.ArrayList), IOException (java.io.IOException), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), ExecutorService (java.util.concurrent.ExecutorService), Future (java.util.concurrent.Future), InputSplit (org.apache.hadoop.mapred.InputSplit)
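
Note that invokeAll waits for all tasks to finish but does not surface their failures; the final loop over rt is the actual error handling, because Future.get rethrows a task's exception wrapped in an ExecutionException. A minimal sketch of that behavior:

import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class InvokeAllErrorSketch {
    public static void main(String[] args) {
        ExecutorService pool = Executors.newFixedThreadPool(2);
        List<Callable<Object>> tasks = List.of(
            () -> null, // succeeds
            () -> { throw new IllegalStateException("bad split"); });
        try {
            // invokeAll blocks until all tasks complete, successful or not;
            // each get() rethrows the task's failure as an ExecutionException
            for (Future<Object> f : pool.invokeAll(tasks))
                f.get();
        } catch (Exception e) {
            System.out.println("propagated: " + e.getCause());
        } finally {
            pool.shutdown();
        }
    }
}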

Example 44 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class ReaderTextCSVParallel, method readCSVMatrixFromHDFS.

private void readCSVMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue) throws IOException {
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    ExecutorService pool = CommonThreadPool.get(_numThreads);
    try {
        // create read tasks for all splits
        ArrayList<CSVReadTask> tasks = new ArrayList<>();
        int splitCount = 0;
        for (InputSplit split : splits) {
            tasks.add(new CSVReadTask(split, _offsets, informat, job, dest, rlen, clen, hasHeader, delim, fill, fillValue, splitCount++));
        }
        pool.invokeAll(tasks);
        pool.shutdown();
        // check return codes and aggregate nnz
        long lnnz = 0;
        for (CSVReadTask rt : tasks) {
            lnnz += rt.getPartialNnz();
            if (!rt.getReturnCode()) {
                Exception err = rt.getException();
                throw new IOException("Read task for csv input failed: " + err.toString(), err);
            }
        }
        dest.setNonZeros(lnnz);
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
Also used: TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), ExecutorService (java.util.concurrent.ExecutorService), ArrayList (java.util.ArrayList), IOException (java.io.IOException), InputSplit (org.apache.hadoop.mapred.InputSplit), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)
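
Unlike the previous two examples, CSVReadTask reports failure through a return code and accumulates a partial non-zero count that the caller sums into the final nnz. A stripped-down sketch of that pattern; PartialTask is a hypothetical stand-in for CSVReadTask:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class PartialNnzSketch {
    // hypothetical stand-in for CSVReadTask: counts non-zeros in one chunk
    static class PartialTask implements Callable<Object> {
        private final double[] chunk;
        private long partialNnz = 0;
        private boolean ok = true;
        private Exception err = null;
        PartialTask(double[] chunk) { this.chunk = chunk; }
        @Override
        public Object call() {
            try {
                for (double v : chunk)
                    if (v != 0) partialNnz++;
            } catch (Exception e) {
                ok = false; err = e;
            }
            return null;
        }
        long getPartialNnz() { return partialNnz; }
        boolean getReturnCode() { return ok; }
        Exception getException() { return err; }
    }

    public static void main(String[] args) throws Exception {
        double[][] chunks = { { 1, 0, 2 }, { 0, 0, 3 } };
        ExecutorService pool = Executors.newFixedThreadPool(2);
        List<PartialTask> tasks = new ArrayList<>();
        for (double[] c : chunks) tasks.add(new PartialTask(c));
        pool.invokeAll(tasks);
        pool.shutdown();
        // check return codes and aggregate nnz, as in the example above
        long nnz = 0;
        for (PartialTask t : tasks) {
            if (!t.getReturnCode())
                throw new IOException("read task failed", t.getException());
            nnz += t.getPartialNnz();
        }
        System.out.println("total nnz: " + nnz); // prints 3
    }
}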

Example 45 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class RemoveEmptyRows, method execute.

@Override
public void execute() {
    Matrix mat = (Matrix) this.getFunctionInput(0);
    String fnameOld = mat.getFilePath();
    // old,new rowID
    HashMap<Long, Long> keyMap = new HashMap<>();
    try {
        // prepare input
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameOld);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        if (!fs.exists(path))
            throw new IOException("File " + fnameOld + " does not exist on HDFS.");
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        // prepare output
        String fnameNew = createOutputFilePathAndName(OUTPUT_FILE);
        DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream(fnameNew, true);
        // read and write if necessary
        InputSplit[] splits = informat.getSplits(job, 1);
        LongWritable key = new LongWritable();
        Text value = new Text();
        long ID = 1;
        try {
            // for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();
            for (InputSplit split : splits) {
                RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
                try {
                    while (reader.next(key, value)) {
                        String cellStr = value.toString().trim();
                        StringTokenizer st = new StringTokenizer(cellStr, " ");
                        long row = Long.parseLong(st.nextToken());
                        long col = Long.parseLong(st.nextToken());
                        double lvalue = Double.parseDouble(st.nextToken());
                        if (!keyMap.containsKey(row))
                            keyMap.put(row, ID++);
                        long rowNew = keyMap.get(row);
                        sb.append(rowNew);
                        sb.append(' ');
                        sb.append(col);
                        sb.append(' ');
                        sb.append(lvalue);
                        sb.append('\n');
                        ostream.writeBytes(sb.toString());
                        sb.setLength(0);
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            }
            _ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
        } finally {
            if (ostream != null)
                ostream.close();
        }
    } catch (Exception ex) {
        throw new RuntimeException("Unable to execute external function.", ex);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), HashMap (java.util.HashMap), DataOutputStream (java.io.DataOutputStream), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), StringTokenizer (java.util.StringTokenizer), Matrix (org.apache.sysml.udf.Matrix), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), FileSystem (org.apache.hadoop.fs.FileSystem), LongWritable (org.apache.hadoop.io.LongWritable), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit)
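
The heart of this example is keyMap: surviving rows are renumbered densely in order of first appearance, so empty rows simply disappear from the sparse (row, col, value) text output. A self-contained sketch of just the renumbering step, on made-up cells:

import java.util.HashMap;
import java.util.Map;

public class RenumberRowsSketch {
    public static void main(String[] args) {
        // sparse cells as "row col value"; rows 2 and 4 are empty
        String[] cells = { "1 1 7.0", "3 2 5.0", "5 1 1.0", "3 1 2.0" };
        Map<Long, Long> keyMap = new HashMap<>(); // old rowID -> new rowID
        long nextId = 1;
        StringBuilder sb = new StringBuilder();
        for (String cell : cells) {
            String[] parts = cell.trim().split(" ");
            long row = Long.parseLong(parts[0]);
            // assign new IDs in order of first appearance
            if (!keyMap.containsKey(row))
                keyMap.put(row, nextId++);
            sb.append(keyMap.get(row)).append(' ')
              .append(parts[1]).append(' ')
              .append(parts[2]).append('\n');
        }
        System.out.print(sb); // rows 1, 3, 5 become 1, 2, 3
    }
}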

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 161
Path (org.apache.hadoop.fs.Path): 57
JobConf (org.apache.hadoop.mapred.JobConf): 56
Test (org.junit.Test): 49
IOException (java.io.IOException): 47
ArrayList (java.util.ArrayList): 29
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27
FileSplit (org.apache.hadoop.mapred.FileSplit): 24
FileSystem (org.apache.hadoop.fs.FileSystem): 21
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 21
InputFormat (org.apache.hadoop.mapred.InputFormat): 19
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19
NullWritable (org.apache.hadoop.io.NullWritable): 18
Text (org.apache.hadoop.io.Text): 18
Configuration (org.apache.hadoop.conf.Configuration): 14
LongWritable (org.apache.hadoop.io.LongWritable): 11
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10
Properties (java.util.Properties): 9
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 9
HashMap (java.util.HashMap): 8