Example 76 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class RemoteParForColocatedNLineInputFormat, method getSplits:

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    InputSplit[] tmp = super.getSplits(job, numSplits);
    // get partitioning information
    MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
    PDataPartitionFormat dpf = MRJobConfiguration.getPartitioningFormat(job);
    PartitionFormat pf = new PartitionFormat(dpf, -1);
    int blen = (int) (pf.isRowwise() ? pf.getNumRows(mc) : pf.getNumColumns(mc));
    String fname = MRJobConfiguration.getPartitioningFilename(job);
    // create wrapper splits
    InputSplit[] ret = new InputSplit[tmp.length];
    for (int i = 0; i < tmp.length; i++) {
        // check for robustness of subsequent cast
        if (tmp[i] instanceof FileSplit)
            ret[i] = new RemoteParForColocatedFileSplit((FileSplit) tmp[i], fname, blen);
        else
            ret[i] = tmp[i];
    }
    return ret;
}
Also used: PDataPartitionFormat(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat) PartitionFormat(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PartitionFormat) FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)
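
The override above wraps each FileSplit in a RemoteParForColocatedFileSplit so downstream scheduling can use partition-aware locality information. As a minimal sketch of that wrapping pattern (the class ColocatedFileSplitSketch and its fields are hypothetical, not the actual SystemML class), a custom split extends FileSplit and serializes its extra fields after the parent's:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.mapred.FileSplit;

// Hypothetical sketch: extend FileSplit and carry the partition file name
// and block length alongside the wrapped split's path/offset/length.
public class ColocatedFileSplitSketch extends FileSplit {
    private String _fname = null; // partitioned matrix file name (assumed field)
    private int _blen = -1; // partition block length in rows or columns (assumed field)

    public ColocatedFileSplitSketch() {
        // no-arg constructor required for Writable deserialization
        super(null, -1, -1, new String[0]);
    }

    public ColocatedFileSplitSketch(FileSplit fs, String fname, int blen) throws IOException {
        super(fs.getPath(), fs.getStart(), fs.getLength(), fs.getLocations());
        _fname = fname;
        _blen = blen;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // serialize the parent's path/start/length first, then the extras
        super.write(out);
        out.writeUTF(_fname);
        out.writeInt(_blen);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        super.readFields(in);
        _fname = in.readUTF();
        _blen = in.readInt();
    }
}

A real implementation would presumably also override getLocations() to report the hosts of the colocated partition file; that logic is omitted here.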

Example 77 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class ResultMergeLocalFile, method mergeTextCellWithoutComp:

private static void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO) {
    try {
        // delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            // we're done
            return;
        }
        // actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameNew);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
        String valueStr = null;
        try {
            // read/write all inputs
            for (MatrixObject in : inMO) {
                if (LOG.isTraceEnabled())
                    LOG.trace("ResultMerge (local, file): Merge input " + in.hashCode() + " (fname=" + in.getFileName() + ") via stream merge");
                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);
                LongWritable key = new LongWritable();
                Text value = new Text();
                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        IOUtilFunctions.closeSilently(reader);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Text(org.apache.hadoop.io.Text) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) BufferedWriter(java.io.BufferedWriter) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) OutputStreamWriter(java.io.OutputStreamWriter) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
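
The merge loop above is the standard mapred read idiom: configure a TextInputFormat, ask it for splits, and drain a RecordReader per split. A condensed, self-contained sketch of that read side (the helper class and its Consumer callback are illustrative, not SystemML API):

import java.io.IOException;
import java.util.function.Consumer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical helper distilling the read side of the merge above:
// visit every line of a text file (or directory of part files) on HDFS.
public class TextStreamReader {
    public static void forEachLine(JobConf base, String file, Consumer<String> action)
            throws IOException {
        JobConf job = new JobConf(base);
        FileInputFormat.addInputPath(job, new Path(file));
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        // one logical split is requested, but the format may return several
        for (InputSplit split : informat.getSplits(job, 1)) {
            RecordReader<LongWritable, Text> reader =
                informat.getRecordReader(split, job, Reporter.NULL);
            try {
                LongWritable key = reader.createKey();
                Text value = reader.createValue();
                while (reader.next(key, value))
                    action.accept(value.toString());
            } finally {
                reader.close(); // always release the underlying HDFS stream
            }
        }
    }
}

The merge in mergeTextCellWithoutComp is essentially this loop with the trimmed line written to the shared BufferedWriter as the action.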

Example 78 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class FrameReaderTextCSV, method readCSVFrameFromHDFS:

protected void readCSVFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    splits = IOUtilFunctions.sortInputSplits(splits);
    for (int i = 0, rpos = 0; i < splits.length; i++)
        rpos = readCSVFrameFromInputSplit(splits[i], informat, job, dest, schema, names, rlen, clen, rpos, i == 0);
}
Also used: TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) InputSplit(org.apache.hadoop.mapred.InputSplit)
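
The reader above sorts its splits before consuming them because getSplits gives no ordering guarantee, while the running row position rpos must advance in file order. A sketch of what such sorting plausibly does (an assumption about IOUtilFunctions.sortInputSplits, ordering by file path and then byte offset; the actual implementation may differ):

import java.util.Arrays;
import java.util.Comparator;

import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;

// Sketch: order splits by file path, then by byte offset within the file,
// so rows are consumed in a stable file order. Assumes every split is a
// FileSplit, which holds for splits produced by TextInputFormat.
public class SplitOrder {
    public static InputSplit[] sortInputSplits(InputSplit[] splits) {
        InputSplit[] ret = splits.clone();
        Arrays.sort(ret, Comparator
            .comparing((InputSplit s) -> ((FileSplit) s).getPath().toString())
            .thenComparingLong(s -> ((FileSplit) s).getStart()));
        return ret;
    }
}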

Example 79 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class FrameReaderTextCSV, method computeCSVSize:

protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    for (int i = 0; i < splits.length; i++) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[i], job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        try {
            // ignore header of first split
            if (i == 0 && _props.hasHeader())
                reader.next(key, value);
            // count remaining number of rows, ignore meta data
            while (reader.next(key, value)) {
                String val = value.toString();
                nrow += (val.startsWith(TfUtils.TXMTD_MVPREFIX) || val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
    return new Pair<>(nrow, ncol);
}
Also used: TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit) Pair(org.apache.sysml.runtime.matrix.data.Pair)

Example 80 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class FrameReaderTextCSVParallel, method computeCSVSize:

@Override
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, numThreads);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    ExecutorService pool = CommonThreadPool.get(numThreads);
    try {
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (int i = 0; i < splits.length; i++)
            tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i == 0));
        List<Future<Long>> cret = pool.invokeAll(tasks);
        for (Future<Long> count : cret)
            nrow += count.get().intValue();
    } catch (Exception e) {
        throw new IOException("Failed parallel read of text csv input.", e);
    } finally {
        pool.shutdown();
    }
    return new Pair<>(nrow, ncol);
}
Also used: ArrayList(java.util.ArrayList) IOException(java.io.IOException) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) InputSplit(org.apache.hadoop.mapred.InputSplit) Pair(org.apache.sysml.runtime.matrix.data.Pair)
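
CountRowsTask itself is not part of this excerpt. A plausible sketch (a hypothetical class mirroring the row-counting loop of the sequential computeCSVSize in Example 79; the metadata-row filtering is omitted for brevity):

import java.util.concurrent.Callable;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical sketch of CountRowsTask: one Callable per split, counting
// records and skipping the header line on the first split only.
class CountRowsTaskSketch implements Callable<Long> {
    private final InputSplit _split;
    private final TextInputFormat _informat;
    private final JobConf _job;
    private final boolean _hasHeader;
    private final boolean _firstSplit;

    CountRowsTaskSketch(InputSplit split, TextInputFormat informat, JobConf job,
            boolean hasHeader, boolean firstSplit) {
        _split = split;
        _informat = informat;
        _job = job;
        _hasHeader = hasHeader;
        _firstSplit = firstSplit;
    }

    @Override
    public Long call() throws Exception {
        RecordReader<LongWritable, Text> reader =
            _informat.getRecordReader(_split, _job, Reporter.NULL);
        long nrow = 0;
        try {
            LongWritable key = new LongWritable();
            Text value = new Text();
            if (_firstSplit && _hasHeader)
                reader.next(key, value); // discard the header line
            while (reader.next(key, value))
                nrow++;
        } finally {
            reader.close();
        }
        return nrow;
    }
}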

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 161
Path (org.apache.hadoop.fs.Path): 57
JobConf (org.apache.hadoop.mapred.JobConf): 56
Test (org.junit.Test): 49
IOException (java.io.IOException): 47
ArrayList (java.util.ArrayList): 29
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27
FileSplit (org.apache.hadoop.mapred.FileSplit): 24
FileSystem (org.apache.hadoop.fs.FileSystem): 21
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 21
InputFormat (org.apache.hadoop.mapred.InputFormat): 19
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19
NullWritable (org.apache.hadoop.io.NullWritable): 18
Text (org.apache.hadoop.io.Text): 18
Configuration (org.apache.hadoop.conf.Configuration): 14
LongWritable (org.apache.hadoop.io.LongWritable): 11
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10
Properties (java.util.Properties): 9
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 9
HashMap (java.util.HashMap): 8