
Example 36 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.

the class ReaderTextCSVParallel method computeCSVSizeAndCreateOutputMatrixBlock.

private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path, JobConf job, boolean hasHeader, String delim, long estnnz) throws IOException, DMLRuntimeException {
    int nrow = 0;
    int ncol = 0;
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    // determine the number of columns by counting delimiters in the first line
    LongWritable key = new LongWritable();
    Text oneLine = new Text();
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL);
    try {
        if (reader.next(key, oneLine)) {
            String cellStr = oneLine.toString().trim();
            ncol = StringUtils.countMatches(cellStr, delim) + 1;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    // count rows in parallel per split
    try {
        ExecutorService pool = CommonThreadPool.get(_numThreads);
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            tasks.add(new CountRowsTask(split, informat, job, hasHeader));
            hasHeader = false; // only the first split can contain the header
        }
        pool.invokeAll(tasks);
        pool.shutdown();
        // collect row counts for offset computation
        // fail early if any task was unsuccessful
        _offsets = new SplitOffsetInfos(tasks.size());
        for (CountRowsTask rt : tasks) {
            if (!rt.getReturnCode())
                throw new IOException("Count task for csv input failed: " + rt.getErrMsg());
            _offsets.setOffsetPerSplit(tasks.indexOf(rt), nrow);
            _offsets.setLenghtPerSplit(tasks.indexOf(rt), rt.getRowCount());
            nrow = nrow + rt.getRowCount();
        }
    } catch (Exception e) {
        throw new IOException("Threadpool Error " + e.getMessage(), e);
    }
    // allocate target matrix block based on given size;
    // need to allocate sparse as well since lock-free insert into target
    long estnnz2 = (estnnz < 0) ? (long) nrow * ncol : estnnz;
    return createOutputMatrixBlock(nrow, ncol, nrow, ncol, estnnz2, true, true);
}
Also used : ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ExecutorService(java.util.concurrent.ExecutorService) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit)
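
CountRowsTask is referenced above but not shown. Below is a minimal sketch of what such a per-split row-counting task might look like, matching the getReturnCode/getErrMsg/getRowCount accessors used by the caller; the class shape, field names, and error handling are assumptions, not the actual SystemML source.

import java.util.concurrent.Callable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class CountRowsTask implements Callable<Object> {
    private final InputSplit _split;
    private final TextInputFormat _informat;
    private final JobConf _job;
    private final boolean _hasHeader;
    private boolean _rc = true;
    private String _errMsg = null;
    private int _nrows = 0;

    public CountRowsTask(InputSplit split, TextInputFormat informat, JobConf job, boolean hasHeader) {
        _split = split;
        _informat = informat;
        _job = job;
        _hasHeader = hasHeader;
    }

    public boolean getReturnCode() { return _rc; }
    public String getErrMsg() { return _errMsg; }
    public int getRowCount() { return _nrows; }

    @Override
    public Object call() {
        try {
            RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
            LongWritable key = new LongWritable();
            Text value = new Text();
            try {
                // only the task for the first split sees hasHeader==true and skips one line
                if (_hasHeader)
                    reader.next(key, value);
                while (reader.next(key, value)) _nrows++;
            } finally {
                reader.close();
            }
        } catch (Exception e) {
            _rc = false;
            _errMsg = e.getMessage();
        }
        return null;
    }
}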

Example 37 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.

the class ReaderTextCell method readTextCellMatrixFromHDFS.

private static void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;
    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                if (sparse) { // SPARSE<-value
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }
                    dest.sortSparseRows();
                } else { // DENSE<-value
                    DenseBlock a = dest.getDenseBlock();
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        a.set(row, col, lvalue);
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen)
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        else
            throw new IOException("Unable to read matrix in text cell format.", ex);
    }
}
Also used : Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DenseBlock(org.apache.sysml.runtime.matrix.data.DenseBlock) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit)
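
Stripped of the matrix-specific logic, the split/reader boilerplate shared by these readers reduces to the standalone sketch below; the input path comes from the command line rather than an HDFS constant, and requesting a single split mirrors the getSplits(job, 1) call above.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextScan {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        // input path is a placeholder supplied by the caller
        FileInputFormat.addInputPath(job, new Path(args[0]));
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value))
                    System.out.println(value);
            } finally {
                reader.close();
            }
        }
    }
}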

Example 38 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.

the class ReaderTextCellParallel method readTextCellMatrixFromHDFS.

private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket) throws IOException {
    int par = _numThreads;
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    // check for min file size for matrix market (adjust num splits if necessary)
    if (_isMMFile) {
        long len = MapReduceTool.getFilesizeOnHDFS(path);
        par = (len < MIN_FILESIZE_MM) ? 1 : par;
    }
    try {
        // create read tasks for all splits
        ExecutorService pool = CommonThreadPool.get(par);
        InputSplit[] splits = informat.getSplits(job, par);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            ReadTask t = new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket);
            tasks.add(t);
        }
        // wait until all tasks have been executed
        List<Future<Long>> rt = pool.invokeAll(tasks);
        // check for exceptions and aggregate nnz
        long lnnz = 0;
        for (Future<Long> task : rt) lnnz += task.get();
        // post-processing
        dest.setNonZeros(lnnz);
        if (dest.isInSparseFormat())
            sortSparseRowsParallel(dest, rlen, _numThreads, pool);
        pool.shutdown();
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
Also used : ArrayList(java.util.ArrayList) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) InputSplit(org.apache.hadoop.mapred.InputSplit)
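
ReadTask is likewise referenced but not shown. The sketch below is a deliberately simplified stand-in that only parses text-cell triples and returns the per-split non-zero count consumed by the Future<Long> loop above; the real SystemML task also writes the values into the shared MatrixBlock and handles the matrix-market header, so the shape and parsing here are assumptions.

import java.util.concurrent.Callable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class ReadTask implements Callable<Long> {
    private final InputSplit _split;
    private final TextInputFormat _informat;
    private final JobConf _job;

    public ReadTask(InputSplit split, TextInputFormat informat, JobConf job) {
        _split = split;
        _informat = informat;
        _job = job;
    }

    @Override
    public Long call() throws Exception {
        long nnz = 0;
        RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        try {
            while (reader.next(key, value)) {
                // text-cell line: "<row> <col> <value>" with 1-based indexes
                String[] parts = value.toString().trim().split("\\s+");
                if (Double.parseDouble(parts[2]) != 0)
                    nnz++;
            }
        } finally {
            reader.close();
        }
        return nnz;
    }
}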

Example 39 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project tez by apache.

the class TestGroupedSplits method testFormat.

@Test(timeout = 10000)
public void testFormat() throws Exception {
    JobConf job = new JobConf(defaultConf);
    Random random = new Random();
    long seed = random.nextLong();
    LOG.info("seed = " + seed);
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    createFiles(length, numFiles, random);
    // create a combined split for the files
    TextInputFormat wrappedFormat = new TextInputFormat();
    wrappedFormat.configure(job);
    TezGroupedSplitsInputFormat<LongWritable, Text> format = new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setDesiredNumberOfSplits(1);
    format.setInputFormat(wrappedFormat);
    LongWritable key = new LongWritable();
    Text value = new Text();
    for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(length / 20) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        InputSplit[] splits = format.getSplits(job, numSplits);
        LOG.info("splitting: got =        " + splits.length);
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.length);
        InputSplit split = splits[0];
        assertEquals("It should be TezGroupedSplit", TezGroupedSplit.class, split.getClass());
        // check the split
        BitSet bits = new BitSet(length);
        LOG.debug("split= " + split);
        RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, voidReporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {
                int v = Integer.parseInt(value.toString());
                LOG.debug("read " + v);
                if (bits.get(v)) {
                    LOG.warn("conflict with " + v + " at position " + reader.getPos());
                }
                assertFalse("Key in multiple partitions.", bits.get(v));
                bits.set(v);
                count++;
            }
            LOG.info("splits=" + split + " count=" + count);
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
Also used : BitSet(java.util.BitSet) Text(org.apache.hadoop.io.Text) Random(java.util.Random) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Test(org.junit.Test)
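
createFiles is test scaffolding that is not shown. A plausible sketch under the constraint the assertions impose, namely that every integer in [0, length) appears exactly once across the generated files; the per-file distribution and line format are assumptions, and localFs and workDir are the test-class fields already used above.

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Random;
import org.apache.hadoop.fs.Path;

private static void createFiles(int length, int numFiles, Random random) throws IOException {
    Writer[] writers = new Writer[numFiles];
    for (int i = 0; i < numFiles; i++)
        writers[i] = new OutputStreamWriter(localFs.create(new Path(workDir, "part-" + i)));
    try {
        // write each integer in [0, length) exactly once, one per line,
        // into a randomly chosen file
        for (int v = 0; v < length; v++)
            writers[random.nextInt(numFiles)].write(v + "\n");
    } finally {
        for (Writer w : writers)
            w.close();
    }
}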

Example 40 with TextInputFormat

use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.

the class RemoveEmptyRows method execute.

@Override
public void execute() {
    Matrix mat = (Matrix) this.getFunctionInput(0);
    String fnameOld = mat.getFilePath();
    // maps old row ID to new (compacted) row ID
    HashMap<Long, Long> keyMap = new HashMap<>();
    try {
        // prepare input
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameOld);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        if (!fs.exists(path))
            throw new IOException("File " + fnameOld + " does not exist on HDFS.");
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        // prepare output
        String fnameNew = createOutputFilePathAndName(OUTPUT_FILE);
        DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream(fnameNew, true);
        // read and write if necessary
        InputSplit[] splits = informat.getSplits(job, 1);
        LongWritable key = new LongWritable();
        Text value = new Text();
        long ID = 1;
        try {
            // for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();
            for (InputSplit split : splits) {
                RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
                try {
                    while (reader.next(key, value)) {
                        String cellStr = value.toString().trim();
                        StringTokenizer st = new StringTokenizer(cellStr, " ");
                        long row = Long.parseLong(st.nextToken());
                        long col = Long.parseLong(st.nextToken());
                        double lvalue = Double.parseDouble(st.nextToken());
                        if (!keyMap.containsKey(row))
                            keyMap.put(row, ID++);
                        long rowNew = keyMap.get(row);
                        sb.append(rowNew);
                        sb.append(' ');
                        sb.append(col);
                        sb.append(' ');
                        sb.append(lvalue);
                        sb.append('\n');
                        ostream.writeBytes(sb.toString());
                        sb.setLength(0);
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            }
            _ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
        } finally {
            if (ostream != null)
                ostream.close();
        }
    } catch (Exception ex) {
        throw new RuntimeException("Unable to execute external function.", ex);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) DataOutputStream(java.io.DataOutputStream) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) StringTokenizer(java.util.StringTokenizer) Matrix(org.apache.sysml.udf.Matrix) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
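
For reference, the UDF above consumes and emits the 1-based "row col value" text-cell format. On a toy input where only rows 2 and 5 contain cells (the values are illustrative), keyMap remaps those rows to 1 and 2 in encounter order:

input (rows 1, 3, 4 are empty):
2 1 7.0
2 3 1.5
5 2 -3.0

output:
1 1 7.0
1 3 1.5
2 2 -3.0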

Aggregations

TextInputFormat (org.apache.hadoop.mapred.TextInputFormat) 49
InputSplit (org.apache.hadoop.mapred.InputSplit) 39
IOException (java.io.IOException) 26
Path (org.apache.hadoop.fs.Path) 25
JobConf (org.apache.hadoop.mapred.JobConf) 24
LongWritable (org.apache.hadoop.io.LongWritable) 19
Text (org.apache.hadoop.io.Text) 19
ArrayList (java.util.ArrayList) 16
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) 14
ExecutorService (java.util.concurrent.ExecutorService) 12
Future (java.util.concurrent.Future) 8
FileSystem (org.apache.hadoop.fs.FileSystem) 8
FastStringTokenizer (org.apache.sysml.runtime.util.FastStringTokenizer) 6
Configuration (org.apache.hadoop.conf.Configuration) 4
Pair (org.apache.sysml.runtime.matrix.data.Pair) 4
LinkedList (java.util.LinkedList) 3
Properties (java.util.Properties) 3
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment) 3
HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat) 3
Tuple2 (org.apache.flink.api.java.tuple.Tuple2) 3