
Example 16 with TextInputFormat

Use of org.apache.hadoop.mapred.TextInputFormat in the Apache SystemML project.

From the class FrameReaderTextCSV, method computeCSVSize:

protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    for (int i = 0; i < splits.length; i++) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[i], job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        try {
            // ignore header of first split
            if (i == 0 && _props.hasHeader())
                reader.next(key, value);
            // count remaining number of rows, ignore meta data
            while (reader.next(key, value)) {
                String val = value.toString();
                nrow += (val.startsWith(TfUtils.TXMTD_MVPREFIX) || val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
    return new Pair<>(nrow, ncol);
}
Also used : TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit) Pair(org.apache.sysml.runtime.matrix.data.Pair)
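For readers unfamiliar with the old mapred API, the pattern above generalizes beyond SystemML: configure the format, ask for splits, then drive a record reader per split. Below is a minimal, self-contained sketch of the same split-and-count approach; the input path is a placeholder, and the header and metadata handling from computeCSVSize is omitted.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class RowCounter {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        // placeholder path; point this at a real file or directory
        FileInputFormat.setInputPaths(job, new Path("/tmp/input.csv"));
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        long nrow = 0;
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader =
                informat.getRecordReader(split, job, Reporter.NULL);
            LongWritable key = new LongWritable();
            Text value = new Text();
            try {
                // each next() call advances one line of text
                while (reader.next(key, value)) nrow++;
            } finally {
                reader.close();
            }
        }
        System.out.println("rows: " + nrow);
    }
}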

Example 17 with TextInputFormat

Use of org.apache.hadoop.mapred.TextInputFormat in the Apache SystemML project.

From the class FrameReaderTextCSVParallel, method computeCSVSize:

@Override
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, numThreads);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    ExecutorService pool = CommonThreadPool.get(numThreads);
    try {
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (int i = 0; i < splits.length; i++) tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i == 0));
        List<Future<Long>> cret = pool.invokeAll(tasks);
        for (Future<Long> count : cret) nrow += count.get().intValue();
    } catch (Exception e) {
        throw new IOException("Failed parallel read of text csv input.", e);
    } finally {
        pool.shutdown();
    }
    return new Pair<>(nrow, ncol);
}
Also used : ArrayList(java.util.ArrayList) IOException(java.io.IOException) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) InputSplit(org.apache.hadoop.mapred.InputSplit) Pair(org.apache.sysml.runtime.matrix.data.Pair)
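The CountRowsTask class itself is not included in this snippet. A plausible sketch follows, assuming it is a java.util.concurrent.Callable<Long> that wraps the same per-split counting loop as the sequential reader; the class body below is a hypothetical reconstruction, not SystemML's actual implementation, and it omits the metadata-row filtering shown in Example 16.

// Hypothetical reconstruction of a per-split row-counting task;
// the real CountRowsTask in SystemML may differ in details.
// Assumes java.util.concurrent.Callable plus the Hadoop mapred
// imports listed under "Also used" above.
private static class CountRowsTask implements Callable<Long> {
    private final InputSplit _split;
    private final TextInputFormat _informat;
    private final JobConf _job;
    private final boolean _hasHeader;
    private final boolean _firstSplit;

    CountRowsTask(InputSplit split, TextInputFormat informat, JobConf job,
                  boolean hasHeader, boolean firstSplit) {
        _split = split; _informat = informat; _job = job;
        _hasHeader = hasHeader; _firstSplit = firstSplit;
    }

    @Override
    public Long call() throws Exception {
        RecordReader<LongWritable, Text> reader =
            _informat.getRecordReader(_split, _job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        long nrow = 0;
        try {
            // skip the header line, which only the first split contains
            if (_firstSplit && _hasHeader)
                reader.next(key, value);
            while (reader.next(key, value)) nrow++;
        } finally {
            reader.close();
        }
        return nrow;
    }
}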

Example 18 with TextInputFormat

Use of org.apache.hadoop.mapred.TextInputFormat in the Apache SystemML project.

From the class FrameReaderTextCell, method readTextCellFrameFromHDFS:

protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    if (fs.isDirectory(path)) {
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        for (InputSplit split : splits) readTextCellFrameFromInputSplit(split, informat, job, dest);
    } else {
        readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
    }
}
Also used : TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) InputSplit(org.apache.hadoop.mapred.InputSplit)
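The per-split work is delegated to readTextCellFrameFromInputSplit, which is defined elsewhere in the class. As a rough illustration of what parsing SystemML's text-cell format (whitespace-separated "row col value" triples with 1-based indices) involves, here is a simplified sketch; it assumes FrameBlock exposes a set(int, int, Object) accessor and is not the class's actual implementation.

// Illustrative sketch only: parse "row col value" triples from one
// split into the destination frame. SystemML's real implementation
// uses FastStringTokenizer and schema-aware value conversion.
// Assumes java.util.StringTokenizer plus the Hadoop mapred imports above.
private static void readTextCellSplit(InputSplit split, TextInputFormat informat,
        JobConf job, FrameBlock dest) throws IOException {
    RecordReader<LongWritable, Text> reader =
        informat.getRecordReader(split, job, Reporter.NULL);
    LongWritable key = new LongWritable();
    Text value = new Text();
    try {
        while (reader.next(key, value)) {
            StringTokenizer st = new StringTokenizer(value.toString());
            int row = Integer.parseInt(st.nextToken()) - 1; // 1-based -> 0-based
            int col = Integer.parseInt(st.nextToken()) - 1;
            dest.set(row, col, st.nextToken()); // assumed accessor
        }
    } finally {
        reader.close();
    }
}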

Example 19 with TextInputFormat

Use of org.apache.hadoop.mapred.TextInputFormat in the Apache SystemML project.

From the class FrameReaderTextCellParallel, method readTextCellFrameFromHDFS:

@Override
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    try {
        // create read tasks for all splits
        ExecutorService pool = CommonThreadPool.get(numThreads);
        InputSplit[] splits = informat.getSplits(job, numThreads);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) tasks.add(new ReadTask(split, informat, job, dest));
        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // check for exceptions
        for (Future<Object> task : rt) task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel read of text cell input.", e);
    }
}
Also used : ArrayList(java.util.ArrayList) IOException(java.io.IOException) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) InputSplit(org.apache.hadoop.mapred.InputSplit)
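As with CountRowsTask in Example 17, the ReadTask class is not shown in the snippet. A plausible sketch, assuming a Callable<Object> that simply delegates to the sequential per-split reader; this is a hypothetical reconstruction, not SystemML's actual code.

// Hypothetical reconstruction; the actual ReadTask in SystemML may differ.
// Assumes java.util.concurrent.Callable plus the imports listed above.
private class ReadTask implements Callable<Object> {
    private final InputSplit _split;
    private final TextInputFormat _informat;
    private final JobConf _job;
    private final FrameBlock _dest;

    ReadTask(InputSplit split, TextInputFormat informat, JobConf job, FrameBlock dest) {
        _split = split; _informat = informat; _job = job; _dest = dest;
    }

    @Override
    public Object call() throws Exception {
        // delegate to the per-split reader; any exception surfaces
        // through Future.get() in the calling method above
        readTextCellFrameFromInputSplit(_split, _informat, _job, _dest);
        return null;
    }
}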

Example 20 with TextInputFormat

Use of org.apache.hadoop.mapred.TextInputFormat in the Apache Tez project.

From the class TestGroupedSplits, method testGzip:

/**
 * Test using the gzip codec for reading
 */
@Test(timeout = 10000)
public void testGzip() throws IOException {
    JobConf job = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, job);
    localFs.delete(workDir, true);
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "is\ngzip\n");
    writeFile(localFs, new Path(workDir, "part3.txt.gz"), gzip, "one\nmore\nsplit\n");
    FileInputFormat.setInputPaths(job, workDir);
    TextInputFormat wrappedFormat = new TextInputFormat();
    wrappedFormat.configure(job);
    TezGroupedSplitsInputFormat<LongWritable, Text> format = new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setInputFormat(wrappedFormat);
    // TextInputFormat will produce 3 splits
    for (int j = 1; j <= 3; ++j) {
        format.setDesiredNumberOfSplits(j);
        InputSplit[] splits = format.getSplits(job, 100);
        if (j == 1) {
            // j==1 covers single split corner case
            // and does not do grouping
            assertEquals("compressed splits == " + j, j, splits.length);
        }
        List<Text> results = new ArrayList<Text>();
        for (int i = 0; i < splits.length; ++i) {
            List<Text> read = readSplit(format, splits[i], job);
            results.addAll(read);
        }
        assertEquals("splits length", 11, results.size());
        final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
        final String[] secondList = { "is", "gzip" };
        final String[] thirdList = { "one", "more", "split" };
        String first = results.get(0).toString();
        int start = 0;
        switch(first.charAt(0)) {
            case 't':
                start = testResults(results, firstList, start);
                break;
            case 'i':
                start = testResults(results, secondList, start);
                break;
            case 'o':
                start = testResults(results, thirdList, start);
                break;
            default:
                Assert.fail("unexpected first token - " + first);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Test(org.junit.Test)
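The test relies on two helpers, writeFile and readSplit, defined elsewhere in TestGroupedSplits. Hedged sketches of what they plausibly look like follow; the actual Tez test utilities may differ in details.

// Hypothetical sketches of the helpers used by testGzip above.
// Assumes java.io.OutputStream and java.util.List in addition to the
// imports listed under "Also used".
private static void writeFile(FileSystem fs, Path name,
        CompressionCodec codec, String contents) throws IOException {
    OutputStream stm = fs.create(name);
    if (codec != null)
        stm = codec.createOutputStream(stm); // compress on the fly
    stm.write(contents.getBytes("UTF-8"));
    stm.close();
}

private static List<Text> readSplit(InputFormat<LongWritable, Text> format,
        InputSplit split, JobConf job) throws IOException {
    List<Text> result = new ArrayList<Text>();
    RecordReader<LongWritable, Text> reader =
        format.getRecordReader(split, job, Reporter.NULL);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
        result.add(value);
        value = reader.createValue(); // Text is mutable; take a fresh instance
    }
    reader.close();
    return result;
}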

Aggregations

TextInputFormat (org.apache.hadoop.mapred.TextInputFormat) 49
InputSplit (org.apache.hadoop.mapred.InputSplit) 39
IOException (java.io.IOException) 26
Path (org.apache.hadoop.fs.Path) 25
JobConf (org.apache.hadoop.mapred.JobConf) 24
LongWritable (org.apache.hadoop.io.LongWritable) 19
Text (org.apache.hadoop.io.Text) 19
ArrayList (java.util.ArrayList) 16
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) 14
ExecutorService (java.util.concurrent.ExecutorService) 12
Future (java.util.concurrent.Future) 8
FileSystem (org.apache.hadoop.fs.FileSystem) 8
FastStringTokenizer (org.apache.sysml.runtime.util.FastStringTokenizer) 6
Configuration (org.apache.hadoop.conf.Configuration) 4
Pair (org.apache.sysml.runtime.matrix.data.Pair) 4
LinkedList (java.util.LinkedList) 3
Properties (java.util.Properties) 3
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment) 3
HadoopOutputFormat (org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat) 3
Tuple2 (org.apache.flink.api.java.tuple.Tuple2) 3