Use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache: class FrameReaderTextCSV, method computeCSVSize.
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    for (int i = 0; i < splits.length; i++) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[i], job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        try {
            // ignore header of first split
            if (i == 0 && _props.hasHeader())
                reader.next(key, value);
            // count remaining number of rows, ignore meta data
            while (reader.next(key, value)) {
                String val = value.toString();
                nrow += (val.startsWith(TfUtils.TXMTD_MVPREFIX) || val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
            }
        }
        finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
    return new Pair<>(nrow, ncol);
}
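The method above follows the standard old-API (mapred) pattern: configure the format with the JobConf, ask for splits, and drain each split through a RecordReader. A minimal, self-contained sketch of that pattern, assuming a plain text input path; the class and method names (CountLinesExample, countLines) are illustrative and not part of SystemML:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class CountLinesExample {
    // count the lines of a text input using the mapred TextInputFormat API
    public static long countLines(String file) throws IOException {
        JobConf job = new JobConf();
        FileInputFormat.setInputPaths(job, new Path(file));
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        long lines = 0;
        // a single requested split suffices for a sequential count
        for (InputSplit split : informat.getSplits(job, 1)) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            LongWritable key = new LongWritable();
            Text value = new Text();
            try {
                while (reader.next(key, value))
                    lines++;
            } finally {
                reader.close();
            }
        }
        return lines;
    }
}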
Use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache: class FrameReaderTextCSVParallel, method computeCSVSize.
@Override
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, numThreads);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    ExecutorService pool = CommonThreadPool.get(numThreads);
    try {
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (int i = 0; i < splits.length; i++)
            tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i == 0));
        List<Future<Long>> cret = pool.invokeAll(tasks);
        for (Future<Long> count : cret)
            nrow += count.get().intValue();
    }
    catch (Exception e) {
        throw new IOException("Failed parallel read of text csv input.", e);
    }
    finally {
        pool.shutdown();
    }
    return new Pair<>(nrow, ncol);
}
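The CountRowsTask class referenced above is defined elsewhere in FrameReaderTextCSVParallel and is not shown here. Conceptually it is a Callable<Long> that counts the rows of one split, skipping the header line on the first split (the real task also skips the transform metadata lines, as in the sequential variant above). An illustrative sketch under those assumptions, not the actual SystemML class:

import java.io.IOException;
import java.util.concurrent.Callable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

// illustrative only: a per-split row counter in the spirit of CountRowsTask
class CountRowsTaskSketch implements Callable<Long> {
    private final InputSplit _split;
    private final TextInputFormat _informat;
    private final JobConf _job;
    private final boolean _hasHeader;
    private final boolean _firstSplit;

    CountRowsTaskSketch(InputSplit split, TextInputFormat informat, JobConf job, boolean hasHeader, boolean firstSplit) {
        _split = split;
        _informat = informat;
        _job = job;
        _hasHeader = hasHeader;
        _firstSplit = firstSplit;
    }

    @Override
    public Long call() throws IOException {
        RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        long nrow = 0;
        try {
            // the header line only appears in the first split
            if (_firstSplit && _hasHeader)
                reader.next(key, value);
            while (reader.next(key, value))
                nrow++;
        } finally {
            reader.close();
        }
        return nrow;
    }
}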
Use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache: class FrameReaderTextCell, method readTextCellFrameFromHDFS.
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    if (fs.isDirectory(path)) {
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        for (InputSplit split : splits)
            readTextCellFrameFromInputSplit(split, informat, job, dest);
    }
    else {
        readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
    }
}
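The per-split work is delegated to readTextCellFrameFromInputSplit, which is not shown here. For orientation, SystemML's text-cell (IJV) format stores one cell per line as a 1-based row index, a 1-based column index, and the cell value. A minimal sketch of parsing one such line, with a hypothetical String[][] destination standing in for the FrameBlock and the value treated as a single token for simplicity:

import java.util.StringTokenizer;

// illustrative only: parse one "row col value" line of a text-cell (IJV) input
class TextCellLineParser {
    static void parseLine(String line, String[][] dest) {
        StringTokenizer st = new StringTokenizer(line, " ");
        int row = Integer.parseInt(st.nextToken()) - 1;  // 1-based on disk, 0-based in memory
        int col = Integer.parseInt(st.nextToken()) - 1;
        String val = st.nextToken();
        // a real frame reader would convert val according to the column's ValueType (schema[col])
        dest[row][col] = val;
    }
}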
Use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache: class FrameReaderTextCellParallel, method readTextCellFrameFromHDFS.
@Override
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    try {
        // create read tasks for all splits
        ExecutorService pool = CommonThreadPool.get(numThreads);
        InputSplit[] splits = informat.getSplits(job, numThreads);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits)
            tasks.add(new ReadTask(split, informat, job, dest));
        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();
        // check for exceptions
        for (Future<Object> task : rt)
            task.get();
    }
    catch (Exception e) {
        throw new IOException("Failed parallel read of text cell input.", e);
    }
}
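ReadTask is likewise defined elsewhere; presumably each task forwards its split to the same per-split reader used by the sequential path and returns nothing, so the Future.get() calls above exist only to surface worker exceptions. A sketch under that assumption (not the actual SystemML class; it assumes the task can see readTextCellFrameFromInputSplit, e.g. as an inner class of the reader):

// illustrative only: a per-split read task in the spirit of ReadTask
class ReadTaskSketch implements Callable<Object> {
    private final InputSplit _split;
    private final TextInputFormat _informat;
    private final JobConf _job;
    private final FrameBlock _dest;

    ReadTaskSketch(InputSplit split, TextInputFormat informat, JobConf job, FrameBlock dest) {
        _split = split;
        _informat = informat;
        _job = job;
        _dest = dest;
    }

    @Override
    public Object call() throws Exception {
        // delegate to the same per-split reader used by the sequential code path
        readTextCellFrameFromInputSplit(_split, _informat, _job, _dest);
        return null;
    }
}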
Use of org.apache.hadoop.mapred.TextInputFormat in project tez by apache: class TestGroupedSplits, method testGzip.
/**
 * Test using the gzip codec for reading
 */
@Test(timeout = 10000)
public void testGzip() throws IOException {
    JobConf job = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, job);
    localFs.delete(workDir, true);
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "is\ngzip\n");
    writeFile(localFs, new Path(workDir, "part3.txt.gz"), gzip, "one\nmore\nsplit\n");
    FileInputFormat.setInputPaths(job, workDir);
    TextInputFormat wrappedFormat = new TextInputFormat();
    wrappedFormat.configure(job);
    TezGroupedSplitsInputFormat<LongWritable, Text> format = new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setInputFormat(wrappedFormat);
    // TextInputFormat will produce 3 splits
    for (int j = 1; j <= 3; ++j) {
        format.setDesiredNumberOfSplits(j);
        InputSplit[] splits = format.getSplits(job, 100);
        if (j == 1) {
            // j==1 covers single split corner case
            // and does not do grouping
            assertEquals("compressed splits == " + j, j, splits.length);
        }
        List<Text> results = new ArrayList<Text>();
        for (int i = 0; i < splits.length; ++i) {
            List<Text> read = readSplit(format, splits[i], job);
            results.addAll(read);
        }
        assertEquals("splits length", 11, results.size());
        final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
        final String[] secondList = { "is", "gzip" };
        final String[] thirdList = { "one", "more", "split" };
        String first = results.get(0).toString();
        int start = 0;
        switch (first.charAt(0)) {
            case 't':
                start = testResults(results, firstList, start);
                break;
            case 'i':
                start = testResults(results, secondList, start);
                break;
            case 'o':
                start = testResults(results, thirdList, start);
                break;
            default:
                Assert.fail("unexpected first token - " + first);
        }
    }
}
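The writeFile and readSplit helpers used above are defined elsewhere in TestGroupedSplits. As a rough sketch of the write side, compressed text can be produced through the codec's output stream; the helper below mirrors the calls in the test but its body is an assumption, not the test's actual code (it would live inside the test class):

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;

// illustrative only: write compressed text content through a CompressionCodec
static void writeFile(FileSystem fs, Path name, CompressionCodec codec, String contents) throws IOException {
    Writer writer = new OutputStreamWriter(codec.createOutputStream(fs.create(name)));
    try {
        writer.write(contents);
    } finally {
        writer.close();
    }
}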