Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.
Class ReaderTextCSVParallel, method computeCSVSizeAndCreateOutputMatrixBlock.
private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path,
        JobConf job, boolean hasHeader, String delim, long estnnz) throws IOException, DMLRuntimeException {
    int nrow = 0;
    int ncol = 0;
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    // count the number of fields (columns) in the first row
    LongWritable key = new LongWritable();
    Text oneLine = new Text();
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL);
    try {
        if (reader.next(key, oneLine)) {
            String cellStr = oneLine.toString().trim();
            ncol = StringUtils.countMatches(cellStr, delim) + 1;
        }
    }
    finally {
        IOUtilFunctions.closeSilently(reader);
    }
    // count rows in parallel per split
    try {
        ExecutorService pool = CommonThreadPool.get(_numThreads);
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            tasks.add(new CountRowsTask(split, informat, job, hasHeader));
            // only the first split of the file can contain the header line
            hasHeader = false;
        }
        pool.invokeAll(tasks);
        pool.shutdown();
        // collect row counts for offset computation,
        // failing early if any task was unsuccessful
        _offsets = new SplitOffsetInfos(tasks.size());
        for (CountRowsTask rt : tasks) {
            if (!rt.getReturnCode())
                throw new IOException("Count task for csv input failed: " + rt.getErrMsg());
            _offsets.setOffsetPerSplit(tasks.indexOf(rt), nrow);
            _offsets.setLenghtPerSplit(tasks.indexOf(rt), rt.getRowCount());
            nrow = nrow + rt.getRowCount();
        }
    }
    catch (Exception e) {
        throw new IOException("Threadpool Error " + e.getMessage(), e);
    }
    // allocate target matrix block based on given size;
    // need to allocate sparse as well because of lock-free inserts into the target
    long estnnz2 = (estnnz < 0) ? (long) nrow * ncol : estnnz;
    return createOutputMatrixBlock(nrow, ncol, nrow, ncol, estnnz2, true, true);
}
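The method above leans on SystemML internals (CountRowsTask, SplitOffsetInfos, CommonThreadPool, IOUtilFunctions). As a rough, self-contained sketch of the same split-parallel counting pattern using only the public Hadoop mapred API and java.util.concurrent, something like the following should work; the class name CsvSizeProbe and the method countRows are made up for illustration, and unlike the original this sketch counts every line, including any header row.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class CsvSizeProbe {

    // Count lines of a text file by scanning each input split on its own thread.
    static long countRows(String file, int numThreads) throws Exception {
        JobConf job = new JobConf();
        FileInputFormat.addInputPath(job, new Path(file));
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, numThreads);

        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        try {
            List<Callable<Long>> tasks = new ArrayList<>();
            for (InputSplit split : splits) {
                tasks.add(() -> {
                    long n = 0;
                    RecordReader<LongWritable, Text> reader =
                        informat.getRecordReader(split, job, Reporter.NULL);
                    try {
                        LongWritable key = new LongWritable();
                        Text line = new Text();
                        while (reader.next(key, line))
                            n++;
                    } finally {
                        reader.close();
                    }
                    return n;
                });
            }
            long total = 0;
            for (Future<Long> f : pool.invokeAll(tasks))
                total += f.get(); // rethrows any per-task failure
            return total;
        } finally {
            pool.shutdown();
        }
    }
}

Using invokeAll plus Future.get both aggregates the per-split counts and surfaces any per-task exception, which mirrors the early-failure check in the original.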
Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.
Class ReaderTextCell, method readTextCellMatrixFromHDFS.
private static void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest,
        long rlen, long clen, int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;
    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                if (sparse) { // SPARSE<-value
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }
                    dest.sortSparseRows();
                }
                else { // DENSE<-value
                    DenseBlock a = dest.getDenseBlock();
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        a.set(row, col, lvalue);
                    }
                }
            }
            finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
    }
    catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen)
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        else
            throw new IOException("Unable to read matrix in text cell format.", ex);
    }
}
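The text cell format read above is simply one "row col value" triple per line with 1-based indices; FastStringTokenizer, MatrixBlock, and DenseBlock are SystemML internals. Below is a minimal sketch of the same parsing into a plain dense double[][], assuming a local file and using java.util.StringTokenizer in place of the SystemML tokenizer; the class name TextCellReader is hypothetical.

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.StringTokenizer;

public class TextCellReader {

    // Read "row col value" triples (1-based, space-separated) into a dense matrix.
    static double[][] read(String file, int rlen, int clen) throws Exception {
        double[][] dest = new double[rlen][clen];
        try (BufferedReader in = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = in.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(line.trim(), " ");
                int row = Integer.parseInt(st.nextToken()) - 1; // 1-based -> 0-based
                int col = Integer.parseInt(st.nextToken()) - 1;
                double val = Double.parseDouble(st.nextToken());
                dest[row][col] = val;
            }
        }
        return dest;
    }
}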
Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.
Class ReaderTextCellParallel, method readTextCellMatrixFromHDFS.
private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen, boolean matrixMarket) throws IOException {
    int par = _numThreads;
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    // check for min file size for matrix market (adjust num splits if necessary)
    if (_isMMFile) {
        long len = MapReduceTool.getFilesizeOnHDFS(path);
        par = (len < MIN_FILESIZE_MM) ? 1 : par;
    }
    try {
        // create read tasks for all splits
        ExecutorService pool = CommonThreadPool.get(par);
        InputSplit[] splits = informat.getSplits(job, par);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            ReadTask t = new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket);
            tasks.add(t);
        }
        // wait until all tasks have been executed
        List<Future<Long>> rt = pool.invokeAll(tasks);
        // check for exceptions and aggregate nnz
        long lnnz = 0;
        for (Future<Long> task : rt)
            lnnz += task.get();
        // post-processing
        dest.setNonZeros(lnnz);
        if (dest.isInSparseFormat())
            sortSparseRowsParallel(dest, rlen, _numThreads, pool);
        pool.shutdown();
    }
    catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
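MapReduceTool.getFilesizeOnHDFS is a SystemML helper; the same file-size probe that drives the single-split fallback can be written against the public Hadoop FileSystem API. A hedged sketch follows, with the class name SplitSizing and the 8 MB threshold chosen purely for illustration (the real MIN_FILESIZE_MM constant is defined inside the reader class).

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class SplitSizing {

    // Hypothetical threshold; small files are not worth splitting.
    static final long MIN_FILESIZE_MM = 8L * 1024 * 1024;

    // Fall back to a single split for tiny files, mirroring the pattern above.
    static int choosePar(String file, int requested) throws Exception {
        JobConf job = new JobConf();
        Path path = new Path(file);
        FileSystem fs = path.getFileSystem(job);
        long len = fs.getFileStatus(path).getLen();
        return (len < MIN_FILESIZE_MM) ? 1 : requested;
    }
}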
Use of org.apache.hadoop.mapred.TextInputFormat in project tez by apache.
Class TestGroupedSplits, method testFormat.
@Test(timeout = 10000)
public void testFormat() throws Exception {
    JobConf job = new JobConf(defaultConf);
    Random random = new Random();
    long seed = random.nextLong();
    LOG.info("seed = " + seed);
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    createFiles(length, numFiles, random);
    // create a combined split for the files
    TextInputFormat wrappedFormat = new TextInputFormat();
    wrappedFormat.configure(job);
    TezGroupedSplitsInputFormat<LongWritable, Text> format =
        new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setDesiredNumberOfSplits(1);
    format.setInputFormat(wrappedFormat);
    LongWritable key = new LongWritable();
    Text value = new Text();
    for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(length / 20) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        InputSplit[] splits = format.getSplits(job, numSplits);
        LOG.info("splitting: got = " + splits.length);
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.length);
        InputSplit split = splits[0];
        assertEquals("It should be TezGroupedSplit", TezGroupedSplit.class, split.getClass());
        // check the split
        BitSet bits = new BitSet(length);
        LOG.debug("split= " + split);
        RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, voidReporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {
                int v = Integer.parseInt(value.toString());
                LOG.debug("read " + v);
                if (bits.get(v)) {
                    LOG.warn("conflict with " + v + " at position " + reader.getPos());
                }
                assertFalse("Key in multiple partitions.", bits.get(v));
                bits.set(v);
                count++;
            }
            LOG.info("splits=" + split + " count=" + count);
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
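The heart of this test is the BitSet bookkeeping: every generated value must appear in exactly one record across the grouped split. That check is easy to isolate; the following standalone sketch (class name CoverageCheck invented for illustration) shows the same duplicate-and-gap detection over an arbitrary array of observed values.

import java.util.BitSet;

public class CoverageCheck {

    // Verify that observed contains each value 0..length-1 exactly once.
    static void check(int[] observed, int length) {
        BitSet bits = new BitSet(length);
        for (int v : observed) {
            if (bits.get(v))
                throw new AssertionError("value seen twice: " + v);
            bits.set(v);
        }
        if (bits.cardinality() != length)
            throw new AssertionError("missing values: " + (length - bits.cardinality()));
    }
}

A set bit flags a duplicate on the spot, and comparing the final cardinality to length catches any value that never appeared, exactly the two assertions the test makes.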
Use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.
Class RemoveEmptyRows, method execute.
@Override
public void execute() {
    Matrix mat = (Matrix) this.getFunctionInput(0);
    String fnameOld = mat.getFilePath();
    // old -> new rowID
    HashMap<Long, Long> keyMap = new HashMap<>();
    try {
        // prepare input
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameOld);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        if (!fs.exists(path))
            throw new IOException("File " + fnameOld + " does not exist on HDFS.");
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        // prepare output
        String fnameNew = createOutputFilePathAndName(OUTPUT_FILE);
        DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream(fnameNew, true);
        // read and write if necessary
        InputSplit[] splits = informat.getSplits(job, 1);
        LongWritable key = new LongWritable();
        Text value = new Text();
        long ID = 1;
        try {
            // for object reuse, preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();
            for (InputSplit split : splits) {
                RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
                try {
                    while (reader.next(key, value)) {
                        String cellStr = value.toString().trim();
                        StringTokenizer st = new StringTokenizer(cellStr, " ");
                        long row = Integer.parseInt(st.nextToken());
                        long col = Integer.parseInt(st.nextToken());
                        double lvalue = Double.parseDouble(st.nextToken());
                        // assign the next consecutive ID on first sight of a row
                        if (!keyMap.containsKey(row))
                            keyMap.put(row, ID++);
                        long rowNew = keyMap.get(row);
                        sb.append(rowNew);
                        sb.append(' ');
                        sb.append(col);
                        sb.append(' ');
                        sb.append(lvalue);
                        sb.append('\n');
                        ostream.writeBytes(sb.toString());
                        sb.setLength(0);
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            }
            _ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
        } finally {
            if (ostream != null)
                ostream.close();
        }
    } catch (Exception ex) {
        throw new RuntimeException("Unable to execute external function.", ex);
    }
}
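The row-compaction logic here (mapping each distinct original row ID to the next consecutive ID in order of first appearance) is independent of HDFS. A small self-contained sketch of just that core follows, with the class name RowCompactor made up for illustration.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class RowCompactor {

    // Rewrite "row col value" lines so row IDs become consecutive,
    // numbered in order of first appearance.
    static List<String> compact(List<String> cells) {
        Map<Long, Long> keyMap = new HashMap<>(); // old -> new row ID
        long nextId = 1;
        List<String> out = new ArrayList<>();
        StringBuilder sb = new StringBuilder();
        for (String cell : cells) {
            String[] parts = cell.trim().split(" ");
            long row = Long.parseLong(parts[0]);
            Long rowNew = keyMap.get(row);
            if (rowNew == null)
                keyMap.put(row, rowNew = nextId++);
            sb.setLength(0);
            sb.append(rowNew).append(' ').append(parts[1]).append(' ').append(parts[2]);
            out.add(sb.toString());
        }
        return out;
    }
}

For example, compacting the lines "5 1 3.0" and "9 2 1.0" yields "1 1 3.0" and "2 2 1.0", which is why the original can report keyMap.size() as the new row count.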