Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class FrameReaderTextCell, method readTextCellFrameFromHDFS.
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    if (fs.isDirectory(path)) {
        // directory of part files: obtain input splits and read each split individually
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        for (InputSplit split : splits)
            readTextCellFrameFromInputSplit(split, informat, job, dest);
    }
    else {
        // single file: fall back to the raw text cell reader
        readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
    }
}
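The helper readTextCellFrameFromInputSplit is not shown above. The following is a minimal, illustrative sketch of the kind of per-split loop such a helper presumably runs: it parses 1-based "row col value" triples and stores the raw value tokens into a plain String[][], so no SystemML-specific FrameBlock API has to be assumed; the real reader would convert each token to the column's ValueType from the schema. The class and method names here are hypothetical.

// Illustrative sketch only, not the SystemML implementation.
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextCellSplitReaderSketch {
    static void readSplitIntoArray(InputSplit split, TextInputFormat informat, JobConf job, String[][] dest) throws IOException {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        try {
            while (reader.next(key, value)) {
                // each text cell line is "<row> <col> <value>" with 1-based indices
                String[] parts = value.toString().trim().split("\\s+", 3);
                int row = Integer.parseInt(parts[0]) - 1;
                int col = Integer.parseInt(parts[1]) - 1;
                dest[row][col] = parts[2];
            }
        } finally {
            reader.close();
        }
    }
}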
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class ReaderTextCSVParallel, method readMatrixFromHDFS.
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz) throws IOException, DMLRuntimeException {
    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, _numThreads);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // check existence and non-empty file
    checkValidInputFile(fs, path);
    // allocate output matrix block
    // first read pass (count rows/cols, determine offsets, allocate matrix block)
    MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits, path, job, _props.hasHeader(), _props.getDelim(), estnnz);
    rlen = ret.getNumRows();
    clen = ret.getNumColumns();
    // second read pass (read, parse strings, append to matrix block)
    readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen, _props.hasHeader(), _props.getDelim(), _props.isFill(), _props.getFillValue());
    // post-processing (representation-specific, change of sparse/dense block representation)
    // - no sorting required for CSV because it is read in sorted order per row
    // - nnz explicitly maintained in parallel for the individual splits
    ret.examSparsity();
    // sanity check for parallel row count (since determined internally)
    if (rlen >= 0 && rlen != ret.getNumRows())
        throw new DMLRuntimeException("Read matrix inconsistent with given meta data: expected nrow=" + rlen + ", real nrow=" + ret.getNumRows());
    return ret;
}
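The second read pass, readCSVMatrixFromHDFS, is not shown above. The sketch below illustrates, under stated assumptions, roughly the kind of per-split CSV parsing such a second pass performs: rowOffset stands in for the per-split row offsets computed in the first pass, and the target is a plain double[][] rather than a MatrixBlock so that no SystemML-internal API is assumed. Class and method names are hypothetical.

// Illustrative sketch only, not the SystemML implementation.
import java.io.IOException;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class CsvSplitParseSketch {
    static void parseSplit(InputSplit split, TextInputFormat informat, JobConf job,
            double[][] dest, int rowOffset, String delim, boolean fill, double fillValue) throws IOException {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        int row = rowOffset;
        try {
            while (reader.next(key, value)) {
                // split each line on the literal delimiter, keeping trailing empty cells
                String[] parts = value.toString().split(Pattern.quote(delim), -1);
                for (int col = 0; col < dest[row].length; col++) {
                    String cell = (col < parts.length) ? parts[col].trim() : "";
                    // missing or empty cells use the fill value if fill is enabled
                    dest[row][col] = cell.isEmpty() ? (fill ? fillValue : 0) : Double.parseDouble(cell);
                }
                row++;
            }
        } finally {
            reader.close();
        }
    }
}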
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class ReaderTextCSVParallel, method computeCSVSizeAndCreateOutputMatrixBlock.
private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path, JobConf job, boolean hasHeader, String delim, long estnnz) throws IOException, DMLRuntimeException {
    int nrow = 0;
    int ncol = 0;
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    // count no of entities in the first non-header row
    LongWritable key = new LongWritable();
    Text oneLine = new Text();
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL);
    try {
        if (reader.next(key, oneLine)) {
            String cellStr = oneLine.toString().trim();
            ncol = StringUtils.countMatches(cellStr, delim) + 1;
        }
    }
    finally {
        IOUtilFunctions.closeSilently(reader);
    }
    // count rows in parallel per split
    try {
        ExecutorService pool = CommonThreadPool.get(_numThreads);
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            tasks.add(new CountRowsTask(split, informat, job, hasHeader));
            hasHeader = false;
        }
        pool.invokeAll(tasks);
        pool.shutdown();
        // collect row counts for offset computation
        // early error notify in case not all tasks successful
        _offsets = new SplitOffsetInfos(tasks.size());
        for (CountRowsTask rt : tasks) {
            if (!rt.getReturnCode())
                throw new IOException("Count task for csv input failed: " + rt.getErrMsg());
            _offsets.setOffsetPerSplit(tasks.indexOf(rt), nrow);
            _offsets.setLenghtPerSplit(tasks.indexOf(rt), rt.getRowCount());
            nrow = nrow + rt.getRowCount();
        }
    }
    catch (Exception e) {
        throw new IOException("Threadpool Error " + e.getMessage(), e);
    }
    // allocate target matrix block based on given size;
    // need to allocate sparse as well since lock-free insert into target
    long estnnz2 = (estnnz < 0) ? (long) nrow * ncol : estnnz;
    return createOutputMatrixBlock(nrow, ncol, nrow, ncol, estnnz2, true, true);
}
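The CountRowsTask class referenced above is not shown. The following is a hypothetical approximation of what such a per-split row-counting task could look like: a Callable<Long> that counts the data lines of one split, optionally skipping a header line in the first split; the real class additionally exposes the return code, error message, and row count queried in the loop above.

// Hypothetical approximation, not the SystemML CountRowsTask.
import java.util.concurrent.Callable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

class CountRowsTaskSketch implements Callable<Long> {
    private final InputSplit _split;
    private final TextInputFormat _informat;
    private final JobConf _job;
    private final boolean _hasHeader;

    CountRowsTaskSketch(InputSplit split, TextInputFormat informat, JobConf job, boolean hasHeader) {
        _split = split;
        _informat = informat;
        _job = job;
        _hasHeader = hasHeader;
    }

    @Override
    public Long call() throws Exception {
        RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        long nrow = 0;
        try {
            if (_hasHeader)
                reader.next(key, value); // the header line of the first split is not a data row
            while (reader.next(key, value))
                nrow++;
        } finally {
            reader.close();
        }
        return nrow;
    }
}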
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class ReaderTextCell, method readTextCellMatrixFromHDFS.
private static void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;
    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                if (sparse) { // SPARSE<-value
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }
                    dest.sortSparseRows();
                }
                else { // DENSE<-value
                    DenseBlock a = dest.getDenseBlock();
                    while (reader.next(key, value)) {
                        // reinit tokenizer
                        st.reset(value.toString());
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        if (row == -1 || col == -1)
                            continue;
                        double lvalue = st.nextDouble();
                        a.set(row, col, lvalue);
                    }
                }
            }
            finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
    }
    catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen)
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        else
            throw new IOException("Unable to read matrix in text cell format.", ex);
    }
}
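For reference, the small self-contained example below shows what the dense branch above effectively computes for a toy input: each text cell line "<row> <col> <value>" uses 1-based indices and sets exactly one entry of the target, with the same 1-based to 0-based index shift as the reader. The class name and the plain double[][] target are illustrative only.

// Worked toy example of the text cell (IJV) format, not SystemML code.
import java.util.Arrays;

public class TextCellDenseExample {
    public static void main(String[] args) {
        String[] lines = { "1 1 3.0", "2 3 -1.5", "3 2 7.0" };
        double[][] dest = new double[3][3];
        for (String line : lines) {
            String[] p = line.trim().split("\\s+");
            // shift 1-based file indices to 0-based array indices
            dest[Integer.parseInt(p[0]) - 1][Integer.parseInt(p[1]) - 1] = Double.parseDouble(p[2]);
        }
        // prints [[3.0, 0.0, 0.0], [0.0, 0.0, -1.5], [0.0, 7.0, 0.0]]
        System.out.println(Arrays.deepToString(dest));
    }
}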
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
The class ReaderTextCellParallel, method readTextCellMatrixFromHDFS.
private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket) throws IOException {
    int par = _numThreads;
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    // check for min file size for matrix market (adjust num splits if necessary)
    if (_isMMFile) {
        long len = MapReduceTool.getFilesizeOnHDFS(path);
        par = (len < MIN_FILESIZE_MM) ? 1 : par;
    }
    try {
        // create read tasks for all splits
        ExecutorService pool = CommonThreadPool.get(par);
        InputSplit[] splits = informat.getSplits(job, par);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            ReadTask t = new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket);
            tasks.add(t);
        }
        // wait until all tasks have been executed
        List<Future<Long>> rt = pool.invokeAll(tasks);
        // check for exceptions and aggregate nnz
        long lnnz = 0;
        for (Future<Long> task : rt)
            lnnz += task.get();
        // post-processing
        dest.setNonZeros(lnnz);
        if (dest.isInSparseFormat())
            sortSparseRowsParallel(dest, rlen, _numThreads, pool);
        pool.shutdown();
    }
    catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
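The per-split ReadTask used above is not shown. The sketch below is a hypothetical approximation of its contract: a Callable<Long> that reads one split of text cell data and returns the number of non-zero values it saw, which is why the futures can simply be summed into lnnz. The real task additionally writes the parsed cells into the shared, pre-allocated MatrixBlock; that part is only indicated in a comment here.

// Hypothetical approximation, not the SystemML ReadTask.
import java.util.concurrent.Callable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

class ReadTaskSketch implements Callable<Long> {
    private final InputSplit _split;
    private final TextInputFormat _informat;
    private final JobConf _job;

    ReadTaskSketch(InputSplit split, TextInputFormat informat, JobConf job) {
        _split = split;
        _informat = informat;
        _job = job;
    }

    @Override
    public Long call() throws Exception {
        RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        long lnnz = 0;
        try {
            while (reader.next(key, value)) {
                String[] p = value.toString().trim().split("\\s+");
                double v = Double.parseDouble(p[2]);
                if (v != 0)
                    lnnz++; // the real task would also store v at (row-1, col-1) in the shared dest
            }
        } finally {
            reader.close();
        }
        return lnnz;
    }
}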