Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache:
class FrameReaderTextCSV, method readCSVFrameFromHDFS.
protected void readCSVFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // sequentially read the sorted splits, carrying the running row offset (rpos) forward;
    // only the first split (i == 0) may contain the header
    for (int i = 0, rpos = 0; i < splits.length; i++)
        rpos = readCSVFrameFromInputSplit(splits[i], informat, job, dest, schema, names, rlen, clen, rpos, i == 0);
}
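The per-split helper readCSVFrameFromInputSplit is not shown on this page. As a rough, self-contained illustration of the split-read pattern it builds on, the following sketch tokenizes each line of one split; the List<String[]> destination and delim parameter are illustrative stand-ins, not SystemML's actual FrameBlock append logic:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

// Reads all lines of one split and tokenizes them by the given delimiter.
static List<String[]> readSplitLines(InputSplit split, TextInputFormat informat, JobConf job, String delim) throws IOException {
    List<String[]> rows = new ArrayList<>();
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
    LongWritable key = new LongWritable();
    Text value = new Text();
    try {
        while (reader.next(key, value))
            // note: String.split treats delim as a regex; -1 keeps trailing empty fields
            rows.add(value.toString().split(delim, -1));
    } finally {
        reader.close();
    }
    return rows;
}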
Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache:
class FrameReaderTextCSV, method computeCSVSize.
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    for (int i = 0; i < splits.length; i++) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[i], job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        try {
            // ignore header of first split
            if (i == 0 && _props.hasHeader())
                reader.next(key, value);
            // count remaining number of rows, ignore meta data
            while (reader.next(key, value)) {
                String val = value.toString();
                nrow += (val.startsWith(TfUtils.TXMTD_MVPREFIX) || val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
    return new Pair<>(nrow, ncol);
}
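Note that computeCSVSize never registers an input path itself: getSplits derives its splits from the input paths already configured on the JobConf, so the caller must have set them beforehand. A hypothetical call site illustrating that setup (the example path is an assumption):

JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
Path path = new Path("hdfs:///tmp/frame.csv"); // assumed example path
FileInputFormat.addInputPath(job, path);       // getSplits reads input paths from the job conf
FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
Pair<Integer, Integer> dims = computeCSVSize(path, job, fs); // dims holds (nrow, ncol)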
Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache:
class FrameReaderTextCSVParallel, method computeCSVSize.
@Override
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, numThreads);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    ExecutorService pool = CommonThreadPool.get(numThreads);
    try {
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (int i = 0; i < splits.length; i++)
            tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i == 0));
        List<Future<Long>> cret = pool.invokeAll(tasks);
        for (Future<Long> count : cret)
            nrow += count.get().intValue();
    } catch (Exception e) {
        throw new IOException("Failed parallel read of text csv input.", e);
    } finally {
        pool.shutdown();
    }
    return new Pair<>(nrow, ncol);
}
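CountRowsTask itself is not listed on this page. A plausible reconstruction, mirroring the sequential counting loop from the non-parallel computeCSVSize above (the class shape and field names are assumptions):

import java.io.IOException;
import java.util.concurrent.Callable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

// Counts the data rows of one split; header and transform meta-data lines are excluded.
private static class CountRowsTask implements Callable<Long> {
    private final InputSplit _split;
    private final TextInputFormat _informat;
    private final JobConf _job;
    private final boolean _hasHeader;
    private final boolean _firstSplit;

    CountRowsTask(InputSplit split, TextInputFormat informat, JobConf job, boolean hasHeader, boolean firstSplit) {
        _split = split; _informat = informat; _job = job;
        _hasHeader = hasHeader; _firstSplit = firstSplit;
    }

    @Override
    public Long call() throws IOException {
        RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        long nrow = 0;
        try {
            // the header, if any, lives in the first split only
            if (_firstSplit && _hasHeader)
                reader.next(key, value);
            while (reader.next(key, value)) {
                String val = value.toString();
                if (!val.startsWith(TfUtils.TXMTD_MVPREFIX) && !val.startsWith(TfUtils.TXMTD_NDPREFIX))
                    nrow++;
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
        return nrow;
    }
}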
Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache:
class FrameReaderTextCell, method readTextCellFrameFromHDFS.
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    if (fs.isDirectory(path)) {
        // directory of part files: read via TextInputFormat splits
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        for (InputSplit split : splits)
            readTextCellFrameFromInputSplit(split, informat, job, dest);
    } else {
        // single file: fall back to raw text-cell reading
        readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
    }
}
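SystemML's text-cell (IJV) lines are "row col value" triples with 1-based indices. A minimal sketch of consuming one split of such input; the actual readTextCellFrameFromInputSplit additionally converts each value according to the frame schema, and the helper name here is an assumption:

// Parses the "<row> <col> <value>" triples of one text-cell split.
static void readTextCellSplit(InputSplit split, TextInputFormat informat, JobConf job) throws IOException {
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
    LongWritable key = new LongWritable();
    Text value = new Text();
    try {
        while (reader.next(key, value)) {
            String[] parts = value.toString().trim().split("\\s+");
            int r = Integer.parseInt(parts[0]) - 1; // convert 1-based to 0-based
            int c = Integer.parseInt(parts[1]) - 1;
            String v = parts[2];
            // ... set (r, c, v) into the destination FrameBlock here ...
        }
    } finally {
        reader.close();
    }
}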
Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache:
class ReaderTextCSVParallel, method readMatrixFromHDFS.
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz) throws IOException, DMLRuntimeException {
    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, _numThreads);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // check existence and non-empty file
    checkValidInputFile(fs, path);
    // first read pass: count rows/cols, determine offsets, allocate output matrix block
    MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits, path, job, _props.hasHeader(), _props.getDelim(), estnnz);
    rlen = ret.getNumRows();
    clen = ret.getNumColumns();
    // second read pass: read, parse strings, append to matrix block
    readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen, _props.hasHeader(), _props.getDelim(), _props.isFill(), _props.getFillValue());
    // post-processing (representation-specific change of sparse/dense block representation)
    // - no sorting required for CSV because it is read in sorted order per row
    // - nnz explicitly maintained in parallel for the individual splits
    ret.examSparsity();
    // sanity check for parallel row count (since determined internally)
    if (rlen >= 0 && rlen != ret.getNumRows())
        throw new DMLRuntimeException("Read matrix inconsistent with given meta data: " + "expected nrow=" + rlen + ", real nrow=" + ret.getNumRows());
    return ret;
}
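A hypothetical end-to-end call site for this reader; the CSVFileFormatProperties constructor arguments (header, delimiter, fill) are assumptions inferred from the _props getters used above, so treat this as a sketch rather than the library's documented API:

// header=true, delimiter=",", fill=false -- matching the property getters used above
CSVFileFormatProperties props = new CSVFileFormatProperties(true, ",", false);
ReaderTextCSVParallel reader = new ReaderTextCSVParallel(props);
// passing -1 for rlen/clen/estnnz lets the first read pass determine dimensions and nnz
MatrixBlock mb = reader.readMatrixFromHDFS("hdfs:///tmp/matrix.csv", -1, -1, 1000, 1000, -1);

The two-pass design trades one extra scan of the input for an exactly sized allocation, so the parse pass appends into a matrix block that never needs to be reallocated or resized.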