Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.
From class ResultMergeLocalFile, method createTextCellStagingFile:
private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
    throws IOException, DMLRuntimeException
{
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    // long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    // NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
    // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                // reset tokenizer and parse one "row col value" triple
                st.reset(value.toString());
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());
                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                // periodic flush
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }
            // final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        }
        finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
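
Stripped of the staging-specific buffering, the loop above is the canonical read pattern for the old mapred API: add the input path, configure the format, obtain the splits, and drain each RecordReader. A minimal self-contained sketch of just that pattern follows; the class name TextScan and the printing body are illustrative, not SystemML code.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextScan {
    // Reads every line of a text file via the old mapred API and prints it.
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        FileInputFormat.addInputPath(job, new Path(args[0]));
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        for (InputSplit split : informat.getSplits(job, 1)) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            try {
                while (reader.next(key, value))
                    System.out.println(value.toString());
            } finally {
                reader.close();
            }
        }
    }
}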
Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.
From class DataPartitionerLocal, method partitionTextCell:
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen, int brlen, int bclen) {
    long row = -1;
    long col = -1;
    try {
        // STEP 1: read matrix from HDFS and write blocks to local staging area
        // check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    // reset tokenizer and parse one "row col value" triple
                    st.reset(value.toString());
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);
                    buffer.addLast(tmp);
                    // periodic flush
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }
                // final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            }
            finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }

        // STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging, fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads)
                t.join();
        }
        else {
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    }
    catch (Exception e) {
        // post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen)
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
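
In STEP 2, each worker thread receives a contiguous, near-equal range of staging directories via the ceil-division start/end arithmetic. A hypothetical helper isolating that computation (splitRanges is not a SystemML method, only an illustration of the arithmetic above):

// Partitions n items into at most par contiguous, near-equal index ranges;
// like the original loop, a trailing range may come out empty (start > end).
static int[][] splitRanges(int n, int par) {
    int len = Math.min(n, par);
    int chunk = (int) Math.ceil((double) n / len);
    int[][] ranges = new int[len][2];
    for (int i = 0; i < len; i++) {
        ranges[i][0] = i * chunk;                            // inclusive start
        ranges[i][1] = Math.min((i + 1) * chunk - 1, n - 1); // inclusive end
    }
    return ranges;
}

For example, splitRanges(10, 4) yields [0,2], [3,5], [6,8], [9,9], the same start/end values the loop above computes for 10 partition directories and 4 threads.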
Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.
From class FrameReaderTextCSVParallel, method readCSVFrameFromHDFS:
@Override
protected void readCSVFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen)
    throws IOException
{
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, numThreads);
    splits = IOUtilFunctions.sortInputSplits(splits);
    try {
        ExecutorService pool = CommonThreadPool.get(Math.min(numThreads, splits.length));

        // compute num rows per split
        ArrayList<CountRowsTask> tasks = new ArrayList<>();
        for (int i = 0; i < splits.length; i++)
            tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i == 0));
        List<Future<Long>> cret = pool.invokeAll(tasks);

        // compute row offset per split via cumsum on row counts
        long offset = 0;
        List<Long> offsets = new ArrayList<>();
        for (Future<Long> count : cret) {
            offsets.add(offset);
            offset += count.get();
        }

        // read individual splits
        ArrayList<ReadRowsTask> tasks2 = new ArrayList<>();
        for (int i = 0; i < splits.length; i++)
            tasks2.add(new ReadRowsTask(splits[i], informat, job, dest, offsets.get(i).intValue(), i == 0));
        List<Future<Object>> rret = pool.invokeAll(tasks2);
        pool.shutdown();

        // error handling
        for (Future<Object> read : rret)
            read.get();
    }
    catch (Exception e) {
        throw new IOException("Failed parallel read of text csv input.", e);
    }
}
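
The two-pass scheme is the core idea here: a first wave of CountRowsTask instances counts rows per split, a cumulative sum turns those counts into disjoint destination row offsets, and a second wave of ReadRowsTask instances writes each split into its own row range without synchronization. A toy illustration of the counting and cumsum steps, with fake row counts standing in for real splits:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class CumsumOffsets {
    public static void main(String[] args) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(4);
        long[] fakeSplitRows = { 100, 250, 75 }; // stand-in for per-split row counts
        List<Callable<Long>> countTasks = new ArrayList<>();
        for (long n : fakeSplitRows)
            countTasks.add(() -> n);
        List<Future<Long>> counts = pool.invokeAll(countTasks);
        pool.shutdown();
        // cumsum: split i writes rows [offset, offset + count_i)
        long offset = 0;
        for (Future<Long> c : counts) {
            System.out.println("split starts at row " + offset);
            offset += c.get();
        }
    }
}

This prints offsets 0, 100, and 350: each split knows its destination rows before the second pass begins.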
Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.
From class FrameReaderTextCellParallel, method readTextCellFrameFromHDFS:
@Override
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen)
    throws IOException
{
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    try {
        // create read tasks for all splits
        ExecutorService pool = CommonThreadPool.get(numThreads);
        InputSplit[] splits = informat.getSplits(job, numThreads);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits)
            tasks.add(new ReadTask(split, informat, job, dest));

        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        // check for exceptions
        for (Future<Object> task : rt)
            task.get();
    }
    catch (Exception e) {
        throw new IOException("Failed parallel read of text cell input.", e);
    }
}
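
Note the final get() loop: invokeAll blocks until every task completes, but an exception thrown inside a task resurfaces only when Future.get() is called, so the results are drained purely for error checking. A standalone demonstration of that behavior (the failing task is contrived):

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class FutureErrorCheck {
    public static void main(String[] args) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(2);
        Callable<Object> failing = () -> { throw new IllegalStateException("bad split"); };
        List<Future<Object>> results = pool.invokeAll(Arrays.asList(failing));
        pool.shutdown();
        for (Future<Object> f : results) {
            try {
                f.get(); // rethrows the task's exception, wrapped in ExecutionException
            } catch (ExecutionException e) {
                System.out.println("task failed: " + e.getCause());
            }
        }
    }
}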
Use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.
From class ReaderTextCSVParallel, method readCSVMatrixFromHDFS:
private void readCSVMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
    throws IOException
{
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    ExecutorService pool = CommonThreadPool.get(_numThreads);
    try {
        // create read tasks for all splits
        ArrayList<CSVReadTask> tasks = new ArrayList<>();
        int splitCount = 0;
        for (InputSplit split : splits) {
            tasks.add(new CSVReadTask(split, _offsets, informat, job, dest, rlen, clen, hasHeader, delim, fill, fillValue, splitCount++));
        }
        pool.invokeAll(tasks);
        pool.shutdown();

        // check return codes and aggregate nnz
        long lnnz = 0;
        for (CSVReadTask rt : tasks) {
            lnnz += rt.getPartialNnz();
            if (!rt.getReturnCode()) {
                Exception err = rt.getException();
                throw new IOException("Read task for csv input failed: " + err.toString(), err);
            }
        }
        dest.setNonZeros(lnnz);
    }
    catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
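
Unlike the Future-based error check above, CSVReadTask records its outcome in the task object itself (return code, exception, partial nnz), so the caller can validate and aggregate in one pass over the task list. A hypothetical miniature of that pattern (PartialNnzTask is illustrative, not the real CSVReadTask):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class PartialNnzTask implements Callable<Object> {
    private final double[] chunk;
    private boolean ok = false;
    private long partialNnz = 0;

    PartialNnzTask(double[] chunk) { this.chunk = chunk; }

    public boolean getReturnCode() { return ok; }
    public long getPartialNnz() { return partialNnz; }

    @Override
    public Object call() {
        // each task counts non-zeros in its own chunk; no shared state
        for (double v : chunk)
            if (v != 0) partialNnz++;
        ok = true;
        return null;
    }

    public static void main(String[] args) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(2);
        List<PartialNnzTask> tasks = new ArrayList<>();
        tasks.add(new PartialNnzTask(new double[] { 0, 1, 2 }));
        tasks.add(new PartialNnzTask(new double[] { 3, 0, 0 }));
        pool.invokeAll(tasks);
        pool.shutdown();
        long nnz = 0;
        for (PartialNnzTask t : tasks) {
            if (!t.getReturnCode())
                throw new IllegalStateException("task failed");
            nnz += t.getPartialNnz();
        }
        System.out.println("total nnz = " + nnz); // prints: total nnz = 3
    }
}

Keeping the per-task counters private and summing them once after invokeAll returns avoids contention on a shared counter during the parallel read, which is presumably why the original accumulates lnnz only after all tasks have finished.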