Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache:
the class FrameReaderTextCSV, method readFrameFromInputStream.
@Override
public FrameBlock readFrameFromInputStream(InputStream is, ValueType[] schema, String[] names, long rlen, long clen) throws IOException, DMLRuntimeException {
	// allocate output frame block
	ValueType[] lschema = createOutputSchema(schema, clen);
	String[] lnames = createOutputNames(names, clen);
	FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen);
	// core read (sequential/parallel)
	InputStreamInputFormat informat = new InputStreamInputFormat(is);
	InputSplit split = informat.getSplits(null, 1)[0];
	readCSVFrameFromInputSplit(split, informat, null, ret, schema, names, rlen, clen, 0, true);
	return ret;
}
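For context, the single split obtained above is consumed through the standard mapred RecordReader protocol. Below is a minimal sketch of that read pattern, assuming a plain TextInputFormat source; the class name SplitReadSketch and the countLines helper are illustrative and not part of SystemML.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class SplitReadSketch {
	// Iterate all records of one split; for text input the keys are byte
	// offsets and the values are the lines of the file.
	public static long countLines(InputSplit split, TextInputFormat informat, JobConf job) throws IOException {
		RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
		LongWritable key = reader.createKey();
		Text value = reader.createValue();
		long lines = 0;
		try {
			while (reader.next(key, value))
				lines++;
		}
		finally {
			reader.close();
		}
		return lines;
	}
}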
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache:
the class FrameReaderTextCSVParallel, method readCSVFrameFromHDFS.
@Override
protected void readCSVFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
	int numThreads = OptimizerUtils.getParallelTextReadParallelism();
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, numThreads);
	splits = IOUtilFunctions.sortInputSplits(splits);
	try {
		ExecutorService pool = CommonThreadPool.get(Math.min(numThreads, splits.length));
		// compute num rows per split
		ArrayList<CountRowsTask> tasks = new ArrayList<>();
		for (int i = 0; i < splits.length; i++)
			tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i == 0));
		List<Future<Long>> cret = pool.invokeAll(tasks);
		// compute row offset per split via cumsum on row counts
		long offset = 0;
		List<Long> offsets = new ArrayList<>();
		for (Future<Long> count : cret) {
			offsets.add(offset);
			offset += count.get();
		}
		// read individual splits
		ArrayList<ReadRowsTask> tasks2 = new ArrayList<>();
		for (int i = 0; i < splits.length; i++)
			tasks2.add(new ReadRowsTask(splits[i], informat, job, dest, offsets.get(i).intValue(), i == 0));
		List<Future<Object>> rret = pool.invokeAll(tasks2);
		pool.shutdown();
		// error handling
		for (Future<Object> read : rret)
			read.get();
	}
	catch (Exception e) {
		throw new IOException("Failed parallel read of text csv input.", e);
	}
}
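The split-level parallelism above is a two-pass scheme: first count the rows of each split, then take a cumulative sum of the counts to obtain the destination row offset at which each split starts, and finally read each split into its own row range. The following is a minimal sketch of the offset computation, assuming plain java.util.concurrent primitives instead of SystemML's CommonThreadPool; the method name computeRowOffsets is illustrative.

import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class OffsetSketch {
	// First pass: count rows per split in parallel; the cumulative sum of the
	// counts gives the row offset at which each split writes into the output.
	public static long[] computeRowOffsets(List<Callable<Long>> countTasks, int numThreads) throws Exception {
		ExecutorService pool = Executors.newFixedThreadPool(Math.min(numThreads, countTasks.size()));
		try {
			List<Future<Long>> counts = pool.invokeAll(countTasks);
			long[] offsets = new long[countTasks.size()];
			long offset = 0;
			for (int i = 0; i < counts.size(); i++) {
				offsets[i] = offset;
				offset += counts.get(i).get();
			}
			return offsets;
		}
		finally {
			pool.shutdown();
		}
	}
}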
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache:
the class FrameReaderTextCellParallel, method readTextCellFrameFromHDFS.
@Override
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
	int numThreads = OptimizerUtils.getParallelTextReadParallelism();
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	try {
		// create read tasks for all splits
		ExecutorService pool = CommonThreadPool.get(numThreads);
		InputSplit[] splits = informat.getSplits(job, numThreads);
		ArrayList<ReadTask> tasks = new ArrayList<>();
		for (InputSplit split : splits)
			tasks.add(new ReadTask(split, informat, job, dest));
		// wait until all tasks have been executed
		List<Future<Object>> rt = pool.invokeAll(tasks);
		pool.shutdown();
		// check for exceptions
		for (Future<Object> task : rt)
			task.get();
	}
	catch (Exception e) {
		throw new IOException("Failed parallel read of text cell input.", e);
	}
}
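Each ReadTask above owns exactly one InputSplit, so tasks can run concurrently without sharing reader state. A hedged sketch of what such a per-split Callable can look like with the mapred API follows, with the actual cell parsing and FrameBlock writes omitted; the class name ReadTaskSketch is illustrative, not the SystemML implementation.

import java.util.concurrent.Callable;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

// Shape of a per-split read task: each Callable opens its own RecordReader
// over its assigned split and returns when the split is exhausted.
public class ReadTaskSketch implements Callable<Object> {
	private final InputSplit _split;
	private final TextInputFormat _informat;
	private final JobConf _job;

	public ReadTaskSketch(InputSplit split, TextInputFormat informat, JobConf job) {
		_split = split;
		_informat = informat;
		_job = job;
	}

	@Override
	public Object call() throws Exception {
		RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
		LongWritable key = reader.createKey();
		Text value = reader.createValue();
		try {
			while (reader.next(key, value)) {
				// parse "row col value" and write into the destination block (omitted)
			}
		}
		finally {
			reader.close();
		}
		return null;
	}
}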
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache:
the class ReaderTextCSVParallel, method readCSVMatrixFromHDFS.
private void readCSVMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue) throws IOException {
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	ExecutorService pool = CommonThreadPool.get(_numThreads);
	try {
		// create read tasks for all splits
		ArrayList<CSVReadTask> tasks = new ArrayList<>();
		int splitCount = 0;
		for (InputSplit split : splits) {
			tasks.add(new CSVReadTask(split, _offsets, informat, job, dest, rlen, clen, hasHeader, delim, fill, fillValue, splitCount++));
		}
		pool.invokeAll(tasks);
		pool.shutdown();
		// check return codes and aggregate nnz
		long lnnz = 0;
		for (CSVReadTask rt : tasks) {
			lnnz += rt.getPartialNnz();
			if (!rt.getReturnCode()) {
				Exception err = rt.getException();
				throw new IOException("Read task for csv input failed: " + err.toString(), err);
			}
		}
		dest.setNonZeros(lnnz);
	}
	catch (Exception e) {
		throw new IOException("Threadpool issue, while parallel read.", e);
	}
}
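Unlike the previous readers, this one does not inspect the futures returned by invokeAll; each CSVReadTask records a return code, a partial non-zero count, and any exception, and the caller aggregates after all tasks have completed. A minimal sketch of that aggregation pattern follows, assuming a hypothetical PartialReadResult interface that exposes the same accessors as CSVReadTask.

import java.io.IOException;
import java.util.List;

public class NnzAggregationSketch {
	// Hypothetical view of a completed read task: success flag, partial
	// non-zero count, and the captured exception if the task failed.
	public interface PartialReadResult {
		boolean getReturnCode();
		long getPartialNnz();
		Exception getException();
	}

	// Fail fast on the first unsuccessful task, otherwise sum the partial
	// non-zero counts for the destination block.
	public static long aggregateNnz(List<? extends PartialReadResult> tasks) throws IOException {
		long lnnz = 0;
		for (PartialReadResult rt : tasks) {
			if (!rt.getReturnCode())
				throw new IOException("Read task for csv input failed.", rt.getException());
			lnnz += rt.getPartialNnz();
		}
		return lnnz;
	}
}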
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache:
the class RemoveEmptyRows, method execute.
@Override
public void execute() {
	Matrix mat = (Matrix) this.getFunctionInput(0);
	String fnameOld = mat.getFilePath();
	// old,new rowID
	HashMap<Long, Long> keyMap = new HashMap<>();
	try {
		// prepare input
		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
		Path path = new Path(fnameOld);
		FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
		if (!fs.exists(path))
			throw new IOException("File " + fnameOld + " does not exist on HDFS.");
		FileInputFormat.addInputPath(job, path);
		TextInputFormat informat = new TextInputFormat();
		informat.configure(job);
		// prepare output
		String fnameNew = createOutputFilePathAndName(OUTPUT_FILE);
		DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream(fnameNew, true);
		// read and write if necessary
		InputSplit[] splits = informat.getSplits(job, 1);
		LongWritable key = new LongWritable();
		Text value = new Text();
		long ID = 1;
		try {
			// for obj reuse and preventing repeated buffer re-allocations
			StringBuilder sb = new StringBuilder();
			for (InputSplit split : splits) {
				RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
				try {
					while (reader.next(key, value)) {
						String cellStr = value.toString().trim();
						StringTokenizer st = new StringTokenizer(cellStr, " ");
						long row = Long.parseLong(st.nextToken());
						long col = Long.parseLong(st.nextToken());
						double lvalue = Double.parseDouble(st.nextToken());
						if (!keyMap.containsKey(row))
							keyMap.put(row, ID++);
						long rowNew = keyMap.get(row);
						sb.append(rowNew);
						sb.append(' ');
						sb.append(col);
						sb.append(' ');
						sb.append(lvalue);
						sb.append('\n');
						ostream.writeBytes(sb.toString());
						sb.setLength(0);
					}
				}
				finally {
					if (reader != null)
						reader.close();
				}
			}
			_ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
		}
		finally {
			if (ostream != null)
				ostream.close();
		}
	}
	catch (Exception ex) {
		throw new RuntimeException("Unable to execute external function.", ex);
	}
}
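The core of RemoveEmptyRows is the renumbering of row IDs into a dense 1..k range: the first time a non-empty row ID is seen it receives the next compact ID, columns and values pass through unchanged, and rows that never occur in the input simply disappear. A standalone sketch of that mapping follows; the class and method names are chosen for illustration only. For example, rows {2, 5, 9} map to {1, 2, 3}.

import java.util.HashMap;
import java.util.Map;

public class RowRenumberSketch {
	// Assign compact IDs 1..k to row IDs in order of first appearance.
	public static Map<Long, Long> compactRowIds(long[] rowIdsInCellOrder) {
		Map<Long, Long> keyMap = new HashMap<>();
		long nextId = 1;
		for (long row : rowIdsInCellOrder)
			if (!keyMap.containsKey(row))
				keyMap.put(row, nextId++);
		return keyMap;
	}
}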