use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.
The class RemoveEmptyRows, method execute().
@Override
public void execute() {
    Matrix mat = (Matrix) this.getFunctionInput(0);
    String fnameOld = mat.getFilePath();
    // old, new rowID
    HashMap<Long, Long> keyMap = new HashMap<>();
    try {
        // prepare input
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameOld);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        if (!fs.exists(path))
            throw new IOException("File " + fnameOld + " does not exist on HDFS.");
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        // prepare output
        String fnameNew = createOutputFilePathAndName(OUTPUT_FILE);
        DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream(fnameNew, true);
        // read and write if necessary
        InputSplit[] splits = informat.getSplits(job, 1);
        LongWritable key = new LongWritable();
        Text value = new Text();
        long ID = 1;
        try {
            // for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();
            for (InputSplit split : splits) {
                RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
                try {
                    while (reader.next(key, value)) {
                        String cellStr = value.toString().trim();
                        StringTokenizer st = new StringTokenizer(cellStr, " ");
                        long row = Integer.parseInt(st.nextToken());
                        long col = Integer.parseInt(st.nextToken());
                        double lvalue = Double.parseDouble(st.nextToken());
                        if (!keyMap.containsKey(row))
                            keyMap.put(row, ID++);
                        long rowNew = keyMap.get(row);
                        sb.append(rowNew);
                        sb.append(' ');
                        sb.append(col);
                        sb.append(' ');
                        sb.append(lvalue);
                        sb.append('\n');
                        ostream.writeBytes(sb.toString());
                        sb.setLength(0);
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            }
            _ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
        } finally {
            if (ostream != null)
                ostream.close();
        }
    } catch (Exception ex) {
        throw new RuntimeException("Unable to execute external function.", ex);
    }
}
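All of the snippets on this page share the same core read pattern: add the input path to a JobConf, configure a TextInputFormat, obtain the splits, and drain each split through a RecordReader. The following standalone sketch distills that pattern using only the Hadoop client API; the class and method names (TextInputFormatReadSketch, readAllLines) are illustrative and not part of SystemML.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatReadSketch {

    // Reads every line of the given HDFS (or local) path with the old mapred TextInputFormat.
    public static void readAllLines(String fname) throws IOException {
        JobConf job = new JobConf(); // default configuration; SystemML passes a cached JobConf instead
        FileInputFormat.addInputPath(job, new Path(fname));
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job); // must be called before getSplits/getRecordReader
        InputSplit[] splits = informat.getSplits(job, 1);
        LongWritable key = new LongWritable(); // byte offset of the line within the file
        Text value = new Text(); // line content; both objects are reused across next() calls
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value))
                    System.out.println(value.toString());
            } finally {
                reader.close();
            }
        }
    }
}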
use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.
The class ResultMergeLocalFile, method createTextCellStagingFile().
private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LinkedList<Cell> buffer = new LinkedList<>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    // long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    // NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell)
    // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                // reset tokenizer
                st.reset(value.toString());
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());
                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                // periodic flush
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }
            // final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
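The method above mixes the read loop with SystemML-internal pieces (FastStringTokenizer, Cell, StagingFileUtils, appendCellBufferToStagingArea). The sketch below shows the same read-parse-buffer-flush idea using only the JDK and Hadoop; CELL_BUFFER_SIZE and flushBuffer(...) are assumed stand-ins for SystemML's StagingFileUtils.CELL_BUFFER_SIZE and appendCellBufferToStagingArea(...), and the "<row> <col> <value>" text-cell line format is the one implied by the tokenizer calls above.

import java.io.IOException;
import java.util.LinkedList;
import java.util.StringTokenizer;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

class TextCellStagingSketch {

    static final int CELL_BUFFER_SIZE = 100_000; // assumed bound, not SystemML's actual constant

    // Reads "<row> <col> <value>" lines, buffers the parsed cells, and flushes the buffer
    // periodically so memory stays bounded regardless of input size.
    static void stageTextCells(InputSplit[] splits, TextInputFormat informat, JobConf job) throws IOException {
        LinkedList<double[]> buffer = new LinkedList<>(); // {row, col, value} triples
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    StringTokenizer st = new StringTokenizer(value.toString(), " ");
                    long row = Long.parseLong(st.nextToken());
                    long col = Long.parseLong(st.nextToken());
                    double val = Double.parseDouble(st.nextToken());
                    buffer.addLast(new double[] { row, col, val });
                    if (buffer.size() > CELL_BUFFER_SIZE) { // periodic flush
                        flushBuffer(buffer);
                        buffer.clear();
                    }
                }
            } finally {
                reader.close();
            }
        }
        if (!buffer.isEmpty()) { // final flush
            flushBuffer(buffer);
            buffer.clear();
        }
    }

    // Placeholder: append the buffered cells to a staging file (hypothetical, not SystemML's helper).
    static void flushBuffer(LinkedList<double[]> buffer) throws IOException {
    }
}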
use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.
The class ResultMergeLocalFile, method mergeTextCellWithoutComp().
private static void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO) {
    try {
        // delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            // we're done
            return;
        }
        // actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameNew);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
        String valueStr = null;
        try {
            // read/write all inputs
            for (MatrixObject in : inMO) {
                if (LOG.isTraceEnabled())
                    LOG.trace("ResultMerge (local, file): Merge input " + in.hashCode() + " (fname=" + in.getFileName() + ") via stream merge");
                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);
                LongWritable key = new LongWritable();
                Text value = new Text();
                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        IOUtilFunctions.closeSilently(reader);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
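Stripped of the SystemML plumbing, the merge above is a plain stream concatenation: every line of every input file is appended to one freshly created output file. A minimal sketch under that reading follows; TextFileConcatSketch, concatTextFiles, inputs, and fnameNew are illustrative names, and a Hadoop client configuration is assumed to be available on the classpath.

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

class TextFileConcatSketch {

    // Appends every line of the given input files, in order, to one new output file.
    static void concatTextFiles(List<String> inputs, String fnameNew) throws IOException {
        JobConf job = new JobConf();
        Path outPath = new Path(fnameNew);
        FileSystem fs = outPath.getFileSystem(job);
        try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(outPath, true)))) {
            LongWritable key = new LongWritable();
            Text value = new Text();
            for (String fname : inputs) {
                JobConf tmpJob = new JobConf(job); // fresh conf per input so input paths do not accumulate
                FileInputFormat.addInputPath(tmpJob, new Path(fname));
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                for (InputSplit split : informat.getSplits(tmpJob, 1)) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
                    try {
                        while (reader.next(key, value))
                            out.write(value.toString().trim() + "\n");
                    } finally {
                        reader.close();
                    }
                }
            }
        }
    }
}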
use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.
The class ReaderTextCSVParallel, method readCSVMatrixFromHDFS().
private void readCSVMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue) throws IOException {
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    ExecutorService pool = CommonThreadPool.get(_numThreads);
    try {
        // create read tasks for all splits
        ArrayList<CSVReadTask> tasks = new ArrayList<>();
        int splitCount = 0;
        for (InputSplit split : splits) {
            tasks.add(new CSVReadTask(split, _offsets, informat, job, dest, rlen, clen, hasHeader, delim, fill, fillValue, splitCount++));
        }
        pool.invokeAll(tasks);
        pool.shutdown();
        // check return codes and aggregate nnz
        long lnnz = 0;
        for (CSVReadTask rt : tasks) {
            lnnz += rt.getPartialNnz();
            if (!rt.getReturnCode()) {
                Exception err = rt.getException();
                throw new IOException("Read task for csv input failed: " + err.toString(), err);
            }
        }
        dest.setNonZeros(lnnz);
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
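CSVReadTask and CommonThreadPool are SystemML internals, so the task body is not shown above. The sketch below illustrates the same one-task-per-split parallel read with a plain ExecutorService and a hypothetical LineCountTask that merely counts lines per split; it is not SystemML's CSVReadTask.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

class ParallelSplitReadSketch {

    // One task per input split; each task drains its split independently and returns a partial result.
    static class LineCountTask implements Callable<Long> {
        private final InputSplit split;
        private final TextInputFormat informat;
        private final JobConf job;

        LineCountTask(InputSplit split, TextInputFormat informat, JobConf job) {
            this.split = split;
            this.informat = informat;
            this.job = job;
        }

        @Override
        public Long call() throws IOException {
            LongWritable key = new LongWritable();
            Text value = new Text();
            long count = 0;
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value))
                    count++;
            } finally {
                reader.close();
            }
            return count;
        }
    }

    // Submits all tasks, waits for completion, and aggregates the partial results.
    static long countLinesParallel(InputSplit[] splits, TextInputFormat informat, JobConf job, int numThreads)
            throws InterruptedException, ExecutionException {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        try {
            List<LineCountTask> tasks = new ArrayList<>();
            for (InputSplit split : splits)
                tasks.add(new LineCountTask(split, informat, job));
            long total = 0;
            for (Future<Long> f : pool.invokeAll(tasks))
                total += f.get(); // rethrows any task failure as ExecutionException
            return total;
        } finally {
            pool.shutdown();
        }
    }
}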
use of org.apache.hadoop.mapred.TextInputFormat in project systemml by apache.
The class FrameReaderTextCSV, method readCSVFrameFromHDFS().
protected void readCSVFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    splits = IOUtilFunctions.sortInputSplits(splits);
    for (int i = 0, rpos = 0; i < splits.length; i++)
        rpos = readCSVFrameFromInputSplit(splits[i], informat, job, dest, schema, names, rlen, clen, rpos, i == 0);
}
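readCSVFrameFromInputSplit is a SystemML-internal helper; its contract in the loop above is that it consumes one split, writes rows into dest starting at rpos, treats only the first split as possibly carrying the CSV header, and returns the next free row position. Because rpos is carried from one split to the next, the splits must be processed in file order, which is why sortInputSplits is applied first. A hypothetical per-split reader illustrating that contract is sketched below (readSplitSketch, the String[][] destination, and the header assumption are all illustrative, not SystemML's implementation).

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

class CsvSplitReadSketch {

    // Reads one split into dest starting at row position rpos and returns the next free row position.
    // The header line is skipped only when 'first' is true, i.e. for the first split of the file.
    static int readSplitSketch(InputSplit split, TextInputFormat informat, JobConf job,
            String[][] dest, String delim, int rpos, boolean first) throws IOException {
        LongWritable key = new LongWritable();
        Text value = new Text();
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            boolean skipHeader = first; // assumes the file has a header row
            while (reader.next(key, value)) {
                if (skipHeader) {
                    skipHeader = false;
                    continue;
                }
                dest[rpos++] = value.toString().split(delim, -1); // -1 keeps trailing empty fields
            }
        } finally {
            reader.close();
        }
        return rpos;
    }
}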