use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
the class RemoteParForMR method readResultFile.
/**
* Result file contains hierarchy of workerID-resultvar(incl filename). We deduplicate
* on the workerID. Without JVM reuse each task refers to a unique workerID, so we
* will not find any duplicates. With JVM reuse, however, each slot refers to a workerID,
* and there are duplicate filenames due to partial aggregation and overwrite of fname
* (the RemoteParWorkerMapper ensures uniqueness of those files independent of the
* runtime implementation).
*
* @param job job configuration
* @param fname file name
* @return array of local variable maps
* @throws IOException if IOException occurs
*/
@SuppressWarnings("deprecation")
public static LocalVariableMap[] readResultFile(JobConf job, String fname) throws IOException {
HashMap<Long, LocalVariableMap> tmp = new HashMap<>();
Path path = new Path(fname);
FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
// workerID
LongWritable key = new LongWritable();
// serialized var header (incl filename)
Text value = new Text();
int countAll = 0;
for (Path lpath : IOUtilFunctions.getSequenceFilePaths(fs, path)) {
SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
try {
while (reader.next(key, value)) {
if (!tmp.containsKey(key.get()))
tmp.put(key.get(), new LocalVariableMap());
Object[] dat = ProgramConverter.parseDataObject(value.toString());
tmp.get(key.get()).put((String) dat[0], (Data) dat[1]);
countAll++;
}
} finally {
IOUtilFunctions.closeSilently(reader);
}
}
LOG.debug("Num remote worker results (before deduplication): " + countAll);
LOG.debug("Num remote worker results: " + tmp.size());
// create return array
return tmp.values().toArray(new LocalVariableMap[0]);
}
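For context, the per-worker grouping that readResultFile performs can be reproduced with plain Hadoop I/O alone. The following is a minimal sketch under that assumption (class and method names are illustrative, not part of SystemML; payloads stay as raw strings instead of parsed Data objects):
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
public class WorkerResultReader {
    // Groups the Text payloads of a sequence file by their LongWritable key (the workerID),
    // mirroring the per-worker deduplication performed in readResultFile.
    @SuppressWarnings("deprecation")
    public static Map<Long, List<String>> readGrouped(Configuration conf, String fname) throws IOException {
        Map<Long, List<String>> grouped = new HashMap<>();
        Path path = new Path(fname);
        FileSystem fs = path.getFileSystem(conf);
        LongWritable key = new LongWritable();
        Text value = new Text();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            while (reader.next(key, value))
                grouped.computeIfAbsent(key.get(), k -> new ArrayList<>()).add(value.toString());
        } finally {
            reader.close();
        }
        return grouped;
    }
}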
use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
the class ResultMergeLocalFile method createTextCellStagingFile.
private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID) throws IOException, DMLRuntimeException {
JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
Path path = new Path(mo.getFileName());
FileInputFormat.addInputPath(job, path);
TextInputFormat informat = new TextInputFormat();
informat.configure(job);
InputSplit[] splits = informat.getSplits(job, 1);
LinkedList<Cell> buffer = new LinkedList<>();
LongWritable key = new LongWritable();
Text value = new Text();
MatrixCharacteristics mc = mo.getMatrixCharacteristics();
int brlen = mc.getRowsPerBlock();
int bclen = mc.getColsPerBlock();
// long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
// NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
// errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
// It works fine with int row, col but we require long for larger matrices.
// Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
// we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
FastStringTokenizer st = new FastStringTokenizer(' ');
for (InputSplit split : splits) {
RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
try {
while (reader.next(key, value)) {
// reset tokenizer
st.reset(value.toString());
long row = st.nextLong();
long col = st.nextLong();
double lvalue = Double.parseDouble(st.nextToken());
Cell tmp = new Cell(row, col, lvalue);
buffer.addLast(tmp);
// periodic flush
if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
buffer.clear();
}
}
// final flush
if (!buffer.isEmpty()) {
appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
buffer.clear();
}
} finally {
IOUtilFunctions.closeSilently(reader);
}
}
}
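The textcell lines consumed above follow the "row col value" format (1-based indices, space-separated). A small stand-alone sketch of that parsing step, using only JDK classes and a hypothetical TextCell holder in place of SystemML's Cell:
// Hypothetical value holder standing in for SystemML's Cell.
public class TextCell {
    public final long row;   // 1-based row index
    public final long col;   // 1-based column index
    public final double value;

    public TextCell(long row, long col, double value) {
        this.row = row;
        this.col = col;
        this.value = value;
    }

    // Parses one textcell line of the form "<row> <col> <value>", e.g. "3 7 1.5".
    public static TextCell parse(String line) {
        java.util.StringTokenizer st = new java.util.StringTokenizer(line, " ");
        return new TextCell(
            Long.parseLong(st.nextToken()),
            Long.parseLong(st.nextToken()),
            Double.parseDouble(st.nextToken()));
    }
}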
use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
the class DataPartitionerLocal method partitionTextCell.
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen, int brlen, int bclen) {
long row = -1;
long col = -1;
try {
// STEP 1: read matrix from HDFS and write blocks to local staging area
// check and add input path
JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
Path path = new Path(fname);
FileInputFormat.addInputPath(job, path);
TextInputFormat informat = new TextInputFormat();
informat.configure(job);
InputSplit[] splits = informat.getSplits(job, 1);
LinkedList<Cell> buffer = new LinkedList<>();
LongWritable key = new LongWritable();
Text value = new Text();
FastStringTokenizer st = new FastStringTokenizer(' ');
for (InputSplit split : splits) {
RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
try {
while (reader.next(key, value)) {
// reset tokenizer
st.reset(value.toString());
row = st.nextLong();
col = st.nextLong();
double lvalue = st.nextDouble();
Cell tmp = new Cell(row, col, lvalue);
buffer.addLast(tmp);
// periodic flush
if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
buffer.clear();
}
}
// final flush
if (!buffer.isEmpty()) {
appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
buffer.clear();
}
} finally {
IOUtilFunctions.closeSilently(reader);
}
}
// STEP 2: read matrix blocks from staging area and write matrix to HDFS
String[] fnamesPartitions = new File(fnameStaging).list();
if (PARALLEL) {
int len = Math.min(fnamesPartitions.length, _par);
Thread[] threads = new Thread[len];
for (int i = 0; i < len; i++) {
int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
end = Math.min(end, fnamesPartitions.length - 1);
threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging, fnamesPartitions, start, end));
threads[i].start();
}
for (Thread t : threads) t.join();
} else {
for (String pdir : fnamesPartitions) writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
}
} catch (Exception e) {
// post-mortem error handling and bounds checking
if (row < 1 || row > rlen || col < 1 || col > clen) {
throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
} else
throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
}
}
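The parallel branch of STEP 2 splits the staging directories into contiguous index ranges of roughly equal size. That arithmetic can be isolated as below (a sketch; the method name is illustrative, ranges are inclusive and clamped to the last index):
// Splits n items into at most 'par' contiguous, inclusive [start, end] index ranges,
// mirroring the ceil-based start/end computation used for the worker threads above.
static int[][] splitRanges(int n, int par) {
    int len = Math.min(n, par);
    int chunk = (int) Math.ceil((double) n / len);
    int[][] ranges = new int[len][2];
    for (int i = 0; i < len; i++) {
        ranges[i][0] = i * chunk;
        ranges[i][1] = Math.min((i + 1) * chunk - 1, n - 1);
    }
    return ranges;
}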
use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
the class FrameReaderBinaryBlock method readBinaryBlockFrameFromSequenceFile.
@SuppressWarnings({ "deprecation" })
protected static void readBinaryBlockFrameFromSequenceFile(Path path, JobConf job, FileSystem fs, FrameBlock dest) throws IOException, DMLRuntimeException {
int rlen = dest.getNumRows();
int clen = dest.getNumColumns();
// directly read from sequence files (individual partfiles)
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
LongWritable key = new LongWritable(-1L);
FrameBlock value = new FrameBlock();
try {
while (reader.next(key, value)) {
int row_offset = (int) (key.get() - 1);
int rows = value.getNumRows();
int cols = value.getNumColumns();
// empty block, ignore it
if (rows == 0 || cols == 0)
continue;
// bound check per block
if (row_offset + rows < 0 || row_offset + rows > rlen) {
throw new IOException("Frame block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + ":" + "] " + "out of overall frame range [1:" + rlen + ",1:" + clen + "].");
}
// copy block into target frame, incl meta on first
dest.copy(row_offset, row_offset + rows - 1, 0, cols - 1, value);
if (row_offset == 0)
dest.setColumnMetadata(value.getColumnMetadata());
}
} finally {
IOUtilFunctions.closeSilently(reader);
}
}
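Here the LongWritable key carries the 1-based row index of each frame block. A minimal sketch of the offset computation and bounds check in isolation (helper name is illustrative, and the error message is simplified to the row range only):
// Converts the 1-based block row index (the LongWritable key) into a 0-based row offset
// and verifies that a block of 'rows' rows fits into a destination frame of 'rlen' rows.
static int toRowOffset(long blockKey, int rows, int rlen) throws java.io.IOException {
    int rowOffset = (int) (blockKey - 1);
    if (rowOffset + rows < 0 || rowOffset + rows > rlen)
        throw new java.io.IOException("Frame block [" + (rowOffset + 1) + ":" + (rowOffset + rows)
            + "] out of overall frame range [1:" + rlen + "].");
    return rowOffset;
}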
use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
the class FrameReaderTextCSV method readCSVFrameFromInputSplit.
protected final int readCSVFrameFromInputSplit(InputSplit split, InputFormat<LongWritable, Text> informat, JobConf job, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen, int rl, boolean first) throws IOException {
boolean hasHeader = _props.hasHeader();
boolean isFill = _props.isFill();
double dfillValue = _props.getFillValue();
String sfillValue = String.valueOf(_props.getFillValue());
String delim = _props.getDelim();
// create record reader
RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
LongWritable key = new LongWritable();
Text value = new Text();
int row = rl;
int col = -1;
// handle header if existing
if (first && hasHeader) {
// read header
reader.next(key, value);
dest.setColumnNames(value.toString().split(delim));
}
// Read the data
boolean emptyValuesFound = false;
try {
// foreach line
while (reader.next(key, value)) {
String cellStr = value.toString().trim();
emptyValuesFound = false;
col = 0;
String[] parts = IOUtilFunctions.splitCSV(cellStr, delim);
// parse frame meta data (missing values / num distinct)
if (parts[0].equals(TfUtils.TXMTD_MVPREFIX) || parts[0].equals(TfUtils.TXMTD_NDPREFIX)) {
if (parts[0].equals(TfUtils.TXMTD_MVPREFIX))
for (int j = 0; j < dest.getNumColumns(); j++) dest.getColumnMetadata(j).setMvValue(parts[j + 1]);
else if (parts[0].equals(TfUtils.TXMTD_NDPREFIX))
for (int j = 0; j < dest.getNumColumns(); j++) dest.getColumnMetadata(j).setNumDistinct(Long.parseLong(parts[j + 1]));
continue;
}
// foreach cell
for (String part : parts) {
part = part.trim();
if (part.isEmpty()) {
if (isFill && dfillValue != 0)
dest.set(row, col, UtilFunctions.stringToObject(schema[col], sfillValue));
emptyValuesFound = true;
} else {
dest.set(row, col, UtilFunctions.stringToObject(schema[col], part));
}
col++;
}
// sanity checks for empty values and number of columns
IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, isFill, emptyValuesFound);
IOUtilFunctions.checkAndRaiseErrorCSVNumColumns("", cellStr, parts, clen);
row++;
}
} finally {
IOUtilFunctions.closeSilently(reader);
}
return row;
}
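The per-cell loop above trims each token and, when fill is enabled, substitutes the configured fill value for empty fields. A self-contained sketch of that step (plain String.split instead of IOUtilFunctions.splitCSV, so quoted delimiters are not handled; class and method names are illustrative):
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
public class CsvCellParser {
    // Splits one CSV line and applies the fill value to empty cells when fill is enabled.
    public static List<String> parseLine(String line, String delim, boolean isFill, String fillValue) {
        List<String> cells = new ArrayList<>();
        for (String part : line.split(Pattern.quote(delim), -1)) {
            String p = part.trim();
            if (p.isEmpty() && isFill)
                p = fillValue;
            cells.add(p);
        }
        return cells;
    }
}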