Use of org.apache.hadoop.io.LongWritable in project systemml by apache: the class RemoteParForColocatedFileSplit, method getLocations.
/**
* Get the list of hostnames where the input split is located.
*/
@Override
public String[] getLocations() throws IOException {
    // Timing time = new Timing();
    // time.start();
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = IOUtilFunctions.getFileSystem(getPath(), job);
    // read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = null;
    try {
        reader = new NLineInputFormat().getRecordReader(this, job, Reporter.NULL);
        reader.next(key, value);
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    // parse task
    Task t = Task.parseCompactString(value.toString());
    // get all locations
    HashMap<String, Integer> hosts = new HashMap<>();
    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf(((val.getLongValue() - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1) countHosts(hosts, bl.getHosts());
        }
    } else { // TaskType.RANGE
        // since this is a serial process, we use just the range boundaries (from/to)
        // as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf(((li - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1) countHosts(hosts, bl.getHosts());
        }
    }
    // majority consensus on top host
    return getTopHosts(hosts);
}
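
The countHosts and getTopHosts helpers are not shown in this snippet. A minimal sketch of what they plausibly do, assuming the map tallies how often each hostname appears across block locations and the hosts are returned by descending count (the actual SystemML implementations may differ):

import java.util.ArrayList;
import java.util.HashMap;

class HostTally {
    // hypothetical: count how often each hostname appears across block locations
    static void countHosts(HashMap<String, Integer> hosts, String[] names) {
        for (String name : names) {
            Integer cnt = hosts.get(name);
            hosts.put(name, (cnt != null) ? cnt + 1 : 1);
        }
    }

    // hypothetical: hostnames ordered by descending block count (majority consensus)
    static String[] getTopHosts(HashMap<String, Integer> hosts) {
        ArrayList<String> names = new ArrayList<>(hosts.keySet());
        names.sort((a, b) -> hosts.get(b).compareTo(hosts.get(a)));
        return names.toArray(new String[0]);
    }
}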
Use of org.apache.hadoop.io.LongWritable in project systemml by apache: the class ResultMergeLocalFile, method createTextCellStagingFile.
private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LinkedList<Cell> buffer = new LinkedList<>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    // long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    // NOTE MB: Originally, we used long row, col, but this led reproducibly to JIT compilation
    // errors at runtime, experienced under Windows, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col, but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
    // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                // reset tokenizer
                st.reset(value.toString());
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());
                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                // periodic flush
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }
            // final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
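
For context, a text-cell file stores one "row col value" triple per line, with 1-based indices; the FastStringTokenizer loop above decodes each such line into a Cell. A minimal standalone illustration of that parsing step, using the JDK's StringTokenizer in place of SystemML's FastStringTokenizer:

import java.util.StringTokenizer;

public class TextCellParseDemo {
    public static void main(String[] args) {
        // one text-cell line: 1-based row index, column index, value
        String line = "3 7 4.2";
        StringTokenizer st = new StringTokenizer(line, " ");
        long row = Long.parseLong(st.nextToken());
        long col = Long.parseLong(st.nextToken());
        double val = Double.parseDouble(st.nextToken());
        System.out.println("cell (" + row + "," + col + ") = " + val);
    }
}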
Use of org.apache.hadoop.io.LongWritable in project systemml by apache: the class ResultMergeLocalFile, method mergeTextCellWithoutComp.
private static void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO) {
    try {
        // delete target file if it already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            // we're done
            return;
        }
        // actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameNew);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
        String valueStr = null;
        try {
            // read/write all inputs
            for (MatrixObject in : inMO) {
                if (LOG.isTraceEnabled())
                    LOG.trace("ResultMerge (local, file): Merge input " + in.hashCode() + " (fname=" + in.getFileName() + ") via stream merge");
                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);
                LongWritable key = new LongWritable();
                Text value = new Text();
                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        IOUtilFunctions.closeSilently(reader);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
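
When ALLOW_COPY_CELLFILES is enabled, the merge short-circuits to copying the inputs instead of streaming them. The copyAllFiles helper is not shown; a hypothetical sketch of that fast path, assuming all files reside on the same Hadoop file system (the real SystemML helper may differ):

// hypothetical sketch of the copy fast path
private static void copyAllFiles(String fnameNew, ArrayList<MatrixObject> inMO) throws IOException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path outPath = new Path(fnameNew);
    FileSystem fs = IOUtilFunctions.getFileSystem(outPath, job);
    fs.mkdirs(outPath);
    int id = 0;
    for (MatrixObject in : inMO) {
        // move each input under the output directory with a unique name;
        // text cell encodes explicit positions, so concatenation order does not matter
        fs.rename(new Path(in.getFileName()), new Path(outPath, String.valueOf(id++)));
    }
}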
Use of org.apache.hadoop.io.LongWritable in project systemml by apache: the class MLContextConversionUtil, method javaRDDStringCSVToMatrixObject.
/**
* Convert a {@code JavaRDD<String>} in CSV format to a {@code MatrixObject}
*
* @param javaRDD
* the Java RDD of strings
* @param matrixMetadata
* matrix metadata
* @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
*/
public static MatrixObject javaRDDStringCSVToMatrixObject(JavaRDD<String> javaRDD, MatrixMetadata matrixMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(), new MetaDataFormat(mc, OutputInfo.CSVOutputInfo, InputInfo.CSVInputInfo));
    JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
    matrixObject.setRDDHandle(new RDDObject(javaPairRDD2));
    return matrixObject;
}
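
A minimal usage sketch, assuming an existing JavaSparkContext named sc and a MatrixMetadata (rows, columns) constructor as in the SystemML MLContext API; the dimensions and values are illustrative, and imports/session setup are omitted:

// illustrative usage; sc is assumed to be a live JavaSparkContext
JavaRDD<String> csv = sc.parallelize(Arrays.asList("1.0,2.0,3.0", "4.0,5.0,6.0"));
MatrixMetadata meta = new MatrixMetadata(2, 3); // 2 rows, 3 columns
MatrixObject mo = MLContextConversionUtil.javaRDDStringCSVToMatrixObject(csv, meta);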
Use of org.apache.hadoop.io.LongWritable in project systemml by apache: the class MLContextConversionUtil, method javaRDDStringCSVToFrameObject.
/**
* Convert a {@code JavaRDD<String>} in CSV format to a {@code FrameObject}
*
* @param javaRDD
* the Java RDD of strings
* @param frameMetadata
* frame metadata
* @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
*/
public static FrameObject javaRDDStringCSVToFrameObject(JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (frameMetadata != null) ? frameMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
    // note: frameMetadata is null-checked above but dereferenced unconditionally here,
    // so passing a null frameMetadata throws an NPE when building the schema
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(), new MetaDataFormat(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo), frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        rdd = FrameRDDConverterUtils.csvToBinaryBlock(jsc(), javaPairRDDText, mc, frameObject.getSchema(), false, ",", false, -1);
    } catch (DMLRuntimeException e) {
        e.printStackTrace();
        return null;
    }
    frameObject.setRDDHandle(new RDDObject(rdd));
    return frameObject;
}
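
Both conversions above route through two small Spark pair functions that are not shown. A plausible sketch of their behavior, following Spark's PairFunction contract (the actual SystemML classes may differ in detail):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// wraps each String line in Hadoop writables; the key is a dummy offset
class ConvertStringToLongTextPair implements PairFunction<String, LongWritable, Text> {
    private static final long serialVersionUID = 1L;
    @Override
    public Tuple2<LongWritable, Text> call(String line) {
        return new Tuple2<>(new LongWritable(1L), new Text(line));
    }
}

// deep-copies the mutable, reused Hadoop writables so the RDD can be cached safely
class CopyTextInputFunction implements PairFunction<Tuple2<LongWritable, Text>, LongWritable, Text> {
    private static final long serialVersionUID = 1L;
    @Override
    public Tuple2<LongWritable, Text> call(Tuple2<LongWritable, Text> in) {
        return new Tuple2<>(new LongWritable(in._1().get()), new Text(in._2()));
    }
}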