use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
the class MLContextConversionUtil method javaRDDStringIJVToMatrixObject.
/**
* Convert a {@code JavaRDD<String>} in IJV format to a {@code MatrixObject}.
* Note that metadata is required for IJV format.
*
* @param javaRDD
* the Java RDD of strings
* @param matrixMetadata
* matrix metadata
* @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
*/
public static MatrixObject javaRDDStringIJVToMatrixObject(JavaRDD<String> javaRDD, MatrixMetadata matrixMetadata) {
    // convert <String> lines into <LongWritable, Text> pairs for the text-cell reader
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(),
        new MetaDataFormat(mc, OutputInfo.TextCellOutputInfo, InputInfo.TextCellInputInfo));
    JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
    matrixObject.setRDDHandle(new RDDObject(javaPairRDD2));
    return matrixObject;
}
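For context, a minimal usage sketch (not part of the project source): the input values, the 2x2 dimensions, and the JavaSparkContext variable sc below are hypothetical, and the MatrixMetadata supplies the row/column counts that IJV format requires.

// hypothetical 2x2 matrix in IJV format ("row col value" per line)
JavaRDD<String> ijv = sc.parallelize(Arrays.asList("1 1 1.0", "1 2 2.0", "2 1 3.0", "2 2 4.0"));
// metadata is mandatory for IJV input, since dimensions cannot be inferred cheaply
MatrixMetadata mm = new MatrixMetadata(2, 2);
MatrixObject mo = MLContextConversionUtil.javaRDDStringIJVToMatrixObject(ijv, mm);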
use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
the class MLContextConversionUtil method javaRDDStringCSVToFrameObject.
/**
* Convert a {@code JavaRDD<String>} in CSV format to a {@code FrameObject}.
*
* @param javaRDD
* the Java RDD of strings
* @param frameMetadata
* frame metadata
* @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
*/
public static FrameObject javaRDDStringCSVToFrameObject(JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (frameMetadata != null) ? frameMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
    // note: frameMetadata must be non-null here, since the frame schema is read from it
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(),
        new MetaDataFormat(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo),
        frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        rdd = FrameRDDConverterUtils.csvToBinaryBlock(jsc(), javaPairRDDText, mc, frameObject.getSchema(), false, ",", false, -1);
    } catch (DMLRuntimeException e) {
        e.printStackTrace();
        return null;
    }
    frameObject.setRDDHandle(new RDDObject(rdd));
    return frameObject;
}
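Again for context, a hedged usage sketch: the CSV content, the schema, and sc are assumptions, and the FrameSchema/FrameMetadata construction shown here is one plausible variant of the MLContext API, not necessarily the only one.

// hypothetical two-column CSV frame: an integer id and a string name
JavaRDD<String> csv = sc.parallelize(Arrays.asList("1,alice", "2,bob"));
FrameSchema schema = new FrameSchema(Arrays.asList(ValueType.INT, ValueType.STRING));
FrameMetadata fm = new FrameMetadata(schema);
fm.setNumRows(2L);
fm.setNumColumns(2L);
FrameObject fo = MLContextConversionUtil.javaRDDStringCSVToFrameObject(csv, fm);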
use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
the class ReblockSPInstruction method processMatrixReblockInstruction.
@SuppressWarnings("unchecked")
protected void processMatrixReblockInstruction(SparkExecutionContext sec, InputInfo iinfo) {
    MatrixObject mo = sec.getMatrixObject(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (iinfo == InputInfo.TextCellInputInfo || iinfo == InputInfo.MatrixMarketInputInfo) {
        // get the input textcell rdd
        JavaPairRDD<LongWritable, Text> lines = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
        // convert textcell to binary block
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDConverterUtils.textCellToBinaryBlock(sec.getSparkContext(), lines, mcOut, outputEmptyBlocks);
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else if (iinfo == InputInfo.CSVInputInfo) {
        // HACK ALERT: Until we introduce the rewrite to insert csvrblock for non-persistent read
        // throw new DMLRuntimeException("CSVInputInfo is not supported for ReblockSPInstruction");
        boolean hasHeader = false;
        String delim = ",";
        boolean fill = false;
        double fillValue = 0;
        if (mo.getFileFormatProperties() instanceof CSVFileFormatProperties) {
            CSVFileFormatProperties props = (CSVFileFormatProperties) mo.getFileFormatProperties();
            hasHeader = props.hasHeader();
            delim = props.getDelim();
            fill = props.isFill();
            fillValue = props.getFillValue();
        }
        CSVReblockSPInstruction csvInstruction = new CSVReblockSPInstruction(null, input1, output,
            mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), hasHeader, delim, fill, fillValue, "csvrblk", instString);
        csvInstruction.processInstruction(sec);
        return;
    } else if (iinfo == InputInfo.BinaryCellInputInfo) {
        JavaPairRDD<MatrixIndexes, MatrixCell> binaryCells = (JavaPairRDD<MatrixIndexes, MatrixCell>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), binaryCells, mcOut, outputEmptyBlocks);
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else if (iinfo == InputInfo.BinaryBlockInputInfo) {
        // BINARY BLOCK <- BINARY BLOCK (different sizes)
        JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
        // a reblock is shuffle-free if each input block maps into exactly one output block
        boolean shuffleFreeReblock = mc.dimsKnown() && mcOut.dimsKnown()
            && (mc.getRows() < mcOut.getRowsPerBlock() || mc.getRowsPerBlock() % mcOut.getRowsPerBlock() == 0)
            && (mc.getCols() < mcOut.getColsPerBlock() || mc.getColsPerBlock() % mcOut.getColsPerBlock() == 0);
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.flatMapToPair(new ExtractBlockForBinaryReblock(mc, mcOut));
        if (!shuffleFreeReblock)
            out = RDDAggregateUtils.mergeByKey(out, false);
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else {
        throw new DMLRuntimeException("The given InputInfo is not implemented for ReblockSPInstruction: " + InputInfo.inputInfoToString(iinfo));
    }
}
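The shuffle-free condition in the binary-block branch is compact but worth restating: a reblock needs no shuffle when every input block falls entirely within one output block, which holds per dimension when the whole dimension fits into a single output block or the input block size is a multiple of the output block size. A standalone restatement (not project code) for illustration:

// illustrative restatement of the shuffle-free reblock check above
static boolean isShuffleFree(long rows, long cols, int brlenIn, int bclenIn, int brlenOut, int bclenOut) {
    return (rows < brlenOut || brlenIn % brlenOut == 0)  // row dimension
        && (cols < bclenOut || bclenIn % bclenOut == 0); // column dimension
}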
use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
the class RemoteDPParForMR method readResultFile.
/**
* The result file contains a hierarchy of workerID-resultvar entries (incl. filename).
* We deduplicate on the workerID. Without JVM reuse, each task refers to a unique
* workerID, so we will not find any duplicates. With JVM reuse, however, each slot
* refers to a workerID, and there are duplicate filenames due to partial aggregation
* and overwrite of fname (the RemoteParWorkerMapper ensures the uniqueness of those
* files independent of the runtime implementation).
*
* @param job job configuration
* @param fname file name
* @return array of local variable maps
* @throws IOException if IOException occurs
*/
@SuppressWarnings("deprecation")
public static LocalVariableMap[] readResultFile(JobConf job, String fname) throws IOException {
    HashMap<Long, LocalVariableMap> tmp = new HashMap<>();
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    LongWritable key = new LongWritable(); // workerID
    Text value = new Text(); // serialized var header (incl filename)
    int countAll = 0;
    for (Path lpath : IOUtilFunctions.getSequenceFilePaths(fs, path)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
        try {
            while (reader.next(key, value)) {
                if (!tmp.containsKey(key.get()))
                    tmp.put(key.get(), new LocalVariableMap());
                Object[] dat = ProgramConverter.parseDataObject(value.toString());
                tmp.get(key.get()).put((String) dat[0], (Data) dat[1]);
                countAll++;
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
    LOG.debug("Num remote worker results (before deduplication): " + countAll);
    LOG.debug("Num remote worker results: " + tmp.size());
    // create return array
    return tmp.values().toArray(new LocalVariableMap[0]);
}
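For reference, a sketch of how a result file in the expected <LongWritable workerID, Text serialized-variable> layout could be produced with the standard Hadoop SequenceFile writer API; the output path and the serializedVarHeader payload are hypothetical.

// write one <workerID, serialized var header> pair; path and payload are made up
Path out = new Path("/tmp/parfor-result/0");
try (SequenceFile.Writer w = SequenceFile.createWriter(job,
        SequenceFile.Writer.file(out),
        SequenceFile.Writer.keyClass(LongWritable.class),
        SequenceFile.Writer.valueClass(Text.class))) {
    w.append(new LongWritable(42), new Text(serializedVarHeader));
}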
use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
the class RemoteParForColocatedFileSplit method getLocations.
/**
* Get the list of hostnames where the input split is located.
*/
@Override
public String[] getLocations() throws IOException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = IOUtilFunctions.getFileSystem(getPath(), job);
    // read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = null;
    try {
        reader = new NLineInputFormat().getRecordReader(this, job, Reporter.NULL);
        reader.next(key, value);
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    // parse task
    Task t = Task.parseCompactString(value.toString());
    // get all locations
    HashMap<String, Integer> hosts = new HashMap<>();
    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf((val.getLongValue() - 1) / _blen + 1);
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    } else { // TaskType.RANGE
        // since this is a serial process, we use just the first and last iteration
        // as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf((li - 1) / _blen + 1);
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    }
    // majority consensus on top host
    return getTopHosts(hosts);
}
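The helpers countHosts and getTopHosts are not shown in this snippet; a plausible sketch of their contract (tallying host occurrences and returning hosts ordered by frequency) could look like this, though the project's actual implementations may differ.

// tally each host occurrence (sketch)
private static void countHosts(HashMap<String, Integer> hosts, String[] names) {
    for (String name : names)
        hosts.merge(name, 1, Integer::sum);
}

// return hosts sorted by descending occurrence count (sketch)
private static String[] getTopHosts(HashMap<String, Integer> hosts) {
    return hosts.entrySet().stream()
        .sorted((a, b) -> b.getValue().compareTo(a.getValue()))
        .map(Map.Entry::getKey)
        .toArray(String[]::new);
}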