use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.
the class RDDConverterUtils method libsvmToBinaryBlock.
/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 * <p>
 * The parsed and row-indexed input is cached while both outputs are produced; the
 * cache is released even if writing one of the outputs fails.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 * @throws DMLRuntimeException if the dimensions of X are unknown, if the number of
 *         columns of X does not fit into an int, or if an I/O error occurs
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn, String pathX, String pathY, MatrixCharacteristics mcOutX) {
    if (!mcOutX.dimsKnown())
        throw new DMLRuntimeException("Matrix characteristics required to convert sparse input representation.");
    // the libsvm loader takes the number of features as an int; fail early (before
    // deleting any output) instead of silently truncating the column count
    if (mcOutX.getCols() > Integer.MAX_VALUE)
        throw new DMLRuntimeException("Number of features exceeds the supported integer range: " + mcOutX.getCols());

    JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint, Long> ilpoints = null;
    try {
        // cleanup existing output files
        MapReduceTool.deleteFileIfExistOnHDFS(pathX);
        MapReduceTool.deleteFileIfExistOnHDFS(pathY);

        // convert libsvm to labeled points
        int numFeatures = (int) mcOutX.getCols();
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
        JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints = MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();

        // append row index and best-effort caching to avoid repeated text parsing
        ilpoints = lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());

        // extract labels and convert to binary block
        MatrixCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out1 = ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
        int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
        out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
        out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        // update nnz after triggered save
        mc1.setNonZeros(aNnz1.value());
        MapReduceTool.writeMetaDataFile(pathY + ".mtd", ValueType.DOUBLE, mc1, OutputInfo.BinaryBlockOutputInfo);

        // extract data and convert to binary block
        MatrixCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out2 = ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
        out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
        out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        // update nnz after triggered save
        mc2.setNonZeros(aNnz2.value());
        MapReduceTool.writeMetaDataFile(pathX + ".mtd", ValueType.DOUBLE, mc2, OutputInfo.BinaryBlockOutputInfo);
    } catch (IOException ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // asynchronous cleanup of cached intermediates, also on failure so that
        // the persisted RDD does not stay cached after an aborted conversion
        if (ilpoints != null)
            ilpoints.unpersist(false);
    }
}
use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.
the class RDDSortUtils method sortByVals.
/**
 * Sorts the values of a binary block matrix and returns the result again in
 * binary block representation.
 * <p>
 * The input blocks are expanded via {@code ExtractRowsFunction}, sorted in
 * ascending order by the keys produced by {@code CreateDoubleKeysFunction},
 * paired with their position in the sorted sequence, and converted back into
 * blocks that are finally merged by block index.
 *
 * @param in input matrix as binary block rdd
 * @param rlen number of rows of the input
 * @param clen number of columns of the input
 * @param brlen block size (used for both block dimensions when sizing partitions)
 * @return sorted matrix as binary block rdd
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortByVals(JavaPairRDD<MatrixIndexes, MatrixBlock> in, long rlen, long clen, int brlen) {
    // preferred degree of parallelism for the sort, derived from the matrix size
    MatrixCharacteristics sizeInfo = new MatrixCharacteristics(rlen, clen, brlen, brlen);
    int sortPartitions = SparkUtils.getNumPreferredPartitions(sizeInfo, in);

    // value rdd from inputs, sorted ascending (creates sorted range per partition)
    JavaRDD<MatrixBlock> sorted = in.values()
        .flatMap(new ExtractRowsFunction())
        .sortBy(new CreateDoubleKeysFunction(), true, sortPartitions);

    // attach global positions and rebuild the binary block output
    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = sorted
        .zipWithIndex()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction5(rlen, brlen));

    return RDDAggregateUtils.mergeByKey(blocks, false);
}
use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.
the class CostEstimator method maintainMRJobInstVariableStatistics.
/**
 * Maintains the variable statistics for a single MR job instruction.
 * <p>
 * Steps: (1) the stats of all job input variables are registered under their
 * positional index, (2) outputs of rand instructions are registered under their
 * output index, (3) the dimensions of all intermediates are derived by running
 * {@code MatrixCharacteristics.computeDimension} over every instruction of the
 * job, and (4) the stats of the result indexes are mapped to the job's output
 * variable names and marked as not in-memory.
 *
 * @param inst instruction, cast to {@code MRJobInstruction}
 * @param stats map of variable name (or index as string) to variable statistics;
 *        updated in place
 */
private void maintainMRJobInstVariableStatistics(Instruction inst, HashMap<String, VarStats> stats) {
    MRJobInstruction jobinst = (MRJobInstruction) inst;

    // input sizes (varname, index mapping)
    String[] inVars = jobinst.getInputVars();
    int index = -1;
    for (String varname : inVars) {
        VarStats vs = stats.get(varname);
        if (vs == null)
            vs = _unknownStats;
        stats.put(String.valueOf(++index), vs);
    }

    // rand output
    String rdInst = jobinst.getIv_randInstructions();
    if (rdInst != null && rdInst.length() > 0) {
        StringTokenizer st = new StringTokenizer(rdInst, Lop.INSTRUCTION_DELIMITOR);
        // foreach rand instruction
        while (st.hasMoreTokens()) {
            String[] parts = InstructionUtils.getInstructionParts(st.nextToken());
            byte outIndex = Byte.parseByte(parts[2]);
            // dimensions given via variable placeholders are unknown (-1)
            long rlen = parts[3].contains(Lop.VARIABLE_NAME_PLACEHOLDER) ? -1 : UtilFunctions.parseToLong(parts[3]);
            long clen = parts[4].contains(Lop.VARIABLE_NAME_PLACEHOLDER) ? -1 : UtilFunctions.parseToLong(parts[4]);
            int brlen = Integer.parseInt(parts[5]);
            int bclen = Integer.parseInt(parts[6]);
            double sparsity = Double.parseDouble(parts[9]);
            // only estimate nnz for known dimensions; multiplying with the -1
            // markers would yield a meaningless (negative or tiny) estimate
            long nnz = (rlen >= 0 && clen >= 0) ? (long) (sparsity * rlen * clen) : -1;
            VarStats vs = new VarStats(rlen, clen, brlen, bclen, nnz, false);
            stats.put(String.valueOf(outIndex), vs);
        }
    }

    // compute intermediate result indices
    HashMap<Byte, MatrixCharacteristics> dims = new HashMap<>();
    // populate input indices (all stats entries whose key is an integer)
    for (Entry<String, VarStats> e : stats.entrySet()) {
        if (UtilFunctions.isIntegerNumber(e.getKey())) {
            byte ix = Byte.parseByte(e.getKey());
            VarStats vs = e.getValue();
            if (vs != null) {
                MatrixCharacteristics mc = new MatrixCharacteristics(vs._rlen, vs._clen, vs._brlen, vs._bclen, (long) vs._nnz);
                dims.put(ix, mc);
            }
        }
    }

    // compute dims for all instructions
    String[] instCat = new String[] {
        jobinst.getIv_randInstructions(),
        jobinst.getIv_recordReaderInstructions(),
        jobinst.getIv_instructionsInMapper(),
        jobinst.getIv_shuffleInstructions(),
        jobinst.getIv_aggInstructions(),
        jobinst.getIv_otherInstructions() };
    for (String linstCat : instCat) {
        if (linstCat != null && linstCat.length() > 0) {
            String[] linst = linstCat.split(Instruction.INSTRUCTION_DELIM);
            for (String instStr : linst) {
                String instStr2 = replaceInstructionPatch(instStr);
                MRInstruction mrinst = MRInstructionParser.parseSingleInstruction(instStr2);
                MatrixCharacteristics.computeDimension(dims, mrinst);
            }
        }
    }

    // create varstats if necessary (indexes that only exist as intermediates)
    for (Entry<Byte, MatrixCharacteristics> e : dims.entrySet()) {
        byte ix = e.getKey();
        if (!stats.containsKey(String.valueOf(ix))) {
            MatrixCharacteristics mc = e.getValue();
            VarStats vs = new VarStats(mc.getRows(), mc.getCols(), mc.getRowsPerBlock(), mc.getColsPerBlock(), mc.getNonZeros(), false);
            stats.put(String.valueOf(ix), vs);
        }
    }

    // map result indexes
    String[] outLabels = jobinst.getOutputVars();
    byte[] resultIndexes = jobinst.getIv_resultIndices();
    for (int i = 0; i < resultIndexes.length; i++) {
        String varname = outLabels[i];
        VarStats varvs = stats.get(String.valueOf(resultIndexes[i]));
        if (varvs == null) {
            // fall back to stats already registered under the output name
            varvs = stats.get(varname);
        }
        // NOTE(review): if neither lookup succeeds, varvs is null and the next
        // line throws a NullPointerException - confirm whether this can occur
        varvs._inmem = false;
        stats.put(varname, varvs);
    }
}
use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.
the class InterProceduralAnalysis method createOutputMatrix.
/**
 * Creates a matrix object of value type double without a file name, carrying
 * only meta data: the given dimensions and number of non-zeros together with
 * the configured block size for both block dimensions.
 *
 * @param dim1 number of rows
 * @param dim2 number of columns
 * @param nnz number of non-zeros
 * @return new matrix object with the described meta data attached
 */
private static MatrixObject createOutputMatrix(long dim1, long dim2, long nnz) {
    final MatrixObject result = new MatrixObject(ValueType.DOUBLE, null);
    final MatrixCharacteristics shape = new MatrixCharacteristics(
        dim1, dim2,
        ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize(),
        nnz);
    result.setMetaData(new MetaDataFormat(shape, null, null));
    return result;
}
use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.
the class Recompiler method reconcileUpdatedCallVarsIf.
/**
 * Reconciles the variables updated in an if statement block across its branches.
 * <p>
 * For every variable updated in {@code sb}, a pair of candidates is compared:
 * the if and else versions when both exist, otherwise the original version and
 * whichever branch version exists. If both candidates are matrix objects and
 * differ in rows, columns, or number of non-zeros, the entry in
 * {@code callVarsIf} is replaced by a new matrix object in which every differing
 * property is set to -1 (unknown) and every agreeing property is kept.
 *
 * @param oldCallVars variables before the if statement
 * @param callVarsIf variables after the if branch; updated in place
 * @param callVarsElse variables after the else branch
 * @param sb statement block providing the set of updated variables
 * @return {@code callVarsIf}, after reconciliation
 */
public static LocalVariableMap reconcileUpdatedCallVarsIf(LocalVariableMap oldCallVars, LocalVariableMap callVarsIf, LocalVariableMap callVarsElse, StatementBlock sb) {
    for (String varname : sb.variablesUpdated().getVariableNames()) {
        Data origVar = oldCallVars.get(varname);
        Data ifVar = callVarsIf.get(varname);
        Data elseVar = callVarsElse.get(varname);

        // pick the two versions to compare: both branches if available,
        // otherwise the original against the single existing branch
        boolean bothBranches = (ifVar != null && elseVar != null);
        Data left = bothBranches ? ifVar : origVar;
        Data right = bothBranches ? elseVar : (ifVar != null ? ifVar : elseVar);

        // only matrices are reconciled (instanceof also rules out null)
        if (!(left instanceof MatrixObject) || !(right instanceof MatrixObject))
            continue;

        MatrixCharacteristics mcLeft = ((MatrixObject) left).getMatrixCharacteristics();
        MatrixCharacteristics mcRight = ((MatrixObject) right).getMatrixCharacteristics();

        boolean rowsDiffer = mcLeft.getRows() != mcRight.getRows();
        boolean colsDiffer = mcLeft.getCols() != mcRight.getCols();
        boolean nnzDiffer = mcLeft.getNonZeros() != mcRight.getNonZeros();
        if (!rowsDiffer && !colsDiffer && !nnzDiffer)
            continue;

        // every property that changed becomes unknown (-1)
        long rows = rowsDiffer ? -1 : mcRight.getRows();
        long cols = colsDiffer ? -1 : mcRight.getCols();
        long nonZeros = nnzDiffer ? -1 : mcRight.getNonZeros();
        callVarsIf.put(varname, createOutputMatrix(rows, cols, nonZeros));
    }
    return callVarsIf;
}
Aggregations