Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by Apache.
The class QuantileSortSPInstruction, method processInstruction:
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    boolean weighted = (input2 != null);

    //get input rdds
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> inW = weighted ? sec.getBinaryBlockRDDHandleForVariable(input2.getName()) : null;
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
    long clen = -1;
    if (!weighted) { //W/O WEIGHTS (default)
        out = RDDSortUtils.sortByVal(in, mc.getRows(), mc.getRowsPerBlock());
        clen = 1;
    } else { //W/ WEIGHTS
        out = RDDSortUtils.sortByVal(in, inW, mc.getRows(), mc.getRowsPerBlock());
        clen = 2;
    }

    //put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    if (weighted)
        sec.addLineageRDD(output.getName(), input2.getName());

    //update output matrix characteristics
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    mcOut.set(mc.getRows(), clen, mc.getRowsPerBlock(), mc.getColsPerBlock());
}
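For orientation, here is a minimal sketch of the dimension bookkeeping this instruction ends with. The row count and block sizes are made-up illustration values; only the four-argument MatrixCharacteristics constructor and the getters that appear in these snippets are assumed.

// Illustration only: a 1,000,000 x 1 input stored in 1000 x 1000 blocks.
MatrixCharacteristics mcIn = new MatrixCharacteristics(1000000, 1, 1000, 1000);
// Unweighted sort: same number of rows, a single column of sorted values (clen = 1).
MatrixCharacteristics sortedVals = new MatrixCharacteristics(mcIn.getRows(), 1, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
// Weighted sort: two columns, sorted values plus their weights (clen = 2).
MatrixCharacteristics sortedValsW = new MatrixCharacteristics(mcIn.getRows(), 2, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());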
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by Apache.
The class SpoofSPInstruction, method updateOutputMatrixCharacteristics:
private void updateOutputMatrixCharacteristics(SparkExecutionContext sec, SpoofOperator op) throws DMLRuntimeException {
    if (op instanceof SpoofCellwise) {
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(_in[0].getName());
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(_out.getName());
        if (((SpoofCellwise) op).getCellType() == CellType.ROW_AGG)
            mcOut.set(mcIn.getRows(), 1, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
        else if (((SpoofCellwise) op).getCellType() == CellType.NO_AGG)
            mcOut.set(mcIn);
    } else if (op instanceof SpoofOuterProduct) {
        MatrixCharacteristics mcIn1 = sec.getMatrixCharacteristics(_in[0].getName()); //X
        MatrixCharacteristics mcIn2 = sec.getMatrixCharacteristics(_in[1].getName()); //U
        MatrixCharacteristics mcIn3 = sec.getMatrixCharacteristics(_in[2].getName()); //V
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(_out.getName());
        OutProdType type = ((SpoofOuterProduct) op).getOuterProdType();
        if (type == OutProdType.CELLWISE_OUTER_PRODUCT)
            mcOut.set(mcIn1.getRows(), mcIn1.getCols(), mcIn1.getRowsPerBlock(), mcIn1.getColsPerBlock());
        else if (type == OutProdType.LEFT_OUTER_PRODUCT)
            mcOut.set(mcIn3.getRows(), mcIn3.getCols(), mcIn3.getRowsPerBlock(), mcIn3.getColsPerBlock());
        else if (type == OutProdType.RIGHT_OUTER_PRODUCT)
            mcOut.set(mcIn2.getRows(), mcIn2.getCols(), mcIn2.getRowsPerBlock(), mcIn2.getColsPerBlock());
    } else if (op instanceof SpoofRowwise) {
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(_in[0].getName());
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(_out.getName());
        RowType type = ((SpoofRowwise) op).getRowType();
        if (type == RowType.NO_AGG)
            mcOut.set(mcIn);
        else if (type == RowType.ROW_AGG)
            mcOut.set(mcIn.getRows(), 1, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
        else if (type == RowType.COL_AGG)
            mcOut.set(1, mcIn.getCols(), mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
        else if (type == RowType.COL_AGG_T)
            mcOut.set(mcIn.getCols(), 1, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
    }
}
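The rowwise branch boils down to a fixed dimension rule per RowType. A hedged sketch of that mapping as a standalone helper (the helper name is hypothetical, not part of SystemML; only the constructor and getters visible in these snippets are assumed):

// Hypothetical helper for illustration, not part of SystemML.
static MatrixCharacteristics rowwiseOutputDims(MatrixCharacteristics mcIn, RowType type) {
    int brlen = (int) mcIn.getRowsPerBlock();
    int bclen = (int) mcIn.getColsPerBlock();
    switch (type) {
        case NO_AGG:    return new MatrixCharacteristics(mcIn.getRows(), mcIn.getCols(), brlen, bclen);
        case ROW_AGG:   return new MatrixCharacteristics(mcIn.getRows(), 1, brlen, bclen); // one value per row
        case COL_AGG:   return new MatrixCharacteristics(1, mcIn.getCols(), brlen, bclen); // one value per column
        case COL_AGG_T: return new MatrixCharacteristics(mcIn.getCols(), 1, brlen, bclen); // column aggregate, transposed
        default:        throw new IllegalArgumentException("unsupported row type: " + type);
    }
}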
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by Apache.
The class WriteSPInstruction, method processMatrixWriteInstruction:
protected void processMatrixWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi) throws DMLRuntimeException, IOException {
    //get input rdd
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

    if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
        //piggyback nnz maintenance on write
        LongAccumulator aNnz = null;
        if (isInputMatrixBlock && !mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        JavaRDD<String> header = null;
        if (oi == OutputInfo.MatrixMarketOutputInfo) {
            ArrayList<String> headerContainer = new ArrayList<String>(1);
            // First output MM header
            String headerStr = "%%MatrixMarket matrix coordinate real general\n" +
                // output number of rows, number of columns and number of nnz
                mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
            headerContainer.add(headerStr);
            header = sec.getSparkContext().parallelize(headerContainer);
        }
        JavaRDD<String> ijv = RDDConverterUtils.binaryBlockToTextCell(in1, mc);
        if (header != null)
            customSaveTextFile(header.union(ijv), fname, true);
        else
            customSaveTextFile(ijv, fname, false);
        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros(aNnz.value());
    } else if (oi == OutputInfo.CSVOutputInfo) {
        JavaRDD<String> out = null;
        LongAccumulator aNnz = null;
        if (isInputMatrixBlock) {
            //piggyback nnz computation on actual write
            if (!mc.nnzKnown()) {
                aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
                in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
            }
            out = RDDConverterUtils.binaryBlockToCsv(in1, mc, (CSVFileFormatProperties) formatProperties, true);
        } else {
            // This case is applicable when the CSV output from transform() is written out
            // TODO remove once transform over frames supported
            @SuppressWarnings("unchecked")
            JavaPairRDD<Long, String> rdd = (JavaPairRDD<Long, String>) (sec.getMatrixObject(input1.getName())).getRDDHandle().getRDD();
            out = rdd.values();
            String sep = ",";
            boolean hasHeader = false;
            if (formatProperties != null) {
                sep = ((CSVFileFormatProperties) formatProperties).getDelim();
                hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader();
            }
            if (hasHeader) {
                StringBuffer buf = new StringBuffer();
                for (int j = 1; j < mc.getCols(); j++) {
                    if (j != 1) {
                        buf.append(sep);
                    }
                    buf.append("C" + j);
                }
                ArrayList<String> headerContainer = new ArrayList<String>(1);
                headerContainer.add(0, buf.toString());
                JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer);
                out = header.union(out);
            }
        }
        customSaveTextFile(out, fname, false);
        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros((long) aNnz.value().longValue());
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        //piggyback nnz computation on actual write
        LongAccumulator aNnz = null;
        if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        //save binary block rdd on hdfs
        in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        if (!mc.nnzKnown())
            mc.setNonZeros((long) aNnz.value().longValue());
    } else {
        //unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }

    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
}
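To make the MatrixMarket branch concrete, this is the header string the code above would assemble for a hypothetical 10 x 10 matrix with 30 non-zeros (illustrative values only; the getters and setNonZeros are the ones used above):

MatrixCharacteristics mc = new MatrixCharacteristics(10, 10, 1000, 1000);
mc.setNonZeros(30);
String headerStr = "%%MatrixMarket matrix coordinate real general\n"
    + mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
// headerStr now reads:
// %%MatrixMarket matrix coordinate real general
// 10 10 30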
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by Apache.
The class DataTransform, method spDataTransform:
public static void spDataTransform(ParameterizedBuiltinSPInstruction inst, FrameObject[] inputs, MatrixObject[] outputs, ExecutionContext ec) throws Exception {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // Parse transform instruction (the first instruction) to obtain relevant fields
    TransformOperands oprnds = new TransformOperands(inst.getParams(), inputs[0]);
    JobConf job = new JobConf();
    FileSystem fs = IOUtilFunctions.getFileSystem(inputs[0].getFileName());
    checkIfOutputOverlapsWithTxMtd(oprnds.txMtdPath, outputs[0].getFileName(), fs);

    // find the first file in alphabetical ordering of part files in directory inputPath
    String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);

    // find column names and construct output header
    String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
    HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
    int numColumns = colNamesToIds.size();
    String outHeader = getOutputHeader(fs, headerLine, oprnds);
    String tmpPath = MRJobConfiguration.constructTempOutputFilename();

    // Construct RDD for input data
    @SuppressWarnings("unchecked")
    JavaPairRDD<LongWritable, Text> inputData = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForFrameObject(inputs[0], InputInfo.CSVInputInfo);
    JavaRDD<Tuple2<LongWritable, Text>> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD();

    long numRowsTf = 0, numColumnsTf = 0;
    JavaPairRDD<Long, String> tfPairRDD = null;

    if (!oprnds.isApply) {
        // build specification file with column IDs instead of column names
        String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
        // enable GC on colNamesToIds
        colNamesToIds = null;

        // Build transformation metadata, including recode maps, bin definitions, etc.
        // Also, generate part offsets file (counters file), which is to be used in csv-reblock (if needed)
        String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
        numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, partOffsetsFile, oprnds.inputCSVProperties, numColumns, outHeader);

        // store the spec with IDs as transformation metadata
        MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
    } else {
        // enable GC on colNamesToIds
        colNamesToIds = null;

        // copy given transform metadata (applyTxPath) to specified location (txMtdPath)
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
        MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);

        // path to specification file
        String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);

        // Apply transformation metadata, and perform actual transformation
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
    }

    // copy auxiliary data (old and new header lines) from temporary location to txMtdPath
    moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);

    // convert to csv output format (serialized longwritable/text)
    JavaPairRDD<LongWritable, Text> outtfPairRDD = RDDConverterUtils.stringToSerializableText(tfPairRDD);

    if (outtfPairRDD != null) {
        MatrixObject outMO = outputs[0];
        String outVar = outMO.getVarName();
        outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar));
        sec.addLineageRDD(outVar, inst.getParams().get("target"));

        //update output statistics (required for correctness)
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
        mcOut.setDimension(numRowsTf, numColumnsTf);
        mcOut.setNonZeros(-1);
    }
}
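A minimal sketch of the final statistics update, with illustrative counts; a negative non-zero count leaves nnz unknown, consistent with the nnzKnown() guards in WriteSPInstruction above:

// Illustrative counts only; outVar is the output variable name as above.
long numRowsTf = 100000, numColumnsTf = 50;
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
mcOut.setDimension(numRowsTf, numColumnsTf);
mcOut.setNonZeros(-1); // non-zeros are not computed by the transform, so they stay unknown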
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by Apache.
The class DataTransform, method performTransform:
/**
 * Main method to create and/or apply transformation metadata in-memory, on a single node.
 *
 * @param job job configuration
 * @param fs file system
 * @param inputPath path to input files
 * @param ncols number of columns
 * @param prop csv file format properties
 * @param specWithIDs JSON transform specification with IDs
 * @param tfMtdPath transform metadata path
 * @param isApply ?
 * @param result output matrix
 * @param headerLine header line
 * @param isBB true if binary block
 * @param isCSV true if CSV
 * @return MR job result
 * @throws IOException if IOException occurs
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 * @throws IllegalArgumentException if IllegalArgumentException occurs
 * @throws JSONException if JSONException occurs
 */
private static JobReturn performTransform(JobConf job, FileSystem fs, String inputPath, int ncols, CSVFileFormatProperties prop, String specWithIDs, String tfMtdPath, boolean isApply, MatrixObject result, String headerLine, boolean isBB, boolean isCSV) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
    String[] na = TfUtils.parseNAStrings(prop.getNAStrings());
    JSONObject spec = new JSONObject(specWithIDs);
    TfUtils agents = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), na, spec, ncols, tfMtdPath, null, null);

    MVImputeAgent _mia = agents.getMVImputeAgent();
    RecodeAgent _ra = agents.getRecodeAgent();
    BinAgent _ba = agents.getBinAgent();
    DummycodeAgent _da = agents.getDummycodeAgent();

    // List of files to read
    ArrayList<Path> files = collectInputFiles(inputPath, fs);

    // ---------------------------------
    // Construct transformation metadata
    // ---------------------------------
    String line = null;
    String[] words = null;
    int numColumnsTf = 0;

    if (!isApply) {
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
                if (fileNo == 0 && prop.hasHeader())
                    br.readLine(); //ignore header
                line = null;
                while ((line = br.readLine()) != null) {
                    agents.prepareTfMtd(line);
                }
            }
        }
        if (agents.getValid() == 0)
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);

        _mia.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ba.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ra.outputTransformationMetadata(tfMtdPath, fs, agents);

        // prepare agents for the subsequent phase of applying transformation metadata
        // NO need to loadTxMtd for _ra, since the maps are already present in the memory
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);
        _da.setRecodeMapsCP(_ra.getCPRecodeMaps());
        _da.setNumBins(_ba.getColList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    } else {
        // Count the number of rows
        int[] rows = countNumRows(files, prop, fs, agents);
        agents.setTotal(rows[0]);
        agents.setValid(rows[1]);
        if (agents.getValid() == 0)
            throw new DMLRuntimeException("Number of rows in the transformed output (potentially, after omitting the ones with missing values) is zero. Cannot proceed.");

        // Load transformation metadata
        // prepare agents for the subsequent phase of applying transformation metadata
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ra.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);
        _da.setRecodeMaps(_ra.getRecodeMaps());
        _da.setNumBins(_ba.getColList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    }

    // -----------------------------
    // Apply transformation metadata
    // -----------------------------
    numColumnsTf = getNumColumnsTf(fs, headerLine, prop.getDelim(), tfMtdPath);
    MapReduceTool.deleteFileIfExistOnHDFS(result.getFileName());
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(result.getFileName()), true)));
    StringBuilder sb = new StringBuilder();

    try {
        MatrixBlock mb = null;
        if (isBB) {
            int estNNZ = (int) agents.getValid() * ncols;
            mb = new MatrixBlock((int) agents.getValid(), numColumnsTf, estNNZ);
            if (mb.isInSparseFormat())
                mb.allocateSparseRowsBlock();
            else
                mb.allocateDenseBlock();
        }

        // rowid to be used in filling the matrix block
        int rowID = 0;
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
                if (fileNo == 0) {
                    if (prop.hasHeader())
                        br.readLine(); // ignore the header line from data file
                    //TODO: fix hard-wired header propagation to meta data column names
                    String dcdHeader = _da.constructDummycodedHeader(headerLine, agents.getDelim());
                    numColumnsTf = _da.genDcdMapsAndColTypes(fs, tfMtdPath, ncols, agents);
                    generateHeaderFiles(fs, tfMtdPath, headerLine, dcdHeader);
                }
                line = null;
                while ((line = br.readLine()) != null) {
                    words = agents.getWords(line);
                    if (!agents.omit(words)) {
                        words = agents.apply(words);
                        if (isCSV) {
                            out.write(agents.checkAndPrepOutputString(words, sb));
                            out.write("\n");
                        }
                        if (isBB) {
                            agents.check(words);
                            for (int c = 0; c < words.length; c++) {
                                if (words[c] != null && !words[c].isEmpty())
                                    mb.appendValue(rowID, c, UtilFunctions.parseToDouble(words[c]));
                            }
                        }
                        rowID++;
                    }
                }
            }
        }
        if (mb != null) {
            mb.recomputeNonZeros();
            mb.examSparsity();
            result.acquireModify(mb);
            result.release();
            result.exportData();
        }
    } finally {
        IOUtilFunctions.closeSilently(out);
    }

    MatrixCharacteristics mc = new MatrixCharacteristics(agents.getValid(), numColumnsTf, (int) result.getNumRowsPerBlock(), (int) result.getNumColumnsPerBlock());
    JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, true);
    return ret;
}
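Finally, a hedged sketch of how the result is packaged (illustrative numbers; the JobReturn constructor is the one used above, and the boolean is presumably the success flag):

// Illustration only: 5000 valid rows, 120 dummycoded output columns, 1000 x 1000 blocks.
MatrixCharacteristics mc = new MatrixCharacteristics(5000, 120, 1000, 1000);
JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, true);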