Use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
The class CorrMatrixBlock, method readHeaderAndPayload:
private void readHeaderAndPayload(DataInput dis) throws IOException {
    boolean corrExists = (dis.readByte() != 0);
    _value = new MatrixBlock();
    _value.readFields(dis);
    if (corrExists) {
        _corr = new MatrixBlock();
        _corr.readFields(dis);
    }
}
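The reader implies a simple on-disk layout: one flag byte announcing whether a correction block follows, then the serialized blocks. A sketch of the symmetric writer this layout implies (method name and body assumed from the reader above, not verbatim project code; imports as in the surrounding class):

// Sketch of the matching serializer, assuming the flag-byte layout implied above.
private void writeHeaderAndPayload(DataOutput dos) throws IOException {
    dos.writeByte((_corr != null) ? 1 : 0); // header: does a correction block follow?
    _value.write(dos);                      // payload: main block
    if (_corr != null)
        _corr.write(dos);                   // payload: optional correction block
}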
Use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
The class RmmSPInstruction, method processInstruction:
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    //get input rdds
    MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mc2 = sec.getMatrixCharacteristics(input2.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());

    //execute Spark RMM instruction
    //step 1: prepare join keys (w/ replication), i/j/k
    JavaPairRDD<TripleIndexes, MatrixBlock> tmp1 = in1.flatMapToPair(
        new RmmReplicateFunction(mc2.getCols(), mc2.getColsPerBlock(), true));
    JavaPairRDD<TripleIndexes, MatrixBlock> tmp2 = in2.flatMapToPair(
        new RmmReplicateFunction(mc1.getRows(), mc1.getRowsPerBlock(), false));

    //step 2: join prepared datasets, multiply, and aggregate per result block
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = tmp1
        .join(tmp2)                            //join by triple index (i,j,k)
        .mapToPair(new RmmMultiplyFunction()); //block matrix multiplication
    out = RDDAggregateUtils.sumByKeyStable(out, false);

    //put output RDD handle into symbol table and maintain lineage
    updateBinaryMMOutputMatrixCharacteristics(sec, true);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
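Step 1 is where the replication-based matrix multiply (RMM) scheme lives: for C = A %*% B, block A(i,k) contributes to every output block C(i,j), so it is emitted once per column-block index j of B; B(k,j) is replicated symmetrically over i. The join on (i,j,k) then pairs exactly the operands of each partial product, and sumByKeyStable sums the partials over k. A simplified, hypothetical sketch of the left-side replication (1-based block indexing; the real RmmReplicateFunction also carries block-size metadata; imports as in the surrounding class):

// Hypothetical sketch of step-1 replication for the left input of C = A %*% B.
static List<Tuple2<TripleIndexes, MatrixBlock>> replicateLeft(
        MatrixIndexes ix, MatrixBlock blk, long clenB, int bclenB) {
    long numColBlocks = (long) Math.ceil((double) clenB / bclenB); //column blocks of B
    List<Tuple2<TripleIndexes, MatrixBlock>> out = new ArrayList<>();
    long i = ix.getRowIndex(), k = ix.getColumnIndex();
    for (long j = 1; j <= numColBlocks; j++)
        out.add(new Tuple2<>(new TripleIndexes(i, j, k), blk)); //A(i,k) joins every C(i,j)
    return out;
}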
Use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
The class QuantileSortSPInstruction, method processInstruction:
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    boolean weighted = (input2 != null);

    //get input rdds
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> inW = weighted ?
        sec.getBinaryBlockRDDHandleForVariable(input2.getName()) : null;
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
    long clen = -1;
    if (!weighted) {
        //W/O WEIGHTS (default)
        out = RDDSortUtils.sortByVal(in, mc.getRows(), mc.getRowsPerBlock());
        clen = 1;
    } else {
        //W/ WEIGHTS
        out = RDDSortUtils.sortByVal(in, inW, mc.getRows(), mc.getRowsPerBlock());
        clen = 2;
    }

    //put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    if (weighted)
        sec.addLineageRDD(output.getName(), input2.getName());

    //update output matrix characteristics
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    mcOut.set(mc.getRows(), clen, mc.getRowsPerBlock(), mc.getColsPerBlock());
}
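The sort is the heavy lift; the output has one column (values) or two (values and weights), and a downstream quantile then reduces to a row lookup in the sorted result. A minimal, hypothetical sketch of that lookup for the unweighted case, assuming the common ceil(p * n) picking convention over a 1-based row index (the actual picking code is not part of this instruction):

// Hypothetical lookup against the sorted single-column output:
// for probability p in (0,1], read the row at ceil(p * n), 1-based.
// The weighted case (clen = 2) instead scans cumulative weights.
static long quantileRowIndex(double p, long n) {
    return Math.max(1, (long) Math.ceil(p * n));
}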
Use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
The class WriteSPInstruction, method processMatrixWriteInstruction:
protected void processMatrixWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi)
        throws DMLRuntimeException, IOException {
    //get input rdd
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

    if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
        //piggyback nnz maintenance on write
        LongAccumulator aNnz = null;
        if (isInputMatrixBlock && !mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }

        JavaRDD<String> header = null;
        if (oi == OutputInfo.MatrixMarketOutputInfo) {
            //first output the MatrixMarket header: format line, then rows, cols, and nnz
            ArrayList<String> headerContainer = new ArrayList<>(1);
            String headerStr = "%%MatrixMarket matrix coordinate real general\n"
                + mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
            headerContainer.add(headerStr);
            header = sec.getSparkContext().parallelize(headerContainer);
        }

        JavaRDD<String> ijv = RDDConverterUtils.binaryBlockToTextCell(in1, mc);
        if (header != null)
            customSaveTextFile(header.union(ijv), fname, true);
        else
            customSaveTextFile(ijv, fname, false);

        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros(aNnz.value());
    } else if (oi == OutputInfo.CSVOutputInfo) {
        JavaRDD<String> out = null;
        LongAccumulator aNnz = null;

        if (isInputMatrixBlock) {
            //piggyback nnz computation on actual write
            if (!mc.nnzKnown()) {
                aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
                in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
            }
            out = RDDConverterUtils.binaryBlockToCsv(in1, mc, (CSVFileFormatProperties) formatProperties, true);
        } else {
            //this case is applicable when the CSV output from transform() is written out
            //TODO remove once transform over frames supported
            @SuppressWarnings("unchecked")
            JavaPairRDD<Long, String> rdd = (JavaPairRDD<Long, String>)
                sec.getMatrixObject(input1.getName()).getRDDHandle().getRDD();
            out = rdd.values();

            String sep = ",";
            boolean hasHeader = false;
            if (formatProperties != null) {
                sep = ((CSVFileFormatProperties) formatProperties).getDelim();
                hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader();
            }
            if (hasHeader) {
                //generate default column names C1,...,Cncols (one entry per column)
                StringBuilder buf = new StringBuilder();
                for (int j = 1; j <= mc.getCols(); j++) {
                    if (j != 1)
                        buf.append(sep);
                    buf.append("C" + j);
                }
                ArrayList<String> headerContainer = new ArrayList<>(1);
                headerContainer.add(buf.toString());
                JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer);
                out = header.union(out);
            }
        }

        customSaveTextFile(out, fname, false);
        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros(aNnz.value());
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        //piggyback nnz computation on actual write
        LongAccumulator aNnz = null;
        if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }

        //save binary block rdd on hdfs
        in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        if (!mc.nnzKnown())
            mc.setNonZeros(aNnz.value());
    } else {
        //unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }

    //write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
}
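All three write paths reuse the same trick: a mapValues pass that counts nonzeros as a side effect of the write itself, so no separate Spark job is spent on nnz maintenance. A sketch of what such a pass-through counter looks like (the project ships its own ComputeBinaryBlockNnzFunction; this reconstruction assumes the standard Spark Function interface):

// Pass-through nnz counter: blocks flow on unchanged, the accumulator sums block nnz.
public class ComputeBinaryBlockNnzFunction implements Function<MatrixBlock, MatrixBlock> {
    private static final long serialVersionUID = 1L;
    private final LongAccumulator _aNnz;

    public ComputeBinaryBlockNnzFunction(LongAccumulator aNnz) {
        _aNnz = aNnz;
    }

    @Override
    public MatrixBlock call(MatrixBlock blk) throws Exception {
        _aNnz.add(blk.getNonZeros()); //side effect: maintain global nnz
        return blk;                   //return the block unchanged
    }
}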
Use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.
The class DataTransform, method performTransform:
/**
 * Main method to create and/or apply transformation metadata in-memory, on a single node.
 *
 * @param job job configuration
 * @param fs file system
 * @param inputPath path to input files
 * @param ncols number of columns
 * @param prop csv file format properties
 * @param specWithIDs JSON transform specification with IDs
 * @param tfMtdPath transform metadata path
 * @param isApply true to apply existing transformation metadata, false to first construct it
 * @param result output matrix
 * @param headerLine header line
 * @param isBB true if binary block output
 * @param isCSV true if CSV output
 * @return MR job result
 * @throws IOException if IOException occurs
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 * @throws IllegalArgumentException if IllegalArgumentException occurs
 * @throws JSONException if JSONException occurs
 */
private static JobReturn performTransform(JobConf job, FileSystem fs, String inputPath, int ncols,
        CSVFileFormatProperties prop, String specWithIDs, String tfMtdPath, boolean isApply,
        MatrixObject result, String headerLine, boolean isBB, boolean isCSV)
        throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
    String[] na = TfUtils.parseNAStrings(prop.getNAStrings());
    JSONObject spec = new JSONObject(specWithIDs);
    TfUtils agents = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), na, spec, ncols, tfMtdPath, null, null);
    MVImputeAgent _mia = agents.getMVImputeAgent();
    RecodeAgent _ra = agents.getRecodeAgent();
    BinAgent _ba = agents.getBinAgent();
    DummycodeAgent _da = agents.getDummycodeAgent();

    //list of files to read
    ArrayList<Path> files = collectInputFiles(inputPath, fs);

    // ---------------------------------
    // Construct transformation metadata
    // ---------------------------------
    String line = null;
    String[] words = null;
    int numColumnsTf = 0;
    if (!isApply) {
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
                if (fileNo == 0 && prop.hasHeader())
                    br.readLine(); //ignore header
                line = null;
                while ((line = br.readLine()) != null) {
                    agents.prepareTfMtd(line);
                }
            }
        }
        if (agents.getValid() == 0)
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);

        _mia.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ba.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ra.outputTransformationMetadata(tfMtdPath, fs, agents);

        //prepare agents for the subsequent phase of applying transformation metadata
        //(no need to loadTxMtd for _ra, since the maps are already present in memory)
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);
        _da.setRecodeMapsCP(_ra.getCPRecodeMaps());
        _da.setNumBins(_ba.getColList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    } else {
        //count the number of rows
        int[] rows = countNumRows(files, prop, fs, agents);
        agents.setTotal(rows[0]);
        agents.setValid(rows[1]);
        if (agents.getValid() == 0)
            throw new DMLRuntimeException("Number of rows in the transformed output (potentially, after omitting the ones with missing values) is zero. Cannot proceed.");

        //load transformation metadata and prepare agents for the apply phase
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ra.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);
        _da.setRecodeMaps(_ra.getRecodeMaps());
        _da.setNumBins(_ba.getColList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    }
    // -----------------------------
    // Apply transformation metadata
    // -----------------------------
    numColumnsTf = getNumColumnsTf(fs, headerLine, prop.getDelim(), tfMtdPath);
    MapReduceTool.deleteFileIfExistOnHDFS(result.getFileName());
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(result.getFileName()), true)));
    StringBuilder sb = new StringBuilder();
    try {
        MatrixBlock mb = null;
        if (isBB) {
            int estNNZ = (int) agents.getValid() * ncols; //estimated nnz upper bound
            mb = new MatrixBlock((int) agents.getValid(), numColumnsTf, estNNZ);
            if (mb.isInSparseFormat())
                mb.allocateSparseRowsBlock();
            else
                mb.allocateDenseBlock();
        }

        //rowid to be used in filling the matrix block
        int rowID = 0;
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
                if (fileNo == 0) {
                    if (prop.hasHeader())
                        br.readLine(); //ignore the header line from data file
                    //TODO: fix hard-wired header propagation to meta data column names
                    String dcdHeader = _da.constructDummycodedHeader(headerLine, agents.getDelim());
                    numColumnsTf = _da.genDcdMapsAndColTypes(fs, tfMtdPath, ncols, agents);
                    generateHeaderFiles(fs, tfMtdPath, headerLine, dcdHeader);
                }
                line = null;
                while ((line = br.readLine()) != null) {
                    words = agents.getWords(line);
                    if (!agents.omit(words)) {
                        words = agents.apply(words);
                        if (isCSV) {
                            out.write(agents.checkAndPrepOutputString(words, sb));
                            out.write("\n");
                        }
                        if (isBB) {
                            agents.check(words);
                            for (int c = 0; c < words.length; c++) {
                                if (words[c] != null && !words[c].isEmpty())
                                    mb.appendValue(rowID, c, UtilFunctions.parseToDouble(words[c]));
                            }
                        }
                        rowID++;
                    }
                }
            }
        }
        if (mb != null) {
            mb.recomputeNonZeros();
            mb.examSparsity();
            result.acquireModify(mb);
            result.release();
            result.exportData();
        }
    } finally {
        IOUtilFunctions.closeSilently(out);
    }

    MatrixCharacteristics mc = new MatrixCharacteristics(agents.getValid(), numColumnsTf,
        (int) result.getNumRowsPerBlock(), (int) result.getNumColumnsPerBlock());
    return new JobReturn(new MatrixCharacteristics[] { mc }, true);
}
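Note the three-call handoff at the end of the binary-block path: instead of writing the assembled block itself, the method hands it to the buffer pool through the MatrixObject. The same calls annotated in isolation (semantics as I understand the SystemML caching layer; a usage note, not new project code):

//standard MatrixObject handoff, as used above:
result.acquireModify(mb); //pin the object and install mb as its in-memory data
result.release();         //unpin, allowing the buffer pool to evict or spill it
result.exportData();      //persist to the file name bound to the object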