
Example 16 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class CorrMatrixBlock method readHeaderAndPayload.

private void readHeaderAndPayload(DataInput dis) throws IOException {
    boolean corrExists = (dis.readByte() != 0);
    _value = new MatrixBlock();
    _value.readFields(dis);
    if (corrExists) {
        _corr = new MatrixBlock();
        _corr.readFields(dis);
    }
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)
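
For reference, the on-disk format implied by this reader is a one-byte corr-exists flag followed by the Writable payloads. Below is a minimal sketch of the matching writer, assuming only MatrixBlock's standard Writable write(DataOutput) method; it mirrors the read logic above and is not copied from CorrMatrixBlock itself.

private void writeHeaderAndPayload(DataOutput dos) throws IOException {
    //header: single byte indicating whether a correction block follows
    dos.writeByte((_corr != null) ? 1 : 0);
    //payload: main block, then the optional correction block
    _value.write(dos);
    if (_corr != null)
        _corr.write(dos);
}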

Example 17 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RmmSPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    //get input rdds
    MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mc2 = sec.getMatrixCharacteristics(input2.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
    //execute Spark RMM instruction
    //step 1: prepare join keys (w/ replication), i/j/k 
    JavaPairRDD<TripleIndexes, MatrixBlock> tmp1 = in1.flatMapToPair(new RmmReplicateFunction(mc2.getCols(), mc2.getColsPerBlock(), true));
    JavaPairRDD<TripleIndexes, MatrixBlock> tmp2 = in2.flatMapToPair(new RmmReplicateFunction(mc1.getRows(), mc1.getRowsPerBlock(), false));
    //step 2: join prepared datasets, multiply, and aggregate
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = tmp1.join(tmp2) //join by result block
        .mapToPair(new RmmMultiplyFunction()); //do matrix multiplication
    //aggregation per result block
    out = RDDAggregateUtils.sumByKeyStable(out, false);
    //put output block into symbol table (no lineage because single block)
    updateBinaryMMOutputMatrixCharacteristics(sec, true);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), TripleIndexes (org.apache.sysml.runtime.matrix.data.TripleIndexes), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)
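
To make step 1 concrete, the sketch below shows the assumed shape of the left-hand replication (a simplified illustration, not the project's actual RmmReplicateFunction). Each block (i,k) of the first input is emitted once per column-block index j of the second input under the triple key (i,j,k), so the join co-locates exactly the block pairs that contribute to result block (i,j). Spark 2.x's iterator-based PairFlatMapFunction and SystemML's TripleIndexes(long, long, long) constructor are assumed.

private static class ReplicateLeftFunction implements PairFlatMapFunction<Tuple2<MatrixIndexes, MatrixBlock>, TripleIndexes, MatrixBlock> {

    private static final long serialVersionUID = 1L;

    //number of column blocks of the right-hand input
    private final long _numBlocks;

    public ReplicateLeftFunction(long clen, int bclen) {
        _numBlocks = (long) Math.ceil((double) clen / bclen);
    }

    @Override
    public Iterator<Tuple2<TripleIndexes, MatrixBlock>> call(Tuple2<MatrixIndexes, MatrixBlock> arg) throws Exception {
        ArrayList<Tuple2<TripleIndexes, MatrixBlock>> ret = new ArrayList<>();
        long i = arg._1().getRowIndex();
        long k = arg._1().getColumnIndex();
        //replicate block (i,k) once per result column block j
        for (long j = 1; j <= _numBlocks; j++)
            ret.add(new Tuple2<>(new TripleIndexes(i, j, k), arg._2()));
        return ret.iterator();
    }
}

After the join on (i,j,k), each co-located pair is multiplied, and sumByKeyStable folds the k dimension away, leaving one aggregated block per result index (i,j).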

Example 18 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class QuantileSortSPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    boolean weighted = (input2 != null);
    //get input rdds
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> inW = weighted ? sec.getBinaryBlockRDDHandleForVariable(input2.getName()) : null;
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
    long clen = -1;
    if (!weighted) {
        //W/O WEIGHTS (default)
        out = RDDSortUtils.sortByVal(in, mc.getRows(), mc.getRowsPerBlock());
        clen = 1;
    } else {
        //W/ WEIGHTS
        out = RDDSortUtils.sortByVal(in, inW, mc.getRows(), mc.getRowsPerBlock());
        clen = 2;
    }
    //put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    if (weighted)
        sec.addLineageRDD(output.getName(), input2.getName());
    //update output matrix characteristics
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    mcOut.set(mc.getRows(), clen, mc.getRowsPerBlock(), mc.getColsPerBlock());
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)
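
The actual sorting happens inside RDDSortUtils.sortByVal, which at its core performs a global distributed sort by cell value. A stripped-down, hypothetical helper showing just that core idea (the real utility additionally handles matrix blocking, index bookkeeping, and the weighted variant):

//Global value sort via Spark's range-partitioned sortByKey (Spark 2.x Java API).
public static JavaRDD<Double> sortValues(JavaRDD<Double> in) {
    return in.mapToPair(v -> new Tuple2<>(v, (Void) null)) //value becomes the sort key
        .sortByKey(true) //ascending global sort
        .keys();
}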

Example 19 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class WriteSPInstruction method processMatrixWriteInstruction.

protected void processMatrixWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi) throws DMLRuntimeException, IOException {
    //get input rdd
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
        //piggyback nnz maintenance on write
        LongAccumulator aNnz = null;
        if (isInputMatrixBlock && !mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        JavaRDD<String> header = null;
        if (oi == OutputInfo.MatrixMarketOutputInfo) {
            ArrayList<String> headerContainer = new ArrayList<String>(1);
            // First output the MM header: format line, then number of rows, columns, and nnz
            String headerStr = "%%MatrixMarket matrix coordinate real general\n"
                + mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
            headerContainer.add(headerStr);
            header = sec.getSparkContext().parallelize(headerContainer);
        }
        JavaRDD<String> ijv = RDDConverterUtils.binaryBlockToTextCell(in1, mc);
        if (header != null)
            customSaveTextFile(header.union(ijv), fname, true);
        else
            customSaveTextFile(ijv, fname, false);
        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros(aNnz.value());
    } else if (oi == OutputInfo.CSVOutputInfo) {
        JavaRDD<String> out = null;
        LongAccumulator aNnz = null;
        if (isInputMatrixBlock) {
            //piggyback nnz computation on actual write
            if (!mc.nnzKnown()) {
                aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
                in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
            }
            out = RDDConverterUtils.binaryBlockToCsv(in1, mc, (CSVFileFormatProperties) formatProperties, true);
        } else {
            // This case is applicable when the CSV output from transform() is written out
            // TODO remove once transform over frames supported
            @SuppressWarnings("unchecked") JavaPairRDD<Long, String> rdd = (JavaPairRDD<Long, String>) (sec.getMatrixObject(input1.getName())).getRDDHandle().getRDD();
            out = rdd.values();
            String sep = ",";
            boolean hasHeader = false;
            if (formatProperties != null) {
                sep = ((CSVFileFormatProperties) formatProperties).getDelim();
                hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader();
            }
            if (hasHeader) {
                StringBuilder buf = new StringBuilder();
                for (int j = 1; j < mc.getCols(); j++) {
                    if (j != 1) {
                        buf.append(sep);
                    }
                    buf.append("C" + j);
                }
                ArrayList<String> headerContainer = new ArrayList<String>(1);
                headerContainer.add(0, buf.toString());
                JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer);
                out = header.union(out);
            }
        }
        customSaveTextFile(out, fname, false);
        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros(aNnz.value());
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        //piggyback nnz computation on actual write
        LongAccumulator aNnz = null;
        if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        //save binary block rdd on hdfs
        in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        if (!mc.nnzKnown())
            mc.setNonZeros(aNnz.value());
    } else {
        //unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }
    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), ArrayList (java.util.ArrayList), ComputeBinaryBlockNnzFunction (org.apache.sysml.runtime.instructions.spark.functions.ComputeBinaryBlockNnzFunction), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), JavaRDD (org.apache.spark.api.java.JavaRDD), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), LongAccumulator (org.apache.spark.util.LongAccumulator), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)
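
As a concrete instance of the MatrixMarket branch above, for a hypothetical 1000 x 10 matrix with 500 non-zeros the single-element header RDD would contain:

%%MatrixMarket matrix coordinate real general
1000 10 500

The subsequent union prepends this header to the "i j v" text-cell lines before the save.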

Example 20 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class DataTransform method performTransform.

/**
	 * Main method to create and/or apply transformation metadata in-memory, on a single node.
	 * 
	 * @param job job configuration
	 * @param fs file system
	 * @param inputPath path to input files
	 * @param ncols number of columns
	 * @param prop csv file format properties
	 * @param specWithIDs JSON transform specification with IDs
	 * @param tfMtdPath transform metadata path
	 * @param isApply true if applying previously constructed transformation metadata, false if constructing it first
	 * @param result output matrix
	 * @param headerLine header line
	 * @param isBB true if binary block
	 * @param isCSV true if CSV
	 * @return MR job result
	 * @throws IOException if IOException occurs
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 * @throws IllegalArgumentException if IllegalArgumentException occurs
	 * @throws JSONException if JSONException occurs
	 */
private static JobReturn performTransform(JobConf job, FileSystem fs, String inputPath, int ncols, CSVFileFormatProperties prop, String specWithIDs, String tfMtdPath, boolean isApply, MatrixObject result, String headerLine, boolean isBB, boolean isCSV) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
    String[] na = TfUtils.parseNAStrings(prop.getNAStrings());
    JSONObject spec = new JSONObject(specWithIDs);
    TfUtils agents = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), na, spec, ncols, tfMtdPath, null, null);
    MVImputeAgent _mia = agents.getMVImputeAgent();
    RecodeAgent _ra = agents.getRecodeAgent();
    BinAgent _ba = agents.getBinAgent();
    DummycodeAgent _da = agents.getDummycodeAgent();
    // List of files to read
    ArrayList<Path> files = collectInputFiles(inputPath, fs);
    // ---------------------------------
    // Construct transformation metadata
    // ---------------------------------
    String line = null;
    String[] words = null;
    int numColumnsTf = 0;
    if (!isApply) {
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
                if (fileNo == 0 && prop.hasHeader())
                    //ignore header
                    br.readLine();
                line = null;
                while ((line = br.readLine()) != null) {
                    agents.prepareTfMtd(line);
                }
            }
        }
        if (agents.getValid() == 0)
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
        _mia.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ba.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ra.outputTransformationMetadata(tfMtdPath, fs, agents);
        // prepare agents for the subsequent phase of applying transformation metadata
        // NO need to loadTxMtd for _ra, since the maps are already present in the memory
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);
        _da.setRecodeMapsCP(_ra.getCPRecodeMaps());
        _da.setNumBins(_ba.getColList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    } else {
        // Count the number of rows
        int[] rows = countNumRows(files, prop, fs, agents);
        agents.setTotal(rows[0]);
        agents.setValid(rows[1]);
        if (agents.getValid() == 0)
            throw new DMLRuntimeException("Number of rows in the transformed output (potentially, after omitting the ones with missing values) is zero. Cannot proceed.");
        // Load transformation metadata
        // prepare agents for the subsequent phase of applying transformation metadata
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ra.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);
        _da.setRecodeMaps(_ra.getRecodeMaps());
        _da.setNumBins(_ba.getColList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    }
    // -----------------------------
    // Apply transformation metadata
    // -----------------------------
    numColumnsTf = getNumColumnsTf(fs, headerLine, prop.getDelim(), tfMtdPath);
    MapReduceTool.deleteFileIfExistOnHDFS(result.getFileName());
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(result.getFileName()), true)));
    StringBuilder sb = new StringBuilder();
    try {
        MatrixBlock mb = null;
        if (isBB) {
            int estNNZ = (int) agents.getValid() * ncols;
            mb = new MatrixBlock((int) agents.getValid(), numColumnsTf, estNNZ);
            if (mb.isInSparseFormat())
                mb.allocateSparseRowsBlock();
            else
                mb.allocateDenseBlock();
        }
        // rowid to be used in filling the matrix block
        int rowID = 0;
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))))) {
                if (fileNo == 0) {
                    if (prop.hasHeader())
                        // ignore the header line from data file
                        br.readLine();
                    //TODO: fix hard-wired header propagation to meta data column names
                    String dcdHeader = _da.constructDummycodedHeader(headerLine, agents.getDelim());
                    numColumnsTf = _da.genDcdMapsAndColTypes(fs, tfMtdPath, ncols, agents);
                    generateHeaderFiles(fs, tfMtdPath, headerLine, dcdHeader);
                }
                line = null;
                while ((line = br.readLine()) != null) {
                    words = agents.getWords(line);
                    if (!agents.omit(words)) {
                        words = agents.apply(words);
                        if (isCSV) {
                            out.write(agents.checkAndPrepOutputString(words, sb));
                            out.write("\n");
                        }
                        if (isBB) {
                            agents.check(words);
                            for (int c = 0; c < words.length; c++) {
                                //skip missing values; append everything else
                                if (words[c] != null && !words[c].isEmpty())
                                    mb.appendValue(rowID, c, UtilFunctions.parseToDouble(words[c]));
                            }
                        }
                        rowID++;
                    }
                }
            }
        }
        if (mb != null) {
            mb.recomputeNonZeros();
            mb.examSparsity();
            result.acquireModify(mb);
            result.release();
            result.exportData();
        }
    } finally {
        IOUtilFunctions.closeSilently(out);
    }
    MatrixCharacteristics mc = new MatrixCharacteristics(agents.getValid(), numColumnsTf, (int) result.getNumRowsPerBlock(), (int) result.getNumColumnsPerBlock());
    JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, true);
    return ret;
}
Also used: Path (org.apache.hadoop.fs.Path), MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), InputStreamReader (java.io.InputStreamReader), JobReturn (org.apache.sysml.runtime.matrix.JobReturn), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), BufferedWriter (java.io.BufferedWriter), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), JSONObject (org.apache.wink.json4j.JSONObject), BufferedReader (java.io.BufferedReader), OutputStreamWriter (java.io.OutputStreamWriter)
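
The MatrixBlock population pattern used above (allocate, raw appendValue calls, then metadata fix-up) is the standard single-node fill idiom. Condensed, with made-up dimensions and values for illustration:

//Minimal sketch; API as used in the method above, dimensions are illustrative.
MatrixBlock mb = new MatrixBlock(3, 2, false); //3x2 block, start dense
mb.allocateDenseBlock();
mb.appendValue(0, 0, 1.0); //append cells in row-major order
mb.appendValue(1, 1, 2.5);
mb.recomputeNonZeros();    //recompute nnz after raw appends
mb.examSparsity();         //switch to a sparse representation if worthwhile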

Aggregations

MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 393 usages
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 121 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 105 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 87 usages
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 44 usages
IOException (java.io.IOException): 43 usages
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 38 usages
CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock): 34 usages
ArrayList (java.util.ArrayList): 33 usages
Path (org.apache.hadoop.fs.Path): 25 usages
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 23 usages
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 21 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 20 usages
JobConf (org.apache.hadoop.mapred.JobConf): 17 usages
Tuple2 (scala.Tuple2): 17 usages
SequenceFile (org.apache.hadoop.io.SequenceFile): 14 usages
MatrixReader (org.apache.sysml.runtime.io.MatrixReader): 14 usages
TestConfiguration (org.apache.sysml.test.integration.TestConfiguration): 13 usages
RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM): 12 usages
MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData): 12 usages