Search in sources :

Example 6 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class ResultMergeLocalFile method createTextCellStagingFile.

private void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LinkedList<Cell> buffer = new LinkedList<Cell>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    //NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col but we require long for larger matrices.
    // Since, textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode:binarycell)
    // we just propose the to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                //reset tokenizer
                st.reset(value.toString());
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());
                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                if (//periodic flush
                buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }
            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) LinkedList(java.util.LinkedList) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell)

Example 7 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class RemoteDPParForSpark method getPartitionedInput.

@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) throws DMLRuntimeException {
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    //NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> in = dsObj.getDataset();
        //construct or reuse row ids
        JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ? in.javaRDD().mapToPair(new DataFrameExtractIDFunction(in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) : //zip row index
        in.javaRDD().zipWithIndex();
        //convert row to row in matrix block format 
        return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    } else //binary block input rdd without grouping
    if (!requiresGrouping(dpf, mo)) {
        //get input rdd and data partitioning 
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    } else //default binary block input rdd with grouping
    {
        //get input rdd, avoid unnecessary caching if input is checkpoint and not cached yet
        //to reduce memory pressure for shuffle and subsequent 
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
            in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        //data partitioning of input rdd 
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject) DataFrameExtractIDFunction(org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils.DataFrameExtractIDFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Row(org.apache.spark.sql.Row)

Example 8 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class ResultMergeLocalFile method createBinaryCellResultFile.

@SuppressWarnings("deprecation")
private void createBinaryCellResultFile(String fnameStaging, String fnameStagingCompare, String fnameNew, MatrixFormatMetaData metadata, boolean withCompare) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fnameNew);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    MatrixCharacteristics mc = metadata.getMatrixCharacteristics();
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    MatrixIndexes indexes = new MatrixIndexes(1, 1);
    MatrixCell cell = new MatrixCell(0);
    //beware ca 50ms
    SequenceFile.Writer out = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixCell.class);
    try {
        boolean written = false;
        for (long brow = 1; brow <= (long) Math.ceil(rlen / (double) brlen); brow++) for (long bcol = 1; bcol <= (long) Math.ceil(clen / (double) bclen); bcol++) {
            File dir = new File(fnameStaging + "/" + brow + "_" + bcol);
            File dir2 = new File(fnameStagingCompare + "/" + brow + "_" + bcol);
            MatrixBlock mb = null;
            long row_offset = (brow - 1) * brlen + 1;
            long col_offset = (bcol - 1) * bclen + 1;
            if (dir.exists()) {
                if (//WITH COMPARE BLOCK
                withCompare && dir2.exists()) {
                    //copy only values that are different from the original
                    String[] lnames2 = dir2.list();
                    if (//there should be exactly 1 compare block
                    lnames2.length != 1)
                        throw new DMLRuntimeException("Unable to merge results because multiple compare blocks found.");
                    mb = StagingFileUtils.readCellList2BlockFromLocal(dir2 + "/" + lnames2[0], brlen, bclen);
                    boolean appendOnly = mb.isInSparseFormat();
                    double[][] compare = DataConverter.convertToDoubleMatrix(mb);
                    String[] lnames = dir.list();
                    for (String lname : lnames) {
                        MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
                        mergeWithComp(mb, tmp, compare);
                    }
                    //sort sparse due to append-only
                    if (appendOnly)
                        mb.sortSparseRows();
                    //change sparsity if required after 
                    mb.examSparsity();
                } else //WITHOUT COMPARE BLOCK
                {
                    //copy all non-zeros from all workers
                    String[] lnames = dir.list();
                    boolean appendOnly = false;
                    for (String lname : lnames) {
                        if (mb == null) {
                            mb = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
                            appendOnly = mb.isInSparseFormat();
                        } else {
                            MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
                            mergeWithoutComp(mb, tmp, appendOnly);
                        }
                    }
                    //sort sparse due to append-only
                    if (appendOnly)
                        mb.sortSparseRows();
                    //change sparsity if required after 
                    mb.examSparsity();
                }
            }
            //write the block to binary cell
            if (mb != null) {
                if (mb.isInSparseFormat()) {
                    Iterator<IJV> iter = mb.getSparseBlockIterator();
                    while (iter.hasNext()) {
                        IJV lcell = iter.next();
                        indexes.setIndexes(row_offset + lcell.getI(), col_offset + lcell.getJ());
                        cell.setValue(lcell.getV());
                        out.append(indexes, cell);
                        written = true;
                    }
                } else {
                    for (int i = 0; i < brlen; i++) for (int j = 0; j < bclen; j++) {
                        double lvalue = mb.getValueDenseUnsafe(i, j);
                        if (//for nnz
                        lvalue != 0) {
                            indexes.setIndexes(row_offset + i, col_offset + j);
                            cell.setValue(lvalue);
                            out.append(indexes, cell);
                            written = true;
                        }
                    }
                }
            }
        }
        if (!written)
            out.append(indexes, cell);
    } finally {
        IOUtilFunctions.closeSilently(out);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) SequenceFile(org.apache.hadoop.io.SequenceFile) IJV(org.apache.sysml.runtime.matrix.data.IJV) FileSystem(org.apache.hadoop.fs.FileSystem) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Iterator(java.util.Iterator) JobConf(org.apache.hadoop.mapred.JobConf) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter)

Example 9 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class ResultMergeLocalAutomatic method executeSerialMerge.

@Override
public MatrixObject executeSerialMerge() throws DMLRuntimeException {
    Timing time = new Timing(true);
    MatrixCharacteristics mc = _output.getMatrixCharacteristics();
    long rows = mc.getRows();
    long cols = mc.getCols();
    if (OptimizerRuleBased.isInMemoryResultMerge(rows, cols, OptimizerUtils.getLocalMemBudget()))
        _rm = new ResultMergeLocalMemory(_output, _inputs, _outputFName);
    else
        _rm = new ResultMergeLocalFile(_output, _inputs, _outputFName);
    MatrixObject ret = _rm.executeSerialMerge();
    LOG.trace("Automatic result merge (" + _rm.getClass().getName() + ") executed in " + time.stop() + "ms.");
    return ret;
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 10 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class ReblockSPInstruction method processMatrixReblockInstruction.

@SuppressWarnings("unchecked")
protected void processMatrixReblockInstruction(SparkExecutionContext sec, InputInfo iinfo) throws DMLRuntimeException {
    MatrixObject mo = sec.getMatrixObject(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (iinfo == InputInfo.TextCellInputInfo || iinfo == InputInfo.MatrixMarketInputInfo) {
        //check jdk version (prevent double.parseDouble contention on <jdk8)
        sec.checkAndRaiseValidationWarningJDKVersion();
        //get the input textcell rdd
        JavaPairRDD<LongWritable, Text> lines = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
        //convert textcell to binary block
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDConverterUtils.textCellToBinaryBlock(sec.getSparkContext(), lines, mcOut, outputEmptyBlocks);
        //put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else if (iinfo == InputInfo.CSVInputInfo) {
        // HACK ALERT: Until we introduces the rewrite to insert csvrblock for non-persistent read
        // throw new DMLRuntimeException("CSVInputInfo is not supported for ReblockSPInstruction");
        CSVReblockSPInstruction csvInstruction = null;
        boolean hasHeader = false;
        String delim = ",";
        boolean fill = false;
        double fillValue = 0;
        if (mo.getFileFormatProperties() instanceof CSVFileFormatProperties && mo.getFileFormatProperties() != null) {
            CSVFileFormatProperties props = (CSVFileFormatProperties) mo.getFileFormatProperties();
            hasHeader = props.hasHeader();
            delim = props.getDelim();
            fill = props.isFill();
            fillValue = props.getFillValue();
        }
        csvInstruction = new CSVReblockSPInstruction(null, input1, output, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), hasHeader, delim, fill, fillValue, "csvrblk", instString);
        csvInstruction.processInstruction(sec);
        return;
    } else if (iinfo == InputInfo.BinaryCellInputInfo) {
        JavaPairRDD<MatrixIndexes, MatrixCell> binaryCells = (JavaPairRDD<MatrixIndexes, MatrixCell>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), binaryCells, mcOut, outputEmptyBlocks);
        //put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else if (iinfo == InputInfo.BinaryBlockInputInfo) {
        //BINARY BLOCK <- BINARY BLOCK (different sizes)
        JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.flatMapToPair(new ExtractBlockForBinaryReblock(mc, mcOut));
        out = RDDAggregateUtils.mergeByKey(out, false);
        //put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else {
        throw new DMLRuntimeException("The given InputInfo is not implemented " + "for ReblockSPInstruction:" + InputInfo.inputInfoToString(iinfo));
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Text(org.apache.hadoop.io.Text) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) ExtractBlockForBinaryReblock(org.apache.sysml.runtime.instructions.spark.functions.ExtractBlockForBinaryReblock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) LongWritable(org.apache.hadoop.io.LongWritable)

Aggregations

MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)258 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)87 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)85 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)57 MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData)52 TestConfiguration (org.apache.sysml.test.integration.TestConfiguration)48 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)41 RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM)36 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)35 CellIndex (org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex)34 IOException (java.io.IOException)27 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)24 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)21 ArrayList (java.util.ArrayList)19 ValueType (org.apache.sysml.parser.Expression.ValueType)19 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)16 Path (org.apache.hadoop.fs.Path)13 OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo)13 Test (org.junit.Test)13 LongWritable (org.apache.hadoop.io.LongWritable)12