Search in sources :

Example 56 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RemoteDPParForSpark method getPartitionedInput.

/**
 * Builds the partitioned input RDD for the given matrix variable.
 *
 * Three input shapes are handled: a matrix backed by a dataset (rows are
 * converted directly), a binary block RDD that needs no grouping, and a
 * binary block RDD that needs grouping. The two binary block cases share
 * the same partitioning function and differ only in which RDD is used.
 *
 * @param sec spark execution context used to look up the matrix and its RDD
 * @param matrixvar name of the matrix variable to partition
 * @param oi output info handed to the data partitioner
 * @param dpf partition format (its _dpf and _N fields are passed on)
 * @return pair RDD produced by the row conversion or the data partitioner
 */
@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) {
    final InputInfo ii = InputInfo.BinaryBlockInputInfo;
    final MatrixObject mo = sec.getMatrixObject(matrixvar);
    final MatrixCharacteristics mc = mo.getMatrixCharacteristics();

    // dataset input
    // NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> frame = dsObj.getDataset();
        // reuse the existing row ids if available, otherwise zip with the row index
        JavaPairRDD<Row, Long> indexedRows;
        if (dsObj.containsID()) {
            indexedRows = frame.javaRDD().mapToPair(new DataFrameExtractIDFunction(frame.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN)));
        } else {
            indexedRows = frame.javaRDD().zipWithIndex();
        }
        // convert each row into a row in matrix block format
        return indexedRows.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    }

    // binary block input: decide on grouping first, then fetch the input rdd
    final boolean grouping = requiresGrouping(dpf, mo);
    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
    if (grouping) {
        // avoid unnecessary caching if the input is a checkpoint that is not cached yet,
        // to reduce memory pressure for shuffle and subsequent
        boolean uncachedCheckpoint = mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(blocks.id());
        if (uncachedCheckpoint) {
            blocks = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        }
    }

    // data partitioning of the selected input rdd
    DataPartitionerRemoteSparkMapper partitioner = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
    return blocks.flatMapToPair(partitioner);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject) DataFrameExtractIDFunction(org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils.DataFrameExtractIDFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Row(org.apache.spark.sql.Row)

Example 57 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RemoteDPParForSparkWorker method collectBinaryCellInput.

/**
 * Collects a matrixblock partition from a given input iterable over
 * binary cells.
 *
 * A new block is allocated on every call; its orientation and sparse flag
 * follow the configured _tSparseCol setting. Cells with a negative index in
 * the partitioned dimension are skipped (they only exist to ensure empty
 * partitions).
 *
 * @param valueList iterable writables, each expected to be a PairWritableCell
 * @return matrix block holding the collected cells
 * @throws IOException if the partition format is unsupported or
 *         post-processing of the block fails
 */
private MatrixBlock collectBinaryCellInput(Iterable<Writable> valueList) throws IOException {
    // allocate the output block, keep configured representation
    MatrixBlock partition = _tSparseCol ? new MatrixBlock(_clen, _rlen, true) : new MatrixBlock(_rlen, _clen, false);
    // NOTE: the enhanced for loops below obtain a single iterator per pass;
    // calling valueList.iterator() for every hasNext()/next() would restart
    // the iteration on any re-iterable input and never terminate.
    switch(_dpf) {
        case ROW_WISE:
            for (Writable value : valueList) {
                PairWritableCell pairValue = (PairWritableCell) value;
                if (pairValue.indexes.getColumnIndex() < 0)
                    // cells used to ensure empty partitions
                    continue;
                partition.quickSetValue(0, (int) pairValue.indexes.getColumnIndex() - 1, pairValue.cell.getValue());
            }
            break;
        case COLUMN_WISE:
            for (Writable value : valueList) {
                PairWritableCell pairValue = (PairWritableCell) value;
                if (pairValue.indexes.getRowIndex() < 0)
                    // cells used to ensure empty partitions
                    continue;
                if (_tSparseCol)
                    // transposed sparse column: append into row 0, sorted afterwards
                    partition.appendValue(0, (int) pairValue.indexes.getRowIndex() - 1, pairValue.cell.getValue());
                else
                    partition.quickSetValue((int) pairValue.indexes.getRowIndex() - 1, 0, pairValue.cell.getValue());
            }
            break;
        default:
            throw new IOException("Partition format not yet supported in fused partition-execute: " + _dpf);
    }
    // post-processing: cleanups if required
    try {
        // appended values are unordered, so sparse rows need sorting
        if (partition.isInSparseFormat() && _tSparseCol)
            partition.sortSparseRows();
        partition.recomputeNonZeros();
        partition.examSparsity();
    } catch (DMLRuntimeException ex) {
        throw new IOException(ex);
    }
    return partition;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) IOException(java.io.IOException) PairWritableCell(org.apache.sysml.runtime.controlprogram.parfor.util.PairWritableCell) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 58 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RemoteDPParForSparkWorker method call.

/**
 * Executes the parfor body once per matrix partition delivered by the
 * input iterator and returns the exported result variables.
 *
 * @param arg0 iterator over (iteration value, cells or blocks of the partition) pairs
 * @return iterator over (worker id, exported result variable) pairs
 * @throws Exception if collecting a partition, executing a task, or exporting results fails
 */
@Override
public Iterator<Tuple2<Long, String>> call(Iterator<Tuple2<Long, Iterable<Writable>>> arg0) throws Exception {
    // lazy parworker initialization
    configureWorker(TaskContext.get().taskAttemptId());

    // process all matrix partitions of this data partition
    MatrixBlock partition = null;
    while (arg0.hasNext()) {
        final Tuple2<Long, Iterable<Writable>> entry = arg0.next();

        // collect input partition (check via equals because oinfo deserialized instance);
        // the binary block path receives the previously collected block
        partition = _oinfo.equals(OutputInfo.BinaryBlockOutputInfo)
            ? collectBinaryBlock(entry._2(), partition)
            : collectBinaryCellInput(entry._2());

        // update in-memory matrix partition
        _ec.getMatrixObject(_inputVar).setInMemoryPartition(partition);

        // create a single-iteration task for this partition
        Task task = new Task(_iterVar, TaskType.SET);
        task.addIteration(new IntObject(entry._1()));

        // execute program and maintain accumulators
        final long itersBefore = getExecutedIterations();
        super.executeTask(task);
        _aTasks.add(1);
        _aIters.add((int) (getExecutedIterations() - itersBefore));
    }

    // write output if required (matrix indexed write)
    ArrayList<Tuple2<Long, String>> ret = new ArrayList<>();
    ArrayList<String> exported = RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars);
    for (String var : exported) {
        ret.add(new Tuple2<>(_workerID, var));
    }
    return ret.iterator();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ArrayList(java.util.ArrayList) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) Tuple2(scala.Tuple2)

Example 59 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RemoteDPParWorkerReducer method configure.

/**
 * Initializes this reducer from the given job configuration: reads the data
 * partitioning settings, allocates the reuse partition block, parses the
 * parfor body into a local runtime program, and sets up caching and
 * statistics state.
 *
 * Any exception raised while configuring the parworker is rethrown wrapped
 * in a RuntimeException.
 *
 * @param job job configuration to read all settings from
 */
@Override
public void configure(JobConf job) {
    // Step 1: configure data partitioning information
    _dpf = MRJobConfiguration.getPartitioningFormat(job);
    MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
    PartitionFormat pf = new PartitionFormat(_dpf, MRJobConfiguration.getPartitioningSizeN(job));
    // dimensions of a single partition as reported by the partition format
    _rlen = (int) pf.getNumRows(mc);
    _clen = (int) pf.getNumColumns(mc);
    _brlen = mc.getRowsPerBlock();
    _bclen = mc.getColsPerBlock();
    _iterVar = MRJobConfiguration.getPartitioningItervar(job);
    _inputVar = MRJobConfiguration.getPartitioningMatrixvar(job);
    _info = MRJobConfiguration.getPartitioningOutputInfo(job);
    _tSparseCol = MRJobConfiguration.getPartitioningTransposedCol(job);
    // allocate the partition block: swapped dimensions and sparse flag set
    // if transposed sparse columns are configured, dense layout otherwise
    if (_tSparseCol)
        _partition = new MatrixBlock((int) _clen, _rlen, true);
    else
        _partition = new MatrixBlock((int) _rlen, _clen, false);
    // Step 2: configure parworker
    String taskID = job.get(MRConfigurationNames.MR_TASK_ID);
    LOG.trace("configure RemoteDPParWorkerReducer " + taskID);
    try {
        _stringID = taskID;
        // int task ID
        _workerID = IDHandler.extractIntID(_stringID);
        // outside of local mode, register the given job configuration as the cached job conf
        // NOTE(review): the original comment here starts mid-sentence; the remaining text reads:
        // in the context of mr jobs (for example this config points to local fs instead of hdfs by default).
        if (!InfrastructureAnalyzer.isLocalMode(job)) {
            ConfigurationManager.setCachedJobConf(job);
        }
        // create local runtime program
        String in = MRJobConfiguration.getProgramBlocks(job);
        ParForBody body = ProgramConverter.parseParForBody(in, (int) _workerID);
        _childBlocks = body.getChildBlocks();
        _ec = body.getEc();
        _resultVars = body.getResultVariables();
        // init local cache manager (only if caching is not active yet)
        if (!CacheableData.isCachingActive()) {
            String uuid = IDHandler.createDistributedUniqueID();
            LocalFileUtils.createWorkingDirectoryWithUUID(uuid);
            // incl activation, cache dir creation (each map task gets its own dir for simplified cleanup)
            CacheableData.initCaching(uuid);
        }
        // append the worker id to the eviction file prefix unless the prefix already contains '_'
        if (!CacheableData.cacheEvictionLocalFilePrefix.contains("_")) {
            // account for local mode
            CacheableData.cacheEvictionLocalFilePrefix = CacheableData.cacheEvictionLocalFilePrefix + "_" + _workerID;
        }
        // ensure that resultvar files are not removed
        super.pinResultVariables();
        // enable/disable caching (if required)
        boolean cpCaching = MRJobConfiguration.getParforCachingConfig(job);
        if (!cpCaching)
            CacheableData.disableCaching();
        // reset task and iteration counters
        _numTasks = 0;
        _numIters = 0;
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    // disable parfor stat monitoring, reporting execution times via counters not useful
    StatisticMonitor.disableStatMonitoring();
    // always reset stats because counters per map task (for case of JVM reuse)
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job))
        Statistics.reset();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) PartitionFormat(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PartitionFormat) PDataPartitionFormat(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 60 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class ResultMergeLocalFile method createTextCellResultFile.

/**
 * Merges the staged cell lists of all blocks and writes the result as a
 * single text cell file.
 *
 * For every block position the staging directory {@code fnameStaging/brow_bcol}
 * is inspected. If it exists, all files in it are merged into one block,
 * either against the single compare block found under
 * {@code fnameStagingCompare/brow_bcol} (if requested and present) or
 * without comparison. All non-zero cells of the merged block are written as
 * "row col value" lines using global cell indexes. If no cell was written
 * at all, a single empty text line marker is written instead.
 *
 * @param fnameStaging local staging directory of the worker results
 * @param fnameStagingCompare local staging directory of the compare blocks
 * @param fnameNew file name of the text cell file to create (overwritten if existing)
 * @param metadata provides the matrix dimensions and block sizes
 * @param withCompare true if results should be merged against the compare block
 * @throws IOException if the output cannot be written or a staging directory cannot be listed
 * @throws DMLRuntimeException if the number of compare blocks for a block is not exactly one
 */
private void createTextCellResultFile(String fnameStaging, String fnameStagingCompare, String fnameNew, MetaDataFormat metadata, boolean withCompare) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fnameNew);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    MatrixCharacteristics mc = metadata.getMatrixCharacteristics();
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    // number of blocks per dimension (loop-invariant, computed once)
    long numRowBlocks = (long) Math.ceil(rlen / (double) brlen);
    long numColBlocks = (long) Math.ceil(clen / (double) bclen);
    try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)))) {
        // for obj reuse and preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();
        boolean written = false;
        for (long brow = 1; brow <= numRowBlocks; brow++) {
            for (long bcol = 1; bcol <= numColBlocks; bcol++) {
                File dir = new File(fnameStaging + "/" + brow + "_" + bcol);
                File dir2 = new File(fnameStagingCompare + "/" + brow + "_" + bcol);
                MatrixBlock mb = null;
                // global (1-based) index of the first cell of this block
                long row_offset = (brow - 1) * brlen + 1;
                long col_offset = (bcol - 1) * bclen + 1;
                if (dir.exists()) {
                    if (withCompare && dir2.exists()) {
                        // WITH COMPARE BLOCK: copy only values that are different from the original
                        String[] lnames2 = listStagedFiles(dir2);
                        // there should be exactly 1 compare block
                        if (lnames2.length != 1)
                            throw new DMLRuntimeException("Unable to merge results because multiple compare blocks found.");
                        mb = StagingFileUtils.readCellList2BlockFromLocal(dir2 + "/" + lnames2[0], brlen, bclen);
                        boolean appendOnly = mb.isInSparseFormat();
                        DenseBlock compare = DataConverter.convertToDenseBlock(mb, false);
                        for (String lname : listStagedFiles(dir)) {
                            MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
                            mergeWithComp(mb, tmp, compare);
                        }
                        // sort sparse and exam sparsity due to append-only
                        if (appendOnly && !_isAccum)
                            mb.sortSparseRows();
                        // change sparsity if required after
                        mb.examSparsity();
                    } else {
                        // WITHOUT COMPARE BLOCK: copy all non-zeros from all workers
                        boolean appendOnly = false;
                        for (String lname : listStagedFiles(dir)) {
                            if (mb == null) {
                                // first file becomes the merge target
                                mb = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
                                appendOnly = mb.isInSparseFormat();
                            } else {
                                MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
                                mergeWithoutComp(mb, tmp, appendOnly);
                            }
                        }
                        // an empty staging directory yields no block; nothing to post-process then
                        if (mb != null) {
                            // sort sparse due to append-only
                            if (appendOnly && !_isAccum)
                                mb.sortSparseRows();
                            // change sparsity if required after
                            mb.examSparsity();
                        }
                    }
                }
                // write the block to text cell
                if (mb != null) {
                    if (mb.isInSparseFormat()) {
                        Iterator<IJV> iter = mb.getSparseBlockIterator();
                        while (iter.hasNext()) {
                            IJV lcell = iter.next();
                            sb.append(row_offset + lcell.getI());
                            sb.append(' ');
                            sb.append(col_offset + lcell.getJ());
                            sb.append(' ');
                            sb.append(lcell.getV());
                            sb.append('\n');
                            out.write(sb.toString());
                            sb.setLength(0);
                            written = true;
                        }
                    } else {
                        for (int i = 0; i < brlen; i++) {
                            for (int j = 0; j < bclen; j++) {
                                double lvalue = mb.getValueDenseUnsafe(i, j);
                                // write non-zero values only
                                if (lvalue != 0) {
                                    sb.append(row_offset + i);
                                    sb.append(' ');
                                    sb.append(col_offset + j);
                                    sb.append(' ');
                                    sb.append(lvalue);
                                    sb.append('\n');
                                    out.write(sb.toString());
                                    sb.setLength(0);
                                    written = true;
                                }
                            }
                        }
                    }
                }
            }
        }
        // ensure a non-empty output file even if no cell was written
        if (!written)
            out.write(IOUtilFunctions.EMPTY_TEXT_LINE);
    }
}

/**
 * Lists the entries of a staging directory, failing with a descriptive
 * IOException instead of a NullPointerException if the listing is unavailable.
 *
 * @param dir staging directory to list
 * @return names of the entries in the directory, never null
 * @throws IOException if the path is not a directory or cannot be read
 */
private static String[] listStagedFiles(File dir) throws IOException {
    String[] names = dir.list();
    if (names == null)
        throw new IOException("Unable to list staging directory: " + dir);
    return names;
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) BufferedWriter(java.io.BufferedWriter) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DenseBlock(org.apache.sysml.runtime.matrix.data.DenseBlock) IJV(org.apache.sysml.runtime.matrix.data.IJV) FileSystem(org.apache.hadoop.fs.FileSystem) Iterator(java.util.Iterator) OutputStreamWriter(java.io.OutputStreamWriter) JobConf(org.apache.hadoop.mapred.JobConf) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File)

Aggregations

MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)459 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)142 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)111 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)102 CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock)48 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)48 IOException (java.io.IOException)44 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)41 ArrayList (java.util.ArrayList)40 Path (org.apache.hadoop.fs.Path)29 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)24 FileSystem (org.apache.hadoop.fs.FileSystem)23 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)23 JobConf (org.apache.hadoop.mapred.JobConf)21 Tuple2 (scala.Tuple2)19 SequenceFile (org.apache.hadoop.io.SequenceFile)17 Row (org.apache.spark.sql.Row)14 SparseBlock (org.apache.sysml.runtime.matrix.data.SparseBlock)14 TestConfiguration (org.apache.sysml.test.integration.TestConfiguration)14 IndexedMatrixValue (org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue)13