Search in sources :

Example 1 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RemoteDPParWorkerReducer method configure.

@Override
public void configure(JobConf job) {
    //Step 1: configure data partitioning information
    _rlen = (int) MRJobConfiguration.getPartitioningNumRows(job);
    _clen = (int) MRJobConfiguration.getPartitioningNumCols(job);
    _brlen = MRJobConfiguration.getPartitioningBlockNumRows(job);
    _bclen = MRJobConfiguration.getPartitioningBlockNumCols(job);
    _iterVar = MRJobConfiguration.getPartitioningItervar(job);
    _inputVar = MRJobConfiguration.getPartitioningMatrixvar(job);
    _dpf = MRJobConfiguration.getPartitioningFormat(job);
    //create matrix partition for reuse
    switch(_dpf) {
        case ROW_WISE:
            _rlen = 1;
            break;
        case COLUMN_WISE:
            _clen = 1;
            break;
        default:
            throw new RuntimeException("Partition format not yet supported in fused partition-execute: " + _dpf);
    }
    _info = MRJobConfiguration.getPartitioningOutputInfo(job);
    _tSparseCol = MRJobConfiguration.getPartitioningTransposedCol(job);
    if (_tSparseCol)
        _partition = new MatrixBlock(_clen, _rlen, true);
    else
        _partition = new MatrixBlock(_rlen, _clen, false);
    //Step 2: configure parworker
    String taskID = job.get(MRConfigurationNames.MR_TASK_ID);
    LOG.trace("configure RemoteDPParWorkerReducer " + taskID);
    try {
        _stringID = taskID;
        //extract numeric task ID
        _workerID = IDHandler.extractIntID(_stringID);
        //use the given job conf as the cached config, because the default config is not valid
        //in the context of mr jobs (for example this config points to local fs instead of hdfs by default). 
        if (!InfrastructureAnalyzer.isLocalMode(job)) {
            ConfigurationManager.setCachedJobConf(job);
        }
        //create local runtime program
        String in = MRJobConfiguration.getProgramBlocks(job);
        ParForBody body = ProgramConverter.parseParForBody(in, (int) _workerID);
        _childBlocks = body.getChildBlocks();
        _ec = body.getEc();
        _resultVars = body.getResultVarNames();
        //init local cache manager 
        if (!CacheableData.isCachingActive()) {
            String uuid = IDHandler.createDistributedUniqueID();
            LocalFileUtils.createWorkingDirectoryWithUUID(uuid);
            //incl activation, cache dir creation (each map task gets its own dir for simplified cleanup)
            CacheableData.initCaching(uuid);
        }
        if (!CacheableData.cacheEvictionLocalFilePrefix.contains("_")) {
            //account for local mode
            CacheableData.cacheEvictionLocalFilePrefix = CacheableData.cacheEvictionLocalFilePrefix + "_" + _workerID;
        }
        //ensure that resultvar files are not removed
        super.pinResultVariables();
        //enable/disable caching (if required)
        boolean cpCaching = MRJobConfiguration.getParforCachingConfig(job);
        if (!cpCaching)
            CacheableData.disableCaching();
        _numTasks = 0;
        _numIters = 0;
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    //disable parfor stat monitoring (reporting execution times via counters is not useful here)
    StatisticMonitor.disableStatMonitoring();
    //always reset stats because counters are maintained per map task (relevant in case of JVM reuse)
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job)) {
        CacheStatistics.reset();
        Statistics.reset();
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException)
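
The partition shapes in configure() follow directly from the partition format: a row-wise partition is a 1 x clen block, a column-wise partition is rlen x 1, and a transposed sparse column is stored as a 1 x rlen sparse block. A minimal standalone sketch of that allocation (the class and main are hypothetical; it assumes only the public MatrixBlock constructor and dimension getters):

import org.apache.sysml.runtime.matrix.data.MatrixBlock;

public class PartitionShapeSketch {

    //mirrors the allocation in configure(): transposed sparse columns swap dimensions
    public static MatrixBlock allocatePartition(int rlen, int clen, boolean tSparseCol) {
        return tSparseCol ?
            new MatrixBlock(clen, rlen, true) :  //sparse, transposed
            new MatrixBlock(rlen, clen, false);  //dense
    }

    public static void main(String[] args) {
        //row-wise partition of a 1000-column matrix: 1 x 1000 dense block
        MatrixBlock rowPart = allocatePartition(1, 1000, false);
        //column-wise partition of a 1000-row matrix, transposed sparse: 1 x 1000 sparse block
        MatrixBlock colPartT = allocatePartition(1000, 1, true);
        System.out.println(rowPart.getNumRows() + " x " + rowPart.getNumColumns());
        System.out.println(colPartT.getNumRows() + " x " + colPartT.getNumColumns());
    }
}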

Example 2 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RemoteDPParForSparkWorker method call.

@Override
public Iterator<Tuple2<Long, String>> call(Iterator<Tuple2<Long, Iterable<Writable>>> arg0) throws Exception {
    ArrayList<Tuple2<Long, String>> ret = new ArrayList<Tuple2<Long, String>>();
    //lazy parworker initialization
    configureWorker(TaskContext.get().taskAttemptId());
    //process all matrix partitions of this data partition
    MatrixBlock partition = null;
    while (arg0.hasNext()) {
        Tuple2<Long, Iterable<Writable>> larg = arg0.next();
        //collect input partition (check via equals because oinfo is a deserialized instance)
        if (_oinfo.equals(OutputInfo.BinaryBlockOutputInfo))
            partition = collectBinaryBlock(larg._2(), partition);
        else
            partition = collectBinaryCellInput(larg._2());
        //update in-memory matrix partition
        MatrixObject mo = _ec.getMatrixObject(_inputVar);
        mo.setInMemoryPartition(partition);
        //create tasks for input data
        Task lTask = new Task(TaskType.SET);
        lTask.addIteration(new IntObject(_iterVar, larg._1()));
        //execute program (record executed iterations before, to accumulate the delta)
        long numIter = getExecutedIterations();
        super.executeTask(lTask);
        //maintain accumulators
        _aTasks.add(1);
        _aIters.add((int) (getExecutedIterations() - numIter));
    }
    //write output if required (matrix indexed write) 
    ArrayList<String> tmp = RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars);
    for (String val : tmp) ret.add(new Tuple2<Long, String>(_workerID, val));
    return ret.iterator();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ArrayList(java.util.ArrayList) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) Tuple2(scala.Tuple2)
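
The accumulator maintenance in call() records the iteration count before executing each task and adds only the delta afterwards, so per-task counts stay correct across partitions. A self-contained sketch of that bookkeeping pattern (the class and its counters are hypothetical stand-ins for the worker's getExecutedIterations() and the _aIters accumulator):

public class DeltaAccumulatorSketch {

    private long executedIterations = 0; //stand-in for the worker's running counter

    long getExecutedIterations() { return executedIterations; }

    void executeTask(int iterationsInTask) { //stand-in for super.executeTask(lTask)
        executedIterations += iterationsInTask;
    }

    public static void main(String[] args) {
        DeltaAccumulatorSketch worker = new DeltaAccumulatorSketch();
        long totalIters = 0; //stand-in for the _aIters accumulator
        for (int task = 0; task < 3; task++) {
            long before = worker.getExecutedIterations(); //snapshot before execution
            worker.executeTask(5);
            totalIters += worker.getExecutedIterations() - before; //accumulate the delta only
        }
        System.out.println(totalIters); //15
    }
}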

Example 3 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RemoteDPParForSparkWorker method collectBinaryCellInput.

/**
	 * Collects a matrixblock partition from a given input iterator over 
	 * binary cells.
	 * 
	 * Note: a fresh partition block is allocated on every call; the
	 * configured representation (transposed sparse column vs dense)
	 * is preserved across calls.
	 * 
	 * @param valueList iterable writables
	 * @return matrix block
	 * @throws IOException if IOException occurs
	 */
private MatrixBlock collectBinaryCellInput(Iterable<Writable> valueList) throws IOException {
    MatrixBlock partition = null;
    //allocate partition block, keep configured representation
    if (_tSparseCol)
        partition = new MatrixBlock(_clen, _rlen, true);
    else
        partition = new MatrixBlock(_rlen, _clen, false);
    switch(_dpf) {
        case ROW_WISE:
            for (Writable val : valueList) {
                PairWritableCell pairValue = (PairWritableCell) val;
                //skip dummy cells used to ensure empty partitions
                if (pairValue.indexes.getColumnIndex() < 0)
                    continue;
                partition.quickSetValue(0, (int) pairValue.indexes.getColumnIndex() - 1, pairValue.cell.getValue());
            }
            break;
            break;
        case COLUMN_WISE:
            for (Writable val : valueList) {
                PairWritableCell pairValue = (PairWritableCell) val;
                //skip dummy cells used to ensure empty partitions
                if (pairValue.indexes.getRowIndex() < 0)
                    continue;
                if (_tSparseCol)
                    partition.appendValue(0, (int) pairValue.indexes.getRowIndex() - 1, pairValue.cell.getValue());
                else
                    partition.quickSetValue((int) pairValue.indexes.getRowIndex() - 1, 0, pairValue.cell.getValue());
            }
            break;
            break;
        default:
            throw new IOException("Partition format not yet supported in fused partition-execute: " + _dpf);
    }
    //post-processing: cleanups if required
    try {
        if (partition.isInSparseFormat() && _tSparseCol)
            partition.sortSparseRows();
        partition.recomputeNonZeros();
        partition.examSparsity();
    } catch (DMLRuntimeException ex) {
        throw new IOException(ex);
    }
    return partition;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) IOException(java.io.IOException) PairWritableCell(org.apache.sysml.runtime.controlprogram.parfor.util.PairWritableCell) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
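
One detail worth highlighting: an Iterable must be consumed through a single iterator. Calling valueList.iterator() in both the loop condition and the loop body creates a fresh iterator on every call and re-reads the first element forever; the for-each form used above avoids this. A minimal illustration in plain Java (class name and data are hypothetical):

import java.util.Arrays;

public class SingleIteratorSketch {

    //consumes the iterable exactly once; for-each obtains one iterator under the hood
    static double sum(Iterable<Double> values) {
        double s = 0;
        for (double v : values)
            s += v;
        return s;
    }

    public static void main(String[] args) {
        //BROKEN variant for contrast: while (values.iterator().hasNext()) { values.iterator().next(); }
        //would spin on the first element because each call creates a new iterator.
        System.out.println(sum(Arrays.asList(1.0, 2.0, 3.0))); //6.0
    }
}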

Example 4 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class ResultMergeLocalFile method createBinaryBlockResultFile.

@SuppressWarnings("deprecation")
private void createBinaryBlockResultFile(String fnameStaging, String fnameStagingCompare, String fnameNew, MatrixFormatMetaData metadata, boolean withCompare) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fnameNew);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    MatrixCharacteristics mc = metadata.getMatrixCharacteristics();
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //beware: writer construction costs ca. 50ms
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class);
    try {
        MatrixIndexes indexes = new MatrixIndexes();
        for (long brow = 1; brow <= (long) Math.ceil(rlen / (double) brlen); brow++)
        for (long bcol = 1; bcol <= (long) Math.ceil(clen / (double) bclen); bcol++) {
            File dir = new File(fnameStaging + "/" + brow + "_" + bcol);
            File dir2 = new File(fnameStagingCompare + "/" + brow + "_" + bcol);
            MatrixBlock mb = null;
            if (dir.exists()) {
                //WITH COMPARE BLOCK
                if (withCompare && dir2.exists()) {
                    //copy only values that are different from the original
                    String[] lnames2 = dir2.list();
                    //there should be exactly 1 compare block
                    if (lnames2.length != 1)
                        throw new DMLRuntimeException("Unable to merge results because multiple compare blocks found.");
                    mb = LocalFileUtils.readMatrixBlockFromLocal(dir2 + "/" + lnames2[0]);
                    boolean appendOnly = mb.isInSparseFormat();
                    double[][] compare = DataConverter.convertToDoubleMatrix(mb);
                    String[] lnames = dir.list();
                    for (String lname : lnames) {
                        MatrixBlock tmp = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                        mergeWithComp(mb, tmp, compare);
                    }
                    //sort sparse due to append-only
                    if (appendOnly)
                        mb.sortSparseRows();
                    //change sparsity representation if required after merge
                    mb.examSparsity();
                } else { //WITHOUT COMPARE BLOCK
                    //copy all non-zeros from all workers
                    String[] lnames = dir.list();
                    boolean appendOnly = false;
                    for (String lname : lnames) {
                        if (mb == null) {
                            mb = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                            appendOnly = mb.isInSparseFormat();
                        } else {
                            MatrixBlock tmp = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                            mergeWithoutComp(mb, tmp, appendOnly);
                        }
                    }
                    //sort sparse due to append-only
                    if (appendOnly)
                        mb.sortSparseRows();
                    //change sparsity representation if required after merge
                    mb.examSparsity();
                }
            } else {
                //NOTE: whenever runtime does not need all blocks anymore, this can be removed
                int maxRow = (int) (((brow - 1) * brlen + brlen < rlen) ? brlen : rlen - (brow - 1) * brlen);
                int maxCol = (int) (((bcol - 1) * bclen + bclen < clen) ? bclen : clen - (bcol - 1) * bclen);
                mb = new MatrixBlock(maxRow, maxCol, true);
            }
            //mb.examSparsity(); //done on write anyway and mb not reused
            indexes.setIndexes(brow, bcol);
            writer.append(indexes, mb);
        }
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) File(java.io.File) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter)
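
The empty-block branch above computes the actual dimensions of a (possibly smaller) boundary block from the 1-based block index, the block size, and the matrix dimension. A small sketch of that arithmetic, extracted into a helper (class and method name are hypothetical):

public class BoundaryBlockSketch {

    //interior blocks have the full block size; the last block row/column covers the remainder
    static int blockSize(long blockIndex, int blen, long dim) {
        return (int) (((blockIndex - 1) * blen + blen < dim) ? blen : dim - (blockIndex - 1) * blen);
    }

    public static void main(String[] args) {
        //7 rows in blocks of 3: block rows 1 and 2 hold 3 rows, block row 3 holds 1 row
        System.out.println(blockSize(1, 3, 7)); //3
        System.out.println(blockSize(2, 3, 7)); //3
        System.out.println(blockSize(3, 3, 7)); //1
    }
}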

Example 5 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RemoteDPParForSpark method getPartitionedInput.

@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) throws DMLRuntimeException {
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    //NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> in = dsObj.getDataset();
        //construct or reuse row ids
    JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ?
        in.javaRDD().mapToPair(new DataFrameExtractIDFunction(
            in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) :
        in.javaRDD().zipWithIndex(); //zip row index
        //convert row to row in matrix block format 
        return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    } else if (!requiresGrouping(dpf, mo)) { //binary block input rdd without grouping
        //get input rdd and data partitioning 
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    } else { //default binary block input rdd with grouping
        //get input rdd; avoid unnecessary caching if input is a checkpoint that is not cached yet,
        //to reduce memory pressure for the shuffle and subsequent operations
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
            in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        //data partitioning of input rdd 
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DatasetObject(org.apache.sysml.runtime.instructions.spark.data.DatasetObject) DataFrameExtractIDFunction(org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils.DataFrameExtractIDFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Row(org.apache.spark.sql.Row)
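
When the dataset carries no explicit ID column, the code above falls back to zipWithIndex to construct row ids. A standalone sketch of that fallback in plain Spark (local master, class name, and sample data are hypothetical):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class RowIdSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RowIdSketch").setMaster("local[1]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> rows = sc.parallelize(Arrays.asList("row-a", "row-b", "row-c"));
            //zipWithIndex assigns consecutive 0-based ids following the rdd order
            JavaPairRDD<String, Long> withIds = rows.zipWithIndex();
            for (scala.Tuple2<String, Long> t : withIds.collect())
                System.out.println(t._2() + ": " + t._1());
        }
    }
}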

Aggregations

MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 393
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 121
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 105
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 87
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 44
IOException (java.io.IOException): 43
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 38
CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock): 34
ArrayList (java.util.ArrayList): 33
Path (org.apache.hadoop.fs.Path): 25
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 23
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 21
FileSystem (org.apache.hadoop.fs.FileSystem): 20
JobConf (org.apache.hadoop.mapred.JobConf): 17
Tuple2 (scala.Tuple2): 17
SequenceFile (org.apache.hadoop.io.SequenceFile): 14
MatrixReader (org.apache.sysml.runtime.io.MatrixReader): 14
TestConfiguration (org.apache.sysml.test.integration.TestConfiguration): 13
RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM): 12
MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData): 12