Search in sources :

Example 86 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class RDDConverterUtils method libsvmToBinaryBlock.

/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 * <p>
 * The parsed and indexed input is cached while both outputs are written and is
 * released again when this method returns, regardless of success or failure.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 * @throws DMLRuntimeException if the dimensions of {@code mcOutX} are unknown, if its
 *         number of columns does not fit into an {@code int}, or if an I/O error occurs
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn, String pathX, String pathY, MatrixCharacteristics mcOutX) {
    if (!mcOutX.dimsKnown())
        throw new DMLRuntimeException("Matrix characteristics " + "required to convert sparse input representation.");
    // the libsvm loader takes the feature count as int; fail loudly instead of
    // silently truncating the column count in the narrowing cast below
    if (mcOutX.getCols() > Integer.MAX_VALUE)
        throw new DMLRuntimeException("Number of columns " + mcOutX.getCols() + " exceeds the supported maximum of " + Integer.MAX_VALUE + " features for libsvm conversion.");
    // declared outside the try block so that the cache can be released in finally
    JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint, Long> ilpoints = null;
    try {
        // cleanup existing output files
        MapReduceTool.deleteFileIfExistOnHDFS(pathX);
        MapReduceTool.deleteFileIfExistOnHDFS(pathY);
        // convert libsvm to labeled points
        int numFeatures = (int) mcOutX.getCols();
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
        JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints = MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();
        // append row index and best-effort caching to avoid repeated text parsing
        ilpoints = lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());
        // extract labels and convert to binary block (single-column matrix)
        MatrixCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out1 = ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
        int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
        out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
        out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        // update nnz after triggered save (the accumulator is only populated by the job)
        mc1.setNonZeros(aNnz1.value());
        MapReduceTool.writeMetaDataFile(pathY + ".mtd", ValueType.DOUBLE, mc1, OutputInfo.BinaryBlockOutputInfo);
        // extract data and convert to binary block
        MatrixCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out2 = ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
        out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
        out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        // update nnz after triggered save (the accumulator is only populated by the job)
        mc2.setNonZeros(aNnz2.value());
        MapReduceTool.writeMetaDataFile(pathX + ".mtd", ValueType.DOUBLE, mc2, OutputInfo.BinaryBlockOutputInfo);
    } catch (IOException ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // asynchronous cleanup of cached intermediates, also on failure so that
        // an aborted conversion does not keep the parsed input cached
        if (ilpoints != null)
            ilpoints.unpersist(false);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) LabeledPoint(org.apache.spark.ml.feature.LabeledPoint) IOException(java.io.IOException) LabeledPoint(org.apache.spark.ml.feature.LabeledPoint) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) LongAccumulator(org.apache.spark.util.LongAccumulator)

Example 87 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class RDDSortUtils method sortByVals.

/**
 * Sorts the rows extracted from a binary block matrix and returns the result
 * as a binary block matrix again.
 * <p>
 * The values of the input blocks are expanded through {@code ExtractRowsFunction},
 * sorted in ascending order by the key produced by {@code CreateDoubleKeysFunction},
 * indexed by their position in the sorted sequence, converted back to blocks,
 * and finally merged by block index.
 *
 * @param in input matrix as binary block rdd
 * @param rlen number of rows of the input matrix
 * @param clen number of columns of the input matrix
 * @param brlen block size, used here for both the row and the column block size
 * @return binary block rdd holding the sorted data
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortByVals(JavaPairRDD<MatrixIndexes, MatrixBlock> in, long rlen, long clen, int brlen) {
    // expand the block values into the rdd that is going to be sorted
    final JavaRDD<MatrixBlock> rows = in.values().flatMap(new ExtractRowsFunction());

    // derive the partition count for the sort from the input characteristics
    final MatrixCharacteristics mcIn = new MatrixCharacteristics(rlen, clen, brlen, brlen);
    final int sortPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);

    // ascending sort (creates a sorted range per partition)
    final JavaRDD<MatrixBlock> sortedRows = rows.sortBy(new CreateDoubleKeysFunction(), true, sortPartitions);

    // attach the global sort position and rebuild binary blocks from it
    final JavaPairRDD<MatrixIndexes, MatrixBlock> partialBlocks = sortedRows.zipWithIndex().mapPartitionsToPair(new ConvertToBinaryBlockFunction5(rlen, brlen));

    // combine partial blocks that share the same block index
    return RDDAggregateUtils.mergeByKey(partialBlocks, false);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) RowMatrixBlock(org.apache.sysml.runtime.instructions.spark.data.RowMatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 88 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class CostEstimator method maintainMRJobInstVariableStatistics.

/**
 * Updates the given variable statistics with the effects of a single MR job
 * instruction.
 * <p>
 * The method (1) registers the statistics of all job inputs under their
 * positional index, (2) registers statistics for the outputs of rand
 * instructions, (3) propagates dimensions through all instructions of the job
 * via {@code MatrixCharacteristics.computeDimension}, (4) creates statistics
 * for all intermediate indices that do not have any yet, and (5) binds the
 * result indices to the output variable names, marking them as not in-memory.
 * <p>
 * Unknown dimensions and unknown numbers of non-zeros are represented as -1.
 *
 * @param inst instruction to analyze; cast to {@code MRJobInstruction} without a check
 * @param stats map from variable name (or job-local index as string) to statistics,
 *        updated in place
 */
private void maintainMRJobInstVariableStatistics(Instruction inst, HashMap<String, VarStats> stats) {
    MRJobInstruction jobinst = (MRJobInstruction) inst;
    // input sizes (varname, index mapping)
    String[] inVars = jobinst.getInputVars();
    int index = -1;
    for (String varname : inVars) {
        VarStats vs = stats.get(varname);
        if (vs == null)
            vs = _unknownStats;
        stats.put(String.valueOf(++index), vs);
    }
    // rand output
    String rdInst = jobinst.getIv_randInstructions();
    if (rdInst != null && rdInst.length() > 0) {
        StringTokenizer st = new StringTokenizer(rdInst, Lop.INSTRUCTION_DELIMITOR);
        // foreach rand instruction
        while (st.hasMoreTokens()) {
            String[] parts = InstructionUtils.getInstructionParts(st.nextToken());
            byte outIndex = Byte.parseByte(parts[2]);
            // dimensions given as variable placeholders are unknown at this point
            long rlen = parts[3].contains(Lop.VARIABLE_NAME_PLACEHOLDER) ? -1 : UtilFunctions.parseToLong(parts[3]);
            long clen = parts[4].contains(Lop.VARIABLE_NAME_PLACEHOLDER) ? -1 : UtilFunctions.parseToLong(parts[4]);
            int brlen = Integer.parseInt(parts[5]);
            int bclen = Integer.parseInt(parts[6]);
            // fraction of non-zero cells (presumably the sparsity operand of rand - verify)
            double sparsity = Double.parseDouble(parts[9]);
            // with an unknown dimension the product is meaningless (e.g., -1 * -1 = 1),
            // hence report the number of non-zeros as unknown as well
            long nnz = (rlen >= 0 && clen >= 0) ? (long) (sparsity * rlen * clen) : -1;
            VarStats vs = new VarStats(rlen, clen, brlen, bclen, nnz, false);
            stats.put(String.valueOf(outIndex), vs);
        }
    }
    // compute intermediate result indices
    HashMap<Byte, MatrixCharacteristics> dims = new HashMap<>();
    // populate input indices
    for (Entry<String, VarStats> e : stats.entrySet()) {
        if (UtilFunctions.isIntegerNumber(e.getKey())) {
            byte ix = Byte.parseByte(e.getKey());
            VarStats vs = e.getValue();
            if (vs != null) {
                MatrixCharacteristics mc = new MatrixCharacteristics(vs._rlen, vs._clen, vs._brlen, vs._bclen, (long) vs._nnz);
                dims.put(ix, mc);
            }
        }
    }
    // compute dims for all instructions
    String[] instCat = new String[] { jobinst.getIv_randInstructions(), jobinst.getIv_recordReaderInstructions(), jobinst.getIv_instructionsInMapper(), jobinst.getIv_shuffleInstructions(), jobinst.getIv_aggInstructions(), jobinst.getIv_otherInstructions() };
    for (String linstCat : instCat) {
        // skip instruction categories that are not used by this job
        if (linstCat == null || linstCat.length() == 0)
            continue;
        String[] linst = linstCat.split(Instruction.INSTRUCTION_DELIM);
        for (String instStr : linst) {
            String instStr2 = replaceInstructionPatch(instStr);
            MRInstruction mrinst = MRInstructionParser.parseSingleInstruction(instStr2);
            MatrixCharacteristics.computeDimension(dims, mrinst);
        }
    }
    // create varstats if necessary
    for (Entry<Byte, MatrixCharacteristics> e : dims.entrySet()) {
        byte ix = e.getKey();
        if (!stats.containsKey(String.valueOf(ix))) {
            MatrixCharacteristics mc = e.getValue();
            VarStats vs = new VarStats(mc.getRows(), mc.getCols(), mc.getRowsPerBlock(), mc.getColsPerBlock(), mc.getNonZeros(), false);
            stats.put(String.valueOf(ix), vs);
        }
    }
    // map result indexes
    String[] outLabels = jobinst.getOutputVars();
    byte[] resultIndexes = jobinst.getIv_resultIndices();
    for (int i = 0; i < resultIndexes.length; i++) {
        String varname = outLabels[i];
        VarStats varvs = stats.get(String.valueOf(resultIndexes[i]));
        if (varvs == null) {
            // no stats for the result index, try existing stats of the output variable
            varvs = stats.get(varname);
        }
        if (varvs == null) {
            // nothing known about this output: treat it as unknown, consistent with
            // the handling of unknown inputs above, instead of failing with a
            // NullPointerException on the field access below
            varvs = _unknownStats;
        }
        varvs._inmem = false;
        stats.put(varname, varvs);
    }
}
Also used : MRJobInstruction(org.apache.sysml.runtime.instructions.MRJobInstruction) HashMap(java.util.HashMap) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) StringTokenizer(java.util.StringTokenizer) MRInstruction(org.apache.sysml.runtime.instructions.mr.MRInstruction)

Example 89 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class InterProceduralAnalysis method createOutputMatrix.

/**
 * Creates a matrix object of value type double without a file name that only
 * carries meta data: the given dimensions and number of non-zeros, together
 * with the configured block size for both block dimensions.
 *
 * @param dim1 number of rows
 * @param dim2 number of columns
 * @param nnz number of non-zeros
 * @return new matrix object with the described meta data attached
 */
private static MatrixObject createOutputMatrix(long dim1, long dim2, long nnz) {
    final MatrixObject result = new MatrixObject(ValueType.DOUBLE, null);
    // the remaining two meta data arguments are intentionally passed as null
    result.setMetaData(new MetaDataFormat(
        new MatrixCharacteristics(dim1, dim2, ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize(), nnz),
        null, null));
    return result;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 90 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class Recompiler method reconcileUpdatedCallVarsIf.

/**
 * Reconciles the variables updated by an if-else statement block across both
 * branches and writes the reconciled state into {@code callVarsIf}.
 * <p>
 * For every updated variable, two states are compared: the if and else state
 * if the variable exists in both branch maps, otherwise the original state and
 * the state of the single branch that holds it. If both states are matrix
 * objects and differ in rows, columns, or number of non-zeros, the variable is
 * replaced in {@code callVarsIf} by a new matrix object in which every
 * differing property is set to -1 (unknown) and every agreeing property is kept.
 *
 * @param oldCallVars variables before the if-else statement
 * @param callVarsIf variables after the if branch; modified in place
 * @param callVarsElse variables after the else branch
 * @param sb statement block providing the set of updated variables
 * @return the {@code callVarsIf} instance that was passed in
 */
public static LocalVariableMap reconcileUpdatedCallVarsIf(LocalVariableMap oldCallVars, LocalVariableMap callVarsIf, LocalVariableMap callVarsElse, StatementBlock sb) {
    for (String varname : sb.variablesUpdated().getVariableNames()) {
        Data origVar = oldCallVars.get(varname);
        Data ifVar = callVarsIf.get(varname);
        Data elseVar = callVarsElse.get(varname);

        // select the two states to compare
        Data before;
        Data after;
        if (ifVar == null) {
            // variable absent from the if branch: original vs else
            before = origVar;
            after = elseVar;
        } else if (elseVar == null) {
            // variable present in the if branch only: original vs if
            before = origVar;
            after = ifVar;
        } else {
            // variable present in both branches: if vs else
            before = ifVar;
            after = elseVar;
        }

        // only pairs of matrices are reconciled (instanceof also rejects null)
        if (!(before instanceof MatrixObject) || !(after instanceof MatrixObject)) {
            continue;
        }
        MatrixCharacteristics mcOld = ((MatrixObject) before).getMatrixCharacteristics();
        MatrixCharacteristics mc = ((MatrixObject) after).getMatrixCharacteristics();

        // compare size and sparsity information of both states
        boolean rowsDiffer = mcOld.getRows() != mc.getRows();
        boolean colsDiffer = mcOld.getCols() != mc.getCols();
        boolean nnzDiffer = mcOld.getNonZeros() != mc.getNonZeros();
        if (!rowsDiffer && !colsDiffer && !nnzDiffer) {
            // identical characteristics, nothing to reconcile
            continue;
        }

        // every property that differs between the two states becomes unknown (-1)
        long ldim1 = rowsDiffer ? -1 : mc.getRows();
        long ldim2 = colsDiffer ? -1 : mc.getCols();
        long lnnz = nnzDiffer ? -1 : mc.getNonZeros();
        callVarsIf.put(varname, createOutputMatrix(ldim1, ldim2, lnnz));
    }
    return callVarsIf;
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) CacheableData(org.apache.sysml.runtime.controlprogram.caching.CacheableData) Data(org.apache.sysml.runtime.instructions.cp.Data) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Aggregations

MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)296 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)102 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)89 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)70 TestConfiguration (org.apache.sysml.test.integration.TestConfiguration)50 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)47 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)45 RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM)42 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)37 CellIndex (org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex)37 IOException (java.io.IOException)30 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)27 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)22 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)22 ArrayList (java.util.ArrayList)19 ValueType (org.apache.sysml.parser.Expression.ValueType)19 Path (org.apache.hadoop.fs.Path)17 LongWritable (org.apache.hadoop.io.LongWritable)16 Test (org.junit.Test)15 Text (org.apache.hadoop.io.Text)14