Search in sources :

Example 16 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class WriteSPInstruction method processMatrixWriteInstruction.

protected void processMatrixWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi) throws DMLRuntimeException, IOException {
    //get input rdd
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
        //piggyback nnz maintenance on write
        LongAccumulator aNnz = null;
        if (isInputMatrixBlock && !mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        JavaRDD<String> header = null;
        if (oi == OutputInfo.MatrixMarketOutputInfo) {
            ArrayList<String> headerContainer = new ArrayList<String>(1);
            // First output MM header
            String headerStr = "%%MatrixMarket matrix coordinate real general\n" + // output number of rows, number of columns and number of nnz
            mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
            headerContainer.add(headerStr);
            header = sec.getSparkContext().parallelize(headerContainer);
        }
        JavaRDD<String> ijv = RDDConverterUtils.binaryBlockToTextCell(in1, mc);
        if (header != null)
            customSaveTextFile(header.union(ijv), fname, true);
        else
            customSaveTextFile(ijv, fname, false);
        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros(aNnz.value());
    } else if (oi == OutputInfo.CSVOutputInfo) {
        JavaRDD<String> out = null;
        LongAccumulator aNnz = null;
        if (isInputMatrixBlock) {
            //piggyback nnz computation on actual write
            if (!mc.nnzKnown()) {
                aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
                in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
            }
            out = RDDConverterUtils.binaryBlockToCsv(in1, mc, (CSVFileFormatProperties) formatProperties, true);
        } else {
            // This case is applicable when the CSV output from transform() is written out
            // TODO remove once transform over frames supported
            @SuppressWarnings("unchecked") JavaPairRDD<Long, String> rdd = (JavaPairRDD<Long, String>) (sec.getMatrixObject(input1.getName())).getRDDHandle().getRDD();
            out = rdd.values();
            String sep = ",";
            boolean hasHeader = false;
            if (formatProperties != null) {
                sep = ((CSVFileFormatProperties) formatProperties).getDelim();
                hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader();
            }
            if (hasHeader) {
                StringBuffer buf = new StringBuffer();
                for (int j = 1; j < mc.getCols(); j++) {
                    if (j != 1) {
                        buf.append(sep);
                    }
                    buf.append("C" + j);
                }
                ArrayList<String> headerContainer = new ArrayList<String>(1);
                headerContainer.add(0, buf.toString());
                JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer);
                out = header.union(out);
            }
        }
        customSaveTextFile(out, fname, false);
        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros((long) aNnz.value().longValue());
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        //piggyback nnz computation on actual write
        LongAccumulator aNnz = null;
        if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        //save binary block rdd on hdfs
        in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        if (!mc.nnzKnown())
            mc.setNonZeros((long) aNnz.value().longValue());
    } else {
        //unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }
    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ArrayList(java.util.ArrayList) ComputeBinaryBlockNnzFunction(org.apache.sysml.runtime.instructions.spark.functions.ComputeBinaryBlockNnzFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) JavaRDD(org.apache.spark.api.java.JavaRDD) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LongAccumulator(org.apache.spark.util.LongAccumulator) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD)

Example 17 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class PairWritableBlock method readFields.

@Override
public void readFields(DataInput in) throws IOException {
    indexes = new MatrixIndexes();
    indexes.readFields(in);
    block = new MatrixBlock();
    block.readFields(in);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes)

Example 18 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class PairWritableCell method readFields.

@Override
public void readFields(DataInput in) throws IOException {
    indexes = new MatrixIndexes();
    indexes.readFields(in);
    cell = new MatrixCell();
    cell.readFields(in);
}
Also used : MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell)

Example 19 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class MatrixIndexingSPInstruction method createPartitionPruningRDD.

/**
	 * Wraps the input RDD into a PartitionPruningRDD, which acts as a filter
	 * of required partitions. The distinct set of required partitions is determined
	 * via the partitioner of the input RDD.
	 * 
	 * @param in input matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
	 * @param filter partition filter
	 * @return matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
	 */
private static JavaPairRDD<MatrixIndexes, MatrixBlock> createPartitionPruningRDD(JavaPairRDD<MatrixIndexes, MatrixBlock> in, List<MatrixIndexes> filter) {
    //build hashset of required partition ids
    HashSet<Integer> flags = new HashSet<Integer>();
    Partitioner partitioner = in.rdd().partitioner().get();
    for (MatrixIndexes key : filter) flags.add(partitioner.getPartition(key));
    //create partition pruning rdd
    Function1<Object, Object> f = new PartitionPruningFunction(flags);
    PartitionPruningRDD<Tuple2<MatrixIndexes, MatrixBlock>> ppRDD = PartitionPruningRDD.create(in.rdd(), f);
    //wrap output into java pair rdd
    return new JavaPairRDD<MatrixIndexes, MatrixBlock>(ppRDD, ClassManifestFactory.fromClass(MatrixIndexes.class), ClassManifestFactory.fromClass(MatrixBlock.class));
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Partitioner(org.apache.spark.Partitioner) HashSet(java.util.HashSet)

Example 20 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

the class MatrixReshapeSPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    //get parameters
    //save cast
    int rows = (int) ec.getScalarInput(_opRows.getName(), _opRows.getValueType(), _opRows.isLiteral()).getLongValue();
    //save cast
    int cols = (int) ec.getScalarInput(_opCols.getName(), _opCols.getValueType(), _opCols.isLiteral()).getLongValue();
    boolean byRow = ec.getScalarInput(_opByRow.getName(), ValueType.BOOLEAN, _opByRow.isLiteral()).getBooleanValue();
    //get inputs 
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    //update output characteristics and sanity check
    mcOut.set(rows, cols, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
    if (mcIn.getRows() * mcIn.getCols() != mcOut.getRows() * mcOut.getCols()) {
        throw new DMLRuntimeException("Incompatible matrix characteristics for reshape: " + mcIn.getRows() + "x" + mcIn.getCols() + " vs " + mcOut.getRows() + "x" + mcOut.getCols());
    }
    //execute reshape instruction
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.flatMapToPair(new RDDReshapeFunction(mcIn, mcOut, byRow));
    out = RDDAggregateUtils.mergeByKey(out);
    //put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)144 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)121 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)57 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)44 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)38 Path (org.apache.hadoop.fs.Path)21 SequenceFile (org.apache.hadoop.io.SequenceFile)21 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)20 IOException (java.io.IOException)19 ArrayList (java.util.ArrayList)18 FileSystem (org.apache.hadoop.fs.FileSystem)18 MatrixCell (org.apache.sysml.runtime.matrix.data.MatrixCell)18 Tuple2 (scala.Tuple2)17 IndexedMatrixValue (org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue)15 JobConf (org.apache.hadoop.mapred.JobConf)11 MatrixValue (org.apache.sysml.runtime.matrix.data.MatrixValue)10 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)9 File (java.io.File)7 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)7 RecordReader (org.apache.hadoop.mapred.RecordReader)6