Example 1 with LongAccumulator

Use of org.apache.spark.util.LongAccumulator in project incubator-systemml by apache.

In class WriteSPInstruction, the method processMatrixWriteInstruction:

protected void processMatrixWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi) throws DMLRuntimeException, IOException {
    //get input rdd
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
        //piggyback nnz maintenance on write
        LongAccumulator aNnz = null;
        if (isInputMatrixBlock && !mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        JavaRDD<String> header = null;
        if (oi == OutputInfo.MatrixMarketOutputInfo) {
            ArrayList<String> headerContainer = new ArrayList<String>(1);
            // First output MM header
            String headerStr = "%%MatrixMarket matrix coordinate real general\n" +
                // output number of rows, number of columns and number of nnz
                mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
            headerContainer.add(headerStr);
            header = sec.getSparkContext().parallelize(headerContainer);
        }
        JavaRDD<String> ijv = RDDConverterUtils.binaryBlockToTextCell(in1, mc);
        if (header != null)
            customSaveTextFile(header.union(ijv), fname, true);
        else
            customSaveTextFile(ijv, fname, false);
        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros(aNnz.value());
    } else if (oi == OutputInfo.CSVOutputInfo) {
        JavaRDD<String> out = null;
        LongAccumulator aNnz = null;
        if (isInputMatrixBlock) {
            //piggyback nnz computation on actual write
            if (!mc.nnzKnown()) {
                aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
                in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
            }
            out = RDDConverterUtils.binaryBlockToCsv(in1, mc, (CSVFileFormatProperties) formatProperties, true);
        } else {
            // This case is applicable when the CSV output from transform() is written out
            // TODO remove once transform over frames supported
            @SuppressWarnings("unchecked") JavaPairRDD<Long, String> rdd = (JavaPairRDD<Long, String>) (sec.getMatrixObject(input1.getName())).getRDDHandle().getRDD();
            out = rdd.values();
            String sep = ",";
            boolean hasHeader = false;
            if (formatProperties != null) {
                sep = ((CSVFileFormatProperties) formatProperties).getDelim();
                hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader();
            }
            if (hasHeader) {
                StringBuffer buf = new StringBuffer();
                for (int j = 1; j < mc.getCols(); j++) {
                    if (j != 1) {
                        buf.append(sep);
                    }
                    buf.append("C" + j);
                }
                ArrayList<String> headerContainer = new ArrayList<String>(1);
                headerContainer.add(0, buf.toString());
                JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer);
                out = header.union(out);
            }
        }
        customSaveTextFile(out, fname, false);
        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros((long) aNnz.value().longValue());
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        //piggyback nnz computation on actual write
        LongAccumulator aNnz = null;
        if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        //save binary block rdd on hdfs
        in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        if (!mc.nnzKnown())
            mc.setNonZeros((long) aNnz.value().longValue());
    } else {
        //unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }
    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ArrayList(java.util.ArrayList) ComputeBinaryBlockNnzFunction(org.apache.sysml.runtime.instructions.spark.functions.ComputeBinaryBlockNnzFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) JavaRDD(org.apache.spark.api.java.JavaRDD) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LongAccumulator(org.apache.spark.util.LongAccumulator) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD)
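
The pattern worth noting here: register a named LongAccumulator, fold the nnz counting into a mapValues pass, and read the accumulator only after the write action has materialized the RDD. Below is a minimal, self-contained sketch of that pattern using plain Spark types only; the toy double[] "blocks", the local[*] master, and the class name are illustrative assumptions standing in for MatrixBlock and ComputeBinaryBlockNnzFunction.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;

import scala.Tuple2;

public class NnzPiggybackSketch {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(
            new SparkConf().setAppName("nnz-piggyback-sketch").setMaster("local[*]"));

        //toy "blocks": key -> dense array of cell values (stands in for MatrixIndexes -> MatrixBlock)
        JavaPairRDD<Integer, double[]> blocks = jsc.parallelizePairs(Arrays.asList(
            new Tuple2<Integer, double[]>(1, new double[] { 0, 1, 2 }),
            new Tuple2<Integer, double[]>(2, new double[] { 0, 0, 3 })));

        //register a named accumulator, analogous to sec.getSparkContext().sc().longAccumulator("nnz")
        LongAccumulator aNnz = jsc.sc().longAccumulator("nnz");

        //piggyback nnz maintenance on a value pass (stands in for ComputeBinaryBlockNnzFunction)
        JavaPairRDD<Integer, double[]> counted = blocks.mapValues(vals -> {
            aNnz.add(Arrays.stream(vals).filter(v -> v != 0).count());
            return vals;
        });

        //an action triggers the accumulator updates (the instruction uses the write itself as this action)
        counted.count();
        System.out.println("nnz = " + aNnz.value()); //prints: nnz = 3

        jsc.close();
    }
}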

Example 2 with LongAccumulator

Use of org.apache.spark.util.LongAccumulator in project incubator-systemml by apache.

In class SparkExecutionContext, the method writeRDDtoHDFS:

@SuppressWarnings("unchecked")
public static long writeRDDtoHDFS(RDDObject rdd, String path, OutputInfo oinfo) {
    JavaPairRDD<MatrixIndexes, MatrixBlock> lrdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd.getRDD();
    //piggyback nnz maintenance on write
    LongAccumulator aNnz = getSparkContextStatic().sc().longAccumulator("nnz");
    lrdd = lrdd.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
    //save file is an action which also triggers nnz maintenance
    lrdd.saveAsHadoopFile(path, oinfo.outputKeyClass, oinfo.outputValueClass, oinfo.outputFormatClass);
    //return nnz aggregate of all blocks
    return aNnz.value();
}
Also used : LongAccumulator(org.apache.spark.util.LongAccumulator) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) ComputeBinaryBlockNnzFunction(org.apache.sysml.runtime.instructions.spark.functions.ComputeBinaryBlockNnzFunction)
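
writeRDDtoHDFS can return aNnz.value() directly after saveAsHadoopFile only because saving is a Spark action; until some action runs, the mapValues pass stays lazy and the accumulator remains zero. A small sketch of that ordering with plain Spark (the input lines, the temp output path, and the class name are made up for illustration):

import java.nio.file.Files;
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;

public class SaveTriggersAccumulatorSketch {
    public static void main(String[] args) throws Exception {
        JavaSparkContext jsc = new JavaSparkContext(
            new SparkConf().setAppName("save-action-sketch").setMaster("local[*]"));

        LongAccumulator nonEmpty = jsc.sc().longAccumulator("nonEmptyLines");
        JavaRDD<String> lines = jsc.parallelize(Arrays.asList("a", "", "b", "c", ""));

        //the map is lazy, so the accumulator has not been updated yet
        JavaRDD<String> counted = lines.map(s -> {
            if (!s.isEmpty())
                nonEmpty.add(1);
            return s;
        });
        System.out.println("before save: " + nonEmpty.value()); //prints 0

        //saving is an action: it materializes the RDD and thereby the accumulator updates,
        //playing the same role as saveAsHadoopFile in writeRDDtoHDFS above
        String out = Files.createTempDirectory("acc-sketch-").resolve("out").toString();
        counted.saveAsTextFile(out);
        System.out.println("after save: " + nonEmpty.value()); //prints 3

        jsc.close();
    }
}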

Example 3 with LongAccumulator

Use of org.apache.spark.util.LongAccumulator in project incubator-systemml by apache.

In class RDDConverterUtils, the method libsvmToBinaryBlock:

/**
	 * Converts a libsvm text input file into two binary block matrices for features 
	 * and labels, and saves these to the specified output files. This call also deletes 
	 * existing files at the specified output locations, as well as determines and 
	 * writes the meta data files of both output matrices. 
	 * <p>
	 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing 
	 * the libsvm input files in order to ensure consistency with Spark.
	 * 
	 * @param sc java spark context
	 * @param pathIn path to libsvm input file
	 * @param pathX path to binary block output file of features
	 * @param pathY path to binary block output file of labels
	 * @param mcOutX matrix characteristics of output matrix X
	 * @throws DMLRuntimeException if output path not writable or conversion failure
	 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn, String pathX, String pathY, MatrixCharacteristics mcOutX) throws DMLRuntimeException {
    if (!mcOutX.dimsKnown())
        throw new DMLRuntimeException("Matrix characteristics " + "required to convert sparse input representation.");
    try {
        //cleanup existing output files
        MapReduceTool.deleteFileIfExistOnHDFS(pathX);
        MapReduceTool.deleteFileIfExistOnHDFS(pathY);
        //convert libsvm to labeled points
        int numFeatures = (int) mcOutX.getCols();
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
        JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints = MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();
        //append row index and best-effort caching to avoid repeated text parsing
        JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint, Long> ilpoints = lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());
        //extract labels and convert to binary block
        MatrixCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out1 = ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
        int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
        out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
        out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        //update nnz after triggered save
        mc1.setNonZeros(aNnz1.value());
        MapReduceTool.writeMetaDataFile(pathY + ".mtd", ValueType.DOUBLE, mc1, OutputInfo.BinaryBlockOutputInfo);
        //extract data and convert to binary block
        MatrixCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out2 = ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
        out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
        out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        //update nnz after triggered save
        mc2.setNonZeros(aNnz2.value());
        MapReduceTool.writeMetaDataFile(pathX + ".mtd", ValueType.DOUBLE, mc2, OutputInfo.BinaryBlockOutputInfo);
        //asynchronous cleanup of cached intermediates
        ilpoints.unpersist(false);
    } catch (IOException ex) {
        throw new DMLRuntimeException(ex);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) LabeledPoint(org.apache.spark.ml.feature.LabeledPoint) IOException(java.io.IOException) LabeledPoint(org.apache.spark.ml.feature.LabeledPoint) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) LongAccumulator(org.apache.spark.util.LongAccumulator)
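
The method parses the libsvm input once and derives two outputs from it, each with its own accumulator (aNnz1 for labels, aNnz2 for features), both read only after the corresponding save action. A reduced sketch of the same shape using plain Spark/MLlib types and no SystemML block conversion; the temp input file, the feature count, and the label-nnz rule are assumptions for illustration:

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.util.LongAccumulator;

public class LibsvmNnzSketch {
    public static void main(String[] args) throws Exception {
        JavaSparkContext jsc = new JavaSparkContext(
            new SparkConf().setAppName("libsvm-nnz-sketch").setMaster("local[*]"));

        //tiny libsvm input written to a temp file: "label index:value ..."
        Path in = Files.createTempFile("points-", ".libsvm");
        Files.write(in, Arrays.asList("1 1:0.5 3:2.0", "0 2:1.5"));

        //parse via MLUtils for consistency with Spark, as the javadoc above recommends
        int numFeatures = 3;
        JavaRDD<LabeledPoint> lpoints =
            MLUtils.loadLibSVMFile(jsc.sc(), in.toString(), numFeatures).toJavaRDD();

        //two accumulators, mirroring aNnz1 (labels) and aNnz2 (features)
        LongAccumulator labelNnz = jsc.sc().longAccumulator("nnzLabels");
        LongAccumulator featureNnz = jsc.sc().longAccumulator("nnzFeatures");

        JavaRDD<LabeledPoint> counted = lpoints.map(lp -> {
            if (lp.label() != 0)
                labelNnz.add(1);
            featureNnz.add(lp.features().numNonzeros());
            return lp;
        });

        //an action triggers the counting (the real code relies on saveAsHadoopFile for this)
        counted.count();
        System.out.println("label nnz   = " + labelNnz.value());   //prints 1
        System.out.println("feature nnz = " + featureNnz.value()); //prints 3

        jsc.close();
    }
}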

Example 4 with LongAccumulator

Use of org.apache.spark.util.LongAccumulator in project incubator-systemml by apache.

In class RDDConverterUtils, the method dataFrameToBinaryBlock:

public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc, Dataset<Row> df, MatrixCharacteristics mc, boolean containsID, boolean isVector) {
    //determine unknown dimensions and sparsity if required
    if (!mc.dimsKnown(true)) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
        long rlen = tmp.count();
        long clen = !isVector ? df.columns().length - (containsID ? 1 : 0) : ((Vector) tmp.first().get(containsID ? 1 : 0)).size();
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), nnz);
    }
    //ensure valid blocksizes
    if (mc.getRowsPerBlock() <= 1 || mc.getColsPerBlock() <= 1) {
        mc.setBlockSize(ConfigurationManager.getBlocksize());
    }
    //construct or reuse row ids
    JavaPairRDD<Row, Long> prepinput = containsID ?
        df.javaRDD().mapToPair(new DataFrameExtractIDFunction(df.schema().fieldIndex(DF_ID_COLUMN))) :
        //zip row index
        df.javaRDD().zipWithIndex();
    //convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));
    //aggregate partial matrix blocks (w/ preferred number of output 
    //partitions as the data is likely smaller in binary block format,
    //but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) LabeledPoint(org.apache.spark.ml.feature.LabeledPoint) LongAccumulator(org.apache.spark.util.LongAccumulator) Row(org.apache.spark.sql.Row)
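
The analysis pass for unknown dimensions is the interesting step: one map over the rows adds to the accumulator per nonzero cell, and the count() that yields rlen is also the action that fires those accumulator updates, so a single pass produces rlen, clen, and nnz. A minimal sketch of that pass on a toy DataFrame; the schema, column names, and values are assumptions, and an inline lambda stands in for DataFrameAnalysisFunction:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.util.LongAccumulator;

public class DataFrameAnalysisSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("dataframe-analysis-sketch").master("local[*]").getOrCreate();

        //toy 3x2 double DataFrame (column names and values are made up)
        StructType schema = new StructType()
            .add("c1", DataTypes.DoubleType)
            .add("c2", DataTypes.DoubleType);
        List<Row> rows = Arrays.asList(
            RowFactory.create(1.0, 0.0),
            RowFactory.create(0.0, 2.0),
            RowFactory.create(3.0, 4.0));
        Dataset<Row> df = spark.createDataFrame(rows, schema);

        //analysis pass: count nonzero cells while the rows stream by
        LongAccumulator aNnz = spark.sparkContext().longAccumulator("nnz");
        JavaRDD<Row> tmp = df.javaRDD().map(row -> {
            for (int i = 0; i < row.length(); i++)
                if (row.getDouble(i) != 0)
                    aNnz.add(1);
            return row;
        });

        //count() returns rlen and, being an action, also fires the accumulator updates
        long rlen = tmp.count();
        long nnz = aNnz.value();
        long clen = df.columns().length;
        System.out.println(rlen + " x " + clen + ", nnz = " + nnz); //prints: 3 x 2, nnz = 4

        spark.stop();
    }
}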

Example 5 with LongAccumulator

Use of org.apache.spark.util.LongAccumulator in project incubator-systemml by apache.

In class RDDConverterUtils, the method csvToBinaryBlock:

public static JavaPairRDD<MatrixIndexes, MatrixBlock> csvToBinaryBlock(JavaSparkContext sc, JavaPairRDD<LongWritable, Text> input, MatrixCharacteristics mc, boolean hasHeader, String delim, boolean fill, double fillValue) throws DMLRuntimeException {
    //determine unknown dimensions and sparsity if required
    if (!mc.dimsKnown(true)) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<String> tmp = input.values().map(new CSVAnalysisFunction(aNnz, delim));
        long rlen = tmp.count() - (hasHeader ? 1 : 0);
        long clen = tmp.first().split(delim).length;
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), nnz);
    }
    //prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text, Long> prepinput = input.values().zipWithIndex();
    //convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(new CSVToBinaryBlockFunction(mc, sparse, hasHeader, delim, fill, fillValue));
    //aggregate partial matrix blocks (w/ preferred number of output 
    //partitions as the data is likely smaller in binary block format,
    //but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Text(org.apache.hadoop.io.Text) SerText(org.apache.sysml.runtime.instructions.spark.data.SerText) LabeledPoint(org.apache.spark.ml.feature.LabeledPoint) LongAccumulator(org.apache.spark.util.LongAccumulator)
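
csvToBinaryBlock applies the same single-pass analysis to raw CSV text: row count minus the header gives rlen, the first line gives clen, and the nnz count piggybacks on the same map. A toy version with plain Spark; the delimiter, header handling, and the skip-non-numeric-cells rule are illustrative assumptions standing in for CSVAnalysisFunction:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;

public class CsvAnalysisSketch {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(
            new SparkConf().setAppName("csv-analysis-sketch").setMaster("local[*]"));

        boolean hasHeader = true;
        String delim = ",";
        JavaRDD<String> data = jsc.parallelize(Arrays.asList("C1,C2,C3", "1,0,2", "0,0,3"));

        //analysis pass: piggyback the nnz count on a map over the csv lines
        LongAccumulator aNnz = jsc.sc().longAccumulator("nnz");
        JavaRDD<String> tmp = data.map(line -> {
            for (String cell : line.split(delim)) {
                try {
                    if (Double.parseDouble(cell) != 0)
                        aNnz.add(1);
                } catch (NumberFormatException e) {
                    //non-numeric cells (the header row) are ignored in this toy analysis
                }
            }
            return line;
        });

        //row count minus header; count() is the action that fires the accumulator,
        //so its value is read right afterwards
        long rlen = tmp.count() - (hasHeader ? 1 : 0);
        long nnz = aNnz.value();
        //column count from the raw first line (avoids re-running the counting map)
        long clen = data.first().split(delim).length;
        System.out.println(rlen + " x " + clen + ", nnz = " + nnz); //prints: 2 x 3, nnz = 3

        jsc.close();
    }
}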

Aggregations

LongAccumulator (org.apache.spark.util.LongAccumulator): 7 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 6 usages
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 6 usages
LabeledPoint (org.apache.spark.ml.feature.LabeledPoint): 3 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 3 usages
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 2 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 2 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 2 usages
LocalVariableMap (org.apache.sysml.runtime.controlprogram.LocalVariableMap): 2 usages
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 2 usages
ComputeBinaryBlockNnzFunction (org.apache.sysml.runtime.instructions.spark.functions.ComputeBinaryBlockNnzFunction): 2 usages
Tuple2 (scala.Tuple2): 2 usages
IOException (java.io.IOException): 1 usage
ArrayList (java.util.ArrayList): 1 usage
Text (org.apache.hadoop.io.Text): 1 usage
Writable (org.apache.hadoop.io.Writable): 1 usage
JavaRDD (org.apache.spark.api.java.JavaRDD): 1 usage
Row (org.apache.spark.sql.Row): 1 usage
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 1 usage
SerText (org.apache.sysml.runtime.instructions.spark.data.SerText): 1 usage