Use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache: class FrameRDDConverterUtils, method matrixBlockToBinaryBlockLongIndex.
public static JavaPairRDD<Long, FrameBlock> matrixBlockToBinaryBlockLongIndex(JavaSparkContext sc,
    JavaPairRDD<MatrixIndexes, MatrixBlock> input, MatrixCharacteristics mcIn) {
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = input;
    MatrixCharacteristics mc = new MatrixCharacteristics(mcIn);
    // reblock matrix blocks if required (multiple column blocks)
    if (mcIn.getCols() > mcIn.getColsPerBlock()) {
        // split matrix blocks into extended matrix blocks
        in = in.flatMapToPair(new MatrixFrameReblockFunction(mcIn));
        mc.setBlockSize(MatrixFrameReblockFunction.computeBlockSize(mc), (int) mc.getCols());
        // shuffle matrix blocks (instead of frame blocks) in order to exploit
        // sparse formats (for sparse or wide matrices) during shuffle
        in = RDDAggregateUtils.mergeByKey(in, false);
    }
    // convert individual matrix blocks to frame blocks (w/o shuffle)
    return in.mapToPair(new MatrixToFrameBlockFunction(mc));
}
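A minimal usage sketch, not taken from the SystemML code base: it builds a single-block matrix RDD in memory and converts it into a frame RDD keyed by row-block index. The local Spark context, the data, and all variable names are illustrative assumptions; the usual org.apache.sysml, Spark, scala.Tuple2, and java.util imports are implied.

// hypothetical caller: convert a one-block matrix RDD into (Long, FrameBlock) pairs
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("mb2frame").setMaster("local[*]"));
int rows = 10, cols = 3;
MatrixBlock mb = new MatrixBlock(rows, cols, false);
for (int i = 0; i < rows; i++)
    for (int j = 0; j < cols; j++)
        mb.quickSetValue(i, j, i * cols + j);
mb.recomputeNonZeros();
MatrixCharacteristics mcIn = new MatrixCharacteristics(rows, cols, 1000, 1000, mb.getNonZeros());
// SystemML block indexes are 1-based; the whole matrix fits into block (1,1) here
JavaPairRDD<MatrixIndexes, MatrixBlock> matrixRdd = sc.parallelizePairs(
    Arrays.asList(new Tuple2<MatrixIndexes, MatrixBlock>(new MatrixIndexes(1, 1), mb)));
JavaPairRDD<Long, FrameBlock> frameRdd =
    FrameRDDConverterUtils.matrixBlockToBinaryBlockLongIndex(sc, matrixRdd, mcIn);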
Use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache: class RDDConverterUtils, method csvToBinaryBlock.
public static JavaPairRDD<MatrixIndexes, MatrixBlock> csvToBinaryBlock(JavaSparkContext sc,
    JavaPairRDD<LongWritable, Text> input, MatrixCharacteristics mc,
    boolean hasHeader, String delim, boolean fill, double fillValue) {
    // determine unknown dimensions and sparsity if required
    if (!mc.dimsKnown(true)) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<String> tmp = input.values().map(new CSVAnalysisFunction(aNnz, delim));
        long rlen = tmp.count() - (hasHeader ? 1 : 0);
        long clen = tmp.first().split(delim).length;
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), nnz);
    }
    // prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text, Long> prepinput = input.values().zipWithIndex();
    // convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(
        new CSVToBinaryBlockFunction(mc, sparse, hasHeader, delim, fill, fillValue));
    // aggregate partial matrix blocks (w/ preferred number of output
    // partitions as the data is likely smaller in binary block format,
    // but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
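A minimal usage sketch, assuming a JavaSparkContext sc is available and the CSV file is read via the old Hadoop mapred TextInputFormat; the path, delimiter, and block sizes are placeholders, and the matrix characteristics are left unknown so the converter derives them.

// hypothetical caller: read a comma-separated file and convert it to binary blocks
JavaPairRDD<LongWritable, Text> csv = sc.hadoopFile("hdfs:/tmp/X.csv",
    TextInputFormat.class, LongWritable.class, Text.class);
MatrixCharacteristics mc = new MatrixCharacteristics(-1, -1, 1000, 1000, -1); // dims and nnz unknown
JavaPairRDD<MatrixIndexes, MatrixBlock> X = RDDConverterUtils.csvToBinaryBlock(
    sc, csv, mc, false /*hasHeader*/, ",", false /*fill*/, 0 /*fillValue*/);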
Use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache: class RDDConverterUtils, method dataFrameToBinaryBlock.
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
    Dataset<Row> df, MatrixCharacteristics mc, boolean containsID, boolean isVector) {
    // determine unknown dimensions and sparsity if required
    if (!mc.dimsKnown(true)) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
        long rlen = tmp.count();
        long clen = !isVector ? df.columns().length - (containsID ? 1 : 0) :
            ((Vector) tmp.first().get(containsID ? 1 : 0)).size();
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), nnz);
    }
    // ensure valid blocksizes
    if (mc.getRowsPerBlock() <= 1 || mc.getColsPerBlock() <= 1) {
        mc.setBlockSize(ConfigurationManager.getBlocksize());
    }
    // construct or reuse row ids
    JavaPairRDD<Row, Long> prepinput = containsID ?
        df.javaRDD().mapToPair(new DataFrameExtractIDFunction(df.schema().fieldIndex(DF_ID_COLUMN))) :
        df.javaRDD().zipWithIndex(); // zip row index
    // convert dataframe rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(
        new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));
    // aggregate partial matrix blocks (w/ preferred number of output
    // partitions as the data is likely smaller in binary block format,
    // but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
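A minimal usage sketch, assuming a small two-column double DataFrame built in memory; the SparkSession, schema, column names, and data are illustrative assumptions, with the usual Spark SQL imports implied.

// hypothetical caller: convert a 2x2 double DataFrame (no ID column, no Vector column)
SparkSession spark = SparkSession.builder().appName("df2bb").master("local[*]").getOrCreate();
StructType schema = new StructType()
    .add("c1", DataTypes.DoubleType)
    .add("c2", DataTypes.DoubleType);
Dataset<Row> df = spark.createDataFrame(Arrays.asList(
    RowFactory.create(1.0, 2.0), RowFactory.create(3.0, 4.0)), schema);
MatrixCharacteristics mc = new MatrixCharacteristics(); // dims and blocksizes derived during conversion
JavaPairRDD<MatrixIndexes, MatrixBlock> X = RDDConverterUtils.dataFrameToBinaryBlock(
    new JavaSparkContext(spark.sparkContext()), df, mc, false /*containsID*/, false /*isVector*/);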
Use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache: class RDDConverterUtils, method libsvmToBinaryBlock.
/**
* Converts a libsvm text input file into two binary-block matrices, one for the features
* and one for the labels, and saves them to the specified output files. This call also
* deletes any existing files at the output locations, and determines and writes the
* metadata (.mtd) files of both output matrices.
* <p>
* Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
* the libsvm input files in order to ensure consistency with Spark.
*
* @param sc java spark context
* @param pathIn path to libsvm input file
* @param pathX path to binary block output file of features
* @param pathY path to binary block output file of labels
* @param mcOutX matrix characteristics of output matrix X
*/
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn,
    String pathX, String pathY, MatrixCharacteristics mcOutX) {
    if (!mcOutX.dimsKnown())
        throw new DMLRuntimeException("Matrix characteristics "
            + "required to convert sparse input representation.");
    try {
        // cleanup existing output files
        MapReduceTool.deleteFileIfExistOnHDFS(pathX);
        MapReduceTool.deleteFileIfExistOnHDFS(pathY);
        // convert libsvm to labeled points
        int numFeatures = (int) mcOutX.getCols();
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
        JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints =
            MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();
        // append row index and best-effort caching to avoid repeated text parsing
        JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint, Long> ilpoints =
            lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());
        // extract labels and convert to binary block
        MatrixCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1,
            mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out1 =
            ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
        int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
        out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
        out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        // update nnz after triggered save
        mc1.setNonZeros(aNnz1.value());
        MapReduceTool.writeMetaDataFile(pathY + ".mtd", ValueType.DOUBLE, mc1, OutputInfo.BinaryBlockOutputInfo);
        // extract data and convert to binary block
        MatrixCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(),
            mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out2 =
            ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
        out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
        out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        // update nnz after triggered save
        mc2.setNonZeros(aNnz2.value());
        MapReduceTool.writeMetaDataFile(pathX + ".mtd", ValueType.DOUBLE, mc2, OutputInfo.BinaryBlockOutputInfo);
        // asynchronous cleanup of cached intermediates
        ilpoints.unpersist(false);
    } catch (IOException ex) {
        throw new DMLRuntimeException(ex);
    }
}
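A minimal usage sketch, assuming a JavaSparkContext sc is available and the dataset dimensions are already known (the method requires them); the paths and sizes below are placeholders.

// hypothetical caller: split a libsvm file into a feature matrix X and a label vector y
long numRows = 10000, numCols = 784; // assumed dataset dimensions
MatrixCharacteristics mcX = new MatrixCharacteristics(numRows, numCols, 1000, 1000, -1);
RDDConverterUtils.libsvmToBinaryBlock(sc, "hdfs:/tmp/data.libsvm", "hdfs:/tmp/X", "hdfs:/tmp/y", mcX);
// hdfs:/tmp/X(.mtd) and hdfs:/tmp/y(.mtd) now hold the binary-block outputs and their metadata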
Use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache: class RDDConverterUtilsExt, method convertPy4JArrayToMB.
public static MatrixBlock convertPy4JArrayToMB(byte[] data, int rlen, int clen, boolean isSparse) {
    MatrixBlock mb = new MatrixBlock(rlen, clen, isSparse, -1);
    if (isSparse) {
        throw new DMLRuntimeException("Conversion to sparse format not supported");
    } else {
        // guard against int overflow of rlen*clen before allocating the dense block
        long limit = (long) rlen * clen;
        if (limit > Integer.MAX_VALUE)
            throw new DMLRuntimeException("Dense NumPy array of size " + limit + " cannot be converted to MatrixBlock");
        double[] denseBlock = new double[(int) limit];
        ByteBuffer buf = ByteBuffer.wrap(data);
        buf.order(ByteOrder.nativeOrder());
        for (int i = 0; i < limit; i++) {
            denseBlock[i] = buf.getDouble();
        }
        mb.init(denseBlock, rlen, clen);
    }
    mb.recomputeNonZeros();
    mb.examSparsity();
    return mb;
}
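A minimal round-trip sketch, illustrative only: it packs a 2x2 row-major double array into native-order bytes, as the NumPy/Py4J side would, and rebuilds a dense MatrixBlock from them.

// hypothetical caller: rebuild a 2x2 dense MatrixBlock from native-order double bytes
double[] vals = {1.0, 2.0, 3.0, 4.0};
ByteBuffer buf = ByteBuffer.allocate(vals.length * Double.BYTES).order(ByteOrder.nativeOrder());
for (double v : vals)
    buf.putDouble(v);
MatrixBlock mb = RDDConverterUtilsExt.convertPy4JArrayToMB(buf.array(), 2, 2, false);
System.out.println(mb.getNumRows() + " x " + mb.getNumColumns() + ", nnz = " + mb.getNonZeros());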