Use of org.apache.spark.util.LongAccumulator in project incubator-systemml by Apache.
The class WriteSPInstruction, method processMatrixWriteInstruction:
protected void processMatrixWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi)
        throws DMLRuntimeException, IOException {
    //get input rdd
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

    if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
        //piggyback nnz maintenance on write
        LongAccumulator aNnz = null;
        if (isInputMatrixBlock && !mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }

        JavaRDD<String> header = null;
        if (oi == OutputInfo.MatrixMarketOutputInfo) {
            ArrayList<String> headerContainer = new ArrayList<String>(1);
            // First output MM header, then number of rows, number of columns, and number of nnz
            String headerStr = "%%MatrixMarket matrix coordinate real general\n"
                + mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
            headerContainer.add(headerStr);
            header = sec.getSparkContext().parallelize(headerContainer);
        }

        JavaRDD<String> ijv = RDDConverterUtils.binaryBlockToTextCell(in1, mc);
        if (header != null)
            customSaveTextFile(header.union(ijv), fname, true);
        else
            customSaveTextFile(ijv, fname, false);

        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros(aNnz.value());
    } else if (oi == OutputInfo.CSVOutputInfo) {
        JavaRDD<String> out = null;
        LongAccumulator aNnz = null;

        if (isInputMatrixBlock) {
            //piggyback nnz computation on actual write
            if (!mc.nnzKnown()) {
                aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
                in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
            }
            out = RDDConverterUtils.binaryBlockToCsv(in1, mc, (CSVFileFormatProperties) formatProperties, true);
        } else {
            // This case is applicable when the CSV output from transform() is written out
            // TODO remove once transform over frames supported
            @SuppressWarnings("unchecked")
            JavaPairRDD<Long, String> rdd = (JavaPairRDD<Long, String>)
                (sec.getMatrixObject(input1.getName())).getRDDHandle().getRDD();
            out = rdd.values();

            String sep = ",";
            boolean hasHeader = false;
            if (formatProperties != null) {
                sep = ((CSVFileFormatProperties) formatProperties).getDelim();
                hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader();
            }
            if (hasHeader) {
                // generate default column names C1 .. C<ncol> for the header line
                StringBuffer buf = new StringBuffer();
                for (int j = 1; j <= mc.getCols(); j++) {
                    if (j != 1) {
                        buf.append(sep);
                    }
                    buf.append("C" + j);
                }
                ArrayList<String> headerContainer = new ArrayList<String>(1);
                headerContainer.add(0, buf.toString());
                JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer);
                out = header.union(out);
            }
        }

        customSaveTextFile(out, fname, false);

        if (isInputMatrixBlock && !mc.nnzKnown())
            mc.setNonZeros(aNnz.value().longValue());
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        //piggyback nnz computation on actual write
        LongAccumulator aNnz = null;
        if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }

        //save binary block rdd on hdfs
        in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);

        if (!mc.nnzKnown())
            mc.setNonZeros(aNnz.value().longValue());
    } else {
        //unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }

    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
}
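Every branch above follows the same pattern: register a LongAccumulator on the SparkContext, increment it inside a map pass (here ComputeBinaryBlockNnzFunction), and let the subsequent save action run the job that populates it, so the nnz statistic is obtained "for free" during the write. Below is a minimal, self-contained sketch of that pattern in plain Spark, without any SystemML types; the class name, sample data, output path, and nonzero counting are purely illustrative.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;

public class NnzOnWriteSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "nnz-on-write");

        // register the accumulator on the underlying SparkContext, as in the instruction above
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");

        JavaRDD<double[]> rows = sc.parallelize(Arrays.asList(
            new double[]{ 1.0, 0.0, 2.0 },
            new double[]{ 0.0, 0.0, 3.0 }));

        // piggyback the nnz count on the pass that formats the output lines
        JavaRDD<String> text = rows.map(r -> {
            StringBuilder sb = new StringBuilder();
            long nnz = 0;
            for (double v : r) {
                if (v != 0) nnz++;
                sb.append(v).append(' ');
            }
            aNnz.add(nnz);
            return sb.toString().trim();
        });

        // the save is the action that runs the job and thereby fills the accumulator
        text.saveAsTextFile("/tmp/nnz-on-write-out"); // illustrative output path
        System.out.println("nnz = " + aNnz.value());

        sc.stop();
    }
}

Note that the accumulator is only reliable after an action has executed; reading aNnz.value() before saveAsTextFile would return 0 because no tasks have run yet.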
Use of org.apache.spark.util.LongAccumulator in project incubator-systemml by Apache.
The class SparkExecutionContext, method writeRDDtoHDFS:
@SuppressWarnings("unchecked")
public static long writeRDDtoHDFS(RDDObject rdd, String path, OutputInfo oinfo) {
    JavaPairRDD<MatrixIndexes, MatrixBlock> lrdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd.getRDD();

    //piggyback nnz maintenance on write
    LongAccumulator aNnz = getSparkContextStatic().sc().longAccumulator("nnz");
    lrdd = lrdd.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));

    //save file is an action which also triggers nnz maintenance
    lrdd.saveAsHadoopFile(path, oinfo.outputKeyClass, oinfo.outputValueClass, oinfo.outputFormatClass);

    //return nnz aggregate of all blocks
    return aNnz.value();
}
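Since writeRDDtoHDFS returns the accumulator value after the save action, a caller can use it directly to complete the matrix meta data. A hedged usage sketch follows; the RDDObject handle is assumed to come from the surrounding runtime (e.g. via MatrixObject.getRDDHandle(), as used above), and the output path and helper name are illustrative.

// illustrative helper, not part of SystemML
public static void exportBinaryBlock(RDDObject rddHandle, MatrixCharacteristics mc) {
    long nnz = SparkExecutionContext.writeRDDtoHDFS(rddHandle,
        "hdfs:/tmp/X.bin", OutputInfo.BinaryBlockOutputInfo);
    // the save action has run at this point, so the accumulator-based nnz is final
    mc.setNonZeros(nnz);
}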
Use of org.apache.spark.util.LongAccumulator in project incubator-systemml by Apache.
The class RDDConverterUtils, method libsvmToBinaryBlock:
/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 * @throws DMLRuntimeException if output path not writable or conversion failure
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn,
        String pathX, String pathY, MatrixCharacteristics mcOutX)
        throws DMLRuntimeException {
    if (!mcOutX.dimsKnown())
        throw new DMLRuntimeException("Matrix characteristics "
            + "required to convert sparse input representation.");
    try {
        //cleanup existing output files
        MapReduceTool.deleteFileIfExistOnHDFS(pathX);
        MapReduceTool.deleteFileIfExistOnHDFS(pathY);

        //convert libsvm to labeled points
        int numFeatures = (int) mcOutX.getCols();
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
        JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints =
            MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();

        //append row index and best-effort caching to avoid repeated text parsing
        JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint, Long> ilpoints =
            lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());

        //extract labels and convert to binary block
        MatrixCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1,
            mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out1 =
            ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
        int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
        out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
        out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);

        //update nnz after triggered save
        mc1.setNonZeros(aNnz1.value());
        MapReduceTool.writeMetaDataFile(pathY + ".mtd", ValueType.DOUBLE, mc1, OutputInfo.BinaryBlockOutputInfo);

        //extract data and convert to binary block
        MatrixCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(),
            mcOutX.getRowsPerBlock(), mcOutX.getColsPerBlock(), -1);
        LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes, MatrixBlock> out2 =
            ilpoints.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
        out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
        out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);

        //update nnz after triggered save
        mc2.setNonZeros(aNnz2.value());
        MapReduceTool.writeMetaDataFile(pathX + ".mtd", ValueType.DOUBLE, mc2, OutputInfo.BinaryBlockOutputInfo);

        //asynchronous cleanup of cached intermediates
        ilpoints.unpersist(false);
    } catch (IOException ex) {
        throw new DMLRuntimeException(ex);
    }
}
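A hedged usage sketch of this converter follows. The wrapper name, HDFS paths, dimensions, and block sizes are illustrative only; the feature-matrix dimensions must be supplied up front because the method rejects unknown dimensions, and the nnz values of both outputs are filled in from the accumulators and written to the .mtd files by the call itself.

// illustrative wrapper, not part of SystemML
public static void convertLibsvm(JavaSparkContext sc) throws DMLRuntimeException {
    // dimensions of the feature matrix X must be known; 10000 x 784 with
    // 1000 x 1000 blocks is an illustrative choice, as are the paths below
    MatrixCharacteristics mcX = new MatrixCharacteristics(10000, 784, 1000, 1000, -1);
    RDDConverterUtils.libsvmToBinaryBlock(sc,
        "hdfs:/tmp/data.libsvm", // libsvm input
        "hdfs:/tmp/X.bin",       // features, written as binary block plus X.bin.mtd
        "hdfs:/tmp/y.bin",       // labels, written as binary block plus y.bin.mtd
        mcX);
}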
Use of org.apache.spark.util.LongAccumulator in project incubator-systemml by Apache.
The class RDDConverterUtils, method dataFrameToBinaryBlock:
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
        Dataset<Row> df, MatrixCharacteristics mc, boolean containsID, boolean isVector) {
    //determine unknown dimensions and sparsity if required
    if (!mc.dimsKnown(true)) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
        long rlen = tmp.count();
        long clen = !isVector ? df.columns().length - (containsID ? 1 : 0) :
            ((Vector) tmp.first().get(containsID ? 1 : 0)).size();
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), nnz);
    }

    //ensure valid blocksizes
    if (mc.getRowsPerBlock() <= 1 || mc.getColsPerBlock() <= 1) {
        mc.setBlockSize(ConfigurationManager.getBlocksize());
    }

    //construct or reuse row ids
    JavaPairRDD<Row, Long> prepinput = containsID ?
        df.javaRDD().mapToPair(new DataFrameExtractIDFunction(df.schema().fieldIndex(DF_ID_COLUMN))) :
        df.javaRDD().zipWithIndex(); //zip row index

    //convert dataframe rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out =
        prepinput.mapPartitionsToPair(new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));

    //aggregate partial matrix blocks (w/ preferred number of output
    //partitions as the data is likely smaller in binary block format,
    //but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Use of org.apache.spark.util.LongAccumulator in project incubator-systemml by Apache.
The class RDDConverterUtils, method csvToBinaryBlock:
public static JavaPairRDD<MatrixIndexes, MatrixBlock> csvToBinaryBlock(JavaSparkContext sc,
        JavaPairRDD<LongWritable, Text> input, MatrixCharacteristics mc,
        boolean hasHeader, String delim, boolean fill, double fillValue)
        throws DMLRuntimeException {
    //determine unknown dimensions and sparsity if required
    if (!mc.dimsKnown(true)) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<String> tmp = input.values().map(new CSVAnalysisFunction(aNnz, delim));
        long rlen = tmp.count() - (hasHeader ? 1 : 0);
        long clen = tmp.first().split(delim).length;
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), nnz);
    }

    //prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text, Long> prepinput = input.values().zipWithIndex();

    //convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out =
        prepinput.mapPartitionsToPair(new CSVToBinaryBlockFunction(mc, sparse, hasHeader, delim, fill, fillValue));

    //aggregate partial matrix blocks (w/ preferred number of output
    //partitions as the data is likely smaller in binary block format,
    //but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}