Use of org.apache.spark.api.java.JavaPairRDD in project systemml by apache.
The class DataPartitionerRemoteSpark, method partitionMatrix.
@Override
@SuppressWarnings("unchecked")
protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi, long rlen, long clen, int brlen, int bclen) {
    String jobname = "ParFor-DPSP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    SparkExecutionContext sec = (SparkExecutionContext) _ec;
    try {
        // cleanup existing output files
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        // get input rdd
        JavaPairRDD<MatrixIndexes, MatrixBlock> inRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
            sec.getRDDHandleForMatrixObject(in, InputInfo.BinaryBlockInputInfo);
        // determine degree of parallelism
        MatrixCharacteristics mc = in.getMatrixCharacteristics();
        int numRed = (int) determineNumReducers(inRdd, mc, _numRed);
        // run spark remote data partition job
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, _format, _n);
        DataPartitionerRemoteSparkReducer wfun = new DataPartitionerRemoteSparkReducer(fnameNew, oi, _replication);
        inRdd.flatMapToPair(dpfun)  // partition the input blocks
            .groupByKey(numRed)     // group partition blocks
            .foreach(wfun);         // write partitions to hdfs
    }
    catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    // maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
        Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }
}
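The heart of this job is a single JavaPairRDD pipeline: flatMapToPair with the partitioner, groupByKey with an explicit reducer count, and a side-effecting foreach that writes each group out. Below is a minimal, self-contained sketch of that same pattern on plain string/integer pairs; the class name, local[*] master, sample data, and the doubled-value emission are illustrative assumptions, not SystemML code.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class GroupAndWriteSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("GroupAndWriteSketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<String, Integer> input = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, Integer>("a", 1),
                new Tuple2<String, Integer>("b", 2),
                new Tuple2<String, Integer>("a", 3)));
            // degree of parallelism, analogous to determineNumReducers(...)
            int numRed = 2;
            input
                // flatMapToPair: each input record may emit zero or more (key, value) pairs
                .flatMapToPair(t -> Arrays.asList(
                    new Tuple2<String, Integer>(t._1, t._2),
                    new Tuple2<String, Integer>(t._1, t._2 * 10)).iterator())
                // groupByKey: shuffle all values of one key into a single group, using numRed partitions
                .groupByKey(numRed)
                // foreach: side-effecting action per group (the SystemML reducer writes partitions to hdfs here)
                .foreach(t -> System.out.println(t._1 + " -> " + t._2));
        }
    }
}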
Use of org.apache.spark.api.java.JavaPairRDD in project systemml by apache.
The class SparkExecutionContext, method repartitionAndCacheMatrixObject.
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject(String var) {
    MatrixObject mo = getMatrixObject(var);
    MatrixCharacteristics mcIn = mo.getMatrixCharacteristics();
    // double check size to avoid unnecessary spark context creation
    if (!OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(), (double) OptimizerUtils.estimateSizeExactSparsity(mcIn)))
        return;
    // get input rdd and default storage level
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
        getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);
    // avoid unnecessary caching of input in order to reduce memory pressure
    if (mo.getRDDHandle().allowsShortCircuitRead() && isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id())) {
        in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        // investigate issue of unnecessarily large number of partitions
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
        if (numPartitions < in.getNumPartitions())
            in = in.coalesce(numPartitions);
    }
    // repartition rdd (force creation of shuffled rdd via merge); no deep copy is needed even though
    // this executes on the original data, because there are no key duplicates and hence no merge
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);
    // convert mcsr into memory-efficient csr if potentially sparse
    if (OptimizerUtils.checkSparseBlockCSRConversion(mcIn)) {
        out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
    }
    // persist rdd in default storage level
    out.persist(Checkpoint.DEFAULT_STORAGE_LEVEL).count();
    // create new rdd handle, in place of the current matrix object
    RDDObject inro = mo.getRDDHandle(); // guaranteed to exist (see above)
    RDDObject outro = new RDDObject(out); // create new rdd object
    outro.setCheckpointRDD(true); // mark as checkpointed
    outro.addLineageChild(inro); // keep lineage to prevent cycles on cleanup
    mo.setRDDHandle(outro);
}
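Stripped of SystemML's lineage and handle bookkeeping, the caching idiom above is: coalesce only when it reduces the partition count, transform the values, persist with an explicit storage level, and force materialization with count(). Below is a minimal sketch of that idiom, assuming toy data and an illustrative preferredPartitions value in place of SparkUtils.getNumPreferredPartitions.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;

public class RepartitionAndCacheSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("RepartitionAndCacheSketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<Long, double[]> in = sc.parallelizePairs(Arrays.asList(
                new Tuple2<Long, double[]>(1L, new double[]{1, 2}),
                new Tuple2<Long, double[]>(2L, new double[]{3, 4})), 8);
            // coalesce only if it reduces the number of partitions (cheaper than a full repartition)
            int preferredPartitions = 2; // illustrative; SystemML derives this from the matrix characteristics
            if (preferredPartitions < in.getNumPartitions())
                in = in.coalesce(preferredPartitions);
            // transform values, standing in for the CSR conversion step
            JavaPairRDD<Long, double[]> out = in.mapValues(v -> v.clone());
            // persist with an explicit storage level and force materialization via an action
            out.persist(StorageLevel.MEMORY_AND_DISK()).count();
        }
    }
}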
Use of org.apache.spark.api.java.JavaPairRDD in project systemml by apache.
The class SparkUtils, method getEmptyBlockRDD.
/**
 * Creates an RDD of empty blocks according to the given matrix characteristics. This is
 * done in a scalable manner by parallelizing block ranges and generating empty blocks
 * in a distributed manner, under awareness of preferred output partition sizes.
 *
 * @param sc spark context
 * @param mc matrix characteristics
 * @return pair rdd of empty matrix blocks
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> getEmptyBlockRDD(JavaSparkContext sc, MatrixCharacteristics mc) {
    // compute degree of parallelism and block ranges
    long size = mc.getNumBlocks() * OptimizerUtils.estimateSizeEmptyBlock(
        Math.min(Math.max(mc.getRows(), 1), mc.getRowsPerBlock()),
        Math.min(Math.max(mc.getCols(), 1), mc.getColsPerBlock()));
    int par = (int) Math.min(Math.max(SparkExecutionContext.getDefaultParallelism(true),
        Math.ceil(size / InfrastructureAnalyzer.getHDFSBlockSize())), mc.getNumBlocks());
    long pNumBlocks = (long) Math.ceil((double) mc.getNumBlocks() / par);
    // generate block offsets per partition
    List<Long> offsets = LongStream.iterate(0, n -> n + pNumBlocks)
        .limit(par).boxed().collect(Collectors.toList());
    // parallelize offsets and generate all empty blocks
    return (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.parallelize(offsets, par)
        .flatMapToPair(new GenerateEmptyBlocks(mc, pNumBlocks));
}
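The scalable part of getEmptyBlockRDD is that only a short list of block offsets is materialized on the driver; a PairFlatMapFunction then expands each offset into its range of blocks on the executors. The sketch below shows that expansion pattern with a simplified GenerateRange function and string payloads standing in for GenerateEmptyBlocks and MatrixBlock; all names and sizes are illustrative assumptions.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;

public class EmptyBlockSketch {
    // expands one offset into up to pNumBlocks (index, payload) pairs on the executor side
    private static class GenerateRange implements PairFlatMapFunction<Long, Long, String> {
        private final long _numBlocks;
        private final long _pNumBlocks;
        public GenerateRange(long numBlocks, long pNumBlocks) {
            _numBlocks = numBlocks;
            _pNumBlocks = pNumBlocks;
        }
        @Override
        public Iterator<Tuple2<Long, String>> call(Long offset) {
            List<Tuple2<Long, String>> out = new ArrayList<>();
            for (long i = offset; i < Math.min(offset + _pNumBlocks, _numBlocks); i++)
                out.add(new Tuple2<>(i, "empty-block-" + i));
            return out.iterator();
        }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("EmptyBlockSketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            long numBlocks = 10;
            int par = 3;
            long pNumBlocks = (long) Math.ceil((double) numBlocks / par);
            // only the per-partition offsets are materialized on the driver
            List<Long> offsets = LongStream.iterate(0, n -> n + pNumBlocks)
                .limit(par).boxed().collect(Collectors.toList());
            JavaPairRDD<Long, String> blocks =
                sc.parallelize(offsets, par).flatMapToPair(new GenerateRange(numBlocks, pNumBlocks));
            System.out.println("generated blocks: " + blocks.count());
        }
    }
}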
Use of org.apache.spark.api.java.JavaPairRDD in project mm-dev by sbl-sdsc.
The class StructureAligner, method getAllVsAllAlignments.
/**
 * Calculates all vs. all structural alignments of protein chains using the
 * specified alignment algorithm. The input structures must contain single
 * protein chains.
 *
 * @param targets structures containing single protein chains
 * @param alignmentAlgorithm name of the algorithm
 * @return dataset with alignment metrics
 */
public static Dataset<Row> getAllVsAllAlignments(JavaPairRDD<String, StructureDataInterface> targets, String alignmentAlgorithm) {
    SparkSession session = SparkSession.builder().getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(session.sparkContext());
    // create a list of chainName / C-alpha coordinate pairs
    List<Tuple2<String, Point3d[]>> chains = targets.mapValues(
        s -> new ColumnarStructureX(s, true).getcAlphaCoordinates()).collect();
    // create an RDD of all pair indices (0,1), (0,2), ..., (1,2), (1,3), ...
    JavaRDD<Tuple2<Integer, Integer>> pairs = getPairs(sc, chains.size());
    // calculate structural alignments for all pairs.
    // broadcast (copy) chains to all worker nodes for efficient processing.
    // for each pair there can be zero or more solutions, therefore we flatMap the pairs.
    JavaRDD<Row> rows = pairs.flatMap(new StructuralAlignmentMapper(sc.broadcast(chains), alignmentAlgorithm));
    // convert rows to a dataset
    return session.createDataFrame(rows, getSchema());
}
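The all-vs-all pattern above boils down to: collect the (small) per-chain data on the driver, broadcast it to the executors, and flatMap over index pairs so that each pair can emit zero or more result records. The sketch below shows that broadcast-and-flatMap structure over plain strings; getPairs, the comparison logic, and the sample data are illustrative placeholders rather than the mm-dev implementation.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

public class AllVsAllSketch {
    // enumerate all index pairs (i, j) with i < j
    private static List<Tuple2<Integer, Integer>> getPairs(int n) {
        List<Tuple2<Integer, Integer>> pairs = new ArrayList<>();
        for (int i = 0; i < n; i++)
            for (int j = i + 1; j < n; j++)
                pairs.add(new Tuple2<>(i, j));
        return pairs;
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("AllVsAllSketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            List<String> chains = Arrays.asList("chainA", "chainB", "chainC");
            // broadcast the small collected data once instead of shipping it with every task
            Broadcast<List<String>> bc = sc.broadcast(chains);
            JavaRDD<Tuple2<Integer, Integer>> pairs = sc.parallelize(getPairs(chains.size()));
            // flatMap: each pair may produce zero or more results (here: one comparison string or none)
            JavaRDD<String> results = pairs.flatMap(p -> {
                String a = bc.value().get(p._1);
                String b = bc.value().get(p._2);
                String res = a.equals(b) ? null : a + " vs " + b;
                return (res == null ? Collections.<String>emptyList() : Collections.singletonList(res)).iterator();
            });
            results.collect().forEach(System.out::println);
        }
    }
}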
Use of org.apache.spark.api.java.JavaPairRDD in project mm-dev by sbl-sdsc.
The class D3RLigandProteinMerger, method main.
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();
    // instantiate Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("D3RLigandProteinMerger");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // String path = "/Users/peter/Downloads/Pose_prediction/417-1-hciq4/";
    String path = "/Users/peter/Downloads/Pose_prediction/";
    // import ligands and proteins, keyed by file name without extension
    JavaPairRDD<String, StructureDataInterface> ligands = MolImporter.importMolFiles(path, sc);
    ligands = ligands.mapToPair(t -> new Tuple2<String, StructureDataInterface>(removeExtension(t._1), t._2));
    JavaPairRDD<String, StructureDataInterface> proteins = MmtfImporter.importPdbFiles(path, sc);
    proteins = proteins.mapToPair(t -> new Tuple2<String, StructureDataInterface>(removeExtension(t._1), t._2));
    // join proteins with their ligands and merge each pair into a single complex
    JavaPairRDD<String, Tuple2<StructureDataInterface, StructureDataInterface>> pairs = proteins.join(ligands);
    JavaPairRDD<String, StructureDataInterface> complexes = pairs.mapToPair(
        t -> new Tuple2<String, StructureDataInterface>(t._1, MergeMmtf.MergeStructures(t._1, t._2._1, t._2._2)));
    complexes.foreach(t -> TraverseStructureHierarchy.printChainInfo(t._2));
    // System.out.println("Complexes: " + complexes.count());
    // complexes.keys().foreach(k -> System.out.println(k));
    // TraverseStructureHierarchy.printChainInfo(complexes.first()._2);
    sc.close();
    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");
}
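The merging step is a standard JavaPairRDD idiom: normalize the keys on both sides (here by stripping the file extension), join the two pair RDDs, and mapToPair over the joined (protein, ligand) tuples to build one merged value per key. Below is a minimal sketch of that normalize-join-merge pattern with strings in place of StructureDataInterface; the file names and the string concatenation standing in for MergeMmtf.MergeStructures are illustrative assumptions.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class JoinAndMergeSketch {
    // strip the file extension so both RDDs share the same key
    private static String removeExtension(String name) {
        int i = name.lastIndexOf('.');
        return i < 0 ? name : name.substring(0, i);
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("JoinAndMergeSketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<String, String> proteins = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, String>("417-1.pdb", "proteinData")));
            JavaPairRDD<String, String> ligands = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, String>("417-1.mol", "ligandData")));
            // normalize keys on both sides before joining
            proteins = proteins.mapToPair(t -> new Tuple2<String, String>(removeExtension(t._1), t._2));
            ligands = ligands.mapToPair(t -> new Tuple2<String, String>(removeExtension(t._1), t._2));
            // join yields (key, (protein, ligand)) for keys present in both RDDs
            JavaPairRDD<String, Tuple2<String, String>> pairs = proteins.join(ligands);
            // merge the joined values into a single complex per key
            JavaPairRDD<String, String> complexes =
                pairs.mapToPair(t -> new Tuple2<String, String>(t._1, t._2._1 + "+" + t._2._2));
            complexes.foreach(t -> System.out.println(t._1 + ": " + t._2));
        }
    }
}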