Use of org.apache.spark.HashPartitioner in project gatk by broadinstitute.
Class CoverageModelEMWorkspace, method joinWithWorkersAndMap:
/**
 * A generic function for dispatching a blockified list of objects to their corresponding compute nodes.
 *
 * If Spark is enabled:
 *
 * Joins an instance of {@code List<Tuple2<LinearlySpacedIndexBlock, V>>} with {@link #computeRDD}, calls the
 * provided map function {@code mapper} on the joined RDD, and replaces the reference to the old RDD with the
 * new RDD.
 *
 * If Spark is disabled:
 *
 * Only a single target-space block is assumed, such that {@code data} is a singleton. The map function
 * {@code mapper} is called on the value contained in {@code data} and {@link #localComputeBlock}, and
 * the old instance of {@link CoverageModelEMComputeBlock} is replaced with the new instance returned
 * by {@code mapper}.
 *
 * @param data the list to be joined and mapped together with the compute block(s)
 * @param mapper a binary mapper function that takes a compute block together with an object of type {@code V}
 *               and returns a new compute block
 * @param <V> the type of the objects to be broadcast
 */
@UpdatesRDD
private <V> void joinWithWorkersAndMap(@Nonnull final List<Tuple2<LinearlySpacedIndexBlock, V>> data,
                                       @Nonnull final Function<Tuple2<CoverageModelEMComputeBlock, V>, CoverageModelEMComputeBlock> mapper) {
    if (sparkContextIsAvailable) {
        final JavaPairRDD<LinearlySpacedIndexBlock, V> newRDD = ctx.parallelizePairs(data, numTargetBlocks)
                .partitionBy(new HashPartitioner(numTargetBlocks));
        computeRDD = computeRDD.join(newRDD).mapValues(mapper);
    } else {
        try {
            Utils.validateArg(data.size() == 1, "Only a single data block is expected in the local mode");
            localComputeBlock = mapper.call(new Tuple2<>(localComputeBlock, data.get(0)._2));
        } catch (final Exception e) {
            throw new RuntimeException("Cannot apply the map function to the local compute block", e);
        }
    }
}
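
For context, the following is a minimal, self-contained sketch of the Spark branch above, not the GATK implementation itself: a small per-block payload RDD is partitioned with the same HashPartitioner as the compute RDD before the join, so the two sides are co-partitioned and the join does not need to reshuffle the larger compute RDD. The Integer keys and String/Integer values are placeholders invented for this illustration, standing in for LinearlySpacedIndexBlock, CoverageModelEMComputeBlock, and V.

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class CoPartitionedJoinSketch {
    public static void main(String[] args) {
        final JavaSparkContext ctx = new JavaSparkContext("local[*]", "co-partitioned-join-sketch");
        final int numBlocks = 2;
        final HashPartitioner partitioner = new HashPartitioner(numBlocks);

        // "compute" RDD keyed by block id, analogous to computeRDD
        final JavaPairRDD<Integer, String> computeRDD = ctx.parallelizePairs(
                Arrays.asList(new Tuple2<>(0, "block-0-state"), new Tuple2<>(1, "block-1-state")),
                numBlocks).partitionBy(partitioner).cache();

        // small per-block payload, analogous to the "data" argument
        final JavaPairRDD<Integer, Integer> dataRDD = ctx.parallelizePairs(
                Arrays.asList(new Tuple2<>(0, 10), new Tuple2<>(1, 20)),
                numBlocks).partitionBy(partitioner);

        // join on the shared partitioner and map each (state, payload) pair to a new state
        final JavaPairRDD<Integer, String> updated = computeRDD.join(dataRDD)
                .mapValues(pair -> pair._1 + "+updated-with-" + pair._2);

        updated.collect().forEach(t -> System.out.println(t._1 + " -> " + t._2));
        ctx.stop();
    }
}

Since mapValues preserves the partitioner of its parent RDD, repeated rounds of this join-and-map update can keep reusing the same partitioning.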
Use of org.apache.spark.HashPartitioner in project gatk-protected by broadinstitute.
Class CoverageModelEMWorkspace, method joinWithWorkersAndMap:
/**
 * A generic function for dispatching a blockified list of objects to their corresponding compute nodes.
 *
 * If Spark is enabled:
 *
 * Joins an instance of {@code List<Tuple2<LinearlySpacedIndexBlock, V>>} with {@link #computeRDD}, calls the
 * provided map function {@code mapper} on the joined RDD, and replaces the reference to the old RDD with the
 * new RDD.
 *
 * If Spark is disabled:
 *
 * Only a single target-space block is assumed, such that {@code data} is a singleton. The map function
 * {@code mapper} is called on the value contained in {@code data} and {@link #localComputeBlock}, and
 * the old instance of {@link CoverageModelEMComputeBlock} is replaced with the new instance returned
 * by {@code mapper}.
 *
 * @param data the list to be joined and mapped together with the compute block(s)
 * @param mapper a binary mapper function that takes a compute block together with an object of type {@code V}
 *               and returns a new compute block
 * @param <V> the type of the objects to be broadcast
 */
@UpdatesRDD
private <V> void joinWithWorkersAndMap(@Nonnull final List<Tuple2<LinearlySpacedIndexBlock, V>> data,
                                       @Nonnull final Function<Tuple2<CoverageModelEMComputeBlock, V>, CoverageModelEMComputeBlock> mapper) {
    if (sparkContextIsAvailable) {
        final JavaPairRDD<LinearlySpacedIndexBlock, V> newRDD = ctx.parallelizePairs(data, numTargetBlocks)
                .partitionBy(new HashPartitioner(numTargetBlocks));
        computeRDD = computeRDD.join(newRDD).mapValues(mapper);
    } else {
        try {
            Utils.validateArg(data.size() == 1, "Only a single data block is expected in the local mode");
            localComputeBlock = mapper.call(new Tuple2<>(localComputeBlock, data.get(0)._2));
        } catch (final Exception e) {
            throw new RuntimeException("Cannot apply the map function to the local compute block", e);
        }
    }
}
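
As a complement, here is a minimal sketch of the local (non-Spark) branch: the same org.apache.spark.api.java.function.Function that would act as the RDD mapper is applied directly to a (block, payload) tuple via call(). The String and Integer stand-ins and the class name are invented for this illustration; the real code passes a CoverageModelEMComputeBlock and a value of type V.

import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

public class LocalMapperSketch {
    public static void main(String[] args) throws Exception {
        // the mapper: takes (current block state, payload) and returns the new block state
        final Function<Tuple2<String, Integer>, String> mapper =
                pair -> pair._1 + "+updated-with-" + pair._2;

        String localComputeBlock = "block-0-state";
        final Integer payload = 42;

        // local mode: apply the mapper directly instead of joining RDDs
        localComputeBlock = mapper.call(new Tuple2<>(localComputeBlock, payload));
        System.out.println(localComputeBlock); // prints block-0-state+updated-with-42
    }
}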
Use of org.apache.spark.HashPartitioner in project gatk-protected by broadinstitute.
Class CoverageModelEMWorkspace, method instantiateWorkers:
/**
* Instantiate compute block(s). If Spark is disabled, a single {@link CoverageModelEMComputeBlock} is
* instantiated. Otherwise, a {@link JavaPairRDD} of compute nodes will be created.
*/
private void instantiateWorkers() {
    if (sparkContextIsAvailable) {
        /* initialize the RDD */
        logger.info("Initializing an RDD of compute blocks");
        computeRDD = ctx.parallelizePairs(
                targetBlockStream()
                        .map(tb -> new Tuple2<>(tb, new CoverageModelEMComputeBlock(tb, numSamples, numLatents, ardEnabled)))
                        .collect(Collectors.toList()),
                numTargetBlocks)
                .partitionBy(new HashPartitioner(numTargetBlocks))
                .cache();
    } else {
        logger.info("Initializing a local compute block");
        localComputeBlock = new CoverageModelEMComputeBlock(targetBlocks.get(0), numSamples, numLatents, ardEnabled);
    }
    prevCheckpointedComputeRDD = null;
    cacheCallCounter = 0;
}
Use of org.apache.spark.HashPartitioner in project gatk by broadinstitute.
Class CoverageModelEMWorkspace, method instantiateWorkers:
/**
* Instantiate compute block(s). If Spark is disabled, a single {@link CoverageModelEMComputeBlock} is
* instantiated. Otherwise, a {@link JavaPairRDD} of compute nodes will be created.
*/
private void instantiateWorkers() {
    if (sparkContextIsAvailable) {
        /* initialize the RDD */
        logger.info("Initializing an RDD of compute blocks");
        computeRDD = ctx.parallelizePairs(
                targetBlockStream()
                        .map(tb -> new Tuple2<>(tb, new CoverageModelEMComputeBlock(tb, numSamples, numLatents, ardEnabled)))
                        .collect(Collectors.toList()),
                numTargetBlocks)
                .partitionBy(new HashPartitioner(numTargetBlocks))
                .cache();
    } else {
        logger.info("Initializing a local compute block");
        localComputeBlock = new CoverageModelEMComputeBlock(targetBlocks.get(0), numSamples, numLatents, ardEnabled);
    }
    prevCheckpointedComputeRDD = null;
    cacheCallCounter = 0;
}
Use of org.apache.spark.HashPartitioner in project gatk by broadinstitute.
Class FindBreakpointEvidenceSpark, method handleAssemblies:
/**
 * Transform all the reads for a supplied set of template names in each interval into FASTQ records,
 * and do something with each interval's list of FASTQ records (like write it to a file).
 */
@VisibleForTesting
static List<AlignedAssemblyOrExcuse> handleAssemblies(final JavaSparkContext ctx,
                                                      final HopscotchUniqueMultiMap<String, Integer, QNameAndInterval> qNamesMultiMap,
                                                      final JavaRDD<GATKRead> reads,
                                                      final int nIntervals,
                                                      final boolean includeMappingLocation,
                                                      final boolean dumpFASTQs,
                                                      final LocalAssemblyHandler localAssemblyHandler) {
    final Broadcast<HopscotchUniqueMultiMap<String, Integer, QNameAndInterval>> broadcastQNamesMultiMap =
            ctx.broadcast(qNamesMultiMap);
    final List<AlignedAssemblyOrExcuse> intervalDispositions = reads
            .mapPartitionsToPair(readItr ->
                    new ReadsForQNamesFinder(broadcastQNamesMultiMap.value(), nIntervals, includeMappingLocation, dumpFASTQs)
                            .call(readItr).iterator(),
                    false)
            .combineByKey(x -> x,
                    FindBreakpointEvidenceSpark::combineLists,
                    FindBreakpointEvidenceSpark::combineLists,
                    new HashPartitioner(nIntervals), false, null)
            .map(localAssemblyHandler::apply)
            .collect();
    broadcastQNamesMultiMap.destroy();
    BwaMemIndexSingleton.closeAllDistributedInstances(ctx);
    return intervalDispositions;
}
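
Finally, a minimal sketch of the aggregation pattern used in handleAssemblies: (intervalId, list) pairs are merged with combineByKey over a HashPartitioner sized to the number of intervals, so that each interval's records land in a predictable partition. The Integer keys and String payloads are placeholders invented for this sketch, standing in for the interval IDs and FASTQ records of the real pipeline.

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class CombineByIntervalSketch {
    // merges two per-interval record lists; used both as mergeValue and mergeCombiners
    private static List<String> combineLists(final List<String> a, final List<String> b) {
        final List<String> merged = new ArrayList<>(a);
        merged.addAll(b);
        return merged;
    }

    public static void main(String[] args) {
        final JavaSparkContext ctx = new JavaSparkContext("local[*]", "combine-by-interval-sketch");
        final int nIntervals = 2;

        final List<Tuple2<Integer, List<String>>> input = Arrays.asList(
                new Tuple2<>(0, Arrays.asList("read-a")),
                new Tuple2<>(1, Arrays.asList("read-b")),
                new Tuple2<>(0, Arrays.asList("read-c")));
        final JavaPairRDD<Integer, List<String>> perIntervalRecords = ctx.parallelizePairs(input);

        // merge all lists for a given interval, partitioned by interval id
        final JavaPairRDD<Integer, List<String>> combined = perIntervalRecords.combineByKey(
                x -> x,                                      // createCombiner
                CombineByIntervalSketch::combineLists,       // mergeValue
                CombineByIntervalSketch::combineLists,       // mergeCombiners
                new HashPartitioner(nIntervals));

        combined.collect().forEach(t -> System.out.println(t._1 + " -> " + t._2));
        ctx.stop();
    }
}

The real call uses the overload that additionally disables map-side combining (the trailing false, null arguments); the four-argument overload shown here relies on Spark's defaults.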