Example 51 with JavaPairRDD

Use of org.apache.spark.api.java.JavaPairRDD in project gatk by broadinstitute.

The class ReadsSparkSink, method writeReadsADAM.

private static void writeReadsADAM(final JavaSparkContext ctx, final String outputFile, final JavaRDD<SAMRecord> reads, final SAMFileHeader header) throws IOException {
    final SequenceDictionary seqDict = SequenceDictionary.fromSAMSequenceDictionary(header.getSequenceDictionary());
    final RecordGroupDictionary readGroups = RecordGroupDictionary.fromSAMHeader(header);
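    // Convert each SAMRecord to a BDG AlignmentRecord, then key every record with a null (Void) key as AvroParquetOutputFormat expects.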
    final JavaPairRDD<Void, AlignmentRecord> rddAlignmentRecords = reads.map(read -> {
        read.setHeaderStrict(header);
        AlignmentRecord alignmentRecord = GATKReadToBDGAlignmentRecordConverter.convert(read, seqDict, readGroups);
        read.setHeaderStrict(null);
        return alignmentRecord;
    }).mapToPair(alignmentRecord -> new Tuple2<>(null, alignmentRecord));
    // instantiating a Job is necessary here in order to set the Hadoop Configuration...
    final Job job = Job.getInstance(ctx.hadoopConfiguration());
    // ...here, which sets a config property that the AvroParquetOutputFormat needs when writing data. Specifically,
    // we are writing the Avro schema to the Configuration as a JSON string. The AvroParquetOutputFormat class knows
    // how to translate objects in the Avro data model to the Parquet primitives that get written.
    AvroParquetOutputFormat.setSchema(job, AlignmentRecord.getClassSchema());
    deleteHadoopFile(outputFile, ctx.hadoopConfiguration());
    rddAlignmentRecords.saveAsNewAPIHadoopFile(outputFile, Void.class, AlignmentRecord.class, AvroParquetOutputFormat.class, job.getConfiguration());
}
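
For reference, data written this way can be read back through the mirror-image Parquet/Avro input format. The helper below is a minimal sketch, not part of ReadsSparkSink; the method name readAlignmentRecords is hypothetical, and it assumes inputPath points at a directory produced by writeReadsADAM.

private static JavaRDD<AlignmentRecord> readAlignmentRecords(final JavaSparkContext ctx, final String inputPath) throws IOException {
    // Hypothetical helper (not in ReadsSparkSink): read the Parquet/Avro output of writeReadsADAM back as an RDD.
    final Job job = Job.getInstance(ctx.hadoopConfiguration());
    // Tell the Parquet-Avro input format which Avro schema to materialize records with.
    AvroParquetInputFormat.setAvroReadSchema(job, AlignmentRecord.getClassSchema());
    final JavaPairRDD<Void, AlignmentRecord> pairs = ctx.newAPIHadoopFile(inputPath, AvroParquetInputFormat.class, Void.class, AlignmentRecord.class, job.getConfiguration());
    // Parquet keys are always null; keep only the Avro records.
    return pairs.values();
}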

Example 52 with JavaPairRDD

Use of org.apache.spark.api.java.JavaPairRDD in project gatk by broadinstitute.

The class ReadsSparkSource, method getParallelReads.

/**
     * Loads reads using Hadoop-BAM. For local files, the BAM must be given as a fully-qualified path,
     * e.g., file:///path/to/bam.bam.
     * @param readFileName file to load
     * @param referencePath Reference path or null if not available. Reference is required for CRAM files.
     * @param intervals intervals of reads to include. If <code>null</code> then all the reads (both mapped and unmapped) will be returned.
     * @param splitSize maximum bytes of BAM file to read into a single partition; increasing this results in fewer partitions. A value of zero means
     *                  use the default split size (determined by the Hadoop input format, typically the size of one HDFS block).
     * @return RDD of (SAMRecord-backed) GATKReads from the file.
     */
public JavaRDD<GATKRead> getParallelReads(final String readFileName, final String referencePath, final List<SimpleInterval> intervals, final long splitSize) {
    SAMFileHeader header = getHeader(readFileName, referencePath);
    // use the Hadoop configuration attached to the Spark context to maintain cumulative settings
    final Configuration conf = ctx.hadoopConfiguration();
    if (splitSize > 0) {
        conf.set("mapreduce.input.fileinputformat.split.maxsize", Long.toString(splitSize));
    }
    final JavaPairRDD<LongWritable, SAMRecordWritable> rdd2;
    setHadoopBAMConfigurationProperties(readFileName, referencePath);
    boolean isBam = IOUtils.isBamFileName(readFileName);
    if (isBam && intervals != null && !intervals.isEmpty()) {
        BAMInputFormat.setIntervals(conf, intervals);
    } else {
        conf.unset(BAMInputFormat.INTERVALS_PROPERTY);
    }
    rdd2 = ctx.newAPIHadoopFile(readFileName, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, conf);
    JavaRDD<GATKRead> reads = rdd2.map(v1 -> {
        SAMRecord sam = v1._2().get();
        if (isBam || samRecordOverlaps(sam, intervals)) {
            return (GATKRead) SAMRecordToGATKReadAdapter.headerlessReadAdapter(sam);
        }
        return null;
    }).filter(v1 -> v1 != null);
    return putPairsInSamePartition(header, reads);
}
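
A call site for this method might look like the sketch below; the single-argument ReadsSparkSource constructor and the input path are illustrative assumptions rather than code taken from a GATK tool.

// Illustrative sketch: `ctx` is assumed to be an existing JavaSparkContext,
// and the single-argument ReadsSparkSource constructor is an assumption for this example.
final ReadsSparkSource readsSource = new ReadsSparkSource(ctx);
// Load every read from the BAM (no interval filtering) using the default split size.
final JavaRDD<GATKRead> reads = readsSource.getParallelReads("file:///data/sample.bam", null, null, 0);
System.out.println("Loaded " + reads.count() + " reads");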

Example 53 with JavaPairRDD

Use of org.apache.spark.api.java.JavaPairRDD in project gatk by broadinstitute.

The class HaplotypeCallerSpark, method createReadShards.

/**
     * Create an RDD of {@link Shard} from an RDD of {@link GATKRead}
     * @param shardBoundariesBroadcast  broadcast of an {@link OverlapDetector} loaded with the intervals that should be used for creating ReadShards
     * @param reads RDD of {@link GATKRead}
     * @return an RDD of reads grouped into potentially overlapping shards
     */
private static JavaRDD<Shard<GATKRead>> createReadShards(final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast, final JavaRDD<GATKRead> reads) {
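    // Pair every read with each shard boundary it overlaps; a read spanning a boundary is emitted once per overlapping shard.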
    final JavaPairRDD<ShardBoundary, GATKRead> paired = reads.flatMapToPair(read -> {
        final Collection<ShardBoundary> overlappingShards = shardBoundariesBroadcast.value().getOverlaps(read);
        return overlappingShards.stream().map(key -> new Tuple2<>(key, read)).iterator();
    });
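    // Group the reads by shard boundary and wrap each group as a SparkReadShard.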
    final JavaPairRDD<ShardBoundary, Iterable<GATKRead>> shardsWithReads = paired.groupByKey();
    return shardsWithReads.map(shard -> new SparkReadShard(shard._1(), shard._2()));
}
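
The broadcast handed to this method could be prepared roughly as follows. This is a sketch, not code from HaplotypeCallerSpark, and it assumes a List<ShardBoundary> named shardBoundaries has already been computed.

// Sketch: `ctx`, `reads`, and the precomputed List<ShardBoundary> `shardBoundaries` are assumed to exist.
final OverlapDetector<ShardBoundary> overlapDetector = OverlapDetector.create(shardBoundaries);
final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast = ctx.broadcast(overlapDetector);
// Route each read to every shard it overlaps, then group the reads per shard.
final JavaRDD<Shard<GATKRead>> shards = createReadShards(shardBoundariesBroadcast, reads);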

Example 54 with JavaPairRDD

Use of org.apache.spark.api.java.JavaPairRDD in project gatk by broadinstitute.

The class ShuffleJoinReadsWithRefBases, method addBases.

/**
     * Joins each read of an RDD<GATKRead> with that read's corresponding reference sequence.
     *
     * @param referenceDataflowSource The source of the reference sequence information
     * @param reads The reads for which to extract reference sequence information
     * @return The JavaPairRDD that contains each read along with the corresponding ReferenceBases object
     */
public static JavaPairRDD<GATKRead, ReferenceBases> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaRDD<GATKRead> reads) {
    // TODO: reimpl this method by calling out to the more complex version?
    SerializableFunction<GATKRead, SimpleInterval> windowFunction = referenceDataflowSource.getReferenceWindowFunction();
    JavaPairRDD<ReferenceShard, GATKRead> shardRead = reads.mapToPair(gatkRead -> {
        ReferenceShard shard = ReferenceShard.getShardNumberFromInterval(windowFunction.apply(gatkRead));
        return new Tuple2<>(shard, gatkRead);
    });
    JavaPairRDD<ReferenceShard, Iterable<GATKRead>> shardiRead = shardRead.groupByKey();
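    // For each reference shard, fetch a single spanning interval of reference bases and hand every read its own subset.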
    return shardiRead.flatMapToPair(in -> {
        List<Tuple2<GATKRead, ReferenceBases>> out = Lists.newArrayList();
        Iterable<GATKRead> iReads = in._2();
        final List<SimpleInterval> readWindows = Utils.stream(iReads).map(read -> windowFunction.apply(read)).collect(Collectors.toList());
        SimpleInterval interval = IntervalUtils.getSpanningInterval(readWindows);
        ReferenceBases bases = referenceDataflowSource.getReferenceBases(null, interval);
        for (GATKRead r : iReads) {
            final ReferenceBases subset = bases.getSubset(windowFunction.apply(r));
            out.add(new Tuple2<>(r, subset));
        }
        return out.iterator();
    });
}
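
Downstream code could consume the joined pairs along these lines; the names referenceSource and reads are assumptions for illustration.

// Sketch: `referenceSource` is an existing ReferenceMultiSource and `reads` a JavaRDD<GATKRead>.
final JavaPairRDD<GATKRead, ReferenceBases> readsWithRef = ShuffleJoinReadsWithRefBases.addBases(referenceSource, reads);
// For example, count the total number of reference bases that were attached to reads.
final long totalRefBases = readsWithRef.map(pair -> (long) pair._2().getBases().length).reduce(Long::sum);
System.out.println("Total reference bases fetched: " + totalRefBases);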

Example 55 with JavaPairRDD

Use of org.apache.spark.api.java.JavaPairRDD in project gatk-protected by broadinstitute.

The class CoverageModelEMWorkspace, method instantiateWorkers.

/**
     * Instantiate compute block(s). If Spark is disabled, a single {@link CoverageModelEMComputeBlock} is
     * instantiated. Otherwise, a {@link JavaPairRDD} of compute blocks will be created.
     */
private void instantiateWorkers() {
    if (sparkContextIsAvailable) {
        /* initialize the RDD */
        logger.info("Initializing an RDD of compute blocks");
        computeRDD = ctx.parallelizePairs(
                targetBlockStream()
                        .map(tb -> new Tuple2<>(tb, new CoverageModelEMComputeBlock(tb, numSamples, numLatents, ardEnabled)))
                        .collect(Collectors.toList()),
                numTargetBlocks)
                .partitionBy(new HashPartitioner(numTargetBlocks))
                .cache();
    } else {
        logger.info("Initializing a local compute block");
        localComputeBlock = new CoverageModelEMComputeBlock(targetBlocks.get(0), numSamples, numLatents, ardEnabled);
    }
    prevCheckpointedComputeRDD = null;
    cacheCallCounter = 0;
}
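
Viewed in isolation, the RDD construction used in the Spark branch (parallelizePairs, then partitionBy with a HashPartitioner, then cache) looks like the sketch below, with placeholder key and value types instead of CoverageModelEMComputeBlock.

// Standalone sketch of the same pattern; `ctx` is assumed to be an existing JavaSparkContext.
final int numBlocks = 4;
final List<Tuple2<Integer, String>> blocks = IntStream.range(0, numBlocks)
        .mapToObj(i -> new Tuple2<>(i, "block-" + i))
        .collect(Collectors.toList());
final JavaPairRDD<Integer, String> blockRDD = ctx.parallelizePairs(blocks, numBlocks)
        // pin each key to a fixed partition so later per-key operations avoid reshuffling
        .partitionBy(new HashPartitioner(numBlocks))
        .cache();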

Aggregations

JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 99 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 44 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 42 usages
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 42 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 41 usages
Tuple2 (scala.Tuple2): 35 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 33 usages
JavaRDD (org.apache.spark.api.java.JavaRDD): 28 usages
List (java.util.List): 27 usages
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 24 usages
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 23 usages
Collectors (java.util.stream.Collectors): 22 usages
IOException (java.io.IOException): 17 usages
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 16 usages
LongWritable (org.apache.hadoop.io.LongWritable): 15 usages
Broadcast (org.apache.spark.broadcast.Broadcast): 15 usages
Text (org.apache.hadoop.io.Text): 12 usages
UserException (org.broadinstitute.hellbender.exceptions.UserException): 12 usages
Function (org.apache.spark.api.java.function.Function): 11 usages
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 11 usages