Search in sources :

Example 6 with RScriptExecutor

use of org.broadinstitute.hellbender.utils.R.RScriptExecutor in project gatk by broadinstitute.

the class VariantRecalibrator method createVisualizationScript.

//TODO: does this R code have to be embedded here?
private void createVisualizationScript(final List<VariantDatum> randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys) {
    PrintStream stream;
    try {
        stream = new PrintStream(RSCRIPT_FILE);
    } catch (FileNotFoundException e) {
        throw new UserException.CouldNotCreateOutputFile(RSCRIPT_FILE, e);
    }
    // We make extensive use of the ggplot2 R library: http://had.co.nz/ggplot2/
    stream.println("library(ggplot2)");
    // For compactPDF in R 2.13+
    stream.println("library(tools)");
    // For graphical functions R 2.14.2+
    stream.println("library(grid)");
    createArrangeFunction(stream);
    stream.println("outputPDF <- \"" + RSCRIPT_FILE + ".pdf\"");
    // Unfortunately this is a huge pdf file, BUGBUG: need to work on reducing the file size
    stream.println("pdf(outputPDF)");
    for (int iii = 0; iii < annotationKeys.length; iii++) {
        for (int jjj = iii + 1; jjj < annotationKeys.length; jjj++) {
            logger.info("Building " + annotationKeys[iii] + " x " + annotationKeys[jjj] + " plot...");
            final List<VariantDatum> fakeData = new ExpandingArrayList<>();
            double minAnn1 = 100.0, maxAnn1 = -100.0, minAnn2 = 100.0, maxAnn2 = -100.0;
            for (final VariantDatum datum : randomData) {
                minAnn1 = Math.min(minAnn1, datum.annotations[iii]);
                maxAnn1 = Math.max(maxAnn1, datum.annotations[iii]);
                minAnn2 = Math.min(minAnn2, datum.annotations[jjj]);
                maxAnn2 = Math.max(maxAnn2, datum.annotations[jjj]);
            }
            // Create a fake set of data which spans the full extent of these two annotation dimensions in order
            // to calculate the model PDF projected to 2D
            final double NUM_STEPS = 60.0;
            for (double ann1 = minAnn1; ann1 <= maxAnn1; ann1 += (maxAnn1 - minAnn1) / NUM_STEPS) {
                for (double ann2 = minAnn2; ann2 <= maxAnn2; ann2 += (maxAnn2 - minAnn2) / NUM_STEPS) {
                    final VariantDatum datum = new VariantDatum();
                    datum.prior = 0.0;
                    datum.annotations = new double[randomData.get(0).annotations.length];
                    datum.isNull = new boolean[randomData.get(0).annotations.length];
                    for (int ann = 0; ann < datum.annotations.length; ann++) {
                        datum.annotations[ann] = 0.0;
                        datum.isNull[ann] = true;
                    }
                    datum.annotations[iii] = ann1;
                    datum.annotations[jjj] = ann2;
                    datum.isNull[iii] = false;
                    datum.isNull[jjj] = false;
                    fakeData.add(datum);
                }
            }
            engine.evaluateData(fakeData, goodModel, false);
            engine.evaluateData(fakeData, badModel, true);
            stream.print("surface <- c(");
            for (final VariantDatum datum : fakeData) {
                stream.print(String.format("%.4f, %.4f, %.4f, ", dataManager.denormalizeDatum(datum.annotations[iii], iii), dataManager.denormalizeDatum(datum.annotations[jjj], jjj), Math.min(4.0, Math.max(-4.0, datum.lod))));
            }
            stream.println("NA,NA,NA)");
            stream.println("s <- matrix(surface,ncol=3,byrow=T)");
            stream.print("data <- c(");
            for (final VariantDatum datum : randomData) {
                stream.print(String.format("%.4f, %.4f, %.4f, %d, %d,", dataManager.denormalizeDatum(datum.annotations[iii], iii), dataManager.denormalizeDatum(datum.annotations[jjj], jjj), (datum.lod < lodCutoff ? -1.0 : 1.0), (datum.atAntiTrainingSite ? -1 : (datum.atTrainingSite ? 1 : 0)), (datum.isKnown ? 1 : -1)));
            }
            stream.println("NA,NA,NA,NA,1)");
            stream.println("d <- matrix(data,ncol=5,byrow=T)");
            final String surfaceFrame = "sf." + annotationKeys[iii] + "." + annotationKeys[jjj];
            final String dataFrame = "df." + annotationKeys[iii] + "." + annotationKeys[jjj];
            stream.println(surfaceFrame + " <- data.frame(x=s[,1], y=s[,2], lod=s[,3])");
            stream.println(dataFrame + " <- data.frame(x=d[,1], y=d[,2], retained=d[,3], training=d[,4], novelty=d[,5])");
            stream.println("dummyData <- " + dataFrame + "[1,]");
            stream.println("dummyData$x <- NaN");
            stream.println("dummyData$y <- NaN");
            stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) +theme(panel.background = element_rect(fill = \"white\"), panel.grid.minor = element_line(colour = \"white\"), panel.grid.major = element_line(colour = \"white\"))");
            stream.println("p1 = p +ggtitle(\"model PDF\") + labs(x=\"" + annotationKeys[iii] + "\", y=\"" + annotationKeys[jjj] + "\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\", space=\"rgb\")");
            stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=retained, alpha=I(1/7),legend=FALSE) +theme(panel.background = element_rect(fill = \"white\"), panel.grid.minor = element_line(colour = \"white\"), panel.grid.major = element_line(colour = \"white\"))");
            stream.println("q <- geom_point(aes(x=x,y=y,color=retained),data=dummyData, alpha=1.0, na.rm=TRUE)");
            stream.println("p2 = p + q + labs(x=\"" + annotationKeys[iii] + "\", y=\"" + annotationKeys[jjj] + "\") + scale_colour_gradient(name=\"outcome\", high=\"black\", low=\"red\",breaks=c(-1,1),guide=\"legend\",labels=c(\"filtered\",\"retained\"))");
            stream.println("p <- qplot(x,y,data=" + dataFrame + "[" + dataFrame + "$training != 0,], color=training, alpha=I(1/7)) +theme(panel.background = element_rect(fill = \"white\"), panel.grid.minor = element_line(colour = \"white\"), panel.grid.major = element_line(colour = \"white\"))");
            stream.println("q <- geom_point(aes(x=x,y=y,color=training),data=dummyData, alpha=1.0, na.rm=TRUE)");
            stream.println("p3 = p + q + labs(x=\"" + annotationKeys[iii] + "\", y=\"" + annotationKeys[jjj] + "\") + scale_colour_gradient(high=\"green\", low=\"purple\",breaks=c(-1,1),guide=\"legend\", labels=c(\"neg\", \"pos\"))");
            stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=novelty, alpha=I(1/7)) +theme(panel.background = element_rect(fill = \"white\"), panel.grid.minor = element_line(colour = \"white\"), panel.grid.major = element_line(colour = \"white\"))");
            stream.println("q <- geom_point(aes(x=x,y=y,color=novelty),data=dummyData, alpha=1.0, na.rm=TRUE)");
            stream.println("p4 = p + q + labs(x=\"" + annotationKeys[iii] + "\", y=\"" + annotationKeys[jjj] + "\") + scale_colour_gradient(name=\"novelty\", high=\"blue\", low=\"red\",breaks=c(-1,1),guide=\"legend\", labels=c(\"novel\",\"known\"))");
            stream.println("arrange(p1, p2, p3, p4, ncol=2)");
        }
    }
    stream.println("dev.off()");
    stream.println("if (exists(\"compactPDF\")) {");
    stream.println("compactPDF(outputPDF)");
    stream.println("}");
    stream.close();
    // Execute Rscript command to generate the clustering plots
    RScriptExecutor executor = new RScriptExecutor();
    executor.addScript(new File(RSCRIPT_FILE));
    logger.info("Executing: " + executor.getApproximateCommandLine());
    executor.exec();
}
Also used : ExpandingArrayList(org.broadinstitute.hellbender.utils.collections.ExpandingArrayList) PrintStream(java.io.PrintStream) FileNotFoundException(java.io.FileNotFoundException) RScriptExecutor(org.broadinstitute.hellbender.utils.R.RScriptExecutor) UserException(org.broadinstitute.hellbender.exceptions.UserException) File(java.io.File)

Example 7 with RScriptExecutor

use of org.broadinstitute.hellbender.utils.R.RScriptExecutor in project gatk by broadinstitute.

the class RCBSSegmenter method writeSegmentFile.

/**
     * Create a segmentation file using CBS in R.
     *
     * <p>https://www.bioconductor.org/packages/release/bioc/manuals/DNAcopy/man/DNAcopy.pdf</p>
     *
     * <p>Please see the above documentation for a more detailed description of the parameters.</p>
     *
     * <p>IMPORTANT:  There is no check that the weights have the same number of entries as the tnFile.</p>
     *
     * Wraps the call:
     * segment(x, weights = NULL, alpha = 0.01, nperm = 10000, p.method =
     *  c("hybrid", "perm"), min.width=2, kmax=25, nmin=200,
     *  eta=0.05, sbdry=NULL, trim = 0.025, undo.splits =
     *  c("none", "prune", "sdundo"), undo.prune=0.05,
     *  undo.SD=3, verbose=1)
     *
     * <p> Note that sbdry and verbose are unavailable through this interface. </p>
     *
     * @param sampleName Name of the sample being run through the segmenter.  Never {@code null}
     * @param tnFile Tangent-normalized targets file.  Never {@code null}
     * @param outputFile Full path to the outputted segment file.  Never {@code null}
     * @param log whether the tnFile input has already been put into log2CR.  Never {@code null}
     * @param minWidth minimum length for a segment
     * @param weightFile File containing weights for each target (doubles; one per line).  Must be the same length as what is in the tnFile (note that this is not
     *                enforced here).  Typically, 1/var(target) is what is used.  All values must be greater than 0.
     *                Use {@code null} if weighting is not desired.  These values should not be log space.
     */
public static void writeSegmentFile(final String sampleName, final String tnFile, final String outputFile, final Boolean log, final File weightFile, final double alpha, final int nperm, final PMethod pmethod, final int minWidth, final int kmax, final int nmin, final double eta, final double trim, final UndoSplits undoSplits, final double undoPrune, final int undoSD) {
    String logArg = log ? "TRUE" : "FALSE";
    final RScriptExecutor executor = new RScriptExecutor();
    executor.addScript(new Resource(R_SCRIPT, RCBSSegmenter.class));
    /*--args is needed for Rscript to recognize other arguments properly*/
    executor.addArgs("--args", "--sample_name=" + sampleName, "--targets_file=" + tnFile, "--output_file=" + outputFile, "--log2_input=" + logArg, "--min_width=" + String.valueOf(minWidth), "--alpha=" + String.valueOf(alpha), "--nperm=" + String.valueOf(nperm), "--pmethod=" + pmethod.toString(), "--kmax=" + String.valueOf(kmax), "--nmin=" + String.valueOf(nmin), "--eta=" + String.valueOf(eta), "--trim=" + String.valueOf(trim), "--undosplits=" + undoSplits.toString(), "--undoprune=" + String.valueOf(undoPrune), "--undoSD=" + String.valueOf(undoSD));
    if (weightFile != null) {
        final double[] weights = ParamUtils.readValuesFromFile(weightFile);
        // Check to make sure that no weights are zero.
        if (!DoubleStream.of(weights).allMatch(d -> d > 0 && !Double.isNaN(d) && Double.isFinite(d))) {
            throw new GATKException("A weight for a target was zero or less, which is not allowed.  If you truly want zero, you must remove the target from consideration.");
        }
        // Add the argument to specify weights
        executor.addArgs("--weights_file=" + weightFile.getAbsolutePath());
    }
    executor.exec();
}
Also used : Resource(org.broadinstitute.hellbender.utils.io.Resource) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) File(java.io.File) RScriptExecutor(org.broadinstitute.hellbender.utils.R.RScriptExecutor) DoubleStream(java.util.stream.DoubleStream) Resource(org.broadinstitute.hellbender.utils.io.Resource) RScriptExecutor(org.broadinstitute.hellbender.utils.R.RScriptExecutor) GATKException(org.broadinstitute.hellbender.exceptions.GATKException)

Example 8 with RScriptExecutor

use of org.broadinstitute.hellbender.utils.R.RScriptExecutor in project gatk-protected by broadinstitute.

the class HMMUnitTest method setUp.

/**
     * This method runs the Test models and sequences against R's HMM package to
     * obtain independent estimates for the expected values for the Viterbi and Forward-Backward
     * algorithm.
     * <p>
     *     Typically this would be annotated as a <code>@BeforeClass</> method but unfortunately the data-provider
     *     code may be run before it and so we need to implement lazy initialization.
     * </p>
     * <p>
     *     Therefore the data-providers will call it once and only once before any of their data consumer test are run.
     * </p>
     *
     * @throws IOException
     */
private void setUp() throws IOException {
    final List<Integer> positions = IntStream.range(0, TEST_PATH_LENGTH).boxed().collect(Collectors.toList());
    TEST_SEQUENCES = TEST_MODELS.stream().map(m -> m.generate(positions, RANDOM)).collect(Collectors.toList());
    TEST_SEQUENCE_FILE = createTempFile("sequences", ".seq");
    final PrintWriter writer = new PrintWriter(new FileWriter(TEST_SEQUENCE_FILE));
    TEST_SEQUENCES.forEach(s -> {
        final StringBuilder builder = new StringBuilder(TEST_PATH_LENGTH);
        s.getSecond().forEach(c -> {
            builder.append(c.toString());
            builder.append(',');
        });
        builder.setLength(builder.length() - 1);
        writer.println(builder.toString());
    });
    writer.close();
    TEST_R_RESULTS_FILE = createTempFile("r-output", ".tab");
    RScriptExecutor rExecutor = new RScriptExecutor();
    final File script = composeRScriptFile();
    rExecutor.addScript(script);
    if (!rExecutor.exec()) {
        Assert.fail("could not obtain expected results from R");
    }
    @SuppressWarnings({ "rawtypes", "unchecked" }) final List<RResultRecord>[][] recordsByModelAndSequence = (List<RResultRecord>[][]) new List[TEST_MODELS.size()][TEST_SEQUENCES.size()];
    try (final RResultReader reader = new RResultReader(TEST_R_RESULTS_FILE)) {
        reader.stream().forEach(r -> {
            if (recordsByModelAndSequence[r.modelIndex][r.sequenceIndex] == null) {
                recordsByModelAndSequence[r.modelIndex][r.sequenceIndex] = new ArrayList<>(TEST_PATH_LENGTH);
            }
            recordsByModelAndSequence[r.modelIndex][r.sequenceIndex].add(r);
        });
    }
    TEST_EXPECTED_RESULTS = new ArrayList<>(TEST_MODELS.size() * TEST_SEQUENCES.size());
    for (int i = 0; i < recordsByModelAndSequence.length; i++) {
        for (int j = 0; j < recordsByModelAndSequence[i].length; j++) {
            TEST_EXPECTED_RESULTS.add(composeExpectedResult(recordsByModelAndSequence[i][j], i, j));
        }
    }
}
Also used : FileWriter(java.io.FileWriter) RScriptExecutor(org.broadinstitute.hellbender.utils.R.RScriptExecutor) File(java.io.File) PrintWriter(java.io.PrintWriter)

Example 9 with RScriptExecutor

use of org.broadinstitute.hellbender.utils.R.RScriptExecutor in project gatk by broadinstitute.

the class VariantRecalibrator method onTraversalSuccess.

//---------------------------------------------------------------------------------------------------------------
//
// on traversal success
//
//---------------------------------------------------------------------------------------------------------------
@Override
public Object onTraversalSuccess() {
    // finish processing any queued variants
    consumeQueuedVariants();
    for (int i = 1; i <= max_attempts; i++) {
        try {
            dataManager.setData(reduceSum);
            // Each data point is now (x - mean) / standard deviation
            dataManager.normalizeData();
            // Generate the positive model using the training data and evaluate each variant
            final List<VariantDatum> positiveTrainingData = dataManager.getTrainingData();
            final GaussianMixtureModel goodModel = engine.generateModel(positiveTrainingData, VRAC.MAX_GAUSSIANS);
            engine.evaluateData(dataManager.getData(), goodModel, false);
            // Generate the negative model using the worst performing data and evaluate each variant contrastively
            final List<VariantDatum> negativeTrainingData = dataManager.selectWorstVariants();
            final GaussianMixtureModel badModel = engine.generateModel(negativeTrainingData, Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS));
            // Don't need the aggregate data anymore so let's free up the memory
            dataManager.dropAggregateData();
            engine.evaluateData(dataManager.getData(), badModel, true);
            if (badModel.failedToConverge || goodModel.failedToConverge) {
                throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minNumBadVariants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example)."));
            }
            if (outputModel) {
                GATKReport report = writeModelReport(goodModel, badModel, USE_ANNOTATIONS);
                try (final PrintStream modelReportStream = new PrintStream(modelReport)) {
                    report.print(modelReportStream);
                } catch (FileNotFoundException e) {
                    throw new UserException.CouldNotCreateOutputFile("File: (" + modelReport + ")", e);
                }
            }
            engine.calculateWorstPerformingAnnotation(dataManager.getData(), goodModel, badModel);
            // Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user
            final int nCallsAtTruth = TrancheManager.countCallsAtTruth(dataManager.getData(), Double.NEGATIVE_INFINITY);
            final TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric(nCallsAtTruth);
            final List<Tranche> tranches = TrancheManager.findTranches(dataManager.getData(), TS_TRANCHES, metric, VRAC.MODE);
            tranchesStream.print(Tranche.tranchesString(tranches));
            logger.info("Writing out recalibration table...");
            dataManager.writeOutRecalibrationTable(recalWriter, getBestAvailableSequenceDictionary());
            if (RSCRIPT_FILE != null) {
                logger.info("Writing out visualization Rscript file...");
                createVisualizationScript(dataManager.getRandomDataForPlotting(1000, positiveTrainingData, negativeTrainingData, dataManager.getEvaluationData()), goodModel, badModel, 0.0, dataManager.getAnnotationKeys().toArray(new String[USE_ANNOTATIONS.size()]));
            }
            if (VRAC.MODE == VariantRecalibratorArgumentCollection.Mode.INDEL) {
                // Print out an info message to make it clear why the tranches plot is not generated
                logger.info("Tranches plot will not be generated since we are running in INDEL mode");
            } else {
                // Execute the RScript command to plot the table of truth values
                RScriptExecutor executor = new RScriptExecutor();
                executor.addScript(new Resource(PLOT_TRANCHES_RSCRIPT, VariantRecalibrator.class));
                executor.addArgs(new File(TRANCHES_FILE).getAbsoluteFile(), TARGET_TITV);
                // Print out the command line to make it clear to the user what is being executed and how one might modify it
                logger.info("Executing: " + executor.getApproximateCommandLine());
                executor.exec();
            }
            return true;
        } catch (Exception e) {
            // IllegalArgumentException.
            if (i == max_attempts) {
                throw e;
            } else {
                logger.info(String.format("Exception occurred on attempt %d of %d. Trying again. Message was: '%s'", i, max_attempts, e.getMessage()));
            }
        }
    }
    return false;
}
Also used : GATKReport(org.broadinstitute.hellbender.utils.report.GATKReport) PrintStream(java.io.PrintStream) FileNotFoundException(java.io.FileNotFoundException) Resource(org.broadinstitute.hellbender.utils.io.Resource) FileNotFoundException(java.io.FileNotFoundException) UserException(org.broadinstitute.hellbender.exceptions.UserException) CommandLineException(org.broadinstitute.barclay.argparser.CommandLineException) RScriptExecutor(org.broadinstitute.hellbender.utils.R.RScriptExecutor) UserException(org.broadinstitute.hellbender.exceptions.UserException) File(java.io.File)

Example 10 with RScriptExecutor

use of org.broadinstitute.hellbender.utils.R.RScriptExecutor in project gatk by broadinstitute.

the class CollectRnaSeqMetrics method finish.

@Override
protected void finish() {
    collector.finish();
    final MetricsFile<RnaSeqMetrics, Integer> file = getMetricsFile();
    collector.addAllLevelsToFile(file);
    file.write(OUTPUT);
    boolean atLeastOneHistogram = false;
    for (Histogram<Integer> histo : file.getAllHistograms()) {
        atLeastOneHistogram = atLeastOneHistogram || !histo.isEmpty();
    }
    // Generate the coverage by position plot
    if (CHART_OUTPUT != null && atLeastOneHistogram) {
        final RScriptExecutor executor = new RScriptExecutor();
        executor.addScript(new Resource(R_SCRIPT, CollectRnaSeqMetrics.class));
        executor.addArgs(OUTPUT.getAbsolutePath(), CHART_OUTPUT.getAbsolutePath(), INPUT.getName(), plotSubtitle);
        executor.exec();
    }
}
Also used : Resource(org.broadinstitute.hellbender.utils.io.Resource) RScriptExecutor(org.broadinstitute.hellbender.utils.R.RScriptExecutor)

Aggregations

RScriptExecutor (org.broadinstitute.hellbender.utils.R.RScriptExecutor)18 Resource (org.broadinstitute.hellbender.utils.io.Resource)14 File (java.io.File)8 UserException (org.broadinstitute.hellbender.exceptions.UserException)3 MetricsFile (htsjdk.samtools.metrics.MetricsFile)2 FileNotFoundException (java.io.FileNotFoundException)2 FileWriter (java.io.FileWriter)2 PrintStream (java.io.PrintStream)2 PrintWriter (java.io.PrintWriter)2 DoubleStream (java.util.stream.DoubleStream)2 GATKException (org.broadinstitute.hellbender.exceptions.GATKException)2 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)2 SAMRecord (htsjdk.samtools.SAMRecord)1 SamReader (htsjdk.samtools.SamReader)1 ReferenceSequence (htsjdk.samtools.reference.ReferenceSequence)1 ReferenceSequenceFileWalker (htsjdk.samtools.reference.ReferenceSequenceFileWalker)1 Histogram (htsjdk.samtools.util.Histogram)1 NumberFormat (java.text.NumberFormat)1 CommandLineException (org.broadinstitute.barclay.argparser.CommandLineException)1 ExpandingArrayList (org.broadinstitute.hellbender.utils.collections.ExpandingArrayList)1