
Example 1 with OutputUpload

Use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

The execute method of the LilacBatch class.

@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript commands, final RuntimeFiles executionFlags) {
    // Inputs: SampleId, with an optional second value "RNA" to enable RNA processing
    final InputFileDescriptor runData = inputs.get();
    final String batchInputs = runData.inputValue();
    final String[] batchItems = batchInputs.split(",");
    List<String> sampleIds = Lists.newArrayList(batchItems[0]);
    boolean hasRna = batchItems.length > 1 && batchItems[1].equals("RNA");
    // String runDirectory = "run_ref_18";
    String runDirectory = "run_ref_non_truth_01";
    // download pilot Lilac jar
    addLilacDownloadCommands(commands);
    for (String sampleId : sampleIds) {
        addSampleCommands(runData, commands, runDirectory, sampleId, hasRna);
    }
    commands.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "lilac"), executionFlags));
    // and copy the run log files to a single directory for convenience
    String commonLogDir = String.format("gs://%s/%s/logs/", LILAC_BATCH_BUCKET, runDirectory);
    commands.addCommand(() -> format("gsutil -m cp /data/output/*.log %s", commonLogDir));
    return ImmutableVirtualMachineJobDefinition.builder().name("lilac").startupCommand(commands).namespacedResults(ResultsDirectory.defaultDirectory()).build();
}
Also used: OutputUpload (com.hartwig.pipeline.execution.vm.OutputUpload), InputFileDescriptor (com.hartwig.batch.input.InputFileDescriptor)
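
All of the examples on this page end the same way: after the tool- and sample-specific commands have been staged, an OutputUpload pointing at a namespace inside the runtime bucket is added as the last command, and the startup script is wrapped into a VirtualMachineJobDefinition. A minimal sketch of that shared pattern, using only calls that appear in the snippets above (the "my-tool" namespace and job name are placeholders, not from the project):

@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript commands, final RuntimeFiles executionFlags) {
    // ... add tool- and sample-specific commands here ...
    // upload everything written to the VM output directory into the run bucket, namespaced under "my-tool"
    commands.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "my-tool"), executionFlags));
    return ImmutableVirtualMachineJobDefinition.builder().name("my-tool").startupCommand(commands).namespacedResults(ResultsDirectory.defaultDirectory()).build();
}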

Example 2 with OutputUpload

Use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

The execute method of the SageBenchmarks class.

@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    InputFileDescriptor descriptor = inputs.get();
    final String[] inputData = descriptor.inputValue().split(",", -1);
    final String sampleId = inputData[0];
    final String referenceId = inputData[1];
    String runData = inputData[2];
    boolean runTumorNormal = runData.equals("TumorNormal");
    boolean runTumorOnly = runData.equals("TumorOnly");
    boolean runGermline = runData.equals("Germline");
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V37);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_TOOLS_BUCKET, SAGE_DIR, SAGE_JAR, VmDirectories.TOOLS));
    /*
        startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s",
                BATCH_TOOLS_BUCKET, PAVE_DIR, PAVE_JAR, VmDirectories.TOOLS));

        String ponFile = "SageGermlinePon.1000x.37.tsv.gz";

        startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s",
                BATCH_RESOURCE_BUCKET, SAGE_DIR, ponFile, VmDirectories.INPUT));
        */
    // download tumor and ref BAMs as required
    String tumorBamFile = String.format("%s.bam", sampleId);
    String referenceBamFile = String.format("%s.bam", referenceId);
    if (runTumorNormal || runTumorOnly) {
        startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s* %s", BATCH_BENCHMARKS_BUCKET, sampleId, tumorBamFile, VmDirectories.INPUT));
    }
    if (runTumorNormal || runGermline) {
        startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s* %s", BATCH_BENCHMARKS_BUCKET, sampleId, referenceBamFile, VmDirectories.INPUT));
    }
    // run Sage
    final StringJoiner sageArgs = new StringJoiner(" ");
    if (runTumorNormal || runTumorOnly) {
        sageArgs.add(String.format("-tumor %s", sampleId));
        sageArgs.add(String.format("-tumor_bam %s/%s", VmDirectories.INPUT, tumorBamFile));
    } else if (runGermline) {
        sageArgs.add(String.format("-tumor %s", sampleId));
        sageArgs.add(String.format("-tumor_bam %s/%s", VmDirectories.INPUT, referenceBamFile));
    }
    if (runTumorNormal) {
        sageArgs.add(String.format("-reference %s", referenceId));
        sageArgs.add(String.format("-reference_bam %s/%s", VmDirectories.INPUT, referenceBamFile));
    } else if (runGermline) {
        sageArgs.add(String.format("-reference %s", referenceId));
        sageArgs.add(String.format("-reference_bam %s/%s", VmDirectories.INPUT, tumorBamFile));
    }
    if (runGermline) {
        sageArgs.add(String.format("-hotspots %s", resourceFiles.sageGermlineHotspots()));
        sageArgs.add(String.format("-panel_bed %s", resourceFiles.sageGermlineCodingPanel()));
    } else {
        sageArgs.add(String.format("-hotspots %s", resourceFiles.sageSomaticHotspots()));
        sageArgs.add(String.format("-panel_bed %s", resourceFiles.sageSomaticCodingPanel()));
    }
    sageArgs.add(String.format("-high_confidence_bed %s", resourceFiles.giabHighConfidenceBed()));
    sageArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
    sageArgs.add(String.format("-ref_genome_version %s", resourceFiles.version().toString()));
    sageArgs.add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()));
    if (runGermline) {
        sageArgs.add("-panel_only");
        sageArgs.add("-hotspot_min_tumor_qual 50");
        sageArgs.add("-panel_min_tumor_qual 75");
        sageArgs.add("-hotspot_max_germline_vaf 100");
        sageArgs.add("-hotspot_max_germline_rel_raw_base_qual 100");
        sageArgs.add("-panel_max_germline_vaf 100");
        sageArgs.add("-panel_max_germline_rel_raw_base_qual 100");
        sageArgs.add("-mnv_filter_enabled false");
    }
    String sageVcf;
    if (runTumorOnly)
        sageVcf = String.format("%s/%s.sage.tumor_only.vcf.gz", VmDirectories.OUTPUT, sampleId);
    else if (runGermline)
        sageVcf = String.format("%s/%s.sage.germline.vcf.gz", VmDirectories.OUTPUT, sampleId);
    else
        sageVcf = String.format("%s/%s.sage.somatic.vcf.gz", VmDirectories.OUTPUT, sampleId);
    sageArgs.add(String.format("-out %s", sageVcf));
    sageArgs.add(String.format("-perf_warn_time 50"));
    sageArgs.add(String.format("-threads %s", Bash.allCpus()));
    startupScript.addCommand(() -> format("java -Xmx48G -jar %s/%s %s", VmDirectories.TOOLS, SAGE_JAR, sageArgs.toString()));
    /*
        // annotate with Pave - PON and gene impacts
        final StringJoiner paveArgs = new StringJoiner(" ");
        String ponFilters = "HOTSPOT:5:5;PANEL:2:5;UNKNOWN:2:0";

        final String paveVcf = String.format("%s/%s.sage.somatic.pon.pave.vcf.gz", VmDirectories.OUTPUT, sampleId);

        paveArgs.add(String.format("-sample %s", sampleId));
        paveArgs.add(String.format("-vcf_file %s", sageVcf)); // ponFilterVcf from BCF Tools

        paveArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
        paveArgs.add(String.format("-ref_genome_version %s", resourceFiles.version().toString()));
        paveArgs.add(String.format("-driver_gene_panel %s", resourceFiles.driverGenePanel()));
        paveArgs.add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()));
        paveArgs.add(String.format("-pon_file %s/%s", VmDirectories.INPUT, ponFile));
        paveArgs.add(String.format("-pon_filters \"%s\"", ponFilters));
        paveArgs.add(String.format("-output_vcf_file %s", paveVcf));

        String paveJar = String.format("%s/%s", VmDirectories.TOOLS, PAVE_JAR);

        startupScript.addCommand(() -> format("java -jar %s %s", paveJar, paveArgs.toString()));
        */
    // upload output
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "sage"), executionFlags));
    return ImmutableVirtualMachineJobDefinition.builder().name("sage").startupCommand(startupScript).performanceProfile(custom(24, 64)).namespacedResults(ResultsDirectory.defaultDirectory()).build();
}
Also used: ResourceFiles (com.hartwig.pipeline.resource.ResourceFiles), OutputUpload (com.hartwig.pipeline.execution.vm.OutputUpload), InputFileDescriptor (com.hartwig.batch.input.InputFileDescriptor), StringJoiner (java.util.StringJoiner)
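
The run mode is selected entirely by the third field of the comma-separated batch input. Illustrative input values and the mode each selects (the sample and reference IDs below are hypothetical, not from the project):

// Hypothetical batch input values for SageBenchmarks
String tumorNormal = "SAMPLE_T,SAMPLE_R,TumorNormal"; // paired somatic calling: -tumor and -reference both set
String tumorOnly = "SAMPLE_T,SAMPLE_R,TumorOnly";     // tumor BAM only, output <sample>.sage.tumor_only.vcf.gz
String germline = "SAMPLE_T,SAMPLE_R,Germline";       // reference BAM passed as -tumor, germline filter settings applied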

Example 3 with OutputUpload

Use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

The execute method of the SageCompare class.

@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    InputFileDescriptor descriptor = inputs.get();
    final String[] sampleData = descriptor.inputValue().split(",", -1);
    final String sampleId = sampleData[0];
    String runTypes = sampleData.length > 1 ? sampleData[1] : RUN_BOTH;
    boolean runBoth = runTypes.equalsIgnoreCase(RUN_BOTH);
    boolean cramVsBam = runTypes.equalsIgnoreCase(RUN_CRAM_VS_BAM);
    boolean runOld = runBoth || runTypes.equalsIgnoreCase(RUN_OLD);
    boolean runNew = runBoth || cramVsBam || runTypes.equalsIgnoreCase(RUN_NEW);
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V37);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_TOOLS_BUCKET, SAGE_DIR, SAGE_JAR, VmDirectories.TOOLS));
    final RemoteLocationsApi locations = new RemoteLocationsApi("hmf-crunch", sampleId);
    String[] tumorCramData = getCramFileData(locations.getTumorAlignment());
    String tumorCramFile = tumorCramData[CRAM_FILENAME];
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp gs://%s* %s", tumorCramData[CRAM_FULL_PATH], VmDirectories.INPUT));
    String referenceId = locations.getReference();
    String[] refCramData = getCramFileData(locations.getReferenceAlignment());
    String refCramFile = refCramData[CRAM_FILENAME];
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp gs://%s* %s", refCramData[CRAM_FULL_PATH], VmDirectories.INPUT));
    // local paths for the downloaded tumor and reference CRAMs
    String localTumorCram = String.format("%s/%s", VmDirectories.INPUT, tumorCramFile);
    String localRefCram = String.format("%s/%s", VmDirectories.INPUT, refCramFile);
    // and convert to BAM
    startupScript.addCommands(cramToBam(localTumorCram));
    startupScript.addCommands(cramToBam(localRefCram));
    String localTumorBam = localTumorCram.replace("cram", "bam");
    String localRefBam = localRefCram.replace("cram", "bam");
    if (runOld) {
        final String oldSageVcf = String.format("%s/%s.sage.somatic.vcf.gz", VmDirectories.OUTPUT, sampleId);
        // run old Sage
        final StringJoiner oldSageArgs = new StringJoiner(" ");
        oldSageArgs.add(String.format("-tumor %s", sampleId));
        oldSageArgs.add(String.format("-tumor_bam %s", localTumorBam));
        oldSageArgs.add(String.format("-reference %s", referenceId));
        oldSageArgs.add(String.format("-reference_bam %s", localRefBam));
        oldSageArgs.add(String.format("-hotspots %s", resourceFiles.sageSomaticHotspots()));
        oldSageArgs.add(String.format("-panel_bed %s", resourceFiles.sageSomaticCodingPanel()));
        oldSageArgs.add(String.format("-high_confidence_bed %s", resourceFiles.giabHighConfidenceBed()));
        oldSageArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
        oldSageArgs.add("-assembly hg19");
        oldSageArgs.add("-bqr_plot false");
        oldSageArgs.add(String.format("-out %s", oldSageVcf));
        oldSageArgs.add(String.format("-threads %s", Bash.allCpus()));
        // oldSageArgs.add("-chr 14");
        String oldSageJar = String.format("sage/%s/sage.jar", Versions.SAGE);
        startupScript.addCommand(() -> format("java -Xmx48G -jar %s/%s %s", VmDirectories.TOOLS, oldSageJar, oldSageArgs.toString()));
    }
    if (runNew) {
        final String newSageVcf = String.format("%s/%s.sage.somatic.vcf.gz", VmDirectories.OUTPUT, sampleId);
        final StringJoiner newSageArgs = new StringJoiner(" ");
        newSageArgs.add(String.format("-tumor %s", sampleId));
        newSageArgs.add(String.format("-tumor_bam %s", localTumorBam));
        newSageArgs.add(String.format("-reference %s", referenceId));
        newSageArgs.add(String.format("-reference_bam %s", localRefBam));
        newSageArgs.add(String.format("-hotspots %s", resourceFiles.sageSomaticHotspots()));
        newSageArgs.add(String.format("-panel_bed %s", resourceFiles.sageSomaticCodingPanel()));
        newSageArgs.add(String.format("-high_confidence_bed %s", resourceFiles.giabHighConfidenceBed()));
        newSageArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
        newSageArgs.add(String.format("-ref_genome_version %s", resourceFiles.version().toString()));
        newSageArgs.add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()));
        newSageArgs.add(String.format("-perf_warn_time 50"));
        newSageArgs.add(String.format("-log_debug"));
        newSageArgs.add(String.format("-out %s", newSageVcf));
        newSageArgs.add(String.format("-threads %s", Bash.allCpus()));
        startupScript.addCommand(() -> format("java -Xmx48G -jar %s/%s %s", VmDirectories.TOOLS, SAGE_JAR, newSageArgs.toString()));
    }
    if (cramVsBam) {
        final String newCramSageVcf = String.format("%s/%s.sage.somatic.cram.vcf.gz", VmDirectories.OUTPUT, sampleId);
        final StringJoiner newSageArgs = new StringJoiner(" ");
        newSageArgs.add(String.format("-tumor %s", sampleId));
        newSageArgs.add(String.format("-tumor_bam %s", localTumorCram));
        newSageArgs.add(String.format("-reference %s", referenceId));
        newSageArgs.add(String.format("-reference_bam %s", localRefCram));
        newSageArgs.add(String.format("-hotspots %s", resourceFiles.sageSomaticHotspots()));
        newSageArgs.add(String.format("-panel_bed %s", resourceFiles.sageSomaticCodingPanel()));
        newSageArgs.add(String.format("-high_confidence_bed %s", resourceFiles.giabHighConfidenceBed()));
        newSageArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
        newSageArgs.add(String.format("-ref_genome_version %s", resourceFiles.version().toString()));
        newSageArgs.add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()));
        newSageArgs.add(String.format("-perf_warn_time 50"));
        newSageArgs.add(String.format("-log_debug"));
        newSageArgs.add(String.format("-out %s", newCramSageVcf));
        newSageArgs.add(String.format("-threads %s", Bash.allCpus()));
        startupScript.addCommand(() -> format("java -Xmx48G -jar %s/%s %s", VmDirectories.TOOLS, SAGE_JAR, newSageArgs.toString()));
    }
    // upload output
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "sage"), executionFlags));
    return ImmutableVirtualMachineJobDefinition.builder().name("sage").startupCommand(startupScript).performanceProfile(custom(24, 64)).namespacedResults(ResultsDirectory.defaultDirectory()).build();
}
Also used: ResourceFiles (com.hartwig.pipeline.resource.ResourceFiles), OutputUpload (com.hartwig.pipeline.execution.vm.OutputUpload), InputFileDescriptor (com.hartwig.batch.input.InputFileDescriptor), StringJoiner (java.util.StringJoiner), RemoteLocationsApi (com.hartwig.batch.api.RemoteLocationsApi)
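
The cramToBam(...) helper called above is not part of the snippet. A rough sketch of what such a conversion step could look like, written against the same BashStartupScript API; the method name and the samtools invocation below are illustrative assumptions, not the project's implementation:

// illustrative only: convert a local CRAM to BAM and index it
// (a -T <ref.fasta> may be needed if the CRAM's reference cannot be resolved automatically)
private void addCramToBamCommands(final BashStartupScript startupScript, final String cramPath) {
    final String bamPath = cramPath.replace("cram", "bam");
    startupScript.addCommand(() -> format("samtools view -O bam -@ %s -o %s %s", Bash.allCpus(), bamPath, cramPath));
    startupScript.addCommand(() -> format("samtools index %s", bamPath));
}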

Example 4 with OutputUpload

Use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

The execute method of the RnaSalmon class.

@Override
public VirtualMachineJobDefinition execute(InputBundle inputs, RuntimeBucket bucket, BashStartupScript startupScript, RuntimeFiles executionFlags) {
    InputFileDescriptor descriptor = inputs.get();
    final String batchInputs = descriptor.inputValue();
    final String[] batchItems = batchInputs.split(",");
    if (batchItems.length != 2) {
        System.out.print(String.format("invalid input arguments(%s) - expected SampleId,PathToFastqFiles", batchInputs));
        return null;
    }
    final String sampleId = batchItems[0];
    final String fastqFilelist = batchItems[1];
    final List<String> sampleFastqFiles = getSampleFastqFileList(sampleId, fastqFilelist);
    if (sampleFastqFiles.isEmpty()) {
        System.out.print(String.format("sampleId(%s) fastq files not found", sampleId));
        return null;
    }
    // copy down FASTQ files for this sample
    for (final String fastqFile : sampleFastqFiles) {
        startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s %s", fastqFile, VmDirectories.INPUT));
    }
    // copy down the executable
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp -r %s/%s %s", SALMON_RESOURCES, SALMON, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format("chmod a+x %s/%s", VmDirectories.TOOLS, SALMON_BINARY));
    // locate the FASTQ files for reads 1 and 2
    final String r1Files = format("%s/*R1_001.fastq.gz", VmDirectories.INPUT);
    final String r2Files = format("%s/*R2_001.fastq.gz", VmDirectories.INPUT);
    // copy reference files for SALMON
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp -r %s/%s %s", SALMON_RESOURCES, SALMON_INDEX_DIR, VmDirectories.INPUT));
    final String salmonGeneIndexDir = String.format("%s/%s", VmDirectories.INPUT, SALMON_INDEX_DIR);
    // use all available CPUs and run from the output directory
    final String threadCount = Bash.allCpus();
    startupScript.addCommand(() -> format("cd %s", VmDirectories.OUTPUT));
    /*
        genome_ref=$ref_root/salmon_gene_ihs37d5_GENCODE19ndex
        READ1=${input_dir}/*R1_001.fastq.gz
        READ2=${input_dir}/*R2_001.fastq.gz
        threads=6

        ${salmon} quant \
          -i ${genome_ref} \
          -l A -1 ${READ1} -2 ${READ2} \
          -p ${threads} --validateMappings \
          -o ${output_dir}
         */
    // run Salmon quantification
    StringBuilder salmonArgs = new StringBuilder();
    salmonArgs.append("quant");
    salmonArgs.append(String.format(" -i %s", salmonGeneIndexDir));
    salmonArgs.append(" -l A");
    salmonArgs.append(String.format(" -1 %s", r1Files));
    salmonArgs.append(String.format(" -2 %s", r2Files));
    salmonArgs.append(String.format(" -p %s", threadCount));
    salmonArgs.append(" --validateMappings");
    salmonArgs.append(String.format(" -o %s", VmDirectories.OUTPUT));
    startupScript.addCommand(() -> format("%s/%s %s", VmDirectories.TOOLS, SALMON_BINARY, salmonArgs.toString()));
    final String rawOutputFile = "quant.sf";
    final String outputFile = sampleId + ".salmon.tsv";
    startupScript.addCommand(() -> format("mv %s %s", rawOutputFile, outputFile));
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(bucket.name(), "salmon"), executionFlags));
    // copy results to rna-analysis location on crunch
    startupScript.addCommand(() -> format("gsutil -m cp %s/* %s/%s/salmon/", VmDirectories.OUTPUT, RNA_COHORT_LOCATION_V37, sampleId));
    return ImmutableVirtualMachineJobDefinition.builder().name("rna-salmon").startupCommand(startupScript).namespacedResults(ResultsDirectory.defaultDirectory()).workingDiskSpaceGb(500).performanceProfile(VirtualMachinePerformanceProfile.custom(12, 36)).build();
}
Also used: OutputUpload (com.hartwig.pipeline.execution.vm.OutputUpload), InputFileDescriptor (com.hartwig.batch.input.InputFileDescriptor)
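
The Salmon arguments above are assembled with a StringBuilder and hand-managed leading spaces; the other examples on this page use StringJoiner(" ") for the same job. An equivalent sketch of the same argument list in that style, purely as a style comparison:

StringJoiner salmonArgs = new StringJoiner(" ");
salmonArgs.add("quant");
salmonArgs.add(String.format("-i %s", salmonGeneIndexDir));
salmonArgs.add("-l A");
salmonArgs.add(String.format("-1 %s", r1Files));
salmonArgs.add(String.format("-2 %s", r2Files));
salmonArgs.add(String.format("-p %s", threadCount));
salmonArgs.add("--validateMappings");
salmonArgs.add(String.format("-o %s", VmDirectories.OUTPUT));
startupScript.addCommand(() -> format("%s/%s %s", VmDirectories.TOOLS, SALMON_BINARY, salmonArgs.toString()));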

Example 5 with OutputUpload

Use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

The execute method of the RnaStarMapping class.

@Override
public VirtualMachineJobDefinition execute(InputBundle inputs, RuntimeBucket bucket, BashStartupScript startupScript, RuntimeFiles executionFlags) {
    InputFileDescriptor descriptor = inputs.get();
    final String batchInputs = descriptor.inputValue();
    final String[] batchItems = batchInputs.split(",");
    // required format: SampleId,RefGenomeVersion (37 by default),FASTQ file bucket
    /*
        if(batchItems.length != 2)
        {
            System.out.print(String.format("invalid input arguments(%s) - expected SampleId,RefGenomeVersion,FastqFileBucketDir", batchInputs));
            return null;
        }
        */
    final String sampleId = batchItems[0];
    final RefGenomeVersion refGenomeVersion = batchItems.length >= 2 ? RefGenomeVersion.valueOf(batchItems[1]) : V37;
    final String sampleBucket = batchItems[2];
    /*
        if(batchItems.length >= 3)
        {
            final String fastqFilelist = batchItems[2];

            final List<String> sampleFastqFiles = getSampleFastqFileList(sampleId, fastqFilelist);

            if(sampleFastqFiles.isEmpty()) {
                System.out.print(String.format("sampleId(%s) fastq files not found", sampleId));
                return null;
            }

            // copy down FASTQ files for this sample
            for(final String fastqFile : sampleFastqFiles)
            {
                startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s %s", fastqFile, VmDirectories.INPUT));
            }
        }
        else
        {
            // expected location: "gs://cpct02010255tii-rna-reads/1.3/CPCT02010255TII_AHWGLNBGX5_S4_L002_R1_001.fastq.gz
            final String sampleFastqFiles = String.format("gs://%s-rna-reads/1.3/*.fastq.gz", sampleId.toLowerCase());
            startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s %s", sampleFastqFiles, VmDirectories.INPUT));
        }
        */
    final String sampleFastqFiles = String.format("%s/*.fastq.gz", sampleBucket);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s %s", sampleFastqFiles, VmDirectories.INPUT));
    // locate the FASTQ files for reads 1 and 2
    final String r1Files = format("$(ls %s/*_R1* | tr '\\n' ',')", VmDirectories.INPUT);
    final String r2Files = format("$(ls %s/*_R2* | tr '\\n' ',')", VmDirectories.INPUT);
    // copy reference files for STAR
    final String starGenomeDir = getRnaResourceDirectory(refGenomeVersion, STAR_DIR);
    final String localStarGenomeDir = String.format("%s/%s", VmDirectories.INPUT, STAR_DIR);
    startupScript.addCommand(() -> format("mkdir %s", localStarGenomeDir));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/* %s", starGenomeDir, localStarGenomeDir));
    final String threadCount = Bash.allCpus();
    startupScript.addCommand(() -> format("cd %s", VmDirectories.OUTPUT));
    // run the STAR mapper
    final String[] starArgs = {
            "--runThreadN", threadCount,
            "--genomeDir", localStarGenomeDir,
            "--genomeLoad", "NoSharedMemory",
            "--readFilesIn", r1Files, r2Files,
            "--readFilesCommand", "zcat",
            "--outSAMtype", "BAM", "Unsorted",
            "--outSAMunmapped", "Within",
            "--outBAMcompression", "0",
            "--outSAMattributes", "All",
            "--outFilterMultimapNmax", "10",
            "--outFilterMismatchNmax", "3",
            "--limitOutSJcollapsed", "3000000",
            "--chimSegmentMin", "10",
            "--chimOutType", "WithinBAM", "SoftClip",
            "--chimJunctionOverhangMin", "10",
            "--chimSegmentReadGapMax", "3",
            "--chimScoreMin", "1",
            "--chimScoreDropMax", "30",
            "--chimScoreJunctionNonGTAG", "0",
            "--chimScoreSeparation", "1",
            "--outFilterScoreMinOverLread", "0.33",
            "--outFilterMatchNminOverLread", "0.33",
            "--outFilterMatchNmin", "35",
            "--alignSplicedMateMapLminOverLmate", "0.33",
            "--alignSplicedMateMapLmin", "35",
            "--alignSJstitchMismatchNmax", "5", "-1", "5", "5" };
    startupScript.addCommand(new VersionedToolCommand("star", "STAR", "2.7.3a", starArgs));
    final String bamFile = "Aligned.out.bam";
    // sort the BAM
    final String sortedBam = sampleId + ".sorted.bam";
    final String[] sortArgs = { "sort", "-@", threadCount, "-m", "2G", "-T", "tmp", "-O", "bam", bamFile, "-o", sortedBam };
    startupScript.addCommand(new VersionedToolCommand("samtools", "samtools", Versions.SAMTOOLS, sortArgs));
    // mark duplicate fragment reads within the BAM
    final String sortedDedupedBam = sampleId + ".sorted.dups.bam";
    final String[] dupArgs = { "markdup", "-t", threadCount, "--overflow-list-size=45000000", sortedBam, sortedDedupedBam };
    startupScript.addCommand(new SambambaCommand(dupArgs));
    final String[] indexArgs = { "index", sortedDedupedBam };
    startupScript.addCommand(new VersionedToolCommand("samtools", "samtools", Versions.SAMTOOLS, indexArgs));
    // clean up intermediary BAMs
    startupScript.addCommand(() -> format("rm -f %s", bamFile));
    startupScript.addCommand(() -> format("rm -f %s", sortedBam));
    final String starStats = "Log.final.out";
    final String statsFile = sampleId + "." + starStats;
    startupScript.addCommand(() -> format("mv %s %s", starStats, statsFile));
    // run QC stats on the fast-Qs as well
    // final String fastqcOutputDir = format("%s/fastqc", VmDirectories.OUTPUT);
    // startupScript.addCommand(() -> format("mkdir %s", fastqcOutputDir));
    // final String allFastQs = format("%s/*gz", VmDirectories.INPUT);
    // final String[] fastqcArgs = {"-o", fastqcOutputDir, allFastQs};
    // TEMP until reimage has taken place
    // startupScript.addCommand(() -> format("chmod a+x /opt/tools/fastqc/0.11.4/fastqc"));
    // startupScript.addCommand(new VersionedToolCommand("fastqc", "fastqc", "0.11.4", fastqcArgs));
    // upload the results
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(bucket.name(), "star"), executionFlags));
    // copy results to crunch
    final String samplesDir = String.format("%s/%s", getRnaCohortDirectory(refGenomeVersion), "samples");
    startupScript.addCommand(() -> format("gsutil -m cp %s/* %s/%s/", VmDirectories.OUTPUT, samplesDir, sampleId));
    return ImmutableVirtualMachineJobDefinition.builder().name("rna-star-mapping").startupCommand(startupScript).namespacedResults(ResultsDirectory.defaultDirectory()).workingDiskSpaceGb(500).performanceProfile(VirtualMachinePerformanceProfile.custom(12, 48)).build();
}
Also used: OutputUpload (com.hartwig.pipeline.execution.vm.OutputUpload), InputFileDescriptor (com.hartwig.batch.input.InputFileDescriptor), RefGenomeVersion (com.hartwig.pipeline.resource.RefGenomeVersion), SambambaCommand (com.hartwig.pipeline.execution.vm.SambambaCommand), VersionedToolCommand (com.hartwig.pipeline.calling.command.VersionedToolCommand)

Aggregations

OutputUpload (com.hartwig.pipeline.execution.vm.OutputUpload): 40 usages
InputFileDescriptor (com.hartwig.batch.input.InputFileDescriptor): 35 usages
ResourceFiles (com.hartwig.pipeline.resource.ResourceFiles): 24 usages
StringJoiner (java.util.StringJoiner): 12 usages
GoogleStorageLocation (com.hartwig.pipeline.storage.GoogleStorageLocation): 9 usages
RemoteLocationsApi (com.hartwig.batch.api.RemoteLocationsApi): 7 usages
CopyLogToOutput (com.hartwig.pipeline.execution.vm.CopyLogToOutput): 6 usages
VersionedToolCommand (com.hartwig.pipeline.calling.command.VersionedToolCommand): 5 usages
RefGenomeVersion (com.hartwig.pipeline.resource.RefGenomeVersion): 5 usages
ResourceFilesFactory.buildResourceFiles (com.hartwig.pipeline.resource.ResourceFilesFactory.buildResourceFiles): 5 usages
SubStageInputOutput (com.hartwig.pipeline.stages.SubStageInputOutput): 5 usages
File (java.io.File): 5 usages
BwaCommand (com.hartwig.pipeline.calling.command.BwaCommand): 3 usages
SamtoolsCommand (com.hartwig.pipeline.calling.command.SamtoolsCommand): 3 usages
InputDownload (com.hartwig.pipeline.execution.vm.InputDownload): 3 usages
OutputFile (com.hartwig.pipeline.execution.vm.OutputFile): 3 usages
SageApplication (com.hartwig.pipeline.calling.sage.SageApplication): 2 usages
SageCommandBuilder (com.hartwig.pipeline.calling.sage.SageCommandBuilder): 2 usages
GridssAnnotation (com.hartwig.pipeline.calling.structural.gridss.stage.GridssAnnotation): 2 usages
PipelineStatus (com.hartwig.pipeline.execution.PipelineStatus): 2 usages