Example 26 with OutputUpload

use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

the class RnaArriba method execute.

@Override
public VirtualMachineJobDefinition execute(InputBundle inputs, RuntimeBucket bucket, BashStartupScript startupScript, RuntimeFiles executionFlags) {
    InputFileDescriptor descriptor = inputs.get();
    final String batchInputs = descriptor.inputValue();
    final String sampleId = batchInputs;
    // copy down BAM and index file for this sample
    final String bamFile = String.format("%s%s", sampleId, RNA_BAM_FILE_ID);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", RNA_COHORT_LOCATION_V37, sampleId, bamFile, VmDirectories.INPUT));
    final String bamIndexFile = String.format("%s%s", sampleId, RNA_BAM_INDEX_FILE_ID);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", RNA_COHORT_LOCATION_V37, sampleId, bamIndexFile, VmDirectories.INPUT));
    // copy down the executable
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s %s", ARRIBA_RESOURCES, ARRIBA_TOOL, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format("chmod a+x %s/%s", VmDirectories.TOOLS, ARRIBA_TOOL));
    // copy down required reference files
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp -r %s/%s %s", ARRIBA_RESOURCES, REF_GENOME, VmDirectories.INPUT));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s %s", ARRIBA_RESOURCES, GENE_DEFINITIONS, VmDirectories.INPUT));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s %s", ARRIBA_RESOURCES, BLACKLIST, VmDirectories.INPUT));
    startupScript.addCommand(() -> format("cd %s", VmDirectories.OUTPUT));
    // run Arriba
    StringBuilder arribaArgs = new StringBuilder();
    arribaArgs.append(String.format(" -x %s/%s", VmDirectories.INPUT, bamFile));
    arribaArgs.append(String.format(" -o %s/%s.fusions.tsv", VmDirectories.OUTPUT, sampleId));
    arribaArgs.append(String.format(" -O %s/%s.fusions.discarded.tsv", VmDirectories.OUTPUT, sampleId));
    arribaArgs.append(String.format(" -a %s/%s", VmDirectories.INPUT, REF_GENOME));
    arribaArgs.append(String.format(" -g %s/%s", VmDirectories.INPUT, GENE_DEFINITIONS));
    arribaArgs.append(String.format(" -b %s/%s", VmDirectories.INPUT, BLACKLIST));
    arribaArgs.append(" -T -P");
    startupScript.addCommand(() -> format("%s/%s %s", VmDirectories.TOOLS, ARRIBA_TOOL, arribaArgs.toString()));
    /*
            ./tools/arriba_v1.1.0/arriba
            -x ./runs/CPCT02020378T/CPCT02020378T.sorted.bam
            -o ./runs/CPCT02020378T/fusions.tsv -O
            ./runs/CPCT02020378T/fusions.discarded.tsv
            -a "./ref/hs37d5_GENCODE19/hs37d5.fa"
            -g "./ref/hs37d5_GENCODE19/GENCODE19.gtf"
            -b "./tools/arriba_v1.1.0/database/blacklist_hg19_hs37d5_GRCh37_2018-11-04.tsv.gz"
            -T -P
         */
    // upload the results
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(bucket.name(), "arriba"), executionFlags));
    // copy results to rna-analysis location on crunch
    startupScript.addCommand(() -> format("gsutil -m cp %s/* %s/%s/arriba/", VmDirectories.OUTPUT, RNA_COHORT_LOCATION_V37, sampleId));
    return ImmutableVirtualMachineJobDefinition.builder().name("rna-arriba").startupCommand(startupScript).namespacedResults(ResultsDirectory.defaultDirectory()).workingDiskSpaceGb(100).performanceProfile(VirtualMachinePerformanceProfile.custom(12, 64)).build();
}
Also used : OutputUpload(com.hartwig.pipeline.execution.vm.OutputUpload) InputFileDescriptor(com.hartwig.batch.input.InputFileDescriptor)
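All of the batch operations in these examples follow the same skeleton: stage inputs onto the VM with gsutil commands, run the tool, persist results with an OutputUpload, and return a VirtualMachineJobDefinition. Below is a minimal sketch of that skeleton, using only the pipeline5 calls that appear in these snippets; the operation name, bucket path and tool invocation are hypothetical placeholders rather than code from the project.

@Override
public VirtualMachineJobDefinition execute(InputBundle inputs, RuntimeBucket bucket, BashStartupScript startupScript, RuntimeFiles executionFlags) {
    // hypothetical sample id taken directly from the single batch input value
    final String sampleId = inputs.get().inputValue();
    // stage an input file into the VM input directory (the bucket path is a placeholder)
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp gs://example-bucket/%s.bam %s", sampleId, VmDirectories.INPUT));
    // run a tool against the staged input (the tool name is a placeholder)
    startupScript.addCommand(() -> format("%s/example-tool %s/%s.bam", VmDirectories.TOOLS, VmDirectories.INPUT, sampleId));
    // copy everything written to the VM output directory back into the runtime bucket
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(bucket.name(), "example"), executionFlags));
    return ImmutableVirtualMachineJobDefinition.builder().name("example-batch").startupCommand(startupScript).namespacedResults(ResultsDirectory.defaultDirectory()).workingDiskSpaceGb(100).build();
}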

Example 27 with OutputUpload

use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

the class RnaIsofoxExonCounts method execute.

@Override
public VirtualMachineJobDefinition execute(InputBundle inputs, RuntimeBucket bucket, BashStartupScript startupScript, RuntimeFiles executionFlags) {
    InputFileDescriptor descriptor = inputs.get();
    final String batchInputs = descriptor.inputValue();
    final String[] batchItems = batchInputs.split(",");
    if (batchItems.length < 2) {
        System.out.print(String.format("invalid input arguments(%s) - expected SampleId,ReadLength", batchInputs));
        return null;
    }
    final String sampleId = batchItems[COL_SAMPLE_ID];
    final String geneIds = batchItems[COL_GENE_IDS];
    final RefGenomeVersion refGenomeVersion = V37;
    final ResourceFiles resourceFiles = buildResourceFiles(refGenomeVersion);
    // final String rnaCohortDirectory = getRnaCohortDirectory(refGenomeVersion);
    final String samplesDir = String.format("%s/%s", getRnaCohortDirectory(refGenomeVersion), "samples");
    // copy down BAM and index file for this sample
    final String bamFile = String.format("%s%s", sampleId, RNA_BAM_FILE_ID);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", samplesDir, sampleId, bamFile, VmDirectories.INPUT));
    final String bamIndexFile = String.format("%s%s", sampleId, RNA_BAM_INDEX_FILE_ID);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", samplesDir, sampleId, bamIndexFile, VmDirectories.INPUT));
    // copy down the Isofox JAR
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s %s", ISOFOX_LOCATION, ISOFOX_JAR, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format("cd %s", VmDirectories.OUTPUT));
    // run Isofox
    StringJoiner isofoxArgs = new StringJoiner(" ");
    isofoxArgs.add(String.format("-sample %s", sampleId));
    isofoxArgs.add(String.format("-functions %s", FUNC_TRANSCRIPT_COUNTS));
    isofoxArgs.add(String.format("-output_dir %s/", VmDirectories.OUTPUT));
    isofoxArgs.add(String.format("-bam_file %s/%s", VmDirectories.INPUT, bamFile));
    isofoxArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
    isofoxArgs.add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()));
    isofoxArgs.add(String.format("-write_exon_data"));
    // isofoxArgs.add(String.format("-write_read_data"));
    isofoxArgs.add(String.format("-restricted_gene_ids %s", geneIds));
    startupScript.addCommand(() -> format("java -jar %s/%s %s", VmDirectories.TOOLS, ISOFOX_JAR, isofoxArgs.toString()));
    // upload the results
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(bucket.name(), "isofox"), executionFlags));
    return ImmutableVirtualMachineJobDefinition.builder().name("rna-isofox").startupCommand(startupScript).namespacedResults(ResultsDirectory.defaultDirectory()).workingDiskSpaceGb(MAX_EXPECTED_BAM_SIZE_GB).build();
}
Also used : ResourceFilesFactory.buildResourceFiles(com.hartwig.pipeline.resource.ResourceFilesFactory.buildResourceFiles) ResourceFiles(com.hartwig.pipeline.resource.ResourceFiles) OutputUpload(com.hartwig.pipeline.execution.vm.OutputUpload) InputFileDescriptor(com.hartwig.batch.input.InputFileDescriptor) RefGenomeVersion(com.hartwig.pipeline.resource.RefGenomeVersion) StringJoiner(java.util.StringJoiner)
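The comma-separated batch input handling above (split, length check, then indexing by column constants) is repeated verbatim in the next examples. A small helper along the following lines could centralise that validation; it is a hypothetical sketch for illustration only and is not part of pipeline5.

// hypothetical helper, not part of pipeline5: splits a comma-separated batch input
// and fails fast when fewer fields than expected are present
private static String[] parseBatchItems(final String batchInputs, final int expectedFields, final String expectedFormat) {
    final String[] items = batchInputs.split(",");
    if (items.length < expectedFields) {
        throw new IllegalArgumentException(String.format("invalid input arguments(%s) - expected %s", batchInputs, expectedFormat));
    }
    return items;
}

With such a helper the execute methods would call parseBatchItems(batchInputs, 2, "SampleId,GeneIds") instead of printing and returning null, though returning null may be intentional so that a malformed item skips quietly rather than failing the whole batch.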

Example 28 with OutputUpload

use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

the class RnaIsofoxSpliceJunctions method execute.

@Override
public VirtualMachineJobDefinition execute(InputBundle inputs, RuntimeBucket bucket, BashStartupScript startupScript, RuntimeFiles executionFlags) {
    InputFileDescriptor descriptor = inputs.get();
    final String batchInputs = descriptor.inputValue();
    final String[] batchItems = batchInputs.split(",");
    if (batchItems.length < 2) {
        System.out.print(String.format("invalid input arguments(%s) - expected SampleId,ReadLength", batchInputs));
        return null;
    }
    final String sampleId = batchItems[COL_SAMPLE_ID];
    final String geneIds = batchItems[COL_GENE_IDS];
    final RefGenomeVersion refGenomeVersion = V37;
    final ResourceFiles resourceFiles = buildResourceFiles(refGenomeVersion);
    final String samplesDir = String.format("%s/%s", getRnaCohortDirectory(refGenomeVersion), "samples");
    // copy down BAM and index file for this sample
    final String bamFile = String.format("%s%s", sampleId, RNA_BAM_FILE_ID);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", samplesDir, sampleId, bamFile, VmDirectories.INPUT));
    final String bamIndexFile = String.format("%s%s", sampleId, RNA_BAM_INDEX_FILE_ID);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", samplesDir, sampleId, bamIndexFile, VmDirectories.INPUT));
    // copy down the executable
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s %s", ISOFOX_LOCATION, ISOFOX_JAR, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format("cd %s", VmDirectories.OUTPUT));
    // run Isofox
    StringJoiner isofoxArgs = new StringJoiner(" ");
    isofoxArgs.add(String.format("-sample %s", sampleId));
    isofoxArgs.add(String.format("-functions %s", FUNC_TRANSCRIPT_COUNTS));
    isofoxArgs.add(String.format("-output_dir %s/", VmDirectories.OUTPUT));
    isofoxArgs.add(String.format("-bam_file %s/%s", VmDirectories.INPUT, bamFile));
    isofoxArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
    isofoxArgs.add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()));
    isofoxArgs.add(String.format("-write_exon_data"));
    isofoxArgs.add(String.format("-restricted_gene_ids %s", geneIds));
    isofoxArgs.add(" -output_id gene_sj");
    startupScript.addCommand(() -> format("java -jar %s/%s %s", VmDirectories.TOOLS, ISOFOX_JAR, isofoxArgs.toString()));
    // upload the results
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(bucket.name(), "isofox"), executionFlags));
    return ImmutableVirtualMachineJobDefinition.builder().name("rna-isofox").startupCommand(startupScript).namespacedResults(ResultsDirectory.defaultDirectory()).workingDiskSpaceGb(MAX_EXPECTED_BAM_SIZE_GB).build();
}
Also used : ResourceFilesFactory.buildResourceFiles(com.hartwig.pipeline.resource.ResourceFilesFactory.buildResourceFiles) ResourceFiles(com.hartwig.pipeline.resource.ResourceFiles) OutputUpload(com.hartwig.pipeline.execution.vm.OutputUpload) InputFileDescriptor(com.hartwig.batch.input.InputFileDescriptor) RefGenomeVersion(com.hartwig.pipeline.resource.RefGenomeVersion) StringJoiner(java.util.StringJoiner)

Example 29 with OutputUpload

use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

the class RnaRsem method execute.

@Override
public VirtualMachineJobDefinition execute(InputBundle inputs, RuntimeBucket bucket, BashStartupScript startupScript, RuntimeFiles executionFlags) {
    InputFileDescriptor descriptor = inputs.get();
    final String batchInputs = descriptor.inputValue();
    final String[] batchItems = batchInputs.split(",");
    if (batchItems.length != 2) {
        System.out.print(String.format("invalid input arguments(%s) - expected SampleId,PathToFastqFiles", batchInputs));
        return null;
    }
    final String sampleId = batchItems[0];
    final String fastqFilelist = batchItems[1];
    final List<String> sampleFastqFiles = getSampleFastqFileList(sampleId, fastqFilelist);
    if (sampleFastqFiles.isEmpty()) {
        System.out.print(String.format("sampleId(%s) fastq files not found", sampleId));
        return null;
    }
    // copy down FASTQ files for this sample
    for (final String fastqFile : sampleFastqFiles) {
        startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s %s", fastqFile, VmDirectories.INPUT));
    }
    // download the executables
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp -r %s/%s %s", RSEM_RESOURCES, RSEM_TOOL, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format("chmod a+x %s/%s/*", VmDirectories.TOOLS, RSEM_TOOL));
    // locate the FASTQ files for reads 1 and 2
    final String r1Files = format("$(ls %s/*_R1* | tr '\\n' ',')", VmDirectories.INPUT);
    final String r2Files = format("$(ls %s/*_R2* | tr '\\n' ',')", VmDirectories.INPUT);
    // download reference files
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp -r %s/%s %s", RNA_RESOURCES, REF_GENCODE_37_DIR, VmDirectories.INPUT));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp -r %s/%s %s", RSEM_RESOURCES, RSEM_GENE_INDEX_DIR, VmDirectories.INPUT));
    startupScript.addCommand(() -> format("cd %s", VmDirectories.OUTPUT));
    // run STAR with transcriptome mapping
    final String refGenomeDir = String.format("%s/%s", VmDirectories.INPUT, REF_GENCODE_37_DIR);
    final String threadCount = Bash.allCpus();
    final String[] starArgs = { "--runThreadN", threadCount, "--genomeDir", refGenomeDir, "--genomeLoad", "NoSharedMemory",
            "--readFilesIn", r1Files, r2Files, "--readFilesCommand", "zcat",
            "--outSAMtype", "BAM", "Unsorted", "--outSAMunmapped", "Within", "--outBAMcompression", "0", "--outSAMattributes", "All",
            "--outFilterMultimapNmax", "10", "--outFilterMismatchNmax", "3", "limitOutSJcollapsed", "3000000",
            "--chimSegmentMin", "10", "--chimOutType", "WithinBAM", "SoftClip", "--chimJunctionOverhangMin", "10",
            "--chimSegmentReadGapMax", "3", "--chimScoreMin", "1", "--chimScoreDropMax", "30",
            "--chimScoreJunctionNonGTAG", "0", "--chimScoreSeparation", "1",
            "--outFilterScoreMinOverLread", "0.33", "--outFilterMatchNminOverLread", "0.33", "--outFilterMatchNmin", "35",
            "--alignSplicedMateMapLminOverLmate", "0.33", "--alignSplicedMateMapLmin", "35",
            "--alignSJstitchMismatchNmax", "5", "-1", "5", "5",
            // key line for RSEM
            "--quantMode", "TranscriptomeSAM" };
    startupScript.addCommand(new VersionedToolCommand("star", "STAR", "2.7.3a", starArgs));
    // key output file is 'Aligned.toTranscriptome.out.bam'
    // ./tools/RSEM-1.3.3/rsem-calculate-expression --alignments --paired-end
    // ./runs/CPCT02020378T/Aligned.toTranscriptome.out.bam
    // ./ref/rsem_gene_index/human_gencode
    // CPCT02020378T.rsem -p 6 &
    final String transcriptomeBam = "Aligned.toTranscriptome.out.bam";
    // TMP: copy transcriptome BAM to the bucket
    // startupScript.addCommand(() -> format("gsutil -m cp %s/%s gs://rna-cohort/%s/rsem/", VmDirectories.OUTPUT, transcriptomeBam, sampleId));
    final String rsemGeneIndex = String.format("%s/%s/%s", VmDirectories.INPUT, RSEM_GENE_INDEX_DIR, RSEM_GENE_INDEX);
    final String outputPrefix = String.format("%s.rsem", sampleId);
    // run RSEM
    StringBuilder rsemArgs = new StringBuilder();
    rsemArgs.append(" --alignments");
    rsemArgs.append(" --paired-end");
    rsemArgs.append(String.format(" %s", transcriptomeBam));
    rsemArgs.append(String.format(" %s", rsemGeneIndex));
    rsemArgs.append(String.format(" %s", outputPrefix));
    rsemArgs.append(String.format(" -p %s", threadCount));
    // run RSEM transcript expression calcs
    startupScript.addCommand(() -> format("%s/%s/%s %s", VmDirectories.TOOLS, RSEM_TOOL, RSEM_EXPRESSION_CMD, rsemArgs.toString()));
    startupScript.addCommand(() -> format("mv %s.rsem.genes.results %s.rsem.gene_data.tsv", sampleId, sampleId));
    startupScript.addCommand(() -> format("mv %s.rsem.isoforms.results %s.rsem.trans_data.tsv", sampleId, sampleId));
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(bucket.name(), "rsem"), executionFlags));
    // copy results to rna-analysis location on crunch
    startupScript.addCommand(() -> format("gsutil -m cp %s/*tsv %s/%s/rsem/", VmDirectories.OUTPUT, RNA_COHORT_LOCATION_V37, sampleId));
    return ImmutableVirtualMachineJobDefinition.builder().name("rna-rsem").startupCommand(startupScript).namespacedResults(ResultsDirectory.defaultDirectory()).workingDiskSpaceGb(500).performanceProfile(VirtualMachinePerformanceProfile.custom(12, 36)).build();
}
Also used : OutputUpload(com.hartwig.pipeline.execution.vm.OutputUpload) InputFileDescriptor(com.hartwig.batch.input.InputFileDescriptor) VersionedToolCommand(com.hartwig.pipeline.calling.command.VersionedToolCommand)
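The essential handoff in this example is that STAR is run with --quantMode TranscriptomeSAM, which makes it write the Aligned.toTranscriptome.out.bam that RSEM then consumes. A trimmed sketch of just that chain is shown below; the reduced STAR argument list is for illustration and omits the tuning flags used above.

// STAR writes Aligned.toTranscriptome.out.bam into the working directory because of --quantMode TranscriptomeSAM
final String[] minimalStarArgs = { "--runThreadN", threadCount, "--genomeDir", refGenomeDir,
        "--readFilesIn", r1Files, r2Files, "--readFilesCommand", "zcat",
        "--outSAMtype", "BAM", "Unsorted", "--quantMode", "TranscriptomeSAM" };
startupScript.addCommand(new VersionedToolCommand("star", "STAR", "2.7.3a", minimalStarArgs));
// RSEM then takes the transcriptome BAM, the prepared gene index and an output prefix
startupScript.addCommand(() -> format("%s/%s/%s --alignments --paired-end %s %s %s.rsem -p %s",
        VmDirectories.TOOLS, RSEM_TOOL, RSEM_EXPRESSION_CMD, transcriptomeBam, rsemGeneIndex, sampleId, threadCount));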

Example 30 with OutputUpload

use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

the class SageCreatePon method execute.

@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    final String output = String.format("%s/SAGE.pon.vcf.gz", VmDirectories.OUTPUT);
    final BashCommand sageCommand = new SageCommand("com.hartwig.hmftools.sage.pon.PonApplication", "100G", "-in", VmDirectories.INPUT, "-out", output, "-threads", Bash.allCpus());
    // Download required resources
    // startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s %s",
    // "gs://batch-sage-validation/resources/sage.jar",
    // "/opt/tools/sage/" + Versions.SAGE + "/sage.jar"));
    // Download somatic VCFs (and indexes)
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch -m cp %s %s", "gs://batch-sage/*/sage/*.sage.somatic.vcf.gz", VmDirectories.INPUT));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch -m cp %s %s", "gs://batch-sage/*/sage/*.sage.somatic.vcf.gz.tbi", VmDirectories.INPUT));
    // Run Pon Generator
    startupScript.addCommand(sageCommand);
    // Store output
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "sage"), executionFlags));
    return VirtualMachineJobDefinition.sageSomaticCalling(startupScript, ResultsDirectory.defaultDirectory());
}
Also used : OutputUpload(com.hartwig.pipeline.execution.vm.OutputUpload) BashCommand(com.hartwig.pipeline.execution.vm.BashCommand) SageCommand(com.hartwig.pipeline.calling.sage.SageCommand)
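Unlike the RNA examples above, which assemble a custom machine via ImmutableVirtualMachineJobDefinition.builder(), this one returns the sageSomaticCalling() preset. If a bespoke machine shape were wanted instead, the builder form would look roughly like the following; the name, disk size and CPU/memory values are illustrative placeholders, not values taken from SageCreatePon.

// hypothetical builder-based alternative to the sageSomaticCalling() preset
return ImmutableVirtualMachineJobDefinition.builder()
        .name("sage-pon")
        .startupCommand(startupScript)
        .namespacedResults(ResultsDirectory.defaultDirectory())
        .workingDiskSpaceGb(500)
        .performanceProfile(VirtualMachinePerformanceProfile.custom(32, 120))
        .build();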

Aggregations

OutputUpload (com.hartwig.pipeline.execution.vm.OutputUpload) 40
InputFileDescriptor (com.hartwig.batch.input.InputFileDescriptor) 35
ResourceFiles (com.hartwig.pipeline.resource.ResourceFiles) 24
StringJoiner (java.util.StringJoiner) 12
GoogleStorageLocation (com.hartwig.pipeline.storage.GoogleStorageLocation) 9
RemoteLocationsApi (com.hartwig.batch.api.RemoteLocationsApi) 7
CopyLogToOutput (com.hartwig.pipeline.execution.vm.CopyLogToOutput) 6
VersionedToolCommand (com.hartwig.pipeline.calling.command.VersionedToolCommand) 5
RefGenomeVersion (com.hartwig.pipeline.resource.RefGenomeVersion) 5
ResourceFilesFactory.buildResourceFiles (com.hartwig.pipeline.resource.ResourceFilesFactory.buildResourceFiles) 5
SubStageInputOutput (com.hartwig.pipeline.stages.SubStageInputOutput) 5
File (java.io.File) 5
BwaCommand (com.hartwig.pipeline.calling.command.BwaCommand) 3
SamtoolsCommand (com.hartwig.pipeline.calling.command.SamtoolsCommand) 3
InputDownload (com.hartwig.pipeline.execution.vm.InputDownload) 3
OutputFile (com.hartwig.pipeline.execution.vm.OutputFile) 3
SageApplication (com.hartwig.pipeline.calling.sage.SageApplication) 2
SageCommandBuilder (com.hartwig.pipeline.calling.sage.SageCommandBuilder) 2
GridssAnnotation (com.hartwig.pipeline.calling.structural.gridss.stage.GridssAnnotation) 2
PipelineStatus (com.hartwig.pipeline.execution.PipelineStatus) 2