Search in sources :

Example 31 with OutputUpload

use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

Method execute of the class SageCreatePonData.

/**
 * Assembles the startup script for a SAGE run in PON mode on one reference sample and wraps it in a job definition.
 *
 * <p>Commands are appended to {@code startupScript} in this order: the command forms of the "reference" input and
 * of its index, the commands returned by {@code cramToBam} (only when substituting "bam" for "cram" changes the
 * local file name), the bash commands produced by the SAGE application, and an upload to the "sage" path of the
 * runtime bucket.
 *
 * @param inputs         bundle providing the "reference" file descriptor and the "referenceSample" value
 * @param runtimeBucket  bucket whose name is used as the destination of the output upload
 * @param startupScript  script that receives the commands
 * @param executionFlags runtime files handed to the output upload
 * @return the job definition created by {@code VirtualMachineJobDefinition.sageSomaticCalling}
 */
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    final ResourceFiles v37Resources = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V37);

    final InputFileDescriptor referenceDescriptor = inputs.get("reference");
    final InputFileDescriptor referenceIndexDescriptor = referenceDescriptor.index();
    final String localReferencePath = localFilename(referenceDescriptor);
    // Every occurrence of "cram" in the local path is substituted, not just the file extension.
    final String localBamPath = localReferencePath.replace("cram", "bam");
    final String sampleName = inputs.get("referenceSample").inputValue();

    // Disabled: pull the latest SAGE jar over the installed one.
    // startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s %s",
    // "gs://batch-sage-validation/resources/sage.jar",
    // "/opt/tools/sage/" + Versions.SAGE + "/sage.jar"));

    // Download the reference alignment and its index
    startupScript.addCommand(() -> referenceDescriptor.toCommandForm(localReferencePath));
    startupScript.addCommand(() -> referenceIndexDescriptor.toCommandForm(localFilename(referenceIndexDescriptor)));

    // SAGE is always pointed at the BAM path, whether or not a conversion is needed
    final SageCommandBuilder ponBuilder = new SageCommandBuilder(v37Resources).ponMode(sampleName, localBamPath);
    final SageApplication sage = new SageApplication(ponBuilder);

    // A differing BAM path means the input was a CRAM, so convert it first
    final boolean needsBamConversion = !localReferencePath.equals(localBamPath);
    if (needsBamConversion) {
        startupScript.addCommands(cramToBam(localReferencePath));
    }

    // Append the bash commands generated by the SAGE application for this sample
    final SubStageInputOutput sageStage = sage.apply(SubStageInputOutput.empty(sampleName));
    startupScript.addCommands(sageStage.bash());

    // Upload the output to the "sage" path of the runtime bucket
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "sage"), executionFlags));

    return VirtualMachineJobDefinition.sageSomaticCalling(startupScript, ResultsDirectory.defaultDirectory());
}
Also used : ResourceFiles(com.hartwig.pipeline.resource.ResourceFiles) OutputUpload(com.hartwig.pipeline.execution.vm.OutputUpload) InputFileDescriptor(com.hartwig.batch.input.InputFileDescriptor) SubStageInputOutput(com.hartwig.pipeline.stages.SubStageInputOutput) SageCommandBuilder(com.hartwig.pipeline.calling.sage.SageCommandBuilder) SageApplication(com.hartwig.pipeline.calling.sage.SageApplication)

Example 32 with OutputUpload

use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

Method execute of the class SagePanelTumor.

/**
 * Builds the VM job that runs SAGE on a tumor-only panel sample (V38 resources) and then annotates the calls with Pave.
 *
 * <p>The startup script receives, in order: gsutil copies of the SAGE jar, panel BED, Pave jar, Gnomad files,
 * PON file and PON artefact file; a copy of the tumor BAM; the SAGE invocation; the Pave invocation; and an
 * upload to the "sage" path of the runtime bucket.
 *
 * @param inputs         bundle whose default input value is used as the sample id
 * @param runtimeBucket  bucket whose name is used as the destination of the output upload
 * @param startupScript  script that receives the commands
 * @param executionFlags runtime files handed to the output upload
 * @return a job definition named "sage" with performance profile {@code custom(24, 64)}
 */
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    final InputFileDescriptor sampleDescriptor = inputs.get();
    final String sampleId = sampleDescriptor.inputValue();
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V38);

    // Shared template for "copy <bucket>/<dir>/<file> to <local dir>"
    final String copyTemplate = "gsutil -u hmf-crunch cp %s/%s/%s %s";
    final String ponFile = "SageGermlinePon.98x.38.tsv.gz";
    final String ponArtefactFile = "pon_panel_artefact.38.tsv";

    // Tools and resources
    startupScript.addCommand(() -> format(copyTemplate, BATCH_TOOLS_BUCKET, SAGE_DIR, SAGE_JAR, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format(copyTemplate, BATCH_RESOURCE_BUCKET, SAGE_DIR, PANEL_BED, VmDirectories.INPUT));
    startupScript.addCommand(() -> format(copyTemplate, BATCH_TOOLS_BUCKET, PAVE_DIR, PAVE_JAR, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/38/* %s", BATCH_RESOURCE_BUCKET, GNOMAD_DIR, VmDirectories.INPUT));
    startupScript.addCommand(() -> format(copyTemplate, BATCH_RESOURCE_BUCKET, SAGE_DIR, ponFile, VmDirectories.INPUT));
    startupScript.addCommand(() -> format(copyTemplate, BATCH_RESOURCE_BUCKET, SAGE_DIR, ponArtefactFile, VmDirectories.INPUT));

    // Tumor BAM; the trailing wildcard also matches files sharing the BAM's name as a prefix
    final String tumorBam = sampleId + ".non_umi_dedup.bam";
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s* %s", PANEL_BAM_BUCKET, tumorBam, VmDirectories.INPUT));

    // SAGE
    final String sageVcf = String.format("%s/%s.sage.somatic.vcf.gz", VmDirectories.OUTPUT, sampleId);
    final StringJoiner sageArgs = new StringJoiner(" ")
            .add(String.format("-tumor %s", sampleId))
            .add(String.format("-tumor_bam %s/%s", VmDirectories.INPUT, tumorBam))
            .add(String.format("-hotspots %s", resourceFiles.sageSomaticHotspots()))
            .add(String.format("-panel_bed %s", resourceFiles.sageSomaticCodingPanel()))
            .add(String.format("-high_confidence_bed %s", resourceFiles.giabHighConfidenceBed()))
            .add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()))
            .add(String.format("-ref_genome_version %s", resourceFiles.version().toString()))
            .add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()))
            .add(String.format("-coverage_bed %s/%s", VmDirectories.INPUT, PANEL_BED))
            .add(String.format("-out %s", sageVcf))
            .add("-hotspot_min_tumor_qual 100")
            .add("-panel_min_tumor_qual 200")
            .add("-high_confidence_min_tumor_qual 200")
            .add("-low_confidence_min_tumor_qual 300")
            .add("-mnv_filter_enabled false")
            .add("-perf_warn_time 50")
            // .add("-log_debug")
            .add(String.format("-threads %s", Bash.allCpus()));
    startupScript.addCommand(() -> format("java -Xmx48G -jar %s/%s %s", VmDirectories.TOOLS, SAGE_JAR, sageArgs.toString()));

    // Pave: PON, Gnomad and gene-impact annotation, run directly on the SAGE VCF.
    // An earlier bcftools-based PON annotate/filter step is disabled; Pave is given the PON files and filters instead.
    final String ponFilters = "HOTSPOT:5:5;PANEL:2:5;UNKNOWN:2:0";
    final String paveVcf = String.format("%s/%s.sage.somatic.pon.pave_pass.vcf.gz", VmDirectories.OUTPUT, sampleId);
    final StringJoiner paveArgs = new StringJoiner(" ")
            .add(String.format("-sample %s", sampleId))
            .add(String.format("-vcf_file %s", sageVcf))
            .add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()))
            .add(String.format("-ref_genome_version %s", resourceFiles.version().toString()))
            .add(String.format("-driver_gene_panel %s", resourceFiles.driverGenePanel()))
            .add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()))
            .add("-only_canonical")
            .add("-filter_pass")
            .add(String.format("-gnomad_freq_dir %s", VmDirectories.INPUT))
            .add(String.format("-pon_file %s/%s", VmDirectories.INPUT, ponFile))
            .add(String.format("-pon_artefact_file %s/%s", VmDirectories.INPUT, ponArtefactFile))
            .add(String.format("-pon_filters \"%s\"", ponFilters))
            .add("-gnomad_load_chr_on_demand")
            .add(String.format("-output_vcf_file %s", paveVcf));
    // The Pave jar is the one copied from the batch tools bucket above
    final String paveJar = String.format("%s/%s", VmDirectories.TOOLS, PAVE_JAR);
    startupScript.addCommand(() -> format("java -jar %s %s", paveJar, paveArgs.toString()));

    // Upload the output to the "sage" path of the runtime bucket
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "sage"), executionFlags));

    return ImmutableVirtualMachineJobDefinition.builder()
            .name("sage")
            .startupCommand(startupScript)
            .performanceProfile(custom(24, 64))
            .namespacedResults(ResultsDirectory.defaultDirectory())
            .build();
}
Also used : ResourceFiles(com.hartwig.pipeline.resource.ResourceFiles) OutputUpload(com.hartwig.pipeline.execution.vm.OutputUpload) InputFileDescriptor(com.hartwig.batch.input.InputFileDescriptor) StringJoiner(java.util.StringJoiner)

Example 33 with OutputUpload

use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

Method execute of the class SageRerun.

/**
 * Builds the VM job that re-runs SAGE on a tumor/reference CRAM pair (V37 resources) and annotates the result with Pave.
 *
 * <p>The startup script receives, in order: gsutil copies of the SAGE jar, Pave jar and PON file; copies of the
 * tumor and reference CRAMs located through {@code RemoteLocationsApi}; the SAGE invocation; the Pave invocation;
 * and an upload to the "sage" path of the runtime bucket.
 *
 * @param inputs         bundle whose default input value is used as the tumor sample id
 * @param runtimeBucket  bucket whose name is used as the destination of the output upload
 * @param startupScript  script that receives the commands
 * @param executionFlags runtime files handed to the output upload
 * @return a job definition named "sage" with performance profile {@code custom(24, 64)}
 */
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    final InputFileDescriptor sampleDescriptor = inputs.get();
    final String sampleId = sampleDescriptor.inputValue();
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V37);

    // Tools and resources
    final String copyTemplate = "gsutil -u hmf-crunch cp %s/%s/%s %s";
    final String ponFile = "SageGermlinePon.1000x.37.tsv.gz";
    startupScript.addCommand(() -> format(copyTemplate, BATCH_TOOLS_BUCKET, SAGE_DIR, SAGE_JAR, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format(copyTemplate, BATCH_TOOLS_BUCKET, PAVE_DIR, PAVE_JAR, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format(copyTemplate, BATCH_RESOURCE_BUCKET, SAGE_DIR, ponFile, VmDirectories.INPUT));

    // Tumor and reference CRAMs; the trailing wildcard also matches files sharing the CRAM path as a prefix
    final String cramCopyTemplate = "gsutil -u hmf-crunch cp gs://%s* %s";
    final RemoteLocationsApi locations = new RemoteLocationsApi("hmf-crunch", sampleId);

    final String[] tumorCram = getCramFileData(locations.getTumorAlignment());
    final String tumorCramName = tumorCram[CRAM_FILENAME];
    startupScript.addCommand(() -> format(cramCopyTemplate, tumorCram[CRAM_FULL_PATH], VmDirectories.INPUT));

    final String referenceId = locations.getReference();
    final String[] referenceCram = getCramFileData(locations.getReferenceAlignment());
    final String referenceCramName = referenceCram[CRAM_FILENAME];
    startupScript.addCommand(() -> format(cramCopyTemplate, referenceCram[CRAM_FULL_PATH], VmDirectories.INPUT));

    // SAGE; the CRAM files are passed through the *_bam arguments
    final String sageVcf = String.format("%s/%s.sage.somatic.vcf.gz", VmDirectories.OUTPUT, sampleId);
    final StringJoiner sageArgs = new StringJoiner(" ")
            .add(String.format("-tumor %s", sampleId))
            .add(String.format("-tumor_bam %s/%s", VmDirectories.INPUT, tumorCramName))
            .add(String.format("-reference %s", referenceId))
            .add(String.format("-reference_bam %s/%s", VmDirectories.INPUT, referenceCramName))
            .add(String.format("-hotspots %s", resourceFiles.sageSomaticHotspots()))
            .add(String.format("-panel_bed %s", resourceFiles.sageSomaticCodingPanel()))
            .add(String.format("-high_confidence_bed %s", resourceFiles.giabHighConfidenceBed()))
            .add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()))
            .add(String.format("-ref_genome_version %s", resourceFiles.version().toString()))
            .add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()))
            .add(String.format("-out %s", sageVcf))
            .add("-perf_warn_time 50")
            // .add("-log_debug")
            .add(String.format("-threads %s", Bash.allCpus()));
    startupScript.addCommand(() -> format("java -Xmx48G -jar %s/%s %s", VmDirectories.TOOLS, SAGE_JAR, sageArgs.toString()));

    // Pave: PON and gene-impact annotation, run directly on the SAGE VCF
    final String ponFilters = "HOTSPOT:5:5;PANEL:2:5;UNKNOWN:2:0";
    final String paveVcf = String.format("%s/%s.sage.somatic.pon.pave.vcf.gz", VmDirectories.OUTPUT, sampleId);
    final StringJoiner paveArgs = new StringJoiner(" ")
            .add(String.format("-sample %s", sampleId))
            .add(String.format("-vcf_file %s", sageVcf))
            .add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()))
            .add(String.format("-ref_genome_version %s", resourceFiles.version().toString()))
            .add(String.format("-driver_gene_panel %s", resourceFiles.driverGenePanel()))
            .add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()))
            .add(String.format("-pon_file %s/%s", VmDirectories.INPUT, ponFile))
            .add(String.format("-pon_filters \"%s\"", ponFilters))
            .add(String.format("-output_vcf_file %s", paveVcf));
    final String paveJar = String.format("%s/%s", VmDirectories.TOOLS, PAVE_JAR);
    startupScript.addCommand(() -> format("java -jar %s %s", paveJar, paveArgs.toString()));

    // Upload the output to the "sage" path of the runtime bucket
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "sage"), executionFlags));

    return ImmutableVirtualMachineJobDefinition.builder()
            .name("sage")
            .startupCommand(startupScript)
            .performanceProfile(custom(24, 64))
            .namespacedResults(ResultsDirectory.defaultDirectory())
            .build();
}
Also used : ResourceFiles(com.hartwig.pipeline.resource.ResourceFiles) OutputUpload(com.hartwig.pipeline.execution.vm.OutputUpload) InputFileDescriptor(com.hartwig.batch.input.InputFileDescriptor) StringJoiner(java.util.StringJoiner) RemoteLocationsApi(com.hartwig.batch.api.RemoteLocationsApi)

Example 34 with OutputUpload

use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

Method execute of the class SambambaCramaBam.

/**
 * Builds the VM job that rewrites the input file as CRAM using sambamba.
 *
 * <p>The output file name is the input's base name with a trailing ".bam" swapped for ".cram"; a base name
 * without that suffix is used unchanged. The startup script receives the input's command form, the
 * {@code sambamba view --format=cram} command, and an upload to the "cram" path of the bucket.
 *
 * @param inputs         bundle whose default input value is the path of the file to convert
 * @param bucket         bucket whose name is used as the destination of the output upload
 * @param startupScript  script that receives the commands
 * @param executionFlags runtime files handed to the output upload
 * @return a job definition named "cram" with performance profile {@code custom(4, 6)}
 */
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket bucket, final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    final InputFileDescriptor bamDescriptor = inputs.get();

    // Only the base name of the remote path is used for the local input and output names
    final String bamName = new File(bamDescriptor.inputValue()).getName();
    final String cramPath = VmDirectories.outputFile(bamName.replaceAll("\\.bam$", ".cram"));
    final String localBamPath = String.format("%s/%s", VmDirectories.INPUT, bamName);

    startupScript.addCommand(() -> bamDescriptor.toCommandForm(localBamPath));

    // sambamba is given the V37 reference genome through -T for the CRAM output
    final RefGenome37ResourceFiles v37Resources = new RefGenome37ResourceFiles();
    startupScript.addCommand(new VersionedToolCommand("sambamba",
            "sambamba",
            Versions.SAMBAMBA,
            "view",
            localBamPath,
            "-o",
            cramPath,
            "-t",
            Bash.allCpus(),
            "--format=cram",
            "-T",
            v37Resources.refGenomeFile()));

    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(bucket.name(), "cram"), executionFlags));

    return VirtualMachineJobDefinition.builder()
            .name("cram")
            .startupCommand(startupScript)
            .namespacedResults(ResultsDirectory.defaultDirectory())
            .performanceProfile(VirtualMachinePerformanceProfile.custom(4, 6))
            .build();
}
Also used : OutputUpload(com.hartwig.pipeline.execution.vm.OutputUpload) InputFileDescriptor(com.hartwig.batch.input.InputFileDescriptor) RefGenome37ResourceFiles(com.hartwig.pipeline.resource.RefGenome37ResourceFiles) File(java.io.File) VersionedToolCommand(com.hartwig.pipeline.calling.command.VersionedToolCommand)

Example 35 with OutputUpload

use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.

Method execute of the class LilacCtpacBatch.

/**
 * Builds the VM job that runs Lilac for one sample of the "run_cptac_02" batch.
 *
 * <p>The sample id is the first comma-separated item of the input value. The commands script receives the
 * Lilac download commands, the per-sample commands, an upload to the "lilac" path of the runtime bucket,
 * and a copy of the {@code *.log} files from {@code /data/output} to a shared logs directory in the Lilac batch bucket.
 *
 * @param inputs         bundle whose default input value is a comma-separated list starting with the sample id
 * @param runtimeBucket  bucket whose name is used as the destination of the output upload
 * @param commands       script that receives the commands
 * @param executionFlags runtime files handed to the output upload
 * @return a job definition named "lilac"
 */
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript commands, final RuntimeFiles executionFlags) {
    final InputFileDescriptor runData = inputs.get();
    // Only the first comma-separated item is read here
    final String sampleId = runData.inputValue().split(",")[0];
    final String runDirectory = "run_cptac_02";

    // Download the pilot Lilac jar, then add the commands for this sample
    addLilacDownloadCommands(commands);
    addSampleCommands(runData, commands, runDirectory, sampleId);

    commands.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "lilac"), executionFlags));

    // Also gather the run's log files in one shared directory for convenience
    final String sharedLogDir = String.format("gs://%s/%s/logs/", LILAC_BATCH_BUCKET, runDirectory);
    commands.addCommand(() -> format("gsutil -m cp /data/output/*.log %s", sharedLogDir));

    return ImmutableVirtualMachineJobDefinition.builder()
            .name("lilac")
            .startupCommand(commands)
            .namespacedResults(ResultsDirectory.defaultDirectory())
            .build();
}
Also used : OutputUpload(com.hartwig.pipeline.execution.vm.OutputUpload) InputFileDescriptor(com.hartwig.batch.input.InputFileDescriptor)

Aggregations

OutputUpload (com.hartwig.pipeline.execution.vm.OutputUpload)40 InputFileDescriptor (com.hartwig.batch.input.InputFileDescriptor)35 ResourceFiles (com.hartwig.pipeline.resource.ResourceFiles)24 StringJoiner (java.util.StringJoiner)12 GoogleStorageLocation (com.hartwig.pipeline.storage.GoogleStorageLocation)9 RemoteLocationsApi (com.hartwig.batch.api.RemoteLocationsApi)7 CopyLogToOutput (com.hartwig.pipeline.execution.vm.CopyLogToOutput)6 VersionedToolCommand (com.hartwig.pipeline.calling.command.VersionedToolCommand)5 RefGenomeVersion (com.hartwig.pipeline.resource.RefGenomeVersion)5 ResourceFilesFactory.buildResourceFiles (com.hartwig.pipeline.resource.ResourceFilesFactory.buildResourceFiles)5 SubStageInputOutput (com.hartwig.pipeline.stages.SubStageInputOutput)5 File (java.io.File)5 BwaCommand (com.hartwig.pipeline.calling.command.BwaCommand)3 SamtoolsCommand (com.hartwig.pipeline.calling.command.SamtoolsCommand)3 InputDownload (com.hartwig.pipeline.execution.vm.InputDownload)3 OutputFile (com.hartwig.pipeline.execution.vm.OutputFile)3 SageApplication (com.hartwig.pipeline.calling.sage.SageApplication)2 SageCommandBuilder (com.hartwig.pipeline.calling.sage.SageCommandBuilder)2 GridssAnnotation (com.hartwig.pipeline.calling.structural.gridss.stage.GridssAnnotation)2 PipelineStatus (com.hartwig.pipeline.execution.PipelineStatus)2