Use of com.hartwig.pipeline.resource.ResourceFiles in project pipeline5 by hartwigmedical.
The class SageCreatePonData, method execute.
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V37);
    final InputFileDescriptor remoteReferenceFile = inputs.get("reference");
    final InputFileDescriptor remoteReferenceIndex = remoteReferenceFile.index();
    final String localReferenceFile = localFilename(remoteReferenceFile);
    final String localReferenceBam = localReferenceFile.replace("cram", "bam");
    final String referenceSampleName = inputs.get("referenceSample").inputValue();
    // Download latest jar file
    // startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s %s",
    //         "gs://batch-sage-validation/resources/sage.jar",
    //         "/opt/tools/sage/" + Versions.SAGE + "/sage.jar"));
    // Download normal
    startupScript.addCommand(() -> remoteReferenceFile.toCommandForm(localReferenceFile));
    startupScript.addCommand(() -> remoteReferenceIndex.toCommandForm(localFilename(remoteReferenceIndex)));
    final SageCommandBuilder sageCommandBuilder = new SageCommandBuilder(resourceFiles).ponMode(referenceSampleName, localReferenceBam);
    final SageApplication sageApplication = new SageApplication(sageCommandBuilder);
    // Convert to bam if necessary
    if (!localReferenceFile.equals(localReferenceBam)) {
        startupScript.addCommands(cramToBam(localReferenceFile));
    }
    // Run post processing (NONE for germline)
    final SubStageInputOutput postProcessing = sageApplication.apply(SubStageInputOutput.empty(referenceSampleName));
    startupScript.addCommands(postProcessing.bash());
    // Store output
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "sage"), executionFlags));
    return VirtualMachineJobDefinition.sageSomaticCalling(startupScript, ResultsDirectory.defaultDirectory());
}
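The cramToBam helper used above is defined elsewhere in the batch module and is not shown in this snippet. A minimal sketch of the idea, assuming samtools is installed under VmDirectories.TOOLS and that a Versions.SAMTOOLS constant exists (both assumptions), might look like this:
// Hypothetical sketch only: convert the downloaded reference CRAM to BAM and index it so SAGE can read it.
private List<BashCommand> cramToBam(final String cramFile) {
    final String bamFile = cramFile.replace("cram", "bam");
    // Assumed samtools path; the real helper may use a dedicated command class instead.
    final String samtools = String.format("%s/samtools/%s/samtools", VmDirectories.TOOLS, Versions.SAMTOOLS);
    final List<BashCommand> commands = new ArrayList<>();
    commands.add(() -> format("%s view -O bam -o %s -@ %s %s", samtools, bamFile, Bash.allCpus(), cramFile));
    commands.add(() -> format("%s index %s", samtools, bamFile));
    return commands;
}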
Use of com.hartwig.pipeline.resource.ResourceFiles in project pipeline5 by hartwigmedical.
The class SagePanelTumor, method execute.
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    InputFileDescriptor descriptor = inputs.get();
    final String sampleId = descriptor.inputValue();
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V38);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_TOOLS_BUCKET, SAGE_DIR, SAGE_JAR, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_RESOURCE_BUCKET, SAGE_DIR, PANEL_BED, VmDirectories.INPUT));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_TOOLS_BUCKET, PAVE_DIR, PAVE_JAR, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/38/* %s", BATCH_RESOURCE_BUCKET, GNOMAD_DIR, VmDirectories.INPUT));
    String ponFile = "SageGermlinePon.98x.38.tsv.gz";
    String ponArtefactFile = "pon_panel_artefact.38.tsv";
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_RESOURCE_BUCKET, SAGE_DIR, ponFile, VmDirectories.INPUT));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_RESOURCE_BUCKET, SAGE_DIR, ponArtefactFile, VmDirectories.INPUT));
    // download tumor BAM
    final String tumorBam = String.format("%s.non_umi_dedup.bam", sampleId);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s* %s", PANEL_BAM_BUCKET, tumorBam, VmDirectories.INPUT));
    final String sageVcf = String.format("%s/%s.sage.somatic.vcf.gz", VmDirectories.OUTPUT, sampleId);
    // run Sage
    final StringJoiner sageArgs = new StringJoiner(" ");
    sageArgs.add(String.format("-tumor %s", sampleId));
    sageArgs.add(String.format("-tumor_bam %s/%s", VmDirectories.INPUT, tumorBam));
    sageArgs.add(String.format("-hotspots %s", resourceFiles.sageSomaticHotspots()));
    sageArgs.add(String.format("-panel_bed %s", resourceFiles.sageSomaticCodingPanel()));
    sageArgs.add(String.format("-high_confidence_bed %s", resourceFiles.giabHighConfidenceBed()));
    sageArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
    sageArgs.add(String.format("-ref_genome_version %s", resourceFiles.version().toString()));
    sageArgs.add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()));
    sageArgs.add(String.format("-coverage_bed %s/%s", VmDirectories.INPUT, PANEL_BED));
    sageArgs.add(String.format("-out %s", sageVcf));
    sageArgs.add("-hotspot_min_tumor_qual 100");
    sageArgs.add("-panel_min_tumor_qual 200");
    sageArgs.add("-high_confidence_min_tumor_qual 200");
    sageArgs.add("-low_confidence_min_tumor_qual 300");
    sageArgs.add("-mnv_filter_enabled false");
    sageArgs.add("-perf_warn_time 50");
    // sageArgs.add(String.format("-log_debug"));
    sageArgs.add(String.format("-threads %s", Bash.allCpus()));
    startupScript.addCommand(() -> format("java -Xmx48G -jar %s/%s %s", VmDirectories.TOOLS, SAGE_JAR, sageArgs.toString()));
    // annotate with Pave - PON, Gnomad and gene impacts
    /*
    String bcfTools = String.format("%s/bcftools/%s/bcftools", VmDirectories.TOOLS, Versions.BCF_TOOLS);
    String ponVcf = String.format("%s/%s.sage.somatic.pon.vcf.gz", VmDirectories.OUTPUT, sampleId);
    // /data/tools/bcftools/1.9/bcftools annotate -a /data/resources/bucket/sage/37/SageGermlinePon.1000x.37.vcf.gz
    //      -c PON_COUNT,PON_MAX
    //      FR16648814.sage.somatic.vcf.gz
    //      -O z
    //      -o FR16648814.sage.somatic.annotated.vcf.gz
    final StringJoiner ponArgs = new StringJoiner(" ");
    ponArgs.add(String.format("-a %s", resourceFiles.sageGermlinePon()));
    ponArgs.add("-c PON_COUNT,PON_MAX");
    ponArgs.add(String.format("%s", sageVcf));
    ponArgs.add("-O z");
    ponArgs.add(String.format("-o %s", ponVcf));
    startupScript.addCommand(() -> format("%s annotate %s", bcfTools, ponArgs.toString()));
    // /data/tools/bcftools/1.9/bcftools filter
    //      -e 'PON_COUNT!="." && INFO/TIER="HOTSPOT" && PON_MAX>=5 && PON_COUNT >= 5'
    //      -s PON -m+ FR16648814.sage.somatic.annotated.vcf.gz -O u
    //      | /data/tools/bcftools/1.9/bcftools filter -e 'PON_COUNT!="." && INFO/TIER="PANEL" && PON_MAX>=5 && PON_COUNT >= 2'
    //      -s PON -m+ -O u | /data/tools/bcftools/1.9/bcftools filter -e 'PON_COUNT!="." && INFO/TIER!="HOTSPOT" && INFO/TIER!="PANEL" && PON_COUNT >= 2'
    //      -s PON -m+ -O z -o FR16648814.sage.somatic.pon_filtered.vcf.gz
    String ponFilterVcf = String.format("%s/%s.sage.somatic.pon_filter.vcf.gz", VmDirectories.OUTPUT, sampleId);
    // private static final String HOTSPOT = "INFO/TIER=\"HOTSPOT\" && PON_MAX>=%s && PON_COUNT >= %s";
    // private static final String PANEL = "INFO/TIER=\"PANEL\" && PON_MAX>=%s && PON_COUNT >= %s";
    // private static final String OTHER = "INFO/TIER!=\"HOTSPOT\" && INFO/TIER!=\"PANEL\" && PON_COUNT >= %s";
    final StringJoiner ponFilterArgs = new StringJoiner(" ");
    ponFilterArgs.add("-e 'PON_COUNT!=\".\" && INFO/TIER=\"HOTSPOT\" && PON_MAX>=5 && PON_COUNT >= 5'");
    ponFilterArgs.add(String.format("-s PON -m+ %s -O u", ponVcf));
    ponFilterArgs.add(String.format("| %s filter -e 'PON_COUNT!=\".\" && INFO/TIER=\"PANEL\" && PON_MAX>=5 && PON_COUNT >= 2'", bcfTools));
    ponFilterArgs.add(String.format("-s PON -m+ -O u | %s filter -e 'PON_COUNT!=\".\" && INFO/TIER!=\"HOTSPOT\" && INFO/TIER!=\"PANEL\" && PON_COUNT >= 2'", bcfTools));
    ponFilterArgs.add(String.format("-s PON -m+ -O z -o %s", ponFilterVcf));
    startupScript.addCommand(() -> format("%s filter %s", bcfTools, ponFilterArgs.toString()));
    */
    // finally run Pave
    final StringJoiner paveArgs = new StringJoiner(" ");
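    // Judging by the bcftools logic commented out above, each pon_filters entry appears to read
    // TIER:minPonCount:minPonMax, i.e. soft-filter a variant of that tier once PON_COUNT and PON_MAX
    // reach those thresholds (e.g. HOTSPOT:5:5; UNKNOWN uses 0 to effectively ignore PON_MAX).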
    String ponFilters = "HOTSPOT:5:5;PANEL:2:5;UNKNOWN:2:0";
    final String paveVcf = String.format("%s/%s.sage.somatic.pon.pave_pass.vcf.gz", VmDirectories.OUTPUT, sampleId);
    paveArgs.add(String.format("-sample %s", sampleId));
    // ponFilterVcf from BCF Tools
    paveArgs.add(String.format("-vcf_file %s", sageVcf));
    paveArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
    paveArgs.add(String.format("-ref_genome_version %s", resourceFiles.version().toString()));
    paveArgs.add(String.format("-driver_gene_panel %s", resourceFiles.driverGenePanel()));
    paveArgs.add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()));
    paveArgs.add("-only_canonical");
    paveArgs.add("-filter_pass");
    paveArgs.add(String.format("-gnomad_freq_dir %s", VmDirectories.INPUT));
    paveArgs.add(String.format("-pon_file %s/%s", VmDirectories.INPUT, ponFile));
    paveArgs.add(String.format("-pon_artefact_file %s/%s", VmDirectories.INPUT, ponArtefactFile));
    paveArgs.add(String.format("-pon_filters \"%s\"", ponFilters));
    paveArgs.add("-gnomad_load_chr_on_demand");
    paveArgs.add(String.format("-output_vcf_file %s", paveVcf));
    String paveJar = String.format("%s/%s", VmDirectories.TOOLS, PAVE_JAR);
    // String paveJar = String.format("%s/pave/%s/pave.jar", VmDirectories.TOOLS, Versions.PAVE);
    startupScript.addCommand(() -> format("java -jar %s %s", paveJar, paveArgs.toString()));
    // upload output
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "sage"), executionFlags));
    return ImmutableVirtualMachineJobDefinition.builder()
            .name("sage")
            .startupCommand(startupScript)
            .performanceProfile(custom(24, 64))
            .namespacedResults(ResultsDirectory.defaultDirectory())
            .build();
}
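The jar, panel BED, Gnomad and PON downloads above all repeat the same requester-pays copy pattern. A small helper along these lines (hypothetical, not part of pipeline5) would capture it:
// Hypothetical convenience wrapper: copy a remote GCS path into a VM directory,
// billing the hmf-crunch project for requester-pays access.
private static void copyToVm(final BashStartupScript script, final String remotePath, final String localDir) {
    script.addCommand(() -> format("gsutil -u hmf-crunch cp %s %s", remotePath, localDir));
}
// Example: copyToVm(startupScript, format("%s/%s/%s", BATCH_TOOLS_BUCKET, SAGE_DIR, SAGE_JAR), VmDirectories.TOOLS);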
Use of com.hartwig.pipeline.resource.ResourceFiles in project pipeline5 by hartwigmedical.
The class SageRerun, method execute.
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    InputFileDescriptor descriptor = inputs.get();
    final String sampleId = descriptor.inputValue();
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V37);
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_TOOLS_BUCKET, SAGE_DIR, SAGE_JAR, VmDirectories.TOOLS));
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_TOOLS_BUCKET, PAVE_DIR, PAVE_JAR, VmDirectories.TOOLS));
    String ponFile = "SageGermlinePon.1000x.37.tsv.gz";
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_RESOURCE_BUCKET, SAGE_DIR, ponFile, VmDirectories.INPUT));
    // download tumor and ref CRAM
    final RemoteLocationsApi locations = new RemoteLocationsApi("hmf-crunch", sampleId);
    String[] tumorCramData = getCramFileData(locations.getTumorAlignment());
    String tumorCramFile = tumorCramData[CRAM_FILENAME];
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp gs://%s* %s", tumorCramData[CRAM_FULL_PATH], VmDirectories.INPUT));
    String referenceId = locations.getReference();
    String[] refCramData = getCramFileData(locations.getReferenceAlignment());
    String refCramFile = refCramData[CRAM_FILENAME];
    startupScript.addCommand(() -> format("gsutil -u hmf-crunch cp gs://%s* %s", refCramData[CRAM_FULL_PATH], VmDirectories.INPUT));
    final String sageVcf = String.format("%s/%s.sage.somatic.vcf.gz", VmDirectories.OUTPUT, sampleId);
    // run Sage
    final StringJoiner sageArgs = new StringJoiner(" ");
    sageArgs.add(String.format("-tumor %s", sampleId));
    sageArgs.add(String.format("-tumor_bam %s/%s", VmDirectories.INPUT, tumorCramFile));
    sageArgs.add(String.format("-reference %s", referenceId));
    sageArgs.add(String.format("-reference_bam %s/%s", VmDirectories.INPUT, refCramFile));
    sageArgs.add(String.format("-hotspots %s", resourceFiles.sageSomaticHotspots()));
    sageArgs.add(String.format("-panel_bed %s", resourceFiles.sageSomaticCodingPanel()));
    sageArgs.add(String.format("-high_confidence_bed %s", resourceFiles.giabHighConfidenceBed()));
    sageArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
    sageArgs.add(String.format("-ref_genome_version %s", resourceFiles.version().toString()));
    sageArgs.add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()));
    sageArgs.add(String.format("-out %s", sageVcf));
    sageArgs.add("-perf_warn_time 50");
    // sageArgs.add(String.format("-log_debug"));
    sageArgs.add(String.format("-threads %s", Bash.allCpus()));
    startupScript.addCommand(() -> format("java -Xmx48G -jar %s/%s %s", VmDirectories.TOOLS, SAGE_JAR, sageArgs.toString()));
    // annotate with Pave - PON and gene impacts
    final StringJoiner paveArgs = new StringJoiner(" ");
    String ponFilters = "HOTSPOT:5:5;PANEL:2:5;UNKNOWN:2:0";
    final String paveVcf = String.format("%s/%s.sage.somatic.pon.pave.vcf.gz", VmDirectories.OUTPUT, sampleId);
    paveArgs.add(String.format("-sample %s", sampleId));
    // ponFilterVcf from BCF Tools
    paveArgs.add(String.format("-vcf_file %s", sageVcf));
    paveArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
    paveArgs.add(String.format("-ref_genome_version %s", resourceFiles.version().toString()));
    paveArgs.add(String.format("-driver_gene_panel %s", resourceFiles.driverGenePanel()));
    paveArgs.add(String.format("-ensembl_data_dir %s", resourceFiles.ensemblDataCache()));
    paveArgs.add(String.format("-pon_file %s/%s", VmDirectories.INPUT, ponFile));
    paveArgs.add(String.format("-pon_filters \"%s\"", ponFilters));
    paveArgs.add(String.format("-output_vcf_file %s", paveVcf));
    String paveJar = String.format("%s/%s", VmDirectories.TOOLS, PAVE_JAR);
    startupScript.addCommand(() -> format("java -jar %s %s", paveJar, paveArgs.toString()));
    // upload output
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "sage"), executionFlags));
    return ImmutableVirtualMachineJobDefinition.builder()
            .name("sage")
            .startupCommand(startupScript)
            .performanceProfile(custom(24, 64))
            .namespacedResults(ResultsDirectory.defaultDirectory())
            .build();
}
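The getCramFileData helper and the CRAM_FULL_PATH and CRAM_FILENAME indices are defined elsewhere in the class. A sketch consistent with how the result is used above, assuming the alignment locations are returned as GoogleStorageLocation values (an assumption not confirmed by this snippet):
private static final int CRAM_FULL_PATH = 0;
private static final int CRAM_FILENAME = 1;

// Hypothetical sketch: return the bucket-qualified object path (without the gs:// prefix) and the bare filename.
private static String[] getCramFileData(final GoogleStorageLocation alignment) {
    final String fullPath = String.format("%s/%s", alignment.bucket(), alignment.path());
    final String fileName = fullPath.substring(fullPath.lastIndexOf('/') + 1);
    return new String[] { fullPath, fileName };
}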
Use of com.hartwig.pipeline.resource.ResourceFiles in project pipeline5 by hartwigmedical.
The class LilacCtpacBatch, method addSampleCommands.
private void addSampleCommands(final InputFileDescriptor runData, final BashStartupScript commands, final String runDirectory, final String sampleId) {
    final String referenceBam = String.format("%s/%s.bam", VmDirectories.INPUT, sampleId);
    // download sample BAM files
    commands.addCommand(() -> format("gsutil -m -u hmf-crunch cp gs://%s/%s/* %s", PCAWG_BAM_BUCKET, sampleId, VmDirectories.INPUT));
    // build Lilac arguments
    String sampleOutputDir = String.format("%s/%s/", VmDirectories.OUTPUT, sampleId);
    commands.addCommand(() -> format("mkdir -p %s", sampleOutputDir));
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V38);
    /*
    -sample C3N-01023_B -ref_genome /Users/charlesshale/data/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
    -ref_genome_version V38
    -reference_bam /Users/charlesshale/data/lilac/pcawg/samples/C3N-01023_B/C3N-01023_B.bam
    -resource_dir /Users/charlesshale/data/lilac/ref/
    -output_dir /Users/charlesshale/data/lilac/pcawg/samples/C3N-01023_B/
    */
    StringBuilder lilacArgs = new StringBuilder();
    lilacArgs.append(String.format(" -sample %s", sampleId));
    lilacArgs.append(String.format(" -resource_dir %s", VmDirectories.INPUT));
    lilacArgs.append(String.format(" -ref_genome %s", resourceFiles.refGenomeFile()));
    lilacArgs.append(String.format(" -ref_genome_version %s", "V38"));
    lilacArgs.append(String.format(" -reference_bam %s", referenceBam));
    lilacArgs.append(String.format(" -output_dir %s", sampleOutputDir));
    lilacArgs.append(String.format(" -threads %s", Bash.allCpus()));
    commands.addCommand(() -> format("java -Xmx%s -jar %s/%s %s", MAX_HEAP, VmDirectories.TOOLS, LILAC_JAR, lilacArgs.toString()));
    String sampleRemoteOutputDir = String.format("gs://%s/%s/", LILAC_BATCH_BUCKET, runDirectory);
    commands.addCommand(() -> format("gsutil -m cp -r %s/%s/ %s", VmDirectories.OUTPUT, sampleId, sampleRemoteOutputDir));
}
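addSampleCommands is called once per sample from the operation's execute method, which is not shown here. A hypothetical call site, assuming the batch input yields a list of sample IDs, a run directory name and the startup script to append to:
// Hypothetical call site: emit the download/run/upload commands for every sample in the batch.
for (final String sampleId : sampleIds) {
    addSampleCommands(runData, startupScript, runDirectory, sampleId);
}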
Use of com.hartwig.pipeline.resource.ResourceFiles in project pipeline5 by hartwigmedical.
The class LilacPanelBatch, method execute.
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket, final BashStartupScript commands, final RuntimeFiles executionFlags) {
    // Inputs: SampleId,ExpectedAlleles
    final InputFileDescriptor runData = inputs.get();
    final String batchInputs = runData.inputValue();
    final String[] batchItems = batchInputs.split(",");
    String sampleId = batchItems[0];
    // download pilot Lilac jar
    addLilacDownloadCommands(commands);
    String tumorBam = String.format("%s.non_umi_dedup.bam", sampleId);
    commands.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s* %s", PANEL_BAM_BUCKET, tumorBam, VmDirectories.INPUT));
    // build Lilac arguments
    // String sampleOutputDir = String.format("%s/%s/", VmDirectories.OUTPUT, sampleId);
    // commands.addCommand(() -> format("mkdir -p %s", sampleOutputDir));
    // String runDirectory = "run_panel";
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V38);
    StringJoiner lilacArgs = new StringJoiner(" ");
    lilacArgs.add(String.format("-sample %s", sampleId));
    lilacArgs.add(String.format("-reference_bam %s/%s", VmDirectories.INPUT, tumorBam));
    lilacArgs.add(String.format("-resource_dir %s/", VmDirectories.INPUT));
    lilacArgs.add(String.format("-ref_genome %s", resourceFiles.refGenomeFile()));
    lilacArgs.add(String.format("-ref_genome_version %s", resourceFiles.version().toString()));
    lilacArgs.add(String.format("-output_dir %s", VmDirectories.OUTPUT));
    lilacArgs.add("-write_all_files");
    lilacArgs.add(String.format("-threads %s", Bash.allCpus()));
    String lilacJar = String.format("%s/%s", VmDirectories.TOOLS, LILAC_JAR);
    // String lilacJar = String.format("%s/lilac/%s/lilac.jar", VmDirectories.TOOLS, Versions.LILAC);
    commands.addCommand(() -> format("java -Xmx%s -jar %s %s", MAX_HEAP, lilacJar, lilacArgs.toString()));
    commands.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "lilac"), executionFlags));
    return ImmutableVirtualMachineJobDefinition.builder()
            .name("lilac")
            .startupCommand(commands)
            .performanceProfile(custom(12, 32))
            .namespacedResults(ResultsDirectory.defaultDirectory())
            .build();
}
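The addLilacDownloadCommands helper is defined elsewhere in the class. A minimal sketch of what it likely does, assuming the pilot jar sits under the same BATCH_TOOLS_BUCKET layout used by the SAGE operations above and that a LILAC_DIR constant names its folder (both assumptions):
// Hypothetical sketch only: copy the pilot Lilac jar onto the VM before it is invoked.
private static void addLilacDownloadCommands(final BashStartupScript commands) {
    commands.addCommand(() -> format("gsutil -u hmf-crunch cp %s/%s/%s %s", BATCH_TOOLS_BUCKET, LILAC_DIR, LILAC_JAR, VmDirectories.TOOLS));
}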