Use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.
The class BwaAligner, method run:
public AlignmentOutput run(final SingleSampleRunMetadata metadata) throws Exception {
    StageTrace trace = new StageTrace(NAMESPACE, metadata.sampleName(), StageTrace.ExecutorType.COMPUTE_ENGINE).start();
    RuntimeBucket rootBucket = RuntimeBucket.from(storage, NAMESPACE, metadata, arguments, labels);
    Sample sample = sampleSource.sample(metadata);
    if (sample.bam().isPresent()) {
        String noPrefix = sample.bam().orElseThrow().replace("gs://", "");
        int firstSlash = noPrefix.indexOf("/");
        String bucket = noPrefix.substring(0, firstSlash);
        String path = noPrefix.substring(firstSlash + 1);
        return AlignmentOutput.builder()
                .sample(metadata.sampleName())
                .status(PipelineStatus.PROVIDED)
                .maybeAlignments(GoogleStorageLocation.of(bucket, path))
                .build();
    }
    final ResourceFiles resourceFiles = buildResourceFiles(arguments);
    sampleUpload.run(sample, rootBucket);
    List<Future<PipelineStatus>> futures = new ArrayList<>();
    List<GoogleStorageLocation> perLaneBams = new ArrayList<>();
    List<ReportComponent> laneLogComponents = new ArrayList<>();
    List<GoogleStorageLocation> laneFailedLogs = new ArrayList<>();
    for (Lane lane : sample.lanes()) {
        RuntimeBucket laneBucket = RuntimeBucket.from(storage, laneNamespace(lane), metadata, arguments, labels);
        BashStartupScript bash = BashStartupScript.of(laneBucket.name());
        InputDownload first =
                new InputDownload(GoogleStorageLocation.of(rootBucket.name(), fastQFileName(sample.name(), lane.firstOfPairPath())));
        InputDownload second =
                new InputDownload(GoogleStorageLocation.of(rootBucket.name(), fastQFileName(sample.name(), lane.secondOfPairPath())));
        bash.addCommand(first).addCommand(second);
        bash.addCommands(OverrideReferenceGenomeCommand.overrides(arguments));
        SubStageInputOutput alignment = new LaneAlignment(arguments.sbpApiRunId().isPresent(),
                resourceFiles.refGenomeFile(),
                first.getLocalTargetPath(),
                second.getLocalTargetPath(),
                metadata.sampleName(),
                lane).apply(SubStageInputOutput.empty(metadata.sampleName()));
        perLaneBams.add(GoogleStorageLocation.of(laneBucket.name(), resultsDirectory.path(alignment.outputFile().fileName())));
        bash.addCommands(alignment.bash())
                .addCommand(new OutputUpload(GoogleStorageLocation.of(laneBucket.name(), resultsDirectory.path()),
                        RuntimeFiles.typical()));
        futures.add(executorService.submit(() -> runWithRetries(metadata,
                laneBucket,
                VirtualMachineJobDefinition.alignment(laneId(lane).toLowerCase(), bash, resultsDirectory))));
        laneLogComponents.add(new RunLogComponent(laneBucket, laneNamespace(lane), Folder.from(metadata), resultsDirectory));
        laneFailedLogs.add(GoogleStorageLocation.of(laneBucket.name(), RunLogComponent.LOG_FILE));
    }
    AlignmentOutput output;
    if (lanesSuccessfullyComplete(futures)) {
        List<InputDownload> laneBams = perLaneBams.stream().map(InputDownload::new).collect(Collectors.toList());
        BashStartupScript mergeMarkdupsBash = BashStartupScript.of(rootBucket.name());
        laneBams.forEach(mergeMarkdupsBash::addCommand);
        SubStageInputOutput merged = new MergeMarkDups(laneBams.stream()
                .map(InputDownload::getLocalTargetPath)
                .filter(path -> path.endsWith("bam"))
                .collect(Collectors.toList())).apply(SubStageInputOutput.empty(metadata.sampleName()));
        mergeMarkdupsBash.addCommands(merged.bash());
        mergeMarkdupsBash.addCommand(new OutputUpload(GoogleStorageLocation.of(rootBucket.name(), resultsDirectory.path()),
                RuntimeFiles.typical()));
        PipelineStatus status =
                runWithRetries(metadata, rootBucket, VirtualMachineJobDefinition.mergeMarkdups(mergeMarkdupsBash, resultsDirectory));
        ImmutableAlignmentOutput.Builder outputBuilder = AlignmentOutput.builder()
                .sample(metadata.sampleName())
                .status(status)
                .maybeAlignments(GoogleStorageLocation.of(rootBucket.name(), resultsDirectory.path(merged.outputFile().fileName())))
                .addAllReportComponents(laneLogComponents)
                .addAllFailedLogLocations(laneFailedLogs)
                .addFailedLogLocations(GoogleStorageLocation.of(rootBucket.name(), RunLogComponent.LOG_FILE))
                .addReportComponents(new RunLogComponent(rootBucket, Aligner.NAMESPACE, Folder.from(metadata), resultsDirectory));
        if (!arguments.outputCram()) {
            outputBuilder.addReportComponents(new SingleFileComponent(rootBucket,
                            Aligner.NAMESPACE,
                            Folder.from(metadata),
                            bam(metadata.sampleName()),
                            bam(metadata.sampleName()),
                            resultsDirectory),
                    new SingleFileComponent(rootBucket,
                            Aligner.NAMESPACE,
                            Folder.from(metadata),
                            bai(bam(metadata.sampleName())),
                            bai(bam(metadata.sampleName())),
                            resultsDirectory))
                    .addDatatypes(new AddDatatype(DataType.ALIGNED_READS,
                                    metadata.barcode(),
                                    new ArchivePath(Folder.from(metadata), BwaAligner.NAMESPACE, bam(metadata.sampleName()))),
                            new AddDatatype(DataType.ALIGNED_READS_INDEX,
                                    metadata.barcode(),
                                    new ArchivePath(Folder.from(metadata), BwaAligner.NAMESPACE, bai(metadata.sampleName()))));
        }
        output = outputBuilder.build();
    } else {
        output = AlignmentOutput.builder().sample(metadata.sampleName()).status(PipelineStatus.FAILED).build();
    }
    trace.stop();
    executorService.shutdown();
    return output;
}
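The PROVIDED branch above hand-parses a gs:// URL into its bucket and path. A minimal standalone sketch of that same split (a hypothetical helper for illustration, not part of pipeline5):

// Hypothetical helper: "gs://my-bucket/sample/sample.bam"
// -> bucket "my-bucket", path "sample/sample.bam".
static GoogleStorageLocation parseGsUrl(final String gsUrl) {
    String noPrefix = gsUrl.replace("gs://", "");
    int firstSlash = noPrefix.indexOf('/');
    return GoogleStorageLocation.of(noPrefix.substring(0, firstSlash), noPrefix.substring(firstSlash + 1));
}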
Use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.
The class CobaltRerun, method execute:
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket,
        final BashStartupScript commands, final RuntimeFiles executionFlags) {
    // Inputs
    final String set = inputs.get("set").inputValue();
    final String tumorSampleName = inputs.get("tumor_sample").inputValue();
    final String referenceSampleName = inputs.get("ref_sample").inputValue();
    final InputFileDescriptor remoteTumorFile = inputs.get("tumor_cram");
    final InputFileDescriptor remoteReferenceFile = inputs.get("ref_cram");
    final InputFileDescriptor remoteTumorIndex = remoteTumorFile.index();
    final InputFileDescriptor remoteReferenceIndex = remoteReferenceFile.index();
    final String localTumorFile = localFilename(remoteTumorFile);
    final String localReferenceFile = localFilename(remoteReferenceFile);
    // Download tumor
    commands.addCommand(() -> remoteTumorFile.toCommandForm(localTumorFile));
    commands.addCommand(() -> remoteTumorIndex.toCommandForm(localFilename(remoteTumorIndex)));
    // Download normal
    commands.addCommand(() -> remoteReferenceFile.toCommandForm(localReferenceFile));
    commands.addCommand(() -> remoteReferenceIndex.toCommandForm(localFilename(remoteReferenceIndex)));
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V37);
    commands.addCommand(() -> CobaltCommandBuilder.newBuilder(resourceFiles)
            .reference(referenceSampleName, localReferenceFile)
            .tumor(tumorSampleName, localTumorFile)
            .build()
            .asBash());
    // Store output
    final GoogleStorageLocation archiveStorageLocation = cobaltArchiveDirectory(set);
    commands.addCommand(new CopyLogToOutput(executionFlags.log(), "run.log"));
    commands.addCommand(new OutputUpload(archiveStorageLocation));
    return VirtualMachineJobDefinition.amber(commands, ResultsDirectory.defaultDirectory());
}
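localFilename is not shown in this excerpt. A plausible sketch, assuming it simply maps the remote object's file name into the VM's input directory (the same pattern FlagstatGenerator writes inline below); this is an assumption, not the verified pipeline5 source:

// Assumed implementation: keep only the remote file name and place it
// under the VM's input directory.
private static String localFilename(final InputFileDescriptor remote) {
    return String.format("%s/%s", VmDirectories.INPUT, new java.io.File(remote.inputValue()).getName());
}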
Use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.
The class CobaltTumorOnlyRerun, method execute:
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket,
        final BashStartupScript commands, final RuntimeFiles executionFlags) {
    // Inputs
    final String set = inputs.get("set").inputValue();
    final String tumorSampleName = inputs.get("tumor_sample").inputValue();
    final InputFileDescriptor remoteTumorFile = inputs.get("tumor_cram");
    final InputFileDescriptor remoteTumorIndex = remoteTumorFile.index();
    final String localTumorFile = localFilename(remoteTumorFile);
    // Download tumor
    commands.addCommand(() -> remoteTumorFile.toCommandForm(localTumorFile));
    commands.addCommand(() -> remoteTumorIndex.toCommandForm(localFilename(remoteTumorIndex)));
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V37);
    commands.addCommand(() -> CobaltCommandBuilder.newBuilder(resourceFiles)
            .tumor(tumorSampleName, localTumorFile)
            .build()
            .asBash());
    // Store output
    final GoogleStorageLocation archiveStorageLocation = cobaltArchiveDirectory(set);
    commands.addCommand(new CopyLogToOutput(executionFlags.log(), "run.log"));
    commands.addCommand(new OutputUpload(archiveStorageLocation));
    return VirtualMachineJobDefinition.amber(commands, ResultsDirectory.defaultDirectory());
}
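CobaltTumorOnlyRerun differs from CobaltRerun only in what it omits: there is no reference CRAM to download and no .reference(...) call on the CobaltCommandBuilder. Note that both rerun classes return VirtualMachineJobDefinition.amber(commands, ...), reusing the Amber VM job definition for what is a COBALT job.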
Use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.
The class FlagstatGenerator, method execute:
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket bucket,
        final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    InputFileDescriptor input = inputs.get();
    InputFileDescriptor existingFlagstat =
            InputFileDescriptor.from(input).withInputValue(input.inputValue().replaceAll("\\.bam$", ".flagstat"));
    String localCopyOfOriginalFlagstat = format("%s/%s", VmDirectories.OUTPUT, new File(existingFlagstat.inputValue()).getName());
    String outputFile = VmDirectories.outputFile(new File(input.inputValue()).getName().replaceAll("\\.bam$", ".batch.flagstat"));
    String localInput = format("%s/%s", VmDirectories.INPUT, new File(input.inputValue()).getName());
    startupScript.addCommand(() -> input.toCommandForm(localInput));
    startupScript.addCommand(() -> existingFlagstat.toCommandForm(localCopyOfOriginalFlagstat));
    startupScript.addCommand(new PipeCommands(
            new VersionedToolCommand("sambamba", "sambamba", Versions.SAMBAMBA, "flagstat", "-t", Bash.allCpus(), localInput),
            () -> "tee " + outputFile));
    startupScript.addCommand(() -> format("diff %s %s", localCopyOfOriginalFlagstat, outputFile));
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(bucket.name(), "flagstat"), executionFlags));
    return VirtualMachineJobDefinition.builder()
            .name("flagstat")
            .startupCommand(startupScript)
            .namespacedResults(ResultsDirectory.defaultDirectory())
            .performanceProfile(VirtualMachinePerformanceProfile.custom(4, 6))
            .build();
}
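The diff at the end is the point of this batch operation: sambamba regenerates flagstat output for the BAM, tee writes it alongside the downloaded original, and diff exits non-zero when the two files differ. Assuming the generated startup script aborts on a failing command (an assumption about BashStartupScript, not confirmed by this excerpt), a mismatch fails the VM job and surfaces the discrepancy in its logs.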
Use of com.hartwig.pipeline.execution.vm.OutputUpload in project pipeline5 by hartwigmedical.
The class GridssBackport, method execute:
@Override
public VirtualMachineJobDefinition execute(final InputBundle inputs, final RuntimeBucket runtimeBucket,
        final BashStartupScript startupScript, final RuntimeFiles executionFlags) {
    final ResourceFiles resourceFiles = ResourceFilesFactory.buildResourceFiles(RefGenomeVersion.V37);
    final InputFileDescriptor template = inputs.get("set");
    final String set = inputs.get("set").inputValue();
    final String sample = inputs.get("tumor_sample").inputValue();
    final String bamFile = String.format("gs://hmf-gridss/assembly/%s/%s.assembly.bam.sv.bam", set, sample);
    final String vcfFile = String.format("gs://hmf-gridss/original/%s/%s.gridss.unfiltered.vcf.gz", set, sample);
    final InputFileDescriptor inputBam = ImmutableInputFileDescriptor.builder().from(template).inputValue(bamFile).build();
    final InputFileDescriptor inputBamIndex = inputBam.index();
    final InputFileDescriptor inputVcf = ImmutableInputFileDescriptor.builder().from(template).inputValue(vcfFile).build();
    final InputFileDescriptor inputVcfIndex = inputVcf.index();
    // 1. Set up paths
    startupScript.addCommand(new ExportPathCommand(new BwaCommand()));
    startupScript.addCommand(new ExportPathCommand(new SamtoolsCommand()));
    // 2. Download input files
    startupScript.addCommand(inputBam::copyToLocalDestinationCommand);
    startupScript.addCommand(inputBamIndex::copyToLocalDestinationCommand);
    startupScript.addCommand(inputVcf::copyToLocalDestinationCommand);
    startupScript.addCommand(inputVcfIndex::copyToLocalDestinationCommand);
    // 3. Get sample names
    startupScript.addCommand(() -> format("sampleNames=$(zgrep -m1 CHROM %s)", inputVcf.localDestination()));
    startupScript.addCommand(() -> "sample0=$(echo $sampleNames | cut -d \" \" -f 10)");
    startupScript.addCommand(() -> "sample1=$(echo $sampleNames | cut -d \" \" -f 11)");
    // 4. Create empty bams (and their working directories)
    final String emptyBam1 = String.format("%s/${%s}", VmDirectories.INPUT, "sample0");
    final String emptyBam1Working = workingDir(emptyBam1) + ".sv.bam";
    final String emptyBam2 = String.format("%s/${%s}", VmDirectories.INPUT, "sample1");
    final String emptyBam2Working = workingDir(emptyBam2) + ".sv.bam";
    startupScript.addCommand(() -> format("samtools view -H %s | samtools view -o %s", inputBam.localDestination(), emptyBam1));
    startupScript.addCommand(() -> format("samtools view -H %s | samtools view -o %s", inputBam.localDestination(), emptyBam2));
    startupScript.addCommand(() -> format("mkdir -p %s", dirname(emptyBam1Working)));
    startupScript.addCommand(() -> format("mkdir -p %s", dirname(emptyBam2Working)));
    startupScript.addCommand(() -> format("cp %s %s", emptyBam1, emptyBam1Working));
    startupScript.addCommand(() -> format("cp %s %s", emptyBam2, emptyBam2Working));
    // 5. SoftClipsToSplitReads
    final String newAssemblyBam = workingDir(inputBam.localDestination());
    startupScript.addCommand(() -> format("mkdir -p %s", dirname(newAssemblyBam)));
    startupScript.addCommand(new SoftClipsToSplitReads(inputBam.localDestination(), resourceFiles.refGenomeFile(), newAssemblyBam));
    // 6. Allocate Evidence
    final OutputFile newRawVcf = OutputFile.of(sample, "gridss_" + Versions.GRIDSS.replace(".", "_") + ".raw", FileTypes.GZIPPED_VCF);
    startupScript.addCommand(new AllocateEvidence(emptyBam1,
            emptyBam2,
            newAssemblyBam,
            inputVcf.localDestination(),
            newRawVcf.path(),
            resourceFiles.refGenomeFile(),
            resourceFiles.gridssPropertiesFile()));
    // 7. Gridss Annotation
    final SubStageInputOutput annotation =
            new GridssAnnotation(resourceFiles, true).apply(SubStageInputOutput.of(sample, newRawVcf, Collections.emptyList()));
    startupScript.addCommands(annotation.bash());
    // 8. Archive targeted output
    final OutputFile unfilteredVcf = annotation.outputFile();
    final OutputFile unfilteredVcfIndex = unfilteredVcf.index(".tbi");
    final GoogleStorageLocation unfilteredVcfRemoteLocation = remoteUnfilteredVcfArchivePath(set, sample);
    final GoogleStorageLocation unfilteredVcfIndexRemoteLocation = index(unfilteredVcfRemoteLocation, ".tbi");
    startupScript.addCommand(() -> unfilteredVcf.copyToRemoteLocation(unfilteredVcfRemoteLocation));
    startupScript.addCommand(() -> unfilteredVcfIndex.copyToRemoteLocation(unfilteredVcfIndexRemoteLocation));
    // 9. Upload all output
    startupScript.addCommand(new OutputUpload(GoogleStorageLocation.of(runtimeBucket.name(), "gridss"), executionFlags));
    return VirtualMachineJobDefinition.structuralCalling(startupScript, ResultsDirectory.defaultDirectory());
}
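Step 3 leans on the VCF #CHROM header line: zgrep pulls the first line containing CHROM, the unquoted $sampleNames expansion collapses its tabs to single spaces, and cut fields 10 and 11 are then the first two sample columns. The same extraction in Java, as a self-contained illustration (a hypothetical helper, not pipeline5 code):

import java.util.Arrays;
import java.util.List;

// A VCF #CHROM header line has nine fixed columns (#CHROM, POS, ID, REF,
// ALT, QUAL, FILTER, INFO, FORMAT) followed by one column per sample.
static List<String> sampleNames(final String chromHeaderLine) {
    String[] fields = chromHeaderLine.trim().split("\\s+");
    return Arrays.asList(fields).subList(9, fields.length);
}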