use of htsjdk.samtools.MergingSamRecordIterator in project gatk by broadinstitute.
the class FixMateInformation method doWork.
@Override
protected Object doWork() {
    // Open up the input
    boolean allQueryNameSorted = true;
    final List<SamReader> readers = new ArrayList<>();
    for (final File f : INPUT) {
        IOUtil.assertFileIsReadable(f);
        final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
        readers.add(reader);
        if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.queryname)
            allQueryNameSorted = false;
    }
    // Decide where to write the fixed file: into the specified OUTPUT file,
    // or into a temporary file that will overwrite the INPUT file eventually
    if (OUTPUT != null)
        OUTPUT = OUTPUT.getAbsoluteFile();
    final boolean differentOutputSpecified = OUTPUT != null;
    if (differentOutputSpecified) {
        IOUtil.assertFileIsWritable(OUTPUT);
    } else if (INPUT.size() != 1) {
        throw new UserException("Must specify either an explicit OUTPUT file or a single INPUT file to be overwritten.");
    } else {
        final File soleInput = INPUT.get(0).getAbsoluteFile();
        final File dir = soleInput.getParentFile().getAbsoluteFile();
        try {
            IOUtil.assertFileIsWritable(soleInput);
            IOUtil.assertDirectoryIsWritable(dir);
            OUTPUT = File.createTempFile(soleInput.getName() + ".being_fixed.", BamFileIoUtils.BAM_FILE_EXTENSION, dir);
        } catch (final IOException ioe) {
            throw new RuntimeIOException("Could not create tmp file in " + dir.getAbsolutePath(), ioe);
        }
    }
    // Get the input records merged and sorted by query name as needed
    final PeekableIterator<SAMRecord> iterator;
    final SAMFileHeader header;
    {
        // Deal with merging if necessary
        final Iterator<SAMRecord> tmp;
        if (INPUT.size() > 1) {
            final List<SAMFileHeader> headers = new ArrayList<>(readers.size());
            for (final SamReader reader : readers) {
                headers.add(reader.getFileHeader());
            }
            final SAMFileHeader.SortOrder sortOrder = (allQueryNameSorted ? SAMFileHeader.SortOrder.queryname : SAMFileHeader.SortOrder.unsorted);
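            // false = keep each input's sequence dictionary rather than merging them;
            // the merging iterator below likewise gets false, trusting the declared sort orders.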
            final SamFileHeaderMerger merger = new SamFileHeaderMerger(sortOrder, headers, false);
            tmp = new MergingSamRecordIterator(merger, readers, false);
            header = merger.getMergedHeader();
        } else {
            tmp = readers.get(0).iterator();
            header = readers.get(0).getFileHeader();
        }
        // And now deal with re-sorting if necessary
        if (ASSUME_SORTED || allQueryNameSorted) {
            iterator = new SamPairUtil.SetMateInfoIterator(new PeekableIterator<>(tmp), ADD_MATE_CIGAR);
        } else {
            logger.info("Sorting input into queryname order.");
            final SortingCollection<SAMRecord> sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(header), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM, TMP_DIR);
            while (tmp.hasNext()) {
                sorter.add(tmp.next());
            }
            iterator = new SamPairUtil.SetMateInfoIterator(new PeekableIterator<SAMRecord>(sorter.iterator()) {
                @Override
                public void close() {
                    super.close();
                    sorter.cleanup();
                }
            }, ADD_MATE_CIGAR);
            logger.info("Sorting by queryname complete.");
        }
        // Deal with the various sorting complications
        final SAMFileHeader.SortOrder outputSortOrder = SORT_ORDER == null ? readers.get(0).getFileHeader().getSortOrder() : SORT_ORDER;
        logger.info("Output will be sorted by " + outputSortOrder);
        header.setSortOrder(outputSortOrder);
    }
    if (CREATE_INDEX && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        throw new UserException("Can't CREATE_INDEX unless sort order is coordinate");
    }
    try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, header, header.getSortOrder() == SAMFileHeader.SortOrder.queryname)) {
        logger.info("Traversing query name sorted records and fixing up mate pair information.");
        final ProgressLogger progress = new ProgressLogger(logger);
        while (iterator.hasNext()) {
            final SAMRecord record = iterator.next();
            out.addAlignment(record);
            progress.record(record);
        }
        iterator.close();
        if (header.getSortOrder() == SAMFileHeader.SortOrder.queryname) {
            logger.info("Closing output file.");
        } else {
            logger.info("Finished processing reads; re-sorting output file.");
        }
    }
    // TODO throw appropriate exceptions instead of writing to log.error and returning
    if (!differentOutputSpecified) {
        logger.info("Replacing input file with fixed file.");
        final File soleInput = INPUT.get(0).getAbsoluteFile();
        final File old = new File(soleInput.getParentFile(), soleInput.getName() + ".old");
        if (!old.exists() && soleInput.renameTo(old)) {
            if (OUTPUT.renameTo(soleInput)) {
                if (!old.delete()) {
                    logger.warn("Could not delete old file: " + old.getAbsolutePath());
                    return null;
                }
                if (CREATE_INDEX) {
                    final File newIndex = new File(OUTPUT.getParent(), OUTPUT.getName().substring(0, OUTPUT.getName().length() - 4) + ".bai");
                    final File oldIndex = new File(soleInput.getParent(), soleInput.getName().substring(0, soleInput.getName().length() - 4) + ".bai");
                    if (!newIndex.renameTo(oldIndex)) {
                        logger.warn("Could not overwrite index file: " + oldIndex.getAbsolutePath());
                    }
                }
            } else {
                logger.error("Could not move new file to " + soleInput.getAbsolutePath());
                logger.error("Input file preserved as: " + old.getAbsolutePath());
                logger.error("New file preserved as: " + OUTPUT.getAbsolutePath());
                return null;
            }
        } else {
            logger.error("Could not move input file out of the way: " + soleInput.getAbsolutePath());
            if (!OUTPUT.delete()) {
                logger.error("Could not delete temporary file: " + OUTPUT.getAbsolutePath());
            }
            return null;
        }
    }
    CloserUtil.close(readers);
    return null;
}
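The pattern above reduces to three steps: merge the headers, merge the records, then rewrite mate fields while streaming. Below is a minimal sketch of that pipeline, assuming two queryname-sorted inputs; the standalone class and the file names (in1.bam, in2.bam, fixed.bam) are illustrative, not part of GATK.

import htsjdk.samtools.MergingSamRecordIterator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.SamPairUtil;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.PeekableIterator;
import java.io.File;
import java.util.ArrayList;
import java.util.List;

public class FixMatesSketch {
    public static void main(final String[] args) {
        // Hypothetical inputs; both are assumed to be queryname-sorted BAMs.
        final List<SamReader> readers = new ArrayList<>();
        final List<SAMFileHeader> headers = new ArrayList<>();
        for (final String path : new String[] { "in1.bam", "in2.bam" }) {
            final SamReader r = SamReaderFactory.makeDefault().open(new File(path));
            readers.add(r);
            headers.add(r.getFileHeader());
        }
        // Merge headers; false = do not merge sequence dictionaries.
        final SamFileHeaderMerger merger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.queryname, headers, false);
        // false = rely on each input's declared sort order.
        final MergingSamRecordIterator merged = new MergingSamRecordIterator(merger, readers, false);
        // Rewrites mate position/flags (and the mate CIGAR, second arg true) per query-name group.
        final SamPairUtil.SetMateInfoIterator fixed =
            new SamPairUtil.SetMateInfoIterator(new PeekableIterator<>(merged), true);
        try (final SAMFileWriter out = new SAMFileWriterFactory()
                .makeBAMWriter(merger.getMergedHeader(), true, new File("fixed.bam"))) {
            while (fixed.hasNext()) {
                out.addAlignment(fixed.next());
            }
        }
        fixed.close();
        CloserUtil.close(readers);
    }
}

The real tool layers onto this the temp-file/in-place-replacement logic and a SortingCollection fallback for inputs that are not already queryname-sorted.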
use of htsjdk.samtools.MergingSamRecordIterator in project jvarkit by lindenb.
the class Bam2Wig method doWork.
@Override
public int doWork(final List<String> args) {
    if (this.win_shift <= 0) {
        LOG.error("window shift <= 0");
        return -1;
    }
    if (this.window_span <= 0) {
        LOG.error("window size <= 0");
        return -1;
    }
    final Interval interval;
    PrintWriter pw = null;
    CloseableIterator<SAMRecord> samRecordIterator = null;
    final List<SamReader> samReaders = new ArrayList<>();
    final List<CloseableIterator<SAMRecord>> merginIterators = new ArrayList<>();
    try {
        final SamReaderFactory srf = SamReaderFactory.makeDefault().validationStringency(htsjdk.samtools.ValidationStringency.LENIENT);
        if (args.isEmpty()) {
            if (!StringUtil.isBlank(region_str)) {
                LOG.error("cannot specify a region when reading from stdin");
                return -1;
            }
            interval = null;
            samReaders.add(srf.open(SamInputResource.of(stdin())));
            samRecordIterator = samReaders.get(0).iterator();
        } else if (args.size() == 1 && !args.get(0).endsWith(".list")) {
            samReaders.add(srf.open(SamInputResource.of(new File(args.get(0)))));
            if (StringUtil.isBlank(this.region_str)) {
                samRecordIterator = samReaders.get(0).iterator();
                interval = null;
            } else {
                interval = new IntervalParser(samReaders.get(0).getFileHeader().getSequenceDictionary()).setContigNameIsWholeContig(true).parse(region_str);
                if (interval == null) {
                    LOG.error("Cannot parse interval " + this.region_str);
                    return -1;
                }
                LOG.debug("interval " + interval);
                samRecordIterator = samReaders.get(0).query(interval.getContig(), interval.getStart(), interval.getEnd(), false);
            }
        } else {
            final List<File> samFiles;
            if (args.size() == 1 && args.get(0).endsWith(".list")) {
                samFiles = IOUtils.unrollFile(new File(args.get(0)));
            } else {
                samFiles = args.stream().map(S -> new File(S)).collect(Collectors.toList());
            }
            if (samFiles.isEmpty()) {
                LOG.error("No input SAM file");
                return -1;
            }
            final SAMSequenceDictionary dict0 = SAMSequenceDictionaryExtractor.extractDictionary(samFiles.get(0));
            if (dict0 == null)
                throw new JvarkitException.DictionaryMissing(samFiles.get(0).getPath());
            samFiles.stream().forEach(F -> {
                final SAMSequenceDictionary dicti = SAMSequenceDictionaryExtractor.extractDictionary(F);
                if (dicti == null)
                    throw new JvarkitException.DictionaryMissing(F.getPath());
                if (!SequenceUtil.areSequenceDictionariesEqual(dicti, dict0)) {
                    throw new JvarkitException.DictionariesAreNotTheSame(dict0, dicti);
                }
            });
            for (final File bamFile : samFiles) {
                LOG.info("opening " + bamFile);
                samReaders.add(srf.open(bamFile));
            }
            final SamFileHeaderMerger mergedheader = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, samReaders.stream().map(SR -> SR.getFileHeader()).collect(Collectors.toList()), false);
            final Map<SamReader, CloseableIterator<SAMRecord>> reader2iter = new HashMap<>();
            if (StringUtil.isBlank(this.region_str)) {
                for (final SamReader sr : samReaders) {
                    reader2iter.put(sr, sr.iterator());
                }
                interval = null;
            } else {
                interval = new IntervalParser(dict0).setContigNameIsWholeContig(true).parse(region_str);
                if (interval == null) {
                    LOG.error("Cannot parse interval " + this.region_str);
                    return -1;
                }
                LOG.info("interval: " + interval);
                for (final SamReader sr : samReaders) {
                    reader2iter.put(sr, sr.query(interval.getContig(), interval.getStart(), interval.getEnd(), false));
                }
            }
            merginIterators.addAll(reader2iter.values());
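            // The Map-based constructor lets each reader supply its own (possibly
            // region-restricted) iterator; true = inputs are presorted by coordinate.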
            samRecordIterator = new MergingSamRecordIterator(mergedheader, reader2iter, true);
        }
        for (final SamReader sr : samReaders) {
            if (sr.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
                LOG.error("one of the BAM inputs is not sorted by coordinate");
                return -1;
            }
        }
        pw = openFileOrStdoutAsPrintWriter(this.outputFile);
        run(pw, samRecordIterator, samReaders.get(0).getFileHeader().getSequenceDictionary(), interval);
        samRecordIterator.close();
        samRecordIterator = null;
        CloserUtil.close(samReaders);
        samReaders.clear();
        pw.flush();
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(merginIterators);
        CloserUtil.close(samRecordIterator);
        CloserUtil.close(samReaders);
        CloserUtil.close(pw);
        pw = null;
    }
}
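The essential step in the multi-file branch is the Map-based MergingSamRecordIterator constructor, which lets every reader contribute a region-restricted iterator. Here is a trimmed sketch of just that mechanic; the file names (s1.bam, s2.bam) and the region chr1:1,000,000-2,000,000 are hypothetical, and indexed, coordinate-sorted BAMs are assumed.

import htsjdk.samtools.MergingSamRecordIterator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class RegionMergeSketch {
    public static void main(final String[] args) {
        final List<SamReader> readers = new ArrayList<>();
        for (final String path : new String[] { "s1.bam", "s2.bam" }) { // hypothetical files
            readers.add(SamReaderFactory.makeDefault().open(new File(path)));
        }
        final SamFileHeaderMerger merger = new SamFileHeaderMerger(
                SAMFileHeader.SortOrder.coordinate,
                readers.stream().map(SamReader::getFileHeader).collect(Collectors.toList()),
                false);
        // One region-restricted iterator per reader; query() needs a BAM index.
        final Map<SamReader, CloseableIterator<SAMRecord>> reader2iter = new HashMap<>();
        for (final SamReader sr : readers) {
            reader2iter.put(sr, sr.query("chr1", 1000000, 2000000, false)); // hypothetical region
        }
        // true = the per-reader iterators are already coordinate-sorted.
        try (final MergingSamRecordIterator merged = new MergingSamRecordIterator(merger, reader2iter, true)) {
            long n = 0;
            while (merged.hasNext()) {
                merged.next();
                ++n;
            }
            System.out.println("records in region: " + n);
        }
        CloserUtil.close(readers);
    }
}

This mirrors Bam2Wig's use of the constructor; the tool additionally verifies that every input shares one sequence dictionary before merging.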
use of htsjdk.samtools.MergingSamRecordIterator in project jvarkit by lindenb.
the class SamFindClippedRegions method doWork.
/*private static boolean closeTo(int pos1, int pos2, int max) {
    return Math.abs(pos2 - pos1) <= max;
}*/
/*private static boolean same(char c1, char c2) {
    if (c1 == 'N' || c2 == 'N') return false;
    return Character.toUpperCase(c1) == Character.toUpperCase(c2);
}*/
@Override
public int doWork(List<String> args) {
    int readLength = 150;
    if (args.isEmpty()) {
        LOG.error("illegal.number.of.arguments");
        return -1;
    }
    List<Input> inputs = new ArrayList<Input>();
    VariantContextWriter w = null;
    // SAMFileWriter w=null;
    try {
        SAMSequenceDictionary dict = null;
        /* create input, collect sample names */
        Map<String, Input> sample2input = new HashMap<String, Input>();
        for (final String filename : args) {
            Input input = new Input(new File(filename));
            // input.index=inputs.size();
            inputs.add(input);
            if (sample2input.containsKey(input.sampleName)) {
                LOG.error("Duplicate sample " + input.sampleName + " in " + input.bamFile + " and " + sample2input.get(input.sampleName).bamFile);
                return -1;
            }
            sample2input.put(input.sampleName, input);
            if (dict == null) {
                dict = input.header.getSequenceDictionary();
            } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, input.header.getSequenceDictionary())) {
                LOG.error("Found more than one distinct sequence dictionary");
                return -1;
            }
        }
        LOG.info("Sample N= " + sample2input.size());
        /* create merged iterator */
        List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(sample2input.size());
        for (Input input : inputs) headers.add(input.header);
        SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SortOrder.coordinate, headers, true);
        List<SamReader> readers = new ArrayList<SamReader>(sample2input.size());
        for (Input input : inputs) readers.add(input.samFileReaderScan);
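        // Merge every per-sample reader into one coordinate-sorted stream;
        // true = the inputs are assumed to be coordinate-sorted already.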
        MergingSamRecordIterator merginIter = new MergingSamRecordIterator(headerMerger, readers, true);
        Allele reference_allele = Allele.create("N", true);
        Allele[] alternate_alleles = new Allele[] { Allele.create("<CLIP5>", false), Allele.create("<CLIP3>", false) };
        Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
        for (Allele alt : alternate_alleles) {
            vcfHeaderLines.add(new VCFSimpleHeaderLine("<ID=" + alt.getDisplayString() + ",Description=\"StructVar\">", VCFHeaderVersion.VCF4_1, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")));
        }
        vcfHeaderLines.add(new VCFInfoHeaderLine("COUNT_SAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples with depth >= " + this.min_depth));
        vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
        vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
        vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
        vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
        for (int side = 0; side < 2; ++side) {
            vcfHeaderLines.add(new VCFFormatHeaderLine("CN" + (side == 0 ? 5 : 3), 1, VCFHeaderLineType.Integer, "count clipped in " + (side == 0 ? 5 : 3) + "'"));
        }
        if (dict != null) {
            vcfHeaderLines.addAll(VCFUtils.samSequenceDictToVCFContigHeaderLine(dict));
        }
        VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, sample2input.keySet());
        w = VCFUtils.createVariantContextWriterToStdout();
        w.writeHeader(vcfHeader);
        final IntervalTreeMap<Boolean> intervals = new IntervalTreeMap<>();
        // w=swf.make(header, System.out);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        if (bedFile != null) {
            final BedLineCodec bedLineCodec = new BedLineCodec();
            LOG.info("Reading " + bedFile);
            BufferedReader r = IOUtils.openFileForBufferedReading(bedFile);
            String line;
            while ((line = r.readLine()) != null) {
                BedLine bedLine = bedLineCodec.decode(line);
                if (bedLine == null)
                    continue;
                if (dict != null && dict.getSequence(bedLine.getContig()) == null) {
                    LOG.warning("undefined chromosome in " + bedFile + " " + line);
                    continue;
                }
                intervals.put(bedLine.toInterval(), true);
            }
            CloserUtil.close(r);
        }
        LinkedList<SAMRecord> buffer = new LinkedList<SAMRecord>();
        final Predicate<SAMRecord> filterSamRecords = new Predicate<SAMRecord>() {
            @Override
            public boolean test(SAMRecord rec) {
                if (rec.getReadUnmappedFlag())
                    return false;
                if (rec.isSecondaryOrSupplementary())
                    return false;
                if (rec.getDuplicateReadFlag())
                    return false;
                if (rec.getReadFailsVendorQualityCheckFlag())
                    return false;
                Cigar cigar = rec.getCigar();
                if (cigar == null || cigar.numCigarElements() < 2)
                    return false;
                boolean found_S = false;
                for (int side = 0; side < 2; ++side) {
                    CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
                    // read must be clipped on 5' or 3' with a good length
                    if (!ce.getOperator().equals(CigarOperator.S))
                        continue;
                    found_S = true;
                    break;
                }
                if (!found_S)
                    return false;
                SAMReadGroupRecord g = rec.getReadGroup();
                if (g == null || g.getSample() == null || g.getSample().isEmpty())
                    return false;
                return true;
            }
        };
        final FilteringIterator<SAMRecord> forwardIterator = new FilteringIterator<SAMRecord>(merginIter, filterSamRecords);
        for (;;) {
            SAMRecord rec = null;
            if (forwardIterator.hasNext()) {
                rec = forwardIterator.next();
                progress.watch(rec);
                if (!intervals.isEmpty() && !intervals.containsOverlapping(new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd())))
                    continue;
            }
            // need to flush buffer ?
            if (rec == null || (!buffer.isEmpty() && !buffer.getLast().getReferenceIndex().equals(rec.getReferenceIndex())) || (!buffer.isEmpty() && buffer.getLast().getUnclippedEnd() + readLength < rec.getUnclippedStart())) {
                if (!buffer.isEmpty()) {
                    int chromStart = buffer.getFirst().getUnclippedStart();
                    int chromEnd = buffer.getFirst().getUnclippedEnd();
                    for (SAMRecord sam : buffer) {
                        chromStart = Math.min(chromStart, sam.getUnclippedStart());
                        chromEnd = Math.max(chromEnd, sam.getUnclippedEnd());
                    }
                    final int winShift = 5;
                    for (int pos = chromStart; pos + winShift <= chromEnd; pos += winShift) {
                        int[] count_big_clip = new int[] { 0, 0 };
                        // int max_depth[]=new int[]{0,0};
                        List<Genotype> genotypes = new ArrayList<Genotype>();
                        Set<Allele> all_alleles = new HashSet<Allele>();
                        all_alleles.add(reference_allele);
                        boolean found_one_depth_ok = false;
                        int sum_depth = 0;
                        int samples_with_high_depth = 0;
                        for (String sample : sample2input.keySet()) {
                            GenotypeBuilder gb = new GenotypeBuilder(sample);
                            int[] count_clipped = new int[] { 0, 0 };
                            Set<Allele> sample_alleles = new HashSet<Allele>(3);
                            for (int side = 0; side < 2; ++side) {
                                for (SAMRecord sam : buffer) {
                                    if (!sam.getReadGroup().getSample().equals(sample))
                                        continue;
                                    Cigar cigar = sam.getCigar();
                                    CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
                                    if (!ce.getOperator().equals(CigarOperator.S))
                                        continue;
                                    int clipStart = (side == 0 ? sam.getUnclippedStart() : sam.getAlignmentEnd() + 1);
                                    int clipEnd = (side == 0 ? sam.getAlignmentStart() - 1 : sam.getUnclippedEnd());
                                    if ((pos + winShift < clipStart || pos > clipEnd))
                                        continue;
                                    count_clipped[side]++;
                                    if (ce.getLength() >= this.min_clip_length) {
                                        count_big_clip[side]++;
                                    }
                                    sample_alleles.add(alternate_alleles[side]);
                                    gb.attribute("CN" + (side == 0 ? 5 : 3), count_clipped[side]);
                                }
                            }
                            // if(!(found_one_big_clip[0] || found_one_big_clip[1])) continue;
                            if (count_clipped[0] + count_clipped[1] == 0)
                                continue;
                            if ((count_clipped[0] + count_clipped[1]) > min_depth) {
                                found_one_depth_ok = true;
                                ++samples_with_high_depth;
                            }
                            sum_depth += (count_clipped[0] + count_clipped[1]);
                            gb.alleles(new ArrayList<Allele>(sample_alleles));
                            all_alleles.addAll(sample_alleles);
                            gb.DP(count_clipped[0] + count_clipped[1]);
                            genotypes.add(gb.make());
                        }
                        if (all_alleles.size() == 1) {
                            // all homozygotes
                            continue;
                        }
                        if (!found_one_depth_ok) {
                            continue;
                        }
                        if (!(count_big_clip[0] >= 1 || count_big_clip[1] >= 1)) {
                            continue;
                        }
                        Map<String, Object> atts = new HashMap<String, Object>();
                        atts.put("COUNT_SAMPLES", samples_with_high_depth);
                        atts.put(VCFConstants.DEPTH_KEY, sum_depth);
                        VariantContextBuilder vcb = new VariantContextBuilder();
                        vcb.chr(buffer.getFirst().getReferenceName());
                        vcb.start(pos);
                        vcb.stop(pos + winShift);
                        vcb.alleles(all_alleles);
                        vcb.attributes(atts);
                        vcb.genotypes(genotypes);
                        w.add(vcb.make());
                    }
                    buffer.clear();
                }
                if (rec == null) {
                    break;
                }
            }
            buffer.add(rec);
        }
        merginIter.close();
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        for (Input input : inputs) {
            CloserUtil.close(input);
        }
        CloserUtil.close(w);
    }
}
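The anonymous Predicate above is the heart of the scan: keep only primary, mapped, non-duplicate reads whose CIGAR begins or ends with a soft clip. Restated in isolation as a small helper (a sketch only; it omits the read-group/sample check the tool also applies, since counts are accumulated per sample):

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;

public final class SoftClipTest {
    /** True if a usable read's CIGAR is soft-clipped (S) at its 5' or 3' end. */
    public static boolean isSoftClipped(final SAMRecord rec) {
        if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary()
                || rec.getDuplicateReadFlag() || rec.getReadFailsVendorQualityCheckFlag()) {
            return false;
        }
        final Cigar cigar = rec.getCigar();
        if (cigar == null || cigar.numCigarElements() < 2) {
            return false;
        }
        final CigarElement first = cigar.getCigarElement(0);
        final CigarElement last = cigar.getCigarElement(cigar.numCigarElements() - 1);
        return first.getOperator() == CigarOperator.S || last.getOperator() == CigarOperator.S;
    }
}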
use of htsjdk.samtools.MergingSamRecordIterator in project gatk by broadinstitute.
the class MergeSamFiles method doWork.
/** Combines multiple SAM/BAM files into one. */
@Override
protected Object doWork() {
    boolean matchedSortOrders = true;
    // Open the files for reading and writing
    final List<SamReader> readers = new ArrayList<>();
    final List<SAMFileHeader> headers = new ArrayList<>();
    {
        // Used to try and reduce redundant SDs in memory
        SAMSequenceDictionary dict = null;
        for (final File inFile : INPUT) {
            IOUtil.assertFileIsReadable(inFile);
            final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(inFile);
            readers.add(in);
            headers.add(in.getFileHeader());
            // replace the duplicate copies with a single dictionary to reduce the memory footprint.
            if (dict == null) {
                dict = in.getFileHeader().getSequenceDictionary();
            } else if (dict.equals(in.getFileHeader().getSequenceDictionary())) {
                in.getFileHeader().setSequenceDictionary(dict);
            }
            matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
        }
    }
    // If all the input sort orders match the output sort order then just merge them and
    // write on the fly, otherwise set up to merge and sort before writing out the final file
    IOUtil.assertFileIsWritable(OUTPUT);
    final boolean presorted;
    final SAMFileHeader.SortOrder headerMergerSortOrder;
    final boolean mergingSamRecordIteratorAssumeSorted;
    if (matchedSortOrders || SORT_ORDER == SAMFileHeader.SortOrder.unsorted || ASSUME_SORTED) {
        logger.info("Input files are in the same order as output so sorting to temp directory is not needed.");
        headerMergerSortOrder = SORT_ORDER;
        mergingSamRecordIteratorAssumeSorted = ASSUME_SORTED;
        presorted = true;
    } else {
        logger.info("Sorting input files using temp directory " + TMP_DIR);
        headerMergerSortOrder = SAMFileHeader.SortOrder.unsorted;
        mergingSamRecordIteratorAssumeSorted = false;
        presorted = false;
    }
    final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(headerMergerSortOrder, headers, MERGE_SEQUENCE_DICTIONARIES);
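    // When the inputs are not already in the requested order, the header merger was
    // built as 'unsorted' and presorted == false, so the writer re-sorts the records.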
    final MergingSamRecordIterator iterator = new MergingSamRecordIterator(headerMerger, readers, mergingSamRecordIteratorAssumeSorted);
    final SAMFileHeader header = headerMerger.getMergedHeader();
    for (final String comment : COMMENT) {
        header.addComment(comment);
    }
    header.setSortOrder(SORT_ORDER);
    final SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory();
    if (USE_THREADING) {
        samFileWriterFactory.setUseAsyncIo(true);
    }
    try (final SAMFileWriter out = createSAMWriter(OUTPUT, REFERENCE_SEQUENCE, header, presorted)) {
        // Lastly loop through and write out the records
        final ProgressLogger progress = new ProgressLogger(logger, PROGRESS_INTERVAL);
        while (iterator.hasNext()) {
            final SAMRecord record = iterator.next();
            out.addAlignment(record);
            progress.record(record);
        }
        logger.info("Finished reading inputs.");
        CloserUtil.close(readers);
    }
    return null;
}
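Stripped of the sort-order negotiation and command-line plumbing, the merge itself is compact. A minimal sketch, under the assumption that all inputs are coordinate-sorted with compatible dictionaries; the file names (a.bam, b.bam, merged.bam) are placeholders:

import htsjdk.samtools.MergingSamRecordIterator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloserUtil;
import java.io.File;
import java.util.ArrayList;
import java.util.List;

public class MergeSketch {
    public static void main(final String[] args) {
        final List<SamReader> readers = new ArrayList<>();
        final List<SAMFileHeader> headers = new ArrayList<>();
        for (final String path : new String[] { "a.bam", "b.bam" }) { // hypothetical files
            final SamReader in = SamReaderFactory.makeDefault().open(new File(path));
            readers.add(in);
            headers.add(in.getFileHeader());
        }
        // false = require a shared dictionary instead of merging dictionaries.
        final SamFileHeaderMerger headerMerger =
            new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, false);
        // false = trust each input's declared coordinate sort order.
        final MergingSamRecordIterator it = new MergingSamRecordIterator(headerMerger, readers, false);
        try (final SAMFileWriter out = new SAMFileWriterFactory()
                .makeBAMWriter(headerMerger.getMergedHeader(), true, new File("merged.bam"))) {
            while (it.hasNext()) {
                out.addAlignment(it.next());
            }
        }
        CloserUtil.close(readers);
    }
}

MergeSamFiles layers onto this the matched-sort-order detection shown above, so the writer only re-sorts when the inputs disagree with the requested SORT_ORDER.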