use of htsjdk.samtools.util.IntervalList in project jvarkit by lindenb.
the class ImpactOfDuplicates method doWork.
@Override
public int doWork(final List<String> args) {
CloseableIterator<Duplicate> dupIter = null;
final List<File> INPUT = args.stream().map(S -> new File(S)).collect(Collectors.toList());
try {
this.duplicates = SortingCollection.newInstance(Duplicate.class, new DuplicateCodec(), new Comparator<Duplicate>() {
@Override
public int compare(Duplicate o1, Duplicate o2) {
return o1.compareTo(o2);
}
}, this.sortingCollectionArgs.getMaxRecordsInRam(), this.sortingCollectionArgs.getTmpPaths());
for (this.bamIndex = 0; this.bamIndex < INPUT.size(); this.bamIndex++) {
int prev_tid = -1;
int prev_pos = -1;
long nLines = 0L;
File inFile = INPUT.get(this.bamIndex);
LOG.info("Processing " + inFile);
IOUtil.assertFileIsReadable(inFile);
SamReader samReader = null;
CloseableIterator<SAMRecord> iter = null;
try {
samReader = SamReaderFactory.make().validationStringency(ValidationStringency.LENIENT).open(inFile);
final SAMFileHeader header = samReader.getFileHeader();
this.samFileDicts.add(header.getSequenceDictionary());
if (BEDFILE == null) {
iter = samReader.iterator();
} else {
IntervalList intervalList = new IntervalList(header);
BufferedReader in = new BufferedReader(new FileReader(BEDFILE));
String line = null;
while ((line = in.readLine()) != null) {
if (line.isEmpty() || line.startsWith("#"))
continue;
String[] tokens = line.split("[\t]");
Interval interval = new Interval(tokens[0], 1 + Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]));
intervalList.add(interval);
}
in.close();
intervalList = intervalList.sorted();
List<Interval> uniqueIntervals = IntervalList.getUniqueIntervals(intervalList, false);
SamRecordIntervalIteratorFactory sriif = new SamRecordIntervalIteratorFactory();
iter = sriif.makeSamRecordIntervalIterator(samReader, uniqueIntervals, false);
}
while (iter.hasNext()) {
SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (!rec.getReadPairedFlag())
continue;
if (rec.getReferenceIndex() != rec.getMateReferenceIndex())
continue;
if (!rec.getProperPairFlag())
continue;
if (!rec.getFirstOfPairFlag())
continue;
if (prev_tid != -1) {
if (prev_tid > rec.getReferenceIndex()) {
throw new IOException("Bad sort order from " + rec);
} else if (prev_tid == rec.getReferenceIndex() && prev_pos > rec.getAlignmentStart()) {
throw new IOException("Bad sort order from " + rec);
} else {
prev_pos = rec.getAlignmentStart();
}
} else {
prev_tid = rec.getReferenceIndex();
prev_pos = -1;
}
if ((++nLines) % 1000000 == 0) {
LOG.info("In " + inFile + " N=" + nLines);
}
Duplicate dup = new Duplicate();
dup.bamIndex = this.bamIndex;
dup.pos = Math.min(rec.getAlignmentStart(), rec.getMateAlignmentStart());
dup.tid = rec.getReferenceIndex();
dup.size = Math.abs(rec.getInferredInsertSize());
this.duplicates.add(dup);
}
} finally {
if (iter != null)
iter.close();
if (samReader != null)
samReader.close();
}
LOG.info("done " + inFile);
}
/**
* loop done, now scan the duplicates
*/
LOG.info("doneAdding");
this.duplicates.doneAdding();
this.out = super.openFileOrStdoutAsPrintStream(outputFile);
out.print("#INTERVAL\tMAX\tMEAN");
for (int i = 0; i < INPUT.size(); ++i) {
out.print('\t');
out.print(INPUT.get(i));
}
out.println();
dupIter = this.duplicates.iterator();
while (dupIter.hasNext()) {
Duplicate dup = dupIter.next();
if (this.duplicatesBuffer.isEmpty() || dup.compareChromPosSize(this.duplicatesBuffer.get(0)) == 0) {
this.duplicatesBuffer.add(dup);
} else {
dumpDuplicatesBuffer(INPUT);
this.duplicatesBuffer.add(dup);
}
}
dumpDuplicatesBuffer(INPUT);
LOG.info("end iterator");
out.flush();
out.close();
} catch (Exception e) {
LOG.error(e);
return -1;
} finally {
if (dupIter != null)
dupIter.close();
LOG.info("cleaning duplicates");
this.duplicates.cleanup();
}
return 0;
}
use of htsjdk.samtools.util.IntervalList in project jvarkit by lindenb.
the class VCFAnnoBam method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator r, final VariantContextWriter w) {
BufferedReader bedIn = null;
List<SamReader> samReaders = new ArrayList<SamReader>();
IntervalTreeMap<Rgn> capture = new IntervalTreeMap<Rgn>();
try {
SAMFileHeader firstHeader = null;
for (final File samFile : new HashSet<File>(BAMFILE)) {
LOG.info("open bam " + samFile);
final SamReader samReader = super.openSamReader(samFile.getPath());
final SAMFileHeader samHeader = samReader.getFileHeader();
samReaders.add(samReader);
if (firstHeader == null) {
firstHeader = samHeader;
} else if (!SequenceUtil.areSequenceDictionariesEqual(firstHeader.getSequenceDictionary(), samHeader.getSequenceDictionary())) {
throw new JvarkitException.DictionariesAreNotTheSame(firstHeader.getSequenceDictionary(), samHeader.getSequenceDictionary());
}
}
IntervalList intervalList = new IntervalList(firstHeader);
LOG.info("read bed " + BEDILE);
bedIn = IOUtils.openFileForBufferedReading(BEDILE);
String line;
final BedLineCodec bedCodec = new BedLineCodec();
while ((line = bedIn.readLine()) != null) {
if (line.isEmpty() || line.startsWith("#"))
continue;
final BedLine bed = bedCodec.decode(line);
if (bed == null)
continue;
if (firstHeader.getSequenceDictionary().getSequence(bed.getContig()) == null) {
LOG.error("error in BED +" + BEDILE + " : " + line + " chromosome is not in sequence dict of " + BAMFILE);
continue;
}
intervalList.add(bed.toInterval());
}
bedIn.close();
bedIn = null;
intervalList = intervalList.sorted();
for (final Interval interval : intervalList.uniqued()) {
final Rgn rgn = new Rgn();
rgn.interval = interval;
capture.put(rgn.interval, rgn);
}
intervalList = null;
VCFHeader header = r.getHeader();
VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), header.getSampleNamesInOrder());
h2.addMetaDataLine(new VCFInfoHeaderLine(this.capture_tag, 1, VCFHeaderLineType.String, "Capture stats: Format is (start|end|mean|min|max|length|not_covered|percent_covered) BAM files: " + BAMFILE + " CAPTURE:" + BEDILE));
w.writeHeader(h2);
while (r.hasNext()) {
final VariantContext ctx = r.next();
Interval interval = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd());
Collection<Rgn> rgns = capture.getOverlapping(interval);
Iterator<Rgn> it = rgns.iterator();
if (!it.hasNext()) {
w.add(ctx);
continue;
}
final Rgn rgn = it.next();
if (!rgn.processed) {
// LOG.info("processing "+rgn.interval);
process(rgn, samReaders);
}
final VariantContextBuilder b = new VariantContextBuilder(ctx);
b.attribute(this.capture_tag, rgn.toString());
w.add(b.make());
}
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
for (final SamReader samReader : samReaders) CloserUtil.close(samReader);
}
}
Aggregations