use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class CompareBams method doWork.
@Override
public int doWork(final List<String> args) {
this.inputBamsList.addAll(IOUtils.unrollPaths(args));
SortingCollection<Match> database = null;
SamReader samFileReader = null;
CloseableIterator<Match> iter = null;
try {
if (this.inputBamsList.size() < 2) {
LOG.error("Need more bams please, got " + this.inputBamsList.size());
return -1;
}
database = SortingCollection.newInstance(Match.class, new MatchCodec(), (A, B) -> matchCompare0(A, B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
this.samSequenceDictAreTheSame = true;
database.setDestructiveIteration(true);
for (int currentSamFileIndex = 0; currentSamFileIndex < this.inputBamsList.size(); currentSamFileIndex++) {
final Path samFile = this.inputBamsList.get(currentSamFileIndex);
samFileReader = super.createSamReaderFactory().referenceSequence(this.refPath).open(samFile);
final SAMSequenceDictionary dict = samFileReader.getFileHeader().getSequenceDictionary();
if (dict == null || dict.isEmpty()) {
LOG.error("Empty Dict in " + samFile);
return -1;
}
if (!this.sequenceDictionaries.isEmpty() && !SequenceUtil.areSequenceDictionariesEqual(this.sequenceDictionaries.get(0), dict)) {
this.samSequenceDictAreTheSame = false;
LOG.warn("FOOL !! THE SEQUENCE DICTIONARIES ARE **NOT** THE SAME. I will try to compare anyway but it will be slower.");
}
this.sequenceDictionaries.add(dict);
final Optional<SimpleInterval> interval;
if (!StringUtils.isBlank(this.REGION)) {
interval = IntervalParserFactory.newInstance().dictionary(dict).make().apply(this.REGION);
if (!interval.isPresent()) {
LOG.error("Cannot parse " + REGION + " (bad syntax or not in dictionary)");
return -1;
}
} else {
interval = Optional.empty();
}
SAMRecordIterator it = null;
if (!interval.isPresent()) {
it = samFileReader.iterator();
} else {
it = samFileReader.queryOverlapping(interval.get().getContig(), interval.get().getStart(), interval.get().getEnd());
}
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
while (it.hasNext()) {
final SAMRecord rec = progress.apply(it.next());
if (!rec.getReadUnmappedFlag()) {
if (rec.getMappingQuality() < this.min_mapq)
continue;
if (!this.no_read_filtering && rec.isSecondaryOrSupplementary())
continue;
}
final Match m = new Match();
if (rec.getReadPairedFlag()) {
m.num_in_pair = (rec.getFirstOfPairFlag() ? 1 : 2);
} else {
m.num_in_pair = 0;
}
m.readName = rec.getReadName();
m.bamIndex = currentSamFileIndex;
m.flag = rec.getFlags();
m.cigar = rec.getCigarString();
if (m.cigar == null)
m.cigar = "";
if (rec.getReadUnmappedFlag()) {
m.tid = -1;
m.pos = -1;
} else {
m.tid = rec.getReferenceIndex();
m.pos = rec.getAlignmentStart();
}
database.add(m);
}
progress.close();
it.close();
samFileReader.close();
samFileReader = null;
}
database.doneAdding();
LOG.info("Writing results....");
this.out = super.openPathOrStdoutAsPrintWriter(this.outputFile);
// compute the differences for each read
this.out.print("#READ-Name\t");
for (int x = 0; x < this.inputBamsList.size(); ++x) {
for (int y = x + 1; y < this.inputBamsList.size(); ++y) {
if (!(x == 0 && y == 1))
this.out.print("|");
this.out.print(inputBamsList.get(x));
this.out.print(" ");
this.out.print(inputBamsList.get(y));
}
}
for (int x = 0; x < this.inputBamsList.size(); ++x) {
this.out.print("\t" + inputBamsList.get(x));
}
this.out.println();
/* create an array of set<Match> */
final Comparator<Match> match_comparator = (A, B) -> matchCompare1(A, B);
final List<Set<Match>> matches = new ArrayList<Set<CompareBams.Match>>(this.inputBamsList.size());
while (matches.size() < this.inputBamsList.size()) {
matches.add(new TreeSet<CompareBams.Match>(match_comparator));
}
iter = database.iterator();
String currReadName = null;
int curr_num_in_pair = -1;
for (; ; ) {
Match nextMatch = null;
if (iter.hasNext()) {
nextMatch = iter.next();
}
if (nextMatch == null || (currReadName != null && !currReadName.equals(nextMatch.readName)) || (curr_num_in_pair != -1 && curr_num_in_pair != nextMatch.num_in_pair)) {
if (currReadName != null) {
this.out.print(currReadName);
if (curr_num_in_pair > 0) {
this.out.print("/");
this.out.print(curr_num_in_pair);
}
this.out.print("\t");
for (int x = 0; x < this.inputBamsList.size(); ++x) {
final Set<Match> first = matches.get(x);
for (int y = x + 1; y < this.inputBamsList.size(); ++y) {
if (!(x == 0 && y == 1))
this.out.print("|");
final Set<Match> second = matches.get(y);
if (same(first, second)) {
this.out.print("EQ");
} else {
this.out.print("NE");
}
}
}
for (int x = 0; x < this.inputBamsList.size(); ++x) {
this.out.print("\t");
print(matches.get(x), sequenceDictionaries.get(x));
}
this.out.println();
}
if (nextMatch == null)
break;
for (final Set<Match> set : matches) set.clear();
}
currReadName = nextMatch.readName;
curr_num_in_pair = nextMatch.num_in_pair;
matches.get(nextMatch.bamIndex).add(nextMatch);
if (this.out.checkError())
break;
}
iter.close();
this.out.flush();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
if (database != null)
database.cleanup();
CloserUtil.close(samFileReader);
CloserUtil.close(this.out);
this.out = null;
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class MergeBlastXml method doWork.
@Override
public int doWork(List<String> args) {
if (args.isEmpty()) {
LOG.error("input xml missing");
return -1;
}
XMLEventReader rx = null;
XMLEventReader rx2 = null;
XMLEventWriter wx = null;
SortingCollection<Iteration> sortingCollection = null;
try {
JAXBContext jc = JAXBContext.newInstance("gov.nih.nlm.ncbi.blast");
this.unmarshaller = jc.createUnmarshaller();
this.marshaller = jc.createMarshaller();
this.marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, true);
this.marshaller.setProperty(Marshaller.JAXB_FRAGMENT, true);
XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
xmlInputFactory.setXMLResolver(new XMLResolver() {
@Override
public Object resolveEntity(String arg0, String arg1, String arg2, String arg3) throws XMLStreamException {
LOG.info("resolveEntity:" + arg0 + "/" + arg1 + "/" + arg2);
return null;
}
});
final Comparator<Iteration> hitComparator = (A, B) -> {
return A.getIterationQueryDef().compareTo(B.getIterationQueryDef());
};
sortingCollection = SortingCollection.newInstance(Iteration.class, new BlastIterationCodec(), hitComparator, this.maxRecordsInRam, this.tmpFile.toPath());
rx = xmlInputFactory.createXMLEventReader(new FileReader(args.get(0)));
XMLOutputFactory xof = XMLOutputFactory.newFactory();
if (this.outputFile != null) {
wx = xof.createXMLEventWriter(new StreamResult(this.outputFile));
} else {
wx = xof.createXMLEventWriter(new StreamResult(stdout()));
}
boolean in_iteration = false;
while (rx.hasNext()) {
final XMLEvent evt = rx.peek();
if (evt.isStartElement() && evt.asStartElement().getName().getLocalPart().equals("Iteration")) {
final Iteration iteration = this.unmarshaller.unmarshal(rx, Iteration.class).getValue();
sortingCollection.add(iteration);
} else if (evt.isStartElement() && evt.asStartElement().getName().getLocalPart().equals("BlastOutput_iterations")) {
wx.add(rx.nextEvent());
in_iteration = true;
} else if (evt.isEndElement() && evt.asEndElement().getName().getLocalPart().equals("BlastOutput_iterations")) {
for (int optind = 1; optind < args.size(); ++optind) {
LOG.info("opening " + args.get(optind));
rx2 = xmlInputFactory.createXMLEventReader(new FileReader(args.get(optind)));
while (rx2.hasNext()) {
XMLEvent evt2 = rx2.peek();
if (evt2.isStartElement() && evt2.asStartElement().getName().getLocalPart().equals("Iteration")) {
final Iteration iteration = this.unmarshaller.unmarshal(rx2, Iteration.class).getValue();
sortingCollection.add(iteration);
} else {
rx2.nextEvent();
}
}
rx2.close();
LOG.info("close");
}
sortingCollection.doneAdding();
sortingCollection.setDestructiveIteration(true);
final CloseableIterator<Iteration> coliter = sortingCollection.iterator();
final EqualRangeIterator<Iteration> eq = new EqualRangeIterator<>(coliter, hitComparator);
while (coliter.hasNext()) {
final List<Iteration> L = eq.next();
for (int i = 1; i < L.size(); ++i) {
L.get(0).getIterationHits().getHit().addAll(L.get(i).getIterationHits().getHit());
}
marshaller.marshal(L.get(0), wx);
}
eq.close();
coliter.close();
sortingCollection.cleanup();
sortingCollection = null;
wx.add(rx.nextEvent());
in_iteration = false;
} else if (in_iteration) {
// consumme text
rx.nextEvent();
} else {
wx.add(rx.nextEvent());
}
}
wx.flush();
wx.close();
return 0;
} catch (Exception e) {
LOG.error(e);
if (sortingCollection != null) {
sortingCollection.cleanup();
}
return -1;
} finally {
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class VcfSlidingWindowSplitter method run.
private int run(final List<String> args) {
SortingCollection<WinAndLine> sortingcollection = null;
BufferedReader in = null;
FileOutputStream fos = null;
CloseableIterator<WinAndLine> iter = null;
ArchiveFactory archiveFactory = null;
PrintWriter manifest = null;
final Function<VariantContext, List<Interval>> makeWindows = (ctx) -> {
final int ctx_start = ctx.getStart();
final int ctx_end = ctx.getEnd();
final List<Interval> list = new ArrayList<>();
int right = ctx_start - ctx_start % this.window_shift;
while (right + this.window_size >= ctx_start) {
right -= this.window_shift;
}
while (right <= ctx_end) {
final int left = right + this.window_size;
if (right > 0 && CoordMath.overlaps(right, left, ctx_start, ctx_end)) {
list.add(new Interval(ctx.getContig(), right, left));
}
right += this.window_shift;
}
return list;
};
try {
final Path tmpVcf = Files.createTempFile("tmp.", ".vcf.gz");
archiveFactory = ArchiveFactory.open(this.outputFile);
manifest = new PrintWriter(this.manifestFile == null ? new NullOuputStream() : IOUtils.openPathForWriting(manifestFile));
manifest.println("#chrom\tstart\tend\twindow\tpath\tCount_Variants");
in = super.openBufferedReader(oneFileOrNull(args));
final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
// read variants
final ProgressFactory.Watcher<VariantContext> progess = ProgressFactory.newInstance().dictionary(cah.header).logger(LOG).build();
String prevCtg = null;
for (; ; ) {
final String line = in.readLine();
final VariantContext ctx = (line == null ? null : cah.codec.decode(line));
if (ctx != null)
progess.apply(ctx);
if (ctx == null || !ctx.getContig().equals(prevCtg)) {
if (sortingcollection != null) {
sortingcollection.doneAdding();
iter = sortingcollection.iterator();
final EqualRangeIterator<WinAndLine> eqiter = new EqualRangeIterator<>(iter, new WinAndLineComparator1());
while (eqiter.hasNext()) {
final List<WinAndLine> buffer = eqiter.next();
if (buffer.size() < this.min_number_of_ctx)
continue;
if (this.max_number_of_ctx > 0 && buffer.size() > this.max_number_of_ctx)
continue;
final WinAndLine first = buffer.get(0);
final VariantContextWriter out = VCFUtils.createVariantContextWriterToPath(tmpVcf);
final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + ".interval", first.interval.getContig() + ":" + first.interval.getStart() + "-" + first.interval.getEnd()));
out.writeHeader(header2);
int minPos = Integer.MAX_VALUE;
int maxPos = 0;
for (final WinAndLine kl : buffer) {
final VariantContext ctx2 = cah.codec.decode(kl.ctx);
minPos = Math.min(ctx2.getStart(), minPos);
maxPos = Math.max(ctx2.getEnd(), maxPos);
out.add(ctx2);
}
out.close();
final String md5 = StringUtils.md5(first.interval.getContig() + ":" + first.interval.getStart() + "-" + first.interval.getEnd());
final String filename = md5.substring(0, 2) + File.separatorChar + md5.substring(2) + File.separator + first.interval.getContig() + "_" + first.interval.getStart() + "_" + first.interval.getEnd() + ".vcf.gz";
try (final OutputStream os = archiveFactory.openOuputStream(filename)) {
IOUtils.copyTo(tmpVcf, os);
os.flush();
}
manifest.print(prevCtg);
manifest.print('\t');
manifest.print(minPos - 1);
manifest.print('\t');
manifest.print(maxPos);
manifest.print('\t');
manifest.print(first.interval.getContig() + ":" + first.interval.getStart() + "-" + first.interval.getEnd());
manifest.print('\t');
manifest.print((archiveFactory.isTarOrZipArchive() ? "" : this.outputFile.toString() + File.separator) + filename);
manifest.print('\t');
manifest.println(buffer.size());
}
eqiter.close();
iter.close();
}
sortingcollection = null;
if (ctx == null)
break;
prevCtg = ctx.getContig();
}
for (final Interval win : makeWindows.apply(ctx)) {
if (sortingcollection == null) {
sortingcollection = SortingCollection.newInstance(WinAndLine.class, new WinAndLineCodec(), new WinAndLineComparator2(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingcollection.setDestructiveIteration(true);
}
sortingcollection.add(new WinAndLine(win, line));
}
}
progess.close();
manifest.flush();
manifest.close();
archiveFactory.close();
Files.deleteIfExists(tmpVcf);
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
if (sortingcollection != null)
sortingcollection.cleanup();
CloserUtil.close(in);
CloserUtil.close(fos);
CloserUtil.close(manifest);
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class OneThousandBam method doWork.
@Override
public int doWork(final List<String> args) {
SAMFileWriter sfw = null;
QueryInterval[] queryIntervalArray = null;
try {
if (this.bedFile == null && !this.exitAfterBaiDownload) {
LOG.error("bed file is missing");
return -1;
}
IOUtil.assertDirectoryIsWritable(this.baiCacheDir);
if (args.isEmpty()) {
LOG.error("no urls");
} else if (args.size() == 1 && args.get(0).endsWith(".list")) {
Files.lines(Paths.get(args.get(0))).filter(L -> !StringUtils.isBlank(L)).filter(L -> !L.startsWith("#")).map(L -> new Sample1KG(L)).forEach(S -> this.samples.add(S));
} else {
args.stream().map(L -> new Sample1KG(L)).forEach(S -> this.samples.add(S));
}
if (this.samples.isEmpty()) {
LOG.error("no sample defined");
return -1;
}
for (int x = 0; x < this.samples.size(); x++) {
LOG.debug("bai " + (x + 1) + "/" + this.samples.size());
this.samples.get(x).downloadBaiToCache();
}
if (this.exitAfterBaiDownload) {
LOG.info("Bai downloaded");
return 0;
}
for (final Sample1KG sample : this.samples) {
LOG.debug("get sam file header for " + sample.url);
try (final SamReader sr = sample.open()) {
final SAMFileHeader header = sr.getFileHeader();
if (!header.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
throw new RuntimeIOException(sample.url + " is not sorted");
}
sample.sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElseThrow(() -> new IllegalArgumentException("Cannot find sample in " + sample.url));
final SAMSequenceDictionary dict = header.getSequenceDictionary();
if (this.dict == null) {
LOG.debug("read bed file " + this.bedFile);
final List<QueryInterval> queryIntervals = new ArrayList<>();
this.dict = dict;
// load bed here
ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(this.dict);
final BedLineCodec codec = new BedLineCodec();
BufferedReader br = IOUtils.openPathForBufferedReading(this.bedFile);
String line;
while ((line = br.readLine()) != null) {
final BedLine bed = codec.decode(line);
if (bed == null)
continue;
String newCtg = ctgConverter.apply(bed.getContig());
if (StringUtils.isBlank(newCtg)) {
throw new JvarkitException.ContigNotFoundInDictionary(bed.getContig(), this.dict);
}
final int tid = this.dict.getSequenceIndex(newCtg);
final QueryInterval queryInterval = new QueryInterval(tid, bed.getStart(), bed.getEnd());
queryIntervals.add(queryInterval);
}
br.close();
queryIntervalArray = QueryInterval.optimizeIntervals(queryIntervals.toArray(new QueryInterval[queryIntervals.size()]));
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, this.dict)) {
throw new JvarkitException.DictionariesAreNotTheSame(dict, this.dict);
}
}
}
if (queryIntervalArray == null || queryIntervalArray.length == 0) {
LOG.error("no query interval defined");
return -1;
}
final SAMRecordComparator samRecordComparator = new SAMRecordCoordinateComparator();
final SAMFileHeader ouSamFileHeader = new SAMFileHeader(this.dict);
ouSamFileHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
for (final Sample1KG sample : this.samples) {
sample.readGroupRecord = new SAMReadGroupRecord(sample.sampleName);
sample.readGroupRecord.setSample(sample.sampleName);
sample.readGroupRecord.setLibrary(sample.sampleName);
sample.readGroupRecord.setAttribute("URL", sample.url);
ouSamFileHeader.addReadGroup(sample.readGroupRecord);
}
final SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
sfwf.setCreateIndex(false);
sfwf.setCreateMd5File(false);
if (this.outputFile == null) {
sfw = sfwf.makeBAMWriter(ouSamFileHeader, true, stdout());
} else {
sfw = sfwf.makeSAMOrBAMWriter(ouSamFileHeader, true, this.outputFile);
}
int interval_index = 0;
while (interval_index < queryIntervalArray.length) {
LOG.debug("interval_index= " + interval_index);
/* current interval fetched by bam, can be updated */
QueryInterval interval = queryIntervalArray[interval_index];
LOG.debug("interval= " + interval);
final List<Path> tmpFiles = new ArrayList<>(this.samples.size());
boolean done = false;
while (!done) {
done = true;
for (final Path p : tmpFiles) Files.delete(p);
tmpFiles.clear();
for (final Sample1KG sample : this.samples) {
if (!done)
break;
// try forever until it works !
for (; ; ) {
// try to download data for the current interval
final Path tmpBam = Files.createTempFile(this.baiCacheDir, "tmp.", ".bam");
SamReader srf = null;
SAMFileWriter rgnWriter = null;
SAMRecordIterator samIter = null;
try {
srf = sample.open();
// LOG.debug("query= "+sample.sampleName+" "+interval);
samIter = srf.query(new QueryInterval[] { interval }, false);
rgnWriter = sfwf.makeBAMWriter(ouSamFileHeader, true, tmpBam);
while (samIter.hasNext()) {
final SAMRecord rec = samIter.next();
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getReadUnmappedFlag())
continue;
if (rec.getAlignmentEnd() > interval.end && /* current read goes beyond current interval */
interval_index + 1 < queryIntervalArray.length && /* there is a new interval */
interval.referenceIndex == queryIntervalArray[interval_index + 1].referenceIndex && /* on same chromosome */
queryIntervalArray[interval_index + 1].start <= rec.getAlignmentEnd()) {
// update current interval, merge with next
interval = new QueryInterval(interval.referenceIndex, interval.start, queryIntervalArray[interval_index + 1].end);
interval_index++;
LOG.info("extending interval to " + interval);
done = false;
break;
}
rec.setAttribute(SAMTag.RG.name(), sample.readGroupRecord.getId());
rgnWriter.addAlignment(rec);
}
samIter.close();
samIter = null;
srf.close();
srf = null;
rgnWriter.close();
rgnWriter = null;
// LOG.debug("donequery= "+sample.url);
} catch (final Exception err) {
LOG.error(err);
Files.delete(tmpBam);
continue;
} finally {
CloserUtil.close(samIter);
CloserUtil.close(srf);
CloserUtil.close(rgnWriter);
}
tmpFiles.add(tmpBam);
break;
}
if (!done)
break;
}
// end of download each sample
if (!done)
continue;
}
// end of while(!done)
// LOG.info("merging "+interval);
// merge everything
final List<SamReader> chunkReaders = new ArrayList<>(samples.size());
final List<CloseableIterator<SAMRecord>> chunkIterators = new ArrayList<>(samples.size());
for (final Path tmpPath : tmpFiles) {
final SamReader tmpReader = samReaderFactory.open(tmpPath);
chunkReaders.add(tmpReader);
chunkIterators.add(tmpReader.iterator());
}
final MergingIterator<SAMRecord> merger = new MergingIterator<>(samRecordComparator, chunkIterators);
while (merger.hasNext()) {
sfw.addAlignment(merger.next());
}
merger.close();
CloserUtil.close(chunkIterators);
CloserUtil.close(chunkReaders);
// cleanup
for (final Path p : tmpFiles) Files.delete(p);
interval_index++;
}
sfw.close();
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class ScanRetroCopy method doWork.
@Override
public int doWork(final List<String> args) {
if (this.min_depth < 1) {
LOG.error("Bad min depth");
return -1;
}
SamReader sr = null;
VariantContextWriter vcw0 = null;
CloseableIterator<SAMRecord> iter = null;
SAMFileWriter sfw = null;
try {
/* load the reference genome */
this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
/* create a contig name converter from the REF */
this.refCtgNameConverter = ContigNameConverter.fromOneDictionary(refDict);
/* READ KNOWGENES FILES */
LOG.info("Loading " + this.knownGeneUri);
try (BufferedReader br = IOUtils.openURIForBufferedReading(this.knownGeneUri)) {
String line;
final CharSplitter tab = CharSplitter.TAB;
while ((line = br.readLine()) != null) {
if (StringUtils.isBlank(line))
continue;
final String[] tokens = tab.split(line);
final KnownGene kg = new KnownGene(tokens);
if (kg.getExonCount() < 2)
continue;
if (this.onlyCodingTranscript && kg.isNonCoding())
continue;
final String ctg = this.refCtgNameConverter.apply(kg.getContig());
if (StringUtils.isBlank(ctg))
continue;
kg.setChrom(ctg);
final Interval interval = new Interval(ctg, kg.getTxStart() + 1, kg.getTxEnd(), kg.isNegativeStrand(), kg.getName());
List<KnownGene> L = this.knownGenesMap.get(interval);
if (L == null) {
L = new ArrayList<KnownGene>();
this.knownGenesMap.put(interval, L);
}
L.add(kg);
}
}
if (this.knownGenesMap.isEmpty()) {
LOG.error("no gene found in " + this.knownGeneUri);
return -1;
}
LOG.info("Number of transcripts: " + this.knownGenesMap.values().stream().flatMap(L -> L.stream()).count());
// open the sam file
final SamReaderFactory samReaderFactory = super.createSamReaderFactory();
samReaderFactory.referenceSequence(this.faidx);
sr = samReaderFactory.open(SamInputResource.of(oneFileOrNull(args)));
final SAMFileHeader samFileHeader = sr.getFileHeader();
// check it's sorted on coordinate
if (!samFileHeader.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
LOG.error("input is not sorted on coordinate but \"" + samFileHeader.getSortOrder() + "\"");
return -1;
}
if (this.saveBamTo != null) {
sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(samFileHeader, true, this.saveBamTo);
}
if (this.use_bai && !sr.hasIndex()) {
LOG.warning("Cannot used bai because input is not indexed");
}
// if there is a bai, only query the bam in the regions of splicing
if (this.use_bai && sr.hasIndex()) {
LOG.info("building intervals...");
final SAMSequenceDictionary samdict = SequenceDictionaryUtils.extractRequired(samFileHeader);
final ContigNameConverter samConvert = ContigNameConverter.fromOneDictionary(samdict);
final List<QueryInterval> intervalsL = this.knownGenesMap.values().stream().flatMap(K -> K.stream()).filter(KG -> samConvert.apply(KG.getContig()) != null).flatMap(KG -> KG.getExons().stream()).flatMap(exon -> {
// we need the reads overlapping the exon bounds
final int tid = samdict.getSequenceIndex(samConvert.apply(exon.getGene().getContig()));
final QueryInterval q1 = new QueryInterval(tid, exon.getStart() + 1, exon.getStart() + 1);
final QueryInterval q2 = new QueryInterval(tid, exon.getEnd(), exon.getEnd());
return Arrays.stream(new QueryInterval[] { q1, q2 });
}).sorted().collect(Collectors.toList());
final QueryInterval[] intervals = QueryInterval.optimizeIntervals(intervalsL.toArray(new QueryInterval[intervalsL.size()]));
// GC
intervalsL.clear();
LOG.debug("Query bam using " + intervals.length + " random access intervals. Please wait...");
iter = sr.queryOverlapping(intervals);
} else {
iter = sr.iterator();
}
final Set<String> samples = this.partiton.getPartitions(samFileHeader);
if (samples.isEmpty()) {
LOG.error("No sample was defined in the read group of the input bam.");
return -1;
}
/**
* build vcf header
*/
final Set<VCFHeaderLine> metaData = new HashSet<>();
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
metaData.add(new VCFInfoHeaderLine(ATT_BEST_MATCHING_LENGTH, 1, VCFHeaderLineType.Integer, "Best Matching length"));
metaData.add(new VCFFormatHeaderLine(ATT_BEST_MATCHING_LENGTH, 1, VCFHeaderLineType.Integer, "Best Matching length"));
metaData.add(new VCFFormatHeaderLine(ATT_GT_INTRON, 1, VCFHeaderLineType.String, "Introns info: (intron-0-idx,valid,dp-5,dp-3,max-len-5,max-len-3,avg-5,avg-3)*"));
// metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
// metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
// "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
metaData.add(new VCFInfoHeaderLine(ATT_INSERTION, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Possible place of insertion:" + "chr:start-end|count-evidence|mate-genes|non-coding|distance"));
metaData.add(new VCFInfoHeaderLine(ATT_KG_STRAND, 1, VCFHeaderLineType.String, "KnownGene strand."));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_INFO, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Introns found: chr|start|end|seq-left|seq-right"));
metaData.add(new VCFFilterHeaderLine(ATT_FILTER_NONDOCODING, "Only non-coding transcripts"));
metaData.add(new VCFInfoHeaderLine(ATT_SAMPLES, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples found. partition:" + this.partiton.name()));
metaData.add(new VCFFilterHeaderLine(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold, "Number of read is lower than :" + this.min_depth));
metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
final VCFHeader header = new VCFHeader(metaData, samples);
JVarkitVersion.getInstance().addMetaData(this, header);
header.setSequenceDictionary(refDict);
/* open vcf for writing*/
vcw0 = super.openVariantContextWriter(this.outputFile);
final VariantContextWriter vcw = vcw0;
vcw.writeHeader(header);
/* save gene writer */
if (this.saveBedPeTo != null) {
this.saveInsertionsPw = super.openPathOrStdoutAsPrintWriter(this.saveBedPeTo);
} else {
this.saveInsertionsPw = new PrintWriter(new NullOuputStream());
}
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(samFileHeader).logger(LOG).build();
while (iter.hasNext()) {
final SAMRecord rec = progress.apply(iter.next());
if (rec.getReadUnmappedFlag())
continue;
if (rec.getMappingQuality() < this.min_read_mapq)
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getDuplicateReadFlag())
continue;
final byte[] bases = rec.getReadBases();
if (bases == null || bases == SAMRecord.NULL_SEQUENCE)
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.numCigarElements() < 2)
continue;
final String refContig = this.refCtgNameConverter.apply(rec.getContig());
if (StringUtils.isBlank(refContig))
continue;
boolean save_read_to_bam = false;
/* get sample */
final String sampleName = this.partiton.getPartion(rec, null);
if (StringUtils.isBlank(sampleName))
continue;
/* this is a new reference sequence */
if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(refContig)) {
if (this.genomicSequence != null) {
/* DUMP things BEFORE changing the reference sequence!!! */
/* dump buffer */
dump(vcw, null);
}
/* map transcript-name to their transcript */
/*this.kgId2knownGenes.clear();
this.knownGenesMap.values().
stream().
flatMap(L->L.stream()).
filter(G->refContig.equals(G.getContig())).
forEach(K->this.kgId2knownGenes.put(K.getName(), K));*/
/* now, we can change genomicSequence */
this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, refContig);
}
final CigarElement leftCigar = cigar.getCigarElement(0);
final CigarElement rightCigar = cigar.getCigarElement(cigar.numCigarElements() - 1);
/* both ends are not candidate */
if (!isCandidateCigarElement(leftCigar) && !isCandidateCigarElement(rightCigar)) {
continue;
}
final List<KnownGene> genes = this.knownGenesMap.getOverlapping(new Interval(refContig, rec.getUnclippedStart(), rec.getUnclippedEnd())).stream().flatMap(L -> L.stream()).collect(Collectors.toList());
/* time to time dump the buffer for the transcripts before the current ones*/
if (!genes.isEmpty()) {
// get all genes overlapping the current set of genes
int minTxStart = genes.stream().mapToInt(K -> K.getTxStart()).min().getAsInt();
final int maxTxStart = genes.stream().mapToInt(K -> K.getTxEnd()).max().getAsInt();
// update minTxtStart to get the lowest gene overlapping the set of genes
minTxStart = this.knownGenesMap.getOverlapping(new Interval(refContig, minTxStart, maxTxStart)).stream().flatMap(L -> L.stream()).mapToInt(K -> K.getStart()).min().getAsInt();
// not max, because we only need the 5' side
dump(vcw, new Interval(refContig, minTxStart, minTxStart));
}
/* test each side of the clipped read */
for (int side = 0; side < 2 && !genes.isEmpty(); ++side) {
final CigarElement ce_side = (side == 0 ? leftCigar : rightCigar);
if (!isCandidateCigarElement(ce_side))
continue;
for (final KnownGene knownGene : genes) {
for (int exonIndex = 0; exonIndex < knownGene.getExonCount(); exonIndex++) {
if (side == 0) /* looking at cigar string in 5' */
{
if (exonIndex == 0)
continue;
// first 'M' element
final CigarLocatable cigarM = new CigarLocatable(refContig, rec, 1);
// first cigar element
final CigarLocatable cigarS = new CigarLocatable(refContig, rec, 0);
// current exon
final ExonOne exonRight = new ExonOne(knownGene.getExon(exonIndex));
if (!cigarM.overlaps(exonRight))
continue;
if (!(exonRight.getStart() >= cigarM.getStart()))
continue;
// get exon on the left
final ExonOne exonLeft = new ExonOne(knownGene.getExon(exonIndex - 1));
if (exonLeft.getLengthOnReference() < this.minCigarSize)
continue;
/* end of cigar 'M' can have same bases than the prev exon. */
final int malus = exonRight.getStart() - cigarM.getStart();
int genomic1 = exonLeft.getEnd() - malus;
if (genomic1 < exonLeft.getStart() || genomic1 > exonLeft.getEnd()) {
continue;
}
int matchLength = (this.use_malus ? malus : 0);
int readIdx0 = cigarS.size() - 1;
// loop over sequence
while (readIdx0 >= 0 && genomic1 >= exonLeft.getStart()) {
final char read_base = cigarS.readBaseAt0(readIdx0);
final char genome_base = exonLeft.charAt1(genomic1);
if (read_base != genome_base) {
break;
}
readIdx0--;
matchLength++;
genomic1--;
}
if (matchLength < this._priv_ignoreCigarSize)
continue;
final KnownGene.Intron intron = knownGene.getIntron(exonIndex - 1);
// find match or create new
final Match match = new Match(intron, sampleName, rec, SIDE_3_PRIME, matchLength);
this.intronBuffer.add(match);
save_read_to_bam = true;
} else /* test last cigar */
{
if (exonIndex + 1 >= knownGene.getExonCount())
continue;
// last 'M' element
final CigarLocatable cigarM = new CigarLocatable(refContig, rec, cigar.numCigarElements() - 2);
// last cigar element
final CigarLocatable cigarS = new CigarLocatable(refContig, rec, cigar.numCigarElements() - 1);
// current exon
final ExonOne exonLeft = new ExonOne(knownGene.getExon(exonIndex));
if (!cigarM.overlaps(exonLeft))
continue;
if (!(exonLeft.getEnd() <= cigarM.getEnd()))
continue;
// get next exon
final ExonOne exonRight = new ExonOne(knownGene.getExon(exonIndex + 1));
if (exonRight.getLengthOnReference() < this.minCigarSize)
continue;
/* end of cigar 'M' can have same bases than the next exon. */
final int malus = cigarM.getEnd() - exonLeft.getEnd();
int genomic1 = exonRight.getStart() + malus;
if (genomic1 < exonRight.getStart() || genomic1 > exonRight.getEnd()) {
continue;
}
int matchLength = (this.use_malus ? malus : 0);
int readIdx0 = 0;
// loop over sequence
while (readIdx0 < cigarS.size() && genomic1 <= exonRight.getEnd()) {
final char read_base = cigarS.readBaseAt0(readIdx0);
final char genome_base = exonRight.charAt1(genomic1);
if (read_base != genome_base) {
break;
}
readIdx0++;
matchLength++;
genomic1++;
}
if (matchLength < this._priv_ignoreCigarSize)
continue;
// find match or create new
final KnownGene.Intron intron = knownGene.getIntron(exonIndex);
final Match match = new Match(intron, sampleName, rec, SIDE_5_PRIME, matchLength);
this.intronBuffer.add(match);
save_read_to_bam = true;
}
}
}
}
// end for side
if (save_read_to_bam && sfw != null)
sfw.addAlignment(rec);
}
/* dump buffer */
dump(vcw, null);
progress.close();
vcw.close();
iter.close();
iter = null;
sr.close();
sr = null;
this.saveInsertionsPw.flush();
this.saveInsertionsPw.close();
this.saveInsertionsPw = null;
if (sfw != null) {
sfw.close();
sfw = null;
}
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(sr);
CloserUtil.close(vcw0);
CloserUtil.close(sfw);
CloserUtil.close(this.indexedFastaSequenceFile);
CloserUtil.close(this.saveInsertionsPw);
}
}
Aggregations