use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
the class CompareBamAndBuild method doWork.
@Override
public int doWork(final List<String> args) {
PrintWriter out = null;
SortingCollection<Match> database = null;
if (this.chainFile == null) {
LOG.error("Chain file is not defined Option");
return -1;
}
if (args.size() != 2) {
LOG.error("Illegal number of arguments. Expected two indexed BAMS.");
return -1;
}
try {
LOG.info("load chain file");
this.liftOver = new LiftOver(this.chainFile);
database = SortingCollection.newInstance(Match.class, new MatchCodec(), new MatchOrdererInSortingCollection(), this.maxRecordsInRam, this.tmpDir.toPath());
database.setDestructiveIteration(true);
for (int currentSamFileIndex = 0; currentSamFileIndex < 2; currentSamFileIndex++) {
final File samFile = new File(args.get(currentSamFileIndex));
LOG.info("read " + samFile);
this.bamFiles[currentSamFileIndex] = samFile;
SamReader samFileReader = CompareBamAndBuild.this.openSamReader(samFile.getPath());
final SAMSequenceDictionary dict = samFileReader.getFileHeader().getSequenceDictionary();
this.sequenceDictionaries[currentSamFileIndex] = dict;
if (dict.isEmpty()) {
samFileReader.close();
LOG.error("Empty Dict in " + samFile);
return -1;
}
final Interval interval;
if (REGION != null) {
IntervalParser intervalParser = new IntervalParser(dict);
interval = intervalParser.parse(this.REGION);
if (interval == null) {
samFileReader.close();
LOG.error("Cannot parse " + REGION + " (bad syntax or not in dictionary");
return -1;
}
} else {
interval = null;
}
final SAMRecordIterator iter;
if (interval == null) {
iter = samFileReader.iterator();
} else {
iter = samFileReader.queryOverlapping(interval.getContig(), interval.getStart(), interval.getEnd());
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
if (rec.isSecondaryOrSupplementary())
continue;
final Match m = new Match();
m.flag = rec.getFlags();
m.readName = rec.getReadName();
m.firstBamFile = currentSamFileIndex == 0;
if (rec.getReadUnmappedFlag()) {
m.tid = -1;
m.pos = -1;
} else {
m.tid = rec.getReferenceIndex();
m.pos = rec.getAlignmentStart();
}
database.add(m);
}
iter.close();
samFileReader.close();
samFileReader = null;
LOG.info("Close " + samFile);
}
database.doneAdding();
LOG.info("Writing results....");
out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
// compute the differences for each read
out.print("#READ-Name\tCOMPARE");
for (final File f : this.bamFiles) {
out.print("\t" + f);
}
out.println();
/* create an array of set<Match> */
final MatchComparator match_comparator = new MatchComparator();
final List<Set<Match>> matches = new ArrayList<Set<CompareBamAndBuild.Match>>(2);
while (matches.size() < 2) {
matches.add(new TreeSet<CompareBamAndBuild.Match>(match_comparator));
}
CloseableIterator<Match> iter = database.iterator();
String currReadName = null;
int curr_num_in_pair = -1;
for (; ; ) {
final Match nextMatch;
if (iter.hasNext()) {
nextMatch = iter.next();
} else {
nextMatch = null;
}
if (nextMatch == null || (currReadName != null && !currReadName.equals(nextMatch.readName)) || (curr_num_in_pair != -1 && curr_num_in_pair != nextMatch.indexInPair())) {
if (currReadName != null) {
out.print(currReadName);
if (curr_num_in_pair > 0) {
out.print("/");
out.print(curr_num_in_pair);
}
out.print("\t");
if (same(matches.get(0), matches.get(1))) {
out.print("EQ");
} else {
out.print("NE");
}
for (int x = 0; x < 2; ++x) {
out.print("\t");
print(out, matches.get(x));
}
out.println();
}
if (nextMatch == null)
break;
for (final Set<Match> set : matches) set.clear();
}
currReadName = nextMatch.readName;
curr_num_in_pair = nextMatch.indexInPair();
matches.get(nextMatch.firstBamFile ? 0 : 1).add(nextMatch);
}
iter.close();
out.flush();
out.close();
database.cleanup();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
this.liftOver = null;
}
}
use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
the class CompareBams method doWork.
@Override
public int doWork(final List<String> args) {
this.IN.addAll(args.stream().map(S -> new File(S)).collect(Collectors.toList()));
SortingCollection<Match> database = null;
SamReader samFileReader = null;
CloseableIterator<Match> iter = null;
try {
if (this.IN.size() < 2) {
LOG.error("Need more bams please");
return -1;
}
database = SortingCollection.newInstance(Match.class, new MatchCodec(), new MatchOrderer(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
this.samSequenceDictAreTheSame = true;
database.setDestructiveIteration(true);
for (int currentSamFileIndex = 0; currentSamFileIndex < this.IN.size(); currentSamFileIndex++) {
final File samFile = this.IN.get(currentSamFileIndex);
LOG.info("Opening " + samFile);
samFileReader = super.createSamReaderFactory().open(samFile);
final SAMSequenceDictionary dict = samFileReader.getFileHeader().getSequenceDictionary();
if (dict == null || dict.isEmpty()) {
LOG.error("Empty Dict in " + samFile);
return -1;
}
if (!this.sequenceDictionaries.isEmpty() && !SequenceUtil.areSequenceDictionariesEqual(this.sequenceDictionaries.get(0), dict)) {
this.samSequenceDictAreTheSame = false;
LOG.warn("FOOL !! THE SEQUENCE DICTIONARIES ARE **NOT** THE SAME. I will try to compare anyway but it will be slower.");
}
this.sequenceDictionaries.add(dict);
final Optional<Interval> interval;
if (REGION != null && !REGION.trim().isEmpty()) {
final IntervalParser dix = new IntervalParser(dict);
interval = Optional.ofNullable(dix.parse(REGION));
if (!interval.isPresent()) {
LOG.error("Cannot parse " + REGION + " (bad syntax or not in dictionary)");
return -1;
}
} else {
interval = Optional.empty();
}
SAMRecordIterator it = null;
if (!interval.isPresent()) {
it = samFileReader.iterator();
} else {
it = samFileReader.queryOverlapping(interval.get().getContig(), interval.get().getStart(), interval.get().getEnd());
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
while (it.hasNext()) {
final SAMRecord rec = progress.watch(it.next());
if (!rec.getReadUnmappedFlag()) {
if (rec.getMappingQuality() < this.min_mapq)
continue;
if (rec.isSecondaryOrSupplementary())
continue;
}
final Match m = new Match();
if (rec.getReadPairedFlag()) {
m.num_in_pair = (rec.getFirstOfPairFlag() ? 1 : 2);
} else {
m.num_in_pair = 0;
}
m.readName = rec.getReadName();
m.bamIndex = currentSamFileIndex;
m.flag = rec.getFlags();
m.cigar = rec.getCigarString();
if (m.cigar == null)
m.cigar = "";
if (rec.getReadUnmappedFlag()) {
m.tid = -1;
m.pos = -1;
} else {
m.tid = rec.getReferenceIndex();
m.pos = rec.getAlignmentStart();
}
database.add(m);
}
it.close();
samFileReader.close();
samFileReader = null;
LOG.info("Close " + samFile);
}
database.doneAdding();
LOG.info("Writing results....");
this.out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
// compute the differences for each read
this.out.print("#READ-Name\t");
for (int x = 0; x < this.IN.size(); ++x) {
for (int y = x + 1; y < this.IN.size(); ++y) {
if (!(x == 0 && y == 1))
this.out.print("|");
this.out.print(IN.get(x));
this.out.print(" ");
this.out.print(IN.get(y));
}
}
for (int x = 0; x < this.IN.size(); ++x) {
this.out.print("\t" + IN.get(x));
}
this.out.println();
/* create an array of set<Match> */
final MatchComparator match_comparator = new MatchComparator();
final List<Set<Match>> matches = new ArrayList<Set<CompareBams.Match>>(this.IN.size());
while (matches.size() < this.IN.size()) {
matches.add(new TreeSet<CompareBams.Match>(match_comparator));
}
iter = database.iterator();
String currReadName = null;
int curr_num_in_pair = -1;
for (; ; ) {
Match nextMatch = null;
if (iter.hasNext()) {
nextMatch = iter.next();
}
if (nextMatch == null || (currReadName != null && !currReadName.equals(nextMatch.readName)) || (curr_num_in_pair != -1 && curr_num_in_pair != nextMatch.num_in_pair)) {
if (currReadName != null) {
this.out.print(currReadName);
if (curr_num_in_pair > 0) {
this.out.print("/");
this.out.print(curr_num_in_pair);
}
this.out.print("\t");
for (int x = 0; x < this.IN.size(); ++x) {
final Set<Match> first = matches.get(x);
for (int y = x + 1; y < this.IN.size(); ++y) {
if (!(x == 0 && y == 1))
this.out.print("|");
Set<Match> second = matches.get(y);
if (same(first, second)) {
this.out.print("EQ");
} else {
this.out.print("NE");
}
}
}
for (int x = 0; x < this.IN.size(); ++x) {
this.out.print("\t");
print(matches.get(x), sequenceDictionaries.get(x));
}
this.out.println();
}
if (nextMatch == null)
break;
for (Set<Match> set : matches) set.clear();
}
currReadName = nextMatch.readName;
curr_num_in_pair = nextMatch.num_in_pair;
matches.get(nextMatch.bamIndex).add(nextMatch);
if (this.out.checkError())
break;
}
iter.close();
this.out.flush();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
if (database != null)
database.cleanup();
CloserUtil.close(samFileReader);
CloserUtil.close(this.out);
this.out = null;
}
}
use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
the class HaloplexParasite method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
SamReader samReader = null;
final List<Mutation> mutations = new ArrayList<>();
try {
final Set<File> bamFiles = Files.lines(this.bamList.toPath()).filter(T -> !(T.isEmpty() || T.startsWith("#"))).map(T -> new File(T)).collect(Collectors.toSet());
final VCFHeader header = new VCFHeader();
header.setSequenceDictionary(in.getHeader().getSequenceDictionary());
final VCFFilterHeaderLine filter = new VCFFilterHeaderLine("HALOPLEXPARASITE", "fails Parasite Haloplex Sequence");
final VCFInfoHeaderLine infoWord = new VCFInfoHeaderLine(filter.getID(), 1, VCFHeaderLineType.String, "Parasite Haloplex Sequence (Word|Count|Fraction)");
super.addMetaData(header);
out.writeHeader(header);
header.addMetaDataLine(filter);
header.addMetaDataLine(infoWord);
while (in.hasNext()) {
final VariantContext ctx = in.next();
final VariantContextBuilder vcb = new VariantContextBuilder(inputName, ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
if (!(ctx.isIndel() || ctx.isMixed())) {
out.add(vcb.make());
continue;
}
if (!vcb.getAlleles().stream().filter(A -> !(A.isSymbolic() || A.isNoCall() || A.length() < this.minClipSize)).findAny().isPresent()) {
out.add(vcb.make());
continue;
}
final Mutation mut = new Mutation(ctx);
mutations.add(mut);
}
final Counter<String> words = new Counter<>();
for (final File bamFile : bamFiles) {
LOG.info("Opening " + bamFile);
samReader = createSamReaderFactory().open(bamFile);
for (final Mutation mut : mutations) {
// words seen in that mutation : don't use a Counter
final Set<String> mutWords = new HashSet<>();
/* loop over reads overlapping that mutation */
final SAMRecordIterator sri = samReader.queryOverlapping(mut.contig, mut.start, mut.end);
while (sri.hasNext()) {
final SAMRecord rec = sri.next();
if (rec.getReadUnmappedFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
final Cigar cigar = rec.getCigar();
if (cigar.numCigarElements() == 1)
continue;
final byte[] bases = rec.getReadBases();
int refpos = rec.getUnclippedStart();
int readpos = 0;
/* scan cigar overlapping that mutation */
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
final int ref_end = refpos + (op.consumesReferenceBases() || op.isClipping() ? ce.getLength() : 0);
final int read_end = readpos + (op.consumesReadBases() ? ce.getLength() : 0);
/* check clip is large enough */
if (op.equals(CigarOperator.S) && ce.getLength() >= this.minClipSize) {
/* check clip overlap mutation */
if (!(ref_end < mut.start || refpos > mut.end)) {
/* break read of soft clip into words */
for (int x = 0; x + this.minClipSize <= ce.getLength(); ++x) {
final String substr = new String(bases, readpos + x, this.minClipSize);
if (!substr.contains("N")) {
final String revcomp = AcidNucleics.reverseComplement(substr);
mutWords.add(substr);
if (!revcomp.equals(substr))
mutWords.add(revcomp);
}
}
}
}
refpos = ref_end;
readpos = read_end;
}
}
sri.close();
for (final String w : mutWords) {
words.incr(w);
}
}
// end of for(mutations)
samReader.close();
samReader = null;
}
LOG.info("mutations:" + mutations.size());
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
for (final Mutation mut : mutations) {
progress.watch(mut.contig, mut.start);
final VariantContextBuilder vcb = new VariantContextBuilder(inputName, mut.contig, mut.start, mut.end, mut.alleles);
String worstString = null;
Double worstFraction = null;
final double totalWords = words.getTotal();
for (final Allele a : mut.alleles) {
if (a.isSymbolic() || a.isNoCall() || a.length() < this.minClipSize)
continue;
for (int x = 0; x + this.minClipSize <= a.length(); ++x) {
final String substr = new String(a.getBases(), x, this.minClipSize);
final long count = words.count(substr);
final double fraction = count / totalWords;
if (worstFraction == null || worstFraction < fraction) {
worstString = substr + "|" + count + "|" + fraction;
worstFraction = fraction;
}
}
}
if (worstString != null) {
vcb.attribute(infoWord.getID(), worstString);
}
if (worstFraction != null && worstFraction.doubleValue() >= this.tresholdFraction) {
vcb.filter(filter.getID());
}
out.add(vcb.make());
}
progress.finish();
return RETURN_OK;
} catch (Exception e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(samReader);
}
}
use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
the class Biostar90204 method doWork.
@Override
public int doWork(final List<String> args) {
if (this.suffix_length < 0) {
LOG.error("Bad value of suffix_length:" + this.suffix_length);
return -1;
}
if (this.record_per_file < 1L) {
LOG.error("Bad value of record_per_file:" + this.record_per_file);
return -1;
}
SAMFileWriter sfw = null;
SAMRecordIterator iter = null;
SamReader samFileReader = null;
PrintWriter manifest = new PrintWriter(new NullOuputStream());
try {
samFileReader = super.openSamReader(oneFileOrNull(args));
final SAMFileHeader header = samFileReader.getFileHeader();
int split_file_number = 0;
long nReads = 0L;
iter = samFileReader.iterator();
if (this.manifestFile != null) {
manifest.close();
manifest = new PrintWriter(manifestFile);
}
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (this.samRecordFilter.filterOut(rec))
continue;
++nReads;
if (sfw == null) {
split_file_number++;
final String pathname = (this.prefix.isEmpty() ? "" : this.prefix + ".") + String.format("%0" + suffix_length + "d", split_file_number) + ".bam";
final File out = new File(pathname);
manifest.write(pathname);
manifest.write("\t" + (nReads) + "\t");
final SAMFileHeader header2 = header.clone();
header2.addComment("SPLIT:" + split_file_number);
header2.addComment("SPLIT:Starting from Read" + nReads);
sfw = this.writingBamArgs.openSAMFileWriter(out, header2, true);
}
sfw.addAlignment(rec);
if (nReads % record_per_file == 0) {
sfw.close();
manifest.write((nReads) + "\n");
sfw = null;
}
}
if (sfw != null) {
sfw.close();
manifest.write((nReads) + "\n");
}
manifest.flush();
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(manifest);
CloserUtil.close(sfw);
CloserUtil.close(iter);
CloserUtil.close(samFileReader);
}
return 0;
}
use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
the class Biostar59647 method doWork.
@Override
public int doWork(final List<String> args) {
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
SamReader samFileReader = null;
PrintStream pout;
try {
GenomicSequence genomicSequence = null;
indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile);
samFileReader = null;
final String bamFile = oneFileOrNull(args);
samFileReader = super.openSamReader(bamFile);
if (!SequenceUtil.areSequenceDictionariesEqual(indexedFastaSequenceFile.getSequenceDictionary(), samFileReader.getFileHeader().getSequenceDictionary())) {
LOG.warning("Not the same sequence dictionaries");
}
final XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
pout = (outputFile == null ? stdout() : new PrintStream(this.outputFile));
final XMLStreamWriter w = xmlfactory.createXMLStreamWriter(pout, "UTF-8");
w.writeStartDocument("UTF-8", "1.0");
w.writeStartElement("sam");
w.writeAttribute("bam", (bamFile == null ? "stdin" : bamFile));
w.writeAttribute("ref", refFile.getPath());
w.writeComment(getProgramCommandLine());
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(samFileReader.getFileHeader().getSequenceDictionary());
final SAMRecordIterator iter = samFileReader.iterator();
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
progess.watch(rec);
final byte[] readbases = rec.getReadBases();
w.writeStartElement("read");
w.writeStartElement("name");
w.writeCharacters(rec.getReadName());
w.writeEndElement();
w.writeStartElement("sequence");
w.writeCharacters(new String(readbases));
w.writeEndElement();
w.writeStartElement("flags");
for (SAMFlag f : SAMFlag.values()) {
w.writeAttribute(f.name(), String.valueOf(f.isSet(rec.getFlags())));
}
w.writeCharacters(String.valueOf(rec.getFlags()));
// flags
w.writeEndElement();
if (!rec.getReadUnmappedFlag()) {
w.writeStartElement("qual");
w.writeCharacters(String.valueOf(rec.getMappingQuality()));
w.writeEndElement();
w.writeStartElement("chrom");
w.writeAttribute("index", String.valueOf(rec.getReferenceIndex()));
w.writeCharacters(rec.getReferenceName());
w.writeEndElement();
w.writeStartElement("pos");
w.writeCharacters(String.valueOf(rec.getAlignmentStart()));
w.writeEndElement();
w.writeStartElement("cigar");
w.writeCharacters(rec.getCigarString());
w.writeEndElement();
}
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
w.writeStartElement("mate-chrom");
w.writeAttribute("index", String.valueOf(rec.getMateReferenceIndex()));
w.writeCharacters(rec.getMateReferenceName());
w.writeEndElement();
w.writeStartElement("mate-pos");
w.writeCharacters(String.valueOf(rec.getMateAlignmentStart()));
w.writeEndElement();
}
if (!rec.getReadUnmappedFlag() && rec.getCigar() != null) {
if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
w.writeStartElement("align");
int readIndex = 0;
int refIndex = rec.getAlignmentStart();
for (final CigarElement e : rec.getCigar().getCigarElements()) {
switch(e.getOperator()) {
// ignore hard clips
case H:
break;
// ignore pads
case P:
break;
// cont.
case I:
case S:
{
final int length = e.getLength();
for (int i = 0; i < length; ++i) {
w.writeEmptyElement(e.getOperator().name());
w.writeAttribute("read-index", String.valueOf(readIndex + 1));
if (readIndex >= 0 && readIndex < readbases.length) {
w.writeAttribute("read-base", String.valueOf((char) (readbases[readIndex])));
}
readIndex++;
}
break;
}
// cont. -- reference skip
case N:
case D:
{
final int length = e.getLength();
for (int i = 0; i < length; ++i) {
w.writeEmptyElement(e.getOperator().name());
w.writeAttribute("ref-index", String.valueOf(refIndex));
if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
w.writeAttribute("ref-base", String.valueOf(genomicSequence.charAt(refIndex - 1)));
}
refIndex++;
}
break;
}
case M:
case EQ:
case X:
{
final int length = e.getLength();
for (int i = 0; i < length; ++i) {
w.writeEmptyElement(e.getOperator().name());
char baseRead = '\0';
if (readIndex >= 0 && readIndex < readbases.length) {
baseRead = (char) (rec.getReadBases()[readIndex]);
w.writeAttribute("read-index", String.valueOf(readIndex + 1));
w.writeAttribute("read-base", String.valueOf(baseRead));
}
w.writeAttribute("ref-index", String.valueOf(refIndex));
if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
char baseRef = genomicSequence.charAt(refIndex - 1);
w.writeAttribute("ref-base", String.valueOf(baseRef));
if (Character.toUpperCase(baseRef) != Character.toUpperCase(baseRead)) {
w.writeAttribute("mismatch", "true");
}
}
refIndex++;
readIndex++;
}
break;
}
default:
throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
}
}
w.writeEndElement();
}
w.writeEndElement();
}
iter.close();
w.writeEndElement();
w.writeEndDocument();
w.flush();
pout.flush();
CloserUtil.close(w);
CloserUtil.close(pout);
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(samFileReader);
CloserUtil.close(indexedFastaSequenceFile);
}
return 0;
}
Aggregations