use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.
the class BamToSVG method printSamRecord.
private void printSamRecord(XMLStreamWriter w, final SAMRecord record, final Map<Integer, Counter<Character>> consensus) throws XMLStreamException {
double mid_y = this.featureHeight / 2.0;
final double y_h95 = this.featureHeight * 0.90;
final double y_top5 = (this.featureHeight - y_h95) / 2.0;
final double y_bot5 = y_top5 + y_h95;
final double arrow_w = this.featureHeight / 3.0;
w.writeStartElement(SVG.NS, "g");
String title = record.getReadName();
w.writeAttribute("title", title);
/* print that sam record */
final int unclipped_start = record.getUnclippedStart();
Cigar cigar = record.getCigar();
if (cigar == null)
return;
byte[] bases = record.getReadBases();
if (bases == null)
return;
byte[] qualities = record.getBaseQualities();
if (qualities == null)
return;
int readPos = 0;
Map<Integer, String> pos2insertions = new HashMap<Integer, String>();
List<CigarElement> cigarElements = cigar.getCigarElements();
/* find position of arrow */
int arrow_cigar_index = -1;
for (int cidx = 0; cidx < cigarElements.size(); cidx++) {
CigarElement ce = cigarElements.get(cidx);
CigarOperator op = ce.getOperator();
switch(op) {
// threw
case H:
// threw
case S:
if (!this.showClipping)
break;
case M:
case EQ:
case X:
{
arrow_cigar_index = cidx;
}
default:
break;
}
if (record.getReadNegativeStrandFlag() && arrow_cigar_index != -1) {
break;
}
}
int refPos = unclipped_start;
/* loop over cigar string */
for (int cidx = 0; cidx < cigarElements.size(); cidx++) {
CigarElement ce = cigarElements.get(cidx);
CigarOperator op = ce.getOperator();
boolean in_clip = false;
switch(ce.getOperator()) {
case D:
case N:
{
int c_start = trim(refPos);
int c_end = trim(refPos + ce.getLength());
if (c_start < c_end) {
w.writeEmptyElement("line");
w.writeAttribute("class", "indel");
w.writeAttribute("title", op.name() + ce.getLength());
w.writeAttribute("x1", format(baseToPixel(c_start)));
w.writeAttribute("x2", format(baseToPixel(c_end)));
w.writeAttribute("y1", format(mid_y));
w.writeAttribute("y2", format(mid_y));
}
refPos += ce.getLength();
break;
}
case I:
{
StringBuilder sb = new StringBuilder();
for (int i = 0; i < ce.getLength(); ++i) {
sb.append((char) bases[readPos++]);
}
pos2insertions.put(refPos, sb.toString());
break;
}
case H:
{
if (!this.showClipping) {
refPos += ce.getLength();
break;
}
in_clip = true;
// NO break;
}
case S:
{
if (!this.showClipping) {
readPos += ce.getLength();
refPos += ce.getLength();
break;
}
in_clip = true;
// NO break;
}
case X:
case EQ:
case M:
{
int match_start = refPos;
int match_end = refPos + ce.getLength();
// print sam background
StringBuilder sb = new StringBuilder();
if (record.getReadNegativeStrandFlag() && match_start >= this.interval.start && match_start <= this.interval.end && cidx == arrow_cigar_index) {
sb.append(" M ").append(format(baseToPixel(match_start) + arrow_w)).append(',').append(y_top5);
sb.append(" h ").append(format(baseToPixel(trim(match_end)) - baseToPixel(match_start) - arrow_w));
sb.append(" v ").append(format(y_h95));
sb.append(" h ").append(format(-(baseToPixel(trim(match_end)) - baseToPixel(match_start) - arrow_w)));
sb.append(" L ").append(format(baseToPixel(match_start))).append(',').append(mid_y);
sb.append(" Z");
} else if (!record.getReadNegativeStrandFlag() && match_end >= this.interval.start && match_end <= this.interval.end && cidx == arrow_cigar_index) {
sb.append(" M ").append(format(baseToPixel(match_end) - arrow_w)).append(',').append(y_top5);
sb.append(" h ").append(format(-(baseToPixel(match_end) - baseToPixel(trim(match_start)) - arrow_w)));
sb.append(" v ").append(format(y_h95));
sb.append(" h ").append(format(baseToPixel(match_end) - baseToPixel(trim(match_start)) - arrow_w));
sb.append(" L ").append(format(baseToPixel(match_end))).append(',').append(mid_y);
sb.append(" Z");
} else {
sb.append(" M ").append(format(baseToPixel(trim(match_start)))).append(',').append(y_top5);
sb.append(" h ").append(format(baseToPixel(trim(match_end)) - baseToPixel(trim(match_start))));
sb.append(" v ").append(format(y_h95));
sb.append(" h ").append(format(-(baseToPixel(trim(match_end)) - baseToPixel(trim(match_start)))));
sb.append(" Z");
}
w.writeEmptyElement("path");
w.writeAttribute("d", sb.toString().trim());
if (ce.getOperator() == CigarOperator.H || ce.getOperator() == CigarOperator.S) {
w.writeAttribute("style", "fill:yellow;");
} else {
w.writeAttribute("style", "fill:url(#f" + record.getFlags() + ");");
}
if (op.consumesReadBases()) {
for (int i = 0; i < ce.getLength(); ++i) {
char ca = (char) bases[readPos];
if (!in_clip) {
Counter<Character> count = consensus.get(refPos + i);
if (count == null) {
count = new Counter<>();
consensus.put(refPos + i, count);
}
count.incr(ca);
}
char cb = 'N';
if (genomicSequence != null) {
cb = Character.toUpperCase(genomicSequence.charAt(refPos + i - 1));
}
if (cb != 'N' && ca != 'N' && cb != ca && !in_clip) {
w.writeEmptyElement("use");
w.writeAttribute("x", format(baseToPixel(refPos + i)));
// w.writeAttribute("y",format(y_top));never needed
w.writeAttribute("xlink", XLINK.NS, "href", "#mut");
}
if (this.interval.contains(refPos + i) && this.featureWidth > 5) {
// int qual = qualities[readPos];
w.writeEmptyElement("use");
w.writeAttribute("x", format(baseToPixel(refPos + i)));
// w.writeAttribute("y",format(y_top));never needed
w.writeAttribute("xlink", XLINK.NS, "href", "#b" + ca);
}
readPos++;
}
}
refPos += ce.getLength();
break;
}
default:
{
throw new RuntimeException("Unknown SAM op: " + ce.getOperator());
}
}
}
for (Integer pos : pos2insertions.keySet()) {
if (pos < this.interval.start)
continue;
if (pos > this.interval.end)
continue;
String insertion = pos2insertions.get(pos);
w.writeEmptyElement("line");
double x = baseToPixel(pos);
w.writeAttribute("title", "Insertion " + insertion);
w.writeAttribute("class", "insert");
w.writeAttribute("x1", format(x));
w.writeAttribute("x2", format(x));
w.writeAttribute("y1", format(y_top5));
w.writeAttribute("y2", format(y_bot5));
}
// g
w.writeEndElement();
}
use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.
the class BamToSVG method printSample.
private void printSample(XMLStreamWriter w, double top_y, final Sample sample) throws XMLStreamException {
w.writeComment("START SAMPLE: " + sample.name);
w.writeStartElement(SVG.NS, "g");
w.writeAttribute("transform", "translate(0," + top_y + ")");
double y = 0;
/* write title */
w.writeEmptyElement("path");
w.writeAttribute("class", "samplename");
w.writeAttribute("title", sample.name);
w.writeAttribute("d", this.hershey.svgPath(sample.name, scaleRect(new Rectangle2D.Double(0, y, this.drawinAreaWidth, HEIGHT_SAMPLE_NAME))));
y += HEIGHT_SAMPLE_NAME;
/* write REFERENCE */
w.writeComment("REFERENCE");
for (int pos = this.interval.start; // ignore if too small
this.featureWidth > 5 && pos <= this.interval.end; ++pos) {
char c = (this.genomicSequence == null ? 'N' : this.genomicSequence.charAt(pos - 1));
double x0 = baseToPixel(pos);
w.writeEmptyElement("use");
w.writeAttribute("x", format(x0));
w.writeAttribute("y", format(y));
w.writeAttribute("xlink", XLINK.NS, "href", "#b" + c);
}
y += featureHeight;
/* skip line for later consensus */
double consensus_y = y;
y += featureHeight;
Map<Integer, Counter<Character>> pos2consensus = new HashMap<Integer, Counter<Character>>();
/* print variants */
if (!pos2variant.isEmpty()) {
for (VariantContext ctx : this.pos2variant) {
Genotype g = ctx.getGenotype(sample.name);
if (g == null || !g.isCalled() || g.isHomRef())
continue;
if (ctx.getEnd() < this.interval.start)
continue;
if (ctx.getStart() > this.interval.end)
continue;
double x0 = baseToPixel(ctx.getStart());
w.writeEmptyElement("use");
w.writeAttribute("x", format(x0));
w.writeAttribute("y", format(y));
w.writeAttribute("title", ctx.getAltAlleleWithHighestAlleleCount().getDisplayString());
if (g.isHomVar()) {
w.writeAttribute("xlink", XLINK.NS, "href", "#homvar");
} else if (g.isHet()) {
w.writeAttribute("xlink", XLINK.NS, "href", "#het");
}
}
y += featureHeight;
}
/* print all lines */
w.writeComment("Alignments");
for (int nLine = 0; nLine < sample.lines.size(); ++nLine) {
w.writeStartElement("g");
w.writeAttribute("transform", "translate(0," + format(y + nLine * featureHeight) + ")");
List<SAMRecord> line = sample.lines.get(nLine);
// loop over records on that line
for (SAMRecord record : line) {
printSamRecord(w, record, pos2consensus);
}
// g
w.writeEndElement();
}
/* write consensus */
w.writeComment("Consensus");
for (int pos = this.interval.start; // ignore if too small
this.featureWidth > 5 && pos <= this.interval.end; ++pos) {
Counter<Character> cons = pos2consensus.get(pos);
if (cons == null)
continue;
char c = cons.getMostFrequent();
double x0 = baseToPixel(pos);
w.writeEmptyElement("use");
w.writeAttribute("x", format(x0));
w.writeAttribute("y", format(consensus_y));
w.writeAttribute("xlink", XLINK.NS, "href", "#b" + c);
}
/* surrounding frame for that sample */
w.writeEmptyElement("rect");
w.writeAttribute("class", "frame");
w.writeAttribute("x", format(0));
w.writeAttribute("y", format(0));
w.writeAttribute("width", format(drawinAreaWidth));
w.writeAttribute("height", format(sample.getHeight()));
// g for sample
w.writeEndElement();
w.writeComment("END SAMPLE: " + sample.name);
}
use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.
the class BamStats02 method run.
private void run(String filename, SamReader r, PrintWriter out) {
final Counter<Category> counter = new Counter<>();
SAMRecordIterator iter = null;
try {
iter = r.iterator();
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(r.getFileHeader().getSequenceDictionary());
while (iter.hasNext()) {
final SAMRecord record = progress.watch(iter.next());
final Category cat = new Category();
cat.set(STRING_PROPS.filename, filename);
cat.flag = record.getFlags();
cat.set(INT_PROPS.inTarget, -1);
final SAMReadGroupRecord g = record.getReadGroup();
if (g != null) {
cat.set(STRING_PROPS.samplename, g.getSample());
cat.set(STRING_PROPS.platform, g.getPlatform());
cat.set(STRING_PROPS.platformUnit, g.getPlatformUnit());
cat.set(STRING_PROPS.library, g.getLibrary());
}
final ShortReadName readName = ShortReadName.parse(record);
if (readName.isValid()) {
cat.set(STRING_PROPS.instrument, readName.getInstrumentName());
cat.set(STRING_PROPS.flowcell, readName.getFlowCellId());
cat.set(INT_PROPS.lane, readName.getFlowCellLane());
cat.set(INT_PROPS.run, readName.getRunId());
}
if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) {
cat.set(STRING_PROPS.mate_chromosome, record.getMateReferenceName());
}
if (!record.getReadUnmappedFlag()) {
cat.set(INT_PROPS.mapq, (int) (Math.ceil(record.getMappingQuality() / 10.0) * 10));
cat.set(STRING_PROPS.chromosome, record.getReferenceName());
if (this.intervals != null) {
if (this.intervals.containsOverlapping(new Interval(record.getReferenceName(), record.getAlignmentStart(), record.getAlignmentEnd()))) {
cat.set(INT_PROPS.inTarget, 1);
} else {
cat.set(INT_PROPS.inTarget, 0);
}
}
}
counter.incr(cat);
}
progress.finish();
for (final Category cat : counter.keySetDecreasing()) {
cat.print(out);
out.print("\t");
out.print(counter.count(cat));
out.println();
}
out.flush();
} finally {
CloserUtil.close(iter);
}
}
use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.
the class Biostar154220 method doWork.
@SuppressWarnings("resource")
private int doWork(final SamReader in) throws IOException {
SAMFileHeader header = in.getFileHeader();
if (header.getSortOrder() != SAMFileHeader.SortOrder.unsorted) {
LOG.error("input should be unsorted, reads sorted on REF/query-name e.g: see https://github.com/lindenb/jvarkit/wiki/SortSamRefName");
return -1;
}
SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict == null) {
LOG.error("no dict !");
return -1;
}
SAMFileWriter out = null;
SAMRecordIterator iter = null;
int prev_tid = -1;
int[] depth_array = null;
try {
SAMFileHeader header2 = header.clone();
header2.addComment("Biostar154220" + " " + getVersion() + " " + getProgramCommandLine());
out = this.writingBams.openSAMFileWriter(outputFile, header2, true);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
iter = in.iterator();
List<SAMRecord> buffer = new ArrayList<>();
for (; ; ) {
SAMRecord rec = null;
if (iter.hasNext()) {
rec = progress.watch(iter.next());
}
if (rec != null && rec.getReadUnmappedFlag()) {
out.addAlignment(rec);
continue;
}
// no more record or
if (!buffer.isEmpty() && rec != null && buffer.get(0).getReadName().equals(rec.getReadName()) && buffer.get(0).getReferenceIndex().equals(rec.getReferenceIndex())) {
buffer.add(progress.watch(rec));
} else if (buffer.isEmpty() && rec != null) {
buffer.add(progress.watch(rec));
} else // dump buffer
{
if (!buffer.isEmpty()) {
final int tid = buffer.get(0).getReferenceIndex();
if (prev_tid == -1 || prev_tid != tid) {
SAMSequenceRecord ssr = dict.getSequence(tid);
prev_tid = tid;
depth_array = null;
System.gc();
LOG.info("Alloc memory for contig " + ssr.getSequenceName() + " N=" + ssr.getSequenceLength() + "*sizeof(int)");
// use a +1 pos
depth_array = new int[ssr.getSequenceLength() + 1];
Arrays.fill(depth_array, 0);
}
// position->coverage for this set of reads
Counter<Integer> readposition2coverage = new Counter<Integer>();
boolean dump_this_buffer = true;
for (final SAMRecord sr : buffer) {
if (!dump_this_buffer)
break;
if (this.filter.filterOut(sr))
continue;
final Cigar cigar = sr.getCigar();
if (cigar == null) {
throw new IOException("Cigar missing in " + rec.getSAMString());
}
int refPos1 = sr.getAlignmentStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
if (!op.consumesReferenceBases())
continue;
if (op.consumesReadBases()) {
for (int x = 0; x < ce.getLength() && refPos1 + x < depth_array.length; ++x) {
int cov = (int) readposition2coverage.incr(refPos1 + x);
if (depth_array[refPos1 + x] + cov > this.capDepth) {
dump_this_buffer = false;
break;
}
}
}
if (!dump_this_buffer)
break;
refPos1 += ce.getLength();
}
}
if (dump_this_buffer) {
// consumme this coverage
for (Integer pos : readposition2coverage.keySet()) {
depth_array[pos] += (int) readposition2coverage.count(pos);
}
for (SAMRecord sr : buffer) {
out.addAlignment(sr);
}
}
buffer.clear();
}
if (rec == null)
break;
buffer.add(rec);
}
}
depth_array = null;
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(out);
}
}
use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.
the class HaloplexParasite method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
SamReader samReader = null;
final List<Mutation> mutations = new ArrayList<>();
try {
final Set<File> bamFiles = Files.lines(this.bamList.toPath()).filter(T -> !(T.isEmpty() || T.startsWith("#"))).map(T -> new File(T)).collect(Collectors.toSet());
final VCFHeader header = new VCFHeader();
header.setSequenceDictionary(in.getHeader().getSequenceDictionary());
final VCFFilterHeaderLine filter = new VCFFilterHeaderLine("HALOPLEXPARASITE", "fails Parasite Haloplex Sequence");
final VCFInfoHeaderLine infoWord = new VCFInfoHeaderLine(filter.getID(), 1, VCFHeaderLineType.String, "Parasite Haloplex Sequence (Word|Count|Fraction)");
super.addMetaData(header);
out.writeHeader(header);
header.addMetaDataLine(filter);
header.addMetaDataLine(infoWord);
while (in.hasNext()) {
final VariantContext ctx = in.next();
final VariantContextBuilder vcb = new VariantContextBuilder(inputName, ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
if (!(ctx.isIndel() || ctx.isMixed())) {
out.add(vcb.make());
continue;
}
if (!vcb.getAlleles().stream().filter(A -> !(A.isSymbolic() || A.isNoCall() || A.length() < this.minClipSize)).findAny().isPresent()) {
out.add(vcb.make());
continue;
}
final Mutation mut = new Mutation(ctx);
mutations.add(mut);
}
final Counter<String> words = new Counter<>();
for (final File bamFile : bamFiles) {
LOG.info("Opening " + bamFile);
samReader = createSamReaderFactory().open(bamFile);
for (final Mutation mut : mutations) {
// words seen in that mutation : don't use a Counter
final Set<String> mutWords = new HashSet<>();
/* loop over reads overlapping that mutation */
final SAMRecordIterator sri = samReader.queryOverlapping(mut.contig, mut.start, mut.end);
while (sri.hasNext()) {
final SAMRecord rec = sri.next();
if (rec.getReadUnmappedFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
final Cigar cigar = rec.getCigar();
if (cigar.numCigarElements() == 1)
continue;
final byte[] bases = rec.getReadBases();
int refpos = rec.getUnclippedStart();
int readpos = 0;
/* scan cigar overlapping that mutation */
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
final int ref_end = refpos + (op.consumesReferenceBases() || op.isClipping() ? ce.getLength() : 0);
final int read_end = readpos + (op.consumesReadBases() ? ce.getLength() : 0);
/* check clip is large enough */
if (op.equals(CigarOperator.S) && ce.getLength() >= this.minClipSize) {
/* check clip overlap mutation */
if (!(ref_end < mut.start || refpos > mut.end)) {
/* break read of soft clip into words */
for (int x = 0; x + this.minClipSize <= ce.getLength(); ++x) {
final String substr = new String(bases, readpos + x, this.minClipSize);
if (!substr.contains("N")) {
final String revcomp = AcidNucleics.reverseComplement(substr);
mutWords.add(substr);
if (!revcomp.equals(substr))
mutWords.add(revcomp);
}
}
}
}
refpos = ref_end;
readpos = read_end;
}
}
sri.close();
for (final String w : mutWords) {
words.incr(w);
}
}
// end of for(mutations)
samReader.close();
samReader = null;
}
LOG.info("mutations:" + mutations.size());
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
for (final Mutation mut : mutations) {
progress.watch(mut.contig, mut.start);
final VariantContextBuilder vcb = new VariantContextBuilder(inputName, mut.contig, mut.start, mut.end, mut.alleles);
String worstString = null;
Double worstFraction = null;
final double totalWords = words.getTotal();
for (final Allele a : mut.alleles) {
if (a.isSymbolic() || a.isNoCall() || a.length() < this.minClipSize)
continue;
for (int x = 0; x + this.minClipSize <= a.length(); ++x) {
final String substr = new String(a.getBases(), x, this.minClipSize);
final long count = words.count(substr);
final double fraction = count / totalWords;
if (worstFraction == null || worstFraction < fraction) {
worstString = substr + "|" + count + "|" + fraction;
worstFraction = fraction;
}
}
}
if (worstString != null) {
vcb.attribute(infoWord.getID(), worstString);
}
if (worstFraction != null && worstFraction.doubleValue() >= this.tresholdFraction) {
vcb.filter(filter.getID());
}
out.add(vcb.make());
}
progress.finish();
return RETURN_OK;
} catch (Exception e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(samReader);
}
}
Aggregations