use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfGeneEpistasis method doWork.
@Override
public int doWork(final List<String> args) {
if (this.geneBed == null) {
LOG.error("gene file bed undefined");
return -1;
}
if (this.outputFile == null) {
LOG.error("output file undefined");
return -1;
}
CloseableIterator<VariantContext> iter = null;
try {
final File vcfFile = new File(oneAndOnlyOneFile(args));
this.vcfFileReader = new VCFFileReader(vcfFile, true);
final VCFHeader header = this.vcfFileReader.getFileHeader();
final Pedigree pedigree;
if (this.pedigreeFile != null) {
pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
} else {
pedigree = new Pedigree.Parser().parse(header);
}
if (pedigree == null || pedigree.isEmpty() || !pedigree.hasAffected() || !pedigree.hasUnaffected()) {
LOG.error("empty ped or no case/ctrl");
return -1;
}
pedigree.verifyPersonsHaveUniqueNames();
for (final Pedigree.Person p : pedigree.getPersons().stream().filter(P -> P.isAffected() || P.isUnaffected()).filter(P -> header.getSampleNamesInOrder().contains(P.getId())).collect(Collectors.toSet())) {
this.id2samples.put(p.getId(), p);
}
this.vcfTools = new VcfTools(header);
List<Interval> geneList;
if (!this.geneBed.exists()) {
final Map<String, Interval> gene2interval = new HashMap<>(50000);
LOG.info("building gene file" + this.geneBed);
iter = this.vcfFileReader.iterator();
// iter = this.vcfFileReader.query("chr3",1,300_000_000);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
while (iter.hasNext()) {
final VariantContext ctx = progress.watch(iter.next());
if (!accept(ctx))
continue;
for (final String geneName : getGenes(ctx)) {
final Interval old = gene2interval.get(geneName);
if (old == null) {
gene2interval.put(geneName, new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, geneName));
LOG.info("adding " + geneName + ". number of genes: " + gene2interval.size());
} else if (!old.getContig().equals(ctx.getContig())) {
LOG.error("boum :" + geneName + ": on chrom " + ctx.getContig() + " vs " + old);
return -1;
} else {
gene2interval.put(geneName, new Interval(ctx.getContig(), Math.min(ctx.getStart(), old.getStart()), Math.max(ctx.getEnd(), old.getEnd()), false, geneName));
}
}
}
iter.close();
iter = null;
progress.finish();
geneList = new ArrayList<>(gene2interval.values());
PrintWriter pw = new PrintWriter(this.geneBed);
for (final Interval g : geneList) {
pw.println(g.getContig() + "\t" + (g.getStart() - 1) + "\t" + (g.getEnd()) + "\t" + g.getName());
}
pw.flush();
pw.close();
pw = null;
} else {
BedLineCodec codec = new BedLineCodec();
BufferedReader r = IOUtil.openFileForBufferedReading(geneBed);
geneList = r.lines().map(L -> codec.decode(L)).filter(B -> B != null).map(B -> new Interval(B.getContig(), B.getStart(), B.getEnd(), true, B.get(3))).collect(Collectors.toList());
r.close();
}
if (geneList.isEmpty()) {
LOG.error("gene List is empty");
return -1;
}
final Comparator<VariantContext> ctxSorter = VCFUtils.createTidPosRefComparator(header.getSequenceDictionary());
final Function<Interval, List<VariantContext>> loadVariants = (R) -> {
List<VariantContext> L = new ArrayList<>();
CloseableIterator<VariantContext> r = this.vcfFileReader.query(R.getContig(), R.getStart(), R.getEnd());
while (r.hasNext()) {
final VariantContext ctx = r.next();
if (!accept(ctx))
continue;
if (!getGenes(ctx).contains(R.getName()))
continue;
L.add(ctx);
}
r.close();
return L;
};
final SkatExecutor executor = this.skatFactory.build();
Double bestSkat = null;
LOG.info("number of genes : " + geneList.size());
final int list_end_index = (this.user_end_index < 0 ? geneList.size() : Math.min(geneList.size(), this.user_end_index));
for (int x = this.user_begin_index; x < list_end_index; ++x) {
final Interval intervalx = geneList.get(x);
final List<VariantContext> variantsx = loadVariants.apply(intervalx);
if (variantsx.isEmpty())
continue;
for (int y = x; y < geneList.size(); /* pas list_end_index */
++y) {
final Interval intervaly;
final List<VariantContext> merge;
if (y == x) {
// we-re testing gene 1 only
intervaly = intervalx;
merge = variantsx;
} else {
intervaly = geneList.get(y);
if (intervaly.intersects(intervalx))
continue;
final List<VariantContext> variantsy = loadVariants.apply(intervaly);
if (variantsy.isEmpty())
continue;
merge = new MergedList<>(variantsx, variantsy);
}
LOG.info("testing : [" + x + "]" + intervalx + " [" + y + "]" + intervaly + " N:" + geneList.size() + " best: " + bestSkat);
final Double skat = eval(executor, merge);
if (skat == null)
continue;
if (bestSkat == null || skat.compareTo(bestSkat) < 0) {
bestSkat = skat;
LOG.info("best " + bestSkat + " " + intervalx + " " + intervaly);
if (this.outputFile.getName().endsWith(".vcf") || this.outputFile.getName().endsWith(".vcf.gz")) {
final VCFHeader header2 = new VCFHeader(header);
header2.addMetaDataLine(new VCFHeaderLine(VcfGeneEpistasis.class.getName(), intervalx.getName() + " " + intervaly.getName() + " " + bestSkat));
final VariantContextWriter w = VCFUtils.createVariantContextWriter(outputFile);
w.writeHeader(header2);
merge.stream().sorted(ctxSorter).forEach(V -> w.add(V));
w.close();
} else {
final PrintWriter w = super.openFileOrStdoutAsPrintWriter(outputFile);
w.println(String.valueOf(bestSkat) + "\t" + intervalx.getName() + "\t" + intervaly.getName());
w.flush();
w.close();
}
}
}
}
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(this.vcfFileReader);
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class GenScan method doWork.
@Override
public int doWork(final List<String> args) {
if (this.faidxFile == null) {
LOG.error("Reference missing");
return -1;
}
if (this.maxY <= this.minY) {
LOG.error("MaxY <= MinY");
return -1;
}
this.trackNames.stream().flatMap(S -> Arrays.stream(S.split("[ ,;]"))).filter(S -> !S.isEmpty()).collect(Collectors.toSet()).forEach(LABEL -> {
this.name2track.put(LABEL, new Track(LABEL));
});
/* no track specified, add a default one */
if (this.name2track.isEmpty()) {
/**
* define default track
*/
this.defaultTrack = new Track("");
this.name2track.put(this.defaultTrack.label, this.defaultTrack);
}
final DecimalFormat niceIntFormat = new DecimalFormat("###,###");
BufferedReader r = null;
try {
final Rectangle2D.Double drawingRect = new Rectangle2D.Double(this.insets.left, this.insets.top, WIDTH - (this.insets.left + this.insets.right), HEIGHT - (this.insets.top + this.insets.bottom));
final Style styleBase = new Style();
styleBase.update(this.defaultStyleStr);
this.styleStack.add(styleBase);
final double adjustedHeight = drawingRect.getHeight() - this.distanceBetweenContigs * (this.name2track.size() - 1);
double y = drawingRect.y;
for (final Track track : this.name2track.values()) {
track.startY = y;
track.height = (adjustedHeight / this.name2track.size());
y += this.distanceBetweenContigs;
y += track.height;
}
this.dict = SAMSequenceDictionaryExtractor.extractDictionary(this.faidxFile);
for (final String rgnstr : this.regions) {
if (rgnstr.trim().isEmpty())
continue;
final DisplayRange contig;
if (rgnstr.contains(":")) {
contig = new DisplayInterval(rgnstr);
} else {
contig = new DisplayContig(rgnstr);
}
if (contig.length() == 0)
continue;
this.displayRanges.add(contig);
}
if (this.displayRanges.isEmpty()) {
// no region, add whole genome
for (final SAMSequenceRecord ssr : this.dict.getSequences()) {
if (ssr.getSequenceLength() < this.removeContigsSmallerThan) {
LOG.warn("Ignoring " + ssr.getSequenceName() + " because length =" + ssr.getSequenceLength() + " < " + this.removeContigsSmallerThan);
continue;
}
this.displayRanges.add(new DisplayContig(ssr));
}
if (this.displayRanges.isEmpty()) {
LOG.error("No range to display");
return -1;
}
}
// sort on chrom/start
Collections.sort(this.displayRanges, (DR1, DR2) -> {
final int tid1 = dict.getSequenceIndex(DR1.getContig());
final int tid2 = dict.getSequenceIndex(DR2.getContig());
int i = tid1 - tid2;
if (i != 0)
return i;
i = DR1.getStart() - DR2.getStart();
if (i != 0)
return i;
i = DR1.getEnd() - DR2.getEnd();
return i;
});
this.genomeViewLength = this.displayRanges.stream().mapToLong(DR -> DR.length()).sum();
final double adjustedImgWidth = drawingRect.getWidth() - this.distanceBetweenContigs * (this.displayRanges.size() - 1);
if (adjustedImgWidth <= 0) {
LOG.error("with such settings image width would be empty");
return -1;
}
double x = this.insets.left;
for (final DisplayRange displayRange : this.displayRanges) {
displayRange.startX = x;
displayRange.width = (displayRange.length() / (double) genomeViewLength) * adjustedImgWidth;
x += displayRange.width;
x += this.distanceBetweenContigs;
}
final BufferedImage img = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB);
final Graphics2D g = img.createGraphics();
g.setRenderingHint(RenderingHints.KEY_RENDERING, RenderingHints.VALUE_RENDER_QUALITY);
g.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);
g.setColor(ALMOST_WHITE);
g.fillRect(0, 0, WIDTH, HEIGHT);
// plot background of each contig
for (int rangeIdx = 0; rangeIdx < this.displayRanges.size(); ++rangeIdx) {
final DisplayRange displayRange = this.displayRanges.get(rangeIdx);
int trackidx = 0;
for (final Track track : this.name2track.values()) {
final Rectangle2D rec = new Pane(displayRange, track).getBounds();
g.setColor(rangeIdx % 2 != trackidx % 2 ? ColorUtils.antiquewhite : ColorUtils.navajowhite);
g.fill(rec);
trackidx++;
}
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.dict);
r = super.openBufferedReader(oneFileOrNull(args));
String line;
while ((line = r.readLine()) != null) {
if (line.trim().isEmpty())
continue;
if (line.startsWith("#")) {
if (line.equals("#!push")) {
this.styleStack.push(new Style(this.styleStack.peek()));
} else if (line.equals("#!pop")) {
if (this.styleStack.size() == 1) {
LOG.error("Cannot pop bottom style");
return -1;
}
this.styleStack.pop();
} else if (line.startsWith("#!log")) {
LOG.info(line);
} else if (line.startsWith("#!")) {
this.styleStack.peek().update(line);
}
continue;
}
final Data data = parseData(line);
if (data == null)
continue;
progress.watch(data.getContig(), data.getStart());
if (data.getValue() < GenScan.this.minY) {
continue;
}
if (data.getValue() > GenScan.this.maxY) {
continue;
}
if (data.track == null)
continue;
if (data.style.getAlpha() <= 0f)
continue;
final Style style = data.getStyle();
for (final DisplayRange dr : GenScan.this.displayRanges) {
if (!dr.contains(data))
continue;
final Pane panel = new Pane(dr, data.track);
final Rectangle2D.Double bounds = panel.getBounds();
final Point2D.Double point = panel.convertToPoint(data);
final Shape oldClip = g.getClip();
final Composite oldComposite = g.getComposite();
g.setClip(bounds);
//
g.setComposite(AlphaComposite.getInstance(AlphaComposite.SRC_OVER, style.getAlpha()));
final Shape dataShape;
final double shape_size = style.getSize();
switch(style.getShapeType()) {
case circle:
dataShape = new Ellipse2D.Double(point.x - shape_size / 2.0, point.y - shape_size / 2.0, shape_size, shape_size);
break;
case square:
dataShape = new Rectangle2D.Double(point.x - shape_size / 2.0, point.y - shape_size / 2.0, shape_size, shape_size);
break;
default:
throw new IllegalArgumentException();
}
if (style.getPaperColor() != null) {
g.setColor(style.getPaperColor());
g.fill(dataShape);
}
final float stroke_width = style.getStrokeWidth();
if (style.getPenColor() != null && stroke_width > 0) {
final Stroke oldStroke = g.getStroke();
final Stroke stroke = new BasicStroke(stroke_width, BasicStroke.CAP_BUTT, BasicStroke.JOIN_ROUND);
g.setStroke(stroke);
g.setColor(style.getPenColor());
g.draw(dataShape);
g.setStroke(oldStroke);
}
//
g.setClip(oldClip);
g.setComposite(oldComposite);
}
}
progress.finish();
double lastTickX = 0.0;
// plot frame of each contig
for (final DisplayRange displayRange : this.displayRanges) {
for (final Track track : this.name2track.values()) {
final Rectangle2D rec = new Pane(displayRange, track).getBounds();
g.setColor(ALMOST_BLACK);
g.draw(rec);
}
final String label = displayRange.getLabel();
int fontSize = insets.top - 2;
final double labelLength = Math.min(label.length() * fontSize, displayRange.width);
if (labelLength > this.distanceBetweenContigs) {
g.setColor(ALMOST_BLACK);
this.hershey.paint(g, label, new Rectangle2D.Double(displayRange.startX + displayRange.width / 2.0 - labelLength / 2.0, 1, labelLength, (insets.top - 2)));
}
// plot ticks
for (int i = 0; i <= 10; i++) {
double bp = displayRange.getStart() + (displayRange.length() / 10.0) * i;
double tx = displayRange.convertToX(bp);
double ty = drawingRect.getMaxY();
g.setColor(ALMOST_BLACK);
g.draw(new Line2D.Double(tx, ty, tx, ty + 5));
if (tx - lastTickX > 5) {
g.translate(tx, ty + 5);
g.rotate(Math.PI / 2.0);
g.drawString(niceIntFormat.format((long) bp), 0, 0);
g.rotate(-Math.PI / 2.0);
g.translate(-tx, -(ty + 5));
lastTickX = tx + 5;
}
}
}
// plot y axis
for (final Track track : this.name2track.values()) {
for (int i = 0; i <= 10; ++i) {
double v = this.minY + ((this.maxY - this.minY) / 10.0) * i;
double ty = track.convertToY(v);
double font_size = 12;
// horizontal line
g.setColor(Color.LIGHT_GRAY);
g.draw(new Line2D.Double(drawingRect.getX() - 5, ty, drawingRect.getMaxX(), ty));
g.setColor(ALMOST_BLACK);
this.hershey.paint(g, String.format("%8s", v), new Rectangle2D.Double(0, ty - font_size, insets.left, font_size));
}
// plot track label if no default track
if (this.defaultTrack == null) {
g.setColor(ALMOST_BLACK);
g.translate(WIDTH - insets.right + 2, track.startY);
g.rotate(Math.PI / 2.0);
g.drawString(track.label, 0, 0);
g.rotate(-Math.PI / 2.0);
g.translate(-(WIDTH - insets.right + 2), -(track.startY));
}
}
g.dispose();
r.close();
r = null;
if (this.outputFile == null) {
ImageIO.write(img, "PNG", stdout());
} else {
ImageIO.write(img, this.outputFile.getName().toLowerCase().endsWith(".png") ? "PNG" : "JPG", this.outputFile);
}
return 0;
} catch (Exception e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(r);
this.dict = null;
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class HaloplexParasite method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
SamReader samReader = null;
final List<Mutation> mutations = new ArrayList<>();
try {
final Set<File> bamFiles = Files.lines(this.bamList.toPath()).filter(T -> !(T.isEmpty() || T.startsWith("#"))).map(T -> new File(T)).collect(Collectors.toSet());
final VCFHeader header = new VCFHeader();
header.setSequenceDictionary(in.getHeader().getSequenceDictionary());
final VCFFilterHeaderLine filter = new VCFFilterHeaderLine("HALOPLEXPARASITE", "fails Parasite Haloplex Sequence");
final VCFInfoHeaderLine infoWord = new VCFInfoHeaderLine(filter.getID(), 1, VCFHeaderLineType.String, "Parasite Haloplex Sequence (Word|Count|Fraction)");
super.addMetaData(header);
out.writeHeader(header);
header.addMetaDataLine(filter);
header.addMetaDataLine(infoWord);
while (in.hasNext()) {
final VariantContext ctx = in.next();
final VariantContextBuilder vcb = new VariantContextBuilder(inputName, ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
if (!(ctx.isIndel() || ctx.isMixed())) {
out.add(vcb.make());
continue;
}
if (!vcb.getAlleles().stream().filter(A -> !(A.isSymbolic() || A.isNoCall() || A.length() < this.minClipSize)).findAny().isPresent()) {
out.add(vcb.make());
continue;
}
final Mutation mut = new Mutation(ctx);
mutations.add(mut);
}
final Counter<String> words = new Counter<>();
for (final File bamFile : bamFiles) {
LOG.info("Opening " + bamFile);
samReader = createSamReaderFactory().open(bamFile);
for (final Mutation mut : mutations) {
// words seen in that mutation : don't use a Counter
final Set<String> mutWords = new HashSet<>();
/* loop over reads overlapping that mutation */
final SAMRecordIterator sri = samReader.queryOverlapping(mut.contig, mut.start, mut.end);
while (sri.hasNext()) {
final SAMRecord rec = sri.next();
if (rec.getReadUnmappedFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
final Cigar cigar = rec.getCigar();
if (cigar.numCigarElements() == 1)
continue;
final byte[] bases = rec.getReadBases();
int refpos = rec.getUnclippedStart();
int readpos = 0;
/* scan cigar overlapping that mutation */
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
final int ref_end = refpos + (op.consumesReferenceBases() || op.isClipping() ? ce.getLength() : 0);
final int read_end = readpos + (op.consumesReadBases() ? ce.getLength() : 0);
/* check clip is large enough */
if (op.equals(CigarOperator.S) && ce.getLength() >= this.minClipSize) {
/* check clip overlap mutation */
if (!(ref_end < mut.start || refpos > mut.end)) {
/* break read of soft clip into words */
for (int x = 0; x + this.minClipSize <= ce.getLength(); ++x) {
final String substr = new String(bases, readpos + x, this.minClipSize);
if (!substr.contains("N")) {
final String revcomp = AcidNucleics.reverseComplement(substr);
mutWords.add(substr);
if (!revcomp.equals(substr))
mutWords.add(revcomp);
}
}
}
}
refpos = ref_end;
readpos = read_end;
}
}
sri.close();
for (final String w : mutWords) {
words.incr(w);
}
}
// end of for(mutations)
samReader.close();
samReader = null;
}
LOG.info("mutations:" + mutations.size());
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
for (final Mutation mut : mutations) {
progress.watch(mut.contig, mut.start);
final VariantContextBuilder vcb = new VariantContextBuilder(inputName, mut.contig, mut.start, mut.end, mut.alleles);
String worstString = null;
Double worstFraction = null;
final double totalWords = words.getTotal();
for (final Allele a : mut.alleles) {
if (a.isSymbolic() || a.isNoCall() || a.length() < this.minClipSize)
continue;
for (int x = 0; x + this.minClipSize <= a.length(); ++x) {
final String substr = new String(a.getBases(), x, this.minClipSize);
final long count = words.count(substr);
final double fraction = count / totalWords;
if (worstFraction == null || worstFraction < fraction) {
worstString = substr + "|" + count + "|" + fraction;
worstFraction = fraction;
}
}
}
if (worstString != null) {
vcb.attribute(infoWord.getID(), worstString);
}
if (worstFraction != null && worstFraction.doubleValue() >= this.tresholdFraction) {
vcb.filter(filter.getID());
}
out.add(vcb.make());
}
progress.finish();
return RETURN_OK;
} catch (Exception e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(samReader);
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class Biostar59647 method doWork.
@Override
public int doWork(final List<String> args) {
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
SamReader samFileReader = null;
PrintStream pout;
try {
GenomicSequence genomicSequence = null;
indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile);
samFileReader = null;
final String bamFile = oneFileOrNull(args);
samFileReader = super.openSamReader(bamFile);
if (!SequenceUtil.areSequenceDictionariesEqual(indexedFastaSequenceFile.getSequenceDictionary(), samFileReader.getFileHeader().getSequenceDictionary())) {
LOG.warning("Not the same sequence dictionaries");
}
final XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
pout = (outputFile == null ? stdout() : new PrintStream(this.outputFile));
final XMLStreamWriter w = xmlfactory.createXMLStreamWriter(pout, "UTF-8");
w.writeStartDocument("UTF-8", "1.0");
w.writeStartElement("sam");
w.writeAttribute("bam", (bamFile == null ? "stdin" : bamFile));
w.writeAttribute("ref", refFile.getPath());
w.writeComment(getProgramCommandLine());
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(samFileReader.getFileHeader().getSequenceDictionary());
final SAMRecordIterator iter = samFileReader.iterator();
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
progess.watch(rec);
final byte[] readbases = rec.getReadBases();
w.writeStartElement("read");
w.writeStartElement("name");
w.writeCharacters(rec.getReadName());
w.writeEndElement();
w.writeStartElement("sequence");
w.writeCharacters(new String(readbases));
w.writeEndElement();
w.writeStartElement("flags");
for (SAMFlag f : SAMFlag.values()) {
w.writeAttribute(f.name(), String.valueOf(f.isSet(rec.getFlags())));
}
w.writeCharacters(String.valueOf(rec.getFlags()));
// flags
w.writeEndElement();
if (!rec.getReadUnmappedFlag()) {
w.writeStartElement("qual");
w.writeCharacters(String.valueOf(rec.getMappingQuality()));
w.writeEndElement();
w.writeStartElement("chrom");
w.writeAttribute("index", String.valueOf(rec.getReferenceIndex()));
w.writeCharacters(rec.getReferenceName());
w.writeEndElement();
w.writeStartElement("pos");
w.writeCharacters(String.valueOf(rec.getAlignmentStart()));
w.writeEndElement();
w.writeStartElement("cigar");
w.writeCharacters(rec.getCigarString());
w.writeEndElement();
}
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
w.writeStartElement("mate-chrom");
w.writeAttribute("index", String.valueOf(rec.getMateReferenceIndex()));
w.writeCharacters(rec.getMateReferenceName());
w.writeEndElement();
w.writeStartElement("mate-pos");
w.writeCharacters(String.valueOf(rec.getMateAlignmentStart()));
w.writeEndElement();
}
if (!rec.getReadUnmappedFlag() && rec.getCigar() != null) {
if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
w.writeStartElement("align");
int readIndex = 0;
int refIndex = rec.getAlignmentStart();
for (final CigarElement e : rec.getCigar().getCigarElements()) {
switch(e.getOperator()) {
// ignore hard clips
case H:
break;
// ignore pads
case P:
break;
// cont.
case I:
case S:
{
final int length = e.getLength();
for (int i = 0; i < length; ++i) {
w.writeEmptyElement(e.getOperator().name());
w.writeAttribute("read-index", String.valueOf(readIndex + 1));
if (readIndex >= 0 && readIndex < readbases.length) {
w.writeAttribute("read-base", String.valueOf((char) (readbases[readIndex])));
}
readIndex++;
}
break;
}
// cont. -- reference skip
case N:
case D:
{
final int length = e.getLength();
for (int i = 0; i < length; ++i) {
w.writeEmptyElement(e.getOperator().name());
w.writeAttribute("ref-index", String.valueOf(refIndex));
if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
w.writeAttribute("ref-base", String.valueOf(genomicSequence.charAt(refIndex - 1)));
}
refIndex++;
}
break;
}
case M:
case EQ:
case X:
{
final int length = e.getLength();
for (int i = 0; i < length; ++i) {
w.writeEmptyElement(e.getOperator().name());
char baseRead = '\0';
if (readIndex >= 0 && readIndex < readbases.length) {
baseRead = (char) (rec.getReadBases()[readIndex]);
w.writeAttribute("read-index", String.valueOf(readIndex + 1));
w.writeAttribute("read-base", String.valueOf(baseRead));
}
w.writeAttribute("ref-index", String.valueOf(refIndex));
if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
char baseRef = genomicSequence.charAt(refIndex - 1);
w.writeAttribute("ref-base", String.valueOf(baseRef));
if (Character.toUpperCase(baseRef) != Character.toUpperCase(baseRead)) {
w.writeAttribute("mismatch", "true");
}
}
refIndex++;
readIndex++;
}
break;
}
default:
throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
}
}
w.writeEndElement();
}
w.writeEndElement();
}
iter.close();
w.writeEndElement();
w.writeEndDocument();
w.flush();
pout.flush();
CloserUtil.close(w);
CloserUtil.close(pout);
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(samFileReader);
CloserUtil.close(indexedFastaSequenceFile);
}
return 0;
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class Biostar78400 method doWork.
@Override
public int doWork(List<String> args) {
if (this.XML == null) {
LOG.error("XML file missing");
return -1;
}
final Map<String, Map<Integer, String>> flowcell2lane2id = new HashMap<String, Map<Integer, String>>();
SamReader sfr = null;
SAMFileWriter sfw = null;
try {
final Pattern readNameSignature = Pattern.compile(this.readNameSignatureStr);
final JAXBContext context = JAXBContext.newInstance(ReadGroup.class, ReadGroupList.class);
final Unmarshaller unmarshaller = context.createUnmarshaller();
final ReadGroupList rgl = unmarshaller.unmarshal(new StreamSource(XML), ReadGroupList.class).getValue();
if (rgl.flowcells.isEmpty()) {
LOG.error("empty XML " + XML);
return -1;
}
sfr = openSamReader(oneFileOrNull(args));
final SAMFileHeader header = sfr.getFileHeader().clone();
header.addComment("Processed with " + getProgramName());
final Set<String> seenids = new HashSet<String>();
final List<SAMReadGroupRecord> samReadGroupRecords = new ArrayList<SAMReadGroupRecord>();
for (final FlowCell fc : rgl.flowcells) {
final Map<Integer, String> lane2id = new HashMap<Integer, String>();
for (final Lane lane : fc.lanes) {
for (final ReadGroup rg : lane.readGroups) {
if (seenids.contains(rg.id)) {
LOG.error("Group id " + rg.id + " defined twice");
return -1;
}
seenids.add(rg.id);
// create the read group we'll be using
final SAMReadGroupRecord rgrec = new SAMReadGroupRecord(rg.id);
rgrec.setLibrary(rg.library);
rgrec.setPlatform(rg.platform);
rgrec.setSample(rg.sample);
rgrec.setPlatformUnit(rg.platformunit);
if (rg.center != null)
rgrec.setSequencingCenter(rg.center);
if (rg.description != null)
rgrec.setDescription(rg.description);
lane2id.put(lane.id, rg.id);
samReadGroupRecords.add(rgrec);
}
}
if (flowcell2lane2id.containsKey(fc.name)) {
LOG.error("FlowCell id " + fc.name + " defined twice in XML");
return -1;
}
flowcell2lane2id.put(fc.name, lane2id);
}
header.setReadGroups(samReadGroupRecords);
sfw = this.writingBamArgs.openSAMFileWriter(this.outputFile, header, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
final SAMRecordIterator iter = sfr.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
final Matcher matcher = readNameSignature.matcher(rec.getReadName());
final String flowcellStr;
final String laneStr;
if (matcher.matches()) {
flowcellStr = matcher.group(1);
laneStr = matcher.group(2);
} else {
LOG.error("Read name " + rec.getReadName() + " doesn't match regular expression " + readNameSignature.pattern() + ". please check options");
return -1;
}
String RGID = null;
final Map<Integer, String> lane2id = flowcell2lane2id.get(flowcellStr);
if (lane2id == null)
throw new RuntimeException("Cannot get flowcell/readgroup for " + rec.getReadName());
try {
RGID = lane2id.get(Integer.parseInt(laneStr));
} catch (final Exception e) {
LOG.error("bad lane-Id in " + rec.getReadName());
return -1;
}
if (RGID == null) {
throw new RuntimeException("Cannot get RGID for " + rec.getReadName() + " flowcell:" + flowcellStr + " lane2id:" + laneStr + " dict:" + lane2id);
}
rec.setAttribute(SAMTag.RG.name(), RGID);
sfw.addAlignment(rec);
}
progress.finish();
iter.close();
LOG.info("done");
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(sfw);
CloserUtil.close(sfr);
}
}
Aggregations