use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.
the class BamToSVG method doWork.
@Override
public int doWork(List<String> args) {
/* parse interval */
if (this.intervalStr == null) {
LOG.error("bed.interval0.undefined");
return -1;
}
int colon = this.intervalStr.indexOf(':');
int hyphen = this.intervalStr.indexOf('-', colon + 1);
if (colon < 1 || hyphen <= colon || hyphen + 1 == intervalStr.length()) {
LOG.error("Bad interval " + this.intervalStr);
return -1;
}
this.interval = new Interval();
this.interval.chrom = this.intervalStr.substring(0, colon);
this.interval.start = Integer.parseInt(this.intervalStr.substring(colon + 1, hyphen)) + 1;
this.interval.end = Integer.parseInt(this.intervalStr.substring(hyphen + 1));
this.drawinAreaWidth = Math.max(100, this.drawinAreaWidth);
SamReader in = null;
SAMRecordIterator iter = null;
SamReaderFactory sfrf = SamReaderFactory.makeDefault();
sfrf.validationStringency(ValidationStringency.SILENT);
XMLStreamWriter w = null;
FileOutputStream fout = null;
try {
/* get genomic sequence */
if (this.referenceFile != null) {
LOG.info("opening " + this.referenceFile);
this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.referenceFile);
this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, this.interval.chrom);
}
for (String vcf : this.vcfFileSet) {
readVariantFile(vcf);
}
/* read SAM data */
if (args.isEmpty()) {
LOG.info("Reading from stdin");
in = sfrf.open(SamInputResource.of(stdin()));
iter = in.iterator();
readBamStream(iter);
iter.close();
in.close();
} else {
for (String arg : args) {
File filename = new File(arg);
LOG.info("Reading from " + filename);
in = sfrf.open(SamInputResource.of(filename));
if (in.hasIndex()) {
iter = in.query(this.interval.getChrom(), this.interval.getStart(), this.interval.getEnd(), false);
} else {
LOG.info("Bam file not indexed !! " + filename);
iter = in.iterator();
}
readBamStream(iter);
iter.close();
in.close();
}
}
this.featureWidth = this.drawinAreaWidth / (double) ((this.interval.end - this.interval.start) + 1);
this.featureHeight = Math.min(Math.max(5.0, this.featureWidth), 30);
this.HEIGHT_RULER = (int) (this.niceIntFormat.format(this.interval.end).length() * this.featureHeight + 5);
LOG.info("Feature height:" + this.featureHeight);
XMLOutputFactory xof = XMLOutputFactory.newFactory();
if (this.outputFile == null) {
w = xof.createXMLStreamWriter(stdout(), "UTF-8");
} else {
fout = new FileOutputStream(this.outputFile);
w = xof.createXMLStreamWriter(fout, "UTF-8");
}
w.writeStartDocument("UTF-8", "1.0");
printDocument(w, intervalStr);
w.writeEndDocument();
w.flush();
w.close();
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(fout);
CloserUtil.close(in);
CloserUtil.close(this.indexedFastaSequenceFile);
this.indexedFastaSequenceFile = null;
this.interval = null;
}
}
use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.
the class Biostar175929 method doWork.
@Override
public int doWork(List<String> args) {
if (this.faidx == null) {
LOG.error("fasta reference was not defined.");
return -1;
}
IndexedFastaSequenceFile reference = null;
VcfIterator iter = null;
try {
reference = new IndexedFastaSequenceFile(this.faidx);
iter = super.openVcfIterator(oneFileOrNull(args));
this.pw = openFileOrStdoutAsPrintWriter(this.outputFile);
final List<VariantContext> variants = new ArrayList<>();
for (; ; ) {
VariantContext ctx = null;
if (iter.hasNext()) {
ctx = iter.next();
}
if (ctx == null || (!variants.isEmpty() && !ctx.getContig().equals(variants.get(0).getContig()))) {
if (!variants.isEmpty()) {
LOG.info("chrom:" + variants.get(0).getContig() + " N=" + variants.size());
final GenomicSequence genomic = new GenomicSequence(reference, variants.get(0).getContig());
final StringBuilder title = new StringBuilder();
final StringBuilder sequence = new StringBuilder();
recursive(genomic, variants, 0, title, sequence);
variants.clear();
}
if (ctx == null)
break;
}
variants.add(ctx);
}
iter.close();
iter = null;
this.pw.flush();
this.pw.close();
return RETURN_OK;
} catch (Exception e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(reference);
CloserUtil.close(iter);
CloserUtil.close(pw);
}
}
use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.
the class Biostar251649 method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter w) {
try {
final VCFHeader header = new VCFHeader(in.getHeader());
VCFInfoHeaderLine info5 = new VCFInfoHeaderLine(leftTag + extend, 1, VCFHeaderLineType.String, "Sequence on the 5' of mutation");
VCFInfoHeaderLine info3 = new VCFInfoHeaderLine(rightTag + extend, 1, VCFHeaderLineType.String, "Sequence on the 3' of mutation");
if (header.getInfoHeaderLine(info5.getID()) != null) {
LOG.error("tag " + info5.getID() + " already present in VCF header");
return -1;
}
if (header.getInfoHeaderLine(info3.getID()) != null) {
LOG.error("tag " + info3.getID() + " already present in VCF header");
return -1;
}
header.addMetaDataLine(info5);
header.addMetaDataLine(info3);
GenomicSequence chrom = null;
w.writeHeader(header);
while (in.hasNext()) {
final VariantContext ctx = in.next();
if (chrom == null || !chrom.getChrom().equals(ctx.getContig())) {
chrom = new GenomicSequence(this.faidx, ctx.getContig());
}
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
if (ctx.getStart() > 0) {
final StringBuilder sb = new StringBuilder(this.extend);
for (int i = 0; i < this.extend; ++i) {
final int pos0 = (ctx.getStart() - 2) - i;
if (pos0 <= 0)
continue;
sb.insert(0, chrom.charAt(pos0));
}
if (sb.length() > 0)
vcb.attribute(info5.getID(), sb.toString());
}
{
final StringBuilder sb = new StringBuilder(this.extend);
for (int i = 0; i < this.extend; ++i) {
int pos0 = ctx.getEnd() + i;
if (pos0 >= chrom.length())
break;
sb.append(chrom.charAt(pos0));
}
if (sb.length() > 0)
vcb.attribute(info3.getID(), sb.toString());
}
w.add(vcb.make());
}
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(faidx);
}
}
use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.
the class Biostar59647 method doWork.
@Override
public int doWork(final List<String> args) {
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
SamReader samFileReader = null;
PrintStream pout;
try {
GenomicSequence genomicSequence = null;
indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile);
samFileReader = null;
final String bamFile = oneFileOrNull(args);
samFileReader = super.openSamReader(bamFile);
if (!SequenceUtil.areSequenceDictionariesEqual(indexedFastaSequenceFile.getSequenceDictionary(), samFileReader.getFileHeader().getSequenceDictionary())) {
LOG.warning("Not the same sequence dictionaries");
}
final XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
pout = (outputFile == null ? stdout() : new PrintStream(this.outputFile));
final XMLStreamWriter w = xmlfactory.createXMLStreamWriter(pout, "UTF-8");
w.writeStartDocument("UTF-8", "1.0");
w.writeStartElement("sam");
w.writeAttribute("bam", (bamFile == null ? "stdin" : bamFile));
w.writeAttribute("ref", refFile.getPath());
w.writeComment(getProgramCommandLine());
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(samFileReader.getFileHeader().getSequenceDictionary());
final SAMRecordIterator iter = samFileReader.iterator();
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
progess.watch(rec);
final byte[] readbases = rec.getReadBases();
w.writeStartElement("read");
w.writeStartElement("name");
w.writeCharacters(rec.getReadName());
w.writeEndElement();
w.writeStartElement("sequence");
w.writeCharacters(new String(readbases));
w.writeEndElement();
w.writeStartElement("flags");
for (SAMFlag f : SAMFlag.values()) {
w.writeAttribute(f.name(), String.valueOf(f.isSet(rec.getFlags())));
}
w.writeCharacters(String.valueOf(rec.getFlags()));
// flags
w.writeEndElement();
if (!rec.getReadUnmappedFlag()) {
w.writeStartElement("qual");
w.writeCharacters(String.valueOf(rec.getMappingQuality()));
w.writeEndElement();
w.writeStartElement("chrom");
w.writeAttribute("index", String.valueOf(rec.getReferenceIndex()));
w.writeCharacters(rec.getReferenceName());
w.writeEndElement();
w.writeStartElement("pos");
w.writeCharacters(String.valueOf(rec.getAlignmentStart()));
w.writeEndElement();
w.writeStartElement("cigar");
w.writeCharacters(rec.getCigarString());
w.writeEndElement();
}
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
w.writeStartElement("mate-chrom");
w.writeAttribute("index", String.valueOf(rec.getMateReferenceIndex()));
w.writeCharacters(rec.getMateReferenceName());
w.writeEndElement();
w.writeStartElement("mate-pos");
w.writeCharacters(String.valueOf(rec.getMateAlignmentStart()));
w.writeEndElement();
}
if (!rec.getReadUnmappedFlag() && rec.getCigar() != null) {
if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
w.writeStartElement("align");
int readIndex = 0;
int refIndex = rec.getAlignmentStart();
for (final CigarElement e : rec.getCigar().getCigarElements()) {
switch(e.getOperator()) {
// ignore hard clips
case H:
break;
// ignore pads
case P:
break;
// cont.
case I:
case S:
{
final int length = e.getLength();
for (int i = 0; i < length; ++i) {
w.writeEmptyElement(e.getOperator().name());
w.writeAttribute("read-index", String.valueOf(readIndex + 1));
if (readIndex >= 0 && readIndex < readbases.length) {
w.writeAttribute("read-base", String.valueOf((char) (readbases[readIndex])));
}
readIndex++;
}
break;
}
// cont. -- reference skip
case N:
case D:
{
final int length = e.getLength();
for (int i = 0; i < length; ++i) {
w.writeEmptyElement(e.getOperator().name());
w.writeAttribute("ref-index", String.valueOf(refIndex));
if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
w.writeAttribute("ref-base", String.valueOf(genomicSequence.charAt(refIndex - 1)));
}
refIndex++;
}
break;
}
case M:
case EQ:
case X:
{
final int length = e.getLength();
for (int i = 0; i < length; ++i) {
w.writeEmptyElement(e.getOperator().name());
char baseRead = '\0';
if (readIndex >= 0 && readIndex < readbases.length) {
baseRead = (char) (rec.getReadBases()[readIndex]);
w.writeAttribute("read-index", String.valueOf(readIndex + 1));
w.writeAttribute("read-base", String.valueOf(baseRead));
}
w.writeAttribute("ref-index", String.valueOf(refIndex));
if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
char baseRef = genomicSequence.charAt(refIndex - 1);
w.writeAttribute("ref-base", String.valueOf(baseRef));
if (Character.toUpperCase(baseRef) != Character.toUpperCase(baseRead)) {
w.writeAttribute("mismatch", "true");
}
}
refIndex++;
readIndex++;
}
break;
}
default:
throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
}
}
w.writeEndElement();
}
w.writeEndElement();
}
iter.close();
w.writeEndElement();
w.writeEndDocument();
w.flush();
pout.flush();
CloserUtil.close(w);
CloserUtil.close(pout);
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(samFileReader);
CloserUtil.close(indexedFastaSequenceFile);
}
return 0;
}
use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.
the class ExtendReferenceWithReads method doWork.
@Override
public int doWork(List<String> args) {
if (this.faidx == null) {
LOG.error("No REF defined");
return -1;
}
this.samReaders.clear();
PrintStream out = null;
try {
this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(faidx);
SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
if (dict == null) {
LOG.error("Reference file is missing a sequence dictionary (use picard)");
return -1;
}
final SamReaderFactory srf = super.createSamReaderFactory();
srf.setOption(Option.CACHE_FILE_BASED_INDEXES, true);
for (final String uri : IOUtils.unrollFiles(args)) {
LOG.info("opening BAM " + uri);
final SamReader sr = srf.open(SamInputResource.of(uri));
/* doesn't work with remote ?? */
if (!sr.hasIndex()) {
LOG.error("file " + uri + " is not indexed");
return -1;
}
final SAMFileHeader header = sr.getFileHeader();
if (!header.getSortOrder().equals(SortOrder.coordinate)) {
LOG.error("file " + uri + " not sorted on coordinate but " + header.getSortOrder());
return -1;
}
final SAMSequenceDictionary dict2 = header.getSequenceDictionary();
if (dict2 == null) {
LOG.error("file " + uri + " needs a sequence dictionary");
return -1;
}
samReaders.add(sr);
}
if (samReaders.isEmpty()) {
LOG.error("No BAM defined");
return -1;
}
out = super.openFileOrStdoutAsPrintStream(this.outputFile);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
for (final SAMSequenceRecord ssr : dict.getSequences()) {
final GenomicSequence genomic = new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName());
int chromStart = 0;
int nPrinted = 0;
out.print(">");
out.print(ssr.getSequenceName());
for (final Rescue side : Rescue.values()) {
switch(side) {
case LEFT:
/* look before position 0 of chromosome */
{
final Map<Integer, Counter<Byte>> pos2bases = scanRegion(ssr, -1, -1, side);
int newstart = 0;
for (final Integer pos : pos2bases.keySet()) {
if (pos >= 0)
continue;
newstart = Math.min(newstart, pos);
}
while (newstart < 0) {
final Counter<Byte> count = pos2bases.get(newstart);
if (nPrinted % 60 == 0)
out.println();
out.print(consensus(count));
newstart++;
++nPrinted;
}
break;
}
case RIGHT:
/* look after position > length(chromosome) */
{
final Map<Integer, Counter<Byte>> pos2bases = this.scanRegion(ssr, -1, -1, side);
int newend = ssr.getSequenceLength();
for (final Integer pos : pos2bases.keySet()) {
if (pos < ssr.getSequenceLength())
continue;
newend = Math.max(newend, pos + 1);
}
for (int i = ssr.getSequenceLength(); i < newend; i++) {
final Counter<Byte> count = pos2bases.get(i);
if (nPrinted % 60 == 0)
out.println();
out.print(consensus(count));
++nPrinted;
}
break;
}
case CENTER:
/* 0 to chromosome-length */
{
chromStart = 0;
while (chromStart < genomic.length()) {
final char base = Character.toUpperCase(genomic.charAt(chromStart));
if (base != 'N') {
progress.watch(ssr.getSequenceName(), chromStart);
if (nPrinted % 60 == 0)
out.println();
out.print(base);
++chromStart;
++nPrinted;
continue;
}
int chromEnd = chromStart + 1;
while (chromEnd < genomic.length() && Character.toUpperCase(genomic.charAt(chromEnd)) == 'N') {
++chromEnd;
}
if (chromEnd - chromStart < this.minLenNNNNContig) {
while (chromStart < chromEnd) {
progress.watch(ssr.getSequenceName(), chromStart);
if (nPrinted % 60 == 0)
out.println();
out.print(base);
++chromStart;
++nPrinted;
}
continue;
}
final Map<Integer, Counter<Byte>> pos2bases = scanRegion(ssr, chromStart, chromEnd, side);
while (chromStart < chromEnd) {
final Counter<Byte> count = pos2bases.get(chromStart);
if (nPrinted % 60 == 0)
out.println();
if (count == null) {
out.print('N');
} else {
out.print(consensus(count));
}
++chromStart;
++nPrinted;
continue;
}
}
break;
}
}
// end switch type
}
out.println();
}
progress.finish();
out.flush();
out.close();
return RETURN_OK;
} catch (final Exception e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(this.indexedFastaSequenceFile);
for (final SamReader r : samReaders) CloserUtil.close(r);
samReaders.clear();
}
}
Aggregations