use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.
the class ExtendReferenceWithReads method scanRegion.
/**
*scanRegion
* @param contig Reference sequence of interest.
* @param start 0-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
* @param end 0-based, exclusive end of interval of interest. Zero implies end of the reference sequence.
*/
private Map<Integer, Counter<Byte>> scanRegion(final SAMSequenceRecord contig, final int chromStart, final int chromEnd, final Rescue rescue) {
final Map<Integer, Counter<Byte>> pos2bases = new HashMap<>(1 + chromEnd - chromStart);
LOG.info("Scanning :" + contig.getSequenceName() + ":" + chromStart + "-" + chromEnd);
for (int side = 0; side < 2; ++side) {
if (// 5' or 3'
!rescue.equals(Rescue.CENTER) && side > 0) {
// already done
break;
}
for (final SamReader samReader : samReaders) {
final SAMSequenceDictionary dict2 = samReader.getFileHeader().getSequenceDictionary();
final SAMSequenceRecord ssr2 = dict2.getSequence(contig.getSequenceName());
if (ssr2 == null || ssr2.getSequenceLength() != contig.getSequenceLength()) {
LOG.info("No contig " + contig.getSequenceName() + " with L=" + contig.getSequenceLength() + " bases in " + samReader.getResourceDescription());
continue;
}
int mappedPos = -1;
switch(rescue) {
case LEFT:
mappedPos = 1;
break;
case RIGHT:
mappedPos = contig.getSequenceLength();
break;
case CENTER:
mappedPos = (side == 0 ? chromStart + 1 : chromEnd);
break;
default:
throw new IllegalStateException();
}
final SAMRecordIterator iter = samReader.query(contig.getSequenceName(), mappedPos, mappedPos, false);
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (this.filter.filterOut(rec))
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null)
continue;
final byte[] bases = rec.getReadBases();
if (bases == null || bases.length == 0)
continue;
int refPos1 = rec.getUnclippedStart();
int readpos = 0;
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
for (int L = 0; L < ce.getLength(); ++L) {
if (op.consumesReadBases()) {
Counter<Byte> count = pos2bases.get(refPos1 - 1);
if (count == null) {
count = new Counter<>();
pos2bases.put(refPos1 - 1, count);
}
count.incr((byte) Character.toLowerCase(bases[readpos]));
readpos++;
}
if (op.consumesReferenceBases()) {
refPos1++;
}
}
}
}
iter.close();
}
}
return pos2bases;
}
use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.
the class FindAllCoverageAtPosition method doWork.
@Override
public int doWork(final List<String> args) {
final Set<Mutation> mutations = new TreeSet<>();
BufferedReader r = null;
try {
if (this.referenceFileFile != null) {
this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.referenceFileFile);
}
mutations.addAll(Arrays.asList(this.positionStr.split("[ ]")).stream().filter(S -> !S.trim().isEmpty()).map(S -> new Mutation(S)).collect(Collectors.toSet()));
for (final String s : positionStr.split("[ ]")) {
if (s.trim().isEmpty())
continue;
mutations.add(new Mutation(s));
}
if (this.positionFile != null) {
String line;
r = IOUtils.openFileForBufferedReading(positionFile);
if (positionFile.getName().endsWith(".bed")) {
final BedLineCodec codec = new BedLineCodec();
while ((line = r.readLine()) != null) {
final BedLine bedLine = codec.decode(line);
if (bedLine == null)
continue;
for (int x = bedLine.getStart(); x <= bedLine.getEnd(); ++x) {
mutations.add(new Mutation(bedLine.getContig(), x));
}
}
} else {
while ((line = r.readLine()) != null) {
if (line.trim().isEmpty() || line.startsWith("#"))
continue;
mutations.add(new Mutation(line));
}
}
r.close();
r = null;
}
if (mutations.isEmpty()) {
LOG.fatal("undefined position \'str\'");
return -1;
}
LOG.info("number of mutations " + mutations.size());
this.out = this.openFileOrStdoutAsPrintWriter(this.outputFile);
out.print("#File");
out.print('\t');
out.print("CHROM");
out.print('\t');
out.print("POS");
if (this.indexedFastaSequenceFile != null) {
out.print('\t');
out.print("REF");
}
out.print('\t');
out.print(this.groupBy.name().toUpperCase());
out.print('\t');
out.print("DEPTH");
for (final CigarOperator op : CigarOperator.values()) {
out.print('\t');
out.print(op.name());
}
for (char c : BASES_To_PRINT) {
out.print('\t');
out.print("Base(" + c + ")");
}
out.println();
if (args.isEmpty()) {
LOG.info("Reading from stdin");
r = new BufferedReader(new InputStreamReader(stdin()));
scan(r, mutations);
r.close();
r = null;
} else {
for (final String filename : args) {
LOG.info("Reading from " + filename);
r = IOUtils.openURIForBufferedReading(filename);
scan(r, mutations);
r.close();
r = null;
}
}
this.out.flush();
return 0;
} catch (Exception err) {
LOG.severe(err);
return -1;
} finally {
CloserUtil.close(this.indexedFastaSequenceFile);
CloserUtil.close(this.out);
CloserUtil.close(r);
}
}
use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.
the class FindAllCoverageAtPosition method scan.
private void scan(final BufferedReader in, final Set<Mutation> mutations) throws Exception {
final String DEFAULT_SAMPLE_NAME = "(undefined)";
String line;
while ((line = in.readLine()) != null) {
if (out.checkError())
break;
if (line.isEmpty() || line.startsWith("#"))
continue;
File f = new File(line);
if (!f.exists())
continue;
if (!f.isFile())
continue;
if (!f.canRead())
continue;
String filename = f.getName();
if (filename.endsWith(".cram")) {
LOG.warn("Sorry CRAM is not supported " + filename);
continue;
}
if (!filename.endsWith(".bam"))
continue;
SamReader samReader = null;
SAMRecordIterator iter = null;
try {
samReader = this.samReaderFactory.open(f);
if (!samReader.hasIndex()) {
LOG.warn("no index for " + f);
continue;
}
final SAMFileHeader header = samReader.getFileHeader();
for (final Mutation src : mutations) {
final Map<String, CigarAndBases> sample2count = new TreeMap<String, CigarAndBases>();
for (SAMReadGroupRecord rg : header.getReadGroups()) {
if (rg != null) {
String sn = this.groupBy.apply(rg);
if (sn != null && !sn.trim().isEmpty()) {
sample2count.put(sn, new CigarAndBases());
}
}
}
if (sample2count.isEmpty()) {
sample2count.put(DEFAULT_SAMPLE_NAME, new CigarAndBases());
}
final Mutation m = convertFromSamHeader(f, header, src);
if (m == null)
continue;
iter = samReader.query(m.chrom, m.pos - 1, m.pos + 1, false);
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (this.filter.filterOut(rec))
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null)
continue;
final String readString = rec.getReadString().toUpperCase();
String sampleName = DEFAULT_SAMPLE_NAME;
final SAMReadGroupRecord rg = rec.getReadGroup();
if (rg != null) {
String sn = groupBy.apply(rg);
if (!StringUtil.isBlank(sn)) {
sampleName = sn;
}
}
CigarAndBases counter = sample2count.get(sampleName);
if (counter == null) {
counter = new CigarAndBases();
sample2count.put(sampleName, counter);
}
int ref = rec.getUnclippedStart();
int readPos = 0;
for (int k = 0; k < cigar.numCigarElements() && ref < m.pos + 1; ++k) {
final CigarElement ce = cigar.getCigarElement(k);
final CigarOperator op = ce.getOperator();
switch(op) {
case P:
break;
case I:
{
if (ref == m.pos) {
counter.operators.incr(op);
counter.bases.incr(INSERTION_CHAR);
}
readPos += ce.getLength();
break;
}
case D:
case N:
case M:
case X:
case EQ:
case H:
case S:
{
for (int i = 0; i < ce.getLength(); ++i) {
if (ref == m.pos) {
counter.operators.incr(op);
switch(op) {
case M:
case X:
case EQ:
counter.bases.incr(readString.charAt(readPos));
break;
case D:
case N:
counter.bases.incr(DELETION_CHAR);
break;
default:
break;
}
break;
}
if (op.consumesReadBases())
++readPos;
ref++;
}
break;
}
default:
throw new RuntimeException("unknown operator:" + op);
}
}
}
iter.close();
iter = null;
for (final String sample : sample2count.keySet()) {
final CigarAndBases counter = sample2count.get(sample);
out.print(f);
out.print('\t');
out.print(m.chrom);
out.print('\t');
out.print(m.pos);
if (this.indexedFastaSequenceFile != null) {
out.print('\t');
out.print(getReferenceAt(m.chrom, m.pos));
}
out.print('\t');
out.print(sample);
out.print('\t');
out.print(counter.operators.count(CigarOperator.M) + counter.operators.count(CigarOperator.EQ) + counter.operators.count(CigarOperator.X));
for (final CigarOperator op : CigarOperator.values()) {
out.print('\t');
out.print(counter.operators.count(op));
}
for (char c : BASES_To_PRINT) {
out.print('\t');
out.print(counter.bases.count(c));
}
out.println();
}
}
// end of loop over mutations
} catch (final Exception err) {
LOG.error(err);
throw err;
} finally {
CloserUtil.close(iter);
CloserUtil.close(samReader);
}
}
}
use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.
the class BamToSql method doWork.
@Override
public int doWork(List<String> args) {
if (this.faidxFile == null) {
LOG.error("ref sequence faidx not defined");
return -1;
}
SAMRecordIterator iter = null;
SamReader sfr = null;
PrintWriter out = null;
GenomicSequence genomicSequence = null;
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
args = new ArrayList<String>(IOUtils.unrollFiles(args));
try {
out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
out.println("CREATE TABLE IF NOT EXISTS SamFile");
out.println("(");
out.println("id INTEGER PRIMARY KEY,");
out.println("filename TEXT");
out.println(");");
out.println("CREATE TABLE IF NOT EXISTS Dictionary");
out.println("(");
out.println("id INTEGER PRIMARY KEY,");
out.println("name TEXT NOT NULL,");
out.println("length INT NOT NULL,");
out.println("tid INT NOT NULL,");
out.println("samfile_id INT NOT NULL,");
out.println("FOREIGN KEY(samfile_id) REFERENCES SamFile(id)");
out.println(");");
out.println("CREATE TABLE IF NOT EXISTS ReadGroup");
out.println("(");
out.println("id INTEGER PRIMARY KEY,");
out.println("groupId TEXT NOT NULL,");
out.println("sample TEXT NOT NULL,");
out.println("samfile_id INT NOT NULL,");
out.println("FOREIGN KEY(samfile_id) REFERENCES SamFile(id)");
out.println(");");
out.println("CREATE TABLE IF NOT EXISTS Read");
out.println("(");
out.println("id INTEGER PRIMARY KEY,");
out.println("name TEXT NOT NULL,");
out.println("flag INTEGER NOT NULL,");
if (this.printflag) {
for (final SAMFlag flg : SAMFlag.values()) {
out.println(flg.name() + " INTEGER NOT NULL,");
}
}
out.println("rname TEXT,");
out.println("pos INTEGER,");
out.println("mapq INTEGER NOT NULL,");
out.println("cigar TEXT,");
out.println("rnext TEXT,");
out.println("pnext INTEGER,");
out.println("tlen INTEGER,");
out.println("sequence TEXT NOT NULL,");
out.println("qualities TEXT NOT NULL,");
out.println("samfile_id INT NOT NULL,");
out.println("group_id INT,");
out.println("FOREIGN KEY(samfile_id) REFERENCES SamFile(id),");
out.println("FOREIGN KEY(group_id) REFERENCES ReadGroup(id)");
out.println(");");
out.println("CREATE TABLE IF NOT EXISTS Cigar");
out.println("(");
out.println("id INTEGER PRIMARY KEY,");
out.println("read_pos INT ,");
out.println("read_base TEXT,");
out.println("read_qual INT ,");
out.println("ref_pos INT ,");
out.println("ref_base TEXT,");
out.println("operator TEXT NOT NULL,");
out.println("read_id INT NOT NULL,");
out.println("FOREIGN KEY(read_id) REFERENCES Read(id)");
out.println(");");
out.println("begin transaction;");
int samIndex = 0;
do {
final String inputName;
if (samIndex == 0 && args.isEmpty()) {
sfr = openSamReader(null);
inputName = "<stdin>";
} else {
inputName = args.get(samIndex);
sfr = openSamReader(inputName);
}
final SAMFileHeader header1 = sfr.getFileHeader();
if (header1 == null) {
throw new JvarkitException.FileFormatError("File header missing");
}
final SAMSequenceDictionary dict = header1.getSequenceDictionary();
if (dict == null) {
throw new JvarkitException.DictionaryMissing("No Dictionary in input");
}
final IntervalParser intervalParser = new IntervalParser(dict);
final Interval userInterval;
iter = null;
if (this.regionStr == null || this.regionStr.isEmpty()) {
LOG.warn("You're currently scanning the whole BAM ???!!!");
iter = sfr.iterator();
userInterval = null;
} else {
userInterval = intervalParser.parse(this.regionStr);
if (userInterval == null) {
throw new JvarkitException.UserError("cannot parse interval " + this.regionStr);
}
iter = sfr.query(userInterval.getContig(), userInterval.getStart(), userInterval.getEnd(), false);
}
out.println(String.join(" ", "insert into SamFile(filename) values(", quote(inputName), ");"));
for (int i = 0; i < dict.size(); ++i) {
final SAMSequenceRecord ssr = dict.getSequence(i);
out.println("insert into Dictionary(name,length,tid,samfile_id) select " + quote(inputName) + "," + ssr.getSequenceLength() + "," + i + ",max(id) from SamFile;");
}
for (final SAMReadGroupRecord g : header1.getReadGroups()) {
out.println("insert into ReadGroup(groupId,sample,samfile_id) select " + quote(g.getId()) + "," + quote(g.getSample()) + "," + "max(id) from SamFile;");
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
final StringBuilder sql = new StringBuilder();
sql.append("insert into Read(" + "name,flag,");
if (this.printflag) {
for (final SAMFlag flg : SAMFlag.values()) {
sql.append(flg.name()).append(",");
}
}
sql.append("rname,pos,mapq,cigar,rnext,pnext,tlen,sequence,qualities,group_id,samfile_id) select ");
sql.append(quote(rec.getReadName())).append(",");
sql.append(rec.getFlags()).append(",");
if (this.printflag) {
for (final SAMFlag flg : SAMFlag.values()) {
sql.append(flg.isSet(rec.getFlags()) ? 1 : 0);
sql.append(",");
}
}
if (rec.getReferenceName() == null || rec.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
sql.append("NULL,NULL");
} else {
sql.append(quote(rec.getReferenceName()));
sql.append(",");
sql.append(rec.getAlignmentStart());
}
sql.append(",");
sql.append(rec.getMappingQuality());
sql.append(",");
// cigar
if (rec.getCigarString() == null || rec.getCigarString().equals(SAMRecord.NO_ALIGNMENT_CIGAR)) {
sql.append("NULL");
} else {
sql.append(quote(rec.getCigarString()));
}
sql.append(",");
// rnext
if (rec.getMateReferenceName() == null || rec.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
sql.append("NULL,NULL");
} else {
sql.append(quote(rec.getMateReferenceName()));
sql.append(",");
sql.append(rec.getMateAlignmentStart());
}
sql.append(",");
// tlen
sql.append(rec.getInferredInsertSize());
sql.append(",");
// sequence
sql.append(quote(rec.getReadString()));
sql.append(",");
// qualities
sql.append(quote(rec.getBaseQualityString()));
sql.append(",");
if (rec.getReadGroup() == null) {
sql.append("NULL");
} else {
sql.append("G.id");
}
sql.append(",F.id FROM SamFile as F");
if (rec.getReadGroup() != null) {
sql.append(" , ReadGroup as G where G.groupId=").append(quote(rec.getReadGroup().getId())).append(" and F.id = G.samfile_id ");
}
sql.append(" ORDER BY F.id DESC LIMIT 1;");
out.println(sql.toString());
if (this.printcigar && !rec.getReadUnmappedFlag() && rec.getCigar() != null) {
if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
int ref = rec.getUnclippedStart();
final byte[] bases = rec.getReadBases();
final byte[] quals = rec.getBaseQualities();
int read = 0;
for (final CigarElement ce : rec.getCigar()) {
final CigarOperator op = ce.getOperator();
if (op.equals(CigarOperator.P))
continue;
for (int i = 0; i < ce.getLength(); ++i) {
sql.setLength(0);
boolean in_user_interval = true;
sql.append("insert into Cigar(operator,read_pos,read_base,read_qual,ref_pos,ref_base,read_id) ");
sql.append("select '");
sql.append(op.name());
sql.append("',");
if (userInterval != null && !(rec.getReferenceName().equals(userInterval.getContig()) && ref >= userInterval.getStart() && ref <= userInterval.getEnd())) {
in_user_interval = false;
}
switch(op) {
case I:
{
sql.append(read);
sql.append(",");
sql.append("'" + (char) bases[read] + "',");
sql.append("" + quals[read] + "");
sql.append(",");
sql.append("NULL,NULL");
read++;
break;
}
case D:
case N:
case // yes H (hard clip)
H:
{
sql.append("NULL,NULL,NULL,");
sql.append(ref);
sql.append(",'");
sql.append((ref < 1 || ref - 1 >= genomicSequence.length()) ? '*' : genomicSequence.charAt(ref - 1));
sql.append("'");
ref++;
break;
}
case M:
case X:
case EQ:
case // yes S, soft clip
S:
{
sql.append(read);
sql.append(",");
sql.append("'" + (char) bases[read] + "',");
sql.append("" + quals[read] + "");
sql.append(",");
sql.append(ref);
sql.append(",'");
sql.append((ref < 1 || ref - 1 >= genomicSequence.length()) ? '*' : genomicSequence.charAt(ref - 1));
sql.append("'");
ref++;
read++;
break;
}
default:
throw new IllegalStateException();
}
sql.append(", id from Read ORDER BY id DESC LIMIT 1;");
if (in_user_interval)
out.println(sql.toString());
}
}
}
}
iter.close();
iter = null;
sfr.close();
sfr = null;
progress.finish();
samIndex++;
} while (samIndex < args.size());
out.println("COMMIT;");
out.flush();
out.close();
LOG.info("done");
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(sfr);
CloserUtil.close(out);
CloserUtil.close(indexedFastaSequenceFile);
}
}
use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.
the class SamClipIndelFraction method doWork.
@Override
public int doWork(final List<String> args) {
SamReader sfr = null;
SAMRecordIterator iter = null;
PrintWriter pw = null;
try {
sfr = openSamReader(oneFileOrNull(args));
pw = super.openFileOrStdoutAsPrintWriter(outputFile);
long total_bases_count = 0L;
long count_clipped_reads = 0L;
long count_clipped_left_reads = 0L;
long count_clipped_right_reads = 0L;
long count_unclipped_reads = 0L;
long count_unmapped_reads = 0L;
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(sfr.getFileHeader()).logger(LOG);
Counter<Integer> counter = new Counter<>();
iter = sfr.iterator();
while (iter.hasNext()) {
final SAMRecord record = progress.watch(iter.next());
if (record.getReadUnmappedFlag()) {
++count_unmapped_reads;
continue;
}
final Cigar cigar = record.getCigar();
int left_clip_length = 0;
int right_clip_length = 0;
int deletion_N_length = 0;
int deletion_D_length = 0;
int insertion_length = 0;
boolean onLeft = true;
for (int i = 0; i < cigar.numCigarElements(); ++i) {
final CigarElement ce = cigar.getCigarElement(i);
final CigarOperator op = ce.getOperator();
switch(op) {
case N:
{
onLeft = false;
deletion_D_length += ce.getLength();
total_bases_count += ce.getLength();
break;
}
case D:
{
onLeft = false;
deletion_N_length += ce.getLength();
total_bases_count += ce.getLength();
break;
}
case I:
{
onLeft = false;
insertion_length += ce.getLength();
total_bases_count += ce.getLength();
break;
}
case S:
case H:
{
if (onLeft) {
if (record.getReadNegativeStrandFlag()) {
right_clip_length += ce.getLength();
} else {
left_clip_length += ce.getLength();
}
} else {
if (record.getReadNegativeStrandFlag()) {
left_clip_length += ce.getLength();
} else {
right_clip_length += ce.getLength();
}
}
total_bases_count += ce.getLength();
break;
}
default:
{
onLeft = false;
if (op.consumesReadBases()) {
total_bases_count += ce.getLength();
}
break;
}
}
}
if (left_clip_length + right_clip_length == 0) {
count_unclipped_reads++;
} else {
if (left_clip_length > 0)
count_clipped_left_reads++;
if (right_clip_length > 0)
count_clipped_right_reads++;
count_clipped_reads++;
}
switch(type) {
case leftclip:
counter.incr(left_clip_length);
break;
case rightclip:
counter.incr(right_clip_length);
break;
case allclip:
counter.incr(left_clip_length + right_clip_length);
break;
case deletion:
counter.incr(deletion_D_length + deletion_N_length);
break;
case insert:
counter.incr(insertion_length);
break;
default:
LOG.error("Bad type: " + type);
return -1;
}
}
progress.finish();
pw.println("##UNMAPPED_READS=" + count_unmapped_reads);
pw.println("##MAPPED_READS=" + (count_clipped_reads + count_unclipped_reads));
pw.println("##CLIPPED_READS=" + count_clipped_reads);
pw.println("##CLIPPED_READS_5_PRIME=" + count_clipped_left_reads);
pw.println("##CLIPPED_READS_3_PRIME=" + count_clipped_right_reads);
pw.println("##UNCLIPPED_READS=" + count_unclipped_reads);
pw.println("##COUNT_BASES=" + total_bases_count);
pw.print("#");
switch(type) {
case leftclip:
pw.print("CLIP_5_PRIME");
break;
case rightclip:
pw.print("CLIP_3_PRIME");
break;
case allclip:
pw.print("CLIP");
break;
case deletion:
pw.print("DELETION");
break;
case insert:
pw.print("INSERTION");
break;
default:
LOG.error("Bad type: " + type);
return -1;
}
pw.println("\tCOUNT\tFRACTION_OF_MAPPED_READS");
for (final Integer size : new TreeSet<Integer>(counter.keySet())) {
pw.print(size);
pw.print('\t');
pw.print(counter.count(size));
pw.print('\t');
pw.println(counter.count(size) / (double) (count_unclipped_reads + count_unclipped_reads));
}
pw.flush();
pw.close();
pw = null;
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(sfr);
CloserUtil.close(pw);
}
}
Aggregations