use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.
the class SamMaskAlignedBases method doWork.
@Override
public int doWork(List<String> args) {
final byte RESET_CHAR = (byte) 'N';
final byte RESET_QUAL = (byte) SAMUtils.fastqToPhred('#');
long nRecords = 0L;
long nRecordMasked = 0L;
long nBasesMasked = 0L;
long nBases = 0L;
SAMRecordIterator iter = null;
SamReader sfr = null;
SAMFileWriter sfw = null;
try {
sfr = openSamReader(oneFileOrNull(args));
final SAMFileHeader header1 = sfr.getFileHeader();
if (header1 == null) {
LOG.error("File header missing");
return -1;
}
final SAMFileHeader header2 = header1.clone();
header2.addComment(getProgramName() + ":" + getVersion() + ":" + getProgramCommandLine());
header2.setSortOrder(SortOrder.unsorted);
sfw = this.writingBamArgs.openSAMFileWriter(outputFile, header2, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
iter = sfr.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
++nRecords;
nBases += rec.getReadLength();
if (rec.getReadUnmappedFlag()) {
SAMUtils.makeReadUnmapped(rec);
sfw.addAlignment(rec);
continue;
}
if (rec.isSecondaryOrSupplementary()) {
continue;
}
final Cigar cigar = rec.getCigar();
byte[] bases = rec.getReadBases();
byte[] quals = rec.getBaseQualities();
if (cigar == null || cigar.isEmpty()) {
SAMUtils.makeReadUnmapped(rec);
sfw.addAlignment(rec);
continue;
}
int readpos = 0;
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
if (op.consumesReadBases()) {
if (op.consumesReferenceBases()) {
for (int x = 0; x < ce.getLength(); ++x) {
if (bases != null)
bases[readpos + x] = RESET_CHAR;
if (quals != null)
quals[readpos + x] = RESET_QUAL;
++nBasesMasked;
}
}
readpos += ce.getLength();
}
}
++nRecordMasked;
SAMUtils.makeReadUnmapped(rec);
sfw.addAlignment(rec);
}
iter.close();
sfw.close();
progress.finish();
LOG.info("done : reads masked " + nRecordMasked + "/" + nRecords + " Bases masked:" + nBasesMasked + "/" + nBases);
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(sfr);
CloserUtil.close(sfw);
}
}
use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.
the class Sam2Tsv method printAln.
private void printAln(final Row row) {
final SAMRecord rec = row.rec;
if (rec == null)
return;
Cigar cigar = rec.getCigar();
if (cigar == null)
return;
row.readbases = rec.getReadBases();
row.readQuals = rec.getBaseQualities();
if (row.readbases == null) {
row.op = null;
row.refPos = -1;
row.readPos = -1;
writeAln(row);
return;
}
if (rec.getReadUnmappedFlag()) {
row.op = null;
row.refPos = -1;
for (int i = 0; i < row.readbases.length; ++i) {
row.readPos = i;
writeAln(row);
}
return;
}
// fix hard clipped reads
StringBuilder fixReadBases = new StringBuilder(row.readbases.length);
StringBuilder fixReadQuals = new StringBuilder(row.readbases.length);
int readIndex = 0;
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
for (int i = 0; i < ce.getLength(); ++i) {
if (op.equals(CigarOperator.H)) {
fixReadBases.append('*');
fixReadQuals.append('*');
} else if (!op.consumesReadBases()) {
break;
} else {
fixReadBases.append((char) row.readbases[readIndex]);
fixReadQuals.append(row.readQuals == null || row.readQuals.length <= readIndex ? '*' : (char) row.readQuals[readIndex]);
readIndex++;
}
}
}
row.readbases = fixReadBases.toString().getBytes();
row.readQuals = fixReadQuals.toString().getBytes();
if (this.indexedFastaSequenceFile != null) {
if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(rec.getReferenceName())) {
this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, rec.getReferenceName());
}
}
readIndex = 0;
int refIndex = rec.getUnclippedStart();
for (final CigarElement e : cigar.getCigarElements()) {
row.op = e.getOperator();
switch(e.getOperator()) {
case S:
case // length of read has been fixed previously, so same as 'S'
H:
{
for (int i = 0; i < e.getLength(); ++i) {
row.readPos = readIndex;
row.refPos = refIndex;
writeAln(row);
readIndex++;
// because we used getUnclippedStart
refIndex++;
}
break;
}
case P:
{
row.refPos = -1;
row.readPos = -1;
for (int i = 0; i < e.getLength(); ++i) {
writeAln(row);
}
break;
}
case I:
{
row.refPos = -1;
for (int i = 0; i < e.getLength(); ++i) {
row.readPos = readIndex;
writeAln(row);
readIndex++;
}
break;
}
// cont. -- reference skip
case N:
case D:
{
row.readPos = -1;
for (int i = 0; i < e.getLength(); ++i) {
row.refPos = refIndex;
writeAln(row);
refIndex++;
}
break;
}
case M:
case EQ:
case X:
{
for (int i = 0; i < e.getLength(); ++i) {
row.refPos = refIndex;
row.readPos = readIndex;
writeAln(row);
refIndex++;
readIndex++;
}
break;
}
default:
throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
}
}
if (printAlignment) {
final int len = Math.max(rec.getReadNameLength(), rec.getReferenceName().length()) + 2;
this.out.printf(":%" + len + "s %8d %s %-8d\n", rec.getReferenceName(), rec.getUnclippedStart(), L3.toString(), rec.getUnclippedEnd());
this.out.printf(":%" + len + "s %8s %s\n", "", "", L2.toString());
this.out.printf(":%" + len + "s %8d %s %-8d\n", rec.getReadName(), 1, L1.toString(), rec.getReadLength());
L1.setLength(0);
L2.setLength(0);
L3.setLength(0);
}
}
use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.
the class SAM4WebLogo method printRead.
private void printRead(final PrintWriter out, final SAMRecord rec, final Interval interval) {
final Cigar cigar = rec.getCigar();
final Function<Integer, Character> read2base = IDX -> {
final byte[] bases = rec.getReadBases();
if (SAMRecord.NULL_SEQUENCE.equals(bases))
return '?';
if (IDX < 0 || IDX >= bases.length)
return '?';
return (char) bases[IDX];
};
final Predicate<Integer> inInterval = IDX -> IDX >= interval.getStart() && IDX <= interval.getEnd();
final StringBuilder seq = new StringBuilder(interval.length());
int refPos = Math.min(interval.getStart(), rec.getUnclippedStart());
while (refPos < rec.getUnclippedStart()) {
if (inInterval.test(refPos)) {
seq.append('-');
}
++refPos;
}
int readPos = 0;
for (int i = 0; i < cigar.numCigarElements(); ++i) {
final CigarElement ce = cigar.getCigarElement(i);
final CigarOperator op = ce.getOperator();
switch(op) {
case P:
break;
case I:
{
readPos += ce.getLength();
break;
}
case D:
case N:
{
for (int j = 0; j < ce.getLength() && refPos <= interval.getEnd(); ++j) {
if (inInterval.test(refPos)) {
seq.append('-');
}
refPos++;
}
break;
}
case H:
{
for (int j = 0; j < ce.getLength() && refPos <= interval.getEnd(); ++j) {
if (inInterval.test(refPos)) {
seq.append(useClip ? 'n' : '-');
}
refPos++;
}
break;
}
case S:
{
for (int j = 0; j < ce.getLength() && refPos <= interval.getEnd(); ++j) {
if (inInterval.test(refPos)) {
if (useClip) {
seq.append(Character.toLowerCase(read2base.apply(readPos)));
} else {
seq.append('-');
}
}
readPos++;
refPos++;
}
break;
}
case M:
case X:
case EQ:
{
for (int j = 0; j < ce.getLength() && refPos <= interval.getEnd(); ++j) {
if (inInterval.test(refPos)) {
seq.append(read2base.apply(readPos));
}
readPos++;
refPos++;
}
break;
}
default:
throw new IllegalStateException("Not handled. op:" + op);
}
}
while (refPos <= interval.getEnd()) {
seq.append('-');
refPos++;
}
out.print(">" + rec.getReadName());
if (rec.getReadPairedFlag()) {
if (rec.getFirstOfPairFlag())
out.print("/1");
if (rec.getSecondOfPairFlag())
out.print("/2");
}
out.println();
out.println(seq);
}
use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.
the class SamFixCigar method doWork.
@Override
public int doWork(List<String> args) {
if (this.faidx == null) {
LOG.error("Reference was not specified.");
return -1;
}
GenomicSequence genomicSequence = null;
SamReader sfr = null;
SAMFileWriter sfw = null;
try {
this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(faidx);
sfr = openSamReader(oneFileOrNull(args));
final SAMFileHeader header = sfr.getFileHeader();
sfw = this.writingBamArgs.setReferenceFile(this.faidx).openSAMFileWriter(outputFile, header, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
final List<CigarElement> newCigar = new ArrayList<CigarElement>();
final SAMRecordIterator iter = sfr.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
Cigar cigar = rec.getCigar();
byte[] bases = rec.getReadBases();
if (rec.getReadUnmappedFlag() || cigar == null || cigar.getCigarElements().isEmpty() || bases == null) {
sfw.addAlignment(rec);
continue;
}
if (genomicSequence == null || genomicSequence.getSAMSequenceRecord().getSequenceIndex() != rec.getReferenceIndex()) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
newCigar.clear();
int refPos1 = rec.getAlignmentStart();
int readPos0 = 0;
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
if (op.equals(CigarOperator.M)) {
for (int i = 0; i < ce.getLength(); ++i) {
char c1 = Character.toUpperCase((char) bases[readPos0]);
char c2 = Character.toUpperCase(refPos1 - 1 < genomicSequence.length() ? genomicSequence.charAt(refPos1 - 1) : '*');
if (c2 == 'N' || c1 == c2) {
newCigar.add(new CigarElement(1, CigarOperator.EQ));
} else {
newCigar.add(new CigarElement(1, CigarOperator.X));
}
refPos1++;
readPos0++;
}
} else {
newCigar.add(ce);
if (op.consumesReadBases())
readPos0 += ce.getLength();
if (op.consumesReferenceBases())
refPos1 += ce.getLength();
}
}
int i = 0;
while (i < newCigar.size()) {
final CigarOperator op1 = newCigar.get(i).getOperator();
final int length1 = newCigar.get(i).getLength();
if (i + 1 < newCigar.size() && newCigar.get(i + 1).getOperator() == op1) {
final CigarOperator op2 = newCigar.get(i + 1).getOperator();
int length2 = newCigar.get(i + 1).getLength();
newCigar.set(i, new CigarElement(length1 + length2, op2));
newCigar.remove(i + 1);
} else {
++i;
}
}
cigar = new Cigar(newCigar);
// info("changed "+rec.getCigarString()+" to "+newCigarStr+" "+rec.getReadName()+" "+rec.getReadString());
rec.setCigar(cigar);
sfw.addAlignment(rec);
}
progress.finish();
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(this.indexedFastaSequenceFile);
CloserUtil.close(sfr);
CloserUtil.close(sfw);
}
}
use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.
the class GcPercentAndDepth method doWork.
@Override
public int doWork(final List<String> args) {
if (this.windowSize <= 0) {
LOG.error("Bad window size.");
return -1;
}
if (this.windowStep <= 0) {
LOG.error("Bad window step.");
return -1;
}
if (this.refFile == null) {
LOG.error("Undefined REF File");
return -1;
}
if (args.isEmpty()) {
LOG.error("Illegal Number of arguments.");
return -1;
}
ReferenceGenome indexedFastaSequenceFile = null;
List<SamReader> readers = new ArrayList<SamReader>();
PrintWriter out = null;
try {
LOG.info("Loading " + this.refFile);
indexedFastaSequenceFile = new ReferenceGenomeFactory().openFastaFile(this.refFile);
this.samSequenceDictionary = indexedFastaSequenceFile.getDictionary();
if (this.samSequenceDictionary == null) {
LOG.error("Cannot get sequence dictionary for " + this.refFile);
return -1;
}
out = super.openFileOrStdoutAsPrintWriter(outPutFile);
Set<String> all_samples = new TreeSet<String>();
/* create input, collect sample names */
for (int optind = 0; optind < args.size(); ++optind) {
LOG.info("Opening " + args.get(optind));
final SamReader samFileReaderScan = super.openSamReader(args.get(optind));
readers.add(samFileReaderScan);
final SAMFileHeader header = samFileReaderScan.getFileHeader();
if (!SequenceUtil.areSequenceDictionariesEqual(this.samSequenceDictionary, header.getSequenceDictionary())) {
LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(this.samSequenceDictionary, header.getSequenceDictionary()));
return -1;
}
for (final SAMReadGroupRecord g : header.getReadGroups()) {
final String sample = this.partition.apply(g);
if (StringUtil.isBlank(sample)) {
LOG.warning("Read group " + g.getId() + " has no sample in merged dictionary");
continue;
}
all_samples.add(sample);
}
}
LOG.info("N " + this.partition.name() + "=" + all_samples.size());
/* print header */
out.print("#");
if (!this.hide_genomic_index) {
out.print("id");
out.print("\t");
}
out.print("chrom");
out.print("\t");
out.print("start");
out.print("\t");
out.print("end");
out.print("\t");
out.print("GCPercent");
for (final String sample : all_samples) {
out.print("\t");
out.print(sample);
}
out.println();
final List<RegionCaptured> regionsCaptured = new ArrayList<RegionCaptured>();
if (bedFile != null) {
LOG.info("Reading BED:" + bedFile);
final BedLineCodec bedLineCodec = new BedLineCodec();
BufferedReader r = IOUtils.openFileForBufferedReading(bedFile);
r.lines().filter(L -> !L.startsWith("#")).filter(L -> !StringUtil.isBlank(L)).map(L -> bedLineCodec.decode(L)).filter(B -> B != null).forEach(B -> {
final SAMSequenceRecord ssr = this.samSequenceDictionary.getSequence(B.getContig());
if (ssr == null) {
LOG.warning("Cannot resolve " + B.getContig());
return;
}
final RegionCaptured roi = new RegionCaptured(ssr, B.getStart() - 1, B.getEnd());
regionsCaptured.add(roi);
});
CloserUtil.close(r);
LOG.info("end Reading BED:" + bedFile);
Collections.sort(regionsCaptured);
} else {
LOG.info("No capture, peeking everything");
for (final SAMSequenceRecord ssr : this.samSequenceDictionary.getSequences()) {
final RegionCaptured roi = new RegionCaptured(ssr, 0, ssr.getSequenceLength());
regionsCaptured.add(roi);
}
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.samSequenceDictionary).logger(LOG);
ReferenceContig genomicSequence = null;
for (final RegionCaptured roi : regionsCaptured) {
if (genomicSequence == null || !genomicSequence.hasName(roi.getContig())) {
genomicSequence = indexedFastaSequenceFile.getContig(roi.getContig());
if (genomicSequence == null) {
LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(roi.getContig(), this.samSequenceDictionary));
return -1;
}
}
Map<String, int[]> sample2depth = new HashMap<String, int[]>();
Map<String, Double> sample2meanDepth = new HashMap<String, Double>();
for (final String sample : all_samples) {
int[] depth = new int[roi.length()];
Arrays.fill(depth, 0);
sample2depth.put(sample, depth);
}
List<CloseableIterator<SAMRecord>> iterators = new ArrayList<CloseableIterator<SAMRecord>>();
for (final SamReader r : readers) {
iterators.add(r.query(roi.getContig(), roi.getStart(), roi.getEnd(), false));
}
final MergingIterator<SAMRecord> merginIter = new MergingIterator<>(new SAMRecordCoordinateComparator(), iterators);
while (merginIter.hasNext()) {
final SAMRecord rec = merginIter.next();
if (rec.getReadUnmappedFlag())
continue;
if (this.filter.filterOut(rec))
continue;
final String sample = this.partition.getPartion(rec, null);
if (sample == null)
continue;
final int[] depth = sample2depth.get(sample);
if (depth == null)
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null)
continue;
int refpos1 = rec.getAlignmentStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
if (!op.consumesReferenceBases())
continue;
if (op.consumesReadBases()) {
for (int i = 0; i < ce.getLength(); ++i) {
if (refpos1 + i < roi.getStart())
continue;
if (refpos1 + i > roi.getEnd())
break;
depth[refpos1 + i - roi.getStart()]++;
}
}
refpos1 += ce.getLength();
}
}
merginIter.close();
for (final RegionCaptured.SlidingWindow win : roi) {
double total = 0f;
int countN = 0;
for (int pos1 = win.getStart(); pos1 <= win.getEnd(); ++pos1) {
switch(genomicSequence.charAt(pos1 - 1)) {
case 'c':
case 'C':
case 'g':
case 'G':
case 's':
case 'S':
{
total++;
break;
}
case 'n':
case 'N':
countN++;
break;
default:
break;
}
}
if (skip_if_contains_N && countN > 0)
continue;
double GCPercent = total / (double) win.length();
int max_depth_for_win = 0;
sample2meanDepth.clear();
for (final String sample : all_samples) {
int[] depth = sample2depth.get(sample);
double sum = 0;
for (int pos = win.getStart(); pos < win.getEnd() && (pos - roi.getStart()) < depth.length; ++pos) {
sum += depth[pos - roi.getStart()];
}
double mean = (sum / (double) depth.length);
max_depth_for_win = Math.max(max_depth_for_win, (int) mean);
sample2meanDepth.put(sample, mean);
}
if (max_depth_for_win < this.min_depth)
continue;
if (!this.hide_genomic_index) {
out.print(win.getGenomicIndex());
out.print("\t");
}
out.print(win.getContig());
out.print("\t");
out.print(win.getStart() - 1);
out.print("\t");
out.print(win.getEnd());
out.print("\t");
out.printf("%.2f", GCPercent);
for (String sample : all_samples) {
out.print("\t");
out.printf("%.2f", (double) sample2meanDepth.get(sample));
}
out.println();
}
}
progress.finish();
out.flush();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
for (SamReader r : readers) CloserUtil.close(r);
CloserUtil.close(indexedFastaSequenceFile);
CloserUtil.close(out);
}
}
Aggregations