use of htsjdk.samtools.CigarElement in project jvarkit by lindenb.
the class SamFindClippedRegions method doWork.
/*private static boolean closeTo(int pos1,int pos2, int max)
{
return Math.abs(pos2-pos1)<=max;
}*/
/*
private static boolean same(char c1,char c2)
{
if(c1=='N' || c2=='N') return false;
return Character.toUpperCase(c1)==Character.toUpperCase(c2);
}*/
@Override
public int doWork(List<String> args) {
int readLength = 150;
if (args.isEmpty()) {
LOG.error("illegal.number.of.arguments");
return -1;
}
List<Input> inputs = new ArrayList<Input>();
VariantContextWriter w = null;
// SAMFileWriter w=null;
try {
SAMSequenceDictionary dict = null;
/* create input, collect sample names */
Map<String, Input> sample2input = new HashMap<String, Input>();
for (final String filename : args) {
Input input = new Input(new File(filename));
// input.index=inputs.size();
inputs.add(input);
if (sample2input.containsKey(input.sampleName)) {
LOG.error("Duplicate sample " + input.sampleName + " in " + input.bamFile + " and " + sample2input.get(input.sampleName).bamFile);
return -1;
}
sample2input.put(input.sampleName, input);
if (dict == null) {
dict = input.header.getSequenceDictionary();
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, input.header.getSequenceDictionary())) {
LOG.error("Found more than one dictint sequence dictionary");
return -1;
}
}
LOG.info("Sample N= " + sample2input.size());
/* create merged iterator */
List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(sample2input.size());
for (Input input : inputs) headers.add(input.header);
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SortOrder.coordinate, headers, true);
List<SamReader> readers = new ArrayList<SamReader>(sample2input.size());
for (Input input : inputs) readers.add(input.samFileReaderScan);
MergingSamRecordIterator merginIter = new MergingSamRecordIterator(headerMerger, readers, true);
Allele reference_allele = Allele.create("N", true);
Allele[] alternate_alleles = new Allele[] { Allele.create("<CLIP5>", false), Allele.create("<CLIP3>", false) };
Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
for (Allele alt : alternate_alleles) {
vcfHeaderLines.add(new VCFSimpleHeaderLine("<ID=" + alt.getDisplayString() + ",Description=\"StructVar\">", VCFHeaderVersion.VCF4_1, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")));
}
vcfHeaderLines.add(new VCFInfoHeaderLine("COUNT_SAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples with depth>=" + this.min_depth));
vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
for (int side = 0; side < 2; ++side) {
vcfHeaderLines.add(new VCFFormatHeaderLine("CN" + (side == 0 ? 5 : 3), 1, VCFHeaderLineType.Integer, "count clipped in " + (side == 0 ? 5 : 3) + "'"));
}
if (dict != null) {
vcfHeaderLines.addAll(VCFUtils.samSequenceDictToVCFContigHeaderLine(dict));
}
VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, sample2input.keySet());
w = VCFUtils.createVariantContextWriterToStdout();
w.writeHeader(vcfHeader);
final IntervalTreeMap<Boolean> intervals = new IntervalTreeMap<>();
// w=swf.make(header, System.out);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
if (bedFile != null) {
final BedLineCodec bedLineCodec = new BedLineCodec();
LOG.info("Reading " + bedFile);
BufferedReader r = IOUtils.openFileForBufferedReading(bedFile);
String line;
while ((line = r.readLine()) != null) {
BedLine bedLine = bedLineCodec.decode(line);
if (bedLine == null)
continue;
if (dict != null && dict.getSequence(bedLine.getContig()) == null) {
LOG.warning("undefined chromosome in " + bedFile + " " + line);
continue;
}
intervals.put(bedLine.toInterval(), true);
}
CloserUtil.close(r);
}
LinkedList<SAMRecord> buffer = new LinkedList<SAMRecord>();
final Predicate<SAMRecord> filterSamRecords = new Predicate<SAMRecord>() {
@Override
public boolean test(SAMRecord rec) {
if (rec.getReadUnmappedFlag())
return false;
if (rec.isSecondaryOrSupplementary())
return false;
if (rec.getDuplicateReadFlag())
return false;
if (rec.getReadFailsVendorQualityCheckFlag())
return false;
Cigar cigar = rec.getCigar();
if (cigar == null || cigar.numCigarElements() < 2)
return false;
boolean found_S = false;
for (int side = 0; side < 2; ++side) {
CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
// read must be clipped on 5' or 3' with a good length
if (!ce.getOperator().equals(CigarOperator.S))
continue;
found_S = true;
break;
}
if (!found_S)
return false;
SAMReadGroupRecord g = rec.getReadGroup();
if (g == null || g.getSample() == null || g.getSample().isEmpty())
return false;
return true;
}
};
final FilteringIterator<SAMRecord> forwardIterator = new FilteringIterator<SAMRecord>(merginIter, filterSamRecords);
for (; ; ) {
SAMRecord rec = null;
if (forwardIterator.hasNext()) {
rec = forwardIterator.next();
progress.watch(rec);
if (intervals != null && !intervals.containsOverlapping(new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd())))
continue;
}
// need to flush buffer ?
if (rec == null || (!buffer.isEmpty() && !buffer.getLast().getReferenceIndex().equals(rec.getReferenceIndex())) || (!buffer.isEmpty() && buffer.getLast().getUnclippedEnd() + readLength < rec.getUnclippedStart())) {
if (!buffer.isEmpty()) {
int chromStart = buffer.getFirst().getUnclippedStart();
int chromEnd = buffer.getFirst().getUnclippedEnd();
for (SAMRecord sam : buffer) {
chromStart = Math.min(chromStart, sam.getUnclippedStart());
chromEnd = Math.max(chromEnd, sam.getUnclippedEnd());
}
final int winShift = 5;
for (int pos = chromStart; pos + winShift <= chromEnd; pos += winShift) {
int[] count_big_clip = new int[] { 0, 0 };
// int max_depth[]=new int[]{0,0};
List<Genotype> genotypes = new ArrayList<Genotype>();
Set<Allele> all_alleles = new HashSet<Allele>();
all_alleles.add(reference_allele);
boolean found_one_depth_ok = false;
int sum_depth = 0;
int samples_with_high_depth = 0;
for (String sample : sample2input.keySet()) {
GenotypeBuilder gb = new GenotypeBuilder(sample);
int[] count_clipped = new int[] { 0, 0 };
Set<Allele> sample_alleles = new HashSet<Allele>(3);
for (int side = 0; side < 2; ++side) {
for (SAMRecord sam : buffer) {
if (!sam.getReadGroup().getSample().equals(sample))
continue;
Cigar cigar = sam.getCigar();
CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
if (!ce.getOperator().equals(CigarOperator.S))
continue;
int clipStart = (side == 0 ? sam.getUnclippedStart() : sam.getAlignmentEnd() + 1);
int clipEnd = (side == 0 ? sam.getAlignmentStart() - 1 : sam.getUnclippedEnd());
if ((pos + winShift < clipStart || pos > clipEnd))
continue;
count_clipped[side]++;
if (ce.getLength() >= this.min_clip_length) {
count_big_clip[side]++;
}
sample_alleles.add(alternate_alleles[side]);
gb.attribute("CN" + (side == 0 ? 5 : 3), count_clipped[side]);
}
}
// if(!(found_one_big_clip[0] || found_one_big_clip[1])) continue;
if (count_clipped[0] + count_clipped[1] == 0)
continue;
if ((count_clipped[0] + count_clipped[1]) > min_depth) {
found_one_depth_ok = true;
++samples_with_high_depth;
}
sum_depth += (count_clipped[0] + count_clipped[1]);
gb.alleles(new ArrayList<Allele>(sample_alleles));
all_alleles.addAll(sample_alleles);
gb.DP(count_clipped[0] + count_clipped[1]);
genotypes.add(gb.make());
}
if (all_alleles.size() == 1) {
// all homozygotes
continue;
}
if (!found_one_depth_ok) {
continue;
}
if (!(count_big_clip[0] >= 1 || count_big_clip[1] >= 1)) {
continue;
}
Map<String, Object> atts = new HashMap<String, Object>();
atts.put("COUNT_SAMPLES", samples_with_high_depth);
atts.put(VCFConstants.DEPTH_KEY, sum_depth);
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(buffer.getFirst().getReferenceName());
vcb.start(pos);
vcb.stop(pos + winShift);
vcb.alleles(all_alleles);
vcb.attributes(atts);
vcb.genotypes(genotypes);
w.add(vcb.make());
}
buffer.clear();
}
if (rec == null) {
break;
}
}
buffer.add(rec);
}
merginIter.close();
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
for (Input input : inputs) {
CloserUtil.close(input);
}
}
}
use of htsjdk.samtools.CigarElement in project jvarkit by lindenb.
the class ReadClipper method clip.
public SAMRecord clip(final SAMRecord rec, final Interval fragment) {
if (rec.getReadUnmappedFlag()) {
return rec;
}
if (!fragment.getContig().equals(rec.getContig())) {
return rec;
}
if (rec.getAlignmentEnd() < fragment.getStart()) {
return rec;
}
if (rec.getAlignmentStart() > fragment.getEnd()) {
return rec;
}
Cigar cigar = rec.getCigar();
if (cigar == null) {
LOG.warning("cigar missing in " + rec);
return rec;
}
final List<BaseOp> bases = new ArrayList<>(cigar.getCigarElements().stream().mapToInt(C -> C.getLength()).sum());
// expand cigar
int refPos = rec.getUnclippedStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
if (op.equals(CigarOperator.P))
continue;
for (int x = 0; x < ce.getLength(); ++x) {
bases.add(new BaseOp(op, op.consumesReferenceBases() || op.isClipping() ? refPos : -1));
if (op.consumesReferenceBases() || op.isClipping()) {
refPos++;
}
}
}
/* 5' side */
int newStart = rec.getAlignmentStart();
int x = 0;
while (x < bases.size()) {
final BaseOp b = bases.get(x);
if (b.refPos != -1 && b.isMatch() && b.refPos >= fragment.getStart()) {
newStart = b.refPos;
break;
} else if (b.isDeletion()) {
bases.remove(x);
continue;
} else if (!b.op.isClipping()) {
b.op = CigarOperator.S;
++x;
} else {
++x;
}
}
/* 3' side */
x = bases.size() - 1;
while (x >= 0) {
final BaseOp b = bases.get(x);
if (b.refPos != -1 && b.isMatch() && b.refPos <= fragment.getEnd()) {
break;
} else if (b.isDeletion()) {
bases.remove(x);
--x;
continue;
} else if (!b.op.isClipping()) {
b.op = CigarOperator.S;
--x;
} else {
--x;
}
}
// build new cigar
boolean found_M = false;
final List<CigarElement> newcigarlist = new ArrayList<>();
x = 0;
while (x < bases.size()) {
final CigarOperator op = bases.get(x).op;
if (op.equals(CigarOperator.M) || op.equals(CigarOperator.EQ)) {
found_M = true;
}
int x2 = x;
while (x2 < bases.size() && op.equals(bases.get(x2).op)) {
++x2;
}
newcigarlist.add(new CigarElement(x2 - x, op));
x = x2;
}
if (!found_M) {
SAMUtils.makeReadUnmappedWithOriginalTags(rec);
if (this.programGroup != null) {
rec.setAttribute("PG", programGroup);
}
return rec;
}
cigar = new Cigar(newcigarlist);
rec.setCigar(cigar);
rec.setAlignmentStart(newStart);
if (this.programGroup != null) {
rec.setAttribute("PG", programGroup);
}
return rec;
}
use of htsjdk.samtools.CigarElement in project jvarkit by lindenb.
the class SamSlop method doWork.
@Override
public int doWork(List<String> args) {
if (this.faidx == null) {
LOG.error("Reference was not specified.");
return -1;
}
if (this.defaultQual.length() != 1) {
LOG.error("default quality should have length==1 " + this.defaultQual);
}
GenomicSequence genomicSequence = null;
SamReader sfr = null;
SAMFileWriter sfw = null;
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
final char defaultQUAL = this.defaultQual.charAt(0);
try {
final String inputName = oneFileOrNull(args);
LOG.info("Loading reference");
indexedFastaSequenceFile = new IndexedFastaSequenceFile(faidx);
sfr = openSamReader(inputName);
final SAMFileHeader header = sfr.getFileHeader();
header.setSortOrder(SortOrder.unsorted);
sfw = writingBamArgs.openSAMFileWriter(outputFile, header, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
final SAMRecordIterator iter = sfr.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
final Cigar cigar = rec.getCigar();
if (rec.getReadUnmappedFlag() || cigar == null || cigar.isEmpty() || rec.getReadBases() == SAMRecord.NULL_SEQUENCE || (this.extend5 <= 0 && this.extend3 <= 0)) {
sfw.addAlignment(rec);
continue;
}
final StringBuilder sbs = new StringBuilder(rec.getReadString());
final StringBuilder sbq = new StringBuilder(rec.getBaseQualityString());
if (genomicSequence == null || genomicSequence.getSAMSequenceRecord().getSequenceIndex() != rec.getReferenceIndex()) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
final int refPos1 = (this.removeClip ? rec.getAlignmentStart() : rec.getUnclippedStart());
final int endAlignmend1 = (this.removeClip ? rec.getAlignmentEnd() : rec.getUnclippedEnd());
final List<CigarElement> cl = new ArrayList<>(cigar.getCigarElements());
if (!this.removeClip) {
// replace clip S/H by match M
for (int i = 0; i < cl.size(); ++i) {
final CigarElement ce = cl.get(i);
if (ce.getOperator() == CigarOperator.S || ce.getOperator() == CigarOperator.H) {
cl.set(i, new CigarElement(ce.getLength(), CigarOperator.M));
}
}
}
if (this.extend5 > 0 && refPos1 > 1) {
if (this.removeClip) {
// /remove hard + soft clip 5'
while (!cl.isEmpty()) {
// first
final CigarElement ce = cl.get(0);
// not a clip
if (!(ce.getOperator() == CigarOperator.S || ce.getOperator() == CigarOperator.H)) {
break;
}
if (ce.getOperator() == CigarOperator.S) {
sbs.replace(0, ce.getLength(), "");
sbq.replace(0, ce.getLength(), "");
}
// remove first
cl.remove(0);
}
}
final StringBuilder prefix = new StringBuilder(this.extend5);
// /append + soft clip 5'
for (int i = 0; i < this.extend5; ++i) {
int x1 = (refPos1 - 1) - i;
// break if out of genome
if (x1 < 1)
break;
prefix.insert(0, genomicSequence.charAt(x1 - 1));
}
sbs.insert(0, prefix.toString());
// preprend quality
for (int i = 0; i < prefix.length(); ++i) sbq.insert(0, defaultQUAL);
// prepend cigar
cl.add(0, new CigarElement(prefix.length(), CigarOperator.M));
// update start pos
rec.setAlignmentStart(refPos1 - prefix.length());
}
if (this.extend3 > 0 && rec.getAlignmentEnd() < genomicSequence.length()) {
if (this.removeClip) {
// /remove hard + soft clip 3'
while (!cl.isEmpty()) {
// last
final CigarElement ce = cl.get(cl.size() - 1);
// not a clip
if (!(ce.getOperator() == CigarOperator.S || ce.getOperator() == CigarOperator.H)) {
break;
}
if (ce.getOperator() == CigarOperator.S) {
sbs.setLength(sbs.length() - ce.getLength());
sbq.setLength(sbq.length() - ce.getLength());
}
// remove last
cl.remove(cl.size() - 1);
}
}
int extend = 0;
for (int pos1 = endAlignmend1 + 1; pos1 <= (endAlignmend1 + this.extend3) && pos1 <= genomicSequence.length(); ++pos1) {
sbs.append(genomicSequence.charAt(pos1 - 1));
sbq.append(defaultQUAL);
++extend;
}
// append cigar
cl.add(new CigarElement(extend, CigarOperator.M));
}
// simplify cigar
int idx = 0;
while (idx + 1 < cl.size()) {
final CigarElement ce1 = cl.get(idx);
final CigarElement ce2 = cl.get(idx + 1);
if (ce1.getOperator() == ce2.getOperator()) {
cl.set(idx, new CigarElement(ce1.getLength() + ce2.getLength(), ce1.getOperator()));
cl.remove(idx + 1);
} else {
idx++;
}
}
rec.setCigar(new Cigar(cl));
rec.setReadString(sbs.toString());
rec.setBaseQualityString(sbq.toString());
final List<SAMValidationError> errors = rec.isValid();
if (errors != null && !errors.isEmpty()) {
for (SAMValidationError err : errors) {
LOG.error(err.getMessage());
}
}
// info("changed "+rec.getCigarString()+" to "+newCigarStr+" "+rec.getReadName()+" "+rec.getReadString());
sfw.addAlignment(rec);
}
progress.finish();
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(indexedFastaSequenceFile);
CloserUtil.close(sfr);
CloserUtil.close(sfw);
}
}
use of htsjdk.samtools.CigarElement in project gatk-protected by broadinstitute.
the class ReadThreadingGraph method mergeDanglingHead.
/**
* Actually merge the dangling head if possible
*
* @param danglingHeadMergeResult the result from generating a Cigar for the dangling head against the reference
* @return 1 if merge was successful, 0 otherwise
*/
@VisibleForTesting
int mergeDanglingHead(final DanglingChainMergeHelper danglingHeadMergeResult) {
final List<CigarElement> elements = danglingHeadMergeResult.cigar.getCigarElements();
final CigarElement firstElement = elements.get(0);
Utils.validateArg(firstElement.getOperator() == CigarOperator.M, "The first Cigar element must be an M");
final int indexesToMerge = bestPrefixMatch(danglingHeadMergeResult.referencePathString, danglingHeadMergeResult.danglingPathString, firstElement.getLength());
if (indexesToMerge <= 0) {
return 0;
}
// we can't push back the reference path
if (indexesToMerge >= danglingHeadMergeResult.referencePath.size() - 1) {
return 0;
}
// but we can manipulate the dangling path if we need to
if (indexesToMerge >= danglingHeadMergeResult.danglingPath.size() && !extendDanglingPathAgainstReference(danglingHeadMergeResult, indexesToMerge - danglingHeadMergeResult.danglingPath.size() + 2)) {
return 0;
}
addEdge(danglingHeadMergeResult.referencePath.get(indexesToMerge + 1), danglingHeadMergeResult.danglingPath.get(indexesToMerge), ((MyEdgeFactory) getEdgeFactory()).createEdge(false, 1));
return 1;
}
use of htsjdk.samtools.CigarElement in project gatk by broadinstitute.
the class EventMap method processCigarForInitialEvents.
protected void processCigarForInitialEvents() {
final Cigar cigar = haplotype.getCigar();
final byte[] alignment = haplotype.getBases();
int refPos = haplotype.getAlignmentStartHapwrtRef();
if (refPos < 0) {
return;
}
// Protection against SW failures
final List<VariantContext> proposedEvents = new ArrayList<>();
int alignmentPos = 0;
for (int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++) {
final CigarElement ce = cigar.getCigarElement(cigarIndex);
final int elementLength = ce.getLength();
switch(ce.getOperator()) {
case I:
{
if (refPos > 0) {
// protect against trying to create insertions/deletions at the beginning of a contig
final List<Allele> insertionAlleles = new ArrayList<>();
final int insertionStart = refLoc.getStart() + refPos - 1;
final byte refByte = ref[refPos - 1];
if (BaseUtils.isRegularBase(refByte)) {
insertionAlleles.add(Allele.create(refByte, true));
}
if (cigarIndex == 0 || cigarIndex == cigar.numCigarElements() - 1) {
// if the insertion isn't completely resolved in the haplotype, skip it
// note this used to emit SYMBOLIC_UNASSEMBLED_EVENT_ALLELE but that seems dangerous
} else {
byte[] insertionBases = {};
// add the padding base
insertionBases = ArrayUtils.add(insertionBases, ref[refPos - 1]);
insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange(alignment, alignmentPos, alignmentPos + elementLength));
if (BaseUtils.isAllRegularBases(insertionBases)) {
insertionAlleles.add(Allele.create(insertionBases, false));
}
}
if (insertionAlleles.size() == 2) {
// found a proper ref and alt allele
proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
}
}
alignmentPos += elementLength;
break;
}
case S:
{
alignmentPos += elementLength;
break;
}
case D:
{
if (refPos > 0) {
// protect against trying to create insertions/deletions at the beginning of a contig
// add padding base
final byte[] deletionBases = Arrays.copyOfRange(ref, refPos - 1, refPos + elementLength);
final List<Allele> deletionAlleles = new ArrayList<>();
final int deletionStart = refLoc.getStart() + refPos - 1;
final byte refByte = ref[refPos - 1];
if (BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases)) {
deletionAlleles.add(Allele.create(deletionBases, true));
deletionAlleles.add(Allele.create(refByte, false));
proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
}
}
refPos += elementLength;
break;
}
case M:
case EQ:
case X:
{
for (int iii = 0; iii < elementLength; iii++) {
final byte refByte = ref[refPos];
final byte altByte = alignment[alignmentPos];
if (refByte != altByte) {
// SNP!
if (BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte)) {
final List<Allele> snpAlleles = new ArrayList<>();
snpAlleles.add(Allele.create(refByte, true));
snpAlleles.add(Allele.create(altByte, false));
proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make());
}
}
refPos++;
alignmentPos++;
}
break;
}
case N:
case H:
case P:
default:
throw new GATKException("Unsupported cigar operator created during SW alignment: " + ce.getOperator());
}
}
for (final VariantContext proposedEvent : proposedEvents) addVC(proposedEvent, true);
}
Aggregations