use of com.github.lindenb.jvarkit.util.picard.OtherCanonicalAlign in project jvarkit by lindenb.
the class BWAMemDigest method doWork.
@Override
public int doWork(List<String> args) {
IntervalTreeMap<Boolean> ignore = null;
DefaultMemOuput output = new DefaultMemOuput();
final float limitcigar = 0.15f;
SamReader r = null;
try {
r = super.openSamReader(oneFileOrNull(args));
SAMFileHeader header = r.getFileHeader();
if (IGNORE_BED != null) {
LOG.info("open " + IGNORE_BED);
ignore = new IntervalTreeMap<>();
BufferedReader in = IOUtils.openFileForBufferedReading(IGNORE_BED);
String line;
while ((line = in.readLine()) != null) {
if (line.isEmpty() || line.startsWith("#"))
continue;
String[] tokens = line.split("[\t]");
if (tokens.length < 3)
continue;
if (ignore.put(new Interval(tokens[0], Math.max(1, Integer.parseInt(tokens[1]) - (1 + IGNORE_EXTEND)), Integer.parseInt(tokens[2]) + IGNORE_EXTEND), Boolean.TRUE)) {
LOG.warn("BED:ignoring " + line);
}
}
in.close();
}
OtherCanonicalAlignFactory xPalignFactory = new OtherCanonicalAlignFactory(header);
SAMRecordIterator iter = r.iterator();
long readNum = 0L;
while (iter.hasNext()) {
SAMRecord record = iter.next();
++readNum;
if (!record.getReadPairedFlag())
continue;
if (record.getProperPairFlag())
continue;
if (record.getReadFailsVendorQualityCheckFlag())
continue;
if (record.getDuplicateReadFlag())
continue;
if (record.getReadUnmappedFlag())
continue;
if (ignore != null && ignore.containsOverlapping(new Interval(record.getReferenceName(), record.getAlignmentStart(), record.getAlignmentEnd()))) {
LOG.info("ignore " + record);
continue;
}
float countM = 0f;
float countS = 0f;
for (CigarElement c : record.getCigar().getCigarElements()) {
switch(c.getOperator()) {
case M:
countM += c.getLength();
break;
case S:
countS += c.getLength();
break;
default:
break;
}
}
if (countM > 10 && ((countS / countM) > (1f - limitcigar) && (countS / countM) < (1f + limitcigar))) {
output.insertion(record, readNum, countS, countM);
}
for (OtherCanonicalAlign xp : xPalignFactory.getXPAligns(record)) {
if (ignore != null && ignore.containsOverlapping(new Interval(xp.getReferenceName(), xp.getAlignmentStart(), xp.getAlignmentStart()))) {
LOG.info("ignore " + record);
continue;
}
output.xp(record, readNum, xp);
}
if (record.getMateUnmappedFlag()) {
output.orphan(record, readNum);
continue;
}
if (ignore != null && ignore.containsOverlapping(new Interval(record.getMateReferenceName(), record.getMateAlignmentStart(), record.getMateAlignmentStart()))) {
LOG.info("ignore " + record);
continue;
}
if (record.getReferenceIndex() == record.getMateReferenceIndex()) {
output.deletion(record, readNum);
} else {
output.transloc(record, readNum);
}
}
} catch (Exception e) {
e.printStackTrace();
LOG.error(e);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(output);
}
return 0;
}
use of com.github.lindenb.jvarkit.util.picard.OtherCanonicalAlign in project jvarkit by lindenb.
the class BWAMemNOp method doWork.
@Override
public int doWork(List<String> args) {
SamReader r = null;
SAMFileWriter w = null;
try {
r = super.openSamReader(oneFileOrNull(args));
SAMFileHeader header = r.getFileHeader();
OtherCanonicalAlignFactory ocaf = new OtherCanonicalAlignFactory(header);
w = writingBamArgs.openSAMFileWriter(outputFile, header, true);
SAMRecordFactory samRecordFactory = new DefaultSAMRecordFactory();
SAMRecordIterator iter = r.iterator();
while (iter.hasNext()) {
SAMRecord rec = iter.next();
if (rec.getSupplementaryAlignmentFlag()) {
continue;
}
if (rec.getReadUnmappedFlag()) {
if (!print_only_spit_read)
w.addAlignment(rec);
continue;
}
Cigar cigar1 = rec.getCigar();
if (cigar1 == null || cigar1.isEmpty() || !(cigar1.getCigarElement(cigar1.numCigarElements() - 1).getOperator().equals(CigarOperator.S) || cigar1.getCigarElement(0).getOperator().equals(CigarOperator.S))) // last or first is soft clipping
{
if (!print_only_spit_read)
w.addAlignment(rec);
continue;
}
rec.getAlignmentStart();
List<OtherCanonicalAlign> xps = ocaf.getXPAligns(rec);
if (xps.isEmpty()) {
if (!print_only_spit_read)
w.addAlignment(rec);
continue;
}
boolean found_one = false;
for (OtherCanonicalAlign xp : xps) {
if (!rec.getReferenceName().equals(xp.getReferenceName()))
continue;
if (xp.getReadNegativeStrandFlag() != rec.getReadNegativeStrandFlag())
continue;
Cigar cigar2 = xp.getCigar();
if (cigar2 == null || cigar2.isEmpty()) {
continue;
}
SAMRecord newrec = null;
List<CigarEvt> L1 = null;
List<CigarEvt> L2 = null;
if (cigar1.getCigarElement(cigar1.numCigarElements() - 1).getOperator().equals(CigarOperator.S) && cigar1.getCigarElement(cigar1.numCigarElements() - 1).getLength() >= this.min_soft_clip_length && cigar2.getCigarElement(0).getOperator().equals(CigarOperator.S) && cigar2.getCigarElement(0).getLength() >= this.min_soft_clip_length && rec.getAlignmentEnd() < xp.getAlignmentStart()) {
newrec = samRecordFactory.createSAMRecord(header);
int ref1 = rec.getAlignmentStart();
newrec.setAlignmentStart(ref1);
L1 = cigarEvents(0, ref1, cigar1);
L2 = cigarEvents(0, xp.getAlignmentStart(), cigar2);
} else if (cigar2.getCigarElement(cigar2.numCigarElements() - 1).getOperator().equals(CigarOperator.S) && cigar2.getCigarElement(cigar2.numCigarElements() - 1).getLength() >= this.min_soft_clip_length && cigar1.getCigarElement(0).getOperator().equals(CigarOperator.S) && cigar1.getCigarElement(0).getLength() >= this.min_soft_clip_length && xp.getAlignmentEnd() < rec.getAlignmentStart()) {
newrec = samRecordFactory.createSAMRecord(header);
int ref1 = xp.getAlignmentStart();
newrec.setAlignmentStart(ref1);
L1 = cigarEvents(0, ref1, cigar2);
L2 = cigarEvents(0, rec.getAlignmentStart(), cigar1);
}
if (newrec == null)
continue;
newrec.setFlags(rec.getFlags());
newrec.setReadName(rec.getReadName());
newrec.setReadBases(rec.getReadBases());
newrec.setMappingQuality(rec.getMappingQuality());
newrec.setReferenceIndex(rec.getReferenceIndex());
newrec.setBaseQualities(rec.getBaseQualities());
if (found_one) {
newrec.setNotPrimaryAlignmentFlag(true);
}
found_one = true;
for (SAMTagAndValue tav : rec.getAttributes()) {
if (tav.tag.equals(ocaf.getAttributeKey()))
continue;
if (tav.tag.equals("NM"))
continue;
newrec.setAttribute(tav.tag, tav.value);
}
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
newrec.setMateAlignmentStart(rec.getMateAlignmentStart());
newrec.setMateReferenceIndex(rec.getMateReferenceIndex());
newrec.setInferredInsertSize(rec.getInferredInsertSize());
}
while (!L1.isEmpty() && (L1.get(L1.size() - 1).op.equals(CigarOperator.S) || L1.get(L1.size() - 1).op.equals(CigarOperator.D) || L1.get(L1.size() - 1).op.equals(CigarOperator.H))) {
L1.remove(L1.size() - 1);
}
while (!L2.isEmpty() && L2.get(0).read0 <= L1.get(L1.size() - 1).read0) {
L2.remove(0);
}
List<CigarElement> cigarElements = new ArrayList<CigarElement>();
int i = 0;
while (i < L1.size()) {
int j = i + 1;
while (j < L1.size() && L1.get(i).op.equals(L1.get(j).op)) {
j++;
}
cigarElements.add(new CigarElement(j - i, L1.get(i).op));
i = j;
}
// add 'N'
cigarElements.add(new CigarElement((L2.get(0).ref1 - L1.get(L1.size() - 1).ref1) - 1, CigarOperator.N));
i = 0;
while (i < L2.size()) {
int j = i + 1;
while (j < L2.size() && L2.get(i).op.equals(L2.get(j).op)) {
j++;
}
cigarElements.add(new CigarElement(j - i, L2.get(i).op));
i = j;
}
// cleanup : case where 'S' is close to 'N'
i = 0;
while (i + 1 < cigarElements.size()) {
CigarElement ce1 = cigarElements.get(i);
CigarElement ce2 = cigarElements.get(i + 1);
if (i > 0 && ce1.getOperator().equals(CigarOperator.S) && ce2.getOperator().equals(CigarOperator.N)) {
cigarElements.set(i, new CigarElement(ce1.getLength(), CigarOperator.X));
} else if (i + 2 < cigarElements.size() && ce1.getOperator().equals(CigarOperator.N) && ce2.getOperator().equals(CigarOperator.S)) {
cigarElements.set(i + 1, new CigarElement(ce2.getLength(), CigarOperator.X));
}
i++;
}
newrec.setCigar(new Cigar(cigarElements));
List<SAMValidationError> validations = newrec.isValid();
if (validations != null) {
for (SAMValidationError err : validations) {
LOG.warning(err.getType() + ":" + err.getMessage());
}
}
w.addAlignment(newrec);
}
if (!found_one) {
if (!print_only_spit_read)
w.addAlignment(rec);
}
}
iter.close();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(w);
}
}
use of com.github.lindenb.jvarkit.util.picard.OtherCanonicalAlign in project jvarkit by lindenb.
the class FindMyVirus method doWork.
@Override
public int doWork(List<String> args) {
if (virusNames.isEmpty()) {
LOG.error("no virus name");
return -1;
}
SamReader sfr = null;
SAMFileWriter[] sfwArray = new SAMFileWriter[CAT.values().length];
try {
sfr = openSamReader(oneFileOrNull(args));
SAMFileHeader header = sfr.getFileHeader();
for (CAT category : CAT.values()) {
LOG.info("Opening " + category);
SAMFileHeader header2 = header.clone();
header2.addComment("Category:" + category.name());
header2.addComment("Description:" + category.getDescription());
SAMProgramRecord rec = header2.createProgramRecord();
rec.setCommandLine(this.getProgramCommandLine());
rec.setProgramName(getProgramName());
rec.setProgramVersion(getVersion());
rec.setAttribute("CAT", category.name());
File outputFile = new File(this.outputFile.getParentFile(), this.outputFile.getName() + "." + category.name() + ".bam");
LOG.info("Opening " + outputFile);
File countFile = new File(outputFile.getParentFile(), outputFile.getName() + "." + category.name() + ".count.txt");
SAMFileWriter sfw = writingBamArgs.openSAMFileWriter(outputFile, header2, true);
sfw = new SAMFileWriterCount(sfw, countFile, category);
sfwArray[category.ordinal()] = sfw;
}
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
OtherCanonicalAlignFactory xpAlignFactory = new OtherCanonicalAlignFactory(header);
SAMRecordIterator iter = sfr.iterator();
while (iter.hasNext()) {
SAMRecord rec = iter.next();
progress.watch(rec);
CAT category = null;
List<OtherCanonicalAlign> xpList = Collections.emptyList();
if (category == null && !rec.getReadPairedFlag()) {
category = CAT.unpaired;
}
if (category == null && rec.isSecondaryOrSupplementary()) {
category = CAT.secondary;
}
if (category == null && rec.getReadFailsVendorQualityCheckFlag()) {
category = CAT.failsqual;
}
if (category == null && rec.getDuplicateReadFlag()) {
category = CAT.duplicate;
}
if (category == null && rec.getReadUnmappedFlag()) {
category = CAT.unmapped;
}
if (category == null) {
xpList = xpAlignFactory.getXPAligns(rec);
}
boolean xp_containsVirus = false;
boolean xp_containsChrom = false;
for (OtherCanonicalAlign xpa : xpList) {
if (virusNames.contains(xpa.getReferenceName())) {
xp_containsVirus = true;
} else {
xp_containsChrom = true;
}
}
/* both reads mapped on ref */
if (category == null && !rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && !virusNames.contains(rec.getReferenceName()) && !virusNames.contains(rec.getMateReferenceName())) {
if (!xp_containsVirus) {
category = CAT.both_ref;
} else {
category = CAT.ref_and_virus_spliced;
}
}
/* pair(unmapped,mapped on reference) */
if (category == null && ((!rec.getReadUnmappedFlag() && rec.getMateUnmappedFlag() && !virusNames.contains(rec.getReferenceName())) || (rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && !virusNames.contains(rec.getMateReferenceName())))) {
if (!xp_containsVirus) {
category = CAT.ref_orphan;
} else {
category = CAT.ref_and_virus_spliced;
}
}
/* both reads mapped on virus */
if (category == null && !rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && virusNames.contains(rec.getReferenceName()) && virusNames.contains(rec.getMateReferenceName())) {
if (!xp_containsChrom) {
category = CAT.both_virus;
} else {
category = CAT.ref_and_virus_spliced;
}
}
if (category == null && ((!rec.getReadUnmappedFlag() && rec.getMateUnmappedFlag() && virusNames.contains(rec.getReferenceName())) || (rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && virusNames.contains(rec.getMateReferenceName())))) {
if (!xp_containsChrom) {
category = CAT.virus_orphan;
} else {
category = CAT.ref_and_virus_spliced;
}
}
if (category == null && !rec.getReadUnmappedFlag() && !rec.getMateUnmappedFlag() && ((virusNames.contains(rec.getReferenceName()) && !virusNames.contains(rec.getMateReferenceName())) || (!virusNames.contains(rec.getReferenceName()) && virusNames.contains(rec.getMateReferenceName())))) {
category = CAT.ref_and_virus;
}
/*dispatch */
if (category == null) {
LOG.warning("Not handled: " + rec);
category = CAT.undetermined;
}
sfwArray[category.ordinal()].addAlignment(rec);
}
for (SAMFileWriter sfw : sfwArray) {
LOG.info("Closing " + sfw);
sfw.close();
}
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
LOG.info("Closing");
CloserUtil.close(sfr);
CloserUtil.close(sfwArray);
}
}
use of com.github.lindenb.jvarkit.util.picard.OtherCanonicalAlign in project jvarkit by lindenb.
the class SamShortInvertion method doWork.
@Override
public int doWork(List<String> input) {
SamReader r = null;
PrintStream out = null;
// SAMFileWriter w=null;
try {
final List<SamReaderList> samReaders = new ArrayList<>();
final Set<String> args = IOUtils.unrollFiles(input);
if (args.isEmpty()) {
LOG.error("No input file");
return -1;
}
SAMSequenceDictionary dict = null;
for (final String bam : args) {
final SamReaderList samReader = new SamReaderList(new File(bam), 2);
for (int i = 0; i < samReaders.size(); ++i) {
if (samReaders.get(i).sample.equals(samReader.sample)) {
samReader.close();
LOG.error("Sample defined in two bams " + samReader.sample);
return -1;
}
if (dict == null) {
dict = samReader.dict;
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, samReader.dict)) {
samReader.close();
LOG.error("bam contains two sequence dict.");
return -1;
}
}
samReaders.add(samReader);
}
final Predicate<SAMRecord> samRecordFilter = new Predicate<SAMRecord>() {
@Override
public boolean test(SAMRecord rec) {
if (rec.getReadUnmappedFlag())
return false;
if (rec.isSecondaryOrSupplementary())
return false;
if (rec.getDuplicateReadFlag())
return false;
if (rec.getReadFailsVendorQualityCheckFlag())
return false;
return true;
}
};
for (SamReaderList samReader : samReaders) {
SAMRecordIterator iter = samReader.get(0).iterator();
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (!samRecordFilter.test(rec))
continue;
boolean skip = true;
if (rec.getCigar() != null && rec.getCigar().isClipped()) {
skip = false;
}
if (skip)
continue;
if (rec.getReadPairedFlag()) {
if (!rec.getMateUnmappedFlag()) {
SAMRecordIterator iter2 = samReader.get(1).query(rec.getMateReferenceName(), rec.getMateAlignmentStart(), rec.getMateAlignmentStart(), false);
while (iter2.hasNext()) {
final SAMRecord rec2 = iter2.next();
if (!samRecordFilter.test(rec2))
continue;
if (!rec2.getReadName().equals(rec.getReadName()))
continue;
if (rec2.getFirstOfPairFlag() == rec.getFirstOfPairFlag())
continue;
if (rec2.getSecondOfPairFlag() == rec.getSecondOfPairFlag())
continue;
challenge(rec, rec2);
break;
}
iter2.close();
} else {
challenge(rec, null);
}
}
}
iter.close();
}
r = null;
out = openFileOrStdoutAsPrintStream(outputFile);
final SAMFileHeader header = r.getFileHeader();
OtherCanonicalAlignFactory xpalignFactory = new OtherCanonicalAlignFactory(header);
int prev_tid = -1;
short[] coverage = null;
short max_coverage = 0;
// w=swf.make(header, System.out);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
final SAMRecordIterator it = r.iterator();
for (; ; ) {
SAMRecord rec = null;
if (it.hasNext()) {
rec = progress.watch(it.next());
if (rec.getReadUnmappedFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
}
if (rec == null || prev_tid == -1 || (prev_tid != -1 && prev_tid != rec.getReferenceIndex())) {
if (coverage != null) {
while (max_coverage >= Math.max(1, this.min_coverage)) {
LOG.info("Scanning " + header.getSequence(prev_tid).getSequenceName() + " for cov:" + max_coverage);
int chromStart0 = 0;
while (chromStart0 < coverage.length) {
if (coverage[chromStart0] == max_coverage) {
// reset that pos
coverage[chromStart0] = 0;
int chromEnd0 = chromStart0 + 1;
while (chromEnd0 < coverage.length && coverage[chromEnd0] == max_coverage) {
// reset that pos
coverage[chromEnd0] = 0;
++chromEnd0;
}
out.print(header.getSequence(prev_tid).getSequenceName());
out.print('\t');
out.print(chromStart0);
out.print('\t');
out.print(chromEnd0);
out.print('\t');
out.print(max_coverage);
out.println();
// reset 3'
for (int x = chromEnd0; x < coverage.length && coverage[x] > 0; ++x) {
coverage[x] = 0;
}
// reset 5'
for (int x = chromStart0 - 1; x >= 0 && coverage[x] > 0; --x) {
coverage[x] = 0;
}
chromStart0 = chromEnd0;
} else {
++chromStart0;
}
}
max_coverage--;
}
coverage = null;
}
if (rec == null)
break;
prev_tid = rec.getReferenceIndex();
LOG.info("Alloc sizeof(short)*" + header.getSequence(prev_tid).getSequenceLength());
coverage = new short[header.getSequence(prev_tid).getSequenceLength()];
Arrays.fill(coverage, (short) 0);
max_coverage = 0;
}
List<OtherCanonicalAlign> saList = xpalignFactory.getXPAligns(rec);
if (saList.isEmpty())
continue;
for (OtherCanonicalAlign xp : saList) {
if (!xp.getReferenceName().equals(rec.getReferenceName()))
continue;
if (// read is plus
!rec.getReadNegativeStrandFlag()) {
if (!xp.getReadNegativeStrandFlag()) {
// ignore both same strand
continue;
}
} else // read.strand=='-'
{
if (xp.getReadNegativeStrandFlag()) {
// ignore both same strand
continue;
}
}
if (Math.abs(rec.getUnclippedStart() - xp.getAlignmentStart()) > max_size_inversion) {
// info(xp+" vs pos "+rec.getUnclippedStart());
continue;
}
int chromStart = Math.min(rec.getUnclippedStart(), xp.getAlignmentStart()) - 1;
int chromEnd = Math.max(rec.getUnclippedEnd(), xp.getAlignmentStart()) - 1;
for (int x = chromStart; x <= chromEnd && x < coverage.length; ++x) {
if (coverage[x] < Short.MAX_VALUE)
coverage[x]++;
if (max_coverage < coverage[x]) {
LOG.info("Max coverage " + max_coverage);
max_coverage = coverage[x];
}
}
}
}
it.close();
r.close();
r = null;
progress.finish();
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(out);
}
}
Aggregations