use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
the class Biostar76892 method doWork.
@Override
public int doWork(final List<String> args) {
SamReader sfr = null;
SAMFileWriter sfw = null;
try {
sfr = super.openSamReader(oneFileOrNull(args));
sfw = writingBamArgs.openSAMFileWriter(this.outputFile, sfr.getFileHeader(), true);
long nRecords = 0;
final List<SAMRecord> buffer = new ArrayList<SAMRecord>();
SAMRecordIterator iter = sfr.iterator();
for (; ; ) {
SAMRecord rec = null;
// get next record
if (iter.hasNext()) {
rec = iter.next();
++nRecords;
if (nRecords % 1000000 == 0)
LOG.info("records: " + nRecords);
if (!rec.getReadPairedFlag() || rec.getReadUnmappedFlag() || rec.getMateUnmappedFlag() || rec.getProperPairFlag() || rec.getReferenceIndex() != rec.getMateReferenceIndex() || rec.getReadNegativeStrandFlag() == !rec.getMateNegativeStrandFlag()) {
if (!onlySaveFixed)
sfw.addAlignment(rec);
continue;
}
}
if (rec != null) {
int i = 0;
// cleanup buffer
int mate_index = -1;
while (i < buffer.size()) {
SAMRecord prev = buffer.get(i);
if (prev.getReferenceIndex() != rec.getReferenceIndex() || prev.getAlignmentEnd() + distance < rec.getAlignmentStart()) {
if (!onlySaveFixed)
sfw.addAlignment(prev);
buffer.remove(i);
} else if (prev.getReadName().equals(rec.getReadName()) && ((prev.getFirstOfPairFlag() && rec.getSecondOfPairFlag()) || (rec.getFirstOfPairFlag() && prev.getSecondOfPairFlag()))) {
mate_index = i;
++i;
} else {
++i;
}
}
if (mate_index == -1) {
buffer.add(rec);
} else {
final SAMRecord mate = buffer.get(mate_index);
buffer.remove(mate_index);
LOG.info("changing " + rec + " " + mate);
if (mate.getReadNegativeStrandFlag()) {
mate.setReadNegativeStrandFlag(false);
rec.setMateNegativeStrandFlag(mate.getReadNegativeStrandFlag());
} else {
rec.setReadNegativeStrandFlag(false);
mate.setMateNegativeStrandFlag(rec.getReadNegativeStrandFlag());
}
if (!mate.getReadFailsVendorQualityCheckFlag() && !rec.getReadFailsVendorQualityCheckFlag()) {
mate.setProperPairFlag(true);
rec.setProperPairFlag(true);
}
mate.setAttribute("rv", 1);
rec.setAttribute("rv", 1);
sfw.addAlignment(mate);
sfw.addAlignment(rec);
}
} else {
for (final SAMRecord r : buffer) {
if (!onlySaveFixed)
sfw.addAlignment(r);
}
break;
}
}
LOG.info("done");
sfw.close();
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(sfw);
CloserUtil.close(sfr);
}
}
use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
the class Biostar78400 method doWork.
@Override
public int doWork(List<String> args) {
if (this.XML == null) {
LOG.error("XML file missing");
return -1;
}
final Map<String, Map<Integer, String>> flowcell2lane2id = new HashMap<String, Map<Integer, String>>();
SamReader sfr = null;
SAMFileWriter sfw = null;
try {
final Pattern readNameSignature = Pattern.compile(this.readNameSignatureStr);
final JAXBContext context = JAXBContext.newInstance(ReadGroup.class, ReadGroupList.class);
final Unmarshaller unmarshaller = context.createUnmarshaller();
final ReadGroupList rgl = unmarshaller.unmarshal(new StreamSource(XML), ReadGroupList.class).getValue();
if (rgl.flowcells.isEmpty()) {
LOG.error("empty XML " + XML);
return -1;
}
sfr = openSamReader(oneFileOrNull(args));
final SAMFileHeader header = sfr.getFileHeader().clone();
header.addComment("Processed with " + getProgramName());
final Set<String> seenids = new HashSet<String>();
final List<SAMReadGroupRecord> samReadGroupRecords = new ArrayList<SAMReadGroupRecord>();
for (final FlowCell fc : rgl.flowcells) {
final Map<Integer, String> lane2id = new HashMap<Integer, String>();
for (final Lane lane : fc.lanes) {
for (final ReadGroup rg : lane.readGroups) {
if (seenids.contains(rg.id)) {
LOG.error("Group id " + rg.id + " defined twice");
return -1;
}
seenids.add(rg.id);
// create the read group we'll be using
final SAMReadGroupRecord rgrec = new SAMReadGroupRecord(rg.id);
rgrec.setLibrary(rg.library);
rgrec.setPlatform(rg.platform);
rgrec.setSample(rg.sample);
rgrec.setPlatformUnit(rg.platformunit);
if (rg.center != null)
rgrec.setSequencingCenter(rg.center);
if (rg.description != null)
rgrec.setDescription(rg.description);
lane2id.put(lane.id, rg.id);
samReadGroupRecords.add(rgrec);
}
}
if (flowcell2lane2id.containsKey(fc.name)) {
LOG.error("FlowCell id " + fc.name + " defined twice in XML");
return -1;
}
flowcell2lane2id.put(fc.name, lane2id);
}
header.setReadGroups(samReadGroupRecords);
sfw = this.writingBamArgs.openSAMFileWriter(this.outputFile, header, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
final SAMRecordIterator iter = sfr.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
final Matcher matcher = readNameSignature.matcher(rec.getReadName());
final String flowcellStr;
final String laneStr;
if (matcher.matches()) {
flowcellStr = matcher.group(1);
laneStr = matcher.group(2);
} else {
LOG.error("Read name " + rec.getReadName() + " doesn't match regular expression " + readNameSignature.pattern() + ". please check options");
return -1;
}
String RGID = null;
final Map<Integer, String> lane2id = flowcell2lane2id.get(flowcellStr);
if (lane2id == null)
throw new RuntimeException("Cannot get flowcell/readgroup for " + rec.getReadName());
try {
RGID = lane2id.get(Integer.parseInt(laneStr));
} catch (final Exception e) {
LOG.error("bad lane-Id in " + rec.getReadName());
return -1;
}
if (RGID == null) {
throw new RuntimeException("Cannot get RGID for " + rec.getReadName() + " flowcell:" + flowcellStr + " lane2id:" + laneStr + " dict:" + lane2id);
}
rec.setAttribute(SAMTag.RG.name(), RGID);
sfw.addAlignment(rec);
}
progress.finish();
iter.close();
LOG.info("done");
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(sfw);
CloserUtil.close(sfr);
}
}
use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
the class ExtendReferenceWithReads method scanRegion.
/**
*scanRegion
* @param contig Reference sequence of interest.
* @param start 0-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
* @param end 0-based, exclusive end of interval of interest. Zero implies end of the reference sequence.
*/
private Map<Integer, Counter<Byte>> scanRegion(final SAMSequenceRecord contig, final int chromStart, final int chromEnd, final Rescue rescue) {
final Map<Integer, Counter<Byte>> pos2bases = new HashMap<>(1 + chromEnd - chromStart);
LOG.info("Scanning :" + contig.getSequenceName() + ":" + chromStart + "-" + chromEnd);
for (int side = 0; side < 2; ++side) {
if (// 5' or 3'
!rescue.equals(Rescue.CENTER) && side > 0) {
// already done
break;
}
for (final SamReader samReader : samReaders) {
final SAMSequenceDictionary dict2 = samReader.getFileHeader().getSequenceDictionary();
final SAMSequenceRecord ssr2 = dict2.getSequence(contig.getSequenceName());
if (ssr2 == null || ssr2.getSequenceLength() != contig.getSequenceLength()) {
LOG.info("No contig " + contig.getSequenceName() + " with L=" + contig.getSequenceLength() + " bases in " + samReader.getResourceDescription());
continue;
}
int mappedPos = -1;
switch(rescue) {
case LEFT:
mappedPos = 1;
break;
case RIGHT:
mappedPos = contig.getSequenceLength();
break;
case CENTER:
mappedPos = (side == 0 ? chromStart + 1 : chromEnd);
break;
default:
throw new IllegalStateException();
}
final SAMRecordIterator iter = samReader.query(contig.getSequenceName(), mappedPos, mappedPos, false);
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (this.filter.filterOut(rec))
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null)
continue;
final byte[] bases = rec.getReadBases();
if (bases == null || bases.length == 0)
continue;
int refPos1 = rec.getUnclippedStart();
int readpos = 0;
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
for (int L = 0; L < ce.getLength(); ++L) {
if (op.consumesReadBases()) {
Counter<Byte> count = pos2bases.get(refPos1 - 1);
if (count == null) {
count = new Counter<>();
pos2bases.put(refPos1 - 1, count);
}
count.incr((byte) Character.toLowerCase(bases[readpos]));
readpos++;
}
if (op.consumesReferenceBases()) {
refPos1++;
}
}
}
}
iter.close();
}
}
return pos2bases;
}
use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
the class BamToFastq method doWork.
@Override
public int doWork(List<String> args) {
SamReader sfr = null;
SortingCollection<MappedFastq> fastqCollection = null;
try {
boolean found_single = false;
boolean found_paired = false;
long non_primary_alignmaned_flag = 0L;
sfr = super.openSamReader(oneFileOrNull(args));
fastqCollection = SortingCollection.newInstance(MappedFastq.class, new MappedFastqCodec(), new MappedFastqComparator(), this.maxRecordsInRam, this.tmpDir.toPath());
fastqCollection.setDestructiveIteration(true);
SAMRecordIterator iter = sfr.iterator();
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(sfr.getFileHeader().getSequenceDictionary());
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
if (rec.isSecondaryOrSupplementary()) {
if (non_primary_alignmaned_flag == 0) {
LOG.warn("SKIPPING NON-PRIMARY " + (non_primary_alignmaned_flag + 1) + " ALIGNMENTS");
}
non_primary_alignmaned_flag++;
continue;
}
MappedFastq m = new MappedFastq();
m.name = rec.getReadName();
if (m.name == null)
m.name = "";
m.seq = rec.getReadString();
if (m.seq.equals(SAMRecord.NULL_SEQUENCE_STRING))
m.seq = "";
m.qual = rec.getBaseQualityString();
if (m.qual.equals(SAMRecord.NULL_QUALS_STRING))
m.qual = "";
if (!rec.getReadUnmappedFlag() && rec.getReadNegativeStrandFlag()) {
m.seq = AcidNucleics.reverseComplement(m.seq);
m.qual = new StringBuilder(m.qual).reverse().toString();
}
if (m.seq.length() != m.qual.length()) {
LOG.error("length(seq)!=length(qual) in " + m.name);
continue;
}
if (m.seq.isEmpty() && m.qual.isEmpty()) {
m.seq = "N";
m.qual = "#";
}
if (rec.getReadPairedFlag()) {
found_paired = true;
if (found_single) {
sfr.close();
throw new RuntimeException("input is a mix of paired/singled reads");
}
m.side = (byte) (rec.getSecondOfPairFlag() ? 2 : 1);
} else {
found_single = true;
if (found_paired) {
sfr.close();
throw new RuntimeException("input is a mix of paired/singled reads");
}
m.side = (byte) 0;
}
fastqCollection.add(m);
}
iter.close();
CloserUtil.close(iter);
CloserUtil.close(sfr);
progress.finish();
fastqCollection.doneAdding();
LOG.info("Done reading.");
if (found_paired) {
FastqWriter fqw1 = null;
FastqWriter fqw2 = null;
if (forwardFile != null) {
LOG.info("Writing to " + forwardFile);
fqw1 = new BasicFastqWriter(forwardFile);
} else {
LOG.info("Writing to stdout");
fqw1 = new BasicFastqWriter(new PrintStream(stdout()));
}
if (reverseFile != null) {
LOG.info("Writing to " + reverseFile);
fqw2 = new BasicFastqWriter(reverseFile);
} else {
LOG.info("Writing to interlaced stdout");
fqw2 = fqw1;
}
List<MappedFastq> row = new ArrayList<MappedFastq>();
CloseableIterator<MappedFastq> r = fastqCollection.iterator();
for (; ; ) {
MappedFastq curr = null;
if (r.hasNext())
curr = r.next();
if (curr == null || (!row.isEmpty() && !row.get(0).name.equals(curr.name))) {
if (!row.isEmpty()) {
if (row.size() > 2) {
LOG.warn("WTF :" + row);
}
boolean found_F = false;
boolean found_R = false;
for (MappedFastq m : row) {
switch((int) m.side) {
case 1:
if (found_F)
throw new RuntimeException("two forward reads found for " + row.get(0).name);
found_F = true;
echo(fqw1, m);
break;
case 2:
if (found_R)
throw new RuntimeException("two reverse reads found for " + row.get(0).name);
found_R = true;
echo(fqw2, m);
break;
default:
throw new IllegalStateException("uh???");
}
}
if (!found_F) {
if (this.repair_missing_read) {
LOG.warn("forward not found for " + row.get(0));
MappedFastq pad = new MappedFastq();
pad.side = (byte) 1;
pad.name = row.get(0).name;
pad.seq = "N";
pad.qual = "#";
echo(fqw1, pad);
} else {
throw new RuntimeException("forward not found for " + row);
}
}
if (!found_R) {
if (repair_missing_read) {
LOG.warn("reverse not found for " + row.get(0));
MappedFastq pad = new MappedFastq();
pad.side = (byte) 2;
pad.name = row.get(0).name;
pad.seq = "N";
pad.qual = "#";
echo(fqw2, pad);
} else {
throw new RuntimeException("reverse not found for " + row);
}
}
}
if (curr == null)
break;
row.clear();
}
row.add(curr);
}
r.close();
fqw1.close();
fqw2.close();
} else if (found_single) {
FastqWriter fqw1 = null;
if (forwardFile != null) {
LOG.info("Writing to " + forwardFile);
fqw1 = new BasicFastqWriter(forwardFile);
} else {
LOG.info("Writing to stdout");
fqw1 = new BasicFastqWriter(new PrintStream(stdout()));
}
final CloseableIterator<MappedFastq> r = fastqCollection.iterator();
while (r.hasNext()) {
echo(fqw1, r.next());
}
r.close();
fqw1.close();
}
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
if (fastqCollection != null)
fastqCollection.cleanup();
}
}
use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
the class BamTreePack method scan.
private void scan(final SamReader sfr) {
super.bindings.put("header", sfr.getFileHeader());
final SAMRecordIterator iter = sfr.iterator();
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(sfr.getFileHeader());
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
super.bindings.put("record", rec);
super.nodeFactoryChain.watch(rootNode, rec);
}
progress.finish();
}
Aggregations