Use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
From class PcrSliceReads, method run().
/**
 * Reads every alignment from {@code reader}, assigns each properly-oriented (FR) read
 * pair to the PCR fragment (BED interval) its insert overlaps, re-tags the record with a
 * fragment-specific read-group ("originalRG_fragmentName") and writes it out.
 *
 * Reads that cannot be confidently assigned — unmapped, unpaired, unmapped mate,
 * improper pair, trans-chromosomal pair, same-strand pair, wrong orientation, or no
 * overlapping BED fragment — are written with their mapping quality set to zero.
 *
 * @param reader the opened SAM/BAM input; the caller is responsible for closing it
 * @return 0 on success, -1 on error
 */
@SuppressWarnings("resource")
private int run(SamReader reader) {
    final ReadClipper readClipper = new ReadClipper();
    final SAMFileHeader header1 = reader.getFileHeader();
    final SAMFileHeader header2 = header1.clone();
    header2.addComment(getProgramName() + " " + getVersion() + ": Processed with " + getProgramCommandLine());
    header2.setSortOrder(SortOrder.unsorted);
    // declare one derived read-group per (original read-group x PCR fragment) so that
    // downstream tools can distinguish reads by the amplicon they were assigned to
    for (final SAMReadGroupRecord srg : header1.getReadGroups()) {
        for (final Interval i : this.bedIntervals.keySet()) {
            // create new read group for this interval
            final SAMReadGroupRecord rg = new SAMReadGroupRecord(srg.getId() + "_" + this.bedIntervals.get(i).getName(), srg);
            header2.addReadGroup(rg);
        }
    }
    SAMFileWriter sw = null;
    SAMRecordIterator iter = null;
    try {
        sw = writingBamArgs.openSAMFileWriter(outputFile, header2, false);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
        iter = reader.iterator();
        while (iter.hasNext()) {
            SAMRecord rec = progress.watch(iter.next());
            if (rec.getReadUnmappedFlag()) {
                sw.addAlignment(rec);
                continue;
            }
            if (!rec.getReadPairedFlag()) {
                // @doc if the read is not a paired-end read, then the quality of the read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (rec.getMateUnmappedFlag()) {
                // @doc if the MATE is not mapped, then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (!rec.getProperPairFlag()) {
                // @doc if the properly-paired flag is not set, then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            // FIX: getReferenceIndex()/getMateReferenceIndex() return boxed Integers; the
            // original compared them with '!=' (object identity), which only worked by
            // accident for indexes inside the Integer cache range (-128..127). Any contig
            // with tid > 127 would wrongly be treated as trans-chromosomal. Use equals().
            if (!rec.getReferenceIndex().equals(rec.getMateReferenceIndex())) {
                // @doc if the read and the mate are not mapped on the same chromosome, then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            if (rec.getReadNegativeStrandFlag() == rec.getMateNegativeStrandFlag()) {
                // @doc if the read and the mate are mapped on the same strand, then the quality of the current read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            // compute the insert boundaries from the two 5' alignment starts (FR expected)
            int chromStart;
            int chromEnd;
            if (rec.getAlignmentStart() < rec.getMateAlignmentStart()) {
                if (rec.getReadNegativeStrandFlag()) {
                    // @doc if the read(POS) < mate(POS) and read is mapped on the negative strand, then the quality of the current read is set to zero
                    rec.setMappingQuality(0);
                    sw.addAlignment(rec);
                    continue;
                } else {
                    chromStart = rec.getAlignmentStart();
                    chromEnd = rec.getMateAlignmentStart();
                }
            } else {
                if (!rec.getReadNegativeStrandFlag()) {
                    // @doc if the read(POS) > mate(POS) and read is mapped on the positive strand, then the quality of the current read is set to zero
                    rec.setMappingQuality(0);
                    sw.addAlignment(rec);
                    continue;
                } else {
                    chromStart = rec.getMateAlignmentStart();
                    // don't use getAlignmentEnd, to be consistent with mate data
                    chromEnd = rec.getAlignmentStart();
                }
            }
            final List<Interval> fragments = findIntervals(rec.getContig(), chromStart, chromEnd);
            if (fragments.isEmpty()) {
                // @doc if no BED fragment is found overlapping the read, then the quality of the read is set to zero
                rec.setMappingQuality(0);
                sw.addAlignment(rec);
                continue;
            }
            Interval best = null;
            if (fragments.size() == 1) {
                best = fragments.get(0);
            } else
                // several candidate fragments: resolve per the configured ambiguity strategy
                switch(this.ambiguityStrategy) {
                    case random:
                        {
                            best = fragments.get(this.random.nextInt(fragments.size()));
                            break;
                        }
                    case zero:
                        {
                            rec.setMappingQuality(0);
                            sw.addAlignment(rec);
                            continue;
                        }
                    case closer:
                        {
                            // pick the fragment whose boundaries are closest to the insert's
                            int best_distance = Integer.MAX_VALUE;
                            for (final Interval frag : fragments) {
                                final int distance = Math.abs(chromStart - frag.getStart()) + Math.abs(frag.getEnd() - chromEnd);
                                if (best == null || distance < best_distance) {
                                    best = frag;
                                    best_distance = distance;
                                }
                            }
                            break;
                        }
                    default:
                        throw new IllegalStateException();
                }
            if (clip_reads) {
                rec = readClipper.clip(rec, best);
                // the clipper signals "could not clip" via MAPQ==0: write as-is
                if (rec.getMappingQuality() == 0) {
                    sw.addAlignment(rec);
                    continue;
                }
            }
            final SAMReadGroupRecord rg = rec.getReadGroup();
            if (rg == null) {
                throw new IOException("Read " + rec + " is missing a Read-Group ID . See http://broadinstitute.github.io/picard/ http://broadinstitute.github.io/picard/command-line-overview.html#AddOrReplaceReadGroups");
            }
            // re-tag with the fragment-specific read-group declared in the header above
            rec.setAttribute("RG", rg.getId() + "_" + best.getName());
            sw.addAlignment(rec);
        }
        progress.finish();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sw);
    }
}
Use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
From class SoftClipAnnotator, method map().
/**
 * GATK walker map() step: for each variant overlapping the current locus, counts the
 * soft-clipped cigar elements of reads overlapping each called non-hom-ref genotype,
 * annotates those genotypes with the count, and — when every observed genotype carries
 * at least one overlapping clip — FILTERs the whole variant before writing it out.
 *
 * @param tracker meta-data tracker for the current locus; null means "outside intervals"
 * @param ref     reference context (unused here)
 * @param context alignment context providing the current location
 * @return the number of variant contexts processed at this locus (0 when none)
 */
public Integer map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
    if (tracker == null)
        return 0;
    final Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
    if (VCs.isEmpty()) {
        return 0;
    }
    final Collection<ReadFilter> readFilters = super.getToolkit().getFilters();
    for (final VariantContext ctx : VCs) {
        int observed_genotypes = 0;
        int num_some_clipped_genotypes = 0;
        final List<Genotype> genotypes = new ArrayList<>(ctx.getNSamples());
        for (int i = 0; i < ctx.getNSamples(); ++i) {
            final Genotype g = ctx.getGenotype(i);
            // non-variant / uncalled genotypes are copied through unchanged
            if (!g.isCalled() || g.isNoCall() || g.isHomRef()) {
                genotypes.add(g);
                continue;
            }
            final List<SamReader> bamReaderForSample = this.sample2bam.get(g.getSampleName());
            if (bamReaderForSample.isEmpty()) {
                genotypes.add(g);
                continue;
            }
            observed_genotypes++;
            int numberOfClipsOverlapping = 0;
            for (final SamReader samReader : bamReaderForSample) {
                final SAMRecordIterator iter = samReader.query(ctx.getContig(), Math.max(0, ctx.getStart() - extend4clip), ctx.getEnd() + extend4clip, false);
                // FIX: close the iterator in a finally block; the original leaked the
                // iterator (and its underlying stream) if an exception was thrown
                // while iterating.
                try {
                    while (iter.hasNext()) {
                        final SAMRecord samRecord = iter.next();
                        // skip reads that cannot contribute: unmapped, no cigar, wrong
                        // contig or sample, or not overlapping the variant even when
                        // their clipped bases are included
                        if (samRecord.getReadUnmappedFlag() || samRecord.getCigar() == null || !samRecord.getContig().equals(ctx.getContig()) || samRecord.getUnclippedEnd() < ctx.getStart() || samRecord.getUnclippedStart() > ctx.getEnd() || samRecord.getReadGroup() == null || !g.getSampleName().equals(samRecord.getReadGroup().getSample()))
                            continue;
                        // apply the toolkit's read filters (except MalformedReadFilter)
                        boolean ok = true;
                        for (final ReadFilter readFilter : readFilters) {
                            // this one cannot be correctly initialized...
                            if (readFilter instanceof MalformedReadFilter)
                                continue;
                            if (readFilter.filterOut(samRecord)) {
                                ok = false;
                                break;
                            }
                        }
                        if (!ok)
                            continue;
                        // walk the cigar along the reference (clips included) and count
                        // soft-clip elements that overlap the variant span
                        int refPos = samRecord.getUnclippedStart();
                        for (final CigarElement ce : samRecord.getCigar()) {
                            if (ce.getOperator().consumesReferenceBases() || ce.getOperator().isClipping()) {
                                final int refEnd = refPos + ce.getLength();
                                if (!(ctx.getEnd() < refPos || refEnd < ctx.getStart())) {
                                    if (ce.getOperator().equals(CigarOperator.S)) {
                                        ++numberOfClipsOverlapping;
                                    }
                                }
                                refPos += ce.getLength();
                            }
                            // past the variant: no later element can overlap
                            if (refPos > ctx.getEnd())
                                break;
                        }
                    }
                } finally {
                    iter.close();
                }
            }
            if (numberOfClipsOverlapping == 0) {
                genotypes.add(g);
            } else {
                // annotate the genotype with the number of overlapping soft-clips
                final GenotypeBuilder gb = new GenotypeBuilder(g);
                gb.attribute(numClipInGenotypeFormatHeaderLine.getID(), numberOfClipsOverlapping);
                genotypes.add(gb.make());
                num_some_clipped_genotypes++;
            }
        }
        if (num_some_clipped_genotypes == 0) {
            vcfWriter.add(ctx);
        } else {
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            vcb.genotypes(genotypes);
            // every observed genotype has an overlapping clip => flag the whole variant
            if (observed_genotypes == num_some_clipped_genotypes) {
                vcb.filter(this.filterHaveClipInVariant.getID());
            }
            vcfWriter.add(vcb.make());
        }
    }
    return VCs.size();
}
Use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
From class LocalRealignReads, method doWork().
/**
 * Scans a BAM for reads carrying a long soft-clip on either end and tries to locally
 * re-align the clipped sequence (and its reverse-complement) against the reference
 * around the read, printing candidate alternative alignments to stderr.
 *
 * NOTE(review): records that pass the soft-clip filter are analyzed but never written
 * to the output writer {@code w} (only pass-through records are) — this looks like
 * work-in-progress / debug behaviour; confirm before relying on the output BAM.
 *
 * @param args command-line arguments; at most one input BAM path
 * @return RETURN_OK on success, a wrapped error code otherwise
 */
@Override
public int doWork(List<String> args) {
// the reference FASTA is mandatory
if (this.faidxFile == null) {
LOG.error("REFerence file missing;");
return -1;
}
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
SamReader samReader = null;
SAMFileWriter w = null;
SAMRecordIterator iter = null;
GenomicSequence genomicSequence = null;
LongestCommonSequence matrix = new LongestCommonSequence();
try {
indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
samReader = openSamReader(oneFileOrNull(args));
final SAMFileHeader header1 = samReader.getFileHeader();
final SAMFileHeader header2 = header1.clone();
header2.setSortOrder(SortOrder.unsorted);
w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
iter = samReader.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
// pass through reads we will not attempt to realign
if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
w.addAlignment(rec);
continue;
}
final Cigar cigar = rec.getCigar();
final int nCigarElement = cigar.numCigarElements();
// a single cigar element cannot carry both a clip and an aligned segment
if (nCigarElement < 2) {
w.addAlignment(rec);
continue;
}
final CigarElement ce5 = cigar.getCigarElement(0);
final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
// keep only reads with a soft-clip of at least MIN_ALIGN_LEN on either end
if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
w.addAlignment(rec);
continue;
}
// lazily (re)load the reference sequence for the read's chromosome
if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
final CharSequence readseq = rec.getReadString();
/*
 * short-inversion check: align the reverse-complement of the 5' clip against
 * the reference in a window of one read-length around the read
 */
if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}
*/
}
// same short-inversion check for the 3' clip
if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}*/
}
/* other: align the full read sequence against the reference window */
List<LongestCommonSequence.Hit> hits = new ArrayList<>();
align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
// NOTE(review): this threshold requires >= 2000 hits before reporting, and the
// record is NOT written to the output in either branch — confirm intent.
if (hits.size() < 2000)
continue;
for (LongestCommonSequence.Hit hit : hits) {
// reject hits that overlap any M/X/= segment of the existing alignment
int readstart = 0;
boolean in_M = false;
for (int i = 0; i < nCigarElement; ++i) {
final CigarElement ce = cigar.getCigarElement(i);
if (ce.getOperator().consumesReadBases()) {
int readend = readstart + ce.getLength();
if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
in_M = true;
break;
}
}
readstart = readend;
}
}
if (in_M)
continue;
// debug dump of the candidate alternative alignment
int align_size = hit.size();
System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
System.err.print("REF :");
for (int i = 0; i < align_size; ++i) {
System.err.print(genomicSequence.charAt(hit.getStartX() + i));
}
System.err.println();
System.err.print("READ :");
for (int i = 0; i < align_size; ++i) {
System.err.print(readseq.charAt(hit.getStartY() + i));
}
System.err.println();
}
System.err.println();
}
progress.finish();
return RETURN_OK;
} catch (Exception e) {
return wrapException(e);
} finally {
// release the (potentially large) chromosome buffer before closing resources
genomicSequence = null;
CloserUtil.close(iter);
CloserUtil.close(samReader);
CloserUtil.close(w);
CloserUtil.close(indexedFastaSequenceFile);
matrix = null;
}
}
Use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
From class ConvertBamChromosomes, method doWork().
/**
 * Rewrites a BAM, translating contig names (both the sequence dictionary and each
 * record's reference/mate-reference names) through a user-supplied mapping file.
 * Records whose contig (or mate contig) has no mapping are dropped and counted.
 *
 * @param args command-line arguments; at most one input BAM path
 * @return 0 on success, -1 on error
 */
@Override
public int doWork(List<String> args) {
    SamReader sfr = null;
    SAMFileWriter sfw = null;
    // contig names that could not be translated, reported at the end
    final Set<String> unmappedChromosomes = new HashSet<>();
    try {
        final ContigNameConverter customMapping = ContigNameConverter.fromFile(mappingFile);
        customMapping.setOnNotFound(this.onNotFound);
        sfr = super.openSamReader(oneFileOrNull(args));
        final SAMFileHeader header1 = sfr.getFileHeader();
        if (header1 == null) {
            LOG.error("File header missing");
            return -1;
        }
        final SAMFileHeader header2 = header1.clone();
        // create new sequence dict
        final SAMSequenceDictionary dict1 = header1.getSequenceDictionary();
        if (dict1 == null) {
            LOG.error("Sequence dict missing");
            return -1;
        }
        final List<SAMSequenceRecord> ssrs = new ArrayList<SAMSequenceRecord>(dict1.size());
        for (int i = 0; i < dict1.size(); ++i) {
            SAMSequenceRecord ssr = dict1.getSequence(i);
            final String newName = customMapping.apply(ssr.getSequenceName());
            if (newName == null) {
                // skip unknown chromosomes
                // FIX: record them so the warning below is actually informative
                // (the set was declared but never populated in the original)
                unmappedChromosomes.add(ssr.getSequenceName());
                continue;
            }
            ssr = new SAMSequenceRecord(newName, ssr.getSequenceLength());
            ssrs.add(ssr);
        }
        // FIX: the original built and installed the translated dictionary twice;
        // build it once and set it once.
        final SAMSequenceDictionary dict2 = new SAMSequenceDictionary(ssrs);
        header2.setSequenceDictionary(dict2);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict1);
        sfw = this.writingBamArgs.openSAMFileWriter(output, header2, true);
        long num_ignored = 0L;
        final SAMRecordIterator iter = sfr.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec1 = iter.next();
            progress.watch(rec1);
            String newName1 = null;
            String newName2 = null;
            // resolve the new names BEFORE switching the record to the new header
            if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getReferenceName())) {
                newName1 = customMapping.apply(rec1.getReferenceName());
            }
            if (rec1.getReadPairedFlag() && !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getMateReferenceName())) {
                newName2 = customMapping.apply(rec1.getMateReferenceName());
            }
            rec1.setHeader(header2);
            if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getReferenceName())) {
                if (newName1 == null) {
                    // no mapping for this contig: drop the record
                    unmappedChromosomes.add(rec1.getReferenceName());
                    ++num_ignored;
                    continue;
                }
                rec1.setReferenceName(newName1);
            }
            if (rec1.getReadPairedFlag() && !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getMateReferenceName())) {
                if (newName2 == null) {
                    // no mapping for the mate's contig: drop the record
                    unmappedChromosomes.add(rec1.getMateReferenceName());
                    ++num_ignored;
                    continue;
                }
                rec1.setMateReferenceName(newName2);
            }
            sfw.addAlignment(rec1);
        }
        if (!unmappedChromosomes.isEmpty()) {
            LOG.warning("Unmapped chromosomes: " + unmappedChromosomes);
        }
        LOG.warning("num ignored read:" + num_ignored);
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(sfr);
        CloserUtil.close(sfw);
    }
}
Use of htsjdk.samtools.SAMRecordIterator in project jvarkit by lindenb.
From class CoverageNormalizer, method run().
/**
 * Two-pass coverage normalization.
 *
 * Pass 1: walks the sorted BAM once, accumulates per-base depth per chromosome in a
 * short[] (saturating at Short.MAX_VALUE), and on each chromosome change flushes the
 * mean depth of every sliding window (window_size wide, window_shift apart) as raw
 * floats into a gzipped temp file, tracking the global min/max window coverage.
 *
 * Pass 2: re-reads the temp file to min/max-normalize the values and compute their
 * median via a SortingCollection, then re-reads it again to print BED-like lines
 * (chrom, start, end, normalized value, median) to stdout, merging consecutive
 * windows whose values truncate to the same integer.
 *
 * @param sfr the opened BAM reader; closed inside this method after pass 1
 * @return 0 on success, -1 on error
 * @throws IOException if the temp file cannot be created
 */
private int run(SamReader sfr) throws IOException {
SAMSequenceDictionary dictionary = sfr.getFileHeader().getSequenceDictionary();
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dictionary);
File tmpFile1 = File.createTempFile("cov_", ".dat.gz", this.writingSortingCollection.getTmpDirectories().get(0));
tmpFile1.deleteOnExit();
LOG.info("Opening tmp File " + tmpFile1);
GZIPOutputStream gos = null;
DataInputStream dis = null;
SortingCollection<Float> median = null;
try {
gos = new GZIPOutputStream(new FileOutputStream(tmpFile1));
DataOutputStream daos = new DataOutputStream(gos);
SAMRecordIterator iter = sfr.iterator();
// tid of the chromosome currently accumulated in 'array'; -1 = none yet
int curr_tid = -1;
short[] array = null;
float minCov = Float.MAX_VALUE;
float maxCov = 0;
// number of window values written per chromosome (indexed by tid)
int[] num_written = new int[dictionary.size()];
Arrays.fill(num_written, 0);
// PASS 1: the loop runs one extra iteration with rec==null to flush the last chromosome
for (; ; ) {
SAMRecord rec = null;
if (iter.hasNext()) {
rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
progress.watch(rec);
if (rec.isSecondaryOrSupplementary())
continue;
}
// chromosome change (or end of input): flush the previous chromosome's windows
if (rec == null || curr_tid != rec.getReferenceIndex()) {
if (curr_tid != -1) {
LOG.info("Writing data for chromosome " + dictionary.getSequence(curr_tid).getSequenceName());
for (int i = 0; i + window_size <= array.length; i += window_shift) {
float sum = 0;
for (int j = 0; j < window_size; ++j) {
sum += array[i + j];
}
// mean depth of this window
float v = sum / window_size;
daos.writeFloat(v);
minCov = (float) Math.min(minCov, v);
maxCov = (float) Math.max(maxCov, v);
num_written[curr_tid]++;
}
LOG.info("End writing data N=" + num_written[curr_tid]);
}
if (rec == null)
break;
// allocate the depth array for the new chromosome
curr_tid = rec.getReferenceIndex();
SAMSequenceRecord ssr = dictionary.getSequence(curr_tid);
LOG.info("allocate " + ssr.getSequenceLength() + " for " + ssr.getSequenceName());
array = null;
System.gc();
array = new short[ssr.getSequenceLength()];
LOG.info("done: allocate.");
Arrays.fill(array, (short) 0);
}
// increment depth over the read span, saturating at Short.MAX_VALUE.
// NOTE(review): uses the 1-based getAlignmentStart() directly as an array index
// and excludes getAlignmentEnd() — looks like a one-base shift; confirm intended.
for (int i = rec.getAlignmentStart(); i < rec.getAlignmentEnd() && i < array.length; ++i) {
array[i] = (short) Math.min((int) Short.MAX_VALUE, 1 + (int) array[i]);
}
}
array = null;
LOG.info("Closing BAM");
CloserUtil.close(sfr);
LOG.info("Closing " + tmpFile1);
daos.flush();
gos.finish();
gos.flush();
gos.close();
// start normalizing min/max find median value
long nWritten = 0L;
median = SortingCollection.newInstance(Float.class, new FloatCodec(), new FloatCmp(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
median.setDestructiveIteration(true);
dis = new DataInputStream(new GZIPInputStream(new FileInputStream(tmpFile1)));
// re-read pass-1 values; normalize to [0,1] and keep those above min_coverage
for (int n_items : num_written) {
for (int i = 0; i < n_items; ++i) {
float v = dis.readFloat();
if (v < min_coverage)
continue;
v = (float) ((v - minCov) / (double) (maxCov - minCov));
median.add(v);
++nWritten;
}
}
median.doneAdding();
CloserUtil.close(dis);
// get median: advance halfway through the sorted values
float median_value = 0f;
long half = nWritten / 2L;
CloseableIterator<Float> iterFloat = median.iterator();
while (iterFloat.hasNext() && half > 0) {
median_value = iterFloat.next();
--half;
if (half <= 0) {
LOG.info("median = " + median_value);
break;
}
}
CloserUtil.close(iterFloat);
median.cleanup();
median = null;
progress = new SAMSequenceDictionaryProgress(dictionary);
// dump data: second re-read of the temp file, printing merged window runs
dis = new DataInputStream(new GZIPInputStream(new FileInputStream(tmpFile1)));
for (int chrom_id = 0; chrom_id < num_written.length; ++chrom_id) {
int n_items = num_written[chrom_id];
int i = 0;
Float value_start = null;
while (i < n_items) {
if (value_start == null) {
value_start = dis.readFloat();
}
// extend the run while consecutive values truncate to the same integer
int j = i + 1;
Float value_end = null;
while (j < n_items) {
value_end = dis.readFloat();
if (value_start.intValue() == value_end.intValue()) {
++j;
} else {
break;
}
}
progress.watch(chrom_id, i * window_shift);
// emit: chrom, start, end, normalized value of the run's first window, median
if (value_start >= min_coverage) {
System.out.print(dictionary.getSequence(chrom_id).getSequenceName());
System.out.print('\t');
System.out.print(i * window_shift);
System.out.print('\t');
System.out.print((j - 1) * window_shift + window_size);
System.out.print('\t');
System.out.print((float) ((value_start - minCov) / (double) (maxCov - minCov)));
System.out.print('\t');
System.out.print(median_value);
System.out.println();
if (System.out.checkError())
break;
}
// the value that ended the run starts the next one
i = j;
if (value_end == null)
break;
value_start = value_end;
}
}
CloserUtil.close(dis);
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(gos);
CloserUtil.close(dis);
if (tmpFile1 != null)
tmpFile1.delete();
if (median != null)
median.cleanup();
}
}
Aggregations