Use of htsjdk.samtools.SamReader in project jvarkit by lindenb.
The class SamAddPI, method doWork().
@Override
public int doWork(final List<String> args) {
final Map<String, List<Integer>> rg2insertsize = new HashMap<>();
SamReader sfr = null;
SamReader sfrTmp = null;
File tmpBam = null;
SAMFileWriter tmpBamWriter = null;
SAMFileWriter outWriter = null;
CloseableIterator<SAMRecord> iter = null;
CloseableIterator<SAMRecord> iterTmp = null;
try {
sfr = openSamReader(oneFileOrNull(args));
SAMFileHeader header = sfr.getFileHeader();
for (final SAMReadGroupRecord rg : header.getReadGroups()) {
if (!overwrite_existing && rg.getPredictedMedianInsertSize() != null) {
continue;
}
rg2insertsize.put(rg.getId(), new ArrayList<>((int) (this.num_read_to_test < 1L ? 10000 : this.num_read_to_test)));
}
tmpBam = File.createTempFile("__addpi", ".bam");
tmpBamWriter = this.writingBamArgs.openSAMFileWriter(tmpBam, header, true);
iter = sfr.iterator();
int n_processed = 0;
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
while (iter.hasNext() && (this.num_read_to_test < 0 || n_processed < this.num_read_to_test)) {
final SAMRecord rec = progress.watch(iter.next());
tmpBamWriter.addAlignment(rec);
final SAMReadGroupRecord rg = rec.getReadGroup();
if (rg == null)
continue;
final List<Integer> insertlist = rg2insertsize.get(rg.getId());
if (insertlist == null)
continue;
if (rec.getReadUnmappedFlag())
continue;
if (!rec.getReadPairedFlag())
continue;
if (!rec.getFirstOfPairFlag())
continue;
if (rec.getMateUnmappedFlag())
continue;
if (this.samRecordFilter.filterOut(rec))
continue;
final int len = rec.getInferredInsertSize();
if (len == 0)
continue;
insertlist.add(Math.abs(len));
++n_processed;
}
tmpBamWriter.close();
tmpBamWriter = null;
// reopen tmp file
sfrTmp = super.createSamReaderFactory().open(tmpBam);
iterTmp = sfrTmp.iterator();
// update dMedianInsertSize
for (final SAMReadGroupRecord rg : header.getReadGroups()) {
final List<Integer> insertlist = rg2insertsize.get(rg.getId());
if (insertlist == null || insertlist.isEmpty())
continue;
rg.setPredictedMedianInsertSize((int) Percentile.median().evaluate(insertlist.stream().mapToDouble(I -> I.doubleValue()).toArray()));
}
header.addComment("Processed with " + getClass().getSimpleName() + " " + getProgramCommandLine());
outWriter = this.writingBamArgs.openSAMFileWriter(this.outputFile, header, true);
while (iterTmp.hasNext()) {
outWriter.addAlignment(iterTmp.next());
}
iterTmp.close();
iterTmp = null;
sfrTmp.close();
sfrTmp = null;
tmpBam.delete();
// finish writing original input
while (iter.hasNext()) {
outWriter.addAlignment(progress.watch(iter.next()));
}
progress.finish();
iter.close();
iter = null;
sfr.close();
sfr = null;
outWriter.close();
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(iterTmp);
CloserUtil.close(sfrTmp);
CloserUtil.close(sfr);
CloserUtil.close(tmpBamWriter);
if (tmpBam != null)
tmpBam.delete();
CloserUtil.close(outWriter);
}
}
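The method above is a two-pass design: it buffers the stream to a temporary BAM while sampling insert sizes, then rewrites everything under the updated header. As a minimal, self-contained sketch of just the sampling pass, assuming a local "input.bam" (a placeholder) and a plain sorted-list median instead of jvarkit's Percentile helper:

import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import java.io.File;
import java.util.*;

public class MedianInsertSizeSketch {
    public static void main(final String[] args) throws Exception {
        final Map<String, List<Integer>> rg2sizes = new HashMap<>();
        try (SamReader reader = SamReaderFactory.makeDefault().open(new File("input.bam"))) { // hypothetical path
            long n = 0;
            for (final SAMRecord rec : reader) {
                if (++n > 10000) break; // sample cap, like num_read_to_test above
                if (rec.getReadUnmappedFlag() || !rec.getReadPairedFlag()) continue;
                if (!rec.getFirstOfPairFlag() || rec.getMateUnmappedFlag()) continue;
                final int tlen = rec.getInferredInsertSize();
                final SAMReadGroupRecord rg = rec.getReadGroup();
                if (tlen == 0 || rg == null) continue;
                rg2sizes.computeIfAbsent(rg.getId(), k -> new ArrayList<>()).add(Math.abs(tlen));
            }
        }
        for (final Map.Entry<String, List<Integer>> e : rg2sizes.entrySet()) {
            final List<Integer> sizes = e.getValue();
            Collections.sort(sizes);
            // lower median of the sorted |TLEN| values
            System.out.println(e.getKey() + "\tPI=" + sizes.get(sizes.size() / 2));
        }
    }
}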
Use of htsjdk.samtools.SamReader in project jvarkit by lindenb.
The class SamClipIndelFraction, method doWork().
@Override
public int doWork(final List<String> args) {
SamReader sfr = null;
SAMRecordIterator iter = null;
PrintWriter pw = null;
try {
sfr = openSamReader(oneFileOrNull(args));
pw = super.openFileOrStdoutAsPrintWriter(outputFile);
long total_bases_count = 0L;
long count_clipped_reads = 0L;
long count_clipped_left_reads = 0L;
long count_clipped_right_reads = 0L;
long count_unclipped_reads = 0L;
long count_unmapped_reads = 0L;
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(sfr.getFileHeader()).logger(LOG);
Counter<Integer> counter = new Counter<>();
iter = sfr.iterator();
while (iter.hasNext()) {
final SAMRecord record = progress.watch(iter.next());
if (record.getReadUnmappedFlag()) {
++count_unmapped_reads;
continue;
}
final Cigar cigar = record.getCigar();
int left_clip_length = 0;
int right_clip_length = 0;
int deletion_N_length = 0;
int deletion_D_length = 0;
int insertion_length = 0;
boolean onLeft = true;
for (int i = 0; i < cigar.numCigarElements(); ++i) {
final CigarElement ce = cigar.getCigarElement(i);
final CigarOperator op = ce.getOperator();
switch(op) {
case N:
{
onLeft = false;
deletion_N_length += ce.getLength();
total_bases_count += ce.getLength();
break;
}
case D:
{
onLeft = false;
deletion_D_length += ce.getLength();
total_bases_count += ce.getLength();
break;
}
case I:
{
onLeft = false;
insertion_length += ce.getLength();
total_bases_count += ce.getLength();
break;
}
case S:
case H:
{
if (onLeft) {
if (record.getReadNegativeStrandFlag()) {
right_clip_length += ce.getLength();
} else {
left_clip_length += ce.getLength();
}
} else {
if (record.getReadNegativeStrandFlag()) {
left_clip_length += ce.getLength();
} else {
right_clip_length += ce.getLength();
}
}
total_bases_count += ce.getLength();
break;
}
default:
{
onLeft = false;
if (op.consumesReadBases()) {
total_bases_count += ce.getLength();
}
break;
}
}
}
if (left_clip_length + right_clip_length == 0) {
count_unclipped_reads++;
} else {
if (left_clip_length > 0)
count_clipped_left_reads++;
if (right_clip_length > 0)
count_clipped_right_reads++;
count_clipped_reads++;
}
switch(type) {
case leftclip:
counter.incr(left_clip_length);
break;
case rightclip:
counter.incr(right_clip_length);
break;
case allclip:
counter.incr(left_clip_length + right_clip_length);
break;
case deletion:
counter.incr(deletion_D_length + deletion_N_length);
break;
case insert:
counter.incr(insertion_length);
break;
default:
LOG.error("Bad type: " + type);
return -1;
}
}
progress.finish();
pw.println("##UNMAPPED_READS=" + count_unmapped_reads);
pw.println("##MAPPED_READS=" + (count_clipped_reads + count_unclipped_reads));
pw.println("##CLIPPED_READS=" + count_clipped_reads);
pw.println("##CLIPPED_READS_5_PRIME=" + count_clipped_left_reads);
pw.println("##CLIPPED_READS_3_PRIME=" + count_clipped_right_reads);
pw.println("##UNCLIPPED_READS=" + count_unclipped_reads);
pw.println("##COUNT_BASES=" + total_bases_count);
pw.print("#");
switch(type) {
case leftclip:
pw.print("CLIP_5_PRIME");
break;
case rightclip:
pw.print("CLIP_3_PRIME");
break;
case allclip:
pw.print("CLIP");
break;
case deletion:
pw.print("DELETION");
break;
case insert:
pw.print("INSERTION");
break;
default:
LOG.error("Bad type: " + type);
return -1;
}
pw.println("\tCOUNT\tFRACTION_OF_MAPPED_READS");
for (final Integer size : new TreeSet<Integer>(counter.keySet())) {
pw.print(size);
pw.print('\t');
pw.print(counter.count(size));
pw.print('\t');
pw.println(counter.count(size) / (double) (count_clipped_reads + count_unclipped_reads));
}
pw.flush();
pw.close();
pw = null;
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(sfr);
CloserUtil.close(pw);
}
}
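The switch above classifies each CIGAR element while tracking which side of the read it falls on. Here is a hedged, standalone sketch of the clip-length computation using htsjdk's TextCigarCodec on an illustrative CIGAR string (class and method names are mine, not jvarkit's):

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.TextCigarCodec;
import java.util.List;

public class ClipLengthSketch {
    static int leadingClip(final Cigar cigar) {
        int n = 0;
        for (final CigarElement ce : cigar.getCigarElements()) {
            if (!ce.getOperator().isClipping()) break; // S/H occur only at the ends
            n += ce.getLength();
        }
        return n;
    }

    static int trailingClip(final Cigar cigar) {
        final List<CigarElement> elements = cigar.getCigarElements();
        int n = 0;
        for (int i = elements.size() - 1; i >= 0 && elements.get(i).getOperator().isClipping(); --i) {
            n += elements.get(i).getLength();
        }
        return n;
    }

    public static void main(final String[] args) {
        final Cigar cigar = TextCigarCodec.decode("5S90M3I2D10M8S");
        System.out.println("left=" + leadingClip(cigar) + " right=" + trailingClip(cigar)); // left=5 right=8
    }
}

Note that, as in the tool above, the strand flag must still be consulted if "left" is supposed to mean 5' of the sequenced read rather than the leftmost reference coordinate.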
Use of htsjdk.samtools.SamReader in project jvarkit by lindenb.
The class BamLiftOver, method doWork().
@Override
public int doWork(final List<String> args) {
final double minMatch = (this.userMinMatch <= 0.0 ? LiftOver.DEFAULT_LIFTOVER_MINMATCH : this.userMinMatch);
if (this.liftOverFile == null) {
LOG.error("LiftOver file is undefined.");
return -1;
}
if (this.faidx == null) {
LOG.error("New Sequence Dictionary file is undefined.");
return -1;
}
SAMRecordIterator iter = null;
SamReader sfr = null;
SAMFileWriter sfw = null;
try {
LOG.info("Reading " + liftOverFile);
LiftOver liftOver = new LiftOver(liftOverFile);
liftOver.setLiftOverMinMatch(minMatch);
final SAMSequenceDictionary newDict = SAMSequenceDictionaryExtractor.extractDictionary(faidx);
sfr = super.openSamReader(oneFileOrNull(args));
final SAMFileHeader headerIn = sfr.getFileHeader();
final SAMFileHeader headerOut = headerIn.clone();
headerOut.setSortOrder(SortOrder.unsorted);
sfw = this.writingBamArgs.openSAMFileWriter(outputFile, headerOut, true);
iter = sfr.iterator();
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
final SAMRecord copy = (SAMRecord) rec.clone();
copy.setHeader(headerOut);
final StringBuilder sb = new StringBuilder();
if (!rec.getReadUnmappedFlag()) {
final String chrom = rec.getReferenceName();
int pos = rec.getAlignmentStart();
final Interval interval = liftOver.liftOver(new Interval(chrom, pos, pos, rec.getReadNegativeStrandFlag(), null));
if (interval != null) {
sb.append(chrom + ":" + pos + ":" + (rec.getReadNegativeStrandFlag() ? "-" : "+"));
final SAMSequenceRecord ssr = newDict.getSequence(interval.getContig());
if (ssr == null) {
sfr.close();
sfr = null;
LOG.error("the chromosome " + interval.getContig() + " is undefined in the sequence dict.");
return -1;
}
copy.setReferenceName(ssr.getSequenceName());
copy.setReferenceIndex(ssr.getSequenceIndex());
copy.setAlignmentStart(interval.getStart());
copy.setReadNegativeStrandFlag(interval.isNegativeStrand());
if (rec.getReadNegativeStrandFlag() != copy.getReadNegativeStrandFlag()) {
copy.setReadString(AcidNucleics.reverseComplement(rec.getReadString()));
byte[] qual = rec.getBaseQualities();
byte[] quals2 = new byte[qual.length];
for (int i = 0; i < qual.length; ++i) {
quals2[i] = qual[(qual.length - 1) - i];
}
copy.setBaseQualities(quals2);
}
} else {
sb.append(".");
SAMUtils.makeReadUnmapped(copy);
}
}
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
sb.append("/");
String chrom = rec.getMateReferenceName();
int pos = rec.getMateAlignmentStart();
final Interval interval = liftOver.liftOver(new Interval(chrom, pos, pos, rec.getMateNegativeStrandFlag(), null));
if (interval != null) {
sb.append(chrom + ":" + pos + ":" + (rec.getMateNegativeStrandFlag() ? "-" : "+"));
final SAMSequenceRecord ssr = newDict.getSequence(interval.getContig());
if (ssr == null) {
sfr.close();
sfr = null;
LOG.error("the chromosome " + interval.getContig() + " is undefined in the sequence dict.");
return -1;
}
copy.setMateReferenceName(ssr.getSequenceName());
copy.setMateReferenceIndex(ssr.getSequenceIndex());
copy.setMateAlignmentStart(interval.getStart());
copy.setMateNegativeStrandFlag(interval.isNegativeStrand());
if (copy.getReadUnmappedFlag() || !copy.getReferenceIndex().equals(copy.getMateReferenceIndex())) {
// read and mate no longer share a contig: the pair cannot remain 'proper'
copy.setProperPairFlag(false);
copy.setInferredInsertSize(0);
}
} else {
sb.append(".");
SAMUtils.makeReadUnmapped(copy);
}
}
if (sb.length() > 0)
copy.setAttribute("LO", sb.toString());
sfw.addAlignment(copy);
}
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(sfr);
CloserUtil.close(sfw);
}
}
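The record rewriting above hinges on one htsjdk call, LiftOver.liftOver(Interval), which returns null when a position cannot be mapped. A minimal sketch of that call in isolation; the chain-file name is a placeholder, not something shipped with jvarkit:

import htsjdk.samtools.liftover.LiftOver;
import htsjdk.samtools.util.Interval;
import java.io.File;

public class LiftOverSketch {
    public static void main(final String[] args) {
        final LiftOver liftOver = new LiftOver(new File("hg19ToHg38.over.chain")); // placeholder chain file
        liftOver.setLiftOverMinMatch(LiftOver.DEFAULT_LIFTOVER_MINMATCH);
        // a 1-bp interval on the forward strand, as built in the loop above
        final Interval src = new Interval("chr1", 1000000, 1000000, false, null);
        final Interval dst = liftOver.liftOver(src);
        if (dst == null) {
            System.out.println("not mappable: BamLiftOver would call SAMUtils.makeReadUnmapped()");
        } else {
            System.out.println(dst.getContig() + ":" + dst.getStart() + (dst.isNegativeStrand() ? "(-)" : "(+)"));
        }
    }
}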
Use of htsjdk.samtools.SamReader in project jvarkit by lindenb.
The class SamExtractClip, method doWork().
@Override
public int doWork(List<String> args) {
SamReader r = null;
BasicFastqWriter out = null;
try {
if (this.outputFile != null) {
LOG.info("writing to " + this.outputFile);
out = new BasicFastqWriter(this.outputFile);
} else {
LOG.info("writing to stdout");
out = new BasicFastqWriter(stdout());
}
if (args.isEmpty()) {
LOG.info("Reading from stdin");
r = createSamReaderFactory().open(SamInputResource.of(stdin()));
run(r, out);
r.close();
} else {
for (final String filename : args) {
LOG.info("Reading from " + filename);
r = createSamReaderFactory().open(SamInputResource.of(filename));
run(r, out);
r.close();
r = null;
}
}
out.flush();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(out);
}
}
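The run(r, out) helper that actually extracts the clipped bases is not part of this snippet. For the writer side it relies on, here is a hedged sketch of htsjdk's BasicFastqWriter emitting a single record to stdout; the read name and bases are made up:

import htsjdk.samtools.fastq.BasicFastqWriter;
import htsjdk.samtools.fastq.FastqRecord;
import java.io.PrintStream;

public class FastqWriterSketch {
    public static void main(final String[] args) {
        final BasicFastqWriter out = new BasicFastqWriter(new PrintStream(System.out));
        // read name, bases, quality header (usually empty), Phred+33 qualities
        out.write(new FastqRecord("read1/1", "ACGTACGT", "", "IIIIIIII"));
        out.flush();
        out.close();
    }
}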
Use of htsjdk.samtools.SamReader in project jvarkit by lindenb.
The class SamFindClippedRegions, method doWork().
/*private static boolean closeTo(int pos1,int pos2, int max)
{
return Math.abs(pos2-pos1)<=max;
}*/
/*
private static boolean same(char c1,char c2)
{
if(c1=='N' || c2=='N') return false;
return Character.toUpperCase(c1)==Character.toUpperCase(c2);
}*/
@Override
public int doWork(List<String> args) {
int readLength = 150;
if (args.isEmpty()) {
LOG.error("illegal.number.of.arguments");
return -1;
}
List<Input> inputs = new ArrayList<Input>();
VariantContextWriter w = null;
// SAMFileWriter w=null;
try {
SAMSequenceDictionary dict = null;
/* create input, collect sample names */
Map<String, Input> sample2input = new HashMap<String, Input>();
for (final String filename : args) {
Input input = new Input(new File(filename));
// input.index=inputs.size();
inputs.add(input);
if (sample2input.containsKey(input.sampleName)) {
LOG.error("Duplicate sample " + input.sampleName + " in " + input.bamFile + " and " + sample2input.get(input.sampleName).bamFile);
return -1;
}
sample2input.put(input.sampleName, input);
if (dict == null) {
dict = input.header.getSequenceDictionary();
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, input.header.getSequenceDictionary())) {
LOG.error("Found more than one dictint sequence dictionary");
return -1;
}
}
LOG.info("Sample N= " + sample2input.size());
/* create merged iterator */
List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(sample2input.size());
for (Input input : inputs) headers.add(input.header);
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SortOrder.coordinate, headers, true);
List<SamReader> readers = new ArrayList<SamReader>(sample2input.size());
for (Input input : inputs) readers.add(input.samFileReaderScan);
MergingSamRecordIterator mergingIter = new MergingSamRecordIterator(headerMerger, readers, true);
Allele reference_allele = Allele.create("N", true);
Allele[] alternate_alleles = new Allele[] { Allele.create("<CLIP5>", false), Allele.create("<CLIP3>", false) };
Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
for (Allele alt : alternate_alleles) {
vcfHeaderLines.add(new VCFSimpleHeaderLine("<ID=" + alt.getDisplayString() + ",Description=\"StructVar\">", VCFHeaderVersion.VCF4_1, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")));
}
vcfHeaderLines.add(new VCFInfoHeaderLine("COUNT_SAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples with depth>=" + this.min_depth));
vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
for (int side = 0; side < 2; ++side) {
vcfHeaderLines.add(new VCFFormatHeaderLine("CN" + (side == 0 ? 5 : 3), 1, VCFHeaderLineType.Integer, "count clipped in " + (side == 0 ? 5 : 3) + "'"));
}
if (dict != null) {
vcfHeaderLines.addAll(VCFUtils.samSequenceDictToVCFContigHeaderLine(dict));
}
VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, sample2input.keySet());
w = VCFUtils.createVariantContextWriterToStdout();
w.writeHeader(vcfHeader);
final IntervalTreeMap<Boolean> intervals = new IntervalTreeMap<>();
// w=swf.make(header, System.out);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
if (bedFile != null) {
final BedLineCodec bedLineCodec = new BedLineCodec();
LOG.info("Reading " + bedFile);
BufferedReader r = IOUtils.openFileForBufferedReading(bedFile);
String line;
while ((line = r.readLine()) != null) {
BedLine bedLine = bedLineCodec.decode(line);
if (bedLine == null)
continue;
if (dict != null && dict.getSequence(bedLine.getContig()) == null) {
LOG.warning("undefined chromosome in " + bedFile + " " + line);
continue;
}
intervals.put(bedLine.toInterval(), true);
}
CloserUtil.close(r);
}
LinkedList<SAMRecord> buffer = new LinkedList<SAMRecord>();
final Predicate<SAMRecord> filterSamRecords = new Predicate<SAMRecord>() {
@Override
public boolean test(SAMRecord rec) {
if (rec.getReadUnmappedFlag())
return false;
if (rec.isSecondaryOrSupplementary())
return false;
if (rec.getDuplicateReadFlag())
return false;
if (rec.getReadFailsVendorQualityCheckFlag())
return false;
Cigar cigar = rec.getCigar();
if (cigar == null || cigar.numCigarElements() < 2)
return false;
boolean found_S = false;
for (int side = 0; side < 2; ++side) {
CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
// read must be clipped on 5' or 3' with a good length
if (!ce.getOperator().equals(CigarOperator.S))
continue;
found_S = true;
break;
}
if (!found_S)
return false;
SAMReadGroupRecord g = rec.getReadGroup();
if (g == null || g.getSample() == null || g.getSample().isEmpty())
return false;
return true;
}
};
final FilteringIterator<SAMRecord> forwardIterator = new FilteringIterator<SAMRecord>(mergingIter, filterSamRecords);
for (; ; ) {
SAMRecord rec = null;
if (forwardIterator.hasNext()) {
rec = forwardIterator.next();
progress.watch(rec);
if (bedFile != null && !intervals.containsOverlapping(new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd())))
continue;
}
// need to flush buffer ?
if (rec == null || (!buffer.isEmpty() && !buffer.getLast().getReferenceIndex().equals(rec.getReferenceIndex())) || (!buffer.isEmpty() && buffer.getLast().getUnclippedEnd() + readLength < rec.getUnclippedStart())) {
if (!buffer.isEmpty()) {
int chromStart = buffer.getFirst().getUnclippedStart();
int chromEnd = buffer.getFirst().getUnclippedEnd();
for (SAMRecord sam : buffer) {
chromStart = Math.min(chromStart, sam.getUnclippedStart());
chromEnd = Math.max(chromEnd, sam.getUnclippedEnd());
}
final int winShift = 5;
for (int pos = chromStart; pos + winShift <= chromEnd; pos += winShift) {
int[] count_big_clip = new int[] { 0, 0 };
// int max_depth[]=new int[]{0,0};
List<Genotype> genotypes = new ArrayList<Genotype>();
Set<Allele> all_alleles = new HashSet<Allele>();
all_alleles.add(reference_allele);
boolean found_one_depth_ok = false;
int sum_depth = 0;
int samples_with_high_depth = 0;
for (String sample : sample2input.keySet()) {
GenotypeBuilder gb = new GenotypeBuilder(sample);
int[] count_clipped = new int[] { 0, 0 };
Set<Allele> sample_alleles = new HashSet<Allele>(3);
for (int side = 0; side < 2; ++side) {
for (SAMRecord sam : buffer) {
if (!sam.getReadGroup().getSample().equals(sample))
continue;
Cigar cigar = sam.getCigar();
CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
if (!ce.getOperator().equals(CigarOperator.S))
continue;
int clipStart = (side == 0 ? sam.getUnclippedStart() : sam.getAlignmentEnd() + 1);
int clipEnd = (side == 0 ? sam.getAlignmentStart() - 1 : sam.getUnclippedEnd());
if ((pos + winShift < clipStart || pos > clipEnd))
continue;
count_clipped[side]++;
if (ce.getLength() >= this.min_clip_length) {
count_big_clip[side]++;
}
sample_alleles.add(alternate_alleles[side]);
gb.attribute("CN" + (side == 0 ? 5 : 3), count_clipped[side]);
}
}
// if(!(found_one_big_clip[0] || found_one_big_clip[1])) continue;
if (count_clipped[0] + count_clipped[1] == 0)
continue;
if ((count_clipped[0] + count_clipped[1]) >= min_depth) {
found_one_depth_ok = true;
++samples_with_high_depth;
}
sum_depth += (count_clipped[0] + count_clipped[1]);
gb.alleles(new ArrayList<Allele>(sample_alleles));
all_alleles.addAll(sample_alleles);
gb.DP(count_clipped[0] + count_clipped[1]);
genotypes.add(gb.make());
}
if (all_alleles.size() == 1) {
// all homozygotes
continue;
}
if (!found_one_depth_ok) {
continue;
}
if (!(count_big_clip[0] >= 1 || count_big_clip[1] >= 1)) {
continue;
}
Map<String, Object> atts = new HashMap<String, Object>();
atts.put("COUNT_SAMPLES", samples_with_high_depth);
atts.put(VCFConstants.DEPTH_KEY, sum_depth);
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(buffer.getFirst().getReferenceName());
vcb.start(pos);
vcb.stop(pos + winShift);
vcb.alleles(all_alleles);
vcb.attributes(atts);
vcb.genotypes(genotypes);
w.add(vcb.make());
}
buffer.clear();
}
if (rec == null) {
break;
}
}
buffer.add(rec);
}
mergingIter.close();
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
for (Input input : inputs) {
CloserUtil.close(input);
}
}
}
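A hedged sketch of the merge pattern used above: open several coordinate-sorted BAMs, merge their headers with SamFileHeaderMerger, and walk the union in coordinate order with MergingSamRecordIterator. The file names are placeholders:

import htsjdk.samtools.*;
import htsjdk.samtools.util.CloserUtil;
import java.io.File;
import java.util.ArrayList;
import java.util.List;

public class MergedIterationSketch {
    public static void main(final String[] args) {
        final List<SamReader> readers = new ArrayList<>();
        final List<SAMFileHeader> headers = new ArrayList<>();
        for (final String path : new String[] { "sample1.bam", "sample2.bam" }) { // placeholders
            final SamReader r = SamReaderFactory.makeDefault().open(new File(path));
            readers.add(r);
            headers.add(r.getFileHeader());
        }
        final SamFileHeaderMerger merger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true);
        final MergingSamRecordIterator iter = new MergingSamRecordIterator(merger, readers, true);
        while (iter.hasNext()) {
            final SAMRecord rec = iter.next();
            // one stream over all inputs, sorted by coordinate
            System.out.println(rec.getReadName() + "\t" + rec.getContig() + ":" + rec.getAlignmentStart());
        }
        iter.close();
        for (final SamReader r : readers) CloserUtil.close(r);
    }
}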