Use of htsjdk.samtools.util.IntervalTreeMap in project jvarkit by lindenb.
The class KnimeVariantHelper, method parseBedAsBooleanIntervalTreeMap.
public IntervalTreeMap<Boolean> parseBedAsBooleanIntervalTreeMap(final String bedUri) throws IOException {
    if (bedUri == null || bedUri.isEmpty())
        throw new IllegalArgumentException("bad bed uri");
    BufferedReader r = null;
    final IntervalTreeMap<Boolean> treeMap = new IntervalTreeMap<>();
    try {
        r = IOUtils.openURIForBufferedReading(bedUri);
        final BedLineCodec codec = new BedLineCodec();
        r.lines().forEach(L -> {
            final BedLine bedline = codec.decode(L);
            if (bedline == null) {
                LOG.warn("Ignoring line in BED (doesn't look like a BED line): " + L);
                return;
            }
            treeMap.put(bedline.toInterval(), Boolean.TRUE);
        });
        return treeMap;
    } finally {
        CloserUtil.close(r);
    }
}
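A minimal usage sketch for the helper above, assuming a KnimeVariantHelper instance named helper; the BED path and coordinates are made up:

// Hypothetical usage: 'helper' is a KnimeVariantHelper instance; "targets.bed" is a made-up path.
final IntervalTreeMap<Boolean> targets = helper.parseBedAsBooleanIntervalTreeMap("targets.bed");
// Interval-based lookup: does any BED record overlap chr1:10,000-10,100?
if (targets.containsOverlapping(new Interval("chr1", 10000, 10100))) {
    System.out.println("region is covered by the BED");
}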
Use of htsjdk.samtools.util.IntervalTreeMap in project jvarkit by lindenb.
The class Launcher, method readBedFileAsBooleanIntervalTreeMap.
/**
 * Reads a BED file and converts it to an IntervalTreeMap<Boolean>.
 */
protected IntervalTreeMap<Boolean> readBedFileAsBooleanIntervalTreeMap(final java.io.File file) throws java.io.IOException {
    java.io.BufferedReader r = null;
    try {
        final IntervalTreeMap<Boolean> intervals = new IntervalTreeMap<Boolean>();
        r = com.github.lindenb.jvarkit.io.IOUtils.openFileForBufferedReading(file);
        final BedLineCodec bedCodec = new BedLineCodec();
        r.lines().
            filter(line -> !(line.startsWith("#") || com.github.lindenb.jvarkit.util.bio.bed.BedLine.isBedHeader(line) || line.isEmpty())).
            map(line -> bedCodec.decode(line)).
            filter(B -> B != null).
            map(B -> B.toInterval()).
            filter(L -> L.getStart() < L.getEnd()).
            forEach(L -> {
                intervals.put(L, true);
            });
        return intervals;
    } finally {
        htsjdk.samtools.util.CloserUtil.close(r);
    }
}
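A hedged sketch of how a caller might use this Launcher helper to gate VCF records on the BED map; VCFFileReader is htsjdk's VCF reader, and the file names are made up:

// Hypothetical caller: keep only variants overlapping the BED (file names are made up).
final IntervalTreeMap<Boolean> bed = readBedFileAsBooleanIntervalTreeMap(new java.io.File("capture.bed"));
try (final VCFFileReader vcf = new VCFFileReader(new java.io.File("in.vcf.gz"), false)) {
    for (final VariantContext ctx : vcf) {
        if (!bed.containsOverlapping(new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd())))
            continue;
        // ... process the on-target variant ...
    }
}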
Use of htsjdk.samtools.util.IntervalTreeMap in project jvarkit by lindenb.
The class KnownGene, method loadUriAsIntervalTreeMap.
/**
 * Loads a knownGene file/URI as an IntervalTreeMap. Intervals in the IntervalTreeMap are *1-based* (interval.start = kg.txStart + 1).
 */
public static IntervalTreeMap<List<KnownGene>> loadUriAsIntervalTreeMap(final String uri, final Predicate<KnownGene> filterOrNull) throws IOException {
    final IntervalTreeMap<List<KnownGene>> treeMap = new IntervalTreeMap<>();
    BufferedReader in = null;
    try {
        in = IOUtils.openURIForBufferedReading(uri);
        String line;
        final Pattern tab = Pattern.compile("[\t]");
        while ((line = in.readLine()) != null) {
            if (line.isEmpty())
                continue;
            final String[] tokens = tab.split(line);
            final KnownGene g = new KnownGene(tokens);
            if (filterOrNull != null && !filterOrNull.test(g))
                continue;
            // knownGene txStart is 0-based: shift by one to build a 1-based Interval
            final Interval interval = new Interval(g.getContig(), g.getTxStart() + 1, g.getTxEnd(), g.isNegativeStrand(), g.getName());
            List<KnownGene> L = treeMap.get(interval);
            if (L == null) {
                L = new ArrayList<>(2);
                treeMap.put(interval, L);
            }
            L.add(g);
        }
        in.close();
        in = null;
        return treeMap;
    } finally {
        CloserUtil.close(in);
    }
}
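A usage sketch for the loader above; the URI and position are made up. getOverlapping returns the value of every overlapping interval, here lists of genes, so the results are flattened:

// Hypothetical query: which knownGene transcripts overlap chr3:1,234,567?
final IntervalTreeMap<List<KnownGene>> kgMap =
        KnownGene.loadUriAsIntervalTreeMap("knownGene.txt.gz", null);
final List<KnownGene> hits = new ArrayList<>();
for (final List<KnownGene> genes : kgMap.getOverlapping(new Interval("chr3", 1234567, 1234567))) {
    hits.addAll(genes);
}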
Use of htsjdk.samtools.util.IntervalTreeMap in project jvarkit by lindenb.
The class SamFindClippedRegions, method doWork.
/* private static boolean closeTo(int pos1, int pos2, int max) {
    return Math.abs(pos2 - pos1) <= max;
} */
/* private static boolean same(char c1, char c2) {
    if (c1 == 'N' || c2 == 'N') return false;
    return Character.toUpperCase(c1) == Character.toUpperCase(c2);
} */
@Override
public int doWork(List<String> args) {
    int readLength = 150;
    if (args.isEmpty()) {
        LOG.error("illegal.number.of.arguments");
        return -1;
    }
    List<Input> inputs = new ArrayList<Input>();
    VariantContextWriter w = null;
    // SAMFileWriter w=null;
    try {
        SAMSequenceDictionary dict = null;
        /* create input, collect sample names */
        Map<String, Input> sample2input = new HashMap<String, Input>();
        for (final String filename : args) {
            Input input = new Input(new File(filename));
            // input.index=inputs.size();
            inputs.add(input);
            if (sample2input.containsKey(input.sampleName)) {
                LOG.error("Duplicate sample " + input.sampleName + " in " + input.bamFile + " and " + sample2input.get(input.sampleName).bamFile);
                return -1;
            }
            sample2input.put(input.sampleName, input);
            if (dict == null) {
                dict = input.header.getSequenceDictionary();
            } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, input.header.getSequenceDictionary())) {
                LOG.error("Found more than one distinct sequence dictionary");
                return -1;
            }
        }
        LOG.info("Sample N= " + sample2input.size());
        /* create merged iterator */
        List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(sample2input.size());
        for (Input input : inputs) headers.add(input.header);
        SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SortOrder.coordinate, headers, true);
        List<SamReader> readers = new ArrayList<SamReader>(sample2input.size());
        for (Input input : inputs) readers.add(input.samFileReaderScan);
        MergingSamRecordIterator merginIter = new MergingSamRecordIterator(headerMerger, readers, true);
        Allele reference_allele = Allele.create("N", true);
        Allele[] alternate_alleles = new Allele[] { Allele.create("<CLIP5>", false), Allele.create("<CLIP3>", false) };
        Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
        for (Allele alt : alternate_alleles) {
            vcfHeaderLines.add(new VCFSimpleHeaderLine("<ID=" + alt.getDisplayString() + ",Description=\"StructVar\">", VCFHeaderVersion.VCF4_1, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")));
        }
        vcfHeaderLines.add(new VCFInfoHeaderLine("COUNT_SAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples with depth>=" + this.min_depth));
        vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
        vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
        vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
        vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
        for (int side = 0; side < 2; ++side) {
            vcfHeaderLines.add(new VCFFormatHeaderLine("CN" + (side == 0 ? 5 : 3), 1, VCFHeaderLineType.Integer, "count clipped in " + (side == 0 ? 5 : 3) + "'"));
        }
        if (dict != null) {
            vcfHeaderLines.addAll(VCFUtils.samSequenceDictToVCFContigHeaderLine(dict));
        }
        VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, sample2input.keySet());
        w = VCFUtils.createVariantContextWriterToStdout();
        w.writeHeader(vcfHeader);
        final IntervalTreeMap<Boolean> intervals = new IntervalTreeMap<>();
        // w=swf.make(header, System.out);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        if (bedFile != null) {
            final BedLineCodec bedLineCodec = new BedLineCodec();
            LOG.info("Reading " + bedFile);
            BufferedReader r = IOUtils.openFileForBufferedReading(bedFile);
            String line;
            while ((line = r.readLine()) != null) {
                BedLine bedLine = bedLineCodec.decode(line);
                if (bedLine == null)
                    continue;
                if (dict != null && dict.getSequence(bedLine.getContig()) == null) {
                    LOG.warning("undefined chromosome in " + bedFile + " " + line);
                    continue;
                }
                intervals.put(bedLine.toInterval(), true);
            }
            CloserUtil.close(r);
        }
        LinkedList<SAMRecord> buffer = new LinkedList<SAMRecord>();
        final Predicate<SAMRecord> filterSamRecords = new Predicate<SAMRecord>() {

            @Override
            public boolean test(SAMRecord rec) {
                if (rec.getReadUnmappedFlag())
                    return false;
                if (rec.isSecondaryOrSupplementary())
                    return false;
                if (rec.getDuplicateReadFlag())
                    return false;
                if (rec.getReadFailsVendorQualityCheckFlag())
                    return false;
                Cigar cigar = rec.getCigar();
                if (cigar == null || cigar.numCigarElements() < 2)
                    return false;
                boolean found_S = false;
                for (int side = 0; side < 2; ++side) {
                    CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
                    // read must be clipped on 5' or 3' with a good length
                    if (!ce.getOperator().equals(CigarOperator.S))
                        continue;
                    found_S = true;
                    break;
                }
                if (!found_S)
                    return false;
                SAMReadGroupRecord g = rec.getReadGroup();
                if (g == null || g.getSample() == null || g.getSample().isEmpty())
                    return false;
                return true;
            }
        };
        final FilteringIterator<SAMRecord> forwardIterator = new FilteringIterator<SAMRecord>(merginIter, filterSamRecords);
        for (; ; ) {
            SAMRecord rec = null;
            if (forwardIterator.hasNext()) {
                rec = forwardIterator.next();
                progress.watch(rec);
                // 'intervals' is never null here, so skip off-target reads only when a BED file restricted the search space
                if (!intervals.isEmpty() && !intervals.containsOverlapping(new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd())))
                    continue;
            }
            // need to flush buffer ?
            if (rec == null || (!buffer.isEmpty() && !buffer.getLast().getReferenceIndex().equals(rec.getReferenceIndex())) || (!buffer.isEmpty() && buffer.getLast().getUnclippedEnd() + readLength < rec.getUnclippedStart())) {
                if (!buffer.isEmpty()) {
                    int chromStart = buffer.getFirst().getUnclippedStart();
                    int chromEnd = buffer.getFirst().getUnclippedEnd();
                    for (SAMRecord sam : buffer) {
                        chromStart = Math.min(chromStart, sam.getUnclippedStart());
                        chromEnd = Math.max(chromEnd, sam.getUnclippedEnd());
                    }
                    final int winShift = 5;
                    for (int pos = chromStart; pos + winShift <= chromEnd; pos += winShift) {
                        int[] count_big_clip = new int[] { 0, 0 };
                        // int max_depth[]=new int[]{0,0};
                        List<Genotype> genotypes = new ArrayList<Genotype>();
                        Set<Allele> all_alleles = new HashSet<Allele>();
                        all_alleles.add(reference_allele);
                        boolean found_one_depth_ok = false;
                        int sum_depth = 0;
                        int samples_with_high_depth = 0;
                        for (String sample : sample2input.keySet()) {
                            GenotypeBuilder gb = new GenotypeBuilder(sample);
                            int[] count_clipped = new int[] { 0, 0 };
                            Set<Allele> sample_alleles = new HashSet<Allele>(3);
                            for (int side = 0; side < 2; ++side) {
                                for (SAMRecord sam : buffer) {
                                    if (!sam.getReadGroup().getSample().equals(sample))
                                        continue;
                                    Cigar cigar = sam.getCigar();
                                    CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
                                    if (!ce.getOperator().equals(CigarOperator.S))
                                        continue;
                                    int clipStart = (side == 0 ? sam.getUnclippedStart() : sam.getAlignmentEnd() + 1);
                                    int clipEnd = (side == 0 ? sam.getAlignmentStart() - 1 : sam.getUnclippedEnd());
                                    if ((pos + winShift < clipStart || pos > clipEnd))
                                        continue;
                                    count_clipped[side]++;
                                    if (ce.getLength() >= this.min_clip_length) {
                                        count_big_clip[side]++;
                                    }
                                    sample_alleles.add(alternate_alleles[side]);
                                    gb.attribute("CN" + (side == 0 ? 5 : 3), count_clipped[side]);
                                }
                            }
                            // if(!(found_one_big_clip[0] || found_one_big_clip[1])) continue;
                            if (count_clipped[0] + count_clipped[1] == 0)
                                continue;
                            if ((count_clipped[0] + count_clipped[1]) > min_depth) {
                                found_one_depth_ok = true;
                                ++samples_with_high_depth;
                            }
                            sum_depth += (count_clipped[0] + count_clipped[1]);
                            gb.alleles(new ArrayList<Allele>(sample_alleles));
                            all_alleles.addAll(sample_alleles);
                            gb.DP(count_clipped[0] + count_clipped[1]);
                            genotypes.add(gb.make());
                        }
                        if (all_alleles.size() == 1) {
                            // all homozygotes
                            continue;
                        }
                        if (!found_one_depth_ok) {
                            continue;
                        }
                        if (!(count_big_clip[0] >= 1 || count_big_clip[1] >= 1)) {
                            continue;
                        }
                        Map<String, Object> atts = new HashMap<String, Object>();
                        atts.put("COUNT_SAMPLES", samples_with_high_depth);
                        atts.put(VCFConstants.DEPTH_KEY, sum_depth);
                        VariantContextBuilder vcb = new VariantContextBuilder();
                        vcb.chr(buffer.getFirst().getReferenceName());
                        vcb.start(pos);
                        vcb.stop(pos + winShift);
                        vcb.alleles(all_alleles);
                        vcb.attributes(atts);
                        vcb.genotypes(genotypes);
                        w.add(vcb.make());
                    }
                    buffer.clear();
                }
                if (rec == null) {
                    break;
                }
            }
            buffer.add(rec);
        }
        merginIter.close();
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        for (Input input : inputs) {
            CloserUtil.close(input);
        }
    }
}
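The IntervalTreeMap usage buried in the method above reduces to a simple pattern: load the target intervals once, then test each read with containsOverlapping. A self-contained sketch of just that pattern (not the tool itself; the file name and interval are made up):

import java.io.File;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalTreeMap;

public class OnTargetFilter {
    public static void main(String[] args) throws Exception {
        final IntervalTreeMap<Boolean> targets = new IntervalTreeMap<>();
        // in practice these intervals would be loaded from a BED file
        targets.put(new Interval("chr1", 100, 200), Boolean.TRUE);
        try (SamReader sr = SamReaderFactory.makeDefault().open(new File("in.bam"))) {
            for (final SAMRecord rec : sr) {
                if (rec.getReadUnmappedFlag())
                    continue;
                if (!targets.containsOverlapping(new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd())))
                    continue;
                System.out.println(rec.getReadName() + " overlaps a target");
            }
        }
    }
}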
Use of htsjdk.samtools.util.IntervalTreeMap in project jvarkit by lindenb.
The class ReferenceToVCF, method doWork.
@Override
public int doWork(List<String> args) {
    if (this.bedFile != null) {
        if (limitBed == null)
            limitBed = new IntervalTreeMap<Boolean>();
        try {
            Pattern tab = Pattern.compile("[\t]");
            BufferedReader r = IOUtils.openFileForBufferedReading(this.bedFile);
            String line;
            while ((line = r.readLine()) != null) {
                if (BedLine.isBedHeader(line))
                    continue;
                if (line.startsWith("#") || line.isEmpty())
                    continue;
                String[] tokens = tab.split(line, 4);
                limitBed.put(new Interval(tokens[0], 1 + Integer.parseInt(tokens[1]), 1 + Integer.parseInt(tokens[2])), true);
            }
            CloserUtil.close(r);
        } catch (Exception err) {
            LOG.error(err);
            return -1;
        }
    }
    Random random = new Random(0L);
    VariantContextWriter out = null;
    try {
        final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(oneAndOnlyOneFile(args)));
        SAMSequenceDictionary dict = fasta.getSequenceDictionary();
        out = super.openVariantContextWriter(this.outputFile);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        VCFHeader header = new VCFHeader();
        header.setSequenceDictionary(dict);
        out.writeHeader(header);
        final List<List<Allele>> combination = new ArrayList<List<Allele>>(4);
        // always keep REF as first allele please
        combination.add(Arrays.asList(Allele.create("A", true), Allele.create("C", false), Allele.create("G", false), Allele.create("T", false)));
        combination.add(Arrays.asList(Allele.create("C", true), Allele.create("A", false), Allele.create("G", false), Allele.create("T", false)));
        combination.add(Arrays.asList(Allele.create("G", true), Allele.create("A", false), Allele.create("C", false), Allele.create("T", false)));
        combination.add(Arrays.asList(Allele.create("T", true), Allele.create("A", false), Allele.create("C", false), Allele.create("G", false)));
        for (SAMSequenceRecord ssr : dict.getSequences()) {
            GenomicSequence genome = new GenomicSequence(fasta, ssr.getSequenceName());
            if (this.limitBed != null) {
                Interval interval = new Interval(ssr.getSequenceName(), 1, genome.length());
                if (!this.limitBed.containsOverlapping(interval))
                    continue;
            }
            for (int n = 0; n < genome.length(); ++n) {
                progress.watch(ssr.getSequenceIndex(), n);
                List<Allele> alleles = null;
                byte ref = (byte) genome.charAt(n);
                switch (ref) {
                    case 'a':
                    case 'A':
                        alleles = combination.get(0);
                        break;
                    case 'c':
                    case 'C':
                        alleles = combination.get(1);
                        break;
                    case 'g':
                    case 'G':
                        alleles = combination.get(2);
                        break;
                    case 't':
                    case 'T':
                        alleles = combination.get(3);
                        break;
                    default:
                        break;
                }
                if (alleles == null)
                    continue;
                if (this.limitBed != null) {
                    Interval interval = new Interval(ssr.getSequenceName(), n + 1, n + 1);
                    if (!this.limitBed.containsOverlapping(interval))
                        continue;
                }
                if (!disjoint_alts) {
                    VariantContextBuilder vcb = new VariantContextBuilder();
                    vcb.chr(ssr.getSequenceName());
                    vcb.start(n + 1);
                    vcb.stop(n + 1);
                    vcb.alleles(alleles);
                    out.add(vcb.make());
                } else {
                    // emit one bi-allelic variant per ALT; index 0 is always REF
                    for (int a = 1; a < 4; ++a) {
                        VariantContextBuilder vcb = new VariantContextBuilder();
                        vcb.chr(ssr.getSequenceName());
                        vcb.start(n + 1);
                        vcb.stop(n + 1);
                        vcb.alleles(Arrays.asList(alleles.get(0), alleles.get(a)));
                        out.add(vcb.make());
                    }
                }
                if (insertion_size > 0 && n + 1 < genome.length()) {
                    alleles = new ArrayList<Allele>(2);
                    // REFERENCE
                    alleles.add(Allele.create("" + genome.charAt(n) + genome.charAt(n + 1), true));
                    StringBuilder sb = new StringBuilder(insertion_size + 2);
                    sb.append(genome.charAt(n));
                    for (int n2 = 0; n2 < insertion_size; ++n2) {
                        switch (random.nextInt(4)) {
                            case 0:
                                sb.append('A');
                                break;
                            case 1:
                                sb.append('C');
                                break;
                            case 2:
                                sb.append('G');
                                break;
                            case 3:
                                sb.append('T');
                                break;
                        }
                    }
                    sb.append(genome.charAt(n + 1));
                    alleles.add(Allele.create(sb.toString(), false));
                    VariantContextBuilder vcb = new VariantContextBuilder();
                    vcb.chr(ssr.getSequenceName());
                    vcb.start(n + 1);
                    vcb.alleles(alleles);
                    vcb.computeEndFromAlleles(alleles, n + 1);
                    out.add(vcb.make());
                }
                if (deletion_size > 0 && n + deletion_size + 1 < genome.length()) {
                    alleles = new ArrayList<Allele>(2);
                    // REF
                    StringBuilder sb = new StringBuilder(deletion_size + 2);
                    sb.append(genome.charAt(n));
                    int lastpos = n + 1;
                    for (int n2 = 0; n2 < deletion_size; ++n2, lastpos++) {
                        sb.append(genome.charAt(lastpos));
                    }
                    sb.append(genome.charAt(lastpos));
                    alleles.add(Allele.create(sb.toString(), true));
                    alleles.add(Allele.create("" + genome.charAt(n) + genome.charAt(lastpos), false));
                    VariantContextBuilder vcb = new VariantContextBuilder();
                    vcb.chr(ssr.getSequenceName());
                    vcb.start(n + 1);
                    vcb.alleles(alleles);
                    vcb.computeEndFromAlleles(alleles, n + 1);
                    out.add(vcb.make());
                }
                if (out.checkError())
                    break;
            }
            if (out.checkError())
                break;
        }
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(out);
    }
}
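One detail worth noting in the BED-loading code above: BED records are 0-based and half-open, while htsjdk's Interval is 1-based and closed, so the conventional conversion shifts only the start coordinate (the excerpt adds 1 to both ends). A small sketch of that conversion, with a helper name of our own choosing:

// Sketch (helper name is ours): convert a 0-based half-open BED record
// to a 1-based closed htsjdk Interval. Note that 'end' is kept as-is.
static Interval bedToInterval(final String contig, final int bedStart, final int bedEnd) {
    return new Interval(contig, bedStart + 1, bedEnd);
}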