Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
The class BamCmpCoverage, method readBedFile:
private void readBedFile(final Path bedFile) {
    if (this.intervals == null) {
        intervals = new IntervalTreeMap<Boolean>();
    }
    try (BedLineReader r = new BedLineReader(bedFile)) {
        while (r.hasNext()) {
            final BedLine bedLine = r.next();
            if (bedLine == null)
                continue;
            this.intervals.put(bedLine.toInterval(), true);
        }
    }
}
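The method above is the simplest way to consume a BED file with BedLineReader: open it in a try-with-resources block, skip records that come back as null (header, comment or unparsable lines), and index each interval in an htsjdk IntervalTreeMap. A minimal standalone sketch of the same idea follows; the class name BedToIntervalMap is hypothetical, and the BedLine import is assumed to sit next to BedLineReader in com.github.lindenb.jvarkit.bed (adjust it to the package used by your jvarkit version).

// Minimal sketch, not jvarkit code: load a BED file into an IntervalTreeMap.
import java.nio.file.Path;
import java.nio.file.Paths;
import htsjdk.samtools.util.IntervalTreeMap;
import com.github.lindenb.jvarkit.bed.BedLine; // assumed package, see note above
import com.github.lindenb.jvarkit.bed.BedLineReader;

public class BedToIntervalMap { // hypothetical class name
    static IntervalTreeMap<Boolean> load(final Path bedFile) {
        final IntervalTreeMap<Boolean> intervals = new IntervalTreeMap<>();
        try (BedLineReader r = new BedLineReader(bedFile)) {
            while (r.hasNext()) {
                final BedLine bedLine = r.next();
                if (bedLine == null)
                    continue; // header, comment or malformed line
                intervals.put(bedLine.toInterval(), Boolean.TRUE);
            }
        }
        return intervals;
    }

    public static void main(final String[] args) {
        System.out.println(load(Paths.get(args[0])).size() + " interval(s) loaded");
    }
}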
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
The class ReferenceToVCF, method doWork:
@Override
public int doWork(final List<String> args) {
    if (this.bedFile != null) {
        if (limitBed == null)
            limitBed = new IntervalTreeMap<Boolean>();
        try {
            try (final BedLineReader r = new BedLineReader(this.bedFile)) {
                while (r.hasNext()) {
                    final BedLine record = r.next();
                    limitBed.put(record.toInterval(), true);
                }
            }
        } catch (final Exception err) {
            LOG.error(err);
            return -1;
        }
    }
    final Random random = new Random(0L);
    VariantContextWriter out = null;
    try {
        final ReferenceSequenceFile fasta = ReferenceSequenceFileFactory.getReferenceSequenceFile(Paths.get(oneAndOnlyOneFile(args)));
        SAMSequenceDictionary dict = fasta.getSequenceDictionary();
        out = this.writingVariants.dictionary(dict).open(this.outputFile);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        VCFHeader header = new VCFHeader();
        JVarkitVersion.getInstance().addMetaData(this, header);
        header.setSequenceDictionary(dict);
        out.writeHeader(header);
        final List<List<Allele>> combination = new ArrayList<List<Allele>>(4);
        // always keep REF as first allele please
        combination.add(Arrays.asList(Allele.create("A", true), Allele.create("C", false), Allele.create("G", false), Allele.create("T", false)));
        combination.add(Arrays.asList(Allele.create("C", true), Allele.create("A", false), Allele.create("G", false), Allele.create("T", false)));
        combination.add(Arrays.asList(Allele.create("G", true), Allele.create("A", false), Allele.create("C", false), Allele.create("T", false)));
        combination.add(Arrays.asList(Allele.create("T", true), Allele.create("A", false), Allele.create("C", false), Allele.create("G", false)));
        for (SAMSequenceRecord ssr : dict.getSequences()) {
            GenomicSequence genome = new GenomicSequence(fasta, ssr.getSequenceName());
            if (this.limitBed != null) {
                Interval interval = new Interval(ssr.getSequenceName(), 1, genome.length());
                if (!this.limitBed.containsOverlapping(interval))
                    continue;
            }
            for (int n = 0; n < genome.length(); ++n) {
                progress.watch(ssr.getSequenceIndex(), n);
                List<Allele> alleles = null;
                byte ref = (byte) genome.charAt(n);
                switch (ref) {
                    case 'a':
                    case 'A':
                        alleles = combination.get(0);
                        break;
                    case 'c':
                    case 'C':
                        alleles = combination.get(1);
                        break;
                    case 'g':
                    case 'G':
                        alleles = combination.get(2);
                        break;
                    case 't':
                    case 'T':
                        alleles = combination.get(3);
                        break;
                    default:
                        break;
                }
                if (alleles == null)
                    continue;
                if (this.limitBed != null) {
                    Interval interval = new Interval(ssr.getSequenceName(), n + 1, n + 1);
                    if (!this.limitBed.containsOverlapping(interval))
                        continue;
                }
                if (!disjoint_alts) {
                    VariantContextBuilder vcb = new VariantContextBuilder();
                    vcb.chr(ssr.getSequenceName());
                    vcb.start(n + 1);
                    vcb.stop(n + 1);
                    vcb.alleles(alleles);
                    out.add(vcb.make());
                } else {
                    for (int a = 1; a < 4; ++a) {
                        VariantContextBuilder vcb = new VariantContextBuilder();
                        vcb.chr(ssr.getSequenceName());
                        vcb.start(n + 1);
                        vcb.stop(n + 1);
                        // index 0 is always REF
                        vcb.alleles(Arrays.asList(alleles.get(0), alleles.get(a)));
                        out.add(vcb.make());
                    }
                }
                if (insertion_size > 0 && n + 1 < genome.length()) {
                    alleles = new ArrayList<Allele>(2);
                    // REFERENCE
                    alleles.add(Allele.create("" + genome.charAt(n) + genome.charAt(n + 1), true));
                    StringBuilder sb = new StringBuilder(insertion_size + 2);
                    sb.append(genome.charAt(n));
                    for (int n2 = 0; n2 < insertion_size; ++n2) {
                        switch (random.nextInt(4)) {
                            case 0:
                                sb.append('A');
                                break;
                            case 1:
                                sb.append('C');
                                break;
                            case 2:
                                sb.append('G');
                                break;
                            case 3:
                                sb.append('T');
                                break;
                        }
                    }
                    sb.append(genome.charAt(n + 1));
                    alleles.add(Allele.create(sb.toString(), false));
                    VariantContextBuilder vcb = new VariantContextBuilder();
                    vcb.chr(ssr.getSequenceName());
                    vcb.start(n + 1);
                    vcb.alleles(alleles);
                    vcb.computeEndFromAlleles(alleles, n + 1);
                    out.add(vcb.make());
                }
                if (deletion_size > 0 && n + deletion_size + 1 < genome.length()) {
                    alleles = new ArrayList<Allele>(2);
                    // REF
                    final StringBuilder sb = new StringBuilder(deletion_size + 2);
                    sb.append(genome.charAt(n));
                    int lastpos = n + 1;
                    for (int n2 = 0; n2 < deletion_size; ++n2, lastpos++) {
                        sb.append(genome.charAt(lastpos));
                    }
                    sb.append(genome.charAt(lastpos));
                    alleles.add(Allele.create(sb.toString(), true));
                    alleles.add(Allele.create("" + genome.charAt(n) + genome.charAt(lastpos), false));
                    final VariantContextBuilder vcb = new VariantContextBuilder();
                    vcb.chr(ssr.getSequenceName());
                    vcb.start(n + 1);
                    vcb.alleles(alleles);
                    vcb.computeEndFromAlleles(alleles, n + 1);
                    out.add(vcb.make());
                }
                if (out.checkError())
                    break;
            }
            if (out.checkError())
                break;
        }
        progress.finish();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(out);
    }
}
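Once limitBed has been filled from the BED file, ReferenceToVCF probes it with IntervalTreeMap.containsOverlapping twice: once per contig to skip chromosomes with no overlapping interval, and once per base before emitting a variant. That test can be isolated into a small helper like the hedged sketch below; the method name isInBed is illustrative and not part of the tool, and Interval is htsjdk's 1-based, inclusive interval class.

// Sketch, not jvarkit code: the overlap test used above, isolated.
// limitBed is the IntervalTreeMap<Boolean> populated with BedLine.toInterval().
private static boolean isInBed(final IntervalTreeMap<Boolean> limitBed, final String contig, final int pos1) {
    if (limitBed == null)
        return true; // no BED restriction: keep every position
    final Interval interval = new Interval(contig, pos1, pos1); // 1-based, inclusive
    return limitBed.containsOverlapping(interval);
}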
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
The class FindAllCoverageAtPosition, method doWork:
@Override
public int doWork(final List<String> args) {
    final Set<SimplePosition> mutations = new TreeSet<>();
    if (this.extend < 1) {
        LOG.error("-x should be >=1");
        return -1;
    }
    try {
        this.samReaderFactory = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
        if (this.referenceFileFile != null) {
            this.indexedFastaSequenceFile = htsjdk.samtools.reference.ReferenceSequenceFileFactory.getReferenceSequenceFile(this.referenceFileFile);
            this.samReaderFactory.referenceSequence(this.referenceFileFile);
            this.fastaDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
        }
        this.positionStrs.stream().flatMap(S -> Arrays.stream(S.split("[ \t;]+"))).filter(S -> !StringUtils.isBlank(S)).map(S -> new SimplePosition(S)).forEach(X -> mutations.add(X));
        if (this.positionFile != null) {
            if (this.positionFile.toString().endsWith(".bed") || this.positionFile.toString().endsWith(".bed.gz")) {
                try (BedLineReader br = new BedLineReader(this.positionFile)) {
                    br.stream().filter(B -> B != null).forEach(bedLine -> {
                        for (int x = bedLine.getStart(); x <= bedLine.getEnd(); ++x) {
                            mutations.add(new SimplePosition(bedLine.getContig(), x));
                        }
                    });
                }
            } else {
                try (BufferedReader r2 = IOUtils.openPathForBufferedReading(this.positionFile)) {
                    r2.lines().filter(L -> !StringUtils.isBlank(L)).filter(L -> !L.startsWith("#")).forEach(L -> mutations.add(new SimplePosition(L)));
                }
            }
        }
        if (mutations.isEmpty()) {
            LOG.fatal("undefined position 'str'");
            return -1;
        }
        LOG.info("number of mutations " + mutations.size());
        this.out = this.openPathOrStdoutAsPrintWriter(this.outputFile);
        out.print("#File");
        out.print('\t');
        out.print("CHROM");
        out.print('\t');
        out.print("POS");
        if (this.indexedFastaSequenceFile != null) {
            out.print('\t');
            out.print("REF");
        }
        out.print('\t');
        out.print(this.groupBy.name().toUpperCase());
        out.print('\t');
        out.print("DEPTH");
        for (final CigarOperator op : CigarOperator.values()) {
            out.print('\t');
            out.print(op.name());
        }
        for (char c : BASES_To_PRINT) {
            out.print('\t');
            out.print("Base(" + c + ")");
        }
        out.println();
        if (args.isEmpty()) {
            try (BufferedReader r = new BufferedReader(new InputStreamReader(stdin()))) {
                scan(r, mutations);
            }
        } else {
            for (final String filename : args) {
                try (BufferedReader r = IOUtils.openURIForBufferedReading(filename)) {
                    scan(r, mutations);
                }
            }
        }
        this.out.flush();
        return 0;
    } catch (final Throwable err) {
        LOG.severe(err);
        return -1;
    } finally {
        CloserUtil.close(this.indexedFastaSequenceFile);
        CloserUtil.close(this.out);
    }
}
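Here the reader's stream() view replaces the explicit hasNext()/next() loop, and each BED record is expanded into individual 1-based positions (BedLine.getStart() and getEnd() are 1-based and inclusive, so the loop visits every base of the interval). The hedged sketch below shows the same expansion without the tool's SimplePosition wrapper, collecting plain "contig:pos" strings; the helper name and the TreeSet are illustrative choices, not taken from the tool.

// Sketch, not jvarkit code: expand BED records into per-base "contig:pos" keys.
private static Set<String> bedToPositions(final Path bedPath) {
    final Set<String> positions = new TreeSet<>();
    try (BedLineReader br = new BedLineReader(bedPath)) {
        br.stream().filter(B -> B != null).forEach(bed -> {
            for (int x = bed.getStart(); x <= bed.getEnd(); ++x) {
                positions.add(bed.getContig() + ":" + x);
            }
        });
    }
    return positions;
}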
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
The class DepthOfCoverage, method doWork:
@Override
public int doWork(final List<String> args) {
    PrintWriter out = null;
    if (this.auto_mask && this.faidx == null) {
        LOG.error("Cannot auto mask if REF is not defined");
        return -1;
    }
    if (this.maskBed != null && this.includeBed != null) {
        LOG.error("both --mask and --bed both defined");
        return -1;
    }
    ReferenceSequenceFile referenceSequenceFile = null;
    try {
        final Predicate<String> isRejectContigRegex;
        if (!StringUtils.isBlank(this.skipContigExpr)) {
            final Pattern pat = Pattern.compile(this.skipContigExpr);
            isRejectContigRegex = S -> pat.matcher(S).matches();
        } else {
            isRejectContigRegex = S -> false;
        }
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.faidx != null) {
            srf.referenceSequence(this.faidx);
            srf.setUseAsyncIo(this.asyncIo);
            referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
        }
        out = super.openPathOrStdoutAsPrintWriter(this.outputFile);
        out.print("#BAM\tSample\tContig\tContig-Length\tMasked-Contig-Length\tCount\tDepth\tMedian\tMin\tMax\tMaxPos");
        for (RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
            if (r.getMinInclusive() == null)
                continue;
            out.print("\t");
            out.print(r.toString());
        }
        out.println();
        for (final Path path : IOUtils.unrollPaths(args)) {
            try (final SamReader sr = srf.open(path)) {
                if (!sr.hasIndex()) {
                    LOG.error("File " + path + " is not indexed.");
                    return -1;
                }
                final SAMFileHeader header = sr.getFileHeader();
                final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
                final Set<String> rejectContigSet = dict.getSequences().stream().map(SSR -> SSR.getSequenceName()).filter(isRejectContigRegex).collect(Collectors.toCollection(HashSet::new));
                rejectContigSet.addAll(dict.getSequences().stream().filter(SSR -> SSR.getSequenceLength() < this.skipContigLength).map(SSR -> SSR.getSequenceName()).collect(Collectors.toCollection(HashSet::new)));
                if (!header.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
                    LOG.error("file is not sorted on coordinate :" + header.getSortOrder() + " " + path);
                    return -1;
                }
                final QueryInterval[] intervals;
                if (this.useBamIndexFlag && this.includeBed != null) {
                    if (!sr.hasIndex()) {
                        LOG.error("Bam is not indexed. " + path);
                        return -1;
                    }
                    final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
                    final List<QueryInterval> L = new ArrayList<>();
                    try (BedLineReader br = new BedLineReader(this.includeBed)) {
                        while (br.hasNext()) {
                            final BedLine bed = br.next();
                            final String ctg = contigNameConverter.apply(bed.getContig());
                            if (StringUtils.isBlank(ctg))
                                continue;
                            final int tid = dict.getSequenceIndex(ctg);
                            if (tid < 0)
                                continue;
                            L.add(new QueryInterval(tid, bed.getStart(), bed.getEnd()));
                        }
                    }
                    intervals = QueryInterval.optimizeIntervals(L.toArray(new QueryInterval[L.size()]));
                } else {
                    intervals = null;
                }
                Integer minCov = null;
                Integer maxCov = null;
                ContigPos maxCovPosition = null;
                long count_raw_bases = 0L;
                long count_bases = 0L;
                long sum_coverage = 0L;
                final DiscreteMedian<Integer> discreteMedian_wg = new DiscreteMedian<>();
                final Counter<RangeOfIntegers.Range> countMap_wg = new Counter<>();
                final String sample = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(path.toString());
                int[] coverage = null;
                String prevContig = null;
                BitSet mask = null;
                final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
                try (CloseableIterator<SAMRecord> iter = intervals == null ? sr.iterator() : sr.queryOverlapping(intervals)) {
                    for (;;) {
                        final SAMRecord rec = iter.hasNext() ? progress.apply(iter.next()) : null;
                        if (rec != null) {
                            if (!SAMRecordDefaultFilter.accept(rec, this.mapping_quality))
                                continue;
                            if (rejectContigSet.contains(rec.getContig()))
                                continue;
                        }
                        if (rec == null || !rec.getContig().equals(prevContig)) {
                            if (coverage != null) {
                                // DUMP
                                long count_bases_ctg = 0L;
                                long sum_coverage_ctg = 0L;
                                Integer minV_ctg = null;
                                Integer maxV_ctg = null;
                                ContigPos maxPos_ctg = null;
                                final DiscreteMedian<Integer> discreteMedian_ctg = new DiscreteMedian<>();
                                final Counter<RangeOfIntegers.Range> countMap_ctg = new Counter<>();
                                for (int i = 0; i < coverage.length; i++) {
                                    if (mask.get(i))
                                        continue;
                                    final int covi = coverage[i];
                                    if (covi > this.max_depth)
                                        continue;
                                    if (minV_ctg == null || minV_ctg.intValue() > covi)
                                        minV_ctg = covi;
                                    if (maxV_ctg == null || maxV_ctg.intValue() < covi) {
                                        maxV_ctg = covi;
                                        maxPos_ctg = new ContigPos(prevContig, i + 1);
                                    }
                                    countMap_ctg.incr(this.summaryCov.getRange(covi));
                                    count_bases_ctg++;
                                    sum_coverage_ctg += covi;
                                    discreteMedian_ctg.add(covi);
                                }
                                out.print(path);
                                out.print("\t");
                                out.print(sample);
                                out.print("\t");
                                out.print(prevContig);
                                out.print("\t");
                                out.print(coverage.length);
                                out.print("\t");
                                out.print(count_bases_ctg);
                                out.print("\t");
                                out.print(sum_coverage_ctg);
                                out.print("\t");
                                if (count_bases_ctg > 0) {
                                    out.printf("%.2f", sum_coverage_ctg / (double) count_bases_ctg);
                                } else {
                                    out.print("N/A");
                                }
                                out.print("\t");
                                final OptionalDouble median = discreteMedian_ctg.getMedian();
                                if (median.isPresent()) {
                                    out.print(median.getAsDouble());
                                } else {
                                    out.print("N/A");
                                }
                                out.print("\t");
                                if (minV_ctg != null) {
                                    out.print(minV_ctg);
                                } else {
                                    out.print("N/A");
                                }
                                out.print("\t");
                                if (maxV_ctg != null) {
                                    out.print(maxV_ctg);
                                    out.print("\t");
                                    out.print(maxPos_ctg);
                                } else {
                                    out.print("N/A\tN/A");
                                }
                                for (final RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
                                    if (r.getMinInclusive() == null)
                                        continue;
                                    out.print("\t");
                                    out.print(countMap_ctg.count(r));
                                    if (!countMap_ctg.isEmpty()) {
                                        out.print(" ");
                                        out.printf("(%.2f%%)", (countMap_ctg.count(r) / (countMap_ctg.getTotal() * 1.0)) * 100.0);
                                    }
                                }
                                out.println();
                                if (minCov == null || (minV_ctg != null && minV_ctg.compareTo(minCov) < 0))
                                    minCov = minV_ctg;
                                if (maxCov == null || (maxV_ctg != null && maxV_ctg.compareTo(maxCov) > 0)) {
                                    maxCov = maxV_ctg;
                                    maxCovPosition = maxPos_ctg;
                                }
                                count_bases += count_bases_ctg;
                                sum_coverage += sum_coverage_ctg;
                                count_raw_bases += coverage.length;
                                discreteMedian_wg.add(discreteMedian_ctg);
                                countMap_wg.putAll(countMap_ctg);
                            }
                            coverage = null;
                            mask = null;
                            System.gc();
                            if (rec == null)
                                break;
                            final SAMSequenceRecord ssr = Objects.requireNonNull(dict.getSequence(rec.getContig()));
                            coverage = new int[ssr.getSequenceLength()];
                            mask = new BitSet(ssr.getSequenceLength());
                            if (this.auto_mask && referenceSequenceFile != null) {
                                final byte[] refSeq = Objects.requireNonNull(referenceSequenceFile.getSequence(ssr.getSequenceName())).getBases();
                                for (int i = 0; i < refSeq.length; i++) {
                                    if (AcidNucleics.isATGC(refSeq[i]))
                                        continue;
                                    mask.set(i);
                                }
                            }
                            /* read mask */
                            if (this.maskBed != null) {
                                final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
                                try (BedLineReader br = new BedLineReader(this.maskBed)) {
                                    while (br.hasNext()) {
                                        final BedLine bed = br.next();
                                        if (bed == null)
                                            continue;
                                        String ctg = contigNameConverter.apply(bed.getContig());
                                        if (StringUtils.isBlank(ctg))
                                            continue;
                                        if (!rec.getContig().equals(ctg))
                                            continue;
                                        for (int p1 = bed.getStart(); p1 <= bed.getEnd() && p1 <= coverage.length; ++p1) {
                                            mask.set(p1 - 1);
                                        }
                                    }
                                }
                            } else if (this.includeBed != null) {
                                final List<Locatable> list = new ArrayList<>();
                                final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
                                try (BedLineReader br = new BedLineReader(this.includeBed)) {
                                    while (br.hasNext()) {
                                        final BedLine bed = br.next();
                                        if (bed == null)
                                            continue;
                                        final String ctg = contigNameConverter.apply(bed.getContig());
                                        if (StringUtils.isBlank(ctg))
                                            continue;
                                        if (!rec.getContig().equals(ctg))
                                            continue;
                                        list.add(new SimpleInterval(ctg, bed.getStart(), bed.getEnd()));
                                    }
                                }
                                // sort on starts
                                Collections.sort(list, (A, B) -> Integer.compare(A.getStart(), B.getStart()));
                                int p1 = 1;
                                while (p1 <= coverage.length) {
                                    while (!list.isEmpty() && list.get(0).getEnd() < p1) {
                                        list.remove(0);
                                    }
                                    if (!list.isEmpty() && list.get(0).getStart() <= p1 && p1 <= list.get(0).getEnd()) {
                                        ++p1;
                                        continue;
                                    }
                                    mask.set(p1 - 1);
                                    p1++;
                                }
                            }
                            prevContig = rec.getContig();
                        }
                        int max_end1 = coverage.length;
                        if (!this.disable_paired_overlap_flag && rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getReferenceIndex().equals(rec.getMateReferenceIndex()) && rec.getAlignmentStart() < rec.getMateAlignmentStart() && rec.getAlignmentEnd() > rec.getMateAlignmentStart()) {
                            max_end1 = rec.getMateAlignmentStart() - 1;
                        }
                        for (final AlignmentBlock block : rec.getAlignmentBlocks()) {
                            final int pos1 = block.getReferenceStart();
                            final int len = block.getLength();
                            for (int i = 0; i < len; i++) {
                                if (pos1 + i - 1 >= 0 && pos1 + i <= max_end1) {
                                    coverage[pos1 + i - 1]++;
                                }
                            }
                        }
                    } /* end rec */
                } /* end iter */
                progress.close();
                out.print(path);
                out.print("\t");
                out.print(sample);
                out.print("\t");
                out.print(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
                out.print("\t");
                out.print(count_raw_bases);
                out.print("\t");
                out.print(count_bases);
                out.print("\t");
                out.print(sum_coverage);
                out.print("\t");
                if (count_bases > 0) {
                    out.printf("%.2f", sum_coverage / (double) count_bases);
                } else {
                    out.print("N/A");
                }
                out.print("\t");
                final OptionalDouble median = discreteMedian_wg.getMedian();
                if (median.isPresent()) {
                    out.print(median.getAsDouble());
                } else {
                    out.print("N/A");
                }
                out.print("\t");
                if (minCov != null) {
                    out.print(minCov);
                } else {
                    out.print("N/A");
                }
                out.print("\t");
                if (maxCov != null) {
                    out.print(maxCov + "\t" + maxCovPosition);
                } else {
                    out.print("N/A\tN/A");
                }
                for (final RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
                    if (r.getMinInclusive() == null)
                        continue;
                    out.print("\t");
                    out.print(countMap_wg.count(r));
                    if (!countMap_wg.isEmpty()) {
                        out.print(" ");
                        out.printf("(%.2f%%)", (countMap_wg.count(r) / (countMap_wg.getTotal() * 1.0)) * 100.0);
                    }
                }
                out.println();
            }
        }
        out.flush();
        out.close();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(referenceSequenceFile);
    }
}
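The first BedLineReader block above is worth noting on its own: when the include BED is combined with the BAM index, each BED record is mapped onto the BAM's sequence dictionary with ContigNameConverter, records whose contig is absent are dropped, and the remainder become QueryInterval objects that are merged with QueryInterval.optimizeIntervals before being handed to queryOverlapping. The hedged sketch below extracts that conversion into a helper; the method name bedToQueryIntervals is hypothetical and not part of the tool.

// Sketch, not jvarkit code: BED file -> optimized QueryInterval[] for an indexed BAM.
// Records whose contig cannot be mapped onto the dictionary are silently skipped.
private static QueryInterval[] bedToQueryIntervals(final Path bedPath, final SAMSequenceDictionary dict) {
    final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
    final List<QueryInterval> list = new ArrayList<>();
    try (BedLineReader br = new BedLineReader(bedPath)) {
        while (br.hasNext()) {
            final BedLine bed = br.next();
            if (bed == null)
                continue;
            final String ctg = converter.apply(bed.getContig());
            if (StringUtils.isBlank(ctg))
                continue;
            final int tid = dict.getSequenceIndex(ctg);
            if (tid < 0)
                continue;
            list.add(new QueryInterval(tid, bed.getStart(), bed.getEnd()));
        }
    }
    return QueryInterval.optimizeIntervals(list.toArray(new QueryInterval[list.size()]));
}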
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
The class SetFileTools, method fromBed:
private int fromBed(final List<String> args) throws IOException {
    final String input = oneFileOrNull(args);
    final Function<BedLine, String> bed2name = bed -> {
        if (bed.getColumnCount() < 4)
            throw new IllegalArgumentException("Expected 4 columns but got " + bed);
        return bed.get(3);
    };
    try (BufferedReader br = super.openBufferedReader(input)) {
        try (BedLineReader blr = new BedLineReader(br, input)) {
            blr.setValidationStringency(validationStringency);
            blr.setContigNameConverter(this.theCtgConverter);
            try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
                final EqualIterator<BedLine> iter = new EqualIterator<BedLine>(blr.stream().iterator(), (A, B) -> bed2name.apply(A).compareTo(bed2name.apply(B)));
                while (iter.hasNext()) {
                    final List<BedLine> lines = iter.next();
                    final List<Locatable> L = sortAndMerge(lines);
                    pw.print(bed2name.apply(lines.get(0)));
                    for (int i = 0; i < L.size(); i++) {
                        pw.print(i == 0 ? "\t" : ",");
                        final Locatable rec = L.get(i);
                        pw.print(noChr(rec.getContig()));
                        pw.print(":");
                        pw.print(rec.getStart());
                        pw.print("-");
                        pw.print(rec.getEnd());
                    }
                    pw.println();
                }
                iter.close();
                pw.flush();
            }
        }
    }
    return 0;
}
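Unlike the previous examples, fromBed does not hand the reader a Path: it wraps an already-open BufferedReader and passes the input name as a label for error reporting, then configures the reader with setValidationStringency and setContigNameConverter before streaming. A hedged sketch of that construction pattern follows; the LENIENT stringency constant and the contigNameConverter variable are illustrative assumptions, so substitute whatever your tool already has in scope.

// Sketch, not jvarkit code: BedLineReader over an existing Reader, with explicit
// validation stringency and contig-name conversion configured before use.
try (BufferedReader br = Files.newBufferedReader(Paths.get("regions.bed"));
     BedLineReader blr = new BedLineReader(br, "regions.bed")) {
    blr.setValidationStringency(ValidationStringency.LENIENT); // assumed enum constant
    blr.setContigNameConverter(contigNameConverter); // e.g. ContigNameConverter.fromOneDictionary(dict)
    blr.stream().filter(B -> B != null).forEach(B ->
        System.out.println(B.getContig() + "\t" + B.getStart() + "\t" + B.getEnd()));
}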