use of com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome in project jvarkit by lindenb.
the class TestNg01 method testRefGenomeFactoryForFile.
@Test
public void testRefGenomeFactoryForFile() throws IOException {
final ReferenceGenomeFactory rgf = new ReferenceGenomeFactory();
rgf.setBufferSize(2);
final ReferenceGenome ref = rgf.open(TOY_FA);
final FastaSequenceReader fsr = new FastaSequenceReader();
final List<FastaSequence> seqs = fsr.readAll(new File(TOY_FA));
Assert.assertEquals(seqs.size(), ref.size());
for (int i = 0; i < seqs.size(); i++) {
Assert.assertEquals(ref.getContig(i).length(), seqs.get(i).length());
Assert.assertEquals(ref.getContig(i).getContig(), seqs.get(i).getName());
for (int x = 0; x < ref.getContig(i).length(); ++x) {
Assert.assertEquals(ref.getContig(i).charAt(x), seqs.get(i).charAt(x));
}
}
ref.close();
}
use of com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome in project jvarkit by lindenb.
the class GcPercentAndDepth method doWork.
@Override
public int doWork(final List<String> args) {
if (this.windowSize <= 0) {
LOG.error("Bad window size.");
return -1;
}
if (this.windowStep <= 0) {
LOG.error("Bad window step.");
return -1;
}
if (this.refFile == null) {
LOG.error("Undefined REF File");
return -1;
}
if (args.isEmpty()) {
LOG.error("Illegal Number of arguments.");
return -1;
}
ReferenceGenome indexedFastaSequenceFile = null;
List<SamReader> readers = new ArrayList<SamReader>();
PrintWriter out = null;
try {
LOG.info("Loading " + this.refFile);
indexedFastaSequenceFile = new ReferenceGenomeFactory().openFastaFile(this.refFile);
this.samSequenceDictionary = indexedFastaSequenceFile.getDictionary();
if (this.samSequenceDictionary == null) {
LOG.error("Cannot get sequence dictionary for " + this.refFile);
return -1;
}
out = super.openFileOrStdoutAsPrintWriter(outPutFile);
Set<String> all_samples = new TreeSet<String>();
/* create input, collect sample names */
for (int optind = 0; optind < args.size(); ++optind) {
LOG.info("Opening " + args.get(optind));
final SamReader samFileReaderScan = super.openSamReader(args.get(optind));
readers.add(samFileReaderScan);
final SAMFileHeader header = samFileReaderScan.getFileHeader();
if (!SequenceUtil.areSequenceDictionariesEqual(this.samSequenceDictionary, header.getSequenceDictionary())) {
LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(this.samSequenceDictionary, header.getSequenceDictionary()));
return -1;
}
for (final SAMReadGroupRecord g : header.getReadGroups()) {
final String sample = this.partition.apply(g);
if (StringUtil.isBlank(sample)) {
LOG.warning("Read group " + g.getId() + " has no sample in merged dictionary");
continue;
}
all_samples.add(sample);
}
}
LOG.info("N " + this.partition.name() + "=" + all_samples.size());
/* print header */
out.print("#");
if (!this.hide_genomic_index) {
out.print("id");
out.print("\t");
}
out.print("chrom");
out.print("\t");
out.print("start");
out.print("\t");
out.print("end");
out.print("\t");
out.print("GCPercent");
for (final String sample : all_samples) {
out.print("\t");
out.print(sample);
}
out.println();
final List<RegionCaptured> regionsCaptured = new ArrayList<RegionCaptured>();
if (bedFile != null) {
LOG.info("Reading BED:" + bedFile);
final BedLineCodec bedLineCodec = new BedLineCodec();
BufferedReader r = IOUtils.openFileForBufferedReading(bedFile);
r.lines().filter(L -> !L.startsWith("#")).filter(L -> !StringUtil.isBlank(L)).map(L -> bedLineCodec.decode(L)).filter(B -> B != null).forEach(B -> {
final SAMSequenceRecord ssr = this.samSequenceDictionary.getSequence(B.getContig());
if (ssr == null) {
LOG.warning("Cannot resolve " + B.getContig());
return;
}
final RegionCaptured roi = new RegionCaptured(ssr, B.getStart() - 1, B.getEnd());
regionsCaptured.add(roi);
});
CloserUtil.close(r);
LOG.info("end Reading BED:" + bedFile);
Collections.sort(regionsCaptured);
} else {
LOG.info("No capture, peeking everything");
for (final SAMSequenceRecord ssr : this.samSequenceDictionary.getSequences()) {
final RegionCaptured roi = new RegionCaptured(ssr, 0, ssr.getSequenceLength());
regionsCaptured.add(roi);
}
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.samSequenceDictionary).logger(LOG);
ReferenceContig genomicSequence = null;
for (final RegionCaptured roi : regionsCaptured) {
if (genomicSequence == null || !genomicSequence.hasName(roi.getContig())) {
genomicSequence = indexedFastaSequenceFile.getContig(roi.getContig());
if (genomicSequence == null) {
LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(roi.getContig(), this.samSequenceDictionary));
return -1;
}
}
Map<String, int[]> sample2depth = new HashMap<String, int[]>();
Map<String, Double> sample2meanDepth = new HashMap<String, Double>();
for (final String sample : all_samples) {
int[] depth = new int[roi.length()];
Arrays.fill(depth, 0);
sample2depth.put(sample, depth);
}
List<CloseableIterator<SAMRecord>> iterators = new ArrayList<CloseableIterator<SAMRecord>>();
for (final SamReader r : readers) {
iterators.add(r.query(roi.getContig(), roi.getStart(), roi.getEnd(), false));
}
final MergingIterator<SAMRecord> merginIter = new MergingIterator<>(new SAMRecordCoordinateComparator(), iterators);
while (merginIter.hasNext()) {
final SAMRecord rec = merginIter.next();
if (rec.getReadUnmappedFlag())
continue;
if (this.filter.filterOut(rec))
continue;
final String sample = this.partition.getPartion(rec, null);
if (sample == null)
continue;
final int[] depth = sample2depth.get(sample);
if (depth == null)
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null)
continue;
int refpos1 = rec.getAlignmentStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
if (!op.consumesReferenceBases())
continue;
if (op.consumesReadBases()) {
for (int i = 0; i < ce.getLength(); ++i) {
if (refpos1 + i < roi.getStart())
continue;
if (refpos1 + i > roi.getEnd())
break;
depth[refpos1 + i - roi.getStart()]++;
}
}
refpos1 += ce.getLength();
}
}
merginIter.close();
for (final RegionCaptured.SlidingWindow win : roi) {
double total = 0f;
int countN = 0;
for (int pos1 = win.getStart(); pos1 <= win.getEnd(); ++pos1) {
switch(genomicSequence.charAt(pos1 - 1)) {
case 'c':
case 'C':
case 'g':
case 'G':
case 's':
case 'S':
{
total++;
break;
}
case 'n':
case 'N':
countN++;
break;
default:
break;
}
}
if (skip_if_contains_N && countN > 0)
continue;
double GCPercent = total / (double) win.length();
int max_depth_for_win = 0;
sample2meanDepth.clear();
for (final String sample : all_samples) {
int[] depth = sample2depth.get(sample);
double sum = 0;
for (int pos = win.getStart(); pos < win.getEnd() && (pos - roi.getStart()) < depth.length; ++pos) {
sum += depth[pos - roi.getStart()];
}
double mean = (sum / (double) depth.length);
max_depth_for_win = Math.max(max_depth_for_win, (int) mean);
sample2meanDepth.put(sample, mean);
}
if (max_depth_for_win < this.min_depth)
continue;
if (!this.hide_genomic_index) {
out.print(win.getGenomicIndex());
out.print("\t");
}
out.print(win.getContig());
out.print("\t");
out.print(win.getStart() - 1);
out.print("\t");
out.print(win.getEnd());
out.print("\t");
out.printf("%.2f", GCPercent);
for (String sample : all_samples) {
out.print("\t");
out.printf("%.2f", (double) sample2meanDepth.get(sample));
}
out.println();
}
}
progress.finish();
out.flush();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
for (SamReader r : readers) CloserUtil.close(r);
CloserUtil.close(indexedFastaSequenceFile);
CloserUtil.close(out);
}
}
use of com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome in project jvarkit by lindenb.
the class TestNg01 method testRefGenomeFactoryForDAS.
@Test
public void testRefGenomeFactoryForDAS() throws IOException {
final ReferenceGenomeFactory rgf = new ReferenceGenomeFactory();
rgf.setBufferSize(10);
final ReferenceGenome ref = rgf.openDAS(new URL("http://genome.cse.ucsc.edu/cgi-bin/das/hg19"));
Assert.assertTrue(ref.size() > 23);
ReferenceContig contig = ref.getContig("1");
Assert.assertNotNull(contig);
Assert.assertEquals(contig.getContig(), "1");
Assert.assertEquals(contig.length(), 249250621);
contig = ref.getContig("M");
Assert.assertNotNull(contig);
Assert.assertEquals(contig.getContig(), "M");
Assert.assertEquals(contig.length(), 16571);
String dna = "gatcacaggtctatcacc";
for (int i = 0; i < dna.length(); ++i) {
Assert.assertEquals(dna.charAt(i), contig.charAt(i));
}
dna = "cttaaataagacatcacgatg";
for (int i = 0; i < dna.length(); ++i) {
Assert.assertEquals(dna.charAt(i), contig.charAt(contig.length() - dna.length() + i));
}
ref.close();
}
use of com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome in project jvarkit by lindenb.
the class BamStats04 method doWork.
@Override
public int doWork(final List<String> args) {
if (this.bedFile == null || !this.bedFile.exists()) {
LOG.error("undefined option -B (bed file)");
return -1;
}
if (args.isEmpty()) {
LOG.error("Bam files missing");
return -1;
}
if (this.minCoverages.isEmpty()) {
this.minCoverages.add(0);
}
final String NO_PARTITION = "N/A";
BufferedReader bedIn = null;
final List<SamReader> samReaders = new ArrayList<>(args.size());
PrintWriter pw = null;
ReferenceGenome referenceGenome = null;
ReferenceContig referenceContig = null;
try {
final BedLineCodec codec = new BedLineCodec();
final Set<String> all_partitions = new TreeSet<>();
bedIn = IOUtils.openFileForBufferedReading(this.bedFile);
SAMSequenceDictionary dict = null;
for (final String filename : IOUtils.unrollFiles(args)) {
LOG.info(filename);
final SamReader samReader = super.openSamReader(filename);
if (!samReader.hasIndex()) {
LOG.error(filename + " is not indexed");
samReader.close();
return -1;
}
final SAMFileHeader samFileheader = samReader.getFileHeader();
if (samFileheader == null) {
LOG.error("SAM file is missing a header " + filename);
return -1;
}
final List<SAMReadGroupRecord> readGroups = samFileheader.getReadGroups();
if (readGroups == null || readGroups.isEmpty()) {
LOG.warn("No Read group (RG) in the header of " + filename);
all_partitions.add(NO_PARTITION);
} else {
for (final SAMReadGroupRecord rg : readGroups) {
all_partitions.add(this.partition.apply(rg, NO_PARTITION));
}
}
final SAMSequenceDictionary d = samFileheader.getSequenceDictionary();
if (d == null) {
samReader.close();
LOG.error(JvarkitException.BamDictionaryMissing.getMessage(filename));
return -1;
}
samReaders.add(samReader);
if (dict == null) {
dict = d;
} else if (SequenceUtil.areSequenceDictionariesEqual(d, dict)) {
LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(d, dict));
return -1;
}
}
if (samReaders.isEmpty()) {
LOG.error("No Bam defined");
return -1;
}
if (!StringUtil.isBlank(this.faidxUri)) {
referenceGenome = new ReferenceGenomeFactory().open(this.faidxUri);
}
pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
pw.print("#chrom\tstart\tend\tlength\t" + this.partition.name() + (referenceGenome == null ? "" : "\tgc_percent"));
pw.print("\tmincov\tmaxcov");
for (final int MIN_COVERAGE : this.minCoverages) {
pw.print("\tmeancov_" + MIN_COVERAGE + "\tmediancov_" + MIN_COVERAGE + "\tnocoveragebp_" + MIN_COVERAGE + "\tpercentcovered_" + MIN_COVERAGE);
}
pw.println();
String line = null;
while ((line = bedIn.readLine()) != null) {
if (line.isEmpty() || line.startsWith("#"))
continue;
final BedLine bedLine = codec.decode(line);
if (bedLine == null)
continue;
if (dict.getSequence(bedLine.getContig()) == null) {
LOG.error("Unknown contig in " + line);
return -1;
}
if (bedLine.getStart() > bedLine.getEnd()) {
LOG.info("ignoring " + bedLine);
continue;
}
if (referenceGenome != null && (referenceContig == null || !referenceContig.hasName(bedLine.getContig()))) {
referenceContig = referenceGenome.getContig(bedLine.getContig());
}
final Map<String, IntervalStat> sample2stats = new HashMap<>(all_partitions.size());
for (final String rgId : all_partitions) {
sample2stats.put(rgId, new IntervalStat(bedLine));
}
for (final SamReader samReader : samReaders) {
/**
* start - 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
* end - 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
*/
final SAMRecordIterator r = samReader.queryOverlapping(bedLine.getContig(), bedLine.getStart(), bedLine.getEnd());
while (r.hasNext()) {
final SAMRecord rec = r.next();
if (rec.getReadUnmappedFlag())
continue;
if (this.filter.filterOut(rec))
continue;
if (!rec.getReferenceName().equals(bedLine.getContig()))
continue;
final String partition;
final SAMReadGroupRecord group = rec.getReadGroup();
if (group == null) {
partition = NO_PARTITION;
} else {
final String name = this.partition.apply(group);
partition = (StringUtil.isBlank(name) ? NO_PARTITION : name);
}
IntervalStat stat = sample2stats.get(partition);
if (stat == null) {
stat = new IntervalStat(bedLine);
sample2stats.put(partition, stat);
}
stat.visit(rec);
}
r.close();
}
// end of loop over sam Readers
final OptionalInt gcPercentInt = (referenceContig == null ? OptionalInt.empty() : referenceContig.getGCPercent(bedLine.getStart() - 1, bedLine.getEnd()).getGCPercentAsInteger());
for (final String partitionName : sample2stats.keySet()) {
final IntervalStat stat = sample2stats.get(partitionName);
Arrays.sort(stat.counts);
pw.print(bedLine.getContig() + "\t" + (bedLine.getStart() - 1) + "\t" + (bedLine.getEnd()) + "\t" + stat.counts.length + "\t" + partitionName);
if (referenceGenome != null) {
pw.print("\t");
if (gcPercentInt.isPresent())
pw.print(gcPercentInt.getAsInt());
}
pw.print("\t" + stat.counts[0] + "\t" + stat.counts[stat.counts.length - 1]);
for (final int MIN_COVERAGE : this.minCoverages) {
/**
* map depth to 0 if depth <= MIN_COVERAGE
*/
final IntUnaryOperator depthAdjuster = (D) -> (D <= MIN_COVERAGE ? 0 : D);
final int count_no_coverage = (int) Arrays.stream(stat.counts).filter(D -> depthAdjuster.applyAsInt(D) <= 0).count();
final double mean = Percentile.average().evaluate(Arrays.stream(stat.counts).map(depthAdjuster));
final double median_depth = Percentile.median().evaluate(Arrays.stream(stat.counts).map(depthAdjuster));
pw.print("\t" + mean + "\t" + median_depth + "\t" + count_no_coverage + "\t" + (int) (((stat.counts.length - count_no_coverage) / (double) stat.counts.length) * 100.0));
}
pw.println();
}
}
pw.flush();
pw.close();
pw = null;
LOG.info("done");
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(referenceGenome);
CloserUtil.close(pw);
CloserUtil.close(bedIn);
CloserUtil.close(samReaders);
}
}
Aggregations