Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
The class CnvSlidingWindow, method doWork.
@Override
public int doWork(final List<String> args) {
final List<Sample> samples = new ArrayList<>();
try {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
final DistanceParser distParser = new DistanceParser();
final int[] windows_array = Arrays.stream(CharSplitter.SEMICOLON.split(windowDefs)).filter(S -> !StringUtils.isBlank(S)).mapToInt(N -> distParser.applyAsInt(N)).toArray();
if (windows_array.length == 0) {
LOG.error("No window defined.");
return -1;
}
if (windows_array.length % 2 != 0) {
LOG.error("odd number of windows ? " + this.windowDefs);
return -1;
}
final List<Path> inputBams = IOUtils.unrollPaths(args);
if (inputBams.isEmpty()) {
LOG.error("input bam file missing.");
return -1;
}
final Set<String> sampleNames = new TreeSet<>();
for (final Path samFile : inputBams) {
final Sample sample = new Sample(samFile);
if (sampleNames.contains(sample.name)) {
LOG.error("duplicate sample " + sample.name);
sample.close();
return -1;
}
sampleNames.add(sample.name);
samples.add(sample);
SequenceUtil.assertSequenceDictionariesEqual(dict, sample.dict);
}
final List<Locatable> contigs = dict.getSequences().stream().filter(SR -> !SR.getSequenceName().matches(this.excludeRegex)).collect(Collectors.toCollection(ArrayList::new));
if (this.excludeBedPath != null) {
final BedLineCodec bedLineCodec = new BedLineCodec();
final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(dict);
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
final List<SimpleInterval> exclude = br.lines().filter(L -> !BedLine.isBedHeader(L)).map(L -> bedLineCodec.decode(L)).filter(B -> B != null && !StringUtils.isBlank(ctgConverter.apply(B.getContig()))).map(B -> new SimpleInterval(ctgConverter.apply(B.getContig()), B.getStart(), B.getEnd())).collect(Collectors.toList());
boolean done = false;
while (!done) {
done = true;
int i = 0;
while (i < contigs.size()) {
final Locatable contig = contigs.get(i);
final Locatable overlapper = exclude.stream().filter(EX -> EX.overlaps(contig)).findAny().orElse(null);
if (overlapper != null) {
contigs.remove(i);
contigs.addAll(split(contig, overlapper));
done = false;
} else {
i++;
}
}
}
}
}
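// sort the remaining (possibly split) intervals in reference dictionary order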
contigs.sort(new ContigDictComparator(dict).createLocatableComparator());
final Allele ref_allele = Allele.create("N", true);
final Allele dup_allele = Allele.create("<DUP>", false);
final Allele del_allele = Allele.create("<DEL>", false);
final Function<Integer, List<Allele>> cnv2allele = CNV -> {
switch(CNV) {
case 0:
return Arrays.asList(ref_allele, ref_allele);
case 1:
return Arrays.asList(ref_allele, dup_allele);
case 2:
return Arrays.asList(dup_allele, dup_allele);
case -1:
return Arrays.asList(ref_allele, del_allele);
case -2:
return Arrays.asList(del_allele, del_allele);
default:
throw new IllegalArgumentException("cnv:" + CNV);
}
};
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY);
final VCFFormatHeaderLine fmtLeftCov = new VCFFormatHeaderLine("LC", 1, VCFHeaderLineType.Float, "Left median coverage.");
final VCFFormatHeaderLine fmtMidCov = new VCFFormatHeaderLine("MC", 1, VCFHeaderLineType.Float, "Middle median coverage.");
final VCFFormatHeaderLine fmtRightCov = new VCFFormatHeaderLine("RC", 1, VCFHeaderLineType.Float, "Right median coverage.");
final VCFFormatHeaderLine fmtLeftMedian = new VCFFormatHeaderLine("LM", 1, VCFHeaderLineType.Float, "Left normalized median coverage.");
final VCFFormatHeaderLine fmtMidMedian = new VCFFormatHeaderLine("MM", 1, VCFHeaderLineType.Float, "Middle normalized median coverage.");
final VCFFormatHeaderLine fmtRightMedian = new VCFFormatHeaderLine("RM", 1, VCFHeaderLineType.Float, "Right normalized median coverage.");
metaData.add(fmtLeftCov);
metaData.add(fmtMidCov);
metaData.add(fmtRightCov);
metaData.add(fmtLeftMedian);
metaData.add(fmtMidMedian);
metaData.add(fmtRightMedian);
final VCFHeader header = new VCFHeader(metaData, sampleNames);
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
final VariantContextWriter vcw = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
vcw.writeHeader(header);
for (final Locatable contig : contigs) {
System.gc();
final short[] array = new short[contig.getLengthOnReference()];
final SortingCollection<Gt> sorter = SortingCollection.newInstance(Gt.class, new GtCodec(), (A, B) -> A.compare1(B), this.sorting.getMaxRecordsInRam(), this.sorting.getTmpPaths());
for (int bam_index = 0; bam_index < samples.size(); bam_index++) {
final Sample sampleBam = samples.get(bam_index);
Arrays.fill(array, (short) 0);
try (SAMRecordIterator iter = sampleBam.samReader.queryOverlapping(contig.getContig(), contig.getStart(), contig.getEnd())) {
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
int refPos = rec.getStart();
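// walk the CIGAR: only operators that consume both read and reference bases (M/=/X) add depth, saturating at Short.MAX_VALUE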
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int i = 0; i < ce.getLength(); i++) {
final int idx = refPos - contig.getStart() + i;
if (idx < 0)
continue;
if (idx >= array.length)
break;
if (array[idx] == Short.MAX_VALUE)
continue;
array[idx]++;
}
}
refPos += ce.getLength();
}
}
}
}
for (int widx = 0; widx < windows_array.length; widx += 2) {
final int window_size = windows_array[widx + 0];
final int extend = (int) Math.ceil(window_size * this.extend);
if (extend <= 0)
continue;
if (window_size > contig.getLengthOnReference())
continue;
final int window_shift = windows_array[widx + 1];
LOG.info(contig + " " + window_size + "+-" + extend + ";" + window_shift + " " + sampleBam.name);
final Coverage leftcov = new Coverage(extend);
final Coverage rightcov = new Coverage(extend);
final Coverage leftrightcov = new Coverage(extend + extend);
final Coverage midcov = new Coverage(window_size);
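// layout per step: [left flank: extend bp][middle: window_size bp][right flank: extend bp]; the middle is the candidate CNV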
for (int pos1 = contig.getStart(); pos1 + window_size + extend + extend < contig.getEnd(); pos1 += window_shift) {
leftcov.reset();
rightcov.reset();
leftrightcov.reset();
midcov.reset();
for (int x = 0; x < extend; x++) {
final int idx = pos1 - contig.getStart() + x;
leftcov.add(array[idx]);
leftrightcov.add(array[idx]);
}
final double leftMedian = leftcov.median();
if (leftMedian < this.min_depth)
continue;
for (int x = 0; x < extend; x++) {
final int idx = pos1 - contig.getStart() + extend + window_size + x;
rightcov.add(array[idx]);
leftrightcov.add(array[idx]);
}
final double rightMedian = rightcov.median();
if (rightMedian < this.min_depth)
continue;
for (int x = 0; x < window_size; x++) {
final int idx = pos1 - contig.getStart() + extend + x;
midcov.add(array[idx]);
}
final double median = leftrightcov.median();
if (median < this.min_depth)
continue;
for (int x = 0; x < midcov.count; x++) {
midcov.array[x] /= median;
}
for (int x = 0; x < leftcov.count; x++) {
leftcov.array[x] /= median;
}
for (int x = 0; x < rightcov.count; x++) {
rightcov.array[x] /= median;
}
final double norm_depth = midcov.median();
final int cnv = getCNVIndex(norm_depth);
if (cnv != CNV_UNDEFINED && cnv != 1 && getCNVIndex(leftcov.median()) == 1 && getCNVIndex(rightcov.median()) == 1) {
final Gt gt = new Gt();
gt.start = pos1 + extend;
gt.end = gt.start + window_size;
gt.sample_idx = bam_index;
gt.cnv = cnv;
gt.leftMedian = (float) leftMedian;
gt.midMedian = (float) median;
gt.rightMedian = (float) rightMedian;
gt.leftCov = (float) leftcov.median();
gt.midCov = (float) norm_depth;
gt.rightCov = (float) rightcov.median();
sorter.add(gt);
}
}
}
}
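// all samples processed for this contig: emit one VCF record per distinct interval, merging genotypes across samples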
sorter.setDestructiveIteration(true);
try (CloseableIterator<Gt> iter = sorter.iterator()) {
final EqualRangeIterator<Gt> eq = new EqualRangeIterator<>(iter, (A, B) -> A.compare0(B));
while (eq.hasNext()) {
final List<Gt> row = eq.next();
if (row.isEmpty())
continue;
final Gt first = row.get(0);
final Set<Allele> alleles = new HashSet<>();
row.stream().flatMap(GT -> cnv2allele.apply(GT.cnv).stream()).forEach(CNV -> alleles.add(CNV));
alleles.add(ref_allele);
if (alleles.size() < 2)
continue;
final VariantContextBuilder vcb = new VariantContextBuilder(null, contig.getContig(), first.start, first.end, alleles);
vcb.attribute(VCFConstants.END_KEY, first.end);
final List<Genotype> genotypes = new ArrayList<>(samples.size());
for (final Gt gt : row) {
final GenotypeBuilder gb = new GenotypeBuilder(samples.get(gt.sample_idx).name, cnv2allele.apply(gt.cnv));
gb.attribute(fmtLeftMedian.getID(), (double) gt.leftMedian);
gb.attribute(fmtMidMedian.getID(), (double) gt.midMedian);
gb.attribute(fmtRightMedian.getID(), (double) gt.rightMedian);
gb.attribute(fmtLeftCov.getID(), (double) gt.leftCov);
gb.attribute(fmtMidCov.getID(), (double) gt.midCov);
gb.attribute(fmtRightCov.getID(), (double) gt.rightCov);
genotypes.add(gb.make());
}
vcb.genotypes(genotypes);
vcw.add(vcb.make());
}
eq.close();
}
sorter.cleanup();
}
vcw.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
samples.forEach(S -> CloserUtil.close(S));
}
}
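A minimal sketch of the core ContigDictComparator pattern above, using the same classes as the snippet and assuming a reference with a sequence dictionary (the path and intervals are hypothetical):
// minimal sketch: sort Locatables in the order declared by the reference dictionary
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(Paths.get("ref.fa"));
final List<Locatable> intervals = new ArrayList<>();
intervals.add(new SimpleInterval("chr2", 100, 200));
intervals.add(new SimpleInterval("chr1", 500, 600));
intervals.sort(new ContigDictComparator(dict).createLocatableComparator());
// intervals are now ordered chr1 before chr2 (dictionary order), then by start, then by end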
Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
The class GtfRetroCopy, method doWork.
@Override
public int doWork(final List<String> args) {
VCFReader vcfFileReader = null;
VariantContextWriter vcw0 = null;
try {
/* load the reference genome */
/* create a contig name converter from the REF */
final Set<String> knownGeneIds;
if (this.knownPath != null) {
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.knownPath)) {
knownGeneIds = br.lines().filter(L -> !StringUtils.isBlank(L)).map(S -> S.trim()).filter(S -> !(S.equals("-") || S.equals(".") || S.startsWith("#"))).collect(Collectors.toSet());
}
} else {
knownGeneIds = Collections.emptySet();
}
// open the sam file
final String input = oneAndOnlyOneFile(args);
vcfFileReader = VCFReaderFactory.makeDefault().open(Paths.get(input), true);
final VCFHeader header = vcfFileReader.getHeader();
final SAMSequenceDictionary dict = header.getSequenceDictionary();
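// contig order: VCF dictionary when present, plain lexicographic otherwise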
final Comparator<String> contigCmp = dict == null ? (A, B) -> A.compareTo(B) : new ContigDictComparator(dict);
final Comparator<Gene> geneCmp = (A, B) -> {
int i = contigCmp.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
};
final GtfReader gtfReader = new GtfReader(this.gtfPath);
if (dict != null && !dict.isEmpty()) {
this.writingVcf.dictionary(dict);
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
}
final List<Gene> genes = gtfReader.getAllGenes().stream().filter(G -> G.getTranscripts().stream().count() > 0L).filter(G -> G.getTranscripts().stream().anyMatch(T -> T.getIntronCount() >= this.min_intron_count)).sorted(geneCmp).collect(Collectors.toList());
gtfReader.close();
/**
* build vcf header
*/
final Set<VCFHeaderLine> metaData = new HashSet<>();
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_BOUNDS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Introns boundaries"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_SIZES, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Introns sizes"));
for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
metaData.add(new VCFInfoHeaderLine(att, 1, VCFHeaderLineType.String, "Value for the attribute '" + att + "' in the gtf"));
}
// metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
// metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
// "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
metaData.add(new VCFFilterHeaderLine(ATT_KNOWN, "Known RetroGenes. " + (this.knownPath == null ? "" : " Source: " + this.knownPath)));
final VCFHeader header2 = new VCFHeader(header);
metaData.stream().forEach(H -> header2.addMetaDataLine(H));
JVarkitVersion.getInstance().addMetaData(this, header2);
final Allele ref = Allele.create((byte) 'N', true);
final Allele alt = Allele.create("<RETROCOPY>", false);
/* open vcf for writing*/
vcw0 = this.writingVcf.open(this.outputFile);
vcw0.writeHeader(header2);
final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().logger(LOG).dictionary(dict).build();
for (final Gene gene : genes) {
progress.apply(gene);
final List<VariantContext> variants = new ArrayList<>();
final CloseableIterator<VariantContext> iter2 = vcfFileReader.query(gene.getContig(), gene.getStart(), gene.getEnd());
while (iter2.hasNext()) {
final VariantContext ctx = iter2.next();
// SNV
if (ctx.getStart() == ctx.getEnd())
continue;
final StructuralVariantType svType = ctx.getStructuralVariantType();
if (StructuralVariantType.BND.equals(svType))
continue;
if (StructuralVariantType.INS.equals(svType))
continue;
variants.add(ctx);
}
iter2.close();
if (variants.isEmpty())
continue;
for (final Transcript transcript : gene.getTranscripts()) {
if (!transcript.hasIntron())
continue;
if (transcript.getIntronCount() < this.min_intron_count)
continue;
final Counter<String> samples = new Counter<>();
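// for each sample, count SV calls whose start and end both fall near an intron's boundaries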
for (final Intron intron : transcript.getIntrons()) {
for (final VariantContext ctx : variants) {
if (!isWithinDistance(intron.getStart(), ctx.getStart()))
continue;
if (!isWithinDistance(intron.getEnd(), ctx.getEnd()))
continue;
if (ctx.hasGenotypes()) {
for (final Genotype g : ctx.getGenotypes()) {
if (g.isNoCall() || g.isHomRef())
continue;
samples.incr(g.getSampleName());
}
} else {
samples.incr("*");
}
}
// end of loop over variants
}
// end of loop over introns
final long max_count = samples.stream().mapToLong(E -> E.getValue()).max().orElse(0L);
if (max_count == 0)
continue;
if (this.only_all_introns && max_count != transcript.getIntronCount())
continue;
// ok good candidate
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(transcript.getContig());
vcb.start(transcript.getStart());
vcb.stop(transcript.getEnd());
switch(this.idKey) {
case gene_name:
final String gn = transcript.getGene().getGeneName();
vcb.id(StringUtils.isBlank(gn) ? transcript.getId() : gn);
break;
case gene_id:
vcb.id(transcript.getGene().getId());
break;
case transcript_id:
vcb.id(transcript.getId());
break;
default:
throw new IllegalStateException();
}
final List<Allele> alleles = Arrays.asList(ref, alt);
// vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY,2);
// vcb.attribute(VCFConstants.ALLELE_COUNT_KEY,1);
// vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY,0.5);
vcb.attribute(VCFConstants.SVTYPE, "DEL");
vcb.attribute(VCFConstants.END_KEY, transcript.getEnd());
vcb.attribute("SVLEN", transcript.getLengthOnReference());
vcb.attribute(ATT_INTRONS_BOUNDS, transcript.getIntrons().stream().map(S -> "" + S.getStart() + "-" + S.getEnd()).collect(Collectors.toList()));
vcb.attribute(ATT_INTRONS_SIZES, transcript.getIntrons().stream().mapToInt(S -> S.getLengthOnReference()).toArray());
for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
final String v = transcript.getProperties().get(att);
if (StringUtils.isBlank(v))
continue;
vcb.attribute(att, v);
}
vcb.alleles(alleles);
boolean pass_filter = true;
// introns sequences
vcb.attribute(ATT_INTRONS_CANDIDATE_COUNT, max_count);
vcb.attribute(ATT_INTRONS_COUNT, transcript.getIntronCount());
vcb.attribute(ATT_INTRONS_CANDIDATE_FRACTION, max_count / (float) transcript.getIntronCount());
if (transcript.getIntronCount() != max_count) {
vcb.filter(ATT_NOT_ALL_INTRONS);
pass_filter = false;
}
if (knownGeneIds.contains(transcript.getGene().getId())) {
vcb.filter(ATT_KNOWN);
pass_filter = false;
}
if (header.hasGenotypingData()) {
final List<Genotype> genotypes = new ArrayList<>();
for (final String sn : header.getSampleNamesInOrder()) {
final List<Allele> gtalleles;
if (samples.count(sn) == 0L) {
gtalleles = Arrays.asList(ref, ref);
} else {
gtalleles = Arrays.asList(ref, alt);
}
final GenotypeBuilder gb = new GenotypeBuilder(sn, gtalleles);
genotypes.add(gb.make());
}
vcb.genotypes(genotypes);
}
if (pass_filter)
vcb.passFilters();
vcw0.add(vcb.make());
}
}
progress.close();
vcw0.close();
vcfFileReader.close();
vcfFileReader = null;
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(vcfFileReader);
CloserUtil.close(vcw0);
}
}
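A side note on the hand-rolled geneCmp above: the same ordering can be expressed with the standard comparator-chaining API, assuming Gene exposes the Locatable accessors used above (geneCmp2 is a hypothetical name):
// equivalent to geneCmp: contig (dictionary order), then start, then end
final Comparator<Gene> geneCmp2 = Comparator
.comparing(Gene::getContig, contigCmp)
.thenComparingInt(Gene::getStart)
.thenComparingInt(Gene::getEnd);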
Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
The class ReferenceToHtml, method doWork.
@Override
public int doWork(final List<String> args) {
try {
try (VCFReader reader = VCFReaderFactory.makeDefault().open(Paths.get(oneAndOnlyOneFile(args)), true)) {
final VCFHeader header = reader.getHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final Optional<String> buildName = SequenceDictionaryUtils.getBuildName(dict);
try (ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx)) {
SequenceUtil.assertSequenceDictionariesEqual(SequenceDictionaryUtils.extractRequired(reference), dict);
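// user-supplied regions, sorted in dictionary order so the pages and the index follow the reference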
final List<Locatable> locs = IntervalListProvider.from(this.regionsStr).dictionary(dict).stream().sorted(new ContigDictComparator(dict).createLocatableComparator()).collect(Collectors.toList());
if (locs.isEmpty()) {
LOG.error("no region was defined");
return -1;
}
final XMLOutputFactory xof = XMLOutputFactory.newFactory();
try (ArchiveFactory archive = ArchiveFactory.open(archiveOutput)) {
try (PrintWriter pw = archive.openWriter(this.prefix + "script.js")) {
pw.println(StaticCodeExtractor.forClass(this.getClass()).extract("SCRIPT").get());
pw.flush();
}
try (PrintWriter pw = archive.openWriter(this.prefix + "style.css")) {
pw.println(StaticCodeExtractor.forClass(this.getClass()).extract("CSS").get());
pw.flush();
}
final OutputStream index_os = archive.openOuputStream(this.prefix + "index.html");
final XMLStreamWriter index_html = xof.createXMLStreamWriter(index_os, "UTF-8");
index_html.writeStartDocument("UTF-8", "1.0");
index_html.writeStartElement("html");
index_html.writeStartElement("head");
writeMeta(index_html);
index_html.writeStartElement("title");
index_html.writeCharacters(this.getProgramName());
// title
index_html.writeEndElement();
index_html.writeStartElement("link");
index_html.writeAttribute("rel", "stylesheet");
index_html.writeAttribute("href", this.prefix + "style.css");
index_html.writeCharacters("");
// link
index_html.writeEndElement();
// head
index_html.writeEndElement();
index_html.writeStartElement("body");
index_html.writeStartElement("ul");
for (final Locatable loc : locs) {
final List<VariantContext> variants;
try (CloseableIterator<VariantContext> iter = reader.query(loc)) {
variants = iter.stream().collect(Collectors.toList());
}
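// serialize the region and its variants as JSON; the generated page's script.js consumes the global "g"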
final StringWriter sw = new StringWriter();
sw.append("var g = ");
final JsonWriter jsw = new JsonWriter(sw);
jsw.beginObject();
jsw.name("contig").value(loc.getContig());
jsw.name("start").value(loc.getStart());
jsw.name("end").value(loc.getEnd());
jsw.name("length").value(loc.getLengthOnReference());
jsw.name("sequence").value(reference.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getEnd()).getBaseString());
jsw.name("variants");
jsw.beginArray();
for (VariantContext ctx : variants) {
jsw.beginObject();
jsw.name("start").value(ctx.getStart());
jsw.name("end").value(ctx.getEnd());
jsw.name("id").value(ctx.hasID() ? ctx.getID() : "");
jsw.name("type").value(ctx.getType().name());
jsw.name("ref").value(ctx.getReference().getDisplayString());
jsw.name("alts");
jsw.beginArray();
for (Allele alt : ctx.getAlternateAlleles()) {
jsw.value(alt.getDisplayString());
}
jsw.endArray();
jsw.endObject();
}
jsw.endArray();
jsw.endObject();
jsw.flush();
jsw.close();
sw.append(";");
final String filename = this.prefix + loc.getContig() + "_" + loc.getStart() + "_" + loc.getEnd() + ".html";
final String title = (buildName.isPresent() ? buildName.get() + " " : "") + new SimpleInterval(loc).toNiceString();
OutputStream os = archive.openOuputStream(filename);
XMLStreamWriter out = xof.createXMLStreamWriter(os, "UTF-8");
out.writeStartDocument("UTF-8", "1.0");
out.writeStartElement("html");
out.writeStartElement("head");
writeMeta(out);
out.writeStartElement("script");
out.writeCharacters(sw.toString());
// script
out.writeEndElement();
out.writeStartElement("script");
out.writeAttribute("src", this.prefix + "script.js");
out.writeCharacters("");
// script
out.writeEndElement();
out.writeStartElement("link");
out.writeAttribute("rel", "stylesheet");
out.writeAttribute("href", this.prefix + "style.css");
out.writeCharacters("");
// link
out.writeEndElement();
out.writeStartElement("title");
out.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
// title
out.writeEndElement();
// head
out.writeEndElement();
out.writeStartElement("body");
out.writeStartElement("h1");
out.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
out.writeEndElement();
out.writeStartElement("div");
checkBox(out, "showPos", "Line Number");
checkBox(out, "showSpace", "Spaces");
checkBox(out, "showVar", "Show Variants");
out.writeCharacters(" ");
out.writeEmptyElement("input");
out.writeAttribute("id", "primer");
out.writeAttribute("type", "text");
out.writeAttribute("placeholder", "Primer Sequence");
out.writeStartElement("button");
out.writeAttribute("id", "search");
out.writeCharacters("Search...");
out.writeEndElement();
// div
out.writeEndElement();
out.writeStartElement("div");
out.writeAttribute("class", "sequence");
out.writeAttribute("id", "main");
// div
out.writeEndElement();
// body
out.writeEndElement();
// html
out.writeEndElement();
os.flush();
os.close();
index_html.writeStartElement("li");
index_html.writeStartElement("a");
index_html.writeAttribute("href", filename);
index_html.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
// a
index_html.writeEndElement();
// li
index_html.writeEndElement();
}
// ul
index_html.writeEndElement();
// body
index_html.writeEndElement();
// html
index_html.writeEndElement();
index_html.writeEndDocument();
index_html.close();
index_os.flush();
index_os.close();
}
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
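For context, the embedded sequence above comes from htsjdk's ReferenceSequenceFile.getSubsequenceAt (1-based, inclusive coordinates); a minimal sketch with a hypothetical path and region, called from a method that declares IOException:
// requires an indexed FASTA (.fai); fetches the bases of chr1:100-200
try (ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(Paths.get("ref.fa"))) {
final String bases = ref.getSubsequenceAt("chr1", 100, 200).getBaseString();
System.out.println(bases);
}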
Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
The class CopyNumber01, method doWork.
@Override
public int doWork(final List<String> args) {
ReferenceSequenceFile indexedFastaSequenceFile = null;
try {
this.sexContigSet.addAll(Arrays.stream(this.sexContigStr.split("[, \t]")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
/* loading REF Reference */
indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(indexedFastaSequenceFile);
final Comparator<Locatable> locComparator = new ContigDictComparator(dict).createLocatableComparator();
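// one comparator instance, reused below for the initial sort and to restore genomic order after normalization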
final List<Locatable> intervals = new ArrayList<>();
if (this.bedFile == null) {
for (final Locatable loc : dict.getSequences()) {
intervals.add(loc);
}
} else {
final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
try (BedLineReader br = new BedLineReader(this.bedFile)) {
br.stream().filter(L -> !StringUtil.isBlank(converter.apply(L.getContig()))).forEach(B -> {
final String ctg = converter.apply(B.getContig());
intervals.add(new SimpleInterval(ctg, B.getStart(), B.getEnd()));
});
}
}
if (intervals.isEmpty()) {
LOG.error("no interval defined.");
return -1;
}
LOG.info("intervals N=" + intervals.size() + " mean-size:" + intervals.stream().mapToInt(R -> R.getLengthOnReference()).average().orElse(0.0));
final List<GCAndDepth> user_items = new ArrayList<>();
// split intervals
for (final Locatable loc : intervals) {
int pos = loc.getStart();
while (pos < loc.getEnd()) {
final int pos_end = Math.min(pos + this.windowSize, loc.getEnd());
final GCAndDepth dataRow = new GCAndDepth(loc.getContig(), pos, pos_end);
if (dataRow.getLengthOnReference() < this.windowMin) {
break;
}
user_items.add(dataRow);
pos += this.windowShift;
}
}
// free memory
intervals.clear();
LOG.info("sorting N=" + user_items.size());
Collections.sort(user_items, locComparator);
// fill gc percent
LOG.info("fill gc% N=" + user_items.size());
for (final String ctg : user_items.stream().map(T -> T.getContig()).collect(Collectors.toSet())) {
final GenomicSequence gseq = new GenomicSequence(indexedFastaSequenceFile, ctg);
for (final GCAndDepth dataRow : user_items) {
if (!dataRow.getContig().equals(ctg))
continue;
final GCPercent gc = gseq.getGCPercent(dataRow.getStart(), dataRow.getEnd());
if (gc.isEmpty())
continue;
dataRow.gc = gc.getGCPercent();
}
}
// remove strange gc
user_items.removeIf(B -> B.gc < this.minGC);
user_items.removeIf(B -> B.gc > this.maxGC);
LOG.info("remove high/low gc% N=" + user_items.size());
if (user_items.stream().allMatch(P -> isSex(P.getContig()))) {
LOG.error("All chromosomes are defined as sexual. Cannot normalize");
return -1;
}
final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mappingQuality);
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
/* header */
pw.println("#CHROM\tSTART\tEND\tSample\tIDX\tGC\tRAW-DEPTH\tNORM-DEPTH");
for (final Path bamPath : IOUtils.unrollPaths(args)) {
// open this samReader
try (SamReader samReader = super.createSamReaderFactory().referenceSequence(this.refFile).open(bamPath)) {
if (!samReader.hasIndex()) {
LOG.error("file is not indexed " + bamPath);
return -1;
}
final SAMFileHeader header = samReader.getFileHeader();
SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
final String sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toSet()).stream().collect(HtsCollectors.toSingleton());
final List<GCAndDepth> bam_items = new ArrayList<>(user_items.size());
/* loop over contigs */
for (final SAMSequenceRecord ssr : dict.getSequences()) {
/* create a **COPY** of the intervals */
final List<GCAndDepth> ctgitems = user_items.stream().filter(T -> T.contigsMatch(ssr)).collect(Collectors.toList());
if (ctgitems.isEmpty())
continue;
LOG.info("Getting coverage for " + ssr.getSequenceName() + " N=" + ctgitems.size());
// get coverage
final CoverageFactory.SimpleCoverage coverage = coverageFactory.getSimpleCoverage(samReader, ctgitems, sampleName);
// fill coverage
for (final GCAndDepth gc : ctgitems) {
final OptionalDouble optCov;
switch(this.univariateDepth) {
case median:
optCov = coverage.getMedian(gc);
break;
case mean:
optCov = coverage.getAverage(gc);
break;
default:
throw new IllegalStateException();
}
gc.raw_depth = optCov.orElse(-1.0);
gc.norm_depth = gc.raw_depth;
}
ctgitems.removeIf(V -> V.raw_depth < 0);
ctgitems.removeIf(V -> V.raw_depth > this.weirdMaxDepth);
ctgitems.removeIf(V -> V.raw_depth < this.weirdMinDepth);
if (ctgitems.isEmpty())
continue;
bam_items.addAll(ctgitems);
}
double[] y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.raw_depth).toArray();
LOG.info("median raw depth " + new Median().evaluate(y, 0, y.length));
Collections.sort(bam_items, (a, b) -> {
final int i = Double.compare(a.getX(), b.getX());
if (i != 0)
return i;
return Double.compare(a.getY(), b.getY());
});
double[] x = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getX()).toArray();
y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getY()).toArray();
// get min GC
final double min_x = x[0];
// get max GC
final double max_x = x[x.length - 1];
LOG.info("min/max gc " + min_x + " " + max_x);
/* merge adjacent x (GC%) having same values */
int i = 0;
int k = 0;
while (i < x.length) {
int j = i + 1;
while (j < x.length && Precision.equals(x[i], x[j])) {
++j;
}
x[k] = x[i];
y[k] = this.univariateGCLoess.create().evaluate(y, i, j - i);
++k;
i = j;
}
LOG.info("merged n=" + (x.length - k) + " items.");
/* reduce size of x et y */
final List<XY> xyL = new ArrayList<>(k);
for (int t = 0; t < k; ++t) {
xyL.add(new XYImpl(x[t], y[t]));
}
/* sort on Y */
Collections.sort(xyL, (a, b) -> {
final int d = Double.compare(a.getX(), b.getX());
if (d != 0)
return d;
return Double.compare(a.getY(), b.getY());
});
x = xyL.stream().mapToDouble(R -> R.getX()).toArray();
y = xyL.stream().mapToDouble(R -> R.getY()).toArray();
final UnivariateInterpolator interpolator = createInterpolator();
UnivariateFunction spline = null;
try {
spline = interpolator.interpolate(x, y);
} catch (final org.apache.commons.math3.exception.NumberIsTooSmallException err) {
spline = null;
LOG.error("Cannot use " + interpolator.getClass().getName() + ":" + err.getMessage());
}
// min depth cal
int points_removed = 0;
i = 0;
while (i < bam_items.size()) {
final GCAndDepth r = bam_items.get(i);
if (spline == null) {
++i;
} else if (r.getX() < min_x || r.getX() > max_x) {
bam_items.remove(i);
++points_removed;
} else {
final double norm;
if (this.gcDepthInterpolation.equals(UnivariateInerpolation.identity)) {
norm = r.getY();
} else {
norm = spline.value(r.getX());
}
if (Double.isNaN(norm) || Double.isInfinite(norm)) {
LOG.info("NAN " + r);
bam_items.remove(i);
++points_removed;
continue;
}
r.norm_depth = norm;
++i;
}
}
LOG.info("removed " + points_removed + ". now N=" + bam_items.size());
if (bam_items.isEmpty())
continue;
spline = null;
// DO NOT NORMALIZE ON MINIMUM DEPTH, STUPID.
// normalize on median
y = bam_items.stream().mapToDouble(G -> G.getY()).toArray();
final double median_depth = this.univariateMid.create().evaluate(y, 0, y.length);
LOG.info("median norm depth : " + median_depth);
for (i = 0; median_depth > 0 && i < bam_items.size(); ++i) {
final GCAndDepth gc = bam_items.get(i);
gc.norm_depth /= median_depth;
}
// restore genomic order
Collections.sort(bam_items, locComparator);
// smoothing values with neighbours
y = bam_items.stream().mapToDouble(V -> V.getY()).toArray();
for (i = 0; i < bam_items.size(); ++i) {
final GCAndDepth gc = bam_items.get(i);
int left = i;
for (int j = Math.max(0, i - this.smooth_window); j <= i; ++j) {
final GCAndDepth gc2 = bam_items.get(j);
if (!gc2.withinDistanceOf(gc, this.smoothDistance))
continue;
left = j;
break;
}
int right = i;
for (int j = i; j <= i + this.smooth_window && j < bam_items.size(); ++j) {
final GCAndDepth gc2 = bam_items.get(j);
if (!gc2.withinDistanceOf(gc, this.smoothDistance))
break;
right = j;
// no break;
}
gc.norm_depth = this.univariateSmooth.create().evaluate(y, left, (right - left) + 1);
}
/* print data */
for (final GCAndDepth r : bam_items) {
pw.print(r.getContig());
pw.print('\t');
pw.print(r.getStart() - 1);
pw.print('\t');
pw.print(r.getEnd());
pw.print('\t');
pw.print(sampleName);
pw.print('\t');
pw.print(getGenomicIndex(dict, r.getContig(), r.getStart()) - 1);
pw.print('\t');
pw.printf("%.3f", r.gc);
pw.print('\t');
pw.printf("%.3f", r.raw_depth);
pw.print('\t');
pw.printf("%.3f", r.norm_depth);
pw.println();
}
pw.flush();
}
// samReader
}
// end loop bamPath
pw.flush();
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(indexedFastaSequenceFile);
}
}
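The GC correction above fits depth as a function of GC%; a minimal sketch of one common normalization, dividing raw depth by the LOESS-fitted expectation, using commons-math's LoessInterpolator. This is an illustrative assumption, not necessarily the exact update applied by the tool's configurable interpolators:
// x = GC values (strictly increasing after the merge step above), y = depths
final UnivariateFunction fit = new LoessInterpolator().interpolate(x, y);
for (final GCAndDepth w : bam_items) {
// skip windows outside the fitted GC range; value() would throw there
if (w.getX() < x[0] || w.getX() > x[x.length - 1]) continue;
final double expected = fit.value(w.getX());
if (expected > 0) w.norm_depth = w.raw_depth / expected;
}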
Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
The class BedNonOverlappingSet, method doWork.
@Override
public int doWork(final List<String> args) {
if (this.extend < 0) {
LOG.info("extend cannot be negative.");
return -1;
}
ArchiveFactory archiveFactory = null;
PrintWriter manifest = null;
BufferedReader br;
try {
final Comparator<String> cmpContig;
if (this.refFile != null) {
final SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(this.refFile);
cmpContig = new ContigDictComparator(dict);
} else {
cmpContig = (A, B) -> A.compareTo(B);
}
final Comparator<Locatable> cmpbed = (A, B) -> {
int i = cmpContig.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
i = Integer.compare(A.getEnd(), B.getEnd());
return i;
};
if (this.manifestFile != null) {
manifest = IOUtils.openPathForPrintWriter(this.manifestFile);
} else {
manifest = new PrintWriter(new NullOuputStream());
}
if (args.isEmpty()) {
br = super.openBufferedReader(null);
scan(br);
br.close();
} else {
for (final String fname : args) {
br = super.openBufferedReader(fname);
scan(br);
br.close();
}
}
if (this.bedsets.isEmpty()) {
LOG.warn("No set was created");
}
archiveFactory = ArchiveFactory.open(this.archivePath);
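// bgzf-compressed BEDs are already deflated: store them uncompressed in the archive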
if (compressed)
archiveFactory.setCompressionLevel(0);
for (int i = 0; i < this.bedsets.size(); i++) {
final BlockCompressedOutputStream bcos;
final PrintWriter pw;
String filename = String.format("%05d", (i + 1));
filename += FileExtensions.BED;
if (this.compressed) {
filename += ".gz";
bcos = new BlockCompressedOutputStream(archiveFactory.openOuputStream(filename), (Path) null);
pw = new PrintWriter(bcos);
} else {
bcos = null;
pw = archiveFactory.openWriter(filename);
}
final IntervalTreeMap<BedRecord> itm = this.bedsets.get(i);
final List<BedLine> list = itm.values().stream().map(R -> R.bedline).sorted(cmpbed).collect(Collectors.toList());
LOG.info("saving " + filename + " " + list.size());
for (final BedLine bl : list) {
pw.println(bl.join());
}
if (bcos != null)
bcos.flush();
pw.flush();
pw.close();
manifest.println(filename + "\t" + list.size());
}
archiveFactory.close();
archiveFactory = null;
manifest.flush();
manifest.close();
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(archiveFactory);
CloserUtil.close(manifest);
}
}
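A minimal sketch of the comparator fallback used above, with a hypothetical reference path: dictionary order when a reference is available, lexicographic order otherwise.
final SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(Paths.get("ref.fa"));
final Comparator<String> cmpContig = dict != null ? new ContigDictComparator(dict) : (A, B) -> A.compareTo(B);
// BED records can then be ordered contig -> start -> end exactly as cmpbed does above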