Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
From class ExpansionHunterMerge, method doWork:
@Override
public int doWork(List<String> args) {
SortingCollection<VariantContext> sorter = null;
try {
final List<Path> inputs = IOUtils.unrollPaths(args);
if (inputs.isEmpty()) {
LOG.info("no input file.");
return -1;
}
final String REPID = "REPID";
final String RU = "RU";
final String fakeSample = "___FAKE";
SAMSequenceDictionary dict = null;
final Set<String> samples = new TreeSet<>();
final Set<VCFHeaderLine> metaData = new HashSet<>();
for (final Path path : inputs) {
try (VCFReader r = VCFReaderFactory.makeDefault().open(path, false)) {
final VCFHeader header = r.getHeader();
if (header.getInfoHeaderLine(REPID) == null) {
LOG.error("missing INFO/" + REPID);
return -1;
}
final SAMSequenceDictionary d = header.getSequenceDictionary();
if (d != null) {
if (dict == null) {
dict = d;
} else {
SequenceUtil.assertSequenceDictionariesEqual(d, dict);
}
}
if (header.getNGenotypeSamples() != 1) {
LOG.error("expected one and only one genotyped sample in " + path);
return -1;
}
final String sn = header.getSampleNamesInOrder().get(0);
if (sn.equals(fakeSample)) {
LOG.error(sn + " cannot be named " + fakeSample + " in " + path);
return -1;
}
if (samples.contains(sn)) {
LOG.error("duplicate sample " + sn + " in " + path);
return -1;
}
metaData.addAll(header.getMetaDataInInputOrder());
samples.add(sn);
}
}
final VCFHeader tmpHeader = new VCFHeader(metaData, Arrays.asList(fakeSample));
final VCFInfoHeaderLine sampleInfo = new VCFInfoHeaderLine("SRCSAMPLE", 1, VCFHeaderLineType.String, "SRC SAMPLE");
final Comparator<String> ctgComparator = dict == null ? (A, B) -> A.compareTo(B) : new ContigDictComparator(dict);
final BiFunction<VariantContext, String, String> getAtt = (V, A) -> {
final String s1 = V.getAttributeAsString(A, "");
if (StringUtils.isBlank(s1))
throw new IllegalStateException("INFO/" + A + " missing in " + V);
return s1;
};
final Comparator<VariantContext> comparator1 = (V1, V2) -> {
String s1 = V1.getContig();
String s2 = V2.getContig();
int i = ctgComparator.compare(s1, s2);
if (i != 0)
return i;
i = Integer.compare(V1.getStart(), V2.getStart());
if (i != 0)
return i;
s1 = getAtt.apply(V1, REPID);
s2 = getAtt.apply(V2, REPID);
return s1.compareTo(s2);
};
final Comparator<VariantContext> comparator2 = (V1, V2) -> {
final int i = comparator1.compare(V1, V2);
if (i != 0)
return i;
final String s1 = getAtt.apply(V1, sampleInfo.getID());
final String s2 = getAtt.apply(V2, sampleInfo.getID());
return s1.compareTo(s2);
};
tmpHeader.addMetaDataLine(sampleInfo);
sorter = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(tmpHeader, false), comparator2, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sorter.setDestructiveIteration(true);
for (final Path path : inputs) {
LOG.info("adding " + path);
try (VCFReader r = VCFReaderFactory.makeDefault().open(path, false)) {
try (CloseableIterator<VariantContext> iter = r.iterator()) {
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
final Genotype gt0 = ctx.getGenotype(0);
final Genotype gt = new GenotypeBuilder(gt0).name(fakeSample).make();
vcb.genotypes(Collections.singletonList(gt));
vcb.attribute(sampleInfo.getID(), gt0.getSampleName());
sorter.add(vcb.make());
}
}
}
}
sorter.doneAdding();
final VCFHeader outputHeader = new VCFHeader(metaData, samples);
if (dict != null)
outputHeader.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, outputHeader);
try (VariantContextWriter w = writingVariants.dictionary(dict).open(this.outputFile)) {
w.writeHeader(outputHeader);
try (CloseableIterator<VariantContext> iter = sorter.iterator()) {
final EqualRangeIterator<VariantContext> eq = new EqualRangeIterator<>(iter, comparator1);
while (eq.hasNext()) {
final List<VariantContext> calls = eq.next();
final VariantContext first = calls.get(0);
final Set<Allele> altAllelesSet = calls.stream().flatMap(V -> V.getGenotypes().stream()).flatMap(GT -> GT.getAlleles().stream()).filter(A -> !(A.isReference() || A.isNoCall())).collect(Collectors.toSet());
if (altAllelesSet.isEmpty())
continue;
final List<Allele> altAllelesList = new ArrayList<>(altAllelesSet);
final List<Allele> vcAlleles = new ArrayList<>(altAllelesList.size() + 1);
vcAlleles.add(first.getReference());
vcAlleles.addAll(altAllelesList);
final List<Genotype> genotypes = new ArrayList<>(samples.size());
for (final String sn : samples) {
final VariantContext vcs = calls.stream().filter(V -> getAtt.apply(V, sampleInfo.getID()).equals(sn)).findFirst().orElse(null);
final Genotype gt;
if (vcs == null) {
gt = GenotypeBuilder.createMissing(sn, 2);
} else {
gt = new GenotypeBuilder(vcs.getGenotype(0)).name(sn).make();
}
genotypes.add(gt);
}
final VariantContextBuilder vcb = new VariantContextBuilder(null, first.getContig(), first.getStart(), first.getEnd(), vcAlleles);
vcb.attribute(VCFConstants.END_KEY, first.getEnd());
vcb.attribute(REPID, getAtt.apply(first, REPID));
vcb.attribute(RU, getAtt.apply(first, RU));
vcb.genotypes(genotypes);
w.add(vcb.make());
}
eq.close();
}
}
sorter.cleanup();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
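This is the core ContigDictComparator idiom: contig names are ordered by their index in the SAMSequenceDictionary, with a plain lexicographic fallback when no dictionary is available. A minimal sketch of the same idiom follows; the class and helper names are illustrative, and only ContigDictComparator and the htsjdk types come from the code above.

import java.util.Comparator;
import com.github.lindenb.jvarkit.util.samtools.ContigDictComparator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;

public class ContigOrderSketch {
    // order variants by dictionary contig order, then by start position;
    // fall back to lexicographic contig order when no dictionary is available
    static Comparator<VariantContext> variantComparator(final SAMSequenceDictionary dict) {
        final Comparator<String> ctgComparator = dict == null
                ? String::compareTo
                : new ContigDictComparator(dict);
        return Comparator.comparing(VariantContext::getContig, ctgComparator)
                .thenComparingInt(VariantContext::getStart);
    }
}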
Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
From class StarRetroCopy, method doWork:
@Override
public int doWork(final List<String> args) {
if (this.min_depth < 1) {
LOG.error("Bad min depth");
return -1;
}
PrintWriter saveInsertionsPw = null;
SamReader sr = null;
VariantContextWriter vcw0 = null;
CloseableIterator<SAMRecord> iter = null;
SAMFileWriter sfw = null;
try {
/* load the reference genome */
/* create a contig name converter from the REF */
// open the sam file
final String input = oneFileOrNull(args);
final SamReaderFactory srf = super.createSamReaderFactory();
if (this.faidx != null)
srf.referenceSequence(this.faidx);
if (input != null) {
sr = srf.open(SamInputResource.of(input));
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sr.getFileHeader());
/* READ KNOWGENES FILES */
loadGTF(dict);
if (this.use_bai && !sr.hasIndex()) {
LOG.warning("Cannot used bai because input is not indexed");
}
// if there is a bai, only query the bam in the regions of splicing
if (this.use_bai && sr.hasIndex()) {
LOG.info("building intervals...");
final QueryInterval[] intervals = this.intronTreeMap.keySet().stream().flatMap(intron -> {
// we need the reads overlapping the exon bounds
final int tid = dict.getSequenceIndex(intron.getContig());
final int extend = 1 + Math.max(0, this.minCigarSize);
final QueryInterval q1 = new QueryInterval(tid, Math.max(1, intron.getStart() - extend), intron.getStart() + extend);
final QueryInterval q2 = new QueryInterval(tid, Math.max(1, intron.getEnd() - extend), intron.getEnd() + extend);
return Arrays.stream(new QueryInterval[] { q1, q2 });
}).sorted().collect(HtsCollectors.optimizedQueryIntervals());
LOG.debug("Query bam using " + intervals.length + " random access intervals. Please wait...");
iter = sr.queryOverlapping(intervals);
} else {
iter = sr.iterator();
}
} else {
sr = srf.open(SamInputResource.of(stdin()));
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sr.getFileHeader());
/* READ GTF FILES */
loadGTF(dict);
iter = sr.iterator();
}
final SAMFileHeader samFileHeader = sr.getFileHeader();
final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(samFileHeader);
/* save gene writer */
if (this.saveBedPeTo != null) {
saveInsertionsPw = super.openPathOrStdoutAsPrintWriter(this.saveBedPeTo);
} else {
saveInsertionsPw = NullOuputStream.newPrintWriter();
}
if (this.saveBamTo != null) {
sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(samFileHeader, true, this.saveBamTo);
}
final String sample = samFileHeader.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse("SAMPLE");
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(samFileHeader).logger(LOG).build();
final String SAM_ATT_JI = "jI";
while (iter.hasNext()) {
final SAMRecord rec = progress.apply(iter.next());
if (rec.getReadUnmappedFlag())
continue;
if (rec.getMappingQuality() < this.min_read_mapq)
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
boolean save_read_to_bam = false;
/* save read if it is not properly mapped (problem with insert size) and it and its mate surround an intron */
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getReferenceIndex().equals(rec.getMateReferenceIndex()) && !rec.getProperPairFlag() && /* MUST NOT be proper pair */
rec.getReadNegativeStrandFlag() != rec.getMateNegativeStrandFlag()) {
final SimpleInterval intronInterval;
if (rec.getEnd() + 1 < rec.getMateAlignmentStart()) {
intronInterval = new SimpleInterval(rec.getContig(), rec.getEnd() + 1, rec.getMateAlignmentStart() - 1);
} else if (SAMUtils.hasMateCigar(rec) && SAMUtils.getMateAlignmentEnd(rec) + 1 < rec.getAlignmentStart()) {
intronInterval = new SimpleInterval(rec.getContig(), SAMUtils.getMateAlignmentEnd(rec) + 1, rec.getAlignmentStart() - 1);
} else {
intronInterval = null;
}
if (intronInterval != null) {
if (this.intronTreeMap.getOverlapping(intronInterval).stream().flatMap(L -> L.stream()).anyMatch(S -> intronInterval.contains(S))) {
save_read_to_bam = true;
}
}
}
/* WE use STAR DATA */
if (!this.use_cigar_string) {
if (!rec.hasAttribute(SAM_ATT_JI))
continue;
final Object tagValue = rec.getAttribute(SAM_ATT_JI);
paranoid.assertTrue((tagValue instanceof int[]));
final int[] bounds = (int[]) tagValue;
// jI:B:i,-1
if (bounds.length == 1 && bounds[0] < 0)
continue;
if (bounds.length % 2 != 0) {
LOG.warn("bound.length%2!=0 with " + rec.getSAMString());
continue;
}
for (int i = 0; i < bounds.length; i += 2) {
int intron_start = bounds[i];
int intron_end = bounds[i + 1];
final Interval r = new Interval(rec.getContig(), intron_start, intron_end);
// don't use an overlap query: with STAR these are exactly the intron boundaries
final List<Segment> introns = this.intronTreeMap.get(r);
if (introns == null)
continue;
save_read_to_bam = true;
for (final Segment intron : introns) {
intron.match++;
}
}
} else /* WE use other bam like bwa-mem */
{
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.numCigarElements() < 2)
continue;
int ref1 = rec.getAlignmentStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
final int ref_end = ref1 + (op.consumesReferenceBases() ? ce.getLength() : 0);
if (op.equals(CigarOperator.N) || op.equals(CigarOperator.D)) {
final Interval r = new Interval(rec.getContig(), ref1, ref_end - 1);
final List<Segment> introns = this.intronTreeMap.get(r);
if (introns == null)
continue;
save_read_to_bam = true;
for (final Segment intron : introns) {
intron.match++;
}
}
ref1 = ref_end;
}
/**
 * 2019-07-29. Using the SA:Z tag did not work well, so only the clipping is inspected.
 */
if (cigar.isClipped()) {
for (int side = 0; side < 2; side++) {
final Interval r;
if (side == 0 && cigar.isRightClipped() && cigar.getLastCigarElement().getLength() >= this.minCigarSize) {
r = new Interval(rec.getContig(), rec.getEnd() + 1, rec.getEnd() + 1 + this.minCigarSize);
} else if (side == 1 && cigar.isLeftClipped() && cigar.getFirstCigarElement().getLength() >= this.minCigarSize) {
r = new Interval(rec.getContig(), Math.max(1, rec.getStart() - (1 + this.minCigarSize)), Math.max(1, rec.getStart() - (1)));
} else {
continue;
}
final int final_side = side;
final List<Segment> introns = this.intronTreeMap.getOverlapping(r).stream().flatMap(L -> L.stream()).filter(SEG -> isWithinIntronBound(SEG, r, final_side)).collect(Collectors.toList());
if (introns.isEmpty())
continue;
// System.err.println("SA for "+r+" "+rec.getReadName()+" "+introns.size());
save_read_to_bam = true;
for (final Segment intron : introns) {
intron.match++;
}
}
}
}
if (save_read_to_bam) {
saveInsertion(saveInsertionsPw, rec);
if (sfw != null)
sfw.addAlignment(rec);
}
}
final ContigDictComparator contigCmp = new ContigDictComparator(refDict);
this.all_transcripts.removeIf(T -> T.segments.stream().noneMatch(S -> S.match >= min_depth));
final int max_introns = this.all_transcripts.stream().mapToInt(K -> K.segments.size()).max().orElse(1);
final List<String> intron_names = IntStream.range(0, max_introns).mapToObj(IDX -> String.format("%s_INTRON_%04d", sample, 1 + IDX)).collect(Collectors.toList());
/**
* build vcf header
*/
final Set<VCFHeaderLine> metaData = new HashSet<>();
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
metaData.add(new VCFInfoHeaderLine(att, 1, VCFHeaderLineType.String, "Value for the attribute '" + att + "' in the gtf"));
}
// metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
// metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
// "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
metaData.add(new VCFFormatHeaderLine(INTRON_START, 1, VCFHeaderLineType.Integer, "Intron start"));
metaData.add(new VCFFormatHeaderLine(INTRON_END, 1, VCFHeaderLineType.Integer, "Intron end"));
metaData.add(new VCFFilterHeaderLine(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold, "Number of reads is lower than " + this.low_depth_threshold));
metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
final VCFHeader header = new VCFHeader(metaData, intron_names);
JVarkitVersion.getInstance().addMetaData(this, header);
header.setSequenceDictionary(refDict);
/* open vcf for writing*/
vcw0 = super.openVariantContextWriter(this.outputFile);
final VariantContextWriter vcw = vcw0;
vcw.writeHeader(header);
Collections.sort(this.all_transcripts, (A, B) -> {
int i = contigCmp.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
});
final Allele ref = Allele.create((byte) 'N', true);
final Allele alt = Allele.create("<RETROCOPY>", false);
for (final Transcript kg : this.all_transcripts) {
// ok good candidate
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(kg.getContig());
vcb.start(kg.getStart());
vcb.stop(kg.getEnd());
vcb.id(kg.transcript_id);
final List<Allele> alleles = Arrays.asList(ref, alt);
final int max_depth = kg.segments.stream().mapToInt(X -> X.match).max().orElse(0);
vcb.attribute(VCFConstants.DEPTH_KEY, max_depth);
vcb.log10PError(max_depth / -10.0);
boolean filter_set = false;
if (max_depth < this.low_depth_threshold) {
vcb.filter(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold);
filter_set = true;
}
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, 2);
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, 1);
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5);
vcb.attribute(VCFConstants.SVTYPE, "DEL");
vcb.attribute(VCFConstants.END_KEY, kg.getEnd());
vcb.attribute("SVLEN", kg.getLengthOnReference());
for (final String att : kg.attributes.keySet()) {
vcb.attribute(att, VCFUtils.escapeInfoField(kg.attributes.get(att)));
}
vcb.alleles(alleles);
// introns sequences
vcb.attribute(ATT_INTRONS_CANDIDATE_COUNT, kg.segments.stream().filter(I -> I.match > 0).count());
vcb.attribute(ATT_INTRONS_COUNT, kg.segments.size());
vcb.attribute(ATT_INTRONS_CANDIDATE_FRACTION, kg.segments.stream().filter(I -> I.match > 0).count() / (float) kg.segments.size());
if (kg.segments.stream().filter(I -> I.match > 0).count() != kg.segments.size()) {
vcb.filter(ATT_NOT_ALL_INTRONS);
filter_set = true;
}
final List<Genotype> genotypes = new ArrayList<>(kg.segments.size());
/* build genotypes */
for (int i = 0; i < kg.segments.size(); i++) {
final Segment intron = kg.segments.get(i);
final GenotypeBuilder gb = new GenotypeBuilder(intron_names.get(i), Arrays.asList(ref, alt));
gb.DP(intron.match);
gb.attribute(INTRON_START, intron.start);
gb.attribute(INTRON_END, intron.end);
genotypes.add(gb.make());
}
vcb.genotypes(genotypes);
if (!filter_set) {
vcb.passFilters();
}
vcw.add(vcb.make());
}
progress.close();
vcw.close();
iter.close();
iter = null;
sr.close();
sr = null;
saveInsertionsPw.flush();
saveInsertionsPw.close();
saveInsertionsPw = null;
if (sfw != null) {
sfw.close();
sfw = null;
}
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(sr);
CloserUtil.close(vcw0);
CloserUtil.close(sfw);
CloserUtil.close(saveInsertionsPw);
}
}
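The transcript sort above orders records by contig (in dictionary order), then start, then end. That is the ordering ContigDictComparator.createLocatableComparator() provides for any htsjdk Locatable, as other examples on this page use directly; here is a sketch under that assumption, where the generic helper is illustrative and not part of jvarkit.

import java.util.List;
import com.github.lindenb.jvarkit.util.samtools.ContigDictComparator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;

public class ReferenceOrderSketch {
    // sort any Locatable records in reference order: contig by dictionary index,
    // then start, then end (the assumed tie-breakers of createLocatableComparator(),
    // matching the explicit comparator written out above)
    static <T extends Locatable> void sortByDictionary(final List<T> records, final SAMSequenceDictionary dict) {
        records.sort(new ContigDictComparator(dict).createLocatableComparator());
    }
}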
Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
From class FindNewSpliceSites, method doWork:
@Override
public int doWork(final List<String> args) {
SamReader sfr = null;
PrintWriter bedWriter = null;
SortingCollection<Junction> junctionSorter = null;
try {
final SamReaderFactory srf = super.createSamReaderFactory();
if (this.faidx != null) {
srf.referenceSequence(this.faidx);
}
final String input = oneFileOrNull(args);
sfr = input == null ? srf.open(SamInputResource.of(stdin())) : srf.open(SamInputResource.of(input));
final SAMFileHeader header0 = sfr.getFileHeader();
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
final SAMSequenceDictionary dict = header0.getSequenceDictionary();
if (dict != null)
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.getExonCount() > 1).flatMap(T -> T.getIntrons().stream()).map(T -> T.toInterval()).forEach(T -> {
this.intronMap.put(T, T);
});
}
final SAMFileHeader header1 = header0.clone();
final SAMProgramRecord p = header1.createProgramRecord();
p.setCommandLine(getProgramCommandLine());
p.setProgramVersion(getVersion());
p.setProgramName(getProgramName());
this.sfw = this.writingBamArgs.openSamWriter(outputFile, header1, true);
final SAMFileHeader header2 = header0.clone();
final SAMProgramRecord p2 = header2.createProgramRecord();
p2.setCommandLine(getProgramCommandLine());
p2.setProgramVersion(getVersion());
p2.setProgramName(getProgramName());
this.weird = this.writingBamArgs.createSAMFileWriterFactory().makeSAMWriter(header2, true, new NullOuputStream());
if (this.bedOut != null) {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sfr.getFileHeader());
this.junctionComparator = new ContigDictComparator(dict).createLocatableComparator();
junctionSorter = SortingCollection.newInstance(Junction.class, new JunctionCodec(), (A, B) -> A.compare2(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
}
scan(sfr, p, p2, junctionSorter);
sfr.close();
if (this.bedOut != null) {
junctionSorter.doneAdding();
bedWriter = super.openPathOrStdoutAsPrintWriter(this.bedOut);
final String sample = StringUtils.ifBlank(header0.getReadGroups().stream().map(RG -> RG.getSample()).filter(s -> !StringUtils.isBlank(s)).collect(Collectors.toCollection(TreeSet::new)).stream().collect(Collectors.joining(";")), ".");
try (CloseableIterator<Junction> iter = junctionSorter.iterator()) {
final EqualRangeIterator<Junction> eq = new EqualRangeIterator<>(iter, (A, B) -> A.compare1(B));
while (eq.hasNext()) {
final List<Junction> row = eq.next();
final Junction first = row.get(0);
bedWriter.print(first.getContig());
bedWriter.print('\t');
bedWriter.print(first.getStart() - 1);
bedWriter.print('\t');
bedWriter.print(first.getEnd());
bedWriter.print('\t');
bedWriter.print(sample);
bedWriter.print('\t');
bedWriter.print(first.name);
bedWriter.print('\t');
bedWriter.print(row.size());
bedWriter.println();
}
eq.close();
}
bedWriter.flush();
bedWriter.close();
bedWriter = null;
junctionSorter.cleanup();
}
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(sfr);
CloserUtil.close(this.sfw);
CloserUtil.close(this.weird);
CloserUtil.close(bedWriter);
}
}
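FindNewSpliceSites pairs the dictionary-aware sort with jvarkit's EqualRangeIterator to emit one BED row per distinct junction. The sort-then-group pattern in isolation, as a sketch: the helper and its Consumer parameter are illustrative, and the input iterator must already be sorted with the same comparator.

import java.util.Comparator;
import java.util.List;
import java.util.function.Consumer;
import com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator;
import com.github.lindenb.jvarkit.util.samtools.ContigDictComparator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.Locatable;

public class SortThenGroupSketch {
    // hand each run of records that compare equal (same contig/start/end under
    // the dictionary comparator) to 'consumer'
    static <T extends Locatable> void forEachGroup(final CloseableIterator<T> sorted,
            final SAMSequenceDictionary dict, final Consumer<List<T>> consumer) {
        final Comparator<Locatable> locCmp = new ContigDictComparator(dict).createLocatableComparator();
        // adapt Comparator<Locatable> to the record type T
        final Comparator<T> cmp = locCmp::compare;
        final EqualRangeIterator<T> eq = new EqualRangeIterator<>(sorted, cmp);
        while (eq.hasNext()) {
            consumer.accept(eq.next());
        }
        eq.close();
    }
}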
Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
From class RNASeqPolyA, method doWork:
@Override
public int doWork(final List<String> args) {
final Map<String, GeneInfo> geneToTrancripts = new HashMap<>();
try {
final String debugTranscript = dynaParams.getOrDefault("debug.transcript", "");
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.faidx);
if (!StringUtils.isBlank(this.limit_contig) && dict.getSequence(this.limit_contig) == null) {
throw new JvarkitException.ContigNotFoundInDictionary(this.limit_contig, dict);
}
final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
final Gff3Codec gff3 = new Gff3Codec(Gff3Codec.DecodeDepth.DEEP);
try (InputStream is = IOUtils.openPathForReading(this.gffPath)) {
final AsciiLineReader asciiLineReader = AsciiLineReader.from(is);
final LineIterator lr = new LineIteratorImpl(asciiLineReader);
while (!gff3.isDone(lr)) {
decodeGff3Feature(gff3.decode(lr), converter, geneToTrancripts);
}
gff3.close(lr);
asciiLineReader.close();
}
if (geneToTrancripts.isEmpty()) {
LOG.warn("no transcript was found.");
// continue: an empty VCF must still be produced
}
final IntervalTreeMap<LastExon> exonMap = new IntervalTreeMap<>();
// fill exonMap
geneToTrancripts.values().stream().flatMap(K -> K.transcripts.values().stream()).forEach(X -> exonMap.put(new Interval(X), X));
final ToIntFunction<String> toTid = C -> {
final SAMSequenceRecord ssr = dict.getSequence(C);
if (ssr == null)
throw new JvarkitException.ContigNotFoundInDictionary(C, dict);
return ssr.getSequenceIndex();
};
final List<Path> inputs = IOUtils.unrollPaths(args);
final QueryInterval[] intervals = this.disable_bam_index || inputs.isEmpty() ? null : QueryInterval.optimizeIntervals(exonMap.values().stream().map(R -> new QueryInterval(toTid.applyAsInt(R.getContig()), R.getPosition(), R.getPosition())).toArray(N -> new QueryInterval[N]));
final SamReaderFactory srf = super.createSamReaderFactory().referenceSequence(this.faidx);
final String primerAAA;
if (polyA_primer_size > 0) {
primerAAA = StringUtils.repeat(this.polyA_primer_size, 'A');
} else {
primerAAA = null;
}
final Set<String> samples = new HashSet<>();
int bam_index = 0;
// loop over the bams
for (; ; ) {
final Path bamFilename = inputs.isEmpty() ? null : inputs.get(bam_index);
try (SamReader sr = inputs.isEmpty() ? srf.open(SamInputResource.of(stdin())) : srf.open(bamFilename)) {
final SAMFileHeader header0 = sr.getFileHeader();
SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header0));
final String sample = header0.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(bamFilename == null ? "STDIN" : IOUtils.getFilenameWithoutCommonSuffixes(bamFilename));
if (samples.contains(sample)) {
LOG.error("duplicate sample " + sample);
return -1;
}
samples.add(sample);
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).build();
try (CloseableIterator<SAMRecord> iter = (intervals == null || inputs.isEmpty() ? /*stdin*/
sr.iterator() : sr.query(intervals, false))) {
while (iter.hasNext()) {
final SAMRecord rec = progress.apply(iter.next());
if (rec.getReadUnmappedFlag())
continue;
if (!StringUtils.isBlank(this.limit_contig) && !rec.getContig().equals(this.limit_contig))
continue;
if (this.default_read_filter && !SAMRecordDefaultFilter.accept(rec))
continue;
final Collection<LastExon> lastExons = exonMap.getOverlapping(rec);
if (lastExons.isEmpty())
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
final byte[] bases = rec.getReadBases();
if (bases == null || SAMRecord.NULL_SEQUENCE.equals(bases))
continue;
for (LastExon exon : lastExons) {
if (!StringUtils.isBlank(debugTranscript) && !exon.transcriptId.equals(debugTranscript)) {
continue;
}
ExonCount count = exon.sample2count.get(sample);
if (count == null) {
count = new ExonCount();
exon.sample2count.put(sample, count);
}
final StringBuilder sb = new StringBuilder();
boolean indel_flag = false;
boolean last_exon_in_intron_flag = false;
boolean match_last_base = false;
int ref1 = rec.getUnclippedStart();
int read0 = 0;
for (CigarElement ce : cigar) {
if (exon.isMinusStrand() && ref1 > exon.start)
break;
if (this.ignore_with_indels && indel_flag)
break;
final CigarOperator op = ce.getOperator();
switch(op) {
case P:
break;
case I:
indel_flag = true;
for (int i = 0; i < ce.getLength(); i++) {
if (exon.isAfterExon(ref1)) {
sb.append((char) Character.toUpperCase(bases[read0]));
}
read0++;
}
break;
case D:
case N:
if ((exon.isPlusStrand() && CoordMath.overlaps(ref1, ref1 + ce.getLength() - 1, exon.getEnd(), exon.getEnd() + 1)) || (exon.isMinusStrand() && CoordMath.overlaps(ref1, ref1 + ce.getLength() - 1, exon.getStart() - 1, exon.getStart()))) {
last_exon_in_intron_flag = true;
}
ref1 += ce.getLength();
indel_flag = true;
break;
case H:
for (int i = 0; i < ce.getLength(); i++) {
if (exon.isAfterExon(ref1)) {
sb.append('N');
}
if (ref1 == exon.getPosition())
match_last_base = true;
ref1++;
}
break;
case S:
case M:
case X:
case EQ:
for (int i = 0; i < ce.getLength(); i++) {
if (exon.isAfterExon(ref1)) {
sb.append((char) Character.toUpperCase(bases[read0]));
}
if (ref1 == exon.getPosition())
match_last_base = true;
read0++;
ref1++;
}
break;
default:
throw new IllegalStateException(op.name());
}
}
// premature end or start
if (!match_last_base || (exon.isPlusStrand() && ref1 < exon.getEnd()) || (exon.isMinusStrand() && ref1 < exon.getStart()) || (this.ignore_with_indels && indel_flag) || last_exon_in_intron_flag) {
continue;
}
// if(read0!=bases.length) throw new IllegalStateException("read0:"+read0+" expected "+bases.length+" in "+rec.getReadName());
// if(ref1!=1+rec.getUnclippedEnd())throw new IllegalStateException("ref1:"+ref1+" expected 1+"+rec.getUnclippedEnd()+" in "+rec.getReadName());
++count.n_tested_reads;
String polyA;
if (exon.isMinusStrand()) {
polyA = AcidNucleics.reverseComplement(sb);
} else {
polyA = sb.toString();
}
if (primerAAA != null) {
final int pos = polyA.indexOf(primerAAA);
if (pos > 0)
polyA = polyA.substring(pos);
}
int count_polyA = 0;
for (int i = 0; i < polyA.length(); i++) {
if (polyA.charAt(i) != 'A')
break;
count_polyA++;
}
if (count_polyA > 0) {
count.n_tested_reads_with_A++;
count.sum_polyA += count_polyA;
}
if (count_polyA > count.max_length_polyA) {
count.max_length_polyA = count_polyA;
}
}
// end of loop over last exons
}
progress.close();
}
}
++bam_index;
if (inputs.isEmpty() || bam_index >= inputs.size())
break;
}
final Set<VCFHeaderLine> metaData = new HashSet<>();
final VCFInfoHeaderLine infoGeneId = new VCFInfoHeaderLine("GENE", 1, VCFHeaderLineType.String, "Gene ID in " + this.gffPath);
metaData.add(infoGeneId);
final VCFInfoHeaderLine infoTranscriptId = new VCFInfoHeaderLine("TRANSCRIPT", 1, VCFHeaderLineType.String, "Transcript ID in " + this.gffPath);
metaData.add(infoTranscriptId);
final VCFInfoHeaderLine infoStrand = new VCFInfoHeaderLine("STRAND", 1, VCFHeaderLineType.String, "Strand");
metaData.add(infoStrand);
final VCFInfoHeaderLine infoTranscriptMaxPolyA = new VCFInfoHeaderLine("TRANSCRIPT_MAX", 1, VCFHeaderLineType.Integer, "Max poly A in Transcript");
metaData.add(infoTranscriptMaxPolyA);
final VCFInfoHeaderLine infoGeneMaxPolyA = new VCFInfoHeaderLine("GENE_MAX", 1, VCFHeaderLineType.Integer, "Max poly A in Gene");
metaData.add(infoGeneMaxPolyA);
final VCFInfoHeaderLine infoEndPos = new VCFInfoHeaderLine("POS3", 1, VCFHeaderLineType.Integer, "End 3 prime position");
metaData.add(infoEndPos);
final VCFInfoHeaderLine infoGeneName = new VCFInfoHeaderLine("GENE_NAME", 1, VCFHeaderLineType.String, "Gene Name");
metaData.add(infoGeneName);
final VCFInfoHeaderLine infoBiotype = new VCFInfoHeaderLine("GENE_BIOTYPE", 1, VCFHeaderLineType.String, "Gene Biotype");
metaData.add(infoBiotype);
final int n_last_exon_bases = Math.max(0, Integer.parseInt(dynaParams.getOrDefault("last.n.exons", "10")));
final VCFInfoHeaderLine infoLastExonBases = new VCFInfoHeaderLine("LAST_BASES", 1, VCFHeaderLineType.String, "Last exon bases N=" + n_last_exon_bases + ". Reverse-complemented for negative strand.");
metaData.add(infoLastExonBases);
final int n_after_exon_bases = Math.max(0, Integer.parseInt(dynaParams.getOrDefault("after.n.exons", "10")));
final VCFInfoHeaderLine infoAfterExonBases = new VCFInfoHeaderLine("AFTER_BASES", 1, VCFHeaderLineType.String, "Bases after exon N=" + n_after_exon_bases + ". Reverse-complemented for negative strand.");
metaData.add(infoAfterExonBases);
final VCFInfoHeaderLine infoOtherIdss = new VCFInfoHeaderLine("OTHER_IDS", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Other transcripts ending at the same coordinate.");
metaData.add(infoOtherIdss);
final VCFFormatHeaderLine fmtMaxPolyA = new VCFFormatHeaderLine("MAX", 1, VCFHeaderLineType.Integer, "Max poly A");
metaData.add(fmtMaxPolyA);
final VCFFormatHeaderLine fmtReadPolyA = new VCFFormatHeaderLine("DPA", 1, VCFHeaderLineType.Integer, "Read with at least one A");
metaData.add(fmtReadPolyA);
final VCFFormatHeaderLine fmtAveragePolyA = new VCFFormatHeaderLine("AVG", 1, VCFHeaderLineType.Float, "average length of poly-A for reads carrying at least one A.");
metaData.add(fmtAveragePolyA);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.END_KEY);
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY);
final VCFHeader header = new VCFHeader(metaData, samples.stream().sorted().collect(Collectors.toList()));
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
final UnaryOperator<String> afterColon = S -> {
if (!(S.startsWith("gene:") || S.startsWith("transcript:")))
return S;
int colon = S.indexOf(":");
return S.substring(colon + 1);
};
final List<Allele> ALLELES = Collections.singletonList(Allele.create("N", true));
try (VariantContextWriter w = writingVariantsDelegate.dictionary(dict).open(this.outputFile);
ReferenceSequenceFile fai = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx)) {
w.writeHeader(header);
exonMap.values().stream().sorted(new ContigDictComparator(dict).createLocatableComparator()).forEach(T -> {
if (T.getDP() == 0)
return;
final String lastBases;
final String afterBases;
if (T.isPlusStrand()) {
lastBases = fai.getSubsequenceAt(T.getContig(), Math.max(T.getStart(), T.getEnd() - n_last_exon_bases), T.getEnd()).getBaseString();
final SAMSequenceRecord ssr = Objects.requireNonNull(dict.getSequence(T.getContig()));
afterBases = fai.getSubsequenceAt(T.getContig(), T.getEnd() + 1, Math.min(T.getEnd() + n_after_exon_bases, ssr.getLengthOnReference())).getBaseString();
} else if (T.isMinusStrand()) {
lastBases = AcidNucleics.reverseComplement(fai.getSubsequenceAt(T.getContig(), T.getStart(), Math.min(T.getEnd(), T.getStart() + n_last_exon_bases)).getBaseString());
afterBases = AcidNucleics.reverseComplement(fai.getSubsequenceAt(T.getContig(), Math.max(1, T.getStart() - n_after_exon_bases), T.getStart() - 1).getBaseString());
} else {
lastBases = null;
afterBases = null;
}
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(T.getContig());
vcb.start(T.getStart());
vcb.stop(T.getEnd());
vcb.id(afterColon.apply(T.transcriptId));
vcb.attribute(VCFConstants.END_KEY, T.getEnd());
vcb.attribute(infoGeneId.getID(), afterColon.apply(T.gene.geneId));
vcb.attribute(infoTranscriptId.getID(), afterColon.apply(T.transcriptId));
vcb.attribute(infoStrand.getID(), T.strand.name());
vcb.attribute(infoEndPos.getID(), T.getPosition());
if (T.otherIds != null && !T.otherIds.isEmpty()) {
vcb.attribute(infoOtherIdss.getID(), T.otherIds.stream().map(afterColon).collect(Collectors.toList()));
}
if (!StringUtils.isBlank(lastBases)) {
vcb.attribute(infoLastExonBases.getID(), lastBases);
}
if (!StringUtils.isBlank(afterBases)) {
vcb.attribute(infoAfterExonBases.getID(), afterBases);
}
if (!StringUtils.isBlank(T.gene.geneName)) {
vcb.attribute(infoGeneName.getID(), T.gene.geneName);
}
if (!StringUtils.isBlank(T.gene.biotype)) {
vcb.attribute(infoBiotype.getID(), T.gene.biotype);
}
final List<Genotype> genotypes = new ArrayList<>(samples.size());
for (String sn : samples) {
final ExonCount count = T.sample2count.get(sn);
final GenotypeBuilder gb = new GenotypeBuilder(sn);
gb.attribute(fmtMaxPolyA.getID(), count == null ? 0 : count.max_length_polyA);
gb.attribute(fmtReadPolyA.getID(), count == null ? 0 : count.n_tested_reads_with_A);
gb.attribute(fmtAveragePolyA.getID(), count == null || count.n_tested_reads_with_A == 0 ? 0f : count.sum_polyA / (float) count.n_tested_reads_with_A);
gb.DP(count == null ? 0 : count.n_tested_reads);
genotypes.add(gb.make());
}
vcb.alleles(ALLELES);
vcb.genotypes(genotypes);
vcb.attribute(VCFConstants.DEPTH_KEY, T.getDP());
final int score = T.getMaxPolyA();
if (score > 0)
vcb.log10PError(score / -10.0);
vcb.attribute(infoTranscriptMaxPolyA.getID(), score);
vcb.attribute(infoGeneMaxPolyA.getID(), T.gene.getmaxPolyA());
w.add(vcb.make());
});
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
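RNASeqPolyA streams the exon records through createLocatableComparator() before writing, because a VariantContextWriter opened with a dictionary expects records in reference order. The stream idiom on its own, as a sketch; Interval stands in for the project's record type.

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.github.lindenb.jvarkit.util.samtools.ContigDictComparator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Interval;

public class StreamOrderSketch {
    // collect intervals in reference (dictionary) order before writing them out;
    // Interval stands in for any Locatable record
    static List<Interval> inReferenceOrder(final Stream<Interval> intervals, final SAMSequenceDictionary dict) {
        return intervals
                .sorted(new ContigDictComparator(dict).createLocatableComparator())
                .collect(Collectors.toList());
    }
}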
Use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
From class CopyNumber01, method doWork:
@Override
public int doWork(final List<String> args) {
ReferenceSequenceFile indexedFastaSequenceFile = null;
try {
this.sexContigSet.addAll(Arrays.stream(this.sexContigStr.split("[, \t]")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
/* loading REF Reference */
indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(indexedFastaSequenceFile);
final Comparator<Locatable> locComparator = new ContigDictComparator(dict).createLocatableComparator();
final List<Locatable> intervals = new ArrayList<>();
if (this.bedFile == null) {
for (final Locatable loc : dict.getSequences()) {
intervals.add(loc);
}
} else {
final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
try (BedLineReader br = new BedLineReader(this.bedFile)) {
br.stream().filter(L -> !StringUtil.isBlank(converter.apply(L.getContig()))).forEach(B -> {
final String ctg = converter.apply(B.getContig());
intervals.add(new SimpleInterval(ctg, B.getStart(), B.getEnd()));
});
}
}
if (intervals.isEmpty()) {
LOG.error("no interval defined.");
return -1;
}
LOG.info("intervals N=" + intervals.size() + " mean-size:" + intervals.stream().mapToInt(R -> R.getLengthOnReference()).average().orElse(0.0));
final List<GCAndDepth> user_items = new ArrayList<>();
// split intervals
for (final Locatable loc : intervals) {
int pos = loc.getStart();
while (pos < loc.getEnd()) {
final int pos_end = Math.min(pos + this.windowSize, loc.getEnd());
final GCAndDepth dataRow = new GCAndDepth(loc.getContig(), pos, pos_end);
if (dataRow.getLengthOnReference() < this.windowMin) {
break;
}
user_items.add(dataRow);
pos += this.windowShift;
}
}
// free memory
intervals.clear();
LOG.info("sorting N=" + user_items.size());
Collections.sort(user_items, locComparator);
// fill gc percent
LOG.info("fill gc% N=" + user_items.size());
for (final String ctg : user_items.stream().map(T -> T.getContig()).collect(Collectors.toSet())) {
final GenomicSequence gseq = new GenomicSequence(indexedFastaSequenceFile, ctg);
for (final GCAndDepth dataRow : user_items) {
if (!dataRow.getContig().equals(ctg))
continue;
final GCPercent gc = gseq.getGCPercent(dataRow.getStart(), dataRow.getEnd());
if (gc.isEmpty())
continue;
dataRow.gc = gc.getGCPercent();
}
}
// remove strange gc
user_items.removeIf(B -> B.gc < this.minGC);
user_items.removeIf(B -> B.gc > this.maxGC);
LOG.info("remove high/low gc% N=" + user_items.size());
if (user_items.stream().allMatch(P -> isSex(P.getContig()))) {
LOG.error("All chromosomes are defined as sexual. Cannot normalize");
return -1;
}
final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mappingQuality);
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
/* header */
pw.println("#CHROM\tSTART\tEND\tSample\tIDX\tGC\tRAW-DEPTH\tNORM-DEPTH");
for (final Path bamPath : IOUtils.unrollPaths(args)) {
// open this samReader
try (SamReader samReader = super.createSamReaderFactory().referenceSequence(this.refFile).open(bamPath)) {
if (!samReader.hasIndex()) {
LOG.error("file is not indexed " + bamPath);
return -1;
}
final SAMFileHeader header = samReader.getFileHeader();
SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
final String sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toSet()).stream().collect(HtsCollectors.toSingleton());
final List<GCAndDepth> bam_items = new ArrayList<>(user_items.size());
/* loop over contigs */
for (final SAMSequenceRecord ssr : dict.getSequences()) {
/* create a **COPY** of the intervals */
final List<GCAndDepth> ctgitems = user_items.stream().filter(T -> T.contigsMatch(ssr)).collect(Collectors.toList());
if (ctgitems.isEmpty())
continue;
LOG.info("Getting coverage for " + ssr.getSequenceName() + " N=" + ctgitems.size());
// get coverage
final CoverageFactory.SimpleCoverage coverage = coverageFactory.getSimpleCoverage(samReader, ctgitems, sampleName);
// fill coverage
for (final GCAndDepth gc : ctgitems) {
final OptionalDouble optCov;
switch(this.univariateDepth) {
case median:
optCov = coverage.getMedian(gc);
break;
case mean:
optCov = coverage.getAverage(gc);
break;
default:
throw new IllegalStateException();
}
gc.raw_depth = optCov.orElse(-1.0);
gc.norm_depth = gc.raw_depth;
}
ctgitems.removeIf(V -> V.raw_depth < 0);
ctgitems.removeIf(V -> V.raw_depth > this.weirdMaxDepth);
ctgitems.removeIf(V -> V.raw_depth < this.weirdMinDepth);
if (ctgitems.isEmpty())
continue;
bam_items.addAll(ctgitems);
}
double[] y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.raw_depth).toArray();
LOG.info("median raw depth " + new Median().evaluate(y, 0, y.length));
Collections.sort(bam_items, (a, b) -> {
final int i = Double.compare(a.getX(), b.getX());
if (i != 0)
return i;
return Double.compare(a.getY(), b.getY());
});
double[] x = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getX()).toArray();
y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getY()).toArray();
// get min GC
final double min_x = x[0];
// get max GC
final double max_x = x[x.length - 1];
LOG.info("min/max gc " + min_x + " " + max_x);
/* merge adjacent x values (GC%) that are equal */
int i = 0;
int k = 0;
while (i < x.length) {
int j = i + 1;
while (j < x.length && Precision.equals(x[i], x[j])) {
++j;
}
x[k] = x[i];
y[k] = this.univariateGCLoess.create().evaluate(y, i, j - i);
++k;
i = j;
}
LOG.info("merged n=" + (x.length - k) + " items.");
/* reduce the size of x and y */
final List<XY> xyL = new ArrayList<>(k);
for (int t = 0; t < k; ++t) {
xyL.add(new XYImpl(x[t], y[t]));
}
/* sort on X, then Y */
Collections.sort(xyL, (a, b) -> {
final int d = Double.compare(a.getX(), b.getX());
if (d != 0)
return d;
return Double.compare(a.getY(), b.getY());
});
x = xyL.stream().mapToDouble(R -> R.getX()).toArray();
y = xyL.stream().mapToDouble(R -> R.getY()).toArray();
final UnivariateInterpolator interpolator = createInterpolator();
UnivariateFunction spline = null;
try {
spline = interpolator.interpolate(x, y);
} catch (final org.apache.commons.math3.exception.NumberIsTooSmallException err) {
spline = null;
LOG.error("Cannot use " + interpolator.getClass().getName() + ":" + err.getMessage());
}
// calibrate depth on GC%: drop points outside the spline's GC range, normalize the rest
int points_removed = 0;
i = 0;
while (i < bam_items.size()) {
final GCAndDepth r = bam_items.get(i);
if (spline == null) {
++i;
} else if (r.getX() < min_x || r.getX() > max_x) {
bam_items.remove(i);
++points_removed;
} else {
final double norm;
if (this.gcDepthInterpolation.equals(UnivariateInerpolation.identity)) {
norm = r.getY();
} else {
norm = spline.value(r.getX());
}
if (Double.isNaN(norm) || Double.isInfinite(norm)) {
LOG.info("NAN " + r);
bam_items.remove(i);
++points_removed;
continue;
}
r.norm_depth = norm;
++i;
}
}
LOG.info("removed " + points_removed + ". now N=" + bam_items.size());
if (bam_items.isEmpty())
continue;
spline = null;
// DO NOT NORMALIZE ON MINIMUM DEPTH, STUPID.
// normalize on median
y = bam_items.stream().mapToDouble(G -> G.getY()).toArray();
final double median_depth = this.univariateMid.create().evaluate(y, 0, y.length);
LOG.info("median norm depth : " + median_depth);
for (i = 0; median_depth > 0 && i < bam_items.size(); ++i) {
final GCAndDepth gc = bam_items.get(i);
gc.norm_depth /= median_depth;
}
// restore genomic order
Collections.sort(bam_items, locComparator);
// smoothing values with neighbours
y = bam_items.stream().mapToDouble(V -> V.getY()).toArray();
for (i = 0; i < bam_items.size(); ++i) {
final GCAndDepth gc = bam_items.get(i);
int left = i;
for (int j = Math.max(0, i - this.smooth_window); j <= i; ++j) {
final GCAndDepth gc2 = bam_items.get(j);
if (!gc2.withinDistanceOf(gc, this.smoothDistance))
continue;
left = j;
break;
}
int right = i;
for (int j = i; j <= i + this.smooth_window && j < bam_items.size(); ++j) {
final GCAndDepth gc2 = bam_items.get(j);
if (!gc2.withinDistanceOf(gc, this.smoothDistance))
break;
right = j;
// no break;
}
gc.norm_depth = this.univariateSmooth.create().evaluate(y, left, (right - left) + 1);
}
/* print data */
for (final GCAndDepth r : bam_items) {
pw.print(r.getContig());
pw.print('\t');
pw.print(r.getStart() - 1);
pw.print('\t');
pw.print(r.getEnd());
pw.print('\t');
pw.print(sampleName);
pw.print('\t');
pw.print(getGenomicIndex(dict, r.getContig(), r.getStart()) - 1);
pw.print('\t');
pw.printf("%.3f", r.gc);
pw.print('\t');
pw.printf("%.3f", r.raw_depth);
pw.print('\t');
pw.printf("%.3f", r.norm_depth);
pw.println();
}
pw.flush();
}
// samReader
}
// end loop bamPath
pw.flush();
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(indexedFastaSequenceFile);
}
}
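The getGenomicIndex call that feeds the IDX column above is not shown in this excerpt. A hypothetical re-implementation that is consistent with how it is used, written as an assumption rather than the project's actual code:

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;

public class GenomicIndexSketch {
    // hypothetical genome-wide coordinate: the summed lengths of all contigs that
    // precede 'contig' in the dictionary, plus the 1-based position on that contig
    static long genomicIndex(final SAMSequenceDictionary dict, final String contig, final int pos) {
        long offset = 0L;
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            if (ssr.getSequenceName().equals(contig)) {
                return offset + pos;
            }
            offset += ssr.getSequenceLength();
        }
        throw new IllegalArgumentException("contig not in dictionary: " + contig);
    }
}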