use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class StarRetroCopy method doWork.
@Override
public int doWork(final List<String> args) {
if (this.min_depth < 1) {
LOG.error("Bad min depth");
return -1;
}
PrintWriter saveInsertionsPw = null;
SamReader sr = null;
VariantContextWriter vcw0 = null;
CloseableIterator<SAMRecord> iter = null;
SAMFileWriter sfw = null;
try {
/* load the reference genome */
/* create a contig name converter from the REF */
// open the sam file
final String input = oneFileOrNull(args);
final SamReaderFactory srf = super.createSamReaderFactory();
if (this.faidx != null)
srf.referenceSequence(this.faidx);
if (input != null) {
sr = srf.open(SamInputResource.of(input));
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sr.getFileHeader());
/* READ KNOWN-GENES / GTF FILE */
loadGTF(dict);
if (this.use_bai && !sr.hasIndex()) {
LOG.warning("Cannot used bai because input is not indexed");
}
// if there is a BAI index, query the bam only over the splice regions
if (this.use_bai && sr.hasIndex()) {
LOG.info("building intervals...");
final QueryInterval[] intervals = this.intronTreeMap.keySet().stream().flatMap(intron -> {
// we need the reads overlapping the intron boundaries (the exon/intron junctions)
final int tid = dict.getSequenceIndex(intron.getContig());
final int extend = 1 + Math.max(0, this.minCigarSize);
final QueryInterval q1 = new QueryInterval(tid, Math.max(1, intron.getStart() - extend), intron.getStart() + extend);
final QueryInterval q2 = new QueryInterval(tid, Math.max(1, intron.getEnd() - extend), intron.getEnd() + extend);
return Arrays.stream(new QueryInterval[] { q1, q2 });
}).sorted().collect(HtsCollectors.optimizedQueryIntervals());
LOG.debug("Query bam using " + intervals.length + " random access intervals. Please wait...");
iter = sr.queryOverlapping(intervals);
} else {
iter = sr.iterator();
}
} else {
sr = srf.open(SamInputResource.of(stdin()));
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(sr.getFileHeader());
/* READ GTF FILES */
loadGTF(dict);
iter = sr.iterator();
}
final SAMFileHeader samFileHeader = sr.getFileHeader();
final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(samFileHeader);
/* open the writer for saved insertions (BEDPE) */
if (this.saveBedPeTo != null) {
saveInsertionsPw = super.openPathOrStdoutAsPrintWriter(this.saveBedPeTo);
} else {
saveInsertionsPw = NullOuputStream.newPrintWriter();
}
if (this.saveBamTo != null) {
sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(samFileHeader, true, this.saveBamTo);
}
final String sample = samFileHeader.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse("SAMPLE");
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(samFileHeader).logger(LOG).build();
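// "jI" is written by STAR (run with '--outSAMattributes ... jI'): an int array of 1-based
// (start,end) pairs for the introns spanned by the read, or the single value -1 when none
// (hence the jI:B:i,-1 check below).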
final String SAM_ATT_JI = "jI";
while (iter.hasNext()) {
final SAMRecord rec = progress.apply(iter.next());
if (rec.getReadUnmappedFlag())
continue;
if (rec.getMappingQuality() < this.min_read_mapq)
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
boolean save_read_to_bam = false;
/* save the read if it is not properly paired (insert-size problem) and it and its mate surround an intron */
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getReferenceIndex().equals(rec.getMateReferenceIndex()) && !rec.getProperPairFlag() && /* MUST NOT be proper pair */
rec.getReadNegativeStrandFlag() != rec.getMateNegativeStrandFlag()) {
final SimpleInterval intronInterval;
if (rec.getEnd() + 1 < rec.getMateAlignmentStart()) {
intronInterval = new SimpleInterval(rec.getContig(), rec.getEnd() + 1, rec.getMateAlignmentStart() - 1);
} else if (SAMUtils.hasMateCigar(rec) && SAMUtils.getMateAlignmentEnd(rec) + 1 < rec.getAlignmentStart()) {
intronInterval = new SimpleInterval(rec.getContig(), SAMUtils.getMateAlignmentEnd(rec) + 1, rec.getAlignmentStart() - 1);
} else {
intronInterval = null;
}
if (intronInterval != null) {
if (this.intronTreeMap.getOverlapping(intronInterval).stream().flatMap(L -> L.stream()).anyMatch(S -> intronInterval.contains(S))) {
save_read_to_bam = true;
}
}
}
/* we use the STAR data (jI attribute) */
if (!this.use_cigar_string) {
if (!rec.hasAttribute(SAM_ATT_JI))
continue;
final Object tagValue = rec.getAttribute(SAM_ATT_JI);
paranoid.assertTrue((tagValue instanceof int[]));
final int[] bounds = (int[]) tagValue;
// jI:B:i,-1
if (bounds.length == 1 && bounds[0] < 0)
continue;
if (bounds.length % 2 != 0) {
LOG.warn("bound.length%2!=0 with " + rec.getSAMString());
continue;
}
for (int i = 0; i < bounds.length; i += 2) {
int intron_start = bounds[i];
int intron_end = bounds[i + 1];
final Interval r = new Interval(rec.getContig(), intron_start, intron_end);
// don't use getOverlapping(): with STAR the jI values are exactly the intron boundaries
final List<Segment> introns = this.intronTreeMap.get(r);
if (introns == null)
continue;
save_read_to_bam = true;
for (final Segment intron : introns) {
intron.match++;
}
}
} else /* we use another mapper's bam, e.g. bwa-mem: scan the cigar */
{
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.numCigarElements() < 2)
continue;
int ref1 = rec.getAlignmentStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
final int ref_end = ref1 + (op.consumesReferenceBases() ? ce.getLength() : 0);
if (op.equals(CigarOperator.N) || op.equals(CigarOperator.D)) {
final Interval r = new Interval(rec.getContig(), ref1, ref_end - 1);
final List<Segment> introns = this.intronTreeMap.get(r);
if (introns == null)
continue;
save_read_to_bam = true;
for (final Segment intron : introns) {
intron.match++;
}
}
ref1 = ref_end;
}
/**
 * 2019-07-29. Trying to use the SA:Z tag did not work well, so only look at the clipping.
 */
if (cigar.isClipped()) {
for (int side = 0; side < 2; side++) {
final Interval r;
if (side == 0 && cigar.isRightClipped() && cigar.getLastCigarElement().getLength() >= this.minCigarSize) {
r = new Interval(rec.getContig(), rec.getEnd() + 1, rec.getEnd() + 1 + this.minCigarSize);
} else if (side == 1 && cigar.isLeftClipped() && cigar.getFirstCigarElement().getLength() >= this.minCigarSize) {
r = new Interval(rec.getContig(), Math.max(1, rec.getStart() - (1 + this.minCigarSize)), Math.max(1, rec.getStart() - (1)));
} else {
continue;
}
final int final_side = side;
final List<Segment> introns = this.intronTreeMap.getOverlapping(r).stream().flatMap(L -> L.stream()).filter(SEG -> isWithinIntronBound(SEG, r, final_side)).collect(Collectors.toList());
if (introns.isEmpty())
continue;
// System.err.println("SA for "+r+" "+rec.getReadName()+" "+introns.size());
save_read_to_bam = true;
for (final Segment intron : introns) {
intron.match++;
}
}
}
}
if (save_read_to_bam) {
saveInsertion(saveInsertionsPw, rec);
if (sfw != null)
sfw.addAlignment(rec);
}
}
final ContigDictComparator contigCmp = new ContigDictComparator(refDict);
this.all_transcripts.removeIf(T -> T.segments.stream().noneMatch(S -> S.match >= min_depth));
final int max_introns = this.all_transcripts.stream().mapToInt(K -> K.segments.size()).max().orElse(1);
final List<String> intron_names = IntStream.range(0, max_introns).mapToObj(IDX -> String.format("%s_INTRON_%04d", sample, 1 + IDX)).collect(Collectors.toList());
/**
* build vcf header
*/
final Set<VCFHeaderLine> metaData = new HashSet<>();
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
metaData.add(new VCFInfoHeaderLine(att, 1, VCFHeaderLineType.String, "Value for the attribute '" + att + "' in the gtf"));
}
// metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
// metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
// "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
metaData.add(new VCFFormatHeaderLine(INTRON_START, 1, VCFHeaderLineType.Integer, "Introns start"));
metaData.add(new VCFFormatHeaderLine(INTRON_END, 1, VCFHeaderLineType.Integer, "Introns end"));
metaData.add(new VCFFilterHeaderLine(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold, "Number of reads is lower than " + this.low_depth_threshold));
metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
final VCFHeader header = new VCFHeader(metaData, intron_names);
JVarkitVersion.getInstance().addMetaData(this, header);
header.setSequenceDictionary(refDict);
/* open vcf for writing */
vcw0 = super.openVariantContextWriter(this.outputFile);
final VariantContextWriter vcw = vcw0;
vcw.writeHeader(header);
Collections.sort(this.all_transcripts, (A, B) -> {
int i = contigCmp.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
});
final Allele ref = Allele.create((byte) 'N', true);
final Allele alt = Allele.create("<RETROCOPY>", false);
for (final Transcript kg : this.all_transcripts) {
// ok good candidate
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(kg.getContig());
vcb.start(kg.getStart());
vcb.stop(kg.getEnd());
vcb.id(kg.transcript_id);
final List<Allele> alleles = Arrays.asList(ref, alt);
final int max_depth = kg.segments.stream().mapToInt(X -> X.match).max().orElse(0);
vcb.attribute(VCFConstants.DEPTH_KEY, max_depth);
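// QUAL is phred-scaled, so setting log10PError to -(max_depth/10) makes QUAL equal max_depth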
vcb.log10PError(max_depth / -10.0);
boolean filter_set = false;
if (max_depth < this.low_depth_threshold) {
vcb.filter(ATT_LOW_DEPTH_FILTER + this.low_depth_threshold);
filter_set = true;
}
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, 2);
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, 1);
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5);
vcb.attribute(VCFConstants.SVTYPE, "DEL");
vcb.attribute(VCFConstants.END_KEY, kg.getEnd());
vcb.attribute("SVLEN", kg.getLengthOnReference());
for (final String att : kg.attributes.keySet()) {
vcb.attribute(att, VCFUtils.escapeInfoField(kg.attributes.get(att)));
}
vcb.alleles(alleles);
// intron statistics
vcb.attribute(ATT_INTRONS_CANDIDATE_COUNT, kg.segments.stream().filter(I -> I.match > 0).count());
vcb.attribute(ATT_INTRONS_COUNT, kg.segments.size());
vcb.attribute(ATT_INTRONS_CANDIDATE_FRACTION, kg.segments.stream().filter(I -> I.match > 0).count() / (float) kg.segments.size());
if (kg.segments.stream().filter(I -> I.match > 0).count() != kg.segments.size()) {
vcb.filter(ATT_NOT_ALL_INTRONS);
filter_set = true;
}
final List<Genotype> genotypes = new ArrayList<>(kg.segments.size());
/* build genotypes */
for (int i = 0; i < kg.segments.size(); i++) {
final Segment intron = kg.segments.get(i);
final GenotypeBuilder gb = new GenotypeBuilder(intron_names.get(i), Arrays.asList(ref, alt));
gb.DP(intron.match);
gb.attribute(INTRON_START, intron.start);
gb.attribute(INTRON_END, intron.end);
genotypes.add(gb.make());
}
vcb.genotypes(genotypes);
if (!filter_set) {
vcb.passFilters();
}
vcw.add(vcb.make());
}
progress.close();
vcw.close();
iter.close();
iter = null;
sr.close();
sr = null;
saveInsertionsPw.flush();
saveInsertionsPw.close();
saveInsertionsPw = null;
if (sfw != null) {
sfw.close();
sfw = null;
}
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(sr);
CloserUtil.close(vcw0);
CloserUtil.close(sfw);
CloserUtil.close(saveInsertionsPw);
}
}
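The BAI branch above is a standard htsjdk pattern: build one QueryInterval around each intron bound, then sort and merge them before random access. Below is a minimal, self-contained sketch of that pattern using only stock htsjdk; the BAM path, contig and positions are hypothetical, and jvarkit's HtsCollectors.optimizedQueryIntervals is stood in for by htsjdk's QueryInterval.optimizeIntervals.

import htsjdk.samtools.QueryInterval;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;

public class QueryIntronBoundsDemo {
    public static void main(final String[] args) throws Exception {
        // hypothetical indexed BAM, contig and intron bounds
        try (SamReader sr = SamReaderFactory.makeDefault().open(SamInputResource.of("in.bam"))) {
            final int tid = sr.getFileHeader().getSequenceDictionary().getSequenceIndex("chr1");
            final int intronStart = 10_000, intronEnd = 12_500;
            final int extend = 6; // 1 + minCigarSize, as in doWork
            // one window around each intron bound, clamped to position 1
            final QueryInterval[] raw = new QueryInterval[] {
                new QueryInterval(tid, Math.max(1, intronStart - extend), intronStart + extend),
                new QueryInterval(tid, Math.max(1, intronEnd - extend), intronEnd + extend)
            };
            // queryOverlapping requires sorted, non-overlapping intervals:
            // optimizeIntervals sorts and merges them
            final QueryInterval[] optimized = QueryInterval.optimizeIntervals(raw);
            try (SAMRecordIterator iter = sr.queryOverlapping(optimized)) {
                while (iter.hasNext()) System.out.println(iter.next().getReadName());
            }
        }
    }
}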
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class StarRetroCopy method saveInsertion.
private void saveInsertion(final PrintWriter pw, final SAMRecord rec) {
if (this.saveBedPeTo == null)
return;
final List<SimpleInterval> rec2list = new ArrayList<>();
final Predicate<Locatable> shouldBeSaved = REC2 -> {
if (!rec.getContig().equals(REC2.getContig())) {
return true;
}
if (rec.overlaps(REC2))
return false;
return this.intronTreeMap.getOverlapping(rec).stream().flatMap(L -> L.stream()).map(L -> L.transcript).noneMatch(P -> P.overlaps(REC2));
};
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && !rec.getProperPairFlag()) {
final int mateStart = rec.getMateAlignmentStart();
final int mateEnd;
if (SAMUtils.hasMateCigar(rec)) {
mateEnd = SAMUtils.getMateAlignmentEnd(rec);
} else {
mateEnd = mateStart;
}
final SimpleInterval rec2 = new SimpleInterval(rec.getMateReferenceName(), mateStart, mateEnd);
if (shouldBeSaved.test(rec2)) {
rec2list.add(rec2);
}
}
/* add supplementary alignments that could be evidence */
SAMUtils.getOtherCanonicalAlignments(rec).stream().filter(shouldBeSaved).forEach(R -> rec2list.add(new SimpleInterval(R)));
if (rec2list.isEmpty())
return;
for (final SimpleInterval rec2 : rec2list) {
final String distanceStr;
if (rec.overlaps(rec2)) {
distanceStr = "0";
} else if (rec.contigsMatch(rec2)) {
distanceStr = String.valueOf(Math.abs(Math.min(rec.getStart() - rec2.getEnd(), rec2.getStart() - rec.getEnd())));
} else {
distanceStr = ".";
}
// save insertions
pw.print(rec.getContig());
pw.print("\t");
pw.print(rec.getStart() - 1);
pw.print("\t");
pw.print(rec.getEnd());
pw.print("\t");
pw.print(rec2.getContig());
pw.print("\t");
pw.print(rec2.getStart() - 1);
pw.print("\t");
pw.print(rec2.getEnd());
pw.print("\t");
// name
pw.print(rec.getReadName() + (rec.getReadPairedFlag() ? (rec.getFirstOfPairFlag() ? "/1" : "/2") : ""));
pw.print("\t");
// score
pw.print(".");
pw.print("\t");
// strand 1
pw.print(".");
pw.print("\t");
// strand 2
pw.print(".");
pw.print("\t");
// "Any number of additional, user-defined fields ..."
pw.print(distanceStr);
pw.println();
}
}
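saveInsertion writes BEDPE rows: two 0-based half-open intervals, then name, score and the two strands (unused here, '.'), then any user-defined extra columns, here the distance. A minimal sketch of the same row layout, with hypothetical coordinates:

import java.io.PrintWriter;

public class BedPeRowDemo {
    /** prints one BEDPE row; starts are converted from 1-based inclusive to 0-based half-open */
    static void writeBedPe(final PrintWriter pw,
            final String contig1, final int start1, final int end1,
            final String contig2, final int start2, final int end2,
            final String name, final String distance) {
        pw.println(String.join("\t",
            contig1, String.valueOf(start1 - 1), String.valueOf(end1),
            contig2, String.valueOf(start2 - 1), String.valueOf(end2),
            name,
            ".",      // score (unused)
            ".", ".", // strand1, strand2 (unused)
            distance)); // user-defined extra field, as in saveInsertion
    }

    public static void main(final String[] args) {
        try (PrintWriter pw = new PrintWriter(System.out)) {
            // hypothetical read and mate coordinates
            writeBedPe(pw, "chr1", 1001, 1100, "chr1", 5001, 5100, "read1/1", "3901");
        }
    }
}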
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class SAM4WebLogo method doWork.
@Override
public int doWork(final List<String> args) {
if (this.fastq_quality_padding_str.length() != 1) {
LOG.error("Bad fastq padding character (length!=1)");
return -1;
}
if (this.fastq_quality_unknown_str.length() != 1) {
LOG.error("Bad fastq unknown character (length!=1)");
return -1;
}
PrintWriter out = null;
SamReader samReader = null;
SAMRecordIterator iter = null;
try {
final SamReaderFactory srf = super.createSamReaderFactory();
if (this.faidx != null)
srf.referenceSequence(this.faidx);
final List<Locatable> intervals = this.intervalListProvider.stream().collect(Collectors.toList());
if (!this.output_format.equals(Format.tabular) && intervals.size() > 1) {
LOG.error("Only one interval is allowed for format " + this.output_format);
return -1;
}
final List<Path> inputPath = IOUtils.unrollPaths(args);
out = super.openPathOrStdoutAsPrintWriter(outputFile);
for (int locatable_idx = 0; locatable_idx < intervals.size(); ++locatable_idx) {
if (locatable_idx > 0)
out.println();
final Locatable interval = intervals.get(locatable_idx);
final List<SAMRecord> buffer = new ArrayList<>();
final SortedMap<Integer, Integer> genomicpos2insertlen = new TreeMap<>();
for (final Path samPath : inputPath) {
samReader = srf.open(SamInputResource.of(samPath));
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(samReader.getFileHeader());
final ContigNameConverter ctgNameConverter = ContigNameConverter.fromOneDictionary(dict);
final SimpleInterval segment = ctgNameConverter.convertToSimpleInterval(interval).orElseThrow(() -> new JvarkitException.ContigNotFoundInDictionary(interval.getContig(), dict));
final int extend = useClip ? 1000 : 0;
iter = samReader.query(segment.getContig(), Math.max(1, segment.getStart() - extend), segment.getEnd() + extend, false);
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (this.SamRecordFilter.filterOut(rec))
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
if (!rec.getReferenceName().equals(segment.getContig()))
continue;
if (useClip) {
if (rec.getUnclippedEnd() < segment.getStart())
continue;
if (rec.getUnclippedStart() > segment.getEnd())
continue;
} else {
if (rec.getEnd() < segment.getStart())
continue;
if (rec.getStart() > segment.getEnd())
continue;
}
// free memory
if (!this.output_format.equals(Format.fastq)) {
rec.setBaseQualities(SAMRecord.NULL_QUALS);
}
final List<SAMTagAndValue> atts = rec.getAttributes();
for (final SAMTagAndValue stv : atts) {
if (stv.tag.equals("RG"))
continue;
rec.setAttribute(stv.tag, null);
}
int refPos = rec.getStart();
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
if (op.equals(CigarOperator.I)) {
genomicpos2insertlen.put(refPos, Math.max(ce.getLength(), genomicpos2insertlen.getOrDefault(refPos, 0)));
}
if (op.consumesReferenceBases())
refPos += ce.getLength();
if (refPos > interval.getEnd())
break;
}
buffer.add(rec);
}
iter.close();
iter = null;
samReader.close();
samReader = null;
}
/* compute columns */
if (interval != null) {
/**
* test if genomic position is in interval
*/
final IntPredicate testInInterval = pos -> interval.getStart() <= pos && pos <= interval.getEnd();
final ArrayList<Integer> column2genomic = new ArrayList<>(interval.getLengthOnReference());
for (int x = interval.getStart(); x <= interval.getEnd(); ++x) {
column2genomic.add(x);
}
// process insertion sites right-to-left, so inserting columns does not shift the indices of the sites not yet processed
for (int pos : genomicpos2insertlen.keySet().stream().sorted(Collections.reverseOrder()).collect(Collectors.toList())) {
if (!testInInterval.test(pos))
continue;
final int insert_len = genomicpos2insertlen.get(pos);
for (int x = 0; x < insert_len; ++x) {
column2genomic.add(pos - interval.getStart(), NO_POS);
}
}
// ruler
if (this.output_format.equals(Format.tabular)) {
int x = 0;
while (x < column2genomic.size()) {
if (column2genomic.get(x) == NO_POS || column2genomic.get(x) % 10 != 0) {
out.print(' ');
++x;
} else {
final String s = String.valueOf(column2genomic.get(x));
if (s.length() < 10 && x + s.length() < column2genomic.size()) {
out.print(s);
x += s.length();
} else {
out.print(' ');
++x;
}
}
}
out.println(" " + interval.toString());
}
for (final SAMRecord rec : buffer) {
final String recQualString = rec.getBaseQualityString();
final Function<Integer, Character> read2qual;
if (recQualString == null || SAMRecord.NULL_QUALS_STRING.equals(recQualString)) {
read2qual = IDX -> '~';
} else {
read2qual = IDX -> {
if (IDX < 0 || IDX >= recQualString.length())
return '~';
return recQualString.charAt(IDX);
};
}
final byte[] rec_bases = rec.getReadBases();
final Function<Integer, Character> read2base;
if (SAMRecord.NULL_SEQUENCE.equals(rec_bases)) {
read2base = IDX -> '?';
} else {
read2base = IDX -> {
if (IDX < 0 || IDX >= rec_bases.length)
return '?';
return (char) rec_bases[IDX];
};
}
final StringBuilder seqBuilder = new StringBuilder(column2genomic.size());
final StringBuilder qualBuilder = new StringBuilder(column2genomic.size());
int x = 0;
int readRef = rec.getUnclippedStart();
int readpos = 0;
final Cigar cigar = rec.getCigar();
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
if (op.equals(CigarOperator.PADDING))
continue;
for (int cigarIdx = 0; cigarIdx < ce.getLength(); ++cigarIdx) {
// pad before base
while (!op.equals(CigarOperator.INSERTION) && x < column2genomic.size() && (column2genomic.get(x) == NO_POS || column2genomic.get(x) < readRef)) {
seqBuilder.append('-');
qualBuilder.append('-');
++x;
}
switch(op) {
case I:
{
if (!this.hide_insertions) {
if (testInInterval.test(readRef)) {
seqBuilder.append(Character.toLowerCase(read2base.apply(readpos)));
qualBuilder.append(read2qual.apply(readpos));
++x;
}
}
++cigarIdx;
++readpos;
break;
}
case P:
break;
case H:
{
if (testInInterval.test(readRef)) {
seqBuilder.append(this.useClip ? 'n' : '-');
qualBuilder.append(this.useClip ? this.fastq_quality_unknown_str : "-");
++x;
}
++readRef;
break;
}
case S:
{
if (testInInterval.test(readRef)) {
seqBuilder.append(this.useClip ? Character.toLowerCase((read2base.apply(readpos))) : '-');
qualBuilder.append(this.useClip ? (char) read2qual.apply(readpos) : '-');
++x;
}
++readpos;
++readRef;
break;
}
case D:
case N:
{
if (testInInterval.test(readRef)) {
seqBuilder.append('>');
qualBuilder.append('>');
++x;
}
++readRef;
break;
}
case X:
case EQ:
case M:
{
if (testInInterval.test(readRef)) {
seqBuilder.append(read2base.apply(readpos));
qualBuilder.append(read2qual.apply(readpos));
++x;
}
++readpos;
++readRef;
break;
}
default:
throw new IllegalArgumentException(op.name());
}
}
// end cigarIdx
}
/* right pad */
while (x < column2genomic.size()) {
seqBuilder.append('-');
qualBuilder.append('-');
++x;
}
final String readName = this.samRecordNaming.apply(rec);
switch(this.output_format) {
case fastq:
out.print(FastqConstants.SEQUENCE_HEADER);
out.print(readName);
out.println();
out.print(seqBuilder);
out.println();
out.println(FastqConstants.QUALITY_HEADER);
out.println(qualBuilder);
break;
case tabular:
out.print(seqBuilder);
out.print(' ');
out.println(readName);
break;
default:
out.print('>');
out.print(readName);
out.println();
out.print(seqBuilder);
out.println();
break;
}
}
// end loop over SAMRec
}
// end if interval !=null
}
out.flush();
out.close();
out = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(samReader);
CloserUtil.close(out);
}
}
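The pileup above hinges on one mapping: a list whose index is the output column and whose value is the 1-based genomic position, with a NO_POS sentinel marking insertion columns. A minimal sketch of how such a list is built, assuming, as in SAM4WebLogo, that NO_POS is a negative sentinel:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;

public class ColumnMapDemo {
    static final int NO_POS = -1; // sentinel for insertion columns

    static List<Integer> buildColumns(final int start, final int end,
            final SortedMap<Integer, Integer> pos2insertLen) {
        final List<Integer> columns = new ArrayList<>(end - start + 1);
        for (int x = start; x <= end; ++x) columns.add(x);
        // walk insertion sites right-to-left so earlier indices are not shifted
        final List<Integer> sites = new ArrayList<>(pos2insertLen.keySet());
        Collections.sort(sites, Collections.reverseOrder());
        for (final int pos : sites) {
            if (pos < start || pos > end) continue;
            for (int k = 0; k < pos2insertLen.get(pos); ++k) {
                columns.add(pos - start, NO_POS); // insertion columns go before 'pos'
            }
        }
        return columns;
    }

    public static void main(final String[] args) {
        final SortedMap<Integer, Integer> ins = new TreeMap<>();
        ins.put(103, 2); // hypothetical: 2bp insertion before position 103
        System.out.println(buildColumns(101, 105, ins)); // [101, 102, -1, -1, 103, 104, 105]
    }
}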
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class CopyNumber01 method doWork.
@Override
public int doWork(final List<String> args) {
ReferenceSequenceFile indexedFastaSequenceFile = null;
try {
this.sexContigSet.addAll(Arrays.stream(this.sexContigStr.split("[, \t]")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
/* loading REF Reference */
indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(indexedFastaSequenceFile);
final Comparator<Locatable> locComparator = new ContigDictComparator(dict).createLocatableComparator();
final List<Locatable> intervals = new ArrayList<>();
if (this.bedFile == null) {
for (final Locatable loc : dict.getSequences()) {
intervals.add(loc);
}
} else {
final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
try (BedLineReader br = new BedLineReader(this.bedFile)) {
br.stream().filter(L -> !StringUtil.isBlank(converter.apply(L.getContig()))).forEach(B -> {
final String ctg = converter.apply(B.getContig());
intervals.add(new SimpleInterval(ctg, B.getStart(), B.getEnd()));
});
}
}
if (intervals.isEmpty()) {
LOG.error("no interval defined.");
return -1;
}
LOG.info("intervals N=" + intervals.size() + " mean-size:" + intervals.stream().mapToInt(R -> R.getLengthOnReference()).average().orElse(0.0));
final List<GCAndDepth> user_items = new ArrayList<>();
// split intervals
for (final Locatable loc : intervals) {
int pos = loc.getStart();
while (pos < loc.getEnd()) {
final int pos_end = Math.min(pos + this.windowSize, loc.getEnd());
final GCAndDepth dataRow = new GCAndDepth(loc.getContig(), pos, pos_end);
if (dataRow.getLengthOnReference() < this.windowMin) {
break;
}
user_items.add(dataRow);
pos += this.windowShift;
}
}
// free memory
intervals.clear();
LOG.info("sorting N=" + user_items.size());
Collections.sort(user_items, locComparator);
// fill gc percent
LOG.info("fill gc% N=" + user_items.size());
for (final String ctg : user_items.stream().map(T -> T.getContig()).collect(Collectors.toSet())) {
final GenomicSequence gseq = new GenomicSequence(indexedFastaSequenceFile, ctg);
for (final GCAndDepth dataRow : user_items) {
if (!dataRow.getContig().equals(ctg))
continue;
final GCPercent gc = gseq.getGCPercent(dataRow.getStart(), dataRow.getEnd());
if (gc.isEmpty())
continue;
dataRow.gc = gc.getGCPercent();
}
}
// remove windows with out-of-range GC%
user_items.removeIf(B -> B.gc < this.minGC);
user_items.removeIf(B -> B.gc > this.maxGC);
LOG.info("remove high/low gc% N=" + user_items.size());
if (user_items.stream().allMatch(P -> isSex(P.getContig()))) {
LOG.error("All chromosomes are defined as sexual. Cannot normalize");
return -1;
}
final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mappingQuality);
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
/* header */
pw.println("#CHROM\tSTART\tEND\tSample\tIDX\tGC\tRAW-DEPTH\tNORM-DEPTH");
for (final Path bamPath : IOUtils.unrollPaths(args)) {
// open this samReader
try (SamReader samReader = super.createSamReaderFactory().referenceSequence(this.refFile).open(bamPath)) {
if (!samReader.hasIndex()) {
LOG.error("file is not indexed " + bamPath);
return -1;
}
final SAMFileHeader header = samReader.getFileHeader();
SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
final String sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toSet()).stream().collect(HtsCollectors.toSingleton());
final List<GCAndDepth> bam_items = new ArrayList<>(user_items.size());
/* loop over contigs */
for (final SAMSequenceRecord ssr : dict.getSequences()) {
/* create a **COPY** of the intervals */
final List<GCAndDepth> ctgitems = user_items.stream().filter(T -> T.contigsMatch(ssr)).collect(Collectors.toList());
if (ctgitems.isEmpty())
continue;
LOG.info("Getting coverage for " + ssr.getSequenceName() + " N=" + ctgitems.size());
// get coverage
final CoverageFactory.SimpleCoverage coverage = coverageFactory.getSimpleCoverage(samReader, ctgitems, sampleName);
// fill coverage
for (final GCAndDepth gc : ctgitems) {
final OptionalDouble optCov;
switch(this.univariateDepth) {
case median:
optCov = coverage.getMedian(gc);
break;
case mean:
optCov = coverage.getAverage(gc);
break;
default:
throw new IllegalStateException();
}
gc.raw_depth = optCov.orElse(-1.0);
gc.norm_depth = gc.raw_depth;
}
ctgitems.removeIf(V -> V.raw_depth < 0);
ctgitems.removeIf(V -> V.raw_depth > this.weirdMaxDepth);
ctgitems.removeIf(V -> V.raw_depth < this.weirdMinDepth);
if (ctgitems.isEmpty())
continue;
bam_items.addAll(ctgitems);
}
double[] y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.raw_depth).toArray();
LOG.info("median raw depth " + new Median().evaluate(y, 0, y.length));
Collections.sort(bam_items, (a, b) -> {
final int i = Double.compare(a.getX(), b.getX());
if (i != 0)
return i;
return Double.compare(a.getY(), b.getY());
});
double[] x = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getX()).toArray();
y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getY()).toArray();
// get min GC
final double min_x = x[0];
// get max GC
final double max_x = x[x.length - 1];
LOG.info("min/max gc " + min_x + " " + max_x);
/* merge adjacent x values (GC%) that are equal */
int i = 0;
int k = 0;
while (i < x.length) {
int j = i + 1;
while (j < x.length && Precision.equals(x[i], x[j])) {
++j;
}
x[k] = x[i];
y[k] = this.univariateGCLoess.create().evaluate(y, i, j - i);
++k;
i = j;
}
LOG.info("merged n=" + (x.length - k) + " items.");
/* reduce the size of x and y */
final List<XY> xyL = new ArrayList<>(k);
for (int t = 0; t < k; ++t) {
xyL.add(new XYImpl(x[t], y[t]));
}
/* sort on Y */
Collections.sort(xyL, (a, b) -> {
final int d = Double.compare(a.getX(), b.getX());
if (d != 0)
return d;
return Double.compare(a.getY(), b.getY());
});
x = xyL.stream().mapToDouble(R -> R.getX()).toArray();
y = xyL.stream().mapToDouble(R -> R.getY()).toArray();
final UnivariateInterpolator interpolator = createInterpolator();
UnivariateFunction spline = null;
try {
spline = interpolator.interpolate(x, y);
} catch (final org.apache.commons.math3.exception.NumberIsTooSmallException err) {
spline = null;
LOG.error("Cannot use " + interpolator.getClass().getName() + ":" + err.getMessage());
}
// apply the GC-depth spline; drop points whose GC% falls outside the fitted range
int points_removed = 0;
i = 0;
while (i < bam_items.size()) {
final GCAndDepth r = bam_items.get(i);
if (spline == null) {
++i;
} else if (r.getX() < min_x || r.getX() > max_x) {
bam_items.remove(i);
++points_removed;
} else {
final double norm;
if (this.gcDepthInterpolation.equals(UnivariateInerpolation.identity)) {
norm = r.getY();
} else {
norm = spline.value(r.getX());
}
if (Double.isNaN(norm) || Double.isInfinite(norm)) {
LOG.info("NAN " + r);
bam_items.remove(i);
++points_removed;
continue;
}
r.norm_depth = norm;
++i;
}
}
LOG.info("removed " + points_removed + ". now N=" + bam_items.size());
if (bam_items.isEmpty())
continue;
spline = null;
// DO NOT NORMALIZE ON MINIMUM DEPTH, STUPID.
// normalize on median
y = bam_items.stream().mapToDouble(G -> G.getY()).toArray();
final double median_depth = this.univariateMid.create().evaluate(y, 0, y.length);
LOG.info("median norm depth : " + median_depth);
for (i = 0; median_depth > 0 && i < bam_items.size(); ++i) {
final GCAndDepth gc = bam_items.get(i);
gc.norm_depth /= median_depth;
}
// restore genomic order
Collections.sort(bam_items, locComparator);
// smoothing values with neighbours
y = bam_items.stream().mapToDouble(V -> V.getY()).toArray();
for (i = 0; i < bam_items.size(); ++i) {
final GCAndDepth gc = bam_items.get(i);
int left = i;
for (int j = Math.max(0, i - this.smooth_window); j <= i; ++j) {
final GCAndDepth gc2 = bam_items.get(j);
if (!gc2.withinDistanceOf(gc, this.smoothDistance))
continue;
left = j;
break;
}
int right = i;
for (int j = i; j <= i + this.smooth_window && j < bam_items.size(); ++j) {
final GCAndDepth gc2 = bam_items.get(j);
if (!gc2.withinDistanceOf(gc, this.smoothDistance))
break;
right = j;
// no break;
}
gc.norm_depth = this.univariateSmooth.create().evaluate(y, left, (right - left) + 1);
}
/* print data */
for (final GCAndDepth r : bam_items) {
pw.print(r.getContig());
pw.print('\t');
pw.print(r.getStart() - 1);
pw.print('\t');
pw.print(r.getEnd());
pw.print('\t');
pw.print(sampleName);
pw.print('\t');
pw.print(getGenomicIndex(dict, r.getContig(), r.getStart()) - 1);
pw.print('\t');
pw.printf("%.3f", r.gc);
pw.print('\t');
pw.printf("%.3f", r.raw_depth);
pw.print('\t');
pw.printf("%.3f", r.norm_depth);
pw.println();
}
pw.flush();
}
// samReader
}
// end loop bamPath
pw.flush();
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(indexedFastaSequenceFile);
}
}
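The GC-bias step above is a LOESS normalization: fit depth as a function of GC%, then rescale each window against the fitted curve (CopyNumber01 additionally rescales by the median afterwards). A minimal sketch with commons-math3; the data points are made up and only illustrate the API:

import org.apache.commons.math3.analysis.interpolation.LoessInterpolator;
import org.apache.commons.math3.analysis.polynomials.PolynomialSplineFunction;

public class GcLoessDemo {
    public static void main(final String[] args) {
        // hypothetical (gc%, raw depth) pairs: sorted by gc, distinct gc values
        final double[] gc    = { 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60 };
        final double[] depth = { 22.0, 25.0, 30.0, 31.0, 29.0, 24.0, 20.0 };
        final PolynomialSplineFunction fit = new LoessInterpolator().interpolate(gc, depth);
        // normalize one window: divide its raw depth by the depth expected at its GC%
        final double windowGc = 0.42, windowDepth = 33.0;
        final double expected = fit.value(windowGc); // only valid inside [min gc, max gc]
        System.out.printf("expected=%.2f normalized=%.2f%n", expected, windowDepth / expected);
    }
}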
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class ReferenceToHtml method doWork.
@Override
public int doWork(final List<String> args) {
try {
try (VCFReader reader = VCFReaderFactory.makeDefault().open(Paths.get(oneAndOnlyOneFile(args)), true)) {
final VCFHeader header = reader.getHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final Optional<String> buildName = SequenceDictionaryUtils.getBuildName(dict);
try (ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx)) {
SequenceUtil.assertSequenceDictionariesEqual(SequenceDictionaryUtils.extractRequired(reference), dict);
final List<Locatable> locs = IntervalListProvider.from(this.regionsStr).dictionary(dict).stream().sorted(new ContigDictComparator(dict).createLocatableComparator()).collect(Collectors.toList());
if (locs.isEmpty()) {
LOG.error("no region was defined");
return -1;
}
final XMLOutputFactory xof = XMLOutputFactory.newFactory();
try (ArchiveFactory archive = ArchiveFactory.open(archiveOutput)) {
try (PrintWriter pw = archive.openWriter(this.prefix + "script.js")) {
pw.println(StaticCodeExtractor.forClass(this.getClass()).extract("SCRIPT").get());
pw.flush();
}
try (PrintWriter pw = archive.openWriter(this.prefix + "style.css")) {
pw.println(StaticCodeExtractor.forClass(this.getClass()).extract("CSS").get());
pw.flush();
}
final OutputStream index_os = archive.openOuputStream(this.prefix + "index.html");
final XMLStreamWriter index_html = xof.createXMLStreamWriter(index_os, "UTF-8");
index_html.writeStartDocument("UTF-8", "1.0");
index_html.writeStartElement("html");
index_html.writeStartElement("head");
writeMeta(index_html);
index_html.writeStartElement("title");
index_html.writeCharacters(this.getProgramName());
// title
index_html.writeEndElement();
index_html.writeStartElement("link");
index_html.writeAttribute("rel", "stylesheet");
index_html.writeAttribute("href", this.prefix + "style.css");
index_html.writeCharacters("");
// link
index_html.writeEndElement();
// head
index_html.writeEndElement();
index_html.writeStartElement("body");
index_html.writeStartElement("ul");
for (final Locatable loc : locs) {
final List<VariantContext> variants;
try (CloseableIterator<VariantContext> iter = reader.query(loc)) {
variants = iter.stream().collect(Collectors.toList());
}
StringWriter sw = new StringWriter();
sw.append("var g = ");
JsonWriter jsw = new JsonWriter(sw);
jsw.beginObject();
jsw.name("contig").value(loc.getContig());
jsw.name("start").value(loc.getStart());
jsw.name("end").value(loc.getEnd());
jsw.name("length").value(loc.getLengthOnReference());
jsw.name("sequence").value(reference.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getEnd()).getBaseString());
jsw.name("variants");
jsw.beginArray();
for (VariantContext ctx : variants) {
jsw.beginObject();
jsw.name("start").value(ctx.getStart());
jsw.name("end").value(ctx.getEnd());
jsw.name("id").value(ctx.hasID() ? ctx.getID() : "");
jsw.name("type").value(ctx.getType().name());
jsw.name("ref").value(ctx.getReference().getDisplayString());
jsw.name("alts");
jsw.beginArray();
for (Allele alt : ctx.getAlternateAlleles()) {
jsw.value(alt.getDisplayString());
}
jsw.endArray();
jsw.endObject();
}
jsw.endArray();
jsw.endObject();
jsw.flush();
jsw.close();
sw.append(";");
final String filename = this.prefix + loc.getContig() + "_" + loc.getStart() + "_" + loc.getEnd() + ".html";
final String title = (buildName.isPresent() ? buildName.get() + " " : "") + new SimpleInterval(loc).toNiceString();
OutputStream os = archive.openOuputStream(filename);
XMLStreamWriter out = xof.createXMLStreamWriter(os, "UTF-8");
out.writeStartDocument("UTF-8", "1.0");
out.writeStartElement("html");
out.writeStartElement("head");
writeMeta(out);
out.writeStartElement("script");
out.writeCharacters(sw.toString());
// script
out.writeEndElement();
out.writeStartElement("script");
out.writeAttribute("src", this.prefix + "script.js");
out.writeCharacters("");
// script
out.writeEndElement();
out.writeStartElement("link");
out.writeAttribute("rel", "stylesheet");
out.writeAttribute("href", this.prefix + "style.css");
out.writeCharacters("");
// link
out.writeEndElement();
out.writeStartElement("title");
out.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
// title
out.writeEndElement();
// head
out.writeEndElement();
out.writeStartElement("body");
out.writeStartElement("h1");
out.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
out.writeEndElement();
out.writeStartElement("div");
checkBox(out, "showPos", "Line Number");
checkBox(out, "showSpace", "Spaces");
checkBox(out, "showVar", "Show Variants");
out.writeCharacters(" ");
out.writeEmptyElement("input");
out.writeAttribute("id", "primer");
out.writeAttribute("type", "text");
out.writeAttribute("placeholder", "Primer Sequence");
out.writeStartElement("button");
out.writeAttribute("id", "search");
out.writeCharacters("Search...");
out.writeEndElement();
// div
out.writeEndElement();
out.writeStartElement("div");
out.writeAttribute("class", "sequence");
out.writeAttribute("id", "main");
// div
out.writeEndElement();
// body
out.writeEndElement();
// html
out.writeEndElement();
os.flush();
os.close();
index_html.writeStartElement("li");
index_html.writeStartElement("a");
index_html.writeAttribute("href", filename);
index_html.writeCharacters(title + " N=" + StringUtils.niceInt(variants.size()));
// a
index_html.writeEndElement();
// li
index_html.writeEndElement();
}
// ul
index_html.writeEndElement();
// body
index_html.writeEndElement();
// html
index_html.writeEndElement();
index_html.writeEndDocument();
index_html.close();
index_os.flush();
index_os.close();
}
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
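Each per-region page embeds its data as a JavaScript global built with Gson's streaming JsonWriter, the "var g = {...};" pattern above. A minimal standalone sketch of that pattern, with hypothetical values:

import com.google.gson.stream.JsonWriter;
import java.io.StringWriter;

public class JsonEmbedDemo {
    public static void main(final String[] args) throws Exception {
        final StringWriter sw = new StringWriter();
        sw.append("var g = "); // expose the object as a JS global
        try (JsonWriter jsw = new JsonWriter(sw)) {
            jsw.beginObject();
            jsw.name("contig").value("chr1"); // hypothetical region
            jsw.name("start").value(100);
            jsw.name("end").value(200);
            jsw.name("variants").beginArray();
            jsw.beginObject();
            jsw.name("start").value(150);
            jsw.name("ref").value("A");
            jsw.endObject();
            jsw.endArray();
            jsw.endObject();
        }
        sw.append(";");
        System.out.println(sw); // paste-able into a <script> element
    }
}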