Usage example of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in the project jvarkit by lindenb: the doWork method of the class LowResBam2Raster.
@Override
public int doWork(final List<String> args) {
    // A target region is mandatory: everything below is anchored to it.
    if (this.regionStr == null) {
        LOG.error("Region was not defined.");
        return -1;
    }
    // Clamp the output image width to a usable minimum.
    if (this.WIDTH < 100) {
        LOG.info("adjusting WIDTH to 100");
        this.WIDTH = 100;
    }
    // The GC% sliding-window size must be strictly positive.
    if (this.gcWinSize <= 0) {
        LOG.info("adjusting GC win size to 5");
        this.gcWinSize = 5;
    }
    try {
        if (this.referenceFile == null) {
            LOG.error("error. Since 2018-11-17. Reference is Required");
            return -1;
        } else {
            // Open the indexed FASTA and derive the dictionary / contig-name converter from it.
            this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.referenceFile);
            this.refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
            this.contigNameConverter = ContigNameConverter.fromOneDictionary(this.refDict);
        }
        final SamReaderFactory srf = super.createSamReaderFactory();
        srf.referenceSequence(this.referenceFile);
        // Resolve the user-supplied region string against the reference dictionary.
        this.interval = IntervalParserFactory.newInstance()
                .dictionary(this.refDict)
                .make()
                .apply(this.regionStr)
                .orElseThrow(IntervalParserFactory.exception(this.regionStr));
        loadVCFs();
        if (this.gtfPath != null) {
            // Collect every transcript overlapping the region of interest.
            try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
                gtfReader.setContigNameConverter(this.contigNameConverter);
                gtfReader.getAllGenes().stream()
                        .filter(G -> G.overlaps(this.interval))
                        .flatMap(G -> G.getTranscripts().stream())
                        .filter(T -> T.overlaps(this.interval))
                        .forEach(T -> this.transcripts.add(T));
            }
        }
        for (final Path bamFile : IOUtils.unrollPaths(args)) {
            try (SamReader samFileReader = srf.open(SamInputResource.of(bamFile))) {
                final SAMFileHeader header = samFileReader.getFileHeader();
                final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
                SequenceUtil.assertSequenceDictionariesEqual(dict, this.refDict);
                // The BAM may use a different contig naming scheme (e.g. 'chr1' vs '1').
                final ContigNameConverter conv = ContigNameConverter.fromOneDictionary(dict);
                final String normalizedContig = conv.apply(this.interval.getContig());
                if (StringUtil.isBlank(normalizedContig) || dict.getSequence(normalizedContig) == null) {
                    LOG.error("no such chromosome in " + bamFile + " " + this.interval + ". " + " " + JvarkitException.ContigNotFoundInDictionary.getMessage(this.interval.getContig(), dict));
                    return -1;
                }
                scan(samFileReader, normalizedContig);
            }
        }
        if (this.key2partition.isEmpty()) {
            LOG.error("No data was found. no Read-Group specified ? no data in that region ?");
            return -1;
        }
        // Render each partition, then save one image per partition key.
        this.key2partition.values().stream().forEach(P -> P.make());
        saveImages(this.key2partition.keySet().stream().collect(Collectors.toMap(K -> K, K -> this.key2partition.get(K).image)));
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        // FIX: the reference sequence file opened above was never released,
        // leaking a file handle on every invocation (finally block was empty).
        if (this.indexedFastaSequenceFile != null) {
            try {
                this.indexedFastaSequenceFile.close();
            } catch (final Exception ignored) {
                // best-effort close on the cleanup path; nothing useful to do here
            }
            this.indexedFastaSequenceFile = null;
        }
    }
}
Usage example of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in the project jvarkit by lindenb: the doWork method of the class BackLocate.
@Override
public int doWork(final List<String> args) {
    PrintStream out = null;
    BufferedReader in = null;
    try {
        this.referenceGenome = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
        // Index all coding transcripts by upper-cased gene name.
        try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
            final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceGenome);
            final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
            gtfReader.setContigNameConverter(contigNameConverter);
            gtfReader.getAllGenes().stream()
                    .filter(G -> !StringUtils.isBlank(G.getGeneName()))
                    .flatMap(G -> G.getTranscripts().stream())
                    .filter(T -> T.isCoding() && T.hasCDS())
                    // FIX(idiom): computeIfAbsent replaces the manual get/null-check/put sequence.
                    .forEach(T -> this.name2transcripts
                            .computeIfAbsent(T.getGene().getGeneName().toUpperCase(), K -> new ArrayList<>())
                            .add(T));
        }
        out = this.openPathOrStdoutAsPrintStream(this.outputFile);
        // Emit the tab-separated header line.
        // NOTE(review): "petide.pos.1" looks like a typo for "peptide.pos.1", but it is
        // runtime output that downstream parsers may depend on, so it is preserved as-is.
        out.print("#User.Gene");
        out.print('\t');
        out.print("AA1");
        out.print('\t');
        out.print("petide.pos.1");
        out.print('\t');
        out.print("AA2");
        out.print('\t');
        out.print("transcript.name");
        out.print('\t');
        out.print("transcript.id");
        out.print('\t');
        out.print("transcript.strand");
        out.print('\t');
        out.print("transcript.AA");
        out.print('\t');
        out.print("index0.in.rna");
        out.print('\t');
        out.print("wild.codon");
        out.print('\t');
        out.print("potential.var.codons");
        out.print('\t');
        out.print("base.in.rna");
        out.print('\t');
        out.print("chromosome");
        out.print('\t');
        out.print("index0.in.genomic");
        out.print('\t');
        out.print("exon");
        // Optional sequence columns.
        if (this.printSequences) {
            out.print('\t');
            out.print("mRNA");
            out.print('\t');
            out.print("protein");
        }
        out.print('\t');
        out.print("messages");
        out.print('\t');
        out.print("extra.user.data");
        out.println();
        // No file arguments: read from stdin; otherwise process each input file in turn.
        if (args.isEmpty()) {
            in = super.openBufferedReader(null);
            this.run(out, in);
            CloserUtil.close(in);
        } else {
            for (final String filename : args) {
                in = super.openBufferedReader(filename);
                this.run(out, in);
                CloserUtil.close(in);
            }
        }
        return 0;
    } catch (final Exception e) {
        LOG.severe(e);
        return -1;
    } finally {
        // Release the reference genome and streams whatever happened above.
        CloserUtil.close(this.referenceGenome);
        this.referenceGenome = null;
        CloserUtil.close(out);
        CloserUtil.close(in);
    }
}
Usage example of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in the project jvarkit by lindenb: the processInput method of the class SamFindClippedRegions.
@Override
protected int processInput(final SAMFileHeader header, final CloseableIterator<SAMRecord> iter) {
    // Samples come from the read-groups, mapped through the user-chosen partition.
    final Set<String> samples = header.getReadGroups().stream()
            .map(this.partition)
            .filter(S -> !StringUtils.isBlank(S))
            .collect(Collectors.toCollection(TreeSet::new));
    if (samples.isEmpty()) {
        LOG.error("no sample in read group was defined.");
        return -1;
    }
    final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
    // Threshold below which the average MAPQ is flagged as LOW_MAPQ.
    final int bad_mapq = 30;
    try {
        // If a GTF was given, index every intron so clip positions near intron
        // boundaries can be flagged as possible retrogene junctions.
        final IntervalTreeMap<Interval> intronMap = new IntervalTreeMap<>();
        if (this.gtfPath != null) {
            try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
                gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
                gtfReader.getAllGenes().stream()
                        .flatMap(G -> G.getTranscripts().stream())
                        .filter(T -> T.hasIntron())
                        .flatMap(T -> T.getIntrons().stream())
                        .map(I -> new Interval(I.getContig(), I.getStart(), I.getEnd(), I.isNegativeStrand(), I.getTranscript().getId()))
                        .forEach(I -> intronMap.put(I, I));
            }
        }
        /* build VCF header */
        final Allele reference_allele = Allele.create("N", true);
        final Allele alt_allele = Allele.create("<CLIP>", false);
        final Set<VCFHeaderLine> vcfHeaderLines = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(vcfHeaderLines, true, VCFConstants.GENOTYPE_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
        VCFStandardHeaderLines.addStandardInfoLines(vcfHeaderLines, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_FREQUENCY_KEY);
        final VCFFormatHeaderLine leftClip = new VCFFormatHeaderLine("CL", 1, VCFHeaderLineType.Integer, "Left Clip");
        vcfHeaderLines.add(leftClip);
        final VCFFormatHeaderLine rightClip = new VCFFormatHeaderLine("RL", 1, VCFHeaderLineType.Integer, "Right Clip");
        vcfHeaderLines.add(rightClip);
        final VCFFormatHeaderLine totalCip = new VCFFormatHeaderLine("TL", 1, VCFHeaderLineType.Integer, "Total Clip");
        vcfHeaderLines.add(totalCip);
        final VCFFormatHeaderLine totalDel = new VCFFormatHeaderLine("DL", 1, VCFHeaderLineType.Integer, "Total Deletions");
        vcfHeaderLines.add(totalDel);
        final VCFFormatHeaderLine noClipMAPQ = new VCFFormatHeaderLine("MQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for reads without clip at this position.");
        vcfHeaderLines.add(noClipMAPQ);
        final VCFInfoHeaderLine averageMAPQ = new VCFInfoHeaderLine("AVG_MAPQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for called genotypes");
        vcfHeaderLines.add(averageMAPQ);
        final VCFInfoHeaderLine infoRetrogene = new VCFInfoHeaderLine("RETROGENE", 1, VCFHeaderLineType.String, "transcript name for Possible retrogene.");
        vcfHeaderLines.add(infoRetrogene);
        final VCFFilterHeaderLine filterRetrogene = new VCFFilterHeaderLine("POSSIBLE_RETROGENE", "Junction is a possible Retrogene.");
        vcfHeaderLines.add(filterRetrogene);
        final VCFFilterHeaderLine filterlowMapq = new VCFFilterHeaderLine("LOW_MAPQ", "Low average mapq (< " + bad_mapq + ")");
        vcfHeaderLines.add(filterlowMapq);
        final VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, samples);
        vcfHeader.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, vcfHeader);
        this.writingVcfConfig.dictionary(dict);
        try (VariantContextWriter w = this.writingVcfConfig.open(this.outputFile)) {
            w.writeHeader(vcfHeader);
            @SuppressWarnings("resource") final VariantContextWriter finalVariantContextWriter = w;
            /**
             * dump a BASe into the VCF
             */
            final BiConsumer<String, Base> baseConsumer = (CTG, B) -> {
                if (B.pos < 1)
                    return;
                // no clip anywhere: nothing to report at this position
                if (B.sample2gt.values().stream().mapToInt(G -> G.clip()).sum() == 0)
                    return;
                // every sample below the clip-depth / depth / fraction thresholds: skip
                if (B.sample2gt.values().stream().allMatch(G -> G.clip() < min_clip_depth))
                    return;
                if (B.sample2gt.values().stream().allMatch(G -> G.dp() < min_depth))
                    return;
                if (B.sample2gt.values().stream().allMatch(G -> G.ratio() < fraction))
                    return;
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(CTG);
                vcb.start(B.pos);
                vcb.stop(B.pos);
                vcb.alleles(Arrays.asList(reference_allele, alt_allele));
                vcb.attribute(VCFConstants.DEPTH_KEY, B.sample2gt.values().stream().mapToInt(G -> G.dp()).sum());
                /* if gtf was specified, find intron which ends are near this pos */
                if (gtfPath != null) {
                    final Locatable bounds1 = new SimpleInterval(CTG, Math.max(1, B.pos - max_intron_distance), B.pos + max_intron_distance);
                    intronMap.getOverlapping(bounds1).stream()
                            .filter(I -> Math.abs(I.getStart() - B.pos) <= this.max_intron_distance || Math.abs(I.getEnd() - B.pos) <= this.max_intron_distance)
                            .map(I -> I.getName())
                            .findFirst()
                            .ifPresent(transcript_id -> {
                                vcb.attribute(infoRetrogene.getID(), transcript_id);
                                vcb.filter(filterRetrogene.getID());
                            });
                }
                final List<Genotype> genotypes = new ArrayList<>(B.sample2gt.size());
                int AC = 0;
                int AN = 0;
                int max_clip = 1;
                double sum_mapq = 0.0;
                int count_mapq = 0;
                for (final String sn : B.sample2gt.keySet()) {
                    final Gt gt = B.sample2gt.get(sn);
                    final GenotypeBuilder gb = new GenotypeBuilder(sn);
                    if (gt.clip() == 0 && gt.noClip == 0) {
                        // no evidence at all for this sample
                        gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
                    } else if (gt.noClip == 0) {
                        // only clipped reads: hom-alt
                        gb.alleles(Arrays.asList(alt_allele, alt_allele));
                        AC += 2;
                        // NOTE(review): noClip == 0 here yet noClipMapq() is folded into the
                        // average — confirm whether this branch should contribute to sum_mapq.
                        sum_mapq += gt.noClipMapq();
                        count_mapq++;
                        AN += 2;
                    } else if (gt.clip() == 0) {
                        // only unclipped reads: hom-ref
                        gb.alleles(Arrays.asList(reference_allele, reference_allele));
                        AN += 2;
                    } else {
                        // mixture of clipped and unclipped reads: het
                        gb.alleles(Arrays.asList(reference_allele, alt_allele));
                        AC++;
                        sum_mapq += gt.noClipMapq();
                        count_mapq++;
                        AN += 2;
                    }
                    gb.DP(gt.dp());
                    gb.attribute(leftClip.getID(), gt.leftClip);
                    gb.attribute(rightClip.getID(), gt.rightClip);
                    gb.attribute(totalCip.getID(), gt.clip());
                    gb.attribute(totalDel.getID(), gt.del);
                    gb.attribute(noClipMAPQ.getID(), gt.noClipMapq());
                    gb.AD(new int[] { gt.noClip, gt.clip() });
                    genotypes.add(gb.make());
                    max_clip = Math.max(max_clip, gt.clip());
                }
                if (count_mapq > 0) {
                    final int avg_mapq = (int) (sum_mapq / count_mapq);
                    vcb.attribute(averageMAPQ.getID(), avg_mapq);
                    if (avg_mapq < bad_mapq)
                        vcb.filter(filterlowMapq.getID());
                }
                // use the largest clip count as a crude quality score
                vcb.log10PError(max_clip / -10.0);
                vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, AC);
                vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, AN);
                if (AN > 0)
                    vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, AC / (float) AN);
                vcb.genotypes(genotypes);
                finalVariantContextWriter.add(vcb.make());
            };
            final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
            String prevContig = null;
            // Sliding window of per-position counters, keyed by 1-based reference position.
            final SortedMap<Integer, Base> pos2base = new TreeMap<>();
            /* get base in pos2base, create it if needed */
            final Function<Integer, Base> baseAt = POS -> {
                Base b = pos2base.get(POS);
                if (b == null) {
                    b = new Base(POS, samples);
                    pos2base.put(POS, b);
                }
                return b;
            };
            for (; ; ) {
                final SAMRecord rec = (iter.hasNext() ? progress.apply(iter.next()) : null);
                if (rec != null && !SAMRecordDefaultFilter.accept(rec, this.min_mapq))
                    continue;
                // End of input or contig change: flush every pending position.
                if (rec == null || !rec.getContig().equals(prevContig)) {
                    for (final Integer pos : pos2base.keySet()) {
                        baseConsumer.accept(prevContig, pos2base.get(pos));
                    }
                    if (rec == null)
                        break;
                    pos2base.clear();
                    prevContig = rec.getContig();
                }
                // Flush positions that no upcoming read (sorted input) can still touch.
                for (Iterator<Integer> rpos = pos2base.keySet().iterator(); rpos.hasNext(); ) {
                    final Integer pos = rpos.next();
                    if (pos.intValue() + this.max_clip_length >= rec.getUnclippedStart())
                        break;
                    baseConsumer.accept(prevContig, pos2base.get(pos));
                    rpos.remove();
                }
                final String rg = this.partition.getPartion(rec);
                if (StringUtils.isBlank(rg))
                    continue;
                // FIX: removed a dead nested loop over rec.getAlignmentBlocks() whose body
                // was completely empty — it had no effect and only cost CPU.
                final Cigar cigar = rec.getCigar();
                int refPos = rec.getAlignmentStart();
                // Walk the CIGAR, counting matches (noClip) and deletion/skip boundaries (del).
                for (final CigarElement ce : cigar.getCigarElements()) {
                    final CigarOperator op = ce.getOperator();
                    if (op.consumesReferenceBases()) {
                        if (op.consumesReadBases()) {
                            for (int x = 0; x < ce.getLength(); ++x) {
                                final Gt gt = baseAt.apply(refPos + x).getGt(rg);
                                gt.noClip++;
                                gt.noClip_sum_mapq += rec.getMappingQuality();
                            }
                        } else if (op.equals(CigarOperator.D) || op.equals(CigarOperator.N)) {
                            // mark both ends of the deletion/skip
                            baseAt.apply(refPos).getGt(rg).del++;
                            baseAt.apply(refPos + ce.getLength() - 1).getGt(rg).del++;
                        }
                        refPos += ce.getLength();
                    }
                }
                // Count leading/trailing clips long enough to matter, anchored just
                // outside the aligned span.
                CigarElement ce = cigar.getFirstCigarElement();
                if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
                    baseAt.apply(rec.getStart() - 1).getGt(rg).leftClip++;
                }
                ce = cigar.getLastCigarElement();
                if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
                    baseAt.apply(rec.getEnd() + 1).getGt(rg).rightClip++;
                }
            }
        }
        // end of vcf writer
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Aggregations