Use of com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator in the project jvarkit by lindenb.
Class MergeBlastXml, method doWork:
@Override
public int doWork(List<String> args) {
    // Merges the <Iteration> elements of one or more NCBI-blast XML outputs:
    // every Iteration of every input is sorted by its query definition, groups
    // sharing the same query are merged (Hit lists concatenated) and written
    // back into the BlastOutput_iterations section of the FIRST input file.
    // Returns 0 on success, -1 on error.
    if (args.isEmpty()) {
        LOG.error("input xml missing");
        return -1;
    }
    XMLEventReader rx = null;
    XMLEventReader rx2 = null;
    XMLEventWriter wx = null;
    SortingCollection<Iteration> sortingCollection = null;
    try {
        final JAXBContext jc = JAXBContext.newInstance("gov.nih.nlm.ncbi.blast");
        this.unmarshaller = jc.createUnmarshaller();
        this.marshaller = jc.createMarshaller();
        this.marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, true);
        this.marshaller.setProperty(Marshaller.JAXB_FRAGMENT, true);
        final XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
        xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        // blast XML declares a DTD: never fetch external entities
        xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlInputFactory.setXMLResolver(new XMLResolver() {
            @Override
            public Object resolveEntity(String arg0, String arg1, String arg2, String arg3) throws XMLStreamException {
                LOG.info("resolveEntity:" + arg0 + "/" + arg1 + "/" + arg2);
                return null;
            }
        });
        // groups Iterations by their query definition
        final Comparator<Iteration> hitComparator = (A, B) -> {
            return A.getIterationQueryDef().compareTo(B.getIterationQueryDef());
        };
        sortingCollection = SortingCollection.newInstance(Iteration.class, new BlastIterationCodec(), hitComparator, this.maxRecordsInRam, this.tmpFile.toPath());
        // NOTE(review): FileReader uses the platform charset; blast XML is expected
        // to be ASCII/UTF-8 — confirm before changing.
        rx = xmlInputFactory.createXMLEventReader(new FileReader(args.get(0)));
        final XMLOutputFactory xof = XMLOutputFactory.newFactory();
        if (this.outputFile != null) {
            wx = xof.createXMLEventWriter(new StreamResult(this.outputFile));
        } else {
            wx = xof.createXMLEventWriter(new StreamResult(stdout()));
        }
        boolean in_iteration = false;
        while (rx.hasNext()) {
            final XMLEvent evt = rx.peek();
            if (evt.isStartElement() && evt.asStartElement().getName().getLocalPart().equals("Iteration")) {
                // unmarshal consumes the whole <Iteration> element from the stream
                final Iteration iteration = this.unmarshaller.unmarshal(rx, Iteration.class).getValue();
                sortingCollection.add(iteration);
            } else if (evt.isStartElement() && evt.asStartElement().getName().getLocalPart().equals("BlastOutput_iterations")) {
                wx.add(rx.nextEvent());
                in_iteration = true;
            } else if (evt.isEndElement() && evt.asEndElement().getName().getLocalPart().equals("BlastOutput_iterations")) {
                // before closing the iterations section, pull in the Iterations
                // of every remaining input file
                for (int optind = 1; optind < args.size(); ++optind) {
                    LOG.info("opening " + args.get(optind));
                    rx2 = xmlInputFactory.createXMLEventReader(new FileReader(args.get(optind)));
                    while (rx2.hasNext()) {
                        final XMLEvent evt2 = rx2.peek();
                        if (evt2.isStartElement() && evt2.asStartElement().getName().getLocalPart().equals("Iteration")) {
                            final Iteration iteration = this.unmarshaller.unmarshal(rx2, Iteration.class).getValue();
                            sortingCollection.add(iteration);
                        } else {
                            rx2.nextEvent();
                        }
                    }
                    rx2.close();
                    rx2 = null;
                    LOG.info("close");
                }
                sortingCollection.doneAdding();
                sortingCollection.setDestructiveIteration(true);
                final CloseableIterator<Iteration> coliter = sortingCollection.iterator();
                final EqualRangeIterator<Iteration> eq = new EqualRangeIterator<>(coliter, hitComparator);
                // FIX: iterate on 'eq', not the underlying 'coliter'. EqualRangeIterator
                // reads ahead to build its groups, so coliter.hasNext() can already be
                // false while eq still buffers the LAST group: the previous code
                // silently dropped the final query.
                while (eq.hasNext()) {
                    final List<Iteration> L = eq.next();
                    // merge the hits of every duplicate Iteration into the first one
                    for (int i = 1; i < L.size(); ++i) {
                        L.get(0).getIterationHits().getHit().addAll(L.get(i).getIterationHits().getHit());
                    }
                    marshaller.marshal(L.get(0), wx);
                }
                eq.close();
                coliter.close();
                sortingCollection.cleanup();
                sortingCollection = null;
                wx.add(rx.nextEvent());
                in_iteration = false;
            } else if (in_iteration) {
                // consume whitespace/text between the Iteration elements
                rx.nextEvent();
            } else {
                wx.add(rx.nextEvent());
            }
        }
        wx.flush();
        wx.close();
        wx = null;
        rx.close();
        rx = null;
        return 0;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        // FIX: the finally block was empty. Release the sorting-collection temp
        // files on the error path and close any XML stream still open (closed
        // streams were nulled above, so nothing is closed twice).
        if (sortingCollection != null) {
            sortingCollection.cleanup();
        }
        try { if (rx != null) rx.close(); } catch (final XMLStreamException ignored) { /* best effort */ }
        try { if (rx2 != null) rx2.close(); } catch (final XMLStreamException ignored) { /* best effort */ }
        try { if (wx != null) wx.close(); } catch (final XMLStreamException ignored) { /* best effort */ }
    }
}
Use of com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator in the project jvarkit by lindenb.
Class VcfBurdenSplitter, method doVcfToVcf:
@Override
protected int doVcfToVcf(String inputName, File outorNull) {
    // Splits a VCF into per-key groups (e.g. per gene) chosen by a named Splitter.
    // Variant lines are buffered per contig in a SortingCollection keyed by the
    // split key; at each contig change the buffer is dumped as a series of
    // mini-VCFs (one header+body per key) concatenated on the output stream.
    // Returns RETURN_OK on success, or wrapException(...) on error.
    SortingCollection<KeyAndLine> sortingcollection = null;
    BufferedReader in = null;
    CloseableIterator<KeyAndLine> iter = null;
    PrintStream pw = null;
    PrintWriter allDiscardedLog = null;
    try {
        in = inputName == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(inputName);
        if (this.allFilteredFileOut != null) {
            // optional log of keys whose variants were ALL filtered out
            allDiscardedLog = IOUtils.openFileForPrintWriter(this.allFilteredFileOut);
        }
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
        /**
         * find splitter by name
         */
        Splitter splitter = null;
        for (final Splitter s : this.splitters) {
            if (this.splitterName.equals(s.getName())) {
                splitter = s;
                break;
            }
        }
        if (splitter == null) {
            return wrapException("Cannot find a splitter named " + this.splitterName);
        }
        splitter.initialize(cah.header);
        LOG.info("splitter is " + splitter);
        pw = super.openFileOrStdoutAsPrintStream(outorNull);
        // read variants
        final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(cah.header);
        String prev_contig = null;
        for (; ; ) {
            final String line = in.readLine();
            // variant == null means EOF; a contig change or EOF triggers a dump
            final VariantContext variant = (line == null ? null : progess.watch(cah.codec.decode(line)));
            if (variant == null || !variant.getContig().equals(prev_contig)) {
                if (sortingcollection != null) {
                    sortingcollection.doneAdding();
                    iter = sortingcollection.iterator();
                    LOG.info("dumping data for CONTIG: \"" + prev_contig + "\"");
                    // group the sorted lines by split key
                    final EqualRangeIterator<KeyAndLine> eqiter = new EqualRangeIterator<>(iter, new Comparator<KeyAndLine>() {
                        @Override
                        public int compare(final KeyAndLine o1, final KeyAndLine o2) {
                            return o1.key.compareTo(o2.key);
                        }
                    });
                    while (eqiter.hasNext()) {
                        final List<KeyAndLine> buffer = eqiter.next();
                        final KeyAndLine first = buffer.get(0);
                        LOG.info(first.key);
                        final List<VariantContext> variants = new ArrayList<>(buffer.size());
                        boolean has_only_filtered = true;
                        for (final KeyAndLine kal : buffer) {
                            final VariantContext ctx = cah.codec.decode(kal.ctx);
                            variants.add(ctx);
                            if (isDebuggingVariant(ctx)) {
                                LOG.info("Adding variant to list for key " + kal.key + " " + shortName(ctx));
                            }
                            // sanity check: all buffered lines must belong to the dumped contig
                            if (!ctx.getContig().equals(prev_contig)) {
                                eqiter.close();
                                return wrapException("illegal state");
                            }
                            if (!ctx.isFiltered() || this.acceptFiltered) {
                                has_only_filtered = false;
                                // break; NOOOONNN !!!
                            }
                        }
                        // all ctx are filtered
                        if (has_only_filtered) {
                            LOG.warn("ALL IS FILTERED in " + first.key);
                            if (allDiscardedLog != null) {
                                for (final VariantContext ctx : variants) {
                                    if (isDebuggingVariant(ctx)) {
                                        LOG.info("Variant " + shortName(ctx) + " is part of never filtered for " + first.key);
                                    }
                                    allDiscardedLog.println(String.join("\t", first.key, ctx.getContig(), String.valueOf(ctx.getStart()), ctx.getReference().getDisplayString(), ctx.getAlternateAllele(0).getDisplayString(), String.valueOf(ctx.getFilters())));
                                }
                            }
                            continue;
                        }
                        // save vcf file: one complete mini-VCF per key, appended to pw
                        final VariantContextWriter out = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(pw));
                        final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
                        header2.addMetaDataLine(new VCFHeaderLine(VCF_HEADER_SPLITKEY, first.key));
                        out.writeHeader(header2);
                        for (final VariantContext ctx : variants) {
                            if (isDebuggingVariant(ctx)) {
                                LOG.info("saving variant " + shortName(ctx) + " to final output with key=" + first.key);
                            }
                            out.add(ctx);
                        }
                        // yes because wrapped into IOUtils.encloseableOutputSream
                        out.close();
                        pw.flush();
                    }
                    eqiter.close();
                    iter.close();
                    iter = null;
                    // dispose sorting collection
                    sortingcollection.cleanup();
                    sortingcollection = null;
                }
                // EOF met
                if (variant == null)
                    break;
                prev_contig = variant.getContig();
            }
            if (sortingcollection == null) {
                /* create sorting collection for new contig */
                sortingcollection = SortingCollection.newInstance(KeyAndLine.class, new KeyAndLineCodec(), new KeyAndLineComparator(), this.writingSortingCollection.maxRecordsInRam, this.writingSortingCollection.getTmpPaths());
                sortingcollection.setDestructiveIteration(true);
            }
            if (variant.getAlternateAlleles().size() != 1) {
                return wrapException("Expected only one allele per variant. Please use VcfMultiToOneAllele https://github.com/lindenb/jvarkit/wiki/VcfMultiToOneAllele.");
            }
            // no check for ctx.ifFiltered here, we do this later.
            // a variant may belong to several keys (e.g. overlapping genes)
            for (final String key : splitter.keys(variant)) {
                if (isDebuggingVariant(variant)) {
                    LOG.info("Adding variant with key " + key + " " + shortName(variant));
                }
                sortingcollection.add(new KeyAndLine(key, line));
            }
        }
        progess.finish();
        pw.flush();
        pw.close();
        pw = null;
        if (allDiscardedLog != null) {
            allDiscardedLog.flush();
            allDiscardedLog.close();
            allDiscardedLog = null;
        }
        return RETURN_OK;
    } catch (final Exception err) {
        return wrapException(err);
    } finally {
        CloserUtil.close(iter);
        if (sortingcollection != null)
            sortingcollection.cleanup();
        CloserUtil.close(in);
        CloserUtil.close(pw);
        CloserUtil.close(allDiscardedLog);
    }
}
Use of com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator in the project jvarkit by lindenb.
Class VcfDoest, method run:
private void run(final LineIterator lr, final PrintWriter pw) throws IOException {
    // For each VCF found in the (possibly concatenated) input stream, writes an R
    // script to 'pw': a 'population' data.frame built from the pedigree, then, for
    // each transcript overlapping at least one variant, a 'variants' data.frame,
    // a flat 'genotypes' vector (0=hom-ref, 1=het, 2=hom-var, -9=unknown) and a
    // call to the user-defined R function.
    SortingCollection<TranscriptInfo> sorting = null;
    CloseableIterator<TranscriptInfo> iter2 = null;
    try {
        while (lr.hasNext()) {
            // NOTE(review): 'in' (VcfIterator) is never closed — confirm whether
            // closing it would also close the shared LineIterator.
            VcfIterator in = VCFUtils.createVcfIteratorFromLineIterator(lr, true);
            final VCFHeader header = in.getHeader();
            final Pedigree pedigree = Pedigree.newParser().parse(header);
            if (pedigree.isEmpty()) {
                throw new IOException("No pedigree found in header VCF header. use VcfInjectPedigree to add it");
            }
            // keep only individuals with a defined status (affected or unaffected)
            final SortedSet<Pedigree.Person> individuals = new TreeSet<>();
            for (final Pedigree.Person individual : pedigree.getPersons()) {
                if (individual.isAffected() || individual.isUnaffected()) {
                    individuals.add(individual);
                }
            }
            boolean first = true;
            // emit the R 'population' data.frame (family / name / status columns)
            pw.println("# samples ( 0: unaffected 1:affected)");
            pw.print("population <- data.frame(family=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print("\"" + person.getFamily().getId() + "\"");
                first = false;
            }
            pw.print("),name=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print("\"" + person.getId() + "\"");
                first = false;
            }
            pw.print("),status=c(");
            first = true;
            for (final Pedigree.Person person : individuals) {
                if (!first)
                    pw.print(",");
                pw.print(person.isUnaffected() ? 0 : 1);
                first = false;
            }
            pw.println("))");
            sorting = SortingCollection.newInstance(TranscriptInfo.class, new TranscriptInfoCodec(), new TranscriptInfoCmp(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
            sorting.setDestructiveIteration(true);
            final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
            /**
             * loop over variants
             */
            while (in.hasNext() && !pw.checkError()) {
                final VariantContext ctx = progess.watch(in.next());
                if (ctx.isFiltered())
                    continue;
                if (ctx.getAlternateAlleles().isEmpty())
                    continue;
                // only the most frequent ALT allele is considered
                final Allele altAllele = ctx.getAltAlleleWithHighestAlleleCount();
                final MafCalculator mafCalculator = new MafCalculator(altAllele, ctx.getContig());
                boolean genotyped = false;
                for (final Pedigree.Person p : pedigree.getPersons()) {
                    if (!(p.isAffected() || p.isUnaffected()))
                        continue;
                    final Genotype g = ctx.getGenotype(p.getId());
                    if (g == null)
                        throw new IOException("Strange I cannot find individual " + p + " in the pedigree. Aborting.");
                    if (g.isCalled()) {
                        mafCalculator.add(g, p.isMale());
                    }
                    // keep the variant only if at least one individual carries altAllele
                    if (g.isHet() || g.isHomVar()) {
                        if (!g.getAlleles().contains(altAllele))
                            continue;
                        genotyped = true;
                        break;
                    }
                }
                if (!genotyped)
                    continue;
                final Interval interval = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd());
                final List<KnownGene> genes = this.overlap(interval);
                if (genes.isEmpty())
                    continue;
                // one TranscriptInfo record per overlapping transcript
                for (final KnownGene kg : genes) {
                    final TranscriptInfo trInfo = new TranscriptInfo();
                    trInfo.contig = kg.getContig();
                    trInfo.txStart = kg.getTxStart();
                    trInfo.txEnd = kg.getTxEnd();
                    trInfo.transcriptName = kg.getName();
                    trInfo.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
                    trInfo.exonCount = kg.getExonCount();
                    trInfo.transcriptLength = kg.getTranscriptLength();
                    trInfo.ctxStart = ctx.getStart();
                    trInfo.ref = ctx.getReference();
                    trInfo.alt = altAllele;
                    trInfo.maf = mafCalculator.getMaf();
                    // encode genotypes: 0 hom-ref, 1 het (ref/alt), 2 hom-var, -9 unknown
                    trInfo.genotypes = new byte[individuals.size()];
                    int idx = 0;
                    for (final Pedigree.Person individual : individuals) {
                        final Genotype genotype = ctx.getGenotype(individual.getId());
                        final byte b;
                        if (genotype.isHomRef()) {
                            b = 0;
                        } else if (genotype.isHomVar() && genotype.getAlleles().contains(altAllele)) {
                            b = 2;
                        } else if (genotype.isHet() && genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
                            b = 1;
                        } else /* we treat 0/2 has hom-ref */
                        if (genotype.isHet() && !genotype.getAlleles().contains(altAllele) && genotype.getAlleles().contains(ctx.getReference())) {
                            LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
                            b = 0;
                        } else /* we treat 2/2 has hom-ref */
                        if (genotype.isHomVar() && !genotype.getAlleles().contains(altAllele)) {
                            LOG.warn("Treating " + genotype + " as hom-ref (0) alt=" + altAllele);
                            b = 0;
                        } else {
                            b = -9;
                        }
                        trInfo.genotypes[idx] = b;
                        ++idx;
                    }
                    KnownGene archetype = kg;
                    /* find gene archetype = longest overlapping */
                    for (final KnownGene kg2 : genes) {
                        if (kg2 == kg)
                            continue;
                        if (archetype.getStrand().equals(kg2.getStrand()) && archetype.getTranscriptLength() < kg2.getTranscriptLength()) {
                            archetype = kg2;
                        }
                    }
                    trInfo.archetypeName = archetype.getName();
                    trInfo.archetypeLength = archetype.getTranscriptLength();
                    boolean ctxWasFoundInExon = false;
                    final int ctxPos0 = ctx.getStart() - 1;
                    // 0-based offset of the variant within the spliced transcript
                    int indexInTranscript0 = 0;
                    for (final KnownGene.Exon exon : kg.getExons()) {
                        // variant in exon ?
                        if (!(exon.getStart() > (ctx.getEnd() - 1) || (ctx.getStart() - 1) >= exon.getEnd())) {
                            ctxWasFoundInExon = true;
                            indexInTranscript0 += (ctxPos0 - exon.getStart());
                            if (kg.isNegativeStrand()) {
                                // count from the 3' end on the minus strand
                                indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
                            }
                            trInfo.indexInTranscript0 = indexInTranscript0;
                            trInfo.overlapName = exon.getName();
                            sorting.add(trInfo);
                            break;
                        } else {
                            indexInTranscript0 += (exon.getEnd() - exon.getStart());
                        }
                    }
                    if (ctxWasFoundInExon) {
                        continue;
                    }
                    indexInTranscript0 = 0;
                    // search closest intron/exon junction
                    for (int ex = 0; ex + 1 < kg.getExonCount(); ++ex) {
                        final KnownGene.Exon exon1 = kg.getExon(ex);
                        indexInTranscript0 += (exon1.getEnd() - exon1.getStart());
                        final KnownGene.Exon exon2 = kg.getExon(ex + 1);
                        if (exon1.getEnd() <= ctxPos0 && ctxPos0 < exon2.getStart()) {
                            final int dist_to_exon1 = ctxPos0 - exon1.getEnd();
                            final int dist_to_exon2 = exon2.getStart() - ctxPos0;
                            if (dist_to_exon2 < dist_to_exon1) {
                                // closer to the downstream exon: use its first base
                                indexInTranscript0++;
                            }
                            if (kg.isNegativeStrand()) {
                                indexInTranscript0 = (kg.getTranscriptLength() - 1) - indexInTranscript0;
                            }
                            trInfo.indexInTranscript0 = indexInTranscript0;
                            trInfo.overlapName = exon1.getNextIntron().getName();
                            sorting.add(trInfo);
                            break;
                        }
                    }
                }
                // end loop over genes
            }
            // end while loop over variants
            progess.finish();
            sorting.doneAdding();
            LOG.info("done adding");
            iter2 = sorting.iterator();
            // group the sorted records by (contig, transcript)
            final EqualRangeIterator<TranscriptInfo> eqiter = new EqualRangeIterator<TranscriptInfo>(iter2, new Comparator<TranscriptInfo>() {
                @Override
                public int compare(final TranscriptInfo o1, final TranscriptInfo o2) {
                    int i = o1.contig.compareTo(o2.contig);
                    if (i != 0)
                        return i;
                    i = o1.transcriptName.compareTo(o2.transcriptName);
                    return i;
                }
            });
            while (eqiter.hasNext()) {
                final List<TranscriptInfo> list = eqiter.next();
                final TranscriptInfo front = list.get(0);
                // one R section per transcript: scalar metadata then column vectors
                pw.println("# BEGIN TRANSCRIPT " + front.transcriptName + " ##########################################");
                pw.println("transcript.chrom <- \"" + front.contig + "\"");
                pw.println("transcript.txStart0 <- " + front.txStart + "");
                pw.println("transcript.txEnd0 <- " + front.txEnd + "");
                pw.println("transcript.name <- \"" + front.transcriptName + "\"");
                pw.println("transcript.strand <- \"" + ((char) front.strand) + "\"");
                pw.println("transcript.length <- " + front.transcriptLength + "");
                pw.println("transcript.exonCount <- " + front.exonCount + "");
                pw.println("archetype.name <- \"" + front.archetypeName + "\"");
                pw.println("archetype.length <- " + front.archetypeLength + "");
                pw.print("variants <- data.frame(chrom=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.contig + "\"");
                    first = false;
                }
                pw.print("),chromStart=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.ctxStart);
                    first = false;
                }
                pw.print("),chromEnd=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.ctxStart + v.ref.length() - 1);
                    first = false;
                }
                pw.print("),refAllele=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.ref.getDisplayString() + "\"");
                    first = false;
                }
                pw.print("),altAllele=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.alt.getDisplayString() + "\"");
                    first = false;
                }
                pw.print("),positionInTranscript1=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.indexInTranscript0 + 1);
                    first = false;
                }
                pw.print("),maf=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print(v.maf);
                    first = false;
                }
                pw.print("),overlapName=c(");
                first = true;
                for (final TranscriptInfo v : list) {
                    if (!first)
                        pw.print(",");
                    pw.print("\"" + v.overlapName + "\"");
                    first = false;
                }
                pw.println("))");
                pw.println("# genotypes as a list. Should be a multiple of length(samples).");
                pw.println("# 0 is homref (0/0), 1 is het (0/1), 2 is homvar (1/1)");
                pw.println("# if the variant contains another ALT allele: (0/2) and (2/2) are considered 0 (homref)");
                pw.print("genotypes <- c(");
                first = true;
                for (final TranscriptInfo tr : list) {
                    for (byte g : tr.genotypes) {
                        if (!first)
                            pw.print(",");
                        first = false;
                        pw.print((int) g);
                    }
                }
                pw.println(")");
                pw.println("stopifnot(NROW(variants) * NROW(population) == length(genotypes) )");
                if (this.userDefinedFunName == null || this.userDefinedFunName.trim().isEmpty()) {
                    pw.println("## WARNING not user-defined R function was defined");
                } else {
                    pw.println("# consumme data with user-defined R function ");
                    pw.println(this.userDefinedFunName + "()");
                }
                pw.println("# END TRANSCRIPT " + front.transcriptName + " ##########################################");
            }
            // end while eqiter
            eqiter.close();
            iter2.close();
            iter2 = null;
            sorting.cleanup();
            sorting = null;
        }
    } finally {
        CloserUtil.close(iter2);
        if (sorting != null)
            sorting.cleanup();
    }
}
Use of com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator in the project jvarkit by lindenb.
Class Gff2KnownGene, method doWork:
@Override
public int doWork(final List<String> args) {
    // Converts a GFF/GTF file to the UCSC knownGene tabular format: lines are
    // sorted by (contig, transcript-id), grouped with EqualRangeIterator, and
    // each group is flattened to one knownGene row (exons, CDS bounds, metadata).
    // Returns RETURN_OK on success, -1 on error.
    this.gtfCodec = this.formatChooser.makeCodec();
    LineIterator in = null;
    EqualRangeIterator<GffLine> eq = null;
    CloseableIterator<GffLine> iter = null;
    SortingCollection<GffLine> sorting = null;
    PrintWriter pw = null;
    // feature types (e.g. "transcript,mRNA") that carry the transcript interval
    final Set<String> transcriptIdentifiersSet = new HashSet<>(Arrays.asList(semicolon.split(this.transcriptIdentifiersStr)));
    final GffLineComparator comparator = new GffLineComparator();
    try {
        final String input = oneFileOrNull(args);
        in = (input == null ? IOUtils.openStdinForLineIterator() : IOUtils.openURIForLineIterator(input));
        sorting = SortingCollection.newInstance(GffLine.class, new GffLineCodec(), comparator, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sorting.setDestructiveIteration(true);
        int nRead = 0;
        this.gtfCodec.readActualHeader(in);
        // pass 1: read every feature line and push it into the sorter
        while (in.hasNext()) {
            ++nRead;
            final String line = in.next();
            if (line.isEmpty() || line.startsWith("#"))
                continue;
            final GTFLine delegate = this.gtfCodec.decode(line);
            // gene-level features are not part of any transcript row
            if (delegate.getType().equals("gene")) {
                if (verbose)
                    LOG.info("skipping " + line);
                continue;
            }
            final GffLine gffLine = new GffLine(delegate);
            if (!gffLine.hasTranscript()) {
                if (verbose)
                    LOG.info("skipping " + line);
                continue;
            }
            sorting.add(gffLine);
            if (nRead % 50000 == 0)
                LOG.info("Read " + nRead + " lines. Last: " + line);
        }
        sorting.doneAdding();
        LOG.info("sorting...." + nRead);
        pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        iter = sorting.iterator();
        // pass 2: group sorted lines by (contig, transcript)
        eq = new EqualRangeIterator<>(iter, (o1, o2) -> {
            final int i = o1.getContig().compareTo(o2.getContig());
            if (i != 0)
                return i;
            return o1.getTranscript().compareTo(o2.getTranscript());
        });
        while (eq.hasNext()) {
            final List<GffLine> L = eq.next();
            final GffLine first = L.get(0);
            final String firstContig = first.getContig();
            final String firstTranscriptName = first.getTranscript();
            if (verbose)
                LOG.info("processing " + firstTranscriptName);
            final char strand = first.delegate.getStrand();
            if (!(strand == '+' || strand == '-')) {
                LOG.error("Bad strand in " + first.delegate.getLine());
                return -1;
            }
            final List<Interval> exons = new ArrayList<>();
            Interval mainTranscriptInterval = null;
            final List<Interval> cds = new ArrayList<>();
            // dispatch every feature of the group into exons / CDS / transcript interval
            for (final GffLine item : L) {
                if (!firstContig.equals(item.getContig())) {
                    LOG.error("Conflict in contig!!");
                    return -1;
                }
                if (!firstTranscriptName.equals(item.getTranscript())) {
                    LOG.error("Conflict in name!! " + firstTranscriptName + ":" + item.getTranscript());
                    return -1;
                }
                if (item.delegate.getType().equals("gene")) {
                    if (verbose)
                        LOG.info("ignore line " + item);
                    continue;
                } else if ((transcriptIdentifiersSet.contains(item.delegate.getType()))) {
                    if (mainTranscriptInterval != null && !mainTranscriptInterval.equals(item.interval)) {
                        LOG.error("Transcript found twice for " + firstTranscriptName);
                        return -1;
                    }
                    mainTranscriptInterval = item.interval;
                    continue;
                } else if (item.delegate.getType().equals("exon")) {
                    exons.add(item.interval);
                    continue;
                } else if (item.delegate.getType().equals("CDS")) {
                    cds.add(item.interval);
                    continue;
                } else // UTR , stop_codon, etc...
                {
                    if (verbose)
                        LOG.info("ignore line " + firstTranscriptName + ":" + item.delegate.getType());
                    continue;
                }
            }
            exons.sort((o1, o2) -> o1.getStart() - o2.getStart());
            if (mainTranscriptInterval == null) {
                LOG.warn("main transcript not found for " + firstTranscriptName + " " + first + " available feature type where:" + L.stream().map(T -> T.delegate.getType()).collect(Collectors.toSet()));
                continue;
            }
            // emit one knownGene row; coordinates are converted to 0-based half-open
            if (this.writeBin) {
                pw.print(reg2bin(mainTranscriptInterval.getStart() - 1, mainTranscriptInterval.getEnd()));
                pw.print("\t");
            }
            pw.print(firstTranscriptName);
            pw.print("\t");
            pw.print(firstContig);
            pw.print("\t");
            pw.print(strand);
            pw.print("\t");
            pw.print(mainTranscriptInterval.getStart() - 1);
            pw.print("\t");
            pw.print(mainTranscriptInterval.getEnd());
            pw.print("\t");
            if (cds.isEmpty()) {
                // non-coding: cdsStart == cdsEnd by knownGene convention
                pw.print(mainTranscriptInterval.getStart() - 1);
                pw.print("\t");
                pw.print(mainTranscriptInterval.getStart() - 1);
                pw.print("\t");
            } else {
                int minCds = cds.get(0).getStart();
                int maxCds = cds.get(0).getEnd();
                for (int i = 1; i < cds.size(); ++i) {
                    minCds = Math.min(cds.get(i).getStart(), minCds);
                    maxCds = Math.max(cds.get(i).getEnd(), maxCds);
                }
                pw.print(minCds - 1);
                pw.print("\t");
                pw.print(maxCds);
                pw.print("\t");
            }
            pw.print(exons.size());
            pw.print("\t");
            for (int i = 0; i < exons.size(); ++i) {
                if (i > 0)
                    pw.print(",");
                pw.print(exons.get(i).getStart() - 1);
            }
            pw.print("\t");
            for (int i = 0; i < exons.size(); ++i) {
                if (i > 0)
                    pw.print(",");
                pw.print(exons.get(i).getEnd());
            }
            pw.print("\t");
            // selected GTF attributes are joined with ';' in the proteinID column slot
            for (final Iterator<Map.Entry<String, String>> metainfoiter = first.delegate.iterator(); metainfoiter.hasNext(); ) {
                final Map.Entry<String, String> entry = metainfoiter.next();
                if (entry == null)
                    continue;
                final String s = entry.getKey();
                if (s.equals("gene_id") || s.equals("transcript_type") || s.equals("gene_name") || s.equals("gene_status") || s.equals("gene_type") || s.equals("transcript_id") || s.equals("havana_gene") || s.equals("havana_transcript") || s.equals("transcript_name") || s.equals("protein_id") || s.equals("ccdsid") || s.equals("Parent")) {
                    pw.print(entry.getValue());
                    pw.print(";");
                }
            }
            pw.print("\t");
            pw.print(firstTranscriptName);
            pw.println();
        }
        eq.close();
        iter.close();
        iter = null;
        // NOTE(review): 'sorting' is nulled without calling cleanup(); the finally
        // block only CloserUtil.close()s it, so temp files may be left behind — confirm.
        sorting = null;
        pw.flush();
        pw.close();
        pw = null;
        LOG.info("done");
        return RETURN_OK;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(eq);
        CloserUtil.close(pw);
        CloserUtil.close(in);
        CloserUtil.close(iter);
        CloserUtil.close(sorting);
    }
}
Use of com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator in the project jvarkit by lindenb.
Class GroupByGene, method read:
private void read(final String input) throws IOException {
    // Reads a VCF (file/URI or stdin), groups its variants by gene, and writes a
    // tab-delimited report: one row per gene with positions, per-category carrier
    // counts (cases/controls/males/females), an optional Fisher test, and the
    // per-sample variant counts.
    LineIterator lineiter = null;
    SortingCollection<Call> sortingCollection = null;
    try {
        // optional regex to exclude gene "types" (e.g. pseudogenes)
        final Pattern regexType = (StringUtil.isBlank(this.typeRegexExclude) ? null : Pattern.compile(this.typeRegexExclude));
        lineiter = (input == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(input));
        // sort calls by gene, then by the raw line for a stable order
        sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(), (C1, C2) -> {
            int i = C1.compareTo(C2);
            if (i != 0)
                return i;
            return C1.line.compareTo(C2.line);
        }, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sortingCollection.setDestructiveIteration(true);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(lineiter);
        final VCFHeader header = cah.header;
        this.the_dictionary = header.getSequenceDictionary();
        if (this.the_dictionary == null || this.the_dictionary.isEmpty()) {
            throw new JvarkitException.DictionaryMissing(input);
        }
        this.the_codec = cah.codec;
        final List<String> sampleNames;
        if (header.getSampleNamesInOrder() != null) {
            sampleNames = header.getSampleNamesInOrder();
        } else {
            sampleNames = Collections.emptyList();
        }
        final VcfTools vcfTools = new VcfTools(header);
        // pedigree from an external file if given, else from the VCF header
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = Pedigree.newParser().parse(this.pedigreeFile);
        } else {
            pedigree = Pedigree.newParser().parse(header);
        }
        final Pattern tab = Pattern.compile("[\t]");
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(the_dictionary).logger(LOG);
        while (lineiter.hasNext()) {
            String line = lineiter.next();
            final VariantContext ctx = progress.watch(this.the_codec.decode(line));
            if (!ctx.isVariant())
                continue;
            if (ignore_filtered && ctx.isFiltered())
                continue;
            // simplify line: blank out ID/QUAL/FILTER/INFO to shrink the sorted records
            final String[] tokens = tab.split(line);
            // ID
            tokens[2] = VCFConstants.EMPTY_ID_FIELD;
            // QUAL
            tokens[5] = VCFConstants.MISSING_VALUE_v4;
            // FILTER
            tokens[6] = VCFConstants.UNFILTERED;
            // INFO
            tokens[7] = VCFConstants.EMPTY_INFO_FIELD;
            line = String.join(VCFConstants.FIELD_SEPARATOR, Arrays.asList(tokens));
            // one Call per (gene, variant line); a variant may hit several genes
            for (final GeneName g : getGenes(vcfTools, ctx)) {
                if (regexType != null && regexType.matcher(g.type).matches())
                    continue;
                final Call c = new Call();
                c.line = line;
                c.gene = g;
                sortingCollection.add(c);
            }
        }
        CloserUtil.close(lineiter);
        lineiter = null;
        sortingCollection.doneAdding();
        /**
         * dump
         */
        // sample categories restricted to samples actually present in the VCF
        final Set<String> casesSamples = pedigree.getPersons().stream().filter(P -> P.isAffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> controlsSamples = pedigree.getPersons().stream().filter(P -> P.isUnaffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> maleSamples = pedigree.getPersons().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> femaleSamples = pedigree.getPersons().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        // a genotype "carries" a mutation if called, non-hom-ref and (optionally) unfiltered
        final Predicate<Genotype> genotypeFilter = genotype -> {
            if (!genotype.isAvailable())
                return false;
            if (!genotype.isCalled())
                return false;
            if (genotype.isNoCall())
                return false;
            if (genotype.isHomRef())
                return false;
            if (this.ignore_filtered_genotype && genotype.isFiltered())
                return false;
            return true;
        };
        // NOTE(review): 'pw' is not closed in the finally block; it leaks if an
        // exception occurs during the dump — confirm and consider closing there.
        PrintStream pw = openFileOrStdoutAsPrintStream(this.outFile);
        // header row; pedigree/fisher columns only when the matching category is non-empty
        pw.print("#chrom");
        pw.print('\t');
        pw.print("min.POS");
        pw.print('\t');
        pw.print("max.POS");
        pw.print('\t');
        pw.print("gene.name");
        pw.print('\t');
        pw.print("gene.type");
        pw.print('\t');
        pw.print("samples.affected");
        pw.print('\t');
        pw.print("count.variations");
        if (!casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.cases");
        }
        if (!controlsSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.controls");
        }
        if (!maleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.males");
        }
        if (!femaleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.females");
        }
        if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("fisher");
        }
        for (final String sample : sampleNames) {
            pw.print('\t');
            pw.print(sample);
        }
        pw.println();
        // iterate genes: each equal-range is every Call of one gene
        final CloseableIterator<Call> iter = sortingCollection.iterator();
        final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
        while (eqiter.hasNext()) {
            final List<Call> row = eqiter.next();
            final Call first = row.get(0);
            final List<VariantContext> variantList = row.stream().map(R -> GroupByGene.this.the_codec.decode(R.line)).collect(Collectors.toList());
            final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
            final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
            final Set<String> sampleCarryingMut = new HashSet<String>();
            final Counter<String> pedCasesCarryingMut = new Counter<String>();
            final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
            final Counter<String> malesCarryingMut = new Counter<String>();
            final Counter<String> femalesCarryingMut = new Counter<String>();
            final Counter<String> sample2count = new Counter<String>();
            for (final VariantContext ctx : variantList) {
                for (final Genotype genotype : ctx.getGenotypes()) {
                    if (!genotypeFilter.test(genotype))
                        continue;
                    final String sampleName = genotype.getSampleName();
                    sample2count.incr(sampleName);
                    sampleCarryingMut.add(sampleName);
                    if (casesSamples.contains(sampleName)) {
                        pedCasesCarryingMut.incr(sampleName);
                    }
                    if (controlsSamples.contains(sampleName)) {
                        pedCtrlsCarryingMut.incr(sampleName);
                    }
                    if (maleSamples.contains(sampleName)) {
                        malesCarryingMut.incr(sampleName);
                    }
                    if (femaleSamples.contains(sampleName)) {
                        femalesCarryingMut.incr(sampleName);
                    }
                }
            }
            pw.print(first.getContig());
            pw.print('\t');
            // convert to bed
            pw.print(minPos - 1);
            pw.print('\t');
            pw.print(maxPos);
            pw.print('\t');
            pw.print(first.gene.name);
            pw.print('\t');
            pw.print(first.gene.type);
            pw.print('\t');
            pw.print(sampleCarryingMut.size());
            pw.print('\t');
            pw.print(variantList.size());
            if (!casesSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCasesCarryingMut.getCountCategories());
            }
            if (!controlsSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCtrlsCarryingMut.getCountCategories());
            }
            if (!maleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(malesCarryingMut.getCountCategories());
            }
            if (!femaleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(femalesCarryingMut.getCountCategories());
            }
            if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
                // 2x2 contingency: genotype-level mutated/wild counts for cases vs controls
                int count_case_mut = 0;
                int count_ctrl_mut = 0;
                int count_case_wild = 0;
                int count_ctrl_wild = 0;
                for (final VariantContext ctx : variantList) {
                    for (final Genotype genotype : ctx.getGenotypes()) {
                        final String sampleName = genotype.getSampleName();
                        final boolean has_mutation = genotypeFilter.test(genotype);
                        if (controlsSamples.contains(sampleName)) {
                            if (has_mutation) {
                                count_ctrl_mut++;
                            } else {
                                count_ctrl_wild++;
                            }
                        } else if (casesSamples.contains(sampleName)) {
                            if (has_mutation) {
                                count_case_mut++;
                            } else {
                                count_case_wild++;
                            }
                        }
                    }
                }
                final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
                pw.print('\t');
                pw.print(fisher.getAsDouble());
            }
            for (final String sample : sampleNames) {
                pw.print('\t');
                pw.print(sample2count.count(sample));
            }
            pw.println();
            if (pw.checkError())
                break;
        }
        eqiter.close();
        iter.close();
        pw.flush();
        // stdout is intentionally left open
        if (this.outFile != null)
            pw.close();
    } finally {
        CloserUtil.close(lineiter);
        if (sortingCollection != null)
            sortingCollection.cleanup();
    }
}
Aggregations