Search in sources :

Example 1 with SnpLoc

use of gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo.SnpLoc in project jvarkit by lindenb.

the class Biostar140111 method parseGenotypes.

/**
 * parses NCBI GT output
 */
private void parseGenotypes(XMLEventReader r, OutputStream outStream) throws Exception {
    final Pattern dnaRegex = Pattern.compile("[ATGCatgc]+");
    final QName attInId = new QName("indId");
    Set<String> samples = new TreeSet<>();
    Set<VCFHeaderLine> metaData = new HashSet<>();
    metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
    metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
    metaData.add(new VCFInfoHeaderLine("SnpInfoObserved", 1, VCFHeaderLineType.String, "SnpInfo : Oberved"));
    VCFHeader header = null;
    VariantContextWriter w = null;
    while (r.hasNext()) {
        XMLEvent evt = r.peek();
        if (evt.isStartElement()) {
            StartElement startE = evt.asStartElement();
            String localName = startE.getName().getLocalPart();
            if (localName.equals("SnpInfo")) {
                if (header == null) {
                    header = new VCFHeader(metaData, samples);
                    w = VCFUtils.createVariantContextWriterToOutputStream(outStream);
                    w.writeHeader(header);
                }
                SnpInfo snpInfo = this.unmarshaller.unmarshal(r, SnpInfo.class).getValue();
                if (snpInfo.getSnpLoc().isEmpty()) {
                    LOG.warning("no snploc for rs" + snpInfo.getRsId());
                }
                for (SnpLoc snpLoc : snpInfo.getSnpLoc()) {
                    int chromStart;
                    try {
                        chromStart = Integer.parseInt(snpLoc.getStart());
                    } catch (Exception e) {
                        LOG.warning("bad start in rs" + snpInfo.getRsId() + " " + snpLoc.getStart());
                        continue;
                    }
                    chromStart++;
                    String contigAllele = snpLoc.getContigAllele();
                    if (contigAllele == null || !dnaRegex.matcher(contigAllele).matches()) {
                        LOG.warning("bad contigAllele in rs" + snpInfo.getRsId() + " " + contigAllele);
                        continue;
                    }
                    if (!"fwd".equals(snpLoc.getRsOrientToChrom())) {
                        contigAllele = AcidNucleics.reverseComplement(contigAllele);
                    }
                    Allele ref = Allele.create(contigAllele, true);
                    Map<String, Genotype> sample2genotype = new HashMap<>();
                    for (SsInfo ssinfo : snpInfo.getSsInfo()) {
                        boolean revcomp = !"fwd".equals(snpLoc.getRsOrientToChrom());
                        if (!"fwd".equals(ssinfo.getSsOrientToRs()))
                            revcomp = !revcomp;
                        for (ByPop byPop : ssinfo.getByPop()) {
                            for (GTypeByInd gt : byPop.getGTypeByInd()) {
                                String sample = String.valueOf(gt.getIndId());
                                if (!samples.contains(sample)) {
                                    LOG.warning("Undefined sample:" + sample);
                                    continue;
                                }
                                boolean ok = true;
                                String[] tokens = gt.getGtype().split("[/]");
                                if (tokens.length == 1) {
                                    tokens = new String[] { tokens[0], tokens[0] };
                                } else if (tokens.length != 2) {
                                    LOG.warning("Bad genotypes in sample:" + sample + " " + gt.getGtype());
                                    continue;
                                }
                                List<Allele> sampleAlleles = new ArrayList<>(2);
                                for (int i = 0; i < tokens.length; ++i) {
                                    if (revcomp)
                                        tokens[i] = AcidNucleics.reverseComplement(tokens[i]);
                                    if (!dnaRegex.matcher(tokens[i]).matches()) {
                                        ok = false;
                                        break;
                                    }
                                    sampleAlleles.add(tokens[i].equalsIgnoreCase(contigAllele) ? ref : Allele.create(tokens[i], false));
                                }
                                if (!ok)
                                    continue;
                                GenotypeBuilder gb = new GenotypeBuilder(sample, sampleAlleles);
                                sample2genotype.put(sample, gb.make());
                            }
                        }
                    }
                    Set<Allele> alleles = new HashSet<>();
                    alleles.add(ref);
                    for (String sample : samples) {
                        if (!sample2genotype.containsKey(sample)) {
                            sample2genotype.put(sample, GenotypeBuilder.createMissing(sample, 2));
                        } else {
                            alleles.addAll(sample2genotype.get(sample).getAlleles());
                        }
                    }
                    VariantContextBuilder vcb = new VariantContextBuilder("dbsnp", snpLoc.getChrom(), chromStart, chromStart + ref.getBaseString().length() - 1, alleles);
                    if (snpInfo.getObserved() != null) {
                        vcb.attribute("SnpInfoObserved", VCFUtils.escapeInfoField(snpInfo.getObserved()));
                    }
                    vcb.genotypes(sample2genotype.values());
                    vcb.id("rs" + snpInfo.getRsId());
                    w.add(vcb.make());
                }
            } else if (localName.equals("Individual")) {
                if (header != null)
                    throw new XMLStreamException("Error got " + localName + " after genotypes", evt.getLocation());
                Attribute att = startE.getAttributeByName(attInId);
                if (att == null)
                    throw new XMLStreamException("Cannot get " + attInId, evt.getLocation());
                samples.add(att.getValue());
                // consumme
                r.next();
            } else {
                r.next();
            }
        } else {
            // consumme
            r.next();
        }
    }
    if (w == null)
        throw new IOException("No Genotype was found");
    w.close();
}
Also used : ByPop(gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo.SsInfo.ByPop) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SnpInfo(gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo) HashMap(java.util.HashMap) Attribute(javax.xml.stream.events.Attribute) ArrayList(java.util.ArrayList) GTypeByInd(gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo.SsInfo.ByPop.GTypeByInd) SnpLoc(gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo.SnpLoc) TreeSet(java.util.TreeSet) SsInfo(gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo.SsInfo) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Pattern(java.util.regex.Pattern) QName(javax.xml.namespace.QName) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) IOException(java.io.IOException) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) XMLStreamException(javax.xml.stream.XMLStreamException) IOException(java.io.IOException) StartElement(javax.xml.stream.events.StartElement) Allele(htsjdk.variant.variantcontext.Allele) XMLStreamException(javax.xml.stream.XMLStreamException) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) XMLEvent(javax.xml.stream.events.XMLEvent)

Aggregations

SnpInfo (gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo)1 SnpLoc (gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo.SnpLoc)1 SsInfo (gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo.SsInfo)1 ByPop (gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo.SsInfo.ByPop)1 GTypeByInd (gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo.SsInfo.ByPop.GTypeByInd)1 Allele (htsjdk.variant.variantcontext.Allele)1 Genotype (htsjdk.variant.variantcontext.Genotype)1 GenotypeBuilder (htsjdk.variant.variantcontext.GenotypeBuilder)1 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)1 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)1 VCFFormatHeaderLine (htsjdk.variant.vcf.VCFFormatHeaderLine)1 VCFHeader (htsjdk.variant.vcf.VCFHeader)1 VCFHeaderLine (htsjdk.variant.vcf.VCFHeaderLine)1 VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 TreeSet (java.util.TreeSet)1 Pattern (java.util.regex.Pattern)1