use of gov.nih.nlm.ncbi.dbsnp.gt.SnpInfo.SsInfo.ByPop.GTypeByInd in project jvarkit by lindenb.
the class Biostar140111 method parseGenotypes.
/**
* parses NCBI GT output
*/
private void parseGenotypes(XMLEventReader r, OutputStream outStream) throws Exception {
final Pattern dnaRegex = Pattern.compile("[ATGCatgc]+");
final QName attInId = new QName("indId");
Set<String> samples = new TreeSet<>();
Set<VCFHeaderLine> metaData = new HashSet<>();
metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
metaData.add(new VCFInfoHeaderLine("SnpInfoObserved", 1, VCFHeaderLineType.String, "SnpInfo : Oberved"));
VCFHeader header = null;
VariantContextWriter w = null;
while (r.hasNext()) {
XMLEvent evt = r.peek();
if (evt.isStartElement()) {
StartElement startE = evt.asStartElement();
String localName = startE.getName().getLocalPart();
if (localName.equals("SnpInfo")) {
if (header == null) {
header = new VCFHeader(metaData, samples);
w = VCFUtils.createVariantContextWriterToOutputStream(outStream);
w.writeHeader(header);
}
SnpInfo snpInfo = this.unmarshaller.unmarshal(r, SnpInfo.class).getValue();
if (snpInfo.getSnpLoc().isEmpty()) {
LOG.warning("no snploc for rs" + snpInfo.getRsId());
}
for (SnpLoc snpLoc : snpInfo.getSnpLoc()) {
int chromStart;
try {
chromStart = Integer.parseInt(snpLoc.getStart());
} catch (Exception e) {
LOG.warning("bad start in rs" + snpInfo.getRsId() + " " + snpLoc.getStart());
continue;
}
chromStart++;
String contigAllele = snpLoc.getContigAllele();
if (contigAllele == null || !dnaRegex.matcher(contigAllele).matches()) {
LOG.warning("bad contigAllele in rs" + snpInfo.getRsId() + " " + contigAllele);
continue;
}
if (!"fwd".equals(snpLoc.getRsOrientToChrom())) {
contigAllele = AcidNucleics.reverseComplement(contigAllele);
}
Allele ref = Allele.create(contigAllele, true);
Map<String, Genotype> sample2genotype = new HashMap<>();
for (SsInfo ssinfo : snpInfo.getSsInfo()) {
boolean revcomp = !"fwd".equals(snpLoc.getRsOrientToChrom());
if (!"fwd".equals(ssinfo.getSsOrientToRs()))
revcomp = !revcomp;
for (ByPop byPop : ssinfo.getByPop()) {
for (GTypeByInd gt : byPop.getGTypeByInd()) {
String sample = String.valueOf(gt.getIndId());
if (!samples.contains(sample)) {
LOG.warning("Undefined sample:" + sample);
continue;
}
boolean ok = true;
String[] tokens = gt.getGtype().split("[/]");
if (tokens.length == 1) {
tokens = new String[] { tokens[0], tokens[0] };
} else if (tokens.length != 2) {
LOG.warning("Bad genotypes in sample:" + sample + " " + gt.getGtype());
continue;
}
List<Allele> sampleAlleles = new ArrayList<>(2);
for (int i = 0; i < tokens.length; ++i) {
if (revcomp)
tokens[i] = AcidNucleics.reverseComplement(tokens[i]);
if (!dnaRegex.matcher(tokens[i]).matches()) {
ok = false;
break;
}
sampleAlleles.add(tokens[i].equalsIgnoreCase(contigAllele) ? ref : Allele.create(tokens[i], false));
}
if (!ok)
continue;
GenotypeBuilder gb = new GenotypeBuilder(sample, sampleAlleles);
sample2genotype.put(sample, gb.make());
}
}
}
Set<Allele> alleles = new HashSet<>();
alleles.add(ref);
for (String sample : samples) {
if (!sample2genotype.containsKey(sample)) {
sample2genotype.put(sample, GenotypeBuilder.createMissing(sample, 2));
} else {
alleles.addAll(sample2genotype.get(sample).getAlleles());
}
}
VariantContextBuilder vcb = new VariantContextBuilder("dbsnp", snpLoc.getChrom(), chromStart, chromStart + ref.getBaseString().length() - 1, alleles);
if (snpInfo.getObserved() != null) {
vcb.attribute("SnpInfoObserved", VCFUtils.escapeInfoField(snpInfo.getObserved()));
}
vcb.genotypes(sample2genotype.values());
vcb.id("rs" + snpInfo.getRsId());
w.add(vcb.make());
}
} else if (localName.equals("Individual")) {
if (header != null)
throw new XMLStreamException("Error got " + localName + " after genotypes", evt.getLocation());
Attribute att = startE.getAttributeByName(attInId);
if (att == null)
throw new XMLStreamException("Cannot get " + attInId, evt.getLocation());
samples.add(att.getValue());
// consumme
r.next();
} else {
r.next();
}
} else {
// consumme
r.next();
}
}
if (w == null)
throw new IOException("No Genotype was found");
w.close();
}
Aggregations