use of htsjdk.variant.variantcontext.GenotypeBuilder in project jvarkit by lindenb.
the class VcfGroupByPopulation method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator vcfIn, VariantContextWriter out) {
try {
Reader r = IOUtils.openFileForBufferedReading(this.mappingFile);
parsePopulationMapping(r);
r.close();
VCFHeader header = vcfIn.getHeader();
Set<String> samplesInVcf = new HashSet<>(header.getSampleNamesInOrder());
this.sample2population.keySet().retainAll(samplesInVcf);
Map<String, Set<String>> population2samples = new HashMap<>();
for (String sample : this.sample2population.keySet()) {
String pop = this.sample2population.get(sample);
Set<String> samples = population2samples.get(pop);
if (samples == null) {
samples = new HashSet<>();
population2samples.put(pop, samples);
}
samples.add(sample);
}
for (String sample : header.getSampleNamesInOrder()) {
if (!this.sample2population.containsKey(sample)) {
throw new IOException("Sample " + sample + " not affected to a population");
}
}
Set<VCFHeaderLine> metaData = new LinkedHashSet<>();
for (VCFHeaderLine vhl : header.getMetaDataInInputOrder()) {
if (!(vhl instanceof VCFContigHeaderLine))
continue;
metaData.add(vhl);
}
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
/* FORMAT */
metaData.add(new VCFFormatHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Total Number of Samples"));
metaData.add(new VCFFormatHeaderLine("R", 1, VCFHeaderLineType.Integer, "Number of alleles REF (hom:=2,het:=1)"));
metaData.add(new VCFFormatHeaderLine("A", 1, VCFHeaderLineType.Integer, "Number of alleles ALT (hom:=2,het:=1)"));
metaData.add(new VCFFormatHeaderLine("UNC", 1, VCFHeaderLineType.Integer, "Number of NON-called samples"));
metaData.add(new VCFFormatHeaderLine("F", 1, VCFHeaderLineType.Float, "Allele Frequency A/(R+A)"));
metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
/* INFO */
metaData.add(new VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Total Number of Samples"));
metaData.add(new VCFInfoHeaderLine("R", 1, VCFHeaderLineType.Integer, "Number of alleles REF (hom:=2,het:=1)"));
metaData.add(new VCFInfoHeaderLine("A", 1, VCFHeaderLineType.Integer, "Number of alleles ALT (hom:=2,het:=1)"));
metaData.add(new VCFInfoHeaderLine("UNC", 1, VCFHeaderLineType.Integer, "Number of NON-called samples"));
metaData.add(new VCFInfoHeaderLine("F", 1, VCFHeaderLineType.Float, "Allele Frequency A/(R+A)"));
metaData.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
VCFHeader h2 = new VCFHeader(metaData, population2samples.keySet());
out.writeHeader(h2);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(vcfIn.getHeader());
while (vcfIn.hasNext()) {
VariantContext ctx = progress.watch(vcfIn.next());
VariantContextBuilder vcb = new VariantContextBuilder(ctx.getSource(), ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
if (ctx.hasID())
vcb.id(ctx.getID());
GCount count_ctx = new GCount();
List<Genotype> genotypes = new ArrayList<>(population2samples.size());
for (String pop : population2samples.keySet()) {
GCount count = new GCount();
Set<String> samples = population2samples.get(pop);
for (String sample : samples) {
Genotype g = ctx.getGenotype(sample);
count.watch(g);
}
GenotypeBuilder gb = new GenotypeBuilder(pop);
gb.attribute("NS", samples.size());
gb.attribute("R", count.R);
gb.attribute("A", count.A);
gb.attribute("UNC", count.uncalled);
if (count.R + count.A > 0) {
gb.attribute("F", (float) count.A / (float) (count.R + count.A));
}
if (count.dp >= 0) {
gb.attribute("DP", count.dp);
if (count_ctx.dp == -1)
count_ctx.dp = 0;
}
genotypes.add(gb.make());
count_ctx.R += count.R;
count_ctx.A += count.A;
count_ctx.uncalled += count.uncalled;
count_ctx.dp += count.dp;
}
vcb.attribute("R", count_ctx.R);
vcb.attribute("A", count_ctx.A);
vcb.attribute("UNC", count_ctx.uncalled);
if (count_ctx.R + count_ctx.A > 0) {
vcb.attribute("F", (float) count_ctx.A / (float) (count_ctx.R + count_ctx.A));
}
if (count_ctx.dp >= 0) {
vcb.attribute("DP", count_ctx.dp);
}
vcb.attribute("NS", this.sample2population.keySet().size());
vcb.genotypes(genotypes);
out.add(vcb.make());
}
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
}
}
use of htsjdk.variant.variantcontext.GenotypeBuilder in project jvarkit by lindenb.
the class MsaToVcf method doWork.
@Override
public int doWork(final List<String> args) {
VariantContextWriter w = null;
LineIterator r = null;
try {
final String inputName = oneFileOrNull(args);
if (inputName == null) {
LOG.info("Reading from stdin");
r = IOUtils.openStreamForLineIterator(stdin());
} else {
LOG.info("Reading from " + inputName);
r = IOUtils.openURIForLineIterator(inputName);
}
Format format = Format.None;
/**
* try to guess format
*/
while (r.hasNext() && format == Format.None) {
final String line = r.peek();
if (line.trim().isEmpty()) {
r.next();
continue;
}
if (line.startsWith("CLUSTAL")) {
format = Format.Clustal;
// consume
r.next();
break;
} else if (line.startsWith(">")) {
format = Format.Fasta;
break;
} else {
LOG.error("MSA format not recognized in " + line);
return -1;
}
}
LOG.info("format : " + format);
/**
* parse lines as FASTA
*/
if (Format.Fasta.equals(format)) {
this.consensus = new FastaConsensus();
AlignSequence curr = null;
while (r.hasNext()) {
String line = r.next();
if (line.startsWith(">")) {
curr = new AlignSequence();
curr.name = line.substring(1).trim();
if (sample2sequence.containsKey(curr.name)) {
LOG.error("Sequence ID " + curr.name + " defined twice");
return -1;
}
sample2sequence.put(curr.name, curr);
} else if (curr != null) {
curr.seq.append(line.trim());
this.align_length = Math.max(this.align_length, curr.seq.length());
}
}
/*
//remove heading & trailing '-'
for(final String sample:this.sample2sequence.keySet())
{
final AlignSequence seq = this.sample2sequence.get(sample);
int i=0;
while(i<this.align_length && seq.at(i)==DELETION)
{
seq.seq.setCharAt(i, CLIPPING);
++i;
}
i= this.align_length-1;
while(i>=0 && seq.at(i)==DELETION)
{
seq.seq.setCharAt(i, CLIPPING);
--i;
}
}*/
} else /**
* parse lines as CLUSTAL
*/
if (Format.Clustal.equals(format)) {
ClustalConsensus clustalconsensus = new ClustalConsensus();
this.consensus = clustalconsensus;
AlignSequence curr = null;
int columnStart = -1;
while (r.hasNext()) {
String line = r.next();
if (line.trim().isEmpty() || line.startsWith("CLUSTAL W")) {
columnStart = -1;
continue;
}
if (line.charAt(0) == ' ') {
if (columnStart == -1) {
LOG.error("illegal consensus line for " + line);
return -1;
}
/* if consensus doesn't exist in the first rows */
while (clustalconsensus.seq.length() < (this.align_length - (line.length() - columnStart))) {
clustalconsensus.seq.append(" ");
}
clustalconsensus.seq.append(line.substring(columnStart));
} else {
if (columnStart == -1) {
columnStart = line.indexOf(' ');
if (columnStart == -1) {
LOG.error("no whithespace in " + line);
return -1;
}
while (columnStart < line.length() && line.charAt(columnStart) == ' ') {
columnStart++;
}
}
String seqname = line.substring(0, columnStart).trim();
curr = this.sample2sequence.get(seqname);
if (curr == null) {
curr = new AlignSequence();
curr.name = seqname;
this.sample2sequence.put(curr.name, curr);
}
int columnEnd = line.length();
// remove blanks and digit at the end
while (columnEnd - 1 > columnStart && (line.charAt(columnEnd - 1) == ' ' || Character.isDigit(line.charAt(columnEnd - 1)))) {
columnEnd--;
}
curr.seq.append(line.substring(columnStart, columnEnd));
this.align_length = Math.max(align_length, curr.seq.length());
}
}
} else {
LOG.error("Undefined input format");
return -1;
}
CloserUtil.close(r);
/* sequence consensus was set*/
if (consensusRefName != null) {
AlignSequence namedSequence = null;
if ((namedSequence = sample2sequence.get(consensusRefName)) == null) {
LOG.error("Cannot find consensus sequence \"" + consensusRefName + "\" in list of sequences: " + this.sample2sequence.keySet().toString());
return -1;
}
this.consensus = new NamedConsensus(namedSequence);
}
/**
* we're done, print VCF
*/
/**
* first, print header
*/
Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
// super.addMetaData(vcfHeaderLines);
Map<String, String> mapping = new HashMap<String, String>();
mapping.put("ID", REF);
mapping.put("length", String.valueOf(this.align_length));
vcfHeaderLines.add(new VCFContigHeaderLine(mapping, 1));
Set<String> samples = new TreeSet<String>(this.sample2sequence.keySet());
VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, samples);
w = super.openVariantContextWriter(this.outputFile);
w.writeHeader(vcfHeader);
/**
* loop over data, print header
*/
int pos1 = 0;
while (pos1 < align_length) {
// is it a real variation or print all sites
boolean is_variation;
if (consensus.at(pos1) == MATCH) {
if (this.printAllSites) {
is_variation = false;
} else {
++pos1;
continue;
}
} else {
is_variation = true;
}
int pos2 = pos1 + 1;
// don't extend if no variation and printAllSites
while (is_variation && pos2 < align_length && consensus.at(pos2) != MATCH) {
++pos2;
}
boolean is_subsitution = (pos1 + 1 == pos2);
if (// need pos1>0 because ALT contains prev base.
is_subsitution && pos1 != 0 && is_variation) {
for (Sequence seq : this.sample2sequence.values()) {
if (seq.at(pos1) == DELETION) {
is_subsitution = false;
break;
}
}
}
Set<Allele> alleles = new HashSet<Allele>();
VariantContextBuilder vcb = new VariantContextBuilder();
List<Genotype> genotypes = new ArrayList<Genotype>(samples.size());
/* longest variant */
String longest = null;
Counter<String> countAlleles = new Counter<String>();
Map<String, String> sample2genotype = new HashMap<String, String>(samples.size());
String namedConsensusRefAllele = "N";
/* loop over the sequences of each seample */
for (String sample : samples) {
Sequence seq = this.sample2sequence.get(sample);
String al = null;
if (is_subsitution) {
if (seq.at(pos1) == CLIPPING)
continue;
al = String.valueOf(seq.at(pos1));
} else {
StringBuilder sb = new StringBuilder(pos2 - pos1);
for (// yes -1
int i = Math.max(0, pos1 - 1); i < pos2; ++i) {
if (seq.at(i) == CLIPPING)
continue;
if (seq.at(i) == DELETION)
continue;
sb.append(seq.at(i));
}
if (sb.length() == 0)
continue;
al = sb.toString();
}
/* did we find the longest allele ?*/
if (longest == null || longest.length() < al.length()) {
// reset count of most frequent, we'll use the longest indel or subst
countAlleles = new Counter<String>();
longest = al;
}
countAlleles.incr(al);
sample2genotype.put(sample, al);
/* if consensus sequence name was defined , record this allele for future use */
if (consensusRefName != null && sample.equals(consensusRefName)) {
namedConsensusRefAllele = al;
}
}
if (countAlleles.isEmpty()) {
if (// printAllSites=false
printAllSites == false) {
continue;
}
/* no a real variation, just add a dummy 'N' */
countAlleles.incr("N");
}
String refAllStr;
if (consensusRefName == null) {
refAllStr = countAlleles.getMostFrequent();
} else {
refAllStr = namedConsensusRefAllele;
}
final Allele refAllele = Allele.create(refAllStr.replaceAll("[^ATGCatgc]", "N"), true);
alleles.add(refAllele);
/* loop over samples, , build each genotype */
for (String sample : sample2genotype.keySet()) {
Allele al = null;
if (!sample2genotype.containsKey(sample)) {
// nothing
} else if (sample2genotype.get(sample).equals(refAllStr)) {
al = refAllele;
} else {
al = Allele.create(sample2genotype.get(sample).replaceAll("[^ATGCatgc]", "N"), false);
alleles.add(al);
}
if (al != null) {
final GenotypeBuilder gb = new GenotypeBuilder(sample);
final List<Allele> sampleAlleles = new ArrayList<Allele>(2);
sampleAlleles.add(al);
if (!haploid)
sampleAlleles.add(al);
gb.alleles(sampleAlleles);
gb.DP(1);
genotypes.add(gb.make());
} else {
genotypes.add(GenotypeBuilder.createMissing(sample, haploid ? 1 : 2));
}
}
// got to 1-based ref if subst, for indel with use pos(base)-1
final int start = pos1 + (is_subsitution ? 1 : 0);
vcb.start(start);
vcb.stop(start + (refAllStr.length() - 1));
vcb.chr(REF);
HashMap<String, Object> atts = new HashMap<String, Object>();
atts.put(VCFConstants.DEPTH_KEY, genotypes.size());
vcb.attributes(atts);
vcb.alleles(alleles);
vcb.genotypes(genotypes);
w.add(vcb.make());
pos1 = pos2;
}
w.close();
if (outFasta != null) {
final PrintWriter fasta = super.openFileOrStdoutAsPrintWriter(outFasta);
for (final String sample : samples) {
fasta.println(">" + sample);
final Sequence seq = this.sample2sequence.get(sample);
for (int i = 0; i < align_length; ++i) {
fasta.print(seq.at(i));
}
fasta.println();
}
fasta.println(">CONSENSUS");
for (int i = 0; i < align_length; ++i) {
fasta.print(consensus.at(i));
}
fasta.println();
fasta.flush();
fasta.close();
}
LOG.info("Done");
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(w);
}
}
use of htsjdk.variant.variantcontext.GenotypeBuilder in project jvarkit by lindenb.
the class NoZeroVariationVCF method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
final VCFHeader header = in.getHeader();
if (in.hasNext()) {
LOG.info("found a variant in VCF.");
VCFUtils.copyHeaderAndVariantsTo(in, out);
} else {
LOG.info("no a variant in VCF. Creating a fake Variant");
header.addMetaDataLine(new VCFFilterHeaderLine("FAKESNP", "Fake SNP created because vcf input was empty."));
VCFFormatHeaderLine gtHeaderLine = header.getFormatHeaderLine(VCFConstants.GENOTYPE_KEY);
if (gtHeaderLine == null) {
LOG.info("Adding GT to header");
header.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
}
gtHeaderLine = header.getFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY);
if (gtHeaderLine == null) {
LOG.info("Adding GQ to header");
header.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality"));
}
out.writeHeader(header);
SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
// choose random chrom, best is 'random' , but not 1...23,X,Y, etc...
String chrom = dict.getSequence(0).getSequenceName();
for (final SAMSequenceRecord ssr : dict.getSequences()) {
String ssn = ssr.getSequenceName();
if (ssn.contains("_")) {
chrom = ssn;
break;
}
}
for (final SAMSequenceRecord ssr : dict.getSequences()) {
String ssn = ssr.getSequenceName();
if (ssn.toLowerCase().contains("random")) {
chrom = ssn;
break;
}
if (ssn.toLowerCase().contains("gl")) {
chrom = ssn;
break;
}
}
GenomicSequence gseq = new GenomicSequence(this.indexedFastaSequenceFile, chrom);
char ref = 'N';
char alt = 'N';
int POS = 0;
for (POS = 0; POS < gseq.length(); ++POS) {
ref = Character.toUpperCase(gseq.charAt(POS));
if (ref == 'N')
continue;
switch(ref) {
case 'A':
alt = 'T';
break;
case 'T':
alt = 'G';
break;
case 'G':
alt = 'C';
break;
case 'C':
alt = 'A';
break;
default:
break;
}
if (alt == 'N')
continue;
break;
}
if (alt == 'N')
throw new RuntimeException("found only N");
VariantContextBuilder vcb = new VariantContextBuilder();
Allele a1 = Allele.create((byte) ref, true);
Allele a2 = Allele.create((byte) alt, false);
List<Allele> la1a2 = new ArrayList<Allele>(2);
List<Genotype> genotypes = new ArrayList<Genotype>(header.getSampleNamesInOrder().size());
la1a2.add(a1);
la1a2.add(a2);
vcb.chr(gseq.getChrom());
vcb.start(POS + 1);
vcb.stop(POS + 1);
vcb.filter("FAKESNP");
vcb.alleles(la1a2);
vcb.log10PError(-0.1);
for (String sample : header.getSampleNamesInOrder()) {
final GenotypeBuilder gb = new GenotypeBuilder(sample);
if (genotypes.isEmpty()) {
gb.DP(1);
gb.GQ(1);
gb.alleles(la1a2);
gb.noAD();
gb.noPL();
genotypes.add(gb.make());
} else {
genotypes.add(GenotypeBuilder.createMissing(sample, 2));
}
}
vcb.genotypes(genotypes);
vcb.noID();
out.add(vcb.make());
}
return 0;
}
use of htsjdk.variant.variantcontext.GenotypeBuilder in project jvarkit by lindenb.
the class VcfRenameSamples method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
final VCFHeader header1 = in.getHeader();
final Set<String> samples1 = new LinkedHashSet<String>(header1.getSampleNamesInOrder());
final List<String> newHeader = new ArrayList<String>(samples1);
for (int i = 0; i < newHeader.size(); ++i) {
final String destName = this.oldNameToNewName.get(newHeader.get(i));
if (destName == null)
continue;
newHeader.set(i, destName);
}
if (newHeader.size() != new HashSet<String>(newHeader).size()) {
throw new RuntimeException("Error in input : there are some diplicates in the resulting new VCF header: " + newHeader);
}
for (final String srcName : this.oldNameToNewName.keySet()) {
if (!samples1.contains(srcName)) {
if (missing_user_name_is_error) {
LOG.error("Source Sample " + srcName + " missing in " + samples1 + ". Use option -E to ignore");
return -1;
} else {
LOG.warning("Missing src-sample:" + srcName);
}
}
}
final VCFHeader header2 = new VCFHeader(header1.getMetaDataInInputOrder(), newHeader);
header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
out.writeHeader(header2);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
while (in.hasNext()) {
final VariantContext ctx = progress.watch(in.next());
final VariantContextBuilder b = new VariantContextBuilder(ctx);
final List<Genotype> genotypes = new ArrayList<Genotype>();
for (final String oldName : samples1) {
Genotype g = ctx.getGenotype(oldName);
final String destName = this.oldNameToNewName.get(oldName);
if (destName != null) {
final GenotypeBuilder gb = new GenotypeBuilder(g);
gb.name(destName);
g = gb.make();
}
genotypes.add(g);
}
b.genotypes(genotypes);
out.add(b.make());
}
progress.finish();
return 0;
}
use of htsjdk.variant.variantcontext.GenotypeBuilder in project jvarkit by lindenb.
the class VCFCompareGT method doWork.
@Override
public int doWork(final List<String> arguments) {
final List<File> inputVcfFiles = new ArrayList<>(IOUtil.unrollFiles(arguments.stream().map(F -> new File(F)).collect(Collectors.toCollection(HashSet::new)), ".vcf", "vcf.gz"));
if (inputVcfFiles.isEmpty()) {
LOG.error("VCF missing.");
return -1;
}
VariantComparator varcmp = new VariantComparator();
SortingCollection<Variant> variants = null;
final Set<String> sampleNames = new LinkedHashSet<>();
try {
variants = SortingCollection.newInstance(Variant.class, new VariantCodec(), varcmp, writingSortingCollection.getMaxRecordsInRam(), writingSortingCollection.getTmpPaths());
variants.setDestructiveIteration(true);
final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
for (int i = 0; i < inputVcfFiles.size(); ++i) {
final File vcfFile = inputVcfFiles.get(i);
LOG.info("Opening " + vcfFile);
final VCFFileReader vcfFileReader = new VCFFileReader(vcfFile, false);
final CloseableIterator<VariantContext> iter = vcfFileReader.iterator();
final VCFHeader header = vcfFileReader.getFileHeader();
sampleNames.addAll(header.getSampleNamesInOrder());
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "_" + ((i) + 1), "File: " + vcfFile.getPath()));
long nLines = 0;
while (iter.hasNext()) {
final VariantContext var = iter.next();
if (nLines++ % 10000 == 0) {
LOG.info(vcfFile + " " + nLines);
}
if (!var.isVariant())
continue;
if (!var.hasGenotypes())
continue;
for (final Genotype genotype : var.getGenotypes()) {
final Variant rec = new Variant();
if (!genotype.isAvailable())
continue;
if (!genotype.isCalled())
continue;
if (genotype.isNoCall())
continue;
rec.file_index = i + 1;
rec.sampleName = genotype.getSampleName();
rec.chrom = var.getContig();
rec.start = var.getStart();
rec.end = var.getEnd();
rec.ref = var.getReference().getDisplayString();
if (var.hasID()) {
rec.id = var.getID();
}
if (genotype.hasDP()) {
rec.dp = genotype.getDP();
}
if (genotype.hasGQ()) {
rec.gq = genotype.getGQ();
}
final List<Allele> alleles = genotype.getAlleles();
if (alleles == null)
continue;
if (alleles.size() == 1) {
rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
rec.a2 = rec.a1;
} else if (alleles.size() == 2) {
rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
rec.a2 = alleles.get(1).getDisplayString().toUpperCase();
if (rec.a1.compareTo(rec.a2) > 0) {
String tmp = rec.a2;
rec.a2 = rec.a1;
rec.a1 = tmp;
}
} else {
continue;
}
variants.add(rec);
}
}
iter.close();
vcfFileReader.close();
}
variants.doneAdding();
LOG.info("Done Adding");
final Set<String> newSampleNames = new HashSet<>();
for (int i = 0; i < inputVcfFiles.size(); ++i) {
for (final String sample : sampleNames) {
newSampleNames.add(sample + "_" + ((i) + 1));
}
}
final String GenpotypeChangedKey = "GCH";
final String GenpotypeCreated = "GNW";
final String GenpotypeDiff = "GDF";
metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Qual"));
metaData.add(new VCFFormatHeaderLine(GenpotypeChangedKey, 1, VCFHeaderLineType.Integer, "Changed Genotype"));
metaData.add(new VCFFormatHeaderLine(GenpotypeCreated, 1, VCFHeaderLineType.Integer, "Genotype Created/Deleted"));
metaData.add(new VCFInfoHeaderLine(GenpotypeDiff, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples with Genotype Difference"));
final VCFHeader header = new VCFHeader(metaData, new ArrayList<String>(newSampleNames));
final VariantContextWriter w = super.openVariantContextWriter(outputFile);
w.writeHeader(header);
final PosComparator posCompare = new PosComparator();
final EqualRangeIterator<Variant> iter = new EqualRangeIterator<>(variants.iterator(), posCompare);
while (iter.hasNext()) {
final List<Variant> row = iter.next();
/**
* this sample is not always the same
*/
final Set<String> samplesModified = new TreeSet<>();
/**
* the number of sample is different from vcflist.size()
*/
final Set<String> samplesCreates = new TreeSet<>();
final Counter<String> samplesSeen = new Counter<>();
for (int x = 0; x < row.size(); ++x) {
final Variant var1 = row.get(x);
samplesSeen.incr(var1.sampleName);
for (int y = x + 1; y < row.size(); ++y) {
final Variant var2 = row.get(y);
if (!var2.sampleName.equals(var1.sampleName))
continue;
if (var1.a1.equals(var2.a1) && var1.a2.equals(var2.a2))
continue;
samplesModified.add(var1.sampleName);
}
}
for (final String sampleName : samplesSeen.keySet()) {
if (samplesSeen.count(sampleName) != inputVcfFiles.size()) {
samplesCreates.add(sampleName);
}
}
final Variant first = row.get(0);
final Set<Allele> alleles = new HashSet<>();
alleles.add(Allele.create(first.ref, true));
for (final Variant var : row) {
alleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
alleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
}
final VariantContextBuilder b = new VariantContextBuilder(getClass().getName(), first.chrom, first.start, first.end, alleles);
// build genotypes
final List<Genotype> genotypes = new ArrayList<Genotype>();
for (final Variant var : row) {
// alleles for this genotype
final List<Allele> galleles = new ArrayList<Allele>();
galleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
galleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
final GenotypeBuilder gb = new GenotypeBuilder();
gb.DP(var.dp);
gb.alleles(galleles);
gb.name(var.sampleName + "_" + var.file_index);
gb.GQ(var.gq);
gb.attribute(GenpotypeChangedKey, samplesModified.contains(var.sampleName) ? 1 : 0);
gb.attribute(GenpotypeCreated, samplesCreates.contains(var.sampleName) ? 1 : 0);
genotypes.add(gb.make());
}
b.genotypes(genotypes);
b.id(first.id);
if (!(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
Set<String> set2 = new TreeSet<String>(samplesModified);
set2.addAll(samplesCreates);
b.attribute(GenpotypeDiff, set2.toArray());
}
if (!only_print_modified || !(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
w.add(b.make());
}
}
iter.close();
w.close();
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
if (variants != null)
try {
variants.cleanup();
} catch (Exception err) {
}
}
return 0;
}
Aggregations