Use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.
The class NgsFilesScanner, method getFastqSampleInDirectory:
private Counter<String> getFastqSampleInDirectory(File f) {
if (f == null || !f.isDirectory() || !f.exists() || !f.canRead())
return EMPTY_COUNTER;
File[] array = f.listFiles(this.fastqFilter);
if (array == null)
return EMPTY_COUNTER;
Counter<String> fastqSamples = new Counter<String>();
for (File f2 : array) {
FastQName fqName = FastQName.parse(f2);
if (fqName.isValid() && fqName.getSample() != null) {
fastqSamples.incr(fqName.getSample(), f2.length());
}
}
return fastqSamples;
}
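Note how incr(sample, f2.length()) above adds the FASTQ file size to the sample's tally rather than just 1. Below is a minimal, hypothetical sketch of that weighted-counting idiom, using only the Counter methods exercised in this snippet; the class name and sample labels are illustrative, not part of jvarkit.

import com.github.lindenb.jvarkit.util.Counter;

public class WeightedCounterDemo {
    public static void main(final String[] args) {
        // bytes of FASTQ data per sample, as in getFastqSampleInDirectory
        final Counter<String> bytesPerSample = new Counter<String>();
        bytesPerSample.incr("S1", 1000L); // incr(key, n) adds n to the key's running total
        bytesPerSample.incr("S1", 2500L);
        bytesPerSample.incr("S2", 4000L);
        // count(key) returns the accumulated total for that key
        System.out.println(bytesPerSample.count("S1")); // 3500
    }
}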
Use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.
The class GroupByGene, method read:
private void read(final String input) throws IOException {
LineIterator lineiter = null;
SortingCollection<Call> sortingCollection = null;
try {
final Pattern regexType = (StringUtil.isBlank(this.typeRegexExclude) ? null : Pattern.compile(this.typeRegexExclude));
lineiter = (input == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(input));
sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(), (C1, C2) -> {
int i = C1.compareTo(C2);
if (i != 0)
return i;
return C1.line.compareTo(C2.line);
}, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingCollection.setDestructiveIteration(true);
final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(lineiter);
final VCFHeader header = cah.header;
this.the_dictionary = header.getSequenceDictionary();
if (this.the_dictionary == null || this.the_dictionary.isEmpty()) {
throw new JvarkitException.DictionaryMissing(input);
}
this.the_codec = cah.codec;
final List<String> sampleNames;
if (header.getSampleNamesInOrder() != null) {
sampleNames = header.getSampleNamesInOrder();
} else {
sampleNames = Collections.emptyList();
}
final VcfTools vcfTools = new VcfTools(header);
final Pedigree pedigree;
if (this.pedigreeFile != null) {
pedigree = Pedigree.newParser().parse(this.pedigreeFile);
} else {
pedigree = Pedigree.newParser().parse(header);
}
final Pattern tab = Pattern.compile("[\t]");
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(the_dictionary).logger(LOG);
while (lineiter.hasNext()) {
String line = lineiter.next();
final VariantContext ctx = progress.watch(this.the_codec.decode(line));
if (!ctx.isVariant())
continue;
if (ignore_filtered && ctx.isFiltered())
continue;
// simplify line
final String[] tokens = tab.split(line);
// ID
tokens[2] = VCFConstants.EMPTY_ID_FIELD;
// QUAL
tokens[5] = VCFConstants.MISSING_VALUE_v4;
// FILTER
tokens[6] = VCFConstants.UNFILTERED;
// INFO
tokens[7] = VCFConstants.EMPTY_INFO_FIELD;
line = String.join(VCFConstants.FIELD_SEPARATOR, Arrays.asList(tokens));
for (final GeneName g : getGenes(vcfTools, ctx)) {
if (regexType != null && regexType.matcher(g.type).matches())
continue;
final Call c = new Call();
c.line = line;
c.gene = g;
sortingCollection.add(c);
}
}
CloserUtil.close(lineiter);
lineiter = null;
sortingCollection.doneAdding();
/* dump */
final Set<String> casesSamples = pedigree.getPersons().stream().filter(P -> P.isAffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> controlsSamples = pedigree.getPersons().stream().filter(P -> P.isUnaffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> maleSamples = pedigree.getPersons().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> femaleSamples = pedigree.getPersons().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Predicate<Genotype> genotypeFilter = genotype -> {
if (!genotype.isAvailable())
return false;
if (!genotype.isCalled())
return false;
if (genotype.isNoCall())
return false;
if (genotype.isHomRef())
return false;
if (this.ignore_filtered_genotype && genotype.isFiltered())
return false;
return true;
};
PrintStream pw = openFileOrStdoutAsPrintStream(this.outFile);
pw.print("#chrom");
pw.print('\t');
pw.print("min.POS");
pw.print('\t');
pw.print("max.POS");
pw.print('\t');
pw.print("gene.name");
pw.print('\t');
pw.print("gene.type");
pw.print('\t');
pw.print("samples.affected");
pw.print('\t');
pw.print("count.variations");
if (!casesSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.cases");
}
if (!controlsSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.controls");
}
if (!maleSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.males");
}
if (!femaleSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.females");
}
if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
pw.print('\t');
pw.print("fisher");
}
for (final String sample : sampleNames) {
pw.print('\t');
pw.print(sample);
}
pw.println();
final CloseableIterator<Call> iter = sortingCollection.iterator();
final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
while (eqiter.hasNext()) {
final List<Call> row = eqiter.next();
final Call first = row.get(0);
final List<VariantContext> variantList = row.stream().map(R -> GroupByGene.this.the_codec.decode(R.line)).collect(Collectors.toList());
final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
final Set<String> sampleCarryingMut = new HashSet<String>();
final Counter<String> pedCasesCarryingMut = new Counter<String>();
final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
final Counter<String> malesCarryingMut = new Counter<String>();
final Counter<String> femalesCarryingMut = new Counter<String>();
final Counter<String> sample2count = new Counter<String>();
for (final VariantContext ctx : variantList) {
for (final Genotype genotype : ctx.getGenotypes()) {
if (!genotypeFilter.test(genotype))
continue;
final String sampleName = genotype.getSampleName();
sample2count.incr(sampleName);
sampleCarryingMut.add(sampleName);
if (casesSamples.contains(sampleName)) {
pedCasesCarryingMut.incr(sampleName);
}
if (controlsSamples.contains(sampleName)) {
pedCtrlsCarryingMut.incr(sampleName);
}
if (maleSamples.contains(sampleName)) {
malesCarryingMut.incr(sampleName);
}
if (femaleSamples.contains(sampleName)) {
femalesCarryingMut.incr(sampleName);
}
}
}
pw.print(first.getContig());
pw.print('\t');
// convert to bed
pw.print(minPos - 1);
pw.print('\t');
pw.print(maxPos);
pw.print('\t');
pw.print(first.gene.name);
pw.print('\t');
pw.print(first.gene.type);
pw.print('\t');
pw.print(sampleCarryingMut.size());
pw.print('\t');
pw.print(variantList.size());
if (!casesSamples.isEmpty()) {
pw.print('\t');
pw.print(pedCasesCarryingMut.getCountCategories());
}
if (!controlsSamples.isEmpty()) {
pw.print('\t');
pw.print(pedCtrlsCarryingMut.getCountCategories());
}
if (!maleSamples.isEmpty()) {
pw.print('\t');
pw.print(malesCarryingMut.getCountCategories());
}
if (!femaleSamples.isEmpty()) {
pw.print('\t');
pw.print(femalesCarryingMut.getCountCategories());
}
if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
int count_case_mut = 0;
int count_ctrl_mut = 0;
int count_case_wild = 0;
int count_ctrl_wild = 0;
for (final VariantContext ctx : variantList) {
for (final Genotype genotype : ctx.getGenotypes()) {
final String sampleName = genotype.getSampleName();
final boolean has_mutation = genotypeFilter.test(genotype);
if (controlsSamples.contains(sampleName)) {
if (has_mutation) {
count_ctrl_mut++;
} else {
count_ctrl_wild++;
}
} else if (casesSamples.contains(sampleName)) {
if (has_mutation) {
count_case_mut++;
} else {
count_case_wild++;
}
}
}
}
final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
pw.print('\t');
pw.print(fisher.getAsDouble());
}
for (final String sample : sampleNames) {
pw.print('\t');
pw.print(sample2count.count(sample));
}
pw.println();
if (pw.checkError())
break;
}
eqiter.close();
iter.close();
pw.flush();
if (this.outFile != null)
pw.close();
} finally {
CloserUtil.close(lineiter);
if (sortingCollection != null)
sortingCollection.cleanup();
}
}
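In the dump loop above, GroupByGene prints getCountCategories() for each pedigree counter, i.e. the number of distinct samples carrying a mutation, while incr(key)/count(key) track per-sample variant totals. A minimal sketch of that distinction, assuming only the Counter calls shown in this method (names are illustrative):

import com.github.lindenb.jvarkit.util.Counter;

public class CategoryCountDemo {
    public static void main(final String[] args) {
        final Counter<String> carriers = new Counter<String>();
        carriers.incr("sampleA"); // first variant carried by sampleA
        carriers.incr("sampleA"); // second variant carried by sampleA
        carriers.incr("sampleB");
        // distinct keys (2 samples), not the total number of increments (3)
        System.out.println(carriers.getCountCategories());
        // per-key total, as printed in the per-sample columns
        System.out.println(carriers.count("sampleA")); // 2
    }
}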
Use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.
The class MsaToVcf, method doWork:
@Override
public int doWork(final List<String> args) {
VariantContextWriter w = null;
LineIterator r = null;
try {
final String inputName = oneFileOrNull(args);
if (inputName == null) {
LOG.info("Reading from stdin");
r = IOUtils.openStreamForLineIterator(stdin());
} else {
LOG.info("Reading from " + inputName);
r = IOUtils.openURIForLineIterator(inputName);
}
Format format = Format.None;
/* try to guess format */
while (r.hasNext() && format == Format.None) {
final String line = r.peek();
if (line.trim().isEmpty()) {
r.next();
continue;
}
if (line.startsWith("CLUSTAL")) {
format = Format.Clustal;
// consume
r.next();
break;
} else if (line.startsWith(">")) {
format = Format.Fasta;
break;
} else {
LOG.error("MSA format not recognized in " + line);
return -1;
}
}
LOG.info("format : " + format);
/* parse lines as FASTA */
if (Format.Fasta.equals(format)) {
this.consensus = new FastaConsensus();
AlignSequence curr = null;
while (r.hasNext()) {
String line = r.next();
if (line.startsWith(">")) {
curr = new AlignSequence();
curr.name = line.substring(1).trim();
if (sample2sequence.containsKey(curr.name)) {
LOG.error("Sequence ID " + curr.name + " defined twice");
return -1;
}
sample2sequence.put(curr.name, curr);
} else if (curr != null) {
curr.seq.append(line.trim());
this.align_length = Math.max(this.align_length, curr.seq.length());
}
}
/*
//remove heading & trailing '-'
for(final String sample:this.sample2sequence.keySet())
{
final AlignSequence seq = this.sample2sequence.get(sample);
int i=0;
while(i<this.align_length && seq.at(i)==DELETION)
{
seq.seq.setCharAt(i, CLIPPING);
++i;
}
i= this.align_length-1;
while(i>=0 && seq.at(i)==DELETION)
{
seq.seq.setCharAt(i, CLIPPING);
--i;
}
}*/
} else if (Format.Clustal.equals(format)) {
/* parse lines as CLUSTAL */
ClustalConsensus clustalconsensus = new ClustalConsensus();
this.consensus = clustalconsensus;
AlignSequence curr = null;
int columnStart = -1;
while (r.hasNext()) {
String line = r.next();
if (line.trim().isEmpty() || line.startsWith("CLUSTAL W")) {
columnStart = -1;
continue;
}
if (line.charAt(0) == ' ') {
if (columnStart == -1) {
LOG.error("illegal consensus line for " + line);
return -1;
}
/* if consensus doesn't exist in the first rows */
while (clustalconsensus.seq.length() < (this.align_length - (line.length() - columnStart))) {
clustalconsensus.seq.append(" ");
}
clustalconsensus.seq.append(line.substring(columnStart));
} else {
if (columnStart == -1) {
columnStart = line.indexOf(' ');
if (columnStart == -1) {
LOG.error("no whithespace in " + line);
return -1;
}
while (columnStart < line.length() && line.charAt(columnStart) == ' ') {
columnStart++;
}
}
String seqname = line.substring(0, columnStart).trim();
curr = this.sample2sequence.get(seqname);
if (curr == null) {
curr = new AlignSequence();
curr.name = seqname;
this.sample2sequence.put(curr.name, curr);
}
int columnEnd = line.length();
// remove blanks and digit at the end
while (columnEnd - 1 > columnStart && (line.charAt(columnEnd - 1) == ' ' || Character.isDigit(line.charAt(columnEnd - 1)))) {
columnEnd--;
}
curr.seq.append(line.substring(columnStart, columnEnd));
this.align_length = Math.max(align_length, curr.seq.length());
}
}
} else {
LOG.error("Undefined input format");
return -1;
}
CloserUtil.close(r);
/* a consensus sequence name was set */
if (consensusRefName != null) {
AlignSequence namedSequence = null;
if ((namedSequence = sample2sequence.get(consensusRefName)) == null) {
LOG.error("Cannot find consensus sequence \"" + consensusRefName + "\" in list of sequences: " + this.sample2sequence.keySet().toString());
return -1;
}
this.consensus = new NamedConsensus(namedSequence);
}
/* we're done, print VCF */
/* first, print header */
Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
// super.addMetaData(vcfHeaderLines);
Map<String, String> mapping = new HashMap<String, String>();
mapping.put("ID", REF);
mapping.put("length", String.valueOf(this.align_length));
vcfHeaderLines.add(new VCFContigHeaderLine(mapping, 1));
Set<String> samples = new TreeSet<String>(this.sample2sequence.keySet());
VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, samples);
w = super.openVariantContextWriter(this.outputFile);
w.writeHeader(vcfHeader);
/* loop over data, print variants */
int pos1 = 0;
while (pos1 < align_length) {
// is it a real variation or print all sites
boolean is_variation;
if (consensus.at(pos1) == MATCH) {
if (this.printAllSites) {
is_variation = false;
} else {
++pos1;
continue;
}
} else {
is_variation = true;
}
int pos2 = pos1 + 1;
// don't extend if no variation and printAllSites
while (is_variation && pos2 < align_length && consensus.at(pos2) != MATCH) {
++pos2;
}
boolean is_subsitution = (pos1 + 1 == pos2);
// need pos1>0 because ALT contains prev base.
if (is_subsitution && pos1 != 0 && is_variation) {
for (Sequence seq : this.sample2sequence.values()) {
if (seq.at(pos1) == DELETION) {
is_subsitution = false;
break;
}
}
}
Set<Allele> alleles = new HashSet<Allele>();
VariantContextBuilder vcb = new VariantContextBuilder();
List<Genotype> genotypes = new ArrayList<Genotype>(samples.size());
/* longest variant */
String longest = null;
Counter<String> countAlleles = new Counter<String>();
Map<String, String> sample2genotype = new HashMap<String, String>(samples.size());
String namedConsensusRefAllele = "N";
/* loop over the sequences of each sample */
for (String sample : samples) {
Sequence seq = this.sample2sequence.get(sample);
String al = null;
if (is_subsitution) {
if (seq.at(pos1) == CLIPPING)
continue;
al = String.valueOf(seq.at(pos1));
} else {
StringBuilder sb = new StringBuilder(pos2 - pos1);
// yes -1: the allele includes the previous base
for (int i = Math.max(0, pos1 - 1); i < pos2; ++i) {
if (seq.at(i) == CLIPPING)
continue;
if (seq.at(i) == DELETION)
continue;
sb.append(seq.at(i));
}
if (sb.length() == 0)
continue;
al = sb.toString();
}
/* did we find the longest allele? */
if (longest == null || longest.length() < al.length()) {
// reset count of most frequent, we'll use the longest indel or subst
countAlleles = new Counter<String>();
longest = al;
}
countAlleles.incr(al);
sample2genotype.put(sample, al);
/* if consensus sequence name was defined, record this allele for future use */
if (consensusRefName != null && sample.equals(consensusRefName)) {
namedConsensusRefAllele = al;
}
}
if (countAlleles.isEmpty()) {
if (!printAllSites) {
continue;
}
/* not a real variation, just add a dummy 'N' */
countAlleles.incr("N");
}
String refAllStr;
if (consensusRefName == null) {
refAllStr = countAlleles.getMostFrequent();
} else {
refAllStr = namedConsensusRefAllele;
}
final Allele refAllele = Allele.create(refAllStr.replaceAll("[^ATGCatgc]", "N"), true);
alleles.add(refAllele);
/* loop over samples, build each genotype */
for (String sample : sample2genotype.keySet()) {
Allele al = null;
if (!sample2genotype.containsKey(sample)) {
// nothing
} else if (sample2genotype.get(sample).equals(refAllStr)) {
al = refAllele;
} else {
al = Allele.create(sample2genotype.get(sample).replaceAll("[^ATGCatgc]", "N"), false);
alleles.add(al);
}
if (al != null) {
final GenotypeBuilder gb = new GenotypeBuilder(sample);
final List<Allele> sampleAlleles = new ArrayList<Allele>(2);
sampleAlleles.add(al);
if (!haploid)
sampleAlleles.add(al);
gb.alleles(sampleAlleles);
gb.DP(1);
genotypes.add(gb.make());
} else {
genotypes.add(GenotypeBuilder.createMissing(sample, haploid ? 1 : 2));
}
}
// go to 1-based ref if substitution; for an indel we use pos(base)-1
final int start = pos1 + (is_subsitution ? 1 : 0);
vcb.start(start);
vcb.stop(start + (refAllStr.length() - 1));
vcb.chr(REF);
HashMap<String, Object> atts = new HashMap<String, Object>();
atts.put(VCFConstants.DEPTH_KEY, genotypes.size());
vcb.attributes(atts);
vcb.alleles(alleles);
vcb.genotypes(genotypes);
w.add(vcb.make());
pos1 = pos2;
}
w.close();
if (outFasta != null) {
final PrintWriter fasta = super.openFileOrStdoutAsPrintWriter(outFasta);
for (final String sample : samples) {
fasta.println(">" + sample);
final Sequence seq = this.sample2sequence.get(sample);
for (int i = 0; i < align_length; ++i) {
fasta.print(seq.at(i));
}
fasta.println();
}
fasta.println(">CONSENSUS");
for (int i = 0; i < align_length; ++i) {
fasta.print(consensus.at(i));
}
fasta.println();
fasta.flush();
fasta.close();
}
LOG.info("Done");
return RETURN_OK;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(w);
}
}
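doWork picks the REF allele with getMostFrequent() and rebuilds the Counter whenever a longer allele appears, so the majority vote effectively favors the longest indel or substitution. A minimal sketch of that reset-and-vote pattern, with hypothetical allele strings for illustration:

import com.github.lindenb.jvarkit.util.Counter;

public class MostFrequentAlleleDemo {
    public static void main(final String[] args) {
        Counter<String> countAlleles = new Counter<String>();
        String longest = null;
        for (final String al : new String[] { "A", "AT", "AT", "A" }) {
            // reset the tally when a strictly longer allele shows up,
            // as done in MsaToVcf.doWork
            if (longest == null || longest.length() < al.length()) {
                countAlleles = new Counter<String>();
                longest = al;
            }
            countAlleles.incr(al);
        }
        // isEmpty() guards against having seen no allele at all
        if (!countAlleles.isEmpty()) {
            System.out.println("REF candidate: " + countAlleles.getMostFrequent()); // AT
        }
    }
}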
Use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.
The class VCFCompare, method doWork:
@Override
public int doWork(final List<String> args) {
if (args.isEmpty()) {
LOG.error("VCFs missing.");
return -1;
}
if (args.size() != 2) {
System.err.println("Illegal number of arguments. Expected two VCFs");
return -1;
}
PrintWriter pw = null;
XMLStreamWriter w = null;
InputStream in = null;
SortingCollection<LineAndFile> variants = null;
try {
LineAndFileComparator varcmp = new LineAndFileComparator();
variants = SortingCollection.newInstance(LineAndFile.class, new LineAndFileCodec(), varcmp, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
variants.setDestructiveIteration(true);
for (int i = 0; i < 2; ++i) {
this.inputs[i] = new Input();
this.inputs[i].codec = VCFUtils.createDefaultVCFCodec();
this.inputs[i].filename = args.get(i);
LOG.info("Opening " + this.inputs[i].filename);
in = IOUtils.openURIForReading(this.inputs[i].filename);
final LineReader lr = new SynchronousLineReader(in);
final LineIterator li = new LineIteratorImpl(lr);
this.inputs[i].header = (VCFHeader) this.inputs[i].codec.readActualHeader(li);
this.inputs[i].vepPredictionParser = new VepPredictionParserFactory(this.inputs[i].header).get();
this.inputs[i].snpEffPredictionParser = new SnpEffPredictionParserFactory(this.inputs[i].header).get();
this.inputs[i].annPredictionParser = new AnnPredictionParserFactory(this.inputs[i].header).get();
while (li.hasNext()) {
LineAndFile laf = new LineAndFile();
laf.fileIdx = i;
laf.line = li.next();
variants.add(laf);
}
LOG.info("Done Reading " + this.inputs[i].filename);
CloserUtil.close(li);
CloserUtil.close(lr);
CloserUtil.close(in);
}
variants.doneAdding();
LOG.info("Done Adding");
Set<String> commonSamples = new TreeSet<String>(this.inputs[0].header.getSampleNamesInOrder());
commonSamples.retainAll(this.inputs[1].header.getSampleNamesInOrder());
List<Venn0> venn1List = new ArrayList<VCFCompare.Venn0>();
venn1List.add(new Venn1("ALL"));
venn1List.add(new Venn1("having ID") {
@Override
public VariantContext filter(VariantContext ctx, int fileIndex) {
return ctx == null || !ctx.hasID() ? null : ctx;
}
});
venn1List.add(new Venn1("QUAL greater 30") {
@Override
public VariantContext filter(VariantContext ctx, int fileIndex) {
return ctx == null || !ctx.hasLog10PError() || ctx.getPhredScaledQual() < 30.0 ? null : ctx;
}
});
for (VariantContext.Type t : VariantContext.Type.values()) {
venn1List.add(new VennType(t));
}
for (SequenceOntologyTree.Term term : SequenceOntologyTree.getInstance().getTerms()) {
venn1List.add(new VennPred("vep", term) {
@Override
Set<Term> terms(VariantContext ctx, int file_id) {
Set<Term> tt = new HashSet<SequenceOntologyTree.Term>();
for (VepPredictionParser.VepPrediction pred : VCFCompare.this.inputs[file_id].vepPredictionParser.getPredictions(ctx)) {
tt.addAll(pred.getSOTerms());
}
return tt;
}
});
venn1List.add(new VennPred("SnpEff", term) {
@Override
Set<Term> terms(VariantContext ctx, int file_id) {
Set<Term> tt = new HashSet<SequenceOntologyTree.Term>();
for (SnpEffPredictionParser.SnpEffPrediction pred : VCFCompare.this.inputs[file_id].snpEffPredictionParser.getPredictions(ctx)) {
tt.addAll(pred.getSOTerms());
}
return tt;
}
});
venn1List.add(new VennPred("ANN", term) {
@Override
Set<Term> terms(VariantContext ctx, int file_id) {
Set<Term> tt = new HashSet<SequenceOntologyTree.Term>();
for (AnnPredictionParser.AnnPrediction pred : VCFCompare.this.inputs[file_id].annPredictionParser.getPredictions(ctx)) {
tt.addAll(pred.getSOTerms());
}
return tt;
}
});
}
for (String s : commonSamples) {
venn1List.add(new VennGType(s));
}
/* START : digest results ====================== */
Counter<String> diff = new Counter<String>();
List<LineAndFile> row = new ArrayList<LineAndFile>();
CloseableIterator<LineAndFile> iter = variants.iterator();
for (; ; ) {
LineAndFile rec = null;
if (iter.hasNext()) {
rec = iter.next();
}
if (rec == null || (!row.isEmpty() && varcmp.compare(row.get(0), rec) != 0)) {
if (!row.isEmpty()) {
diff.incr("count.variations");
VariantContext[] contexes_init = new VariantContext[] { null, null };
for (LineAndFile var : row) {
if (contexes_init[var.fileIdx] != null) {
LOG.error("Duplicate context in " + inputs[var.fileIdx].filename + " : " + var.line);
continue;
}
contexes_init[var.fileIdx] = var.getContext();
}
for (Venn0 venn : venn1List) {
venn.visit(contexes_init);
}
row.clear();
}
if (rec == null)
break;
}
row.add(rec);
}
iter.close();
/* END : digest results ====================== */
pw = super.openFileOrStdoutAsPrintWriter(outputFile);
XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
w = xmlfactory.createXMLStreamWriter(pw);
w.writeStartElement("html");
w.writeStartElement("body");
/* specific samples */
w.writeStartElement("div");
w.writeStartElement("dl");
for (int i = 0; i < 3; ++i) {
String title;
Set<String> samples;
switch(i) {
case 0:
case 1:
title = "Sample(s) for " + this.inputs[i].filename + ".";
samples = new TreeSet<String>(this.inputs[i].header.getSampleNamesInOrder());
samples.removeAll(commonSamples);
break;
default:
title = "Common Sample(s).";
samples = new TreeSet<String>(commonSamples);
break;
}
w.writeStartElement("dt");
w.writeCharacters(title);
w.writeEndElement();
w.writeStartElement("dd");
w.writeStartElement("ol");
for (String s : samples) {
w.writeStartElement("li");
w.writeCharacters(s);
w.writeEndElement();
}
w.writeEndElement();
w.writeEndElement();
}
// dl
w.writeEndElement();
// div
w.writeEndElement();
for (Venn0 v : venn1List) {
v.write(w);
}
// body
w.writeEndElement();
// html
w.writeEndElement();
w.writeEndDocument();
w.close();
w = null;
pw.flush();
pw.close();
pw = null;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(w);
CloserUtil.close(pw);
if (variants != null)
variants.cleanup();
}
return 0;
}
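Here Counter serves as a plain event tally keyed by a label string: diff.incr("count.variations") bumps a named counter once per merged variant row (the snippet never prints diff, so presumably the full class reports it elsewhere). A minimal sketch of that idiom, with a hypothetical loop standing in for the merge:

import com.github.lindenb.jvarkit.util.Counter;

public class EventTallyDemo {
    public static void main(final String[] args) {
        final Counter<String> diff = new Counter<String>();
        for (int i = 0; i < 3; ++i) {
            // one increment per merged variant row, as in VCFCompare
            diff.incr("count.variations");
        }
        System.out.println(diff.count("count.variations")); // 3
    }
}

Compared with a raw HashMap<String, Long>, the incr/count pair avoids the get-or-default and boxing boilerplate.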
Use of com.github.lindenb.jvarkit.util.Counter in project jvarkit by lindenb.
The class VCFCompareGT, method doWork:
@Override
public int doWork(final List<String> arguments) {
final List<File> inputVcfFiles = new ArrayList<>(IOUtil.unrollFiles(arguments.stream().map(F -> new File(F)).collect(Collectors.toCollection(HashSet::new)), ".vcf", "vcf.gz"));
if (inputVcfFiles.isEmpty()) {
LOG.error("VCF missing.");
return -1;
}
VariantComparator varcmp = new VariantComparator();
SortingCollection<Variant> variants = null;
final Set<String> sampleNames = new LinkedHashSet<>();
try {
variants = SortingCollection.newInstance(Variant.class, new VariantCodec(), varcmp, writingSortingCollection.getMaxRecordsInRam(), writingSortingCollection.getTmpPaths());
variants.setDestructiveIteration(true);
final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
for (int i = 0; i < inputVcfFiles.size(); ++i) {
final File vcfFile = inputVcfFiles.get(i);
LOG.info("Opening " + vcfFile);
final VCFFileReader vcfFileReader = new VCFFileReader(vcfFile, false);
final CloseableIterator<VariantContext> iter = vcfFileReader.iterator();
final VCFHeader header = vcfFileReader.getFileHeader();
sampleNames.addAll(header.getSampleNamesInOrder());
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "_" + ((i) + 1), "File: " + vcfFile.getPath()));
long nLines = 0;
while (iter.hasNext()) {
final VariantContext var = iter.next();
if (nLines++ % 10000 == 0) {
LOG.info(vcfFile + " " + nLines);
}
if (!var.isVariant())
continue;
if (!var.hasGenotypes())
continue;
for (final Genotype genotype : var.getGenotypes()) {
final Variant rec = new Variant();
if (!genotype.isAvailable())
continue;
if (!genotype.isCalled())
continue;
if (genotype.isNoCall())
continue;
rec.file_index = i + 1;
rec.sampleName = genotype.getSampleName();
rec.chrom = var.getContig();
rec.start = var.getStart();
rec.end = var.getEnd();
rec.ref = var.getReference().getDisplayString();
if (var.hasID()) {
rec.id = var.getID();
}
if (genotype.hasDP()) {
rec.dp = genotype.getDP();
}
if (genotype.hasGQ()) {
rec.gq = genotype.getGQ();
}
final List<Allele> alleles = genotype.getAlleles();
if (alleles == null)
continue;
if (alleles.size() == 1) {
rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
rec.a2 = rec.a1;
} else if (alleles.size() == 2) {
rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
rec.a2 = alleles.get(1).getDisplayString().toUpperCase();
if (rec.a1.compareTo(rec.a2) > 0) {
String tmp = rec.a2;
rec.a2 = rec.a1;
rec.a1 = tmp;
}
} else {
continue;
}
variants.add(rec);
}
}
iter.close();
vcfFileReader.close();
}
variants.doneAdding();
LOG.info("Done Adding");
final Set<String> newSampleNames = new HashSet<>();
for (int i = 0; i < inputVcfFiles.size(); ++i) {
for (final String sample : sampleNames) {
newSampleNames.add(sample + "_" + ((i) + 1));
}
}
final String GenpotypeChangedKey = "GCH";
final String GenpotypeCreated = "GNW";
final String GenpotypeDiff = "GDF";
metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Qual"));
metaData.add(new VCFFormatHeaderLine(GenpotypeChangedKey, 1, VCFHeaderLineType.Integer, "Changed Genotype"));
metaData.add(new VCFFormatHeaderLine(GenpotypeCreated, 1, VCFHeaderLineType.Integer, "Genotype Created/Deleted"));
metaData.add(new VCFInfoHeaderLine(GenpotypeDiff, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples with Genotype Difference"));
final VCFHeader header = new VCFHeader(metaData, new ArrayList<String>(newSampleNames));
final VariantContextWriter w = super.openVariantContextWriter(outputFile);
w.writeHeader(header);
final PosComparator posCompare = new PosComparator();
final EqualRangeIterator<Variant> iter = new EqualRangeIterator<>(variants.iterator(), posCompare);
while (iter.hasNext()) {
final List<Variant> row = iter.next();
/* samples whose genotype is not always the same */
final Set<String> samplesModified = new TreeSet<>();
/* samples whose occurrence count differs from the number of VCF files */
final Set<String> samplesCreates = new TreeSet<>();
final Counter<String> samplesSeen = new Counter<>();
for (int x = 0; x < row.size(); ++x) {
final Variant var1 = row.get(x);
samplesSeen.incr(var1.sampleName);
for (int y = x + 1; y < row.size(); ++y) {
final Variant var2 = row.get(y);
if (!var2.sampleName.equals(var1.sampleName))
continue;
if (var1.a1.equals(var2.a1) && var1.a2.equals(var2.a2))
continue;
samplesModified.add(var1.sampleName);
}
}
for (final String sampleName : samplesSeen.keySet()) {
if (samplesSeen.count(sampleName) != inputVcfFiles.size()) {
samplesCreates.add(sampleName);
}
}
final Variant first = row.get(0);
final Set<Allele> alleles = new HashSet<>();
alleles.add(Allele.create(first.ref, true));
for (final Variant var : row) {
alleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
alleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
}
final VariantContextBuilder b = new VariantContextBuilder(getClass().getName(), first.chrom, first.start, first.end, alleles);
// build genotypes
final List<Genotype> genotypes = new ArrayList<Genotype>();
for (final Variant var : row) {
// alleles for this genotype
final List<Allele> galleles = new ArrayList<Allele>();
galleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
galleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
final GenotypeBuilder gb = new GenotypeBuilder();
gb.DP(var.dp);
gb.alleles(galleles);
gb.name(var.sampleName + "_" + var.file_index);
gb.GQ(var.gq);
gb.attribute(GenpotypeChangedKey, samplesModified.contains(var.sampleName) ? 1 : 0);
gb.attribute(GenpotypeCreated, samplesCreates.contains(var.sampleName) ? 1 : 0);
genotypes.add(gb.make());
}
b.genotypes(genotypes);
b.id(first.id);
if (!(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
Set<String> set2 = new TreeSet<String>(samplesModified);
set2.addAll(samplesCreates);
b.attribute(GenpotypeDiff, set2.toArray());
}
if (!only_print_modified || !(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
w.add(b.make());
}
}
iter.close();
w.close();
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
if (variants != null)
try {
variants.cleanup();
} catch (Exception err) {
}
}
return 0;
}
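VCFCompareGT uses samplesSeen to flag genotypes created or deleted between files: keySet() enumerates the observed samples and count(sample) is compared with the number of input VCFs. A minimal sketch of that presence check, with hypothetical sample names and file count:

import com.github.lindenb.jvarkit.util.Counter;
import java.util.HashSet;
import java.util.Set;

public class PresenceCheckDemo {
    public static void main(final String[] args) {
        final int nFiles = 2;
        final Counter<String> samplesSeen = new Counter<String>();
        samplesSeen.incr("S1"); // S1 seen in file 1
        samplesSeen.incr("S1"); // S1 seen in file 2
        samplesSeen.incr("S2"); // S2 seen in file 1 only
        final Set<String> samplesCreates = new HashSet<>();
        for (final String sample : samplesSeen.keySet()) {
            // a sample seen in fewer files than nFiles was created/deleted somewhere
            if (samplesSeen.count(sample) != nFiles) {
                samplesCreates.add(sample);
            }
        }
        System.out.println(samplesCreates); // [S2]
    }
}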