use of com.github.lindenb.jvarkit.math.stats.FisherExactTest in project jvarkit by lindenb.
the class GroupByGene method read.
private void read(final String input) throws IOException {
LineIterator lineiter = null;
SortingCollection<Call> sortingCollection = null;
try {
final Pattern regexType = (StringUtil.isBlank(this.typeRegexExclude) ? null : Pattern.compile(this.typeRegexExclude));
lineiter = (input == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(input));
sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(), (C1, C2) -> {
int i = C1.compareTo(C2);
if (i != 0)
return i;
return C1.line.compareTo(C2.line);
}, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingCollection.setDestructiveIteration(true);
final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(lineiter);
final VCFHeader header = cah.header;
this.the_dictionary = header.getSequenceDictionary();
if (this.the_dictionary == null || this.the_dictionary.isEmpty()) {
throw new JvarkitException.DictionaryMissing(input);
}
this.the_codec = cah.codec;
final List<String> sampleNames;
if (header.getSampleNamesInOrder() != null) {
sampleNames = header.getSampleNamesInOrder();
} else {
sampleNames = Collections.emptyList();
}
final VcfTools vcfTools = new VcfTools(header);
final Pedigree pedigree;
if (this.pedigreeFile != null) {
pedigree = Pedigree.newParser().parse(this.pedigreeFile);
} else {
pedigree = Pedigree.newParser().parse(header);
}
final Pattern tab = Pattern.compile("[\t]");
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(the_dictionary).logger(LOG);
while (lineiter.hasNext()) {
String line = lineiter.next();
final VariantContext ctx = progress.watch(this.the_codec.decode(line));
if (!ctx.isVariant())
continue;
if (ignore_filtered && ctx.isFiltered())
continue;
// simplify line
final String[] tokens = tab.split(line);
// ID
tokens[2] = VCFConstants.EMPTY_ID_FIELD;
// QUAL
tokens[5] = VCFConstants.MISSING_VALUE_v4;
// FILTER
tokens[6] = VCFConstants.UNFILTERED;
// INFO
tokens[7] = VCFConstants.EMPTY_INFO_FIELD;
line = String.join(VCFConstants.FIELD_SEPARATOR, Arrays.asList(tokens));
for (final GeneName g : getGenes(vcfTools, ctx)) {
if (regexType != null && regexType.matcher(g.type).matches())
continue;
final Call c = new Call();
c.line = line;
c.gene = g;
sortingCollection.add(c);
}
}
CloserUtil.close(lineiter);
lineiter = null;
sortingCollection.doneAdding();
/**
* dump
*/
final Set<String> casesSamples = pedigree.getPersons().stream().filter(P -> P.isAffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> controlsSamples = pedigree.getPersons().stream().filter(P -> P.isUnaffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> maleSamples = pedigree.getPersons().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> femaleSamples = pedigree.getPersons().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Predicate<Genotype> genotypeFilter = genotype -> {
if (!genotype.isAvailable())
return false;
if (!genotype.isCalled())
return false;
if (genotype.isNoCall())
return false;
if (genotype.isHomRef())
return false;
if (this.ignore_filtered_genotype && genotype.isFiltered())
return false;
return true;
};
PrintStream pw = openFileOrStdoutAsPrintStream(this.outFile);
pw.print("#chrom");
pw.print('\t');
pw.print("min.POS");
pw.print('\t');
pw.print("max.POS");
pw.print('\t');
pw.print("gene.name");
pw.print('\t');
pw.print("gene.type");
pw.print('\t');
pw.print("samples.affected");
pw.print('\t');
pw.print("count.variations");
if (!casesSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.cases");
}
if (!controlsSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.controls");
}
if (!maleSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.males");
}
if (!femaleSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.females");
}
if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
pw.print('\t');
pw.print("fisher");
}
for (final String sample : sampleNames) {
pw.print('\t');
pw.print(sample);
}
pw.println();
final CloseableIterator<Call> iter = sortingCollection.iterator();
final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
while (eqiter.hasNext()) {
final List<Call> row = eqiter.next();
final Call first = row.get(0);
final List<VariantContext> variantList = row.stream().map(R -> GroupByGene.this.the_codec.decode(R.line)).collect(Collectors.toList());
final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
final Set<String> sampleCarryingMut = new HashSet<String>();
final Counter<String> pedCasesCarryingMut = new Counter<String>();
final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
final Counter<String> malesCarryingMut = new Counter<String>();
final Counter<String> femalesCarryingMut = new Counter<String>();
final Counter<String> sample2count = new Counter<String>();
for (final VariantContext ctx : variantList) {
for (final Genotype genotype : ctx.getGenotypes()) {
if (!genotypeFilter.test(genotype))
continue;
final String sampleName = genotype.getSampleName();
sample2count.incr(sampleName);
sampleCarryingMut.add(sampleName);
if (casesSamples.contains(sampleName)) {
pedCasesCarryingMut.incr(sampleName);
}
if (controlsSamples.contains(sampleName)) {
pedCtrlsCarryingMut.incr(sampleName);
}
if (maleSamples.contains(sampleName)) {
malesCarryingMut.incr(sampleName);
}
if (femaleSamples.contains(sampleName)) {
femalesCarryingMut.incr(sampleName);
}
}
}
pw.print(first.getContig());
pw.print('\t');
// convert to bed
pw.print(minPos - 1);
pw.print('\t');
pw.print(maxPos);
pw.print('\t');
pw.print(first.gene.name);
pw.print('\t');
pw.print(first.gene.type);
pw.print('\t');
pw.print(sampleCarryingMut.size());
pw.print('\t');
pw.print(variantList.size());
if (!casesSamples.isEmpty()) {
pw.print('\t');
pw.print(pedCasesCarryingMut.getCountCategories());
}
if (!controlsSamples.isEmpty()) {
pw.print('\t');
pw.print(pedCtrlsCarryingMut.getCountCategories());
}
if (!maleSamples.isEmpty()) {
pw.print('\t');
pw.print(malesCarryingMut.getCountCategories());
}
if (!femaleSamples.isEmpty()) {
pw.print('\t');
pw.print(femalesCarryingMut.getCountCategories());
}
if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
int count_case_mut = 0;
int count_ctrl_mut = 0;
int count_case_wild = 0;
int count_ctrl_wild = 0;
for (final VariantContext ctx : variantList) {
for (final Genotype genotype : ctx.getGenotypes()) {
final String sampleName = genotype.getSampleName();
final boolean has_mutation = genotypeFilter.test(genotype);
if (controlsSamples.contains(sampleName)) {
if (has_mutation) {
count_ctrl_mut++;
} else {
count_ctrl_wild++;
}
} else if (casesSamples.contains(sampleName)) {
if (has_mutation) {
count_case_mut++;
} else {
count_case_wild++;
}
}
}
}
final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
pw.print('\t');
pw.print(fisher.getAsDouble());
}
for (final String sample : sampleNames) {
pw.print('\t');
pw.print(sample2count.count(sample));
}
pw.println();
if (pw.checkError())
break;
}
eqiter.close();
iter.close();
pw.flush();
if (this.outFile != null)
pw.close();
} finally {
CloserUtil.close(lineiter);
if (sortingCollection != null)
sortingCollection.cleanup();
}
}
Aggregations