use of com.github.lindenb.jvarkit.util.Pedigree in project jvarkit by lindenb.
the class VcfSkatSlidingWindow method doWork.
@Override
public int doWork(final List<String> args) {
    if (this.nJobs < 1) {
        this.nJobs = Math.max(1, Runtime.getRuntime().availableProcessors());
        LOG.info("setting njobs to " + this.nJobs);
    }
    VcfIterator r = null;
    try {
        final VCFHeader header;
        final SAMSequenceDictionary dict;
        final File vcfFile = new File(oneAndOnlyOneFile(args));
        try (final VCFFileReader vr = new VCFFileReader(vcfFile, true)) {
            header = vr.getFileHeader();
            dict = header.getSequenceDictionary();
        }
        if (dict == null || dict.isEmpty()) {
            throw new JvarkitException.VcfDictionaryMissing(vcfFile);
        }
        if (!this.limit_contigs.isEmpty()) {
            if (this.limit_contigs.stream().anyMatch(C -> dict.getSequence(C) == null)) {
                LOG.error("user contig missing in vcf dictionary.");
                return -1;
            }
        }
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
        } else {
            pedigree = new Pedigree.Parser().parse(header);
        }
        final Set<Pedigree.Person> samples = new HashSet<>(pedigree.getPersons());
        samples.removeIf(I -> !(I.isAffected() || I.isUnaffected()) || !header.getSampleNamesInOrder().contains(I.getId()));
        this.writer = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        final Consumer<SkatCallerResult> writeResult = (R) -> {
            synchronized (this.writer) {
                this.writer.println(R.toString());
            }
        };
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            if (!this.limit_contigs.isEmpty() && !this.limit_contigs.contains(ssr.getSequenceName())) {
                LOG.warning("skipping contig " + ssr.getSequenceName());
                continue;
            }
            LOG.info("contig " + ssr.getSequenceName());
            final ExecutorService executorService = new ThreadPoolExecutor(this.nJobs, this.nJobs, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingDeque<>(this.nJobs));
            final List<Future<Integer>> results = new ArrayList<>(this.nJobs);
            for (int i = 0; i < this.nJobs; i++) {
                final int windowLen = Math.max(1, ssr.getSequenceLength() / this.nJobs);
                final SkatWorker caller = new SkatWorker(vcfFile, new Interval(ssr.getSequenceName(), i * windowLen, Math.min(ssr.getSequenceLength(), (i + 1) * windowLen)), samples, this.skat.build(), writeResult);
                results.add(executorService.submit(caller));
            }
            executorService.shutdown();
            executorService.awaitTermination(365, TimeUnit.DAYS);
            for (final Future<Integer> fl : results) {
                try {
                    if (fl.get() != 0) {
                        LOG.error("An error occurred");
                        return -1;
                    }
                } catch (final Exception err) {
                    LOG.error(err);
                    return -1;
                }
            }
        }
        this.writer.flush();
        this.writer.close();
        this.writer = null;
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
        CloserUtil.close(this.writer);
    }
}
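Pedigree usage here follows a recurring jvarkit pattern: parse a Pedigree either from an explicit PED file or from the VCF header, then keep only affected/unaffected individuals that are actually genotyped in the VCF before handing them to the per-contig SKAT workers. The helper below is a minimal sketch of that selection step; the method name loadCaseControlSamples is hypothetical, while the Pedigree and VCFHeader calls are the ones used above.

// Hypothetical helper (a sketch, not part of jvarkit): build the case/control sample
// set exactly as doWork() does above.
private static Set<Pedigree.Person> loadCaseControlSamples(final File pedFile, final VCFHeader header) throws IOException {
    final Pedigree pedigree = (pedFile != null
            ? new Pedigree.Parser().parse(pedFile)
            : new Pedigree.Parser().parse(header));
    final Set<Pedigree.Person> samples = new HashSet<>(pedigree.getPersons());
    // drop individuals with an unknown status or absent from the VCF sample names
    samples.removeIf(person -> !(person.isAffected() || person.isUnaffected())
            || !header.getSampleNamesInOrder().contains(person.getId()));
    return samples;
}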
use of com.github.lindenb.jvarkit.util.Pedigree in project jvarkit by lindenb.
the class GroupByGene method read.
private void read(final String input) throws IOException {
    LineIterator lineiter = null;
    SortingCollection<Call> sortingCollection = null;
    try {
        final Pattern regexType = (StringUtil.isBlank(this.typeRegexExclude) ? null : Pattern.compile(this.typeRegexExclude));
        lineiter = (input == null ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(input));
        sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(), (C1, C2) -> {
            int i = C1.compareTo(C2);
            if (i != 0)
                return i;
            return C1.line.compareTo(C2.line);
        }, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sortingCollection.setDestructiveIteration(true);
        final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(lineiter);
        final VCFHeader header = cah.header;
        this.the_dictionary = header.getSequenceDictionary();
        if (this.the_dictionary == null || this.the_dictionary.isEmpty()) {
            throw new JvarkitException.DictionaryMissing(input);
        }
        this.the_codec = cah.codec;
        final List<String> sampleNames;
        if (header.getSampleNamesInOrder() != null) {
            sampleNames = header.getSampleNamesInOrder();
        } else {
            sampleNames = Collections.emptyList();
        }
        final VcfTools vcfTools = new VcfTools(header);
        final Pedigree pedigree;
        if (this.pedigreeFile != null) {
            pedigree = Pedigree.newParser().parse(this.pedigreeFile);
        } else {
            pedigree = Pedigree.newParser().parse(header);
        }
        final Pattern tab = Pattern.compile("[\t]");
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(the_dictionary).logger(LOG);
        while (lineiter.hasNext()) {
            String line = lineiter.next();
            final VariantContext ctx = progress.watch(this.the_codec.decode(line));
            if (!ctx.isVariant())
                continue;
            if (ignore_filtered && ctx.isFiltered())
                continue;
            // simplify line
            final String[] tokens = tab.split(line);
            // ID
            tokens[2] = VCFConstants.EMPTY_ID_FIELD;
            // QUAL
            tokens[5] = VCFConstants.MISSING_VALUE_v4;
            // FILTER
            tokens[6] = VCFConstants.UNFILTERED;
            // INFO
            tokens[7] = VCFConstants.EMPTY_INFO_FIELD;
            line = String.join(VCFConstants.FIELD_SEPARATOR, Arrays.asList(tokens));
            for (final GeneName g : getGenes(vcfTools, ctx)) {
                if (regexType != null && regexType.matcher(g.type).matches())
                    continue;
                final Call c = new Call();
                c.line = line;
                c.gene = g;
                sortingCollection.add(c);
            }
        }
        CloserUtil.close(lineiter);
        lineiter = null;
        sortingCollection.doneAdding();
        /* dump */
        final Set<String> casesSamples = pedigree.getPersons().stream().filter(P -> P.isAffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> controlsSamples = pedigree.getPersons().stream().filter(P -> P.isUnaffected()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> maleSamples = pedigree.getPersons().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Set<String> femaleSamples = pedigree.getPersons().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
        final Predicate<Genotype> genotypeFilter = genotype -> {
            if (!genotype.isAvailable())
                return false;
            if (!genotype.isCalled())
                return false;
            if (genotype.isNoCall())
                return false;
            if (genotype.isHomRef())
                return false;
            if (this.ignore_filtered_genotype && genotype.isFiltered())
                return false;
            return true;
        };
        PrintStream pw = openFileOrStdoutAsPrintStream(this.outFile);
        pw.print("#chrom");
        pw.print('\t');
        pw.print("min.POS");
        pw.print('\t');
        pw.print("max.POS");
        pw.print('\t');
        pw.print("gene.name");
        pw.print('\t');
        pw.print("gene.type");
        pw.print('\t');
        pw.print("samples.affected");
        pw.print('\t');
        pw.print("count.variations");
        if (!casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.cases");
        }
        if (!controlsSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.controls");
        }
        if (!maleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.males");
        }
        if (!femaleSamples.isEmpty()) {
            pw.print('\t');
            pw.print("pedigree.females");
        }
        if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
            pw.print('\t');
            pw.print("fisher");
        }
        for (final String sample : sampleNames) {
            pw.print('\t');
            pw.print(sample);
        }
        pw.println();
        final CloseableIterator<Call> iter = sortingCollection.iterator();
        final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
        while (eqiter.hasNext()) {
            final List<Call> row = eqiter.next();
            final Call first = row.get(0);
            final List<VariantContext> variantList = row.stream().map(R -> GroupByGene.this.the_codec.decode(R.line)).collect(Collectors.toList());
            final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
            final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
            final Set<String> sampleCarryingMut = new HashSet<String>();
            final Counter<String> pedCasesCarryingMut = new Counter<String>();
            final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
            final Counter<String> malesCarryingMut = new Counter<String>();
            final Counter<String> femalesCarryingMut = new Counter<String>();
            final Counter<String> sample2count = new Counter<String>();
            for (final VariantContext ctx : variantList) {
                for (final Genotype genotype : ctx.getGenotypes()) {
                    if (!genotypeFilter.test(genotype))
                        continue;
                    final String sampleName = genotype.getSampleName();
                    sample2count.incr(sampleName);
                    sampleCarryingMut.add(sampleName);
                    if (casesSamples.contains(sampleName)) {
                        pedCasesCarryingMut.incr(sampleName);
                    }
                    if (controlsSamples.contains(sampleName)) {
                        pedCtrlsCarryingMut.incr(sampleName);
                    }
                    if (maleSamples.contains(sampleName)) {
                        malesCarryingMut.incr(sampleName);
                    }
                    if (femaleSamples.contains(sampleName)) {
                        femalesCarryingMut.incr(sampleName);
                    }
                }
            }
            pw.print(first.getContig());
            pw.print('\t');
            // convert to bed
            pw.print(minPos - 1);
            pw.print('\t');
            pw.print(maxPos);
            pw.print('\t');
            pw.print(first.gene.name);
            pw.print('\t');
            pw.print(first.gene.type);
            pw.print('\t');
            pw.print(sampleCarryingMut.size());
            pw.print('\t');
            pw.print(variantList.size());
            if (!casesSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCasesCarryingMut.getCountCategories());
            }
            if (!controlsSamples.isEmpty()) {
                pw.print('\t');
                pw.print(pedCtrlsCarryingMut.getCountCategories());
            }
            if (!maleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(malesCarryingMut.getCountCategories());
            }
            if (!femaleSamples.isEmpty()) {
                pw.print('\t');
                pw.print(femalesCarryingMut.getCountCategories());
            }
            if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
                int count_case_mut = 0;
                int count_ctrl_mut = 0;
                int count_case_wild = 0;
                int count_ctrl_wild = 0;
                for (final VariantContext ctx : variantList) {
                    for (final Genotype genotype : ctx.getGenotypes()) {
                        final String sampleName = genotype.getSampleName();
                        final boolean has_mutation = genotypeFilter.test(genotype);
                        if (controlsSamples.contains(sampleName)) {
                            if (has_mutation) {
                                count_ctrl_mut++;
                            } else {
                                count_ctrl_wild++;
                            }
                        } else if (casesSamples.contains(sampleName)) {
                            if (has_mutation) {
                                count_case_mut++;
                            } else {
                                count_case_wild++;
                            }
                        }
                    }
                }
                final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
                pw.print('\t');
                pw.print(fisher.getAsDouble());
            }
            for (final String sample : sampleNames) {
                pw.print('\t');
                pw.print(sample2count.count(sample));
            }
            pw.println();
            if (pw.checkError())
                break;
        }
        eqiter.close();
        iter.close();
        pw.flush();
        if (this.outFile != null)
            pw.close();
    } finally {
        CloserUtil.close(lineiter);
        if (sortingCollection != null)
            sortingCollection.cleanup();
    }
}
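The four pedigree-derived sample sets above (cases, controls, males, females) differ only in the Pedigree.Person predicate, and each is restricted to samples present in the VCF so that the later per-gene counters stay consistent with the genotype columns. A hedged sketch of that extraction, using a hypothetical selectSamples helper; the Pedigree calls are those used in read():

// Hypothetical helper (sketch): one routine for the four sample subsets built above.
private static Set<String> selectSamples(final Pedigree pedigree,
        final List<String> vcfSamples,
        final Predicate<Pedigree.Person> predicate) {
    return pedigree.getPersons().stream()
            .filter(predicate)
            .map(Pedigree.Person::getId)
            .filter(vcfSamples::contains) // keep only samples genotyped in the VCF
            .collect(Collectors.toSet());
}

// Usage mirroring read():
// final Set<String> casesSamples    = selectSamples(pedigree, sampleNames, Pedigree.Person::isAffected);
// final Set<String> controlsSamples = selectSamples(pedigree, sampleNames, Pedigree.Person::isUnaffected);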
use of com.github.lindenb.jvarkit.util.Pedigree in project jvarkit by lindenb.
the class VcfToRdf method writeHeader.
private void writeHeader(final VCFHeader header, final URI source) {
    if (source != null) {
        emit(source, "rdf:type", "vcf:File");
    }
    final SAMSequenceDictionary dict = header.getSequenceDictionary();
    if (dict != null) {
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            emit(URI.create("urn:chrom/" + ssr.getSequenceName()), "rdf:type", "vcf:Chromosome", "dc:title", ssr.getSequenceName(), "vcf:length", ssr.getSequenceLength(), "vcf:sequenceIndex", ssr.getSequenceIndex());
        }
    }
    for (final VCFFilterHeaderLine h : header.getFilterLines()) {
        emit(URI.create("urn:filter/" + h.getID()), "rdf:type", "vcf:Filter", "dc:title", h.getID(), "dc:description", h.getValue());
    }
    final Pedigree pedigree = Pedigree.newParser().parse(header);
    for (final Pedigree.Person pe : pedigree.getPersons()) {
        final URI sample = URI.create("urn:sample/" + pe.getId());
        emit(sample, "rdf:type", "foaf:Person", "foaf:name", pe.getId(), "foaf:family", pe.getFamily().getId());
        if (pe.isMale())
            emit(sample, "idt:gender", "male");
        if (pe.isFemale())
            emit(sample, "idt:gender", "female");
        if (pe.isAffected())
            emit(sample, "idt:status", "affected");
        if (pe.isUnaffected())
            emit(sample, "idt:status", "unaffected");
    }
    // Sample
    for (final String sample : header.getSampleNamesInOrder()) {
        emit(URI.create("urn:sample/" + sample), "rdf:type", "vcf:Sample", "dc:title", sample);
    }
}
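writeHeader emits one RDF resource per pedigree member, adding idt:gender and idt:status triples only when the sex or status is known. A minimal sketch of how those literal values could be derived from a Pedigree.Person; the helper names are hypothetical, while the getters are those used above:

// Hypothetical helpers (sketch): map a pedigree member to the literals emitted above.
private static String genderLiteral(final Pedigree.Person pe) {
    if (pe.isMale()) return "male";
    if (pe.isFemale()) return "female";
    return null; // unknown sex: no idt:gender triple is written
}

private static String statusLiteral(final Pedigree.Person pe) {
    if (pe.isAffected()) return "affected";
    if (pe.isUnaffected()) return "unaffected";
    return null; // unknown status: no idt:status triple is written
}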
use of com.github.lindenb.jvarkit.util.Pedigree in project jvarkit by lindenb.
the class VcfOptimizePedForSkat method doWork.
@Override
public int doWork(final List<String> args) {
    VcfIterator r = null;
    try {
        this.random = new Random(this.seed == -1 ? System.currentTimeMillis() : seed);
        if (this.nSamplesRemove < 1) {
            LOG.error("bad number of samples to remove.");
            return -1;
        }
        final SkatFactory.SkatExecutor executor = this.skatInstance.build();
        r = super.openVcfIterator(oneFileOrNull(args));
        final List<VariantContext> variants = new ArrayList<>();
        while (r.hasNext()) {
            final VariantContext ctx = r.next();
            if (!executor.getUpstreamVariantFilter().test(ctx))
                continue;
            variants.add(ctx);
        }
        LOG.info("number of variants : " + variants.size());
        if (variants.stream().map(V -> V.getContig()).collect(Collectors.toSet()).size() != 1) {
            LOG.error("multiple contig/chromosome in input.");
            return -1;
        }
        final VCFHeader header = r.getHeader();
        final Set<String> sampleNames = new HashSet<>(header.getSampleNamesInOrder());
        if (this.pedigreeFile != null) {
            this.pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
        } else {
            this.pedigree = new Pedigree.Parser().parse(header);
        }
        r.close();
        r = null;
        final List<Pedigree.Person> samples = this.pedigree.getPersons().stream().filter(P -> (P.isAffected() || P.isUnaffected())).filter(S -> sampleNames.contains(S.getId())).collect(Collectors.toList());
        if (samples.isEmpty()) {
            LOG.error("Not enough Samples in pedigree/vcf");
            return -1;
        }
        if (!samples.stream().anyMatch(P -> P.isAffected())) {
            LOG.error("No affected Samples in pedigree/vcf");
            return -1;
        }
        if (!samples.stream().anyMatch(P -> P.isUnaffected())) {
            LOG.error("No unaffected Samples in pedigree/vcf");
            return -1;
        }
        long nIter = 0L;
        while (max_iterations == -1L || nIter < max_iterations) {
            exec(nIter, header, variants, samples, executor);
            ++nIter;
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
    }
}
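Before starting the optimization loop, doWork checks that the pedigree restricted to the VCF samples still contains at least one affected and one unaffected individual. The sketch below isolates that sanity check; the helper name is hypothetical, while the Pedigree.Person calls are those used above.

// Hypothetical helper (sketch): the case/control sanity checks performed above.
private static boolean hasCasesAndControls(final List<Pedigree.Person> samples) {
    if (samples.isEmpty())
        return false; // no usable sample at all
    if (samples.stream().noneMatch(P -> P.isAffected()))
        return false; // no case
    if (samples.stream().noneMatch(P -> P.isUnaffected()))
        return false; // no control
    return true;
}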
use of com.github.lindenb.jvarkit.util.Pedigree in project jvarkit by lindenb.
the class VCFFilterJS method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator r, final VariantContextWriter w) {
    try {
        final VCFHeader header = r.getHeader();
        final VcfTools vcfTools = new VcfTools(header);
        final VCFHeader h2 = new VCFHeader(header);
        addMetaData(h2);
        final Pedigree pedigree;
        if (pedigreeFile != null) {
            pedigree = Pedigree.newParser().parse(this.pedigreeFile);
        } else {
            // try to read the pedigree from the VCF header
            Pedigree p = null;
            try {
                p = Pedigree.newParser().parse(header);
            } catch (final Exception err) {
                LOG.warn("cannot decode pedigree from vcf header");
                p = Pedigree.createEmptyPedigree();
            }
            pedigree = p;
        }
        final VCFFilterHeaderLine filterHeaderLine = (filteredTag.trim().isEmpty() ? null : new VCFFilterHeaderLine(this.filteredTag.trim(), "Filtered with " + getProgramName()));
        if (filterHeaderLine != null)
            h2.addMetaDataLine(filterHeaderLine);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
        final Bindings bindings = this.compiledScript.getEngine().createBindings();
        bindings.put("header", header);
        bindings.put("tools", vcfTools);
        bindings.put("pedigree", pedigree);
        bindings.put("individuals", Collections.unmodifiableList(pedigree.getPersons().stream().filter(P -> (P.isAffected() || P.isUnaffected())).filter(P -> P.hasUniqId()).filter(P -> header.getSampleNamesInOrder().contains(P.getId())).collect(Collectors.toList())));
        for (final String jsonkv : this.jsonFiles) {
            int eq = jsonkv.indexOf("=");
            if (eq <= 0)
                throw new JvarkitException.UserError("Bad format for json. Expected key=/path/to/file.json but got '" + jsonkv + "'");
            final String key = jsonkv.substring(0, eq);
            final FileReader jsonFile = new FileReader(jsonkv.substring(eq + 1));
            JsonParser jsonParser = new JsonParser();
            final JsonElement root = jsonParser.parse(jsonFile);
            jsonFile.close();
            bindings.put(key, root);
        }
        w.writeHeader(h2);
        while (r.hasNext() && !w.checkError()) {
            final VariantContext variation = progress.watch(r.next());
            bindings.put("variant", variation);
            final Object result = compiledScript.eval(bindings);
            if (result != null && (result.getClass().isArray() || (result instanceof Collection))) {
                // result is an array or a collection of variants
                final Collection<?> col;
                if (result.getClass().isArray()) {
                    final Object[] array = (Object[]) result;
                    col = Arrays.asList(array);
                } else {
                    col = (Collection<?>) result;
                }
                // write all the variants
                for (final Object item : col) {
                    if (item == null)
                        throw new JvarkitException.UserError("item in array is null");
                    if (!(item instanceof VariantContext))
                        throw new JvarkitException.UserError("item in array is not a VariantContext " + item.getClass());
                    w.add(VariantContext.class.cast(item));
                }
            } else if (result != null && (result instanceof VariantContext)) {
                // result is a single VariantContext
                w.add(VariantContext.class.cast(result));
            } else {
                boolean accept = true;
                if (result == null) {
                    accept = false;
                } else if (result instanceof Boolean) {
                    if (Boolean.FALSE.equals(result))
                        accept = false;
                } else if (result instanceof Number) {
                    if (((Number) result).intValue() != 1)
                        accept = false;
                } else {
                    LOG.warn("Script returned something that is not a boolean or a number: " + result.getClass());
                    accept = false;
                }
                if (!accept) {
                    if (filterHeaderLine != null) {
                        final VariantContextBuilder vcb = new VariantContextBuilder(variation);
                        vcb.filter(filterHeaderLine.getID());
                        w.add(vcb.make());
                    }
                    continue;
                }
                // set PASS filter if needed
                if (filterHeaderLine != null && !variation.isFiltered()) {
                    w.add(new VariantContextBuilder(variation).passFilters().make());
                    continue;
                }
                w.add(variation);
            }
        }
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
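Here the pedigree is parsed from a PED file when one is given, otherwise from the VCF header, and a header without pedigree metadata falls back to an empty pedigree instead of failing. A minimal sketch of that fallback, using a hypothetical parsePedigreeOrEmpty name; the Pedigree calls and the log message are those used above.

// Hypothetical helper (sketch): pedigree parsing with the same fallback as doVcfToVcf().
private Pedigree parsePedigreeOrEmpty(final File pedFile, final VCFHeader header) throws IOException {
    if (pedFile != null) {
        return Pedigree.newParser().parse(pedFile);
    }
    try {
        return Pedigree.newParser().parse(header);
    } catch (final Exception err) {
        LOG.warn("cannot decode pedigree from vcf header");
        return Pedigree.createEmptyPedigree();
    }
}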