Usage of com.github.lindenb.jvarkit.pedigree.Sample in the project jvarkit by lindenb:
the class VcfBurdenFisherH, method doVcfToVcf.
/**
 * Annotates each variant with a per-ALT-allele Fisher exact test (CASE vs CONTROL,
 * carriers vs non-carriers of the ALT). Depending on the configuration, variants whose
 * fisher value falls outside [min_fisher ; max_fisher] are soft-FILTERed or dropped,
 * and in-range alleles may be exported to a BED report.
 *
 * @param inputName name of the VCF input (kept for the base-class contract, unused here)
 * @param r iterator over the input variants
 * @param w writer receiving the annotated variants
 * @return 0 on success
 */
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator r, final VariantContextWriter w) {
	// optional BED report of in-range alleles; a null sink when not requested
	final PrintWriter report;
	if (this.bedExportPath == null) {
		report = new PrintWriter(new NullOuputStream());
	} else {
		report = new PrintWriter(IOUtil.openFileForBufferedUtf8Writing(this.bedExportPath));
	}
	final VCFHeader header = r.getHeader();
	final VCFHeader h2 = new VCFHeader(header);
	final Pedigree pedigree;
	try {
		pedigree = new PedigreeParser().parse(this.pedigreeFile);
	} catch (final IOException error) {
		throw new RuntimeIOException(error);
	}
	// keep only pedigree samples that are present in the VCF header AND have a case/control status
	final Set<Sample> individualSet = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isStatusSet()).collect(Collectors.toSet());
	if (individualSet.isEmpty())
		throw new IllegalArgumentException("No overlapping samples between header and pedigree.");
	final VCFInfoHeaderLine fisherAlleleInfoHeader = new VCFInfoHeaderLine(this.burdenHFisherTag, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Fisher Exact Test Case/Control.");
	// optional soft FILTER for out-of-range fisher values; null => hard-filter mode (variants dropped)
	final VCFFilterHeaderLine variantFilterHeader;
	if (StringUtils.isBlank(this.softFilterName)) {
		variantFilterHeader = null;
	} else {
		variantFilterHeader = new VCFFilterHeaderLine(this.softFilterName, "Fisher case:control vs miss|have ALT not between " + this.min_fisher + " and " + this.max_fisher);
		h2.addMetaDataLine(variantFilterHeader);
	}
	// optional FILTER set when controls carry the ALT proportionally more often than cases
	final VCFFilterHeaderLine filterCtrlgtCaseRatio;
	if (StringUtils.isBlank(this.filterCtrlgtCaseRatioStr)) {
		filterCtrlgtCaseRatio = null;
	} else {
		// FIX: typo "creater" -> "greater" in the FILTER description
		filterCtrlgtCaseRatio = new VCFFilterHeaderLine(this.filterCtrlgtCaseRatioStr, "The number of CONTROLS carrying the ALT allele is greater than the number of CASES carrying the ALT allele.");
		h2.addMetaDataLine(filterCtrlgtCaseRatio);
	}
	final VCFInfoHeaderLine fisherDetailInfoHeader = new VCFInfoHeaderLine(this.burdenHFisherTag + "Detail", VCFHeaderLineCount.A, VCFHeaderLineType.String, "Fisher Exact Test Case/Control");
	h2.addMetaDataLine(fisherAlleleInfoHeader);
	h2.addMetaDataLine(fisherDetailInfoHeader);
	JVarkitVersion.getInstance().addMetaData(this, h2);
	w.writeHeader(h2);
	while (r.hasNext()) {
		final VariantContext ctx = r.next();
		final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
		if (this.overwrite_qual)
			vcb.log10PError(VariantContext.NO_LOG10_PERROR);
		// remove annotations possibly left over from a previous run of this tool
		vcb.rmAttribute(fisherAlleleInfoHeader.getID());
		vcb.rmAttribute(fisherDetailInfoHeader.getID());
		// FIX: removed the dead local 'oldFilters' (built, pruned, then never read)
		if (this.ignoreFiltered && ctx.isFiltered()) {
			w.add(vcb.make());
			continue;
		}
		boolean set_filter_in_range = true;
		boolean set_filter_case_ctrl_ratio = true;
		boolean found_one_alt = false;
		final List<String> infoData = new ArrayList<>(ctx.getAlleles().size());
		final List<Double> fisherValues = new ArrayList<>(ctx.getAlleles().size());
		for (final Allele observed_alt : ctx.getAlternateAlleles()) {
			if (observed_alt.isNoCall()) {
				// NO_CALL pseudo-ALT: record a neutral fisher value of 1.0
				infoData.add(String.join("|", "ALLELE", String.valueOf(observed_alt.getDisplayString()), "FISHER", "1.0"));
				fisherValues.add(1.0);
				continue;
			}
			/* 2x2 contingency counts for the fisher test on this allele */
			int count_case_have_alt = 0;
			int count_case_miss_alt = 0;
			int count_ctrl_have_alt = 0;
			int count_ctrl_miss_alt = 0;
			/* loop over persons in this pop */
			for (final Sample p : individualSet) {
				/* get genotype for this individual */
				final Genotype genotype = ctx.getGenotype(p.getId());
				/* missing, un-called or (optionally) filtered genotype */
				if (genotype == null || !genotype.isCalled() || (this.ignore_filtered_genotype && genotype.isFiltered())) {
					if (genotype == null)
						// FIX: garbled message "not is pedigree!"
						LOG.warn("Genotype is null for sample " + p.getId() + ": sample is in the pedigree but not in the VCF header!");
					// no usable information: counted as NOT carrying the ALT (i.e. HOM_REF)
					if (p.isAffected()) {
						count_case_miss_alt++;
					} else {
						count_ctrl_miss_alt++;
					}
					continue;
				}
				/* does this genotype carry the observed ALT allele ? */
				final boolean genotype_contains_allele = genotype.getAlleles().stream().anyMatch(A -> A.equals(observed_alt));
				/* fill the contingency table */
				if (genotype_contains_allele) {
					if (p.isAffected()) {
						count_case_have_alt++; // FIX: removed stray empty statement ';'
					} else {
						count_ctrl_have_alt++;
					}
				} else {
					if (p.isAffected()) {
						count_case_miss_alt++;
					} else {
						count_ctrl_miss_alt++;
					}
				}
			}
			/* end of loop over persons */
			/* fisher test for this allele */
			final FisherExactTest fisherAlt = FisherExactTest.compute(count_case_have_alt, count_case_miss_alt, count_ctrl_have_alt, count_ctrl_miss_alt);
			fisherValues.add(fisherAlt.getAsDouble());
			infoData.add(String.join("|", "ALLELE", String.valueOf(observed_alt.getDisplayString()), "FISHER", String.valueOf(fisherAlt.getAsDouble()), "CASE_HAVE_ALT", String.valueOf(count_case_have_alt), "CASE_MISS_ALT", String.valueOf(count_case_miss_alt), "CTRL_HAVE_ALT", String.valueOf(count_ctrl_have_alt), "CTRL_MISS_ALT", String.valueOf(count_ctrl_miss_alt)));
			found_one_alt = true;
			final boolean is_in_range = this.min_fisher <= fisherAlt.getAsDouble() && fisherAlt.getAsDouble() <= this.max_fisher;
			final int total_ctrls = count_ctrl_have_alt + count_ctrl_miss_alt;
			final int total_cases = count_case_have_alt + count_case_miss_alt;
			// clear the ctrl>case filter as soon as ONE allele has case-ratio >= ctrl-ratio
			if (total_ctrls > 0 && total_cases > 0 && (count_case_have_alt / (double) total_cases) >= (count_ctrl_have_alt / (double) total_ctrls)) {
				set_filter_case_ctrl_ratio = false;
			}
			// clear the range filter as soon as ONE allele is in range
			if (is_in_range) {
				set_filter_in_range = false;
			}
			if (this.bedExportPath != null && is_in_range) {
				report.print(ctx.getContig());
				report.print('\t');
				report.print(ctx.getStart() - 1);
				report.print('\t');
				report.print(ctx.getEnd());
				report.print('\t');
				report.print(ctx.getReference().getDisplayString());
				report.print('\t');
				report.print(observed_alt.getDisplayString());
				report.print('\t');
				report.print(fisherAlt.getAsDouble());
				report.print('\t');
				report.print(count_case_have_alt);
				report.print('\t');
				report.print(count_case_miss_alt);
				report.print('\t');
				report.print(count_ctrl_have_alt);
				report.print('\t');
				report.print(count_ctrl_miss_alt);
				report.println();
			}
		}
		// no testable ALT allele: emit unchanged rather than touching the filters
		if (!found_one_alt) {
			w.add(vcb.make());
			continue;
		}
		// hard-filter mode: drop the variant instead of soft-filtering it.
		// NOTE(review): 'set_filter_case_ctrl_ratio && filterCtrlgtCaseRatio == null' also drops
		// variants when the ctrl>case filter was never requested -- confirm this is intended.
		if ((set_filter_in_range && variantFilterHeader == null) || (set_filter_case_ctrl_ratio && filterCtrlgtCaseRatio == null))
			// skip variant
			continue;
		vcb.attribute(fisherAlleleInfoHeader.getID(), fisherValues);
		vcb.attribute(fisherDetailInfoHeader.getID(), infoData);
		if (this.overwrite_qual) {
			final OptionalDouble minV = fisherValues.stream().mapToDouble(V -> V.doubleValue()).min();
			// NOTE(review): QUAL becomes -minFisher/10, not -10*log10(minFisher) -- TODO confirm
			if (minV.isPresent())
				vcb.log10PError(Math.max(1.0E-100, /* arbitrary */
				minV.getAsDouble()) / -10);
		}
		if (set_filter_in_range && variantFilterHeader != null) {
			vcb.filter(variantFilterHeader.getID());
			// only set this one if the filter is set above
			if (set_filter_case_ctrl_ratio && filterCtrlgtCaseRatio != null) {
				vcb.filter(filterCtrlgtCaseRatio.getID());
			}
		}
		w.add(vcb.make());
	}
	w.close();
	report.flush();
	report.close();
	return 0;
}
Usage of com.github.lindenb.jvarkit.pedigree.Sample in the project jvarkit by lindenb:
the class VcfBurdenFisherV, method doVcfToVcf.
/**
 * Computes ONE global Fisher exact test over the whole VCF: each pedigree sample is
 * classified as carrying at least one ALT allele ("super-variant") or not, and the
 * case/control counts are tested. The result is written either as header lines of the
 * output VCF (variants are spooled to a temporary BCF meanwhile) or as an HTML table.
 *
 * @param inputName name of the VCF input (used in the HTML report)
 * @param in iterator over the input variants
 * @param out writer receiving the variants (VCF mode only)
 * @return 0 on success, -1 on error
 */
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator in, final VariantContextWriter out) {
	final boolean HAS_VARIANT = true;
	final boolean NO_VARIANT = false;
	Path tmVcfOut = null;
	VariantContextWriter tmpw = null;
	try {
		final VCFHeader header = in.getHeader();
		if (tableOut == null && header.getMetaDataLine(VCF_HEADER_FISHER_VALUE) != null) {
			throw new JvarkitException.UserError("VCF Header " + VCF_HEADER_FISHER_VALUE + " already specified in input");
		}
		final VCFHeader header2 = new VCFHeader(header);
		// pedigree samples present in the VCF header with a case/control status
		final Set<Sample> persons = new PedigreeParser().parse(this.pedigreeFile).getSamplesInVcfHeader(header).filter(S -> S.isStatusSet()).collect(Collectors.toSet());
		if (persons.isEmpty()) {
			LOG.warn("No sample in pedigree + vcf header");
			return -1;
		}
		// sample -> "carries at least one ALT somewhere in the VCF"
		final Map<Sample, Boolean> indi2supervariant = new HashMap<>(persons.size());
		for (final Sample person : persons) {
			indi2supervariant.put(person, NO_VARIANT);
		}
		// VCF mode: spool variants to a temporary BCF because the fisher value must be
		// written in the output header BEFORE the variants
		if (this.tableOut != null) {
			tmVcfOut = null;
		} else if (this.outputFile == null) {
			tmVcfOut = Files.createTempFile("tmp.", FileExtensions.BCF);
		} else {
			tmVcfOut = Files.createTempFile(this.outputFile.getParent(), "tmp.", FileExtensions.BCF);
		}
		if (tmVcfOut != null) {
			final VariantContextWriterBuilder vcwb = new VariantContextWriterBuilder();
			vcwb.setCreateMD5(false);
			vcwb.setReferenceDictionary(SequenceDictionaryUtils.extractRequired(header));
			vcwb.clearOptions();
			vcwb.setOutputPath(tmVcfOut);
			tmpw = vcwb.build();
			tmpw.writeHeader(header);
		} else {
			tmpw = null;
		}
		long count_variants = 0L;
		while (in.hasNext()) {
			final VariantContext ctx = in.next();
			// every variant is copied through, even those skipped by the test below
			if (tmpw != null) {
				tmpw.add(ctx);
			} else {
				out.add(ctx);
			}
			if (ctx.isFiltered() && !this.acceptFiltered)
				continue;
			final int n_alts = ctx.getAlternateAlleles().size();
			if (n_alts == 0) {
				LOG.warn("ignoring variant without ALT allele.");
				continue;
			}
			count_variants++;
			if (n_alts > 1) {
				LOG.warn("variant with more than one ALT. " + ctx.getContig() + ":" + ctx.getStart());
			}
			// loop over person in this pedigree
			for (final Sample person : indi2supervariant.keySet()) {
				if (indi2supervariant.get(person) == HAS_VARIANT)
					continue;
				final Genotype g = ctx.getGenotype(person.getId());
				if (g == null) {
					// not in vcf header
					continue;
				}
				if (g.isFiltered()) {
					LOG.warn("ignoring filtered genotype");
					// not filter.
					continue;
				}
				// any called non-REF allele flags this sample as a carrier
				if (g.getAlleles().stream().anyMatch(A -> A.isCalled() && A.isNonReference())) {
					indi2supervariant.put(person, HAS_VARIANT);
				}
				// end of allele
			}
			// end of for[person]
		}
		// end of variant loop: build the 2x2 contingency table
		int count_case_sv0 = 0;
		int count_ctrl_sv0 = 0;
		int count_case_sv1 = 0;
		int count_ctrl_sv1 = 0;
		for (final Sample person : indi2supervariant.keySet()) {
			final boolean hasVariant = indi2supervariant.get(person);
			if (!hasVariant) {
				if (person.isAffected())
					count_case_sv0++;
				else
					count_ctrl_sv0++;
			} else // AT_LEAST_ONE_VARIANT
			{
				if (person.isAffected())
					count_case_sv1++;
				else
					count_ctrl_sv1++;
			}
		}
		// end of person
		final FisherExactTest fisher = FisherExactTest.compute(count_case_sv0, count_case_sv1, count_ctrl_sv0, count_ctrl_sv1);
		if (this.tableOut == null) {
			header2.addMetaDataLine(new VCFHeaderLine(VCF_HEADER_FISHER_VALUE, String.valueOf(fisher.getAsDouble())));
			header2.addMetaDataLine(new VCFHeaderLine(VCF_HEADER_FISHER_VALUE + ".count", String.join("|", "CASE_SV0=" + count_case_sv0, "CASE_SV1=" + count_case_sv1, "CTRL_SV0=" + count_ctrl_sv0, "CTRL_SV1=" + count_ctrl_sv1)));
			// FIX: close the temporary writer so the BCF is fully flushed BEFORE re-reading it;
			// the original re-opened the file while the writer was still open.
			tmpw.close();
			tmpw = null;
			try (VCFIterator in2 = new VCFIteratorBuilder().open(tmVcfOut)) {
				out.writeHeader(header2);
				while (in2.hasNext()) {
					out.add(in2.next());
				}
			}
			Files.deleteIfExists(tmVcfOut);
		} else {
			/* save html report */
			try (final OutputStream os = super.openPathOrStdoutAsPrintStream(this.tableOut)) {
				final XMLOutputFactory xof = XMLOutputFactory.newFactory();
				final XMLStreamWriter w = xof.createXMLStreamWriter(os, "UTF-8");
				w.writeStartDocument("UTF-8", "1.0");
				w.writeStartElement("html");
				w.writeStartElement("head");
				w.writeStartElement("title");
				w.writeCharacters(getClass().getSimpleName() + ":" + inputName);
				w.writeEndElement();
				w.writeEmptyElement("meta");
				w.writeAttribute("name", "vcf");
				w.writeAttribute("content", String.valueOf(inputName));
				w.writeEmptyElement("meta");
				w.writeAttribute("name", "version");
				w.writeAttribute("content", JVarkitVersion.getInstance().getLabel());
				// head
				w.writeEndElement();
				w.writeStartElement("body");
				w.writeStartElement("table");
				w.writeStartElement("caption");
				w.writeCharacters("Fisher: ");
				w.writeStartElement("span");
				w.writeAttribute("id", "fisher");
				w.writeCharacters(String.valueOf(fisher.getAsDouble()));
				// span
				w.writeEndElement();
				w.writeCharacters(" Variant(s): ");
				w.writeStartElement("span");
				w.writeAttribute("id", "variants");
				w.writeCharacters(String.valueOf(count_variants));
				// span
				w.writeEndElement();
				// caption
				w.writeEndElement();
				w.writeStartElement("tr");
				w.writeEmptyElement("th");
				w.writeStartElement("th");
				w.writeCharacters("With Rare");
				// th
				w.writeEndElement();
				w.writeStartElement("th");
				w.writeCharacters("No Rare");
				// th
				w.writeEndElement();
				// tr
				w.writeEndElement();
				w.writeStartElement("tr");
				w.writeStartElement("th");
				w.writeCharacters("Case");
				// th
				w.writeEndElement();
				w.writeStartElement("td");
				w.writeAttribute("id", "case1");
				w.writeCharacters(String.valueOf(count_case_sv1));
				// td
				w.writeEndElement();
				w.writeStartElement("td");
				w.writeAttribute("id", "case0");
				w.writeCharacters(String.valueOf(count_case_sv0));
				// td
				w.writeEndElement();
				// tr
				w.writeEndElement();
				w.writeStartElement("tr");
				w.writeStartElement("th");
				w.writeCharacters("Controls");
				// th
				w.writeEndElement();
				w.writeStartElement("td");
				w.writeAttribute("id", "ctrl1");
				w.writeCharacters(String.valueOf(count_ctrl_sv1));
				// td
				w.writeEndElement();
				w.writeStartElement("td");
				w.writeAttribute("id", "ctrl0");
				w.writeCharacters(String.valueOf(count_ctrl_sv0));
				// td
				w.writeEndElement();
				// tr
				w.writeEndElement();
				// table
				w.writeEndElement();
				// body
				w.writeEndElement();
				// html
				w.writeEndElement();
				w.writeEndDocument();
				w.close();
				os.flush();
			}
		}
		return 0;
	} catch (final Throwable err) {
		LOG.error(err);
		return -1;
	} finally {
		// FIX: release the temporary writer on error paths (it is null after success)
		if (tmpw != null)
			try {
				tmpw.close();
			} catch (final Throwable err) {
			}
		if (tmVcfOut != null)
			try {
				Files.deleteIfExists(tmVcfOut);
			} catch (IOException err) {
			}
	}
}
Usage of com.github.lindenb.jvarkit.pedigree.Sample in the project jvarkit by lindenb:
the class VcfBurdenMAF, method doVcfToVcf.
/**
 * Annotates (or hard-filters) variants on the allele frequency computed separately in
 * CASES and in CONTROLS of the pedigree.
 * When 'prefix' is blank the tool runs in hard-filter mode: only variants whose AF is
 * within [min_AF ; max_AF] for cases or controls are written. Otherwise per-allele
 * INFO (AF/AC) and FILTER annotations are added and every variant is written.
 *
 * @param inputName name of the VCF input (unused here)
 * @param in iterator over the input variants
 * @param out writer receiving the variants
 * @return 0 on success, -1 on error
 */
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator in, final VariantContextWriter out) {
// index of the CASE population in the two-iteration 'pop' loop below
final int CASE_POP = 0;
final VCFHeader header0 = in.getHeader();
final String maf_label = "(" + this.min_AF + "<= maf <= " + this.max_AF + ")";
// all header lines below are null in hard-filter mode (blank prefix)
final VCFInfoHeaderLine mafCasInfoHeader = (StringUtils.isBlank(this.prefix) ? null : new VCFInfoHeaderLine(this.prefix + "AF_Cases", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "MAF Cases"));
final VCFInfoHeaderLine mafControlsInfoHeader = (StringUtils.isBlank(this.prefix) ? null : new VCFInfoHeaderLine(this.prefix + "AF_Controls", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "MAF Controls"));
final VCFInfoHeaderLine acCasInfoHeader = (StringUtils.isBlank(this.prefix) ? null : new VCFInfoHeaderLine(this.prefix + "AC_Cases", VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "AC Cases"));
final VCFInfoHeaderLine acControlsInfoHeader = (StringUtils.isBlank(this.prefix) ? null : new VCFInfoHeaderLine(this.prefix + "AC_Controls", VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "AC Controls"));
// NOTE(review): the FILTER ids reuse the INFO ids ("<prefix>AF_Cases"/"AF_Controls") -- confirm intended
final VCFFilterHeaderLine filterCasHeader = (StringUtils.isBlank(this.prefix) ? null : new VCFFilterHeaderLine(mafCasInfoHeader.getID(), "MAF of case failed: " + maf_label));
final VCFFilterHeaderLine filterControlsHeader = (StringUtils.isBlank(this.prefix) ? null : new VCFFilterHeaderLine(mafControlsInfoHeader.getID(), "MAF of controls failed: " + maf_label));
final VCFFilterHeaderLine filterCaseOrControlsHeader = (StringUtils.isBlank(this.prefix) ? null : new VCFFilterHeaderLine(this.prefix + "MAFCaseOrControls", "MAF of cases OR MAF of controls failed: " + maf_label));
// pedigree samples present in the VCF header with a case/control status
final Set<Sample> persons;
{
try {
persons = new PedigreeParser().parse(this.pedigreeFile).getSamplesInVcfHeader(header0).filter(S -> S.isStatusSet()).collect(Collectors.toSet());
} catch (final IOException err) {
LOG.error(err);
return -1;
}
}
final DoublePredicate isInAfRange = AF -> this.min_AF <= AF && AF <= this.max_AF;
final Set<Sample> caseSamples = persons.stream().filter(I -> I.isAffected()).collect(Collectors.toSet());
final Set<Sample> controlSamples = persons.stream().filter(I -> I.isUnaffected()).collect(Collectors.toSet());
if (caseSamples.isEmpty()) {
LOG.warn("NO case in " + this.pedigreeFile);
}
if (controlSamples.isEmpty()) {
LOG.warn("NO control in " + this.pedigreeFile);
}
final VCFHeader h2 = new VCFHeader(header0);
if (!StringUtils.isBlank(this.prefix)) {
h2.addMetaDataLine(mafCasInfoHeader);
h2.addMetaDataLine(mafControlsInfoHeader);
h2.addMetaDataLine(acCasInfoHeader);
h2.addMetaDataLine(acControlsInfoHeader);
h2.addMetaDataLine(filterCasHeader);
h2.addMetaDataLine(filterControlsHeader);
h2.addMetaDataLine(filterCaseOrControlsHeader);
}
JVarkitVersion.getInstance().addMetaData(this, h2);
out.writeHeader(h2);
while (in.hasNext()) {
final VariantContext ctx = in.next();
// already-filtered variants: pass through in annotate mode, drop in hard-filter mode
if (this.ignoreFiltered && ctx.isFiltered()) {
if (!StringUtils.isBlank(this.prefix))
out.add(ctx);
continue;
}
// variants without genotypes cannot be tested: same pass-through/drop logic
if (!ctx.hasGenotypes()) {
if (!StringUtils.isBlank(this.prefix))
out.add(ctx);
continue;
}
final List<Double> mafCasList = new ArrayList<>();
final List<Double> mafCtrlList = new ArrayList<>();
final List<Integer> acCasList = new ArrayList<>();
final List<Integer> acCtrlList = new ArrayList<>();
// the filters stay 'true' unless at least one ALT allele is in range (or has no data)
boolean set_max_maf_cas = true;
boolean set_max_maf_control = true;
boolean seen_data = false;
for (final Allele observed_alt : ctx.getAlternateAlleles()) {
/* loop over two populations : 0 = case, 1=controls */
for (int pop = 0; pop < 2; ++pop) {
final MafCalculator mafCalculator = new MafCalculator(observed_alt, ctx.getContig());
mafCalculator.setNoCallIsHomRef(this.noCallAreHomRef);
/* loop over persons in this pop */
for (final Sample p : (pop == CASE_POP ? caseSamples : controlSamples)) {
/* get genotype for this individual */
final Genotype genotype = ctx.getGenotype(p.getId());
// genotype is non-null here: 'persons' only contains samples from the VCF header
if (this.ignore_filtered_genotype && genotype.isFiltered())
continue;
mafCalculator.add(genotype, p.isMale());
/* if(pop==CASE_POP && genotype.isCalled()) LOG.info("DEBUGMAF: "+p+" "+genotype); */
}
/* at least one genotype found */
if (!mafCalculator.isEmpty()) {
seen_data = true;
/* get MAF */
final double maf = mafCalculator.getMaf();
final double ac = mafCalculator.getCountAlt();
if (pop == CASE_POP) {
/* add INFO attribute */
mafCasList.add(maf);
acCasList.add((int) ac);
/* remove FILTER if needed */
if (isInAfRange.test(maf))
set_max_maf_cas = false;
} else {
/* add INFO attribute */
mafCtrlList.add(maf);
acCtrlList.add((int) ac);
/* remove FILTER if needed */
if (isInAfRange.test(maf))
set_max_maf_control = false;
}
} else {
// no usable genotype in this pop: record AF=0/AC=0 and do not set the FILTER
if (pop == CASE_POP) {
mafCasList.add(0.0);
acCasList.add(0);
set_max_maf_cas = false;
} else {
mafCtrlList.add(0.0);
acCtrlList.add(0);
set_max_maf_control = false;
}
}
}
/* end of loop over pop */
}
if (StringUtils.isBlank(this.prefix)) {
// hard-filter mode: drop out-of-range (or data-less) variants, write the rest unchanged
if (!seen_data || set_max_maf_cas || set_max_maf_control)
continue;
out.add(ctx);
} else {
// annotate mode: add per-allele AF/AC and the FILTERs computed above
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.attribute(mafCasInfoHeader.getID(), mafCasList);
vcb.attribute(mafControlsInfoHeader.getID(), mafCtrlList);
vcb.attribute(acCasInfoHeader.getID(), acCasList);
vcb.attribute(acControlsInfoHeader.getID(), acCtrlList);
if (seen_data) {
if (set_max_maf_cas)
vcb.filter(filterCasHeader.getID());
if (set_max_maf_control)
vcb.filter(filterControlsHeader.getID());
if (set_max_maf_cas || set_max_maf_control) {
vcb.filter(filterCaseOrControlsHeader.getID());
}
}
out.add(vcb.make());
}
}
return 0;
}
Usage of com.github.lindenb.jvarkit.pedigree.Sample in the project jvarkit by lindenb:
the class IndexCovToVcf, method doWork.
/**
 * Converts the tab-delimited output of 'indexcov' (#chrom/start/end + one fold-coverage
 * column per sample) into a VCF of DEL/DUP structural variants, genotyping each sample
 * from its relative depth and flagging case/control patterns using an optional pedigree.
 *
 * @param args command-line arguments; at most one input file (stdin otherwise)
 * @return 0 on success, -1 on error
 */
@Override
public int doWork(final List<String> args) {
// converts a relative depth into REF/HET_DEL/HOM_DEL/HET_DUP/HOM_DUP using 'treshold'
final IndexCovUtils indexCovUtils = new IndexCovUtils(this.treshold);
final CharSplitter tab = CharSplitter.TAB;
BufferedReader r = null;
VariantContextWriter vcw = null;
try {
final SAMSequenceDictionary dict;
if (this.refFile == null) {
dict = null;
} else {
dict = SequenceDictionaryUtils.extractRequired(this.refFile);
}
r = super.openBufferedReader(oneFileOrNull(args));
String line = r.readLine();
if (line == null) {
LOG.error("Cannot read first line of input");
return -1;
}
// header line must be: #chrom start end sample1 sample2 ...
String[] tokens = tab.split(line);
if (tokens.length < 4 || !tokens[0].equals("#chrom") || !tokens[1].equals("start") || !tokens[2].equals("end")) {
LOG.error("bad first line " + line);
return -1;
}
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_QUALITY_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY);
/**
 * raw value in indexcov
 */
final VCFFormatHeaderLine foldHeader = new VCFFormatHeaderLine("F", 1, VCFHeaderLineType.Float, "Relative number of copy: 0.5 deletion 1 normal 2.0 duplication");
metaData.add(foldHeader);
final VCFFilterHeaderLine filterAllDel = new VCFFilterHeaderLine("ALL_DEL", "number of samples greater than 1 and all are deletions");
metaData.add(filterAllDel);
final VCFFilterHeaderLine filterAllDup = new VCFFilterHeaderLine("ALL_DUP", "number of samples greater than 1 and all are duplication");
metaData.add(filterAllDup);
final VCFFilterHeaderLine filterNoSV = new VCFFilterHeaderLine("NO_SV", "There is no DUP or DEL in this variant");
metaData.add(filterNoSV);
final VCFFilterHeaderLine filterHomDel = new VCFFilterHeaderLine("HOM_DEL", "There is one Homozygous deletion.");
metaData.add(filterHomDel);
final VCFFilterHeaderLine filterHomDup = new VCFFilterHeaderLine("HOM_DUP", "There is one Homozygous duplication.");
metaData.add(filterHomDup);
final VCFInfoHeaderLine infoNumDup = new VCFInfoHeaderLine("NDUP", 1, VCFHeaderLineType.Integer, "Number of samples being duplicated");
metaData.add(infoNumDup);
final VCFInfoHeaderLine infoNumDel = new VCFInfoHeaderLine("NDEL", 1, VCFHeaderLineType.Integer, "Number of samples being deleted");
metaData.add(infoNumDel);
final VCFInfoHeaderLine infoSingleton = new VCFInfoHeaderLine("SINGLETON", 1, VCFHeaderLineType.Flag, "Singleton candidate");
metaData.add(infoSingleton);
final VCFInfoHeaderLine infoAllAffected = new VCFInfoHeaderLine("ALL_CASES", 1, VCFHeaderLineType.Flag, "All cases are affected");
metaData.add(infoAllAffected);
// sample names are the columns after #chrom/start/end
final List<String> samples = Arrays.asList(tokens).subList(3, tokens.length);
final Pedigree pedigree;
final int count_cases_in_pedigree;
if (this.pedFile == null) {
pedigree = PedigreeParser.empty();
count_cases_in_pedigree = 0;
} else {
pedigree = new PedigreeParser().parse(this.pedFile);
// only count affected samples that actually have a column in the input
final Set<String> set = new HashSet<>(samples);
count_cases_in_pedigree = (int) pedigree.getAffectedSamples().stream().filter(S -> set.contains(S.getId())).count();
}
final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
JVarkitVersion.getInstance().addMetaData(this, vcfHeader);
if (dict != null) {
vcfHeader.setSequenceDictionary(dict);
}
vcw = this.writingDelegate.dictionary(dict).open(outputFile);
vcw.writeHeader(vcfHeader);
// final List<Allele> NO_CALL_NO_CALL = Arrays.asList(Allele.NO_CALL,Allele.NO_CALL);
final Allele DUP_ALLELE = Allele.create("<DUP>", false);
final Allele DEL_ALLELE = Allele.create("<DEL>", false);
final Allele REF_ALLELE = Allele.create("N", true);
// one variant per data line
while ((line = r.readLine()) != null) {
if (StringUtil.isBlank(line))
continue;
tokens = tab.split(line);
if (tokens.length != 3 + samples.size()) {
r.close();
vcw.close();
throw new JvarkitException.TokenErrors("expected " + (samples.size() + 3) + "columns.", tokens);
}
final Set<Allele> alleles = new HashSet<>();
alleles.add(REF_ALLELE);
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(tokens[0]);
vcb.start(Integer.parseInt(tokens[1]));
final int chromEnd = Integer.parseInt(tokens[2]);
vcb.stop(chromEnd);
vcb.attribute(VCFConstants.END_KEY, chromEnd);
if (dict != null) {
final SAMSequenceRecord ssr = dict.getSequence(tokens[0]);
if (ssr == null) {
LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(tokens[0], dict));
return -1;
}
if (chromEnd > ssr.getSequenceLength()) {
LOG.warn("WARNING sequence length in " + line + " is greater than in dictionary ");
}
}
int count_dup = 0;
int count_del = 0;
// sample -> relative depth ("fold") for this interval
final Map<String, Float> sample2fold = new HashMap<>(samples.size());
for (int i = 3; i < tokens.length; i++) {
final String sampleName = samples.get(i - 3);
final float f = Float.parseFloat(tokens[i]);
// NOTE(review): a bad fold value is logged but still used below -- confirm intended
if (f < 0 || Float.isNaN(f) || !Float.isFinite(f)) {
LOG.error("Bad fold " + f + " for sample " + sampleName + " in " + line);
}
sample2fold.put(sampleName, f);
}
final List<Genotype> genotypes = new ArrayList<>(samples.size());
// case/control tallies used for the SINGLETON / ALL_CASES flags
int count_sv_cases = 0;
int count_sv_controls = 0;
int count_ref_cases = 0;
int count_ref_controls = 0;
boolean got_sv = false;
for (final String sampleName : sample2fold.keySet()) {
final float normDepth = sample2fold.get(sampleName);
final IndexCovUtils.SvType type = indexCovUtils.getType(normDepth);
// build the genotype from the depth-derived SV type
final GenotypeBuilder gb;
switch(type) {
case REF:
{
gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, REF_ALLELE));
break;
}
case HET_DEL:
{
gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, DEL_ALLELE));
alleles.add(DEL_ALLELE);
count_del++;
break;
}
case HOM_DEL:
{
gb = new GenotypeBuilder(sampleName, Arrays.asList(DEL_ALLELE, DEL_ALLELE));
alleles.add(DEL_ALLELE);
count_del++;
vcb.filter(filterHomDel.getID());
break;
}
case HET_DUP:
{
gb = new GenotypeBuilder(sampleName, Arrays.asList(REF_ALLELE, DUP_ALLELE));
alleles.add(DUP_ALLELE);
count_dup++;
break;
}
case HOM_DUP:
{
gb = new GenotypeBuilder(sampleName, Arrays.asList(DUP_ALLELE, DUP_ALLELE));
alleles.add(DUP_ALLELE);
vcb.filter(filterHomDup.getID());
count_dup++;
break;
}
default:
{
// ambiguous depth: emit a no-call genotype
gb = new GenotypeBuilder(sampleName, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
break;
}
}
if (type.isVariant())
got_sv = true;
gb.attribute(foldHeader.getID(), normDepth);
gb.GQ(type.getGenotypeQuality(normDepth));
// update case/control tallies when the sample is in the pedigree
final Sample sn = pedigree.getSampleById(sampleName);
if (sn != null) {
if (type.isVariant()) {
if (sn.isAffected()) {
count_sv_cases++;
} else if (sn.isUnaffected()) {
count_sv_controls++;
}
} else {
if (sn.isAffected()) {
count_ref_cases++;
} else if (sn.isUnaffected()) {
count_ref_controls++;
}
}
}
genotypes.add(gb.make());
}
vcb.alleles(alleles);
// SINGLETON: exactly one affected carrier, no control carrier;
// ALL_CASES: every pedigree case carries the SV and no control does
if (!pedigree.isEmpty() && count_sv_cases == 1 && count_ref_cases > 0 && count_sv_controls == 0 && count_ref_controls > 0) {
vcb.attribute(infoSingleton.getID(), Boolean.TRUE);
} else if (!pedigree.isEmpty() && count_sv_cases > 0 && count_sv_cases == count_cases_in_pedigree && count_ref_cases == 0 && count_sv_controls == 0 && count_ref_controls > 0) {
vcb.attribute(infoAllAffected.getID(), Boolean.TRUE);
}
vcb.genotypes(genotypes);
if (count_dup == samples.size() && samples.size() != 1) {
vcb.filter(filterAllDup.getID());
}
if (count_del == samples.size() && samples.size() != 1) {
vcb.filter(filterAllDel.getID());
}
if (!got_sv) {
vcb.filter(filterNoSV.getID());
}
vcb.attribute(infoNumDel.getID(), count_del);
vcb.attribute(infoNumDup.getID(), count_dup);
vcw.add(vcb.make());
}
vcw.close();
vcw = null;
r.close();
r = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
// no-ops when already closed/nulled on the success path
CloserUtil.close(r);
CloserUtil.close(vcw);
}
}
Usage of com.github.lindenb.jvarkit.pedigree.Sample in the project jvarkit by lindenb:
the class VCFComposite, method doVcfToVcf.
/**
 * Searches genes carrying "composite" (compound heterozygous) candidate variants in
 * affected samples. Variants are grouped per gene via the configured extractors, each
 * gene is scanned, then all variants are re-emitted in their original order with an
 * INFO annotation (or a FILTER when 'filterNonCompositeTag' is set). Two TSV reports
 * (per-gene and per-variant-pair) are optionally written.
 *
 * @param inputName name of the VCF input (used in error messages)
 * @param iterin iterator over the input variants
 * @param out writer receiving the annotated variants
 * @return 0 on success, -1 on error
 */
protected int doVcfToVcf(final String inputName, final VCFIterator iterin, final VariantContextWriter out) {
	final VCFHeader header = iterin.getHeader();
	if (!header.hasGenotypingData()) {
		LOG.error("No genotypes in " + inputName);
		return -1;
	}
	final GeneExtractorFactory geneExtractorFactory = new GeneExtractorFactory(header);
	final List<GeneExtractorFactory.GeneExtractor> extractors = geneExtractorFactory.parse(this.extractorsNames);
	if (extractors.isEmpty()) {
		LOG.error("no gene extractor found/defined.");
		return -1;
	}
	// load the pedigree and keep only the affected/unaffected samples present in the VCF
	final Pedigree pedigree;
	try {
		final Set<String> sampleNames = new HashSet<>(header.getSampleNamesInOrder());
		final PedigreeParser pedParser = new PedigreeParser();
		pedigree = pedParser.parse(this.pedigreeFile);
		if (pedigree == null || pedigree.isEmpty()) {
			LOG.error("pedigree missing/empty");
			return -1;
		}
		this.affectedSamples.addAll(pedigree.getAffectedSamples());
		this.affectedSamples.removeIf(S -> !sampleNames.contains(S.getId()));
		if (this.affectedSamples.isEmpty()) {
			LOG.error("No Affected sample in pedigree. " + this.pedigreeFile + "/" + inputName);
			return -1;
		}
		this.unaffectedSamples.addAll(pedigree.getUnaffectedSamples());
		this.unaffectedSamples.removeIf(S -> !sampleNames.contains(S.getId()));
		// FIX: test the header-filtered set (this.unaffectedSamples), not the raw pedigree set,
		// mirroring the affected-samples check above
		if (this.unaffectedSamples.isEmpty()) {
			LOG.error("No Unaffected sample in " + this.pedigreeFile + "/" + inputName);
			return -1;
		}
	} catch (final IOException err) {
		throw new RuntimeIOException(err);
	}
	header.addMetaDataLine(new VCFInfoHeaderLine(INFO_TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Variant of VCFComposite"));
	if (!StringUtils.isBlank(this.filterNonCompositeTag)) {
		// FIX: typo "fir" -> "for" in the FILTER description
		header.addMetaDataLine(new VCFFilterHeaderLine(this.filterNonCompositeTag, "Not a Variant for VCFComposite"));
	}
	// contig order: dictionary order when available, lexical otherwise
	final SAMSequenceDictionary dict = header.getSequenceDictionary();
	final Comparator<String> contigCmp;
	if (dict == null || dict.isEmpty()) {
		contigCmp = (A, B) -> A.compareTo(B);
	} else {
		contigCmp = new ContigDictComparator(dict);
	}
	final Comparator<VariantContext> ctxComparator = (V1, V2) -> {
		int i = contigCmp.compare(V1.getContig(), V2.getContig());
		if (i != 0)
			return i;
		i = Integer.compare(V1.getStart(), V2.getStart());
		if (i != 0)
			return i;
		return V1.getReference().compareTo(V2.getReference());
	};
	// ties broken by insertion id so the original input order is preserved
	final Comparator<VariantLine> variantLineComparator = (V1, V2) -> {
		final int i = ctxComparator.compare(V1.ctx, V2.ctx);
		if (i != 0)
			return i;
		return Long.compare(V1.id, V2.id);
	};
	long ID_GENERATOR = 0L;
	this.vcfDecoder = VCFUtils.createDefaultVCFCodec();
	this.vcfDecoder.setVCFHeader(header, VCFHeaderVersion.VCF4_2);
	this.vcfEncoder = new VCFEncoder(header, false, true);
	SortingCollection<GeneAndVariant> sorting = null;
	SortingCollection<VariantLine> outputSorter = null;
	try {
		LOG.info("reading variants and genes");
		/* Gene and variant sorter */
		sorting = SortingCollection.newInstance(GeneAndVariant.class, new GeneAndVariantCodec(), GeneAndVariant::compareGeneThenIndex, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
		sorting.setDestructiveIteration(true);
		/* Variant sorter */
		outputSorter = SortingCollection.newInstance(VariantLine.class, new VariantLineCodec(), variantLineComparator, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
		outputSorter.setDestructiveIteration(true);
		/* read input: every variant goes to outputSorter; candidates also go to 'sorting', once per gene */
		while (iterin.hasNext()) {
			final VariantContext ctx = iterin.next();
			final VariantLine variantLine = new VariantLine(++ID_GENERATOR, ctx);
			if (!this.variantJexl.test(ctx)) {
				outputSorter.add(variantLine);
				continue;
			}
			if (!acceptVariant(ctx)) {
				outputSorter.add(variantLine);
				continue;
			}
			final Set<GeneIdentifier> geneKeys = new HashSet<>();
			extractors.stream().map(EX -> EX.apply(ctx)).flatMap(H -> H.keySet().stream()).forEach(KG -> {
				geneKeys.add(new GeneIdentifier(KG.getKey(), KG.getGene(), KG.getMethod().replace('/', '_')));
			});
			if (geneKeys.isEmpty()) {
				outputSorter.add(variantLine);
				continue;
			}
			for (final GeneIdentifier gk : geneKeys) {
				final GeneAndVariant gav = new GeneAndVariant(gk, variantLine);
				gav.gene.contig = ctx.getContig();
				sorting.add(gav);
			}
		}
		sorting.doneAdding();
		LOG.info("compile per gene");
		// per-gene report header
		this.reportGeneWriter = (this.geneReportPath == null ? new PrintWriter(new NullOuputStream()) : IOUtils.openPathForPrintWriter(this.geneReportPath));
		this.reportGeneWriter.print("#CHROM");
		this.reportGeneWriter.print('\t');
		this.reportGeneWriter.print("bed.start");
		this.reportGeneWriter.print('\t');
		this.reportGeneWriter.print("bed.end");
		this.reportGeneWriter.print('\t');
		this.reportGeneWriter.print("gene.key");
		this.reportGeneWriter.print('\t');
		this.reportGeneWriter.print("gene.label");
		this.reportGeneWriter.print('\t');
		this.reportGeneWriter.print("gene.source");
		this.reportGeneWriter.print('\t');
		this.reportGeneWriter.print("count.variants");
		this.reportGeneWriter.print('\t');
		this.reportGeneWriter.print("affected.counts");
		this.reportGeneWriter.print('\t');
		this.reportGeneWriter.print("affected.total");
		this.reportGeneWriter.print('\t');
		this.reportGeneWriter.print("affected.samples");
		this.reportGeneWriter.println();
		// per-variant-pair report header (two 'variantN.*' column groups)
		this.reportWriter = (this.reportPath == null ? new PrintWriter(new NullOuputStream()) : IOUtils.openPathForPrintWriter(this.reportPath));
		this.reportWriter.print("#CHROM");
		this.reportWriter.print('\t');
		this.reportWriter.print("bed.start");
		this.reportWriter.print('\t');
		this.reportWriter.print("bed.end");
		this.reportWriter.print('\t');
		this.reportWriter.print("gene.index");
		this.reportWriter.print('\t');
		this.reportWriter.print("gene.key");
		this.reportWriter.print('\t');
		this.reportWriter.print("gene.label");
		this.reportWriter.print('\t');
		this.reportWriter.print("gene.source");
		for (int side = 0; side < 2; ++side) {
			this.reportWriter.print('\t');
			final String prefix = "variant" + (side + 1) + ".";
			this.reportWriter.print(prefix + "start");
			this.reportWriter.print('\t');
			this.reportWriter.print(prefix + "end");
			this.reportWriter.print('\t');
			this.reportWriter.print(prefix + "ref");
			this.reportWriter.print('\t');
			this.reportWriter.print(prefix + "alt");
			this.reportWriter.print('\t');
			this.reportWriter.print(prefix + "info");
			for (final Sample sn : this.affectedSamples) {
				this.reportWriter.print('\t');
				this.reportWriter.print(prefix + "gt[" + sn.getId() + "].affected");
			}
			for (final Sample sn : this.unaffectedSamples) {
				this.reportWriter.print('\t');
				this.reportWriter.print(prefix + "gt[" + sn.getId() + "].unaffected");
			}
		}
		this.reportWriter.println();
		// compile data: scan each gene's variants, then feed them back to the output sorter
		CloseableIterator<GeneAndVariant> iter2 = sorting.iterator();
		EqualRangeIterator<GeneAndVariant> eqiter = new EqualRangeIterator<>(iter2, (A, B) -> A.gene.compareTo(B.gene));
		while (eqiter.hasNext()) {
			final List<GeneAndVariant> variants = eqiter.next();
			scan(variants.get(0).gene, variants.stream().map(L -> L.variant).collect(Collectors.toList()));
			for (final GeneAndVariant ga : variants) outputSorter.add(ga.variant);
		}
		eqiter.close();
		iter2.close();
		sorting.cleanup();
		//
		this.reportWriter.flush();
		this.reportWriter.close();
		this.reportGeneWriter.flush();
		this.reportGeneWriter.close();
		LOG.info("write variants");
		// re-emit: duplicates of the same variant (one per gene) are merged by the equal-range iterator
		CloseableIterator<VariantLine> iter1 = outputSorter.iterator();
		EqualRangeIterator<VariantLine> eqiter1 = new EqualRangeIterator<>(iter1, variantLineComparator);
		out.writeHeader(header);
		while (eqiter1.hasNext()) {
			final List<VariantLine> array = eqiter1.next();
			final VariantContext firstCtx = array.get(0).ctx;
			final Set<String> set = getAnnotationsForVariant(firstCtx);
			final VariantContext outCtx;
			final VariantContextBuilder vcb = new VariantContextBuilder(firstCtx);
			for (int y = 1; y < array.size(); ++y) {
				set.addAll(getAnnotationsForVariant(array.get(y).ctx));
			}
			if (set.isEmpty()) {
				if (StringUtils.isBlank(this.filterNonCompositeTag)) {
					// ignore
					continue;
				} else {
					vcb.filter(this.filterNonCompositeTag);
				}
			} else {
				if (!firstCtx.isFiltered()) {
					vcb.passFilters();
				}
				vcb.attribute(INFO_TAG, new ArrayList<>(set));
			}
			outCtx = vcb.make();
			out.add(outCtx);
		}
		outputSorter.cleanup();
		eqiter1.close();
		iter1.close();
		return 0;
	} catch (final Exception err) {
		LOG.error(err);
		return -1;
	}
}
Aggregations