use of com.github.lindenb.jvarkit.dict.OrderChecker in project jvarkit by lindenb.
the class VcfROH method doWork.
@Override
public int doWork(final List<String> args) {
try {
final String input = oneFileOrNull(args);
try (VCFIterator iter = input == null ? new VCFIteratorBuilder().open(stdin()) : new VCFIteratorBuilder().open(input)) {
final VCFHeader header = iter.getHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final OrderChecker<VariantContext> checker = new OrderChecker<VariantContext>(dict, false);
final List<Sample> samples = new ArrayList<>(header.getNGenotypeSamples());
for (String sn : header.getSampleNamesInOrder()) {
final Sample sample = new Sample(sn, header.getSampleNameToOffset().get(sn));
samples.add(sample);
}
final IntervalTreeMap<Boolean> treemap;
if (this.intervalBedPath != null) {
try (BedLineReader br = new BedLineReader(this.intervalBedPath)) {
treemap = br.toIntervalTreeMap(B -> Boolean.TRUE);
}
} else {
treemap = null;
}
try (PrintWriter w = super.openPathOrStdoutAsPrintWriter(this.output)) {
if (!this.output_as_bed) {
final SAMFileHeader samHeader = new SAMFileHeader(dict);
samHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
for (Sample sn : samples) {
final SAMReadGroupRecord rg = new SAMReadGroupRecord(sn.name);
rg.setSample(sn.name);
samHeader.addReadGroup(rg);
}
JVarkitVersion.getInstance().addMetaData(this, samHeader);
codec.encode(w, samHeader);
}
while (iter.hasNext()) {
final VariantContext ctx = checker.apply(iter.next());
for (Sample sample : samples) sample.visit(w, ctx, treemap);
}
for (Sample sample : samples) sample.finish(w, treemap);
w.flush();
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
use of com.github.lindenb.jvarkit.dict.OrderChecker in project jvarkit by lindenb.
the class VcfGnomadPext method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator iter, final VariantContextWriter out) {
final JsonParser jsonParser = new JsonParser();
final String[] standard_pext_header = new String[] { "chrom", "pos", "ref", "alt", "tx_annotation" };
final VCFHeader h0 = iter.getHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(h0);
if (!SequenceDictionaryUtils.isGRCh37(dict)) {
LOG.error("Input is NOT GRCh37 ");
return -1;
}
final OrderChecker<VariantContext> orderChecked = new OrderChecker<>(dict, false);
final CharSplitter tab = CharSplitter.TAB;
try (TabixReader gextFileReader = new TabixReader(this.pextDatabasePath)) {
final String line1 = gextFileReader.readLine();
if (StringUtils.isBlank(line1)) {
LOG.error("Cannot read first line of " + this.pextDatabasePath);
return -1;
}
if (!Arrays.equals(tab.split(line1), standard_pext_header)) {
LOG.error("Bad header in " + this.pextDatabasePath + " expected " + String.join("(tab)", standard_pext_header) + " but got " + line1.replace("\t", "(tab)"));
return -1;
}
final VCFHeader h2 = new VCFHeader(h0);
final VCFInfoHeaderLine pexInfo = new VCFInfoHeaderLine("GNOMAD_PEXT", VCFHeaderLineCount.A, VCFHeaderLineType.String, "Gnomad pext Data from " + this.pextDatabasePath);
h2.addMetaDataLine(pexInfo);
JVarkitVersion.getInstance().addMetaData(this, h2);
out.writeHeader(h2);
while (iter.hasNext()) {
final VariantContext ctx = orderChecked.apply(iter.next());
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.rmAttribute(pexInfo.getID());
if (!ctx.isVariant() || (this.skipFiltered && ctx.isFiltered())) {
out.add(vcb.make());
continue;
}
final List<PextEntry> entries = findOverlapping(gextFileReader, ctx);
if (entries.isEmpty()) {
out.add(vcb.make());
continue;
}
final List<String> altInfo = new ArrayList<>(ctx.getAlleles().size());
for (int allele_idx = 1; /* 0 is ref */
allele_idx < ctx.getAlleles().size(); allele_idx++) {
final Allele alt = ctx.getAlleles().get(allele_idx);
final PextEntry entry = entries.stream().filter(E -> E.alt.equals(alt)).findFirst().orElse(null);
if (entry == null) {
altInfo.add(".");
} else {
final JsonElement e = jsonParser.parse(entry.jsonStr);
if (!e.isJsonArray())
throw new IllegalStateException("not an array: " + entry.jsonStr);
final JsonArray array = e.getAsJsonArray();
final StringBuilder sb = new StringBuilder();
for (int x = 0; x < array.size(); ++x) {
if (x > 0)
sb.append("&");
if (!array.get(x).isJsonObject())
throw new IllegalStateException("not an array: " + entry.jsonStr);
final StringBuilder sb2 = new StringBuilder();
final JsonObject obj = array.get(x).getAsJsonObject();
for (final Map.Entry<String, JsonElement> kv : obj.entrySet()) {
String key = kv.getKey();
// "Brain_FrontalCortex_BA9_": 1.0,
if (key.endsWith("_"))
key = key.substring(0, key.length() - 1);
// as far as I can see, tissues start with a uppercase
if (Character.isUpperCase(key.charAt(0)) && !this.restrictTissues.isEmpty() && !this.restrictTissues.contains(key))
continue;
final JsonElement v = kv.getValue();
if (v.isJsonNull())
continue;
if (v.getAsJsonPrimitive().isString()) {
final String strv = v.getAsString();
if (strv.equals("NaN"))
continue;
}
if (sb2.length() > 0)
sb2.append("|");
sb2.append(key);
sb2.append(":");
sb2.append(kv.getValue().getAsString());
}
sb.append(sb2.toString());
}
if (sb.length() == 0)
sb.append(".");
altInfo.add(sb.toString());
}
}
// at least none is not '.'
if (altInfo.stream().anyMatch(S -> !S.equals("."))) {
vcb.attribute(pexInfo.getID(), altInfo);
}
out.add(vcb.make());
}
out.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
use of com.github.lindenb.jvarkit.dict.OrderChecker in project jvarkit by lindenb.
the class VcfToIntervals method doWork.
@Override
public int doWork(final List<String> args) {
if (n_variants_per_interval >= 0 && distance_per_interval >= 0) {
LOG.error("n-variants per interval and distance both defined");
return -1;
} else if (n_variants_per_interval < 0 && distance_per_interval < 0) {
LOG.error("n-variants per interval or distance must be defined");
return -1;
}
try {
try (VCFIterator iter = super.openVCFIterator(oneFileOrNull(args))) {
final VCFHeader header = iter.getHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final OrderChecker<VariantContext> orderChecker = new OrderChecker<>(dict, false);
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
if (!this.force_bed_output) {
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
final SAMFileHeader samHeader = new SAMFileHeader(dict);
samHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
for (final String sn : header.getSampleNamesInOrder()) {
final SAMReadGroupRecord rg = new SAMReadGroupRecord(sn);
rg.setSample(sn);
samHeader.addReadGroup(rg);
}
JVarkitVersion.getInstance().addMetaData(this, samHeader);
codec.encode(pw, samHeader);
}
while (iter.hasNext()) {
final VariantContext first = orderChecker.apply(iter.next());
VariantContext last = first;
long n_variants = 1;
if (this.n_variants_per_interval > 0L) {
while (iter.hasNext() && n_variants < this.n_variants_per_interval) {
if (!first.contigsMatch(iter.peek())) {
break;
}
// consumme
last = orderChecker.apply(iter.next());
n_variants++;
}
} else {
while (iter.hasNext()) {
final VariantContext curr = iter.peek();
if (!first.contigsMatch(curr)) {
break;
}
if (CoordMath.getLength(first.getStart(), curr.getEnd()) > this.distance_per_interval) {
break;
}
// consumme
last = orderChecker.apply(iter.next());
n_variants++;
}
}
// next variant is just too close than the last one
while (this.min_distance >= 0 && iter.hasNext()) {
final VariantContext curr = iter.peek();
if (!last.withinDistanceOf(curr, this.min_distance))
break;
// consumme
last = orderChecker.apply(iter.next());
n_variants++;
}
pw.print(first.getContig());
pw.print("\t");
pw.print(first.getStart() - (this.force_bed_output ? 1 : 0));
pw.print("\t");
pw.print(last.getEnd());
pw.print("\t");
pw.print(n_variants);
pw.print("\t");
pw.print(CoordMath.getLength(first.getStart(), last.getEnd()));
pw.println();
}
// end while iter
pw.flush();
}
// end out
}
// end vcf iterator
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
use of com.github.lindenb.jvarkit.dict.OrderChecker in project jvarkit by lindenb.
the class VcfCadd method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator in, final VariantContextWriter out) {
try {
final VCFHeader header = in.getHeader();
final OrderChecker<VariantContext> orderChecker = new OrderChecker<>(SequenceDictionaryUtils.extractRequired(header), false);
JVarkitVersion.getInstance().addMetaData(this, header);
if (header.getInfoHeaderLine(this.CADD_FLAG_PHRED) != null) {
throw new JvarkitException.DuplicateVcfHeaderInfo(header, this.CADD_FLAG_PHRED);
}
if (header.getInfoHeaderLine(this.CADD_FLAG_SCORE) != null) {
throw new JvarkitException.DuplicateVcfHeaderInfo(header, this.CADD_FLAG_SCORE);
}
header.addMetaDataLine(new VCFInfoHeaderLine(this.CADD_FLAG_SCORE, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Score suggests that that variant is likely to be observed (negative values) vs simulated(positive values)." + "However, raw values do have relative meaning, with higher values indicating that a variant is more likely to be simulated (or -not observed-) and therefore more likely to have deleterious effects." + " URI was " + this.ccaduri + ". We use " + NA_value + " for unknown value."));
header.addMetaDataLine(new VCFInfoHeaderLine(this.CADD_FLAG_PHRED, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "PHRED expressing the rank in order of magnitude terms. For example, reference genome single nucleotide variants at the 10th-% of CADD scores are assigned to CADD-10, top 1% to CADD-20, top 0.1% to CADD-30, etc. " + " URI was " + this.ccaduri + ". We use " + NA_value + " for unknown value."));
for (final String uf : this.userFields) {
header.addMetaDataLine(new VCFInfoHeaderLine("CADD_" + uf, this.getFieldHeaderLineCount(uf), VCFHeaderLineType.String, "User field extracted from " + this.ccaduri));
}
out.writeHeader(header);
while (in.hasNext()) {
out.add(runTabix(orderChecker.apply(in.next())));
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
use of com.github.lindenb.jvarkit.dict.OrderChecker in project jvarkit by lindenb.
the class BamToMNV method doWork.
@Override
public int doWork(final List<String> args) {
try {
final List<Path> bams = IOUtils.unrollPaths(this.input_bams);
if (bams.isEmpty()) {
LOG.error("No bam was defined");
return -1;
}
final Pedigree pedigree;
if (this.pedigreePath != null) {
pedigree = new PedigreeParser().parse(this.pedigreePath);
pedigree.checkUniqIds();
} else {
pedigree = null;
}
try (VCFReader reader = VCFReaderFactory.makeDefault().open(oneAndOnlyOneFile(args), false)) {
final VCFHeader header = reader.getHeader();
final OrderChecker<VariantContext> order = new OrderChecker<>(header.getSequenceDictionary(), false);
try (CloseableIterator<VariantContext> r = reader.iterator()) {
this.all_variants.addAll(r.stream().filter(V -> V.isBiallelic() && V.isSNP()).map(V -> new VariantContextBuilder(V).noGenotypes().make()).map(order).collect(Collectors.toList()));
}
}
final List<Mnv> all_mnvs = new ArrayList<>();
for (int i = 0; i + 1 < this.all_variants.size(); i++) {
final VariantContext v1 = this.all_variants.get(i);
for (int j = i + 1; j < this.all_variants.size(); j++) {
final VariantContext v2 = this.all_variants.get(j);
if (!v1.withinDistanceOf(v2, min_distance_mnv))
break;
if (v1.getStart() == v2.getStart())
continue;
all_mnvs.add(new Mnv(i, j));
}
}
final Set<String> samples = new TreeSet<>();
final SamReaderFactory srf = super.createSamReaderFactory().referenceSequence(this.faidx);
for (final Path bam : bams) {
LOG.info(String.valueOf(bam));
try (SamReader sr = srf.open(bam)) {
final SAMFileHeader header = sr.getFileHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final String sample = header.getReadGroups().stream().map(R -> R.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(bam));
if (samples.contains(sample)) {
LOG.error("duplicate sample " + sample);
return -1;
}
samples.add(sample);
if (pedigree != null && pedigree.getSampleById(sample) == null) {
LOG.warn("sample " + sample + " from " + bam + " is not in pedigree.");
}
if (all_mnvs.isEmpty())
continue;
final QueryInterval[] intervals = QueryInterval.optimizeIntervals(this.all_variants.stream().map(V -> new QueryInterval(dict.getSequenceIndex(V.getContig()), V.getStart(), V.getEnd())).toArray(X -> new QueryInterval[X]));
final List<SAMRecord> sam_reads = new ArrayList<>();
try (CloseableIterator<SAMRecord> iter = sr.query(intervals, false)) {
while (iter.hasNext()) {
final SAMRecord record = iter.next();
if (!SAMRecordDefaultFilter.accept(record, this.minmapq))
continue;
if (record.getReadBases() == SAMRecord.NULL_SEQUENCE)
continue;
sam_reads.add(record);
}
}
// sort on query name
Collections.sort(sam_reads, (A, B) -> A.getReadName().compareTo(B.getReadName()));
for (final Mnv mnv : all_mnvs) {
final Phase phase = mnv.getPhase(sam_reads);
if (phase.equals(Phase.ambigous))
continue;
mnv.sample2phase.put(sample, phase);
}
}
}
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
pw.print("#CHROM1\tPOS1\tREF1\tALT1");
pw.print("\tCHROM2\tPOS2\tREF2\tALT2");
pw.print("\tdistance");
for (final String sn : samples) pw.print("\t" + sn);
if (pedigree != null) {
pw.print("\tcase-cis\tcase-trans\tcontrol-cis\tcontrol-trans\tfisher");
}
pw.println();
for (final Mnv mnv : all_mnvs) {
if (mnv.sample2phase.values().stream().allMatch(V -> V.equals(Phase.ambigous) || V.equals(Phase.ref)))
continue;
for (int side = 0; side < 2; ++side) {
final VariantContext ctx = mnv.get(side);
if (side > 0)
pw.print("\t");
pw.print(ctx.getContig());
pw.print("\t");
pw.print(ctx.getStart());
pw.print("\t");
pw.print(ctx.getReference().getDisplayString());
pw.print("\t");
pw.print(ctx.getAlleles().get(1).getDisplayString());
}
pw.print("\t");
pw.print(CoordMath.getLength(mnv.get(0).getStart(), mnv.get(1).getEnd()));
int case_cis = 0;
int case_trans = 0;
int ctrl_cis = 0;
int ctrl_trans = 0;
for (final String sn : samples) {
pw.print("\t");
final Phase phase = mnv.sample2phase.get(sn);
if (phase == null) {
pw.print(".");
continue;
}
pw.print(phase.name());
if (pedigree != null) {
final Sample sample = pedigree.getSampleById(sn);
if (sample == null) {
// nothing
} else if (sample.isAffected()) {
if (phase.equals(Phase.cis)) {
case_cis++;
} else if (phase.equals(Phase.trans)) {
case_trans++;
}
} else if (sample.isUnaffected()) {
if (phase.equals(Phase.cis)) {
ctrl_cis++;
} else if (phase.equals(Phase.trans)) {
ctrl_trans++;
}
}
}
}
if (pedigree != null) {
pw.print("\t");
pw.print(case_cis);
pw.print("\t");
pw.print(case_trans);
pw.print("\t");
pw.print(ctrl_cis);
pw.print("\t");
pw.print(ctrl_trans);
pw.print("\t");
final FisherExactTest fisher = FisherExactTest.compute(case_cis, case_trans, ctrl_cis, ctrl_trans);
pw.print(fisher.getAsDouble());
}
pw.println();
}
pw.flush();
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
Aggregations