use of com.github.lindenb.jvarkit.variant.vcf.BcfIteratorBuilder in project jvarkit by lindenb.
the class OnePassVcfLauncher method doWork.
@Override
public int doWork(final List<String> args) {
VCFIterator in = null;
VariantContextWriter vcw = null;
final String input = super.oneFileOrNull(args);
if (input != null && this.outputFile != null && !IOUtil.isUrl(input) && Paths.get(input).equals(this.outputFile)) {
LOG.error("Input == output : " + input);
return -1;
}
try {
if (beforeVcf() != 0) {
LOG.error("initialization failed");
return -1;
}
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
try {
final BcfIteratorBuilder bcb = new BcfIteratorBuilder();
if (input == null) {
in = bcb.open(stdin());
} else {
in = bcb.open(input);
}
if (getLogger() != null) {
in = new VCFIter(in, getLogger());
}
vcw = this.writingVariantsDelegate.dictionary(in.getHeader()).open(this.outputFile);
final int err = doVcfToVcf(input == null ? "<stdin>" : input, in, vcw);
vcw.close();
vcw = null;
in.close();
in = null;
if (err != 0)
deleteOutputOnError();
return err;
} catch (final Throwable err) {
LOG.error(err);
if (vcw != null)
vcw.close();
deleteOutputOnError();
return -1;
} finally {
try {
afterVcf();
} catch (final Throwable err) {
LOG.error(err);
}
if (in != null)
in.close();
if (vcw != null)
vcw.close();
}
}
use of com.github.lindenb.jvarkit.variant.vcf.BcfIteratorBuilder in project jvarkit by lindenb.
the class GroupByGene method read.
private void read(final String input) throws IOException {
SortingCollection<Call> sortingCollection = null;
VCFIterator vcfIterator = null;
try {
final BcfIteratorBuilder iterbuilder = new BcfIteratorBuilder();
vcfIterator = (input == null ? iterbuilder.open(stdin()) : iterbuilder.open(input));
final VCFHeader header = vcfIterator.getHeader();
this.contigDictComparator = new ContigDictComparator(SequenceDictionaryUtils.extractRequired(header));
sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(header), (C1, C2) -> C1.compare2(C2), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingCollection.setDestructiveIteration(true);
final GeneExtractorFactory geneExtractorFactory = new GeneExtractorFactory(header);
final List<GeneExtractorFactory.GeneExtractor> geneExtractors = geneExtractorFactory.parse(this.extractorsNames);
final List<String> sampleNames;
if (header.getSampleNamesInOrder() != null) {
sampleNames = header.getSampleNamesInOrder();
} else {
sampleNames = Collections.emptyList();
}
final Pedigree pedigree;
if (this.pedigreePath != null) {
final PedigreeParser pedParser = new PedigreeParser();
pedigree = pedParser.parse(this.pedigreePath);
} else {
pedigree = PedigreeParser.empty();
}
final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(header).logger(LOG).build();
while (vcfIterator.hasNext()) {
final VariantContext ctx = progress.apply(vcfIterator.next());
if (!ctx.isVariant())
continue;
if (ignore_filtered && ctx.isFiltered())
continue;
// simplify line
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.noID();
vcb.log10PError(VariantContext.NO_LOG10_PERROR);
vcb.unfiltered();
vcb.attributes(Collections.emptyMap());
final VariantContext ctx2 = vcb.make();
final SortingCollection<Call> finalSorter = sortingCollection;
geneExtractors.stream().flatMap(EX -> EX.apply(ctx).keySet().stream()).forEach(KG -> {
final Call c = new Call();
c.ctx = ctx2;
c.gene = new GeneName(KG.getKey(), KG.getGene(), KG.getMethod());
finalSorter.add(c);
});
}
CloserUtil.close(vcfIterator);
vcfIterator = null;
sortingCollection.doneAdding();
progress.close();
/**
* dump
*/
final Set<String> casesSamples = pedigree.getAffectedSamples().stream().map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> controlsSamples = pedigree.getUnaffectedSamples().stream().map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> maleSamples = pedigree.getSamples().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> femaleSamples = pedigree.getSamples().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Predicate<Genotype> genotypeFilter = genotype -> {
if (!genotype.isAvailable())
return false;
if (!genotype.isCalled())
return false;
if (genotype.isNoCall())
return false;
if (genotype.isHomRef())
return false;
if (this.ignore_filtered_genotype && genotype.isFiltered())
return false;
return true;
};
PrintStream pw = openPathOrStdoutAsPrintStream(this.outFile);
pw.print("#chrom");
pw.print('\t');
pw.print("min.POS");
pw.print('\t');
pw.print("max.POS");
pw.print('\t');
pw.print("gene.name");
pw.print('\t');
pw.print("gene.label");
pw.print('\t');
pw.print("gene.type");
pw.print('\t');
pw.print("samples.affected");
pw.print('\t');
pw.print("count.variations");
if (this.print_positions) {
pw.print('\t');
pw.print("positions");
}
if (!casesSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.cases");
}
if (!controlsSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.controls");
}
if (!maleSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.males");
}
if (!femaleSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.females");
}
if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
pw.print('\t');
pw.print("fisher");
}
for (final String sample : sampleNames) {
pw.print('\t');
pw.print(sample);
}
pw.println();
final CloseableIterator<Call> iter = sortingCollection.iterator();
final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
while (eqiter.hasNext()) {
final List<Call> row = eqiter.next();
final Call first = row.get(0);
final List<VariantContext> variantList = row.stream().map(R -> R.ctx).collect(Collectors.toList());
final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
final Set<String> sampleCarryingMut = new HashSet<String>();
final Counter<String> pedCasesCarryingMut = new Counter<String>();
final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
final Counter<String> malesCarryingMut = new Counter<String>();
final Counter<String> femalesCarryingMut = new Counter<String>();
final Counter<String> sample2count = new Counter<String>();
for (final VariantContext ctx : variantList) {
for (final Genotype genotype : ctx.getGenotypes()) {
if (!genotypeFilter.test(genotype))
continue;
final String sampleName = genotype.getSampleName();
sample2count.incr(sampleName);
sampleCarryingMut.add(sampleName);
if (casesSamples.contains(sampleName)) {
pedCasesCarryingMut.incr(sampleName);
}
if (controlsSamples.contains(sampleName)) {
pedCtrlsCarryingMut.incr(sampleName);
}
if (maleSamples.contains(sampleName)) {
malesCarryingMut.incr(sampleName);
}
if (femaleSamples.contains(sampleName)) {
femalesCarryingMut.incr(sampleName);
}
}
}
pw.print(first.getContig());
pw.print('\t');
// convert to bed
pw.print(minPos - 1);
pw.print('\t');
pw.print(maxPos);
pw.print('\t');
pw.print(first.gene.name);
pw.print('\t');
pw.print(first.gene.label);
pw.print('\t');
pw.print(first.gene.type);
pw.print('\t');
pw.print(sampleCarryingMut.size());
pw.print('\t');
pw.print(variantList.size());
if (this.print_positions) {
pw.print('\t');
pw.print(variantList.stream().map(CTX -> String.valueOf(CTX.getStart()) + ":" + CTX.getAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining("/"))).collect(Collectors.joining(";")));
}
if (!casesSamples.isEmpty()) {
pw.print('\t');
pw.print(pedCasesCarryingMut.getCountCategories());
}
if (!controlsSamples.isEmpty()) {
pw.print('\t');
pw.print(pedCtrlsCarryingMut.getCountCategories());
}
if (!maleSamples.isEmpty()) {
pw.print('\t');
pw.print(malesCarryingMut.getCountCategories());
}
if (!femaleSamples.isEmpty()) {
pw.print('\t');
pw.print(femalesCarryingMut.getCountCategories());
}
if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
int count_case_mut = 0;
int count_ctrl_mut = 0;
int count_case_wild = 0;
int count_ctrl_wild = 0;
for (final String sampleName : header.getSampleNamesInOrder()) {
final boolean has_mutation = variantList.stream().map(V -> V.getGenotype(sampleName)).anyMatch(G -> G != null && genotypeFilter.test(G));
if (controlsSamples.contains(sampleName)) {
if (has_mutation) {
count_ctrl_mut++;
} else {
count_ctrl_wild++;
}
} else if (casesSamples.contains(sampleName)) {
if (has_mutation) {
count_case_mut++;
} else {
count_case_wild++;
}
}
}
final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
pw.print('\t');
pw.print(fisher.getAsDouble());
}
for (final String sample : sampleNames) {
pw.print('\t');
pw.print(sample2count.count(sample));
}
pw.println();
if (pw.checkError())
break;
}
eqiter.close();
iter.close();
pw.flush();
if (this.outFile != null)
pw.close();
} finally {
CloserUtil.close(vcfIterator);
if (sortingCollection != null)
sortingCollection.cleanup();
}
}
use of com.github.lindenb.jvarkit.variant.vcf.BcfIteratorBuilder in project jvarkit by lindenb.
the class GridssMergeBnd method doWork.
@Override
public int doWork(List<String> args) {
int onlyContigTid = -1;
try {
final Map<String, Integer> sample2index = new HashMap<>();
final List<String> idx2sample = new ArrayList<>();
SortingCollection<BreakPoint> sorter = SortingCollection.newInstance(BreakPoint.class, new BreakPointCodec(), (A, B) -> A.compare(B), this.sortingDelegate.maxRecordsInRam, this.sortingDelegate.getTmpPaths());
sorter.setDestructiveIteration(true);
for (final Path path : IOUtils.unrollPaths(args)) {
LOG.info("Reading " + path);
try (VCFIterator iter = new BcfIteratorBuilder().open(path)) {
final VCFHeader header = iter.getHeader();
if (header.getNGenotypeSamples() != 1) {
LOG.error("Expected one sample in " + path);
return -1;
}
final String sn = header.getSampleNamesInOrder().get(0);
if (sample2index.containsKey(sn)) {
LOG.error("Duplicate sample " + sn + " from " + path);
return -1;
}
final int sample_id = idx2sample.size();
idx2sample.add(sn);
sample2index.put(sn, sample_id);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
if (this.theDict == null) {
this.theDict = dict;
if (this.onlyContig != null) {
onlyContigTid = this.theDict.getSequenceIndex(this.onlyContig);
if (onlyContigTid < 0)
throw new JvarkitException.ContigNotFoundInDictionary(this.onlyContig, this.theDict);
}
} else {
SequenceUtil.assertSequenceDictionariesEqual(dict, this.theDict);
}
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
final int tid = this.theDict.getSequenceIndex(ctx.getContig());
if (tid == -1)
throw new JvarkitException.ContigNotFoundInDictionary(ctx.getContig(), this.theDict);
if (onlyContigTid != -1 && onlyContigTid != tid)
continue;
if (onlyContigTid != -1 && onlyContigTid < tid)
break;
int start = ctx.getStart();
// yes start
int end = ctx.getStart();
if (ctx.hasAttribute("CIPOS")) {
final List<Integer> cipos = ctx.getAttributeAsIntList("CIPOS", 0);
if (cipos.size() > 0)
start += cipos.get(0);
if (cipos.size() > 1)
end += cipos.get(1);
if (start > end) {
final int tmp = start;
start = end;
end = tmp;
}
}
final QueryInterval qi1 = new QueryInterval(tid, start, end);
final QueryInterval[] qintervals;
if (!ctx.getAttributeAsString(VCFConstants.SVTYPE, "undefined").equals("BND") && ctx.getLengthOnReference() > this.withinDistance) {
int start2 = ctx.getEnd();
int end2 = ctx.getEnd();
if (ctx.hasAttribute("CIEND")) {
final List<Integer> ciend = ctx.getAttributeAsIntList("CIEND", 0);
if (ciend.size() > 0)
start2 += ciend.get(0);
if (ciend.size() > 1)
end2 += ciend.get(1);
if (start2 > end2) {
final int tmp = start2;
start2 = end2;
end2 = tmp;
}
}
final QueryInterval qi2 = new QueryInterval(tid, start2, end2);
qintervals = new QueryInterval[] { qi1, qi2 };
} else {
qintervals = new QueryInterval[] { qi1 };
}
for (QueryInterval qi : qintervals) {
sorter.add(new BreakPoint(qi.referenceIndex, qi.start, qi.end, sample_id, ++ID_GENERATOR));
}
}
// end while
}
}
sorter.doneAdding();
LOG.info("Done adding");
boolean debug = false;
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY, VCFConstants.ALLELE_NUMBER_KEY);
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
final VCFHeader header = new VCFHeader(metaData, new TreeSet<>(sample2index.keySet()));
header.setSequenceDictionary(this.theDict);
JVarkitVersion.getInstance().addMetaData(this, header);
long count = 0L;
final Allele ref_allele = Allele.create("N", true);
final Allele alt_allele = Allele.create("<B>", false);
final List<Allele> alleles = Arrays.asList(ref_allele, alt_allele);
try (CloseableIterator<BreakPoint> iter0 = sorter.iterator();
VariantContextWriter vcw = this.writingVariantsDelegate.dictionary(this.theDict).open(this.outputFile)) {
vcw.writeHeader(header);
final PeekIterator<BreakPoint> iter = new PeekIterator<>(iter0);
final Map<Integer, BreakPoint> sample2bnd = new HashMap<>();
while (iter.hasNext()) {
final BreakPoint first = iter.next();
// debug = first.getStart() >= 32_552_334 && first.getStart()< 50_000_000 && first.getContig().equals("chr2");
if (debug)
System.err.println("A " + first);
sample2bnd.clear();
sample2bnd.put(first.sample_id, first);
while (iter.hasNext()) {
if (debug)
System.err.println("B " + first);
final BreakPoint bp2 = iter.peek();
if (debug)
System.err.println("C " + bp2);
if (sample2bnd.values().stream().noneMatch(BP -> BP.withinDistance(bp2, this.withinDistance))) {
if (debug)
System.err.println("D " + first + " " + bp2);
break;
}
if (debug)
System.err.println("E " + first + " " + bp2 + " ");
sample2bnd.put(bp2.sample_id, iter.next());
}
final int end = sample2bnd.values().stream().mapToInt(BP -> BP.end).max().orElse(first.end);
if (end - first.getStart() > 1000)
continue;
final VariantContextBuilder vcb = new VariantContextBuilder(null, first.getContig(), first.getStart(), end, alleles);
if (first.getStart() != end)
vcb.attribute(VCFConstants.END_KEY, end);
if (this.add_attributes) {
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, sample2bnd.size());
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, sample2index.size() * 2);
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, sample2bnd.size() / (2.0 * sample2index.size()));
}
vcb.genotypes(sample2bnd.keySet().stream().map(SIDX -> new GenotypeBuilder(idx2sample.get(SIDX), alleles).make()).collect(Collectors.toList()));
if (debug)
System.err.println("F " + first + " " + count);
vcw.add(vcb.make());
count++;
if (count % 10_000 == 0) {
LOG.info("N=" + count + " last:" + first);
if (vcw.checkError()) {
LOG.error("Cannot write!!! " + count + "/" + first);
return -1;
}
}
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
Aggregations