Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
From the class SamFindClippedRegions, method doWork:
@Override
public int doWork(final List<String> args) {
final int bad_mapq = 30;
try {
if (this.min_clip_depth > this.min_depth) {
LOG.error("this.min_clip_depth>this.min_depth");
return -1;
}
if (this.fraction < 0 || this.fraction > 1.0) {
LOG.error("bad ratio: " + fraction);
return -1;
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceFai);
final List<Path> inputs = IOUtils.unrollPaths(args);
if (inputs.isEmpty()) {
LOG.error("input is missing");
return -1;
}
final IntervalTreeMap<Interval> intronMap = new IntervalTreeMap<>();
if (this.gtfPath != null) {
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasIntron()).flatMap(T -> T.getIntrons().stream()).map(I -> new Interval(I.getContig(), I.getStart(), I.getEnd(), I.isNegativeStrand(), I.getTranscript().getId())).forEach(I -> intronMap.put(I, I));
}
}
final List<String> sample_list = new ArrayList<>();
final QueryInterval[] queryIntervals;
if (this.bedPath == null) {
queryIntervals = null;
} else {
try (BedLineReader br = new BedLineReader(this.bedPath).setValidationStringency(ValidationStringency.LENIENT).setContigNameConverter(ContigNameConverter.fromOneDictionary(dict))) {
queryIntervals = br.optimizeIntervals(dict);
}
}
SortingCollection<Base> sortingCollection = SortingCollection.newInstance(Base.class, new BaseCodec(), (A, B) -> A.compare1(B), writingSortingCollection.getMaxRecordsInRam(), writingSortingCollection.getTmpPath());
sortingCollection.setDestructiveIteration(true);
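/* a position is worth keeping only if at least one clipped read was seen there */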
final Predicate<Base> acceptBase = B -> {
return B.clip() > 0;
};
final SamReaderFactory srf = super.createSamReaderFactory().referenceSequence(this.referenceFai);
for (final Path path : inputs) {
try (SamReader sr = srf.open(path)) {
final SAMFileHeader header = sr.getFileHeader();
SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
final String sample_name = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(path));
if (sample_list.contains(sample_name)) {
LOG.error("duplicate sample " + sample_name + " in " + path);
return -1;
}
final int sample_idx = sample_list.size();
sample_list.add(sample_name);
int prev_tid = -1;
final SortedMap<Integer, Base> pos2base = new TreeMap<>();
/* get base in pos2base, create it if needed */
final BiFunction<Integer, Integer, Base> baseAt = (TID, POS) -> {
Base b = pos2base.get(POS);
if (b == null) {
b = new Base(sample_idx, TID, POS);
pos2base.put(POS, b);
}
return b;
};
try (CloseableIterator<SAMRecord> iter = queryIntervals == null ? sr.iterator() : sr.query(queryIntervals, false)) {
for (; ; ) {
final SAMRecord rec = (iter.hasNext() ? iter.next() : null);
if (rec != null && !SAMRecordDefaultFilter.accept(rec, this.min_mapq))
continue;
if (rec == null || prev_tid != rec.getReferenceIndex().intValue()) {
for (final Integer pos : pos2base.keySet()) {
final Base b = pos2base.get(pos);
if (acceptBase.test(b))
sortingCollection.add(b);
}
/* clear before a possible break, otherwise the final flush after the loop would add these bases a second time */
pos2base.clear();
if (rec == null)
break;
prev_tid = rec.getReferenceIndex().intValue();
}
/* pop back previous bases */
for (Iterator<Integer> rpos = pos2base.keySet().iterator(); rpos.hasNext(); ) {
final Integer pos = rpos.next();
if (pos.intValue() + this.max_clip_length >= rec.getUnclippedStart())
break;
final Base b = pos2base.get(pos);
if (acceptBase.test(b))
sortingCollection.add(b);
rpos.remove();
}
final Cigar cigar = rec.getCigar();
int refPos = rec.getAlignmentStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int x = 0; x < ce.getLength(); ++x) {
final Base gt = baseAt.apply(prev_tid, refPos + x);
gt.noClip++;
gt.noClip_sum_mapq += rec.getMappingQuality();
}
} else if (op.equals(CigarOperator.D) || op.equals(CigarOperator.N)) {
baseAt.apply(prev_tid, refPos).del++;
baseAt.apply(prev_tid, refPos + ce.getLength() - 1).del++;
}
refPos += ce.getLength();
}
}
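/* record the clipped read ends: a leading clip is counted at the base just before the alignment start, a trailing clip just after the alignment end */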
CigarElement ce = cigar.getFirstCigarElement();
if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
baseAt.apply(prev_tid, rec.getStart() - 1).leftClip++;
}
ce = cigar.getLastCigarElement();
if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
baseAt.apply(prev_tid, rec.getEnd() + 1).rightClip++;
}
}
}
/* write last bases */
for (final Integer pos : pos2base.keySet()) {
final Base b = pos2base.get(pos);
if (acceptBase.test(b))
sortingCollection.add(b);
}
}
// end open reader
}
// end loop sam files
sortingCollection.doneAdding();
/* build VCF header */
final Allele reference_allele = Allele.create("N", true);
final Allele alt_allele = Allele.create("<CLIP>", false);
final Set<VCFHeaderLine> vcfHeaderLines = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(vcfHeaderLines, true, VCFConstants.GENOTYPE_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
VCFStandardHeaderLines.addStandardInfoLines(vcfHeaderLines, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_FREQUENCY_KEY);
final VCFFormatHeaderLine leftClip = new VCFFormatHeaderLine("CL", 1, VCFHeaderLineType.Integer, "Left Clip");
vcfHeaderLines.add(leftClip);
final VCFFormatHeaderLine rightClip = new VCFFormatHeaderLine("RL", 1, VCFHeaderLineType.Integer, "Right Clip");
vcfHeaderLines.add(rightClip);
final VCFFormatHeaderLine totalClip = new VCFFormatHeaderLine("TL", 1, VCFHeaderLineType.Integer, "Total Clip");
vcfHeaderLines.add(totalClip);
final VCFFormatHeaderLine totalDel = new VCFFormatHeaderLine("DL", 1, VCFHeaderLineType.Integer, "Total Deletions");
vcfHeaderLines.add(totalDel);
final VCFFormatHeaderLine noClipMAPQ = new VCFFormatHeaderLine("MQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for reads without clip at this position.");
vcfHeaderLines.add(noClipMAPQ);
final VCFInfoHeaderLine averageMAPQ = new VCFInfoHeaderLine("AVG_MAPQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for called genotypes");
vcfHeaderLines.add(averageMAPQ);
final VCFInfoHeaderLine infoRetrogene = new VCFInfoHeaderLine("RETROGENE", 1, VCFHeaderLineType.String, "Transcript name for possible retrogene.");
vcfHeaderLines.add(infoRetrogene);
final VCFFilterHeaderLine filterRetrogene = new VCFFilterHeaderLine("POSSIBLE_RETROGENE", "Junction is a possible retrogene.");
vcfHeaderLines.add(filterRetrogene);
final VCFFilterHeaderLine filterlowMapq = new VCFFilterHeaderLine("LOW_MAPQ", "Low average mapq (< " + bad_mapq + ")");
vcfHeaderLines.add(filterlowMapq);
final VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, sample_list);
vcfHeader.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, vcfHeader);
this.writingVcfConfig.dictionary(dict);
try (VariantContextWriter w = this.writingVcfConfig.open(this.outputFile)) {
w.writeHeader(vcfHeader);
try (CloseableIterator<Base> r1 = sortingCollection.iterator()) {
try (EqualRangeIterator<Base> r2 = new EqualRangeIterator<>(r1, (A, B) -> A.compare2(B))) {
while (r2.hasNext()) {
final List<Base> array = r2.next();
final Base first = array.get(0);
if (first.pos < 1)
continue;
// no clip
if (array.stream().mapToInt(G -> G.clip()).sum() == 0)
continue;
if (array.stream().allMatch(G -> G.clip() < min_clip_depth))
continue;
if (array.stream().allMatch(G -> G.dp() < min_depth))
continue;
if (array.stream().allMatch(G -> G.ratio() < fraction))
continue;
final VariantContextBuilder vcb = new VariantContextBuilder();
final String chrom = dict.getSequence(first.tid).getSequenceName();
vcb.chr(chrom);
vcb.start(first.pos);
vcb.stop(first.pos);
vcb.alleles(Arrays.asList(reference_allele, alt_allele));
vcb.attribute(VCFConstants.DEPTH_KEY, array.stream().mapToInt(G -> G.dp()).sum());
final List<Genotype> genotypes = new ArrayList<>(array.size());
int AC = 0;
int AN = 0;
int max_clip = 1;
double sum_mapq = 0.0;
int count_mapq = 0;
for (final Base gt : array) {
final GenotypeBuilder gb = new GenotypeBuilder(sample_list.get(gt.sample_idx));
if (gt.clip() == 0 && gt.noClip == 0) {
gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
} else if (gt.noClip == 0) {
gb.alleles(Arrays.asList(alt_allele, alt_allele));
AC += 2;
sum_mapq += gt.noClipMapq();
count_mapq++;
AN += 2;
} else if (gt.clip() == 0) {
gb.alleles(Arrays.asList(reference_allele, reference_allele));
AN += 2;
} else {
gb.alleles(Arrays.asList(reference_allele, alt_allele));
AC++;
sum_mapq += gt.noClipMapq();
count_mapq++;
AN += 2;
}
gb.DP(gt.dp());
gb.attribute(leftClip.getID(), gt.leftClip);
gb.attribute(rightClip.getID(), gt.rightClip);
gb.attribute(totalClip.getID(), gt.clip());
gb.attribute(totalDel.getID(), gt.del);
gb.attribute(noClipMAPQ.getID(), gt.noClipMapq());
gb.AD(new int[] { gt.noClip, gt.clip() });
genotypes.add(gb.make());
max_clip = Math.max(max_clip, gt.clip());
}
if (count_mapq > 0) {
final int avg_mapq = (int) (sum_mapq / count_mapq);
vcb.attribute(averageMAPQ.getID(), avg_mapq);
if (avg_mapq < bad_mapq)
vcb.filter(filterlowMapq.getID());
}
if (gtfPath != null) {
final Locatable bounds1 = new SimpleInterval(chrom, Math.max(1, first.pos - max_intron_distance), first.pos + max_intron_distance);
intronMap.getOverlapping(bounds1).stream().filter(I -> Math.abs(I.getStart() - first.pos) <= this.max_intron_distance || Math.abs(I.getEnd() - first.pos) <= this.max_intron_distance).map(I -> I.getName()).findFirst().ifPresent(transcript_id -> {
vcb.attribute(infoRetrogene.getID(), transcript_id);
vcb.filter(filterRetrogene.getID());
});
}
vcb.log10PError(max_clip / -10.0);
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, AC);
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, AN);
if (AN > 0)
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, AC / (float) AN);
vcb.genotypes(genotypes);
w.add(vcb.make());
}
// end while r2
}
// end r2
}
// end r1
}
// end writer
sortingCollection.cleanup();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
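The BedLineReader idiom in this example boils down to: open the BED with lenient validation, convert contig names against the reference dictionary, and collapse the records into sorted QueryIntervals for an indexed BAM query. A minimal sketch of just that pattern follows; dict, bedPath and samReader are hypothetical placeholders standing in for the fields used above.
// Sketch only: dict, bedPath and samReader are assumed to exist as in the code above.
final QueryInterval[] queryIntervals;
try (BedLineReader br = new BedLineReader(bedPath).setValidationStringency(ValidationStringency.LENIENT).setContigNameConverter(ContigNameConverter.fromOneDictionary(dict))) {
// merge overlapping BED records into sorted QueryIntervals usable by SamReader.query()
queryIntervals = br.optimizeIntervals(dict);
}
try (CloseableIterator<SAMRecord> iter = samReader.query(queryIntervals, false)) {
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
// process rec ...
}
}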
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
From the class VcfROH, method doWork:
@Override
public int doWork(final List<String> args) {
try {
final String input = oneFileOrNull(args);
try (VCFIterator iter = input == null ? new VCFIteratorBuilder().open(stdin()) : new VCFIteratorBuilder().open(input)) {
final VCFHeader header = iter.getHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final OrderChecker<VariantContext> checker = new OrderChecker<VariantContext>(dict, false);
final List<Sample> samples = new ArrayList<>(header.getNGenotypeSamples());
for (String sn : header.getSampleNamesInOrder()) {
final Sample sample = new Sample(sn, header.getSampleNameToOffset().get(sn));
samples.add(sample);
}
final IntervalTreeMap<Boolean> treemap;
if (this.intervalBedPath != null) {
try (BedLineReader br = new BedLineReader(this.intervalBedPath)) {
treemap = br.toIntervalTreeMap(B -> Boolean.TRUE);
}
} else {
treemap = null;
}
try (PrintWriter w = super.openPathOrStdoutAsPrintWriter(this.output)) {
if (!this.output_as_bed) {
final SAMFileHeader samHeader = new SAMFileHeader(dict);
samHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
for (Sample sn : samples) {
final SAMReadGroupRecord rg = new SAMReadGroupRecord(sn.name);
rg.setSample(sn.name);
samHeader.addReadGroup(rg);
}
JVarkitVersion.getInstance().addMetaData(this, samHeader);
codec.encode(w, samHeader);
}
while (iter.hasNext()) {
final VariantContext ctx = checker.apply(iter.next());
for (Sample sample : samples) sample.visit(w, ctx, treemap);
}
for (Sample sample : samples) sample.finish(w, treemap);
w.flush();
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
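Here the reader's role is a single call: turning the BED file into an IntervalTreeMap that serves as a fast overlap test while streaming the VCF. A minimal sketch of that idiom, with a hypothetical path and an illustrative query:
final IntervalTreeMap<Boolean> treemap;
try (BedLineReader br = new BedLineReader(intervalBedPath)) {
// one tree entry per BED record; only the keys matter for overlap queries
treemap = br.toIntervalTreeMap(B -> Boolean.TRUE);
}
// the tree then answers overlap queries such as:
final boolean inBed = treemap.containsOverlapping(new Interval("chr1", 100, 200));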
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
From the class VcfGeneEpistasis, method doWork:
@Override
public int doWork(final List<String> args) {
if (this.geneBed == null) {
LOG.error("gene file bed undefined");
return -1;
}
if (this.outputFile == null) {
LOG.error("output file undefined");
return -1;
}
CloseableIterator<VariantContext> iter = null;
try {
final File vcfFile = new File(oneAndOnlyOneFile(args));
this.vcfFileReader = VCFReaderFactory.makeDefault().open(vcfFile.toPath(), true);
final VCFHeader header = this.vcfFileReader.getHeader();
final Pedigree pedigree;
if (this.pedigreeFile != null) {
pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
} else {
pedigree = new Pedigree.Parser().parse(header);
}
if (pedigree == null || pedigree.isEmpty() || !pedigree.hasAffected() || !pedigree.hasUnaffected()) {
LOG.error("empty ped or no case/ctrl");
return -1;
}
pedigree.verifyPersonsHaveUniqueNames();
for (final Pedigree.Person p : pedigree.getPersons().stream().filter(P -> P.isAffected() || P.isUnaffected()).filter(P -> header.getSampleNamesInOrder().contains(P.getId())).collect(Collectors.toSet())) {
this.id2samples.put(p.getId(), p);
}
this.vcfTools = new VcfTools(header);
List<Interval> geneList;
if (!this.geneBed.exists()) {
final Map<String, Interval> gene2interval = new HashMap<>(50000);
LOG.info("building gene file" + this.geneBed);
iter = this.vcfFileReader.iterator();
// iter = this.vcfFileReader.query("chr3",1,300_000_000);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
while (iter.hasNext()) {
final VariantContext ctx = progress.watch(iter.next());
if (!accept(ctx))
continue;
for (final String geneName : getGenes(ctx)) {
final Interval old = gene2interval.get(geneName);
if (old == null) {
gene2interval.put(geneName, new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, geneName));
LOG.info("adding " + geneName + ". number of genes: " + gene2interval.size());
} else if (!old.getContig().equals(ctx.getContig())) {
LOG.error("boum :" + geneName + ": on chrom " + ctx.getContig() + " vs " + old);
return -1;
} else {
gene2interval.put(geneName, new Interval(ctx.getContig(), Math.min(ctx.getStart(), old.getStart()), Math.max(ctx.getEnd(), old.getEnd()), false, geneName));
}
}
}
iter.close();
iter = null;
progress.finish();
geneList = new ArrayList<>(gene2interval.values());
PrintWriter pw = new PrintWriter(this.geneBed);
for (final Interval g : geneList) {
pw.println(g.getContig() + "\t" + (g.getStart() - 1) + "\t" + (g.getEnd()) + "\t" + g.getName());
}
pw.flush();
pw.close();
pw = null;
} else {
try (BedLineReader r = new BedLineReader(geneBed)) {
geneList = r.stream().filter(B -> B != null).map(B -> new Interval(B.getContig(), B.getStart(), B.getEnd(), true, B.get(3))).collect(Collectors.toList());
}
}
if (geneList.isEmpty()) {
LOG.error("gene List is empty");
return -1;
}
final Comparator<VariantContext> ctxSorter = VCFUtils.createTidPosRefComparator(header.getSequenceDictionary());
final Function<Interval, List<VariantContext>> loadVariants = (R) -> {
List<VariantContext> L = new ArrayList<>();
CloseableIterator<VariantContext> r = this.vcfFileReader.query(R.getContig(), R.getStart(), R.getEnd());
while (r.hasNext()) {
final VariantContext ctx = r.next();
if (!accept(ctx))
continue;
if (!getGenes(ctx).contains(R.getName()))
continue;
L.add(ctx);
}
r.close();
return L;
};
final SkatExecutor executor = this.skatFactory.build();
Double bestSkat = null;
LOG.info("number of genes : " + geneList.size());
final int list_end_index = (this.user_end_index < 0 ? geneList.size() : Math.min(geneList.size(), this.user_end_index));
for (int x = this.user_begin_index; x < list_end_index; ++x) {
final Interval intervalx = geneList.get(x);
final List<VariantContext> variantsx = loadVariants.apply(intervalx);
if (variantsx.isEmpty())
continue;
for (int y = x; y < geneList.size(); /* not list_end_index */ ++y) {
final Interval intervaly;
final List<VariantContext> merge;
if (y == x) {
// y == x : we're testing gene x on its own
intervaly = intervalx;
merge = variantsx;
} else {
intervaly = geneList.get(y);
if (intervaly.intersects(intervalx))
continue;
final List<VariantContext> variantsy = loadVariants.apply(intervaly);
if (variantsy.isEmpty())
continue;
merge = new MergedList<>(variantsx, variantsy);
}
LOG.info("testing : [" + x + "]" + intervalx + " [" + y + "]" + intervaly + " N:" + geneList.size() + " best: " + bestSkat);
final Double skat = eval(executor, merge);
if (skat == null)
continue;
if (bestSkat == null || skat.compareTo(bestSkat) < 0) {
bestSkat = skat;
LOG.info("best " + bestSkat + " " + intervalx + " " + intervaly);
if (this.outputFile.getName().endsWith(".vcf") || this.outputFile.getName().endsWith(".vcf.gz")) {
final VCFHeader header2 = new VCFHeader(header);
header2.addMetaDataLine(new VCFHeaderLine(VcfGeneEpistasis.class.getName(), intervalx.getName() + " " + intervaly.getName() + " " + bestSkat));
final VariantContextWriter w = VCFUtils.createVariantContextWriter(outputFile);
w.writeHeader(header2);
merge.stream().sorted(ctxSorter).forEach(V -> w.add(V));
w.close();
} else {
final PrintWriter w = super.openFileOrStdoutAsPrintWriter(outputFile);
w.println(String.valueOf(bestSkat) + "\t" + intervalx.getName() + "\t" + intervaly.getName());
w.flush();
w.close();
}
}
}
}
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(this.vcfFileReader);
}
}
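Note the coordinate convention at play in this example: when it writes its own gene BED it emits g.getStart() - 1 because BED starts are 0-based and half-open, while the existing-BED branch feeds B.getStart() straight into a 1-based htsjdk Interval, implying BedLineReader does the reverse shift on read. A minimal round-trip sketch under that assumption (file name hypothetical, checked exceptions elided):
// write one gene as BED (0-based, half-open)
final Interval g = new Interval("chr1", 1001, 2000, true, "MYGENE");
try (PrintWriter pw = new PrintWriter("genes.bed")) {
pw.println(g.getContig() + "\t" + (g.getStart() - 1) + "\t" + g.getEnd() + "\t" + g.getName());
}
// read it back: the record comes out with 1-based coordinates again
try (BedLineReader r = new BedLineReader(Paths.get("genes.bed"))) {
final Interval back = r.stream().map(B -> new Interval(B.getContig(), B.getStart(), B.getEnd(), true, B.get(3))).findFirst().orElse(null);
// expected: back.getStart() == 1001 and back.getEnd() == 2000
}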
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
From the class VcfFisherCombinatorics, method doWork:
@Override
public int doWork(final List<String> args) {
try {
final Pedigree pedigree = new PedigreeParser().parse(this.pedigreeFile);
final IntervalTreeMap<List<GeneInfo>> geneInfoMap = new IntervalTreeMap<>();
final Predicate<Genotype> isGtCnv = G -> G.isHet() || G.isHomVar();
try (VCFIterator vcfIter = super.openVCFIterator(super.oneFileOrNull(args))) {
final VCFHeader header = vcfIter.getHeader();
final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(SequenceDictionaryUtils.extractRequired(header));
if (!header.hasGenotypingData()) {
LOG.error("No Genotype data in " + header);
return -1;
}
final Map<String, Integer> sample2index = header.getSampleNameToOffset();
final List<Integer> casesIdx = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isAffected()).map(S -> S.getId()).map(S -> sample2index.get(S)).sorted().collect(Collectors.toList());
LOG.info("cases N=" + casesIdx.size());
if (casesIdx.isEmpty()) {
LOG.error("No affected/cases sample in the input VCF");
return -1;
}
final List<Integer> controlsIdx = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isUnaffected()).map(S -> S.getId()).map(S -> sample2index.get(S)).sorted().collect(Collectors.toList());
LOG.info("controls N=" + controlsIdx.size());
if (controlsIdx.isEmpty()) {
LOG.error("No unaffected/controls sample in the input VCF");
return -1;
}
final Predicate<VariantContext> acceptCtx = V -> {
if (discard_bnd && V.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
return false;
if (discard_filtered && V.isFiltered())
return false;
if (max_af < 1.0) {
final Iterator<Integer> it = Stream.concat(controlsIdx.stream(), casesIdx.stream()).iterator();
int ac = 0;
int an = 0;
while (it.hasNext()) {
switch(V.getGenotype(it.next()).getType()) {
case HOM_VAR:
ac += 2;
an += 2;
break;
case HOM_REF:
ac += 0;
an += 2;
break;
case HET:
ac += 1;
an += 2;
break;
default:
break;
}
}
if (an == 0)
return false;
if (ac / (double) an > max_af)
return false;
}
return true;
};
try (BedLineReader br = new BedLineReader(this.bedPath)) {
while (br.hasNext()) {
final BedLine bed = br.next();
final String ctg = ctgConverter.apply(bed.getContig());
if (StringUtil.isBlank(ctg))
continue;
final GeneInfo geneInfo = new GeneInfo();
geneInfo.casesflags = new BitSet(casesIdx.size());
geneInfo.controlsflags = new BitSet(controlsIdx.size());
geneInfo.contig = ctg;
geneInfo.name = bed.getOrDefault(3, "undefined");
geneInfo.parts.add(new SimpleInterval(bed).renameContig(ctg));
final Interval key = new Interval(geneInfo);
List<GeneInfo> L = geneInfoMap.get(key);
if (L == null) {
L = new ArrayList<>();
geneInfoMap.put(key, L);
}
L.add(geneInfo);
}
}
if (geneInfoMap.isEmpty()) {
LOG.error("no gene found in " + this.bedPath);
return -1;
}
LOG.info("reading variants...");
while (vcfIter.hasNext()) {
final VariantContext ctx = vcfIter.next();
if (!acceptCtx.test(ctx))
continue;
for (List<GeneInfo> gil : geneInfoMap.getOverlapping(ctx)) {
for (GeneInfo gi : gil) {
for (int y = 0; y < casesIdx.size(); ++y) {
final Genotype gt = ctx.getGenotype(casesIdx.get(y));
if (!isGtCnv.test(gt))
continue;
gi.casesflags.set(y);
}
for (int y = 0; y < controlsIdx.size(); ++y) {
final Genotype gt = ctx.getGenotype(controlsIdx.get(y));
if (!isGtCnv.test(gt))
continue;
gi.controlsflags.set(y);
}
}
}
}
/* remove genes without variants; count samples per gene */
for (List<GeneInfo> gil : geneInfoMap.values()) {
gil.removeIf(GI -> GI.casesflags.nextSetBit(0) == -1 && GI.controlsflags.nextSetBit(0) == -1);
}
// end remove
/* build the final gene list */
final List<GeneInfo> geneList = geneInfoMap.values().stream().filter(L -> !L.isEmpty()).flatMap(L -> L.stream()).sorted().collect(Collectors.toList());
if (geneList.isEmpty()) {
LOG.error("no gene found in " + this.bedPath);
return -1;
}
LOG.info("N Genes=" + geneList.size());
Solution best = null;
for (int i = 2; i < Math.min(this.max_num_genes, geneList.size()); i++) {
LOG.info("sarting loop from " + i);
best = recursion(geneList, casesIdx, controlsIdx, new ArrayList<>(), i, best);
}
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
pw.println(best);
pw.flush();
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
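This example drives BedLineReader through the plain hasNext()/next() iterator protocol and passes each contig name through a ContigNameConverter so BED contigs match the VCF dictionary. The skeleton of that loop, with hypothetical variable names:
try (BedLineReader br = new BedLineReader(bedPath)) {
while (br.hasNext()) {
final BedLine bed = br.next();
final String ctg = ctgConverter.apply(bed.getContig());
if (StringUtil.isBlank(ctg))
continue; // contig unknown to the VCF dictionary: skip the record
final String name = bed.getOrDefault(3, "undefined"); // optional 4th (name) column
// ... build per-gene state here ...
}
}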
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
From the class FindGVCFsBlocks, method doWork:
@Override
public int doWork(final List<String> args) {
try {
if (this.bedPath != null) {
IOUtil.assertFileIsReadable(this.bedPath);
}
final List<Path> inputs = IOUtils.unrollPaths(args);
if (inputs.isEmpty()) {
LOG.error("input missing");
return -1;
}
if (merge_blocks_distance < 1) {
LOG.error("bad merge size : " + merge_blocks_distance);
return -1;
}
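// optional capture: when a BED is given, only blocks overlapping it are written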
final Predicate<Locatable> inCapture;
if (this.bedPath != null) {
final IntervalTreeMap<Boolean> intervalTreeMap;
try (BedLineReader blr = new BedLineReader(this.bedPath)) {
intervalTreeMap = blr.toIntervalTreeMap(BED -> Boolean.TRUE);
}
inCapture = (L) -> intervalTreeMap.containsOverlapping(L);
} else {
inCapture = (L) -> true;
}
if (this.outputFile != null) {
final String fname = this.outputFile.getFileName().toString();
if (!fname.endsWith(FileExtensions.INTERVAL_LIST) && !fname.endsWith(FileExtensions.COMPRESSED_INTERVAL_LIST)) {
LOG.error("Output should end with " + FileExtensions.INTERVAL_LIST + " or " + FileExtensions.COMPRESSED_INTERVAL_LIST);
return -1;
}
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(inputs.get(0));
if (!StringUtils.isBlank(the_contig) && dict.getSequence(the_contig) == null)
throw new JvarkitException.ContigNotFoundInDictionary(the_contig, dict);
try (IntervalListWriter w = new IntervalListWriter(this.outputFile, dict)) {
/* loop over chromosomes */
for (final SAMSequenceRecord ssr : dict.getSequences()) {
ContigBlocks mainBlock = null;
if (!StringUtils.isBlank(the_contig) && !ssr.getContig().equals(the_contig))
continue;
final long initMilliSec = System.currentTimeMillis();
for (int i = 0; i < inputs.size(); i++) {
final long startMilliSec = System.currentTimeMillis();
LOG.info(inputs.get(i) + " " + (i + 1) + "/" + inputs.size());
mainBlock = scanVcfFile(mainBlock, dict, ssr, inputs.get(i));
final long count_variants = mainBlock.end_positions.size();
final long millisecPerVcf = (System.currentTimeMillis() - initMilliSec) / (i + 1L);
LOG.info("N=" + count_variants + ". That took: " + StringUtils.niceDuration(System.currentTimeMillis() - startMilliSec) + " Remains: " + StringUtils.niceDuration((inputs.size() - (i + 1)) * millisecPerVcf));
}
/* nothing was seen */
if (mainBlock.minPos == null)
continue;
if (mainBlock.end_positions.isEmpty()) {
final Locatable loc = new SimpleInterval(ssr.getContig(), mainBlock.minPos, mainBlock.maxPos);
if (inCapture.test(loc)) {
w.add(loc);
}
continue;
}
final List<Locatable> intervals = new ArrayList<>(mainBlock.end_positions.size() + 1);
final int pos1 = mainBlock.end_positions.stream().mapToInt(P -> P.intValue()).min().getAsInt();
if (mainBlock.minPos < pos1) {
final Locatable loc = new SimpleInterval(ssr.getContig(), mainBlock.minPos, pos1 - 1);
if (inCapture.test(loc)) {
intervals.add(loc);
}
}
Integer prev = null;
for (Integer p : mainBlock.end_positions) {
if (prev != null) {
final Locatable loc = new SimpleInterval(ssr.getContig(), prev + 1, p);
if (inCapture.test(loc)) {
intervals.add(loc);
}
}
prev = p;
}
final int pos2 = mainBlock.end_positions.stream().mapToInt(P -> P.intValue()).max().getAsInt();
if (mainBlock.maxPos > pos2) {
final Locatable loc = new SimpleInterval(ssr.getContig(), pos2 + 1, mainBlock.maxPos);
if (inCapture.test(loc)) {
intervals.add(loc);
}
}
Collections.sort(intervals, (A, B) -> Integer.compare(A.getStart(), B.getStart()));
while (!intervals.isEmpty()) {
Locatable loc = intervals.remove(0);
while (this.min_block_size > 0 && !intervals.isEmpty()) {
final Locatable loc2 = intervals.get(0);
if (!loc2.withinDistanceOf(loc, merge_blocks_distance))
break;
if (CoordMath.getLength(loc.getStart(), loc2.getEnd()) > this.min_block_size)
break;
// consume loc2
intervals.remove(0);
loc = new SimpleInterval(loc.getContig(), loc.getStart(), loc2.getEnd());
}
w.add(loc);
}
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
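Once more the BED ends up in an IntervalTreeMap, but here it is hidden behind a Predicate&lt;Locatable&gt;, so the block-writing code never branches on whether a capture BED was supplied. A condensed sketch of that pattern (bedPath hypothetical), followed by the call-site idiom:
final Predicate<Locatable> inCapture;
if (bedPath != null) {
final IntervalTreeMap<Boolean> capture;
try (BedLineReader blr = new BedLineReader(bedPath)) {
capture = blr.toIntervalTreeMap(BED -> Boolean.TRUE);
}
inCapture = L -> capture.containsOverlapping(L);
} else {
inCapture = L -> true; // no BED given: every block passes
}
// every candidate interval is then gated with inCapture.test(loc) before being written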