use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VcfFisherCombinatorics method doWork.
@Override
public int doWork(final List<String> args) {
try {
final Pedigree pedigree = new PedigreeParser().parse(this.pedigreeFile);
final IntervalTreeMap<List<GeneInfo>> geneInfoMap = new IntervalTreeMap<>();
final Predicate<Genotype> isGtCnv = G -> G.isHet() || G.isHomVar();
try (VCFIterator vcfIter = super.openVCFIterator(super.oneFileOrNull(args))) {
final VCFHeader header = vcfIter.getHeader();
final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(SequenceDictionaryUtils.extractRequired(header));
if (!header.hasGenotypingData()) {
LOG.error("No Genotype data in " + header);
return -1;
}
final Map<String, Integer> sample2index = header.getSampleNameToOffset();
final List<Integer> casesIdx = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isAffected()).map(S -> S.getId()).map(S -> sample2index.get(S)).sorted().collect(Collectors.toList());
LOG.info("cases N=" + casesIdx.size());
if (casesIdx.isEmpty()) {
LOG.error("No affected/cases sample in the input VCF");
return -1;
}
final List<Integer> controlsIdx = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isUnaffected()).map(S -> S.getId()).map(S -> sample2index.get(S)).sorted().collect(Collectors.toList());
LOG.info("controls N=" + controlsIdx.size());
if (controlsIdx.isEmpty()) {
LOG.error("No unaffected/controls sample in the input VCF");
return -1;
}
final Predicate<VariantContext> acceptCtx = V -> {
if (discard_bnd && V.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
return false;
if (discard_filtered && V.isFiltered())
return false;
if (max_af < 1.0) {
final Iterator<Integer> it = Stream.concat(controlsIdx.stream(), casesIdx.stream()).iterator();
int ac = 0;
int an = 0;
while (it.hasNext()) {
switch(V.getGenotype(it.next()).getType()) {
case HOM_VAR:
ac += 2;
an += 2;
break;
case HOM_REF:
ac += 0;
an += 2;
break;
case HET:
ac += 1;
an += 2;
break;
default:
break;
}
}
if (an == 0)
return false;
if (ac / (double) an > max_af)
return false;
}
return true;
};
try (BedLineReader br = new BedLineReader(this.bedPath)) {
while (br.hasNext()) {
final BedLine bed = br.next();
final String ctg = ctgConverter.apply(bed.getContig());
if (StringUtil.isBlank(ctg))
continue;
final GeneInfo geneInfo = new GeneInfo();
geneInfo.casesflags = new BitSet(casesIdx.size());
geneInfo.controlsflags = new BitSet(controlsIdx.size());
geneInfo.contig = ctg;
geneInfo.name = bed.getOrDefault(3, "undefined");
geneInfo.parts.add(new SimpleInterval(bed).renameContig(ctg));
final Interval key = new Interval(geneInfo);
List<GeneInfo> L = geneInfoMap.get(key);
if (L == null) {
L = new ArrayList<>();
geneInfoMap.put(key, L);
}
L.add(geneInfo);
}
}
if (geneInfoMap.isEmpty()) {
LOG.error("no gene found in " + this.bedPath);
return -1;
}
LOG.info("reading variants...");
while (vcfIter.hasNext()) {
final VariantContext ctx = vcfIter.next();
if (!acceptCtx.test(ctx))
continue;
for (List<GeneInfo> gil : geneInfoMap.getOverlapping(ctx)) {
for (GeneInfo gi : gil) {
for (int y = 0; y < casesIdx.size(); ++y) {
final Genotype gt = ctx.getGenotype(casesIdx.get(y));
if (!isGtCnv.test(gt))
continue;
gi.casesflags.set(y);
}
for (int y = 0; y < controlsIdx.size(); ++y) {
final Genotype gt = ctx.getGenotype(controlsIdx.get(y));
if (!isGtCnv.test(gt))
continue;
gi.controlsflags.set(y);
}
}
}
}
/* remove Genes without variant , count sample per gene*/
for (List<GeneInfo> gil : geneInfoMap.values()) {
gil.removeIf(GI -> GI.casesflags.nextSetBit(0) == -1 && GI.controlsflags.nextSetBit(0) == -1);
}
// end remove
/* load bed file */
final List<GeneInfo> geneList = geneInfoMap.values().stream().filter(L -> !L.isEmpty()).flatMap(L -> L.stream()).sorted().collect(Collectors.toList());
if (geneList.isEmpty()) {
LOG.error("no gene found in " + this.bedPath);
return -1;
}
LOG.info("N Genes=" + geneList.size());
Solution best = null;
for (int i = 2; i < Math.min(this.max_num_genes, geneList.size()); i++) {
LOG.info("sarting loop from " + i);
best = recursion(geneList, casesIdx, controlsIdx, new ArrayList<>(), i, best);
}
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
pw.println(best);
pw.flush();
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class CompareBamAndBuild method doWork.
@Override
public int doWork(final List<String> args) {
PrintWriter out = null;
SortingCollection<Match> database = null;
if (this.chainFile == null) {
LOG.error("Chain file is not defined Option");
return -1;
}
if (args.size() != 2) {
LOG.error("Illegal number of arguments. Expected two indexed BAMS.");
return -1;
}
try {
LOG.info("load chain file");
this.liftOver = new LiftOver(this.chainFile);
database = SortingCollection.newInstance(Match.class, new MatchCodec(), new MatchOrdererInSortingCollection(), this.maxRecordsInRam, this.tmpDir.toPath());
database.setDestructiveIteration(true);
for (int currentSamFileIndex = 0; currentSamFileIndex < 2; currentSamFileIndex++) {
final File samFile = new File(args.get(currentSamFileIndex));
LOG.info("read " + samFile);
this.bamFiles[currentSamFileIndex] = samFile;
SamReader samFileReader = CompareBamAndBuild.this.openSamReader(samFile.getPath());
final SAMSequenceDictionary dict = samFileReader.getFileHeader().getSequenceDictionary();
this.sequenceDictionaries[currentSamFileIndex] = dict;
if (dict.isEmpty()) {
samFileReader.close();
LOG.error("Empty Dict in " + samFile);
return -1;
}
final SimpleInterval interval;
if (!StringUtils.isBlank(this.REGION)) {
final Function<String, Optional<SimpleInterval>> intervalParser = IntervalParserFactory.newInstance().dictionary(dict).make();
interval = intervalParser.apply(this.REGION).orElseThrow(IntervalParserFactory.exception(this.REGION));
} else {
interval = null;
}
final SAMRecordIterator iter;
if (interval == null) {
iter = samFileReader.iterator();
} else {
iter = samFileReader.queryOverlapping(interval.getContig(), interval.getStart(), interval.getEnd());
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
if (rec.isSecondaryOrSupplementary())
continue;
final Match m = new Match();
m.flag = rec.getFlags();
m.readName = rec.getReadName();
m.firstBamFile = currentSamFileIndex == 0;
if (rec.getReadUnmappedFlag()) {
m.tid = -1;
m.pos = -1;
} else {
m.tid = rec.getReferenceIndex();
m.pos = rec.getAlignmentStart();
}
database.add(m);
}
iter.close();
samFileReader.close();
samFileReader = null;
LOG.info("Close " + samFile);
}
database.doneAdding();
LOG.info("Writing results....");
out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
// compute the differences for each read
out.print("#READ-Name\tCOMPARE");
for (final File f : this.bamFiles) {
out.print("\t" + f);
}
out.println();
/* create an array of set<Match> */
final MatchComparator match_comparator = new MatchComparator();
final List<Set<Match>> matches = new ArrayList<Set<CompareBamAndBuild.Match>>(2);
while (matches.size() < 2) {
matches.add(new TreeSet<CompareBamAndBuild.Match>(match_comparator));
}
CloseableIterator<Match> iter = database.iterator();
String currReadName = null;
int curr_num_in_pair = -1;
for (; ; ) {
final Match nextMatch;
if (iter.hasNext()) {
nextMatch = iter.next();
} else {
nextMatch = null;
}
if (nextMatch == null || (currReadName != null && !currReadName.equals(nextMatch.readName)) || (curr_num_in_pair != -1 && curr_num_in_pair != nextMatch.indexInPair())) {
if (currReadName != null) {
out.print(currReadName);
if (curr_num_in_pair > 0) {
out.print("/");
out.print(curr_num_in_pair);
}
out.print("\t");
if (same(matches.get(0), matches.get(1))) {
out.print("EQ");
} else {
out.print("NE");
}
for (int x = 0; x < 2; ++x) {
out.print("\t");
print(out, matches.get(x));
}
out.println();
}
if (nextMatch == null)
break;
for (final Set<Match> set : matches) set.clear();
}
currReadName = nextMatch.readName;
curr_num_in_pair = nextMatch.indexInPair();
matches.get(nextMatch.firstBamFile ? 0 : 1).add(nextMatch);
}
iter.close();
out.flush();
out.close();
database.cleanup();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
this.liftOver = null;
}
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class FindGVCFsBlocks method doWork.
@Override
public int doWork(final List<String> args) {
try {
if (this.bedPath != null) {
IOUtil.assertFileIsReadable(this.bedPath);
}
final List<Path> inputs = IOUtils.unrollPaths(args);
if (inputs.isEmpty()) {
LOG.error("input missing");
return -1;
}
if (merge_blocks_distance < 1) {
LOG.error("bad merge size : " + merge_blocks_distance);
return -1;
}
final Predicate<Locatable> inCapture;
if (this.bedPath != null) {
final IntervalTreeMap<Boolean> intervalTreeMap;
try (BedLineReader blr = new BedLineReader(this.bedPath)) {
intervalTreeMap = blr.toIntervalTreeMap(BED -> Boolean.TRUE);
}
inCapture = (L) -> intervalTreeMap.containsOverlapping(L);
} else {
inCapture = (L) -> true;
}
if (this.outputFile != null) {
final String fname = this.outputFile.getFileName().toString();
if (!fname.endsWith(FileExtensions.INTERVAL_LIST) && !fname.endsWith(FileExtensions.COMPRESSED_INTERVAL_LIST)) {
LOG.error("Output should end with " + FileExtensions.INTERVAL_LIST + " or " + FileExtensions.COMPRESSED_INTERVAL_LIST);
return -1;
}
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(inputs.get(0));
if (!StringUtils.isBlank(the_contig) && dict.getSequence(the_contig) == null)
throw new JvarkitException.ContigNotFoundInDictionary(the_contig, dict);
try (IntervalListWriter w = new IntervalListWriter(this.outputFile, dict)) {
/**
* loop over chromosomes
*/
for (final SAMSequenceRecord ssr : dict.getSequences()) {
ContigBlocks mainBlock = null;
if (!StringUtils.isBlank(the_contig) && !ssr.getContig().equals(the_contig))
continue;
final long initMilliSec = System.currentTimeMillis();
for (int i = 0; i < inputs.size(); i++) {
final long startMilliSec = System.currentTimeMillis();
LOG.info(inputs.get(i) + " " + (i + 1) + "/" + inputs.size());
mainBlock = scanVcfFile(mainBlock, dict, ssr, inputs.get(i));
final long count_variants = mainBlock.end_positions.size();
final long millisecPerVcf = (System.currentTimeMillis() - initMilliSec) / (i + 1L);
LOG.info("N=" + count_variants + ". That took: " + StringUtils.niceDuration(System.currentTimeMillis() - startMilliSec) + " Remains: " + StringUtils.niceDuration((inputs.size() - (i + 1)) * millisecPerVcf));
}
/* nothing was seen */
if (mainBlock.minPos == null)
continue;
if (mainBlock.end_positions.isEmpty()) {
final Locatable loc = new SimpleInterval(ssr.getContig(), mainBlock.minPos, mainBlock.maxPos);
if (inCapture.test(loc)) {
w.add(loc);
}
continue;
}
final List<Locatable> intervals = new ArrayList<>(mainBlock.end_positions.size() + 1);
final int pos1 = mainBlock.end_positions.stream().mapToInt(P -> P.intValue()).min().getAsInt();
if (mainBlock.minPos < pos1) {
final Locatable loc = new SimpleInterval(ssr.getContig(), mainBlock.minPos, pos1 - 1);
if (inCapture.test(loc)) {
intervals.add(loc);
}
}
Integer prev = null;
for (Integer p : mainBlock.end_positions) {
if (prev != null) {
final Locatable loc = new SimpleInterval(ssr.getContig(), prev + 1, p);
if (inCapture.test(loc)) {
intervals.add(loc);
}
}
prev = p;
}
final int pos2 = mainBlock.end_positions.stream().mapToInt(P -> P.intValue()).max().getAsInt();
if (mainBlock.maxPos > pos2) {
final Locatable loc = new SimpleInterval(ssr.getContig(), pos2 + 1, mainBlock.maxPos);
if (inCapture.test(loc)) {
intervals.add(loc);
}
}
Collections.sort(intervals, (A, B) -> Integer.compare(A.getStart(), B.getStart()));
while (!intervals.isEmpty()) {
Locatable loc = intervals.remove(0);
while (this.min_block_size > 0 && !intervals.isEmpty()) {
final Locatable loc2 = intervals.get(0);
if (!loc2.withinDistanceOf(loc, merge_blocks_distance))
break;
if (CoordMath.getLength(loc.getStart(), loc2.getEnd()) > this.min_block_size)
break;
// consumme loc2
intervals.remove(0);
loc = new SimpleInterval(loc.getContig(), loc.getStart(), loc2.getEnd());
}
w.add(loc);
}
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VcfBurdenSlidingWindow method runBurden.
@Override
protected void runBurden(final PrintWriter pw, final VCFReader vcfReader, final VariantContextWriter vcw) throws IOException {
if (this.window_size <= 0)
throw new IllegalArgumentException("bad window size: " + this.window_size);
if (this.window_shift <= 0)
throw new IllegalArgumentException("bad window shift:" + this.window_shift);
final SAMSequenceDictionary vcfDict = SequenceDictionaryUtils.extractRequired(vcfReader.getHeader());
pw.print("#chrom");
pw.print("\t");
pw.print("start0");
pw.print("\t");
pw.print("end");
pw.print("\t");
pw.print("name");
pw.print("\t");
pw.print("length");
pw.print("\t");
pw.print("p-value");
pw.print("\t");
pw.print("affected_alt");
pw.print("\t");
pw.print("affected_hom");
pw.print("\t");
pw.print("unaffected_alt");
pw.print("\t");
pw.print("unaffected_hom");
pw.print("\t");
pw.print("variants.count");
pw.println();
final ProgressFactory.Watcher<Locatable> progress = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
CloseableIterator<VariantContext> iter;
if (!StringUtils.isBlank(this.limitContig)) {
final SAMSequenceRecord ssr = vcfDict.getSequence(this.limitContig);
if (ssr == null)
throw new JvarkitException.ContigNotFoundInDictionary(limitContig, vcfDict);
iter = vcfReader.query(ssr);
} else {
iter = vcfReader.iterator();
}
final SlidingWindowIterator<VariantContext> iter2 = new SlidingWindowIterator<>(iter.stream().filter(CTX -> accept(CTX)).iterator(), this.window_size, this.window_shift);
while (iter2.hasNext()) {
final Map.Entry<? extends Locatable, List<VariantContext>> entry = iter2.next();
final Locatable W = progress.apply(entry.getKey());
final FisherResult fisher = runFisher(entry.getValue());
if (fisher.p_value > this.fisherTreshold)
continue;
pw.print(W.getContig());
pw.print("\t");
pw.print(W.getStart() - 1);
pw.print("\t");
pw.print(W.getEnd());
pw.print("\t");
pw.print(new SimpleInterval(W).toString());
pw.print("\t");
pw.print(W.getLengthOnReference());
pw.print("\t");
pw.print(fisher.p_value);
pw.print("\t");
pw.print(fisher.affected_alt);
pw.print("\t");
pw.print(fisher.affected_hom);
pw.print("\t");
pw.print(fisher.unaffected_alt);
pw.print("\t");
pw.print(fisher.unaffected_hom);
pw.print("\t");
pw.print(entry.getValue().size());
pw.println();
if (vcw != null) {
for (final VariantContext ctx : entry.getValue()) {
vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(W.toString())).make());
}
}
}
iter.close();
progress.close();
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VcfToBed method scan.
private int scan(String uriOrNull, final VCFIterator iter, final PrintWriter pw) {
try {
final SAMSequenceDictionary dictIn = iter.getHeader().getSequenceDictionary();
final IntervalExtender extender = IntervalExtender.of(dictIn, this.extendsStr);
if (extender.isShriking()) {
throw new IllegalArgumentException("shrinking interval are not supported " + extender);
}
final ContigNameConverter ctgNameConverter;
final Function<String, SAMSequenceRecord> funGetSSR;
if (this.samSequenceDictionary != null) {
ctgNameConverter = ContigNameConverter.fromOneDictionary(this.samSequenceDictionary);
funGetSSR = C -> samSequenceDictionary.getSequence(C);
} else if (dictIn != null) {
ctgNameConverter = ContigNameConverter.fromOneDictionary(dictIn);
funGetSSR = C -> dictIn.getSequence(C);
} else {
ctgNameConverter = null;
funGetSSR = null;
}
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
int start = ctx.getStart();
int end = ctx.getEnd();
if (!this.ignoreCi && (ctx.hasAttribute("CIPOS") || ctx.hasAttribute("CIEND"))) {
if (ctx.hasAttribute("CIPOS")) {
int x = 0;
try {
x = ctx.getAttributeAsIntList("CIPOS", 0).get(0);
} catch (final Throwable err) {
LOG.error(err);
x = 0;
}
start += x;
}
if (ctx.hasAttribute("CIEND")) {
int x = 0;
try {
x = ctx.getAttributeAsIntList("CIEND", 0).get(1);
} catch (final Throwable err) {
LOG.error(err);
x = 0;
}
end += x;
}
// it happens for SV=BND
if (start > end) {
final int tmp = start;
start = end;
end = tmp;
}
}
if (extender.isChanging()) {
final Locatable extended = extender.apply(new SimpleInterval(ctx.getContig(), start, end));
start = extended.getStart();
end = extended.getEnd();
}
final String newtContig;
if (ctgNameConverter != null) {
newtContig = ctgNameConverter.apply(ctx.getContig());
if (StringUtil.isBlank(newtContig)) {
this.contigsNotFound.add(ctx.getContig());
continue;
}
} else {
newtContig = ctx.getContig();
}
if (funGetSSR != null) {
final SAMSequenceRecord ssr = funGetSSR.apply(newtContig);
if (ssr != null) {
end = Math.min(ssr.getSequenceLength(), end);
}
}
start = Math.max(1, start);
final SimpleInterval interval = new SimpleInterval(newtContig, start, end);
if (interval.getStart() > interval.getEnd()) {
LOG.info("negative interval " + interval + " for " + ctx + " in " + uriOrNull);
continue;
}
if (this.minLength != -1 && interval.getLengthOnReference() < this.minLength)
continue;
if (this.maxLength != -1 && interval.getLengthOnReference() > this.maxLength)
continue;
switch(this.outputFormat) {
case bed:
{
pw.print(interval.getContig());
pw.print("\t");
pw.print(interval.getStart() - 1);
pw.print("\t");
pw.print(interval.getEnd());
pw.print("\t");
pw.print(ctx.hasID() ? ctx.getID() : ctx.getAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining(",")));
pw.print("\t");
pw.print(ctx.hasLog10PError() ? Math.min(1000, (int) ctx.getPhredScaledQual()) : ".");
/*
pw.print("\t");
pw.print("+");
pw.print("\t");
pw.print(interval.getStart()-1);
pw.print("\t");
pw.print(interval.getEnd());
pw.print("\t");
pw.print("0,0,0");
*/
pw.println();
break;
}
case interval:
{
pw.print(interval.getContig());
pw.print(":");
pw.print(interval.getStart());
pw.print("-");
pw.print(interval.getEnd());
pw.println();
break;
}
default:
{
throw new IllegalStateException("" + this.outputFormat);
}
}
}
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
}
}
Aggregations