use of com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec in project jvarkit by lindenb.
the class BamStats01 method run.
private void run(final String filename, SamReader samFileReader) throws IOException {
Map<String, Histogram> sample2hist = new HashMap<String, BamStats01.Histogram>();
SAMSequenceDictionary currDict = samFileReader.getFileHeader().getSequenceDictionary();
if (samSequenceDictionary == null) {
samSequenceDictionary = currDict;
}
if (this.bedFile != null) {
if (!SequenceUtil.areSequenceDictionariesEqual(currDict, samSequenceDictionary)) {
samFileReader.close();
throw new IOException("incompatible sequence dictionaries." + filename);
}
if (intervalTreeMap == null) {
final BedLineCodec bedCodec = new BedLineCodec();
intervalTreeMap = new IntervalTreeMap<>();
LOG.info("opening " + this.bedFile);
String line;
final BufferedReader bedIn = IOUtils.openFileForBufferedReading(bedFile);
while ((line = bedIn.readLine()) != null) {
final BedLine bedLine = bedCodec.decode(line);
if (bedLine == null)
continue;
int seqIndex = currDict.getSequenceIndex(bedLine.getContig());
if (seqIndex == -1) {
throw new IOException("unknown chromosome from dict in in " + line + " " + this.bedFile);
}
intervalTreeMap.put(bedLine.toInterval(), Boolean.TRUE);
}
bedIn.close();
LOG.info("done reading " + this.bedFile);
}
}
this.chrX_index = -1;
this.chrY_index = -1;
for (final SAMSequenceRecord rec : currDict.getSequences()) {
final String chromName = rec.getSequenceName().toLowerCase();
if (chromName.equals("x") || chromName.equals("chrx")) {
this.chrX_index = rec.getSequenceIndex();
} else if (chromName.equals("y") || chromName.equals("chry")) {
this.chrY_index = rec.getSequenceIndex();
}
}
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(currDict);
final SAMRecordIterator iter = samFileReader.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progess.watch(iter.next());
String sampleName = groupBy.getPartion(rec);
if (sampleName == null || sampleName.isEmpty())
sampleName = "undefined";
Histogram hist = sample2hist.get(sampleName);
if (hist == null) {
hist = new Histogram();
sample2hist.put(sampleName, hist);
}
hist.histograms[Category2.ALL.ordinal()].watch(rec);
if (intervalTreeMap == null)
continue;
if (rec.getReadUnmappedFlag()) {
continue;
}
if (!intervalTreeMap.containsOverlapping(new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd()))) {
hist.histograms[Category2.OFF_TARGET.ordinal()].watch(rec);
} else {
hist.histograms[Category2.IN_TARGET.ordinal()].watch(rec);
}
}
progess.finish();
samFileReader.close();
samFileReader = null;
for (final String sampleName : sample2hist.keySet()) {
final Histogram hist = sample2hist.get(sampleName);
out.print(filename + "\t" + sampleName);
for (final Category2 cat2 : Category2.values()) {
for (// je je suis libertineuuh, je suis une cat1
final Category cat1 : // je je suis libertineuuh, je suis une cat1
Category.values()) {
out.print("\t");
out.print(hist.histograms[cat2.ordinal()].counts[cat1.ordinal()]);
}
if (intervalTreeMap == null)
break;
}
out.println();
}
}
use of com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec in project jvarkit by lindenb.
the class KnimeVariantHelper method parseBedAsBooleanIntervalTreeMap.
public IntervalTreeMap<Boolean> parseBedAsBooleanIntervalTreeMap(final String bedUri) throws IOException {
if (bedUri == null || bedUri.isEmpty())
throw new IllegalArgumentException("bad bed uri");
BufferedReader r = null;
final IntervalTreeMap<Boolean> treeMap = new IntervalTreeMap<>();
try {
r = IOUtils.openURIForBufferedReading(bedUri);
final BedLineCodec codec = new BedLineCodec();
r.lines().forEach(L -> {
final BedLine bedline = codec.decode(L);
if (bedline == null) {
LOG.warn("Ignoring line in BED (doesn't look like a BED line):" + L);
return;
}
treeMap.put(bedline.toInterval(), Boolean.TRUE);
});
return treeMap;
} finally {
CloserUtil.close(r);
}
}
use of com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec in project jvarkit by lindenb.
the class Launcher method readBedFileAsBooleanIntervalTreeMap.
/**
* reads a Bed file and convert it to a IntervalTreeMap<Boolean>
*/
protected IntervalTreeMap<Boolean> readBedFileAsBooleanIntervalTreeMap(final java.io.File file) throws java.io.IOException {
java.io.BufferedReader r = null;
try {
final IntervalTreeMap<Boolean> intervals = new IntervalTreeMap<Boolean>();
r = com.github.lindenb.jvarkit.io.IOUtils.openFileForBufferedReading(file);
final BedLineCodec bedCodec = new BedLineCodec();
r.lines().filter(line -> !(line.startsWith("#") || com.github.lindenb.jvarkit.util.bio.bed.BedLine.isBedHeader(line) || line.isEmpty())).map(line -> bedCodec.decode(line)).filter(B -> B != null).map(B -> B.toInterval()).filter(L -> L.getStart() < L.getEnd()).forEach(L -> {
intervals.put(L, true);
});
return intervals;
} finally {
htsjdk.samtools.util.CloserUtil.close(r);
}
}
use of com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec in project jvarkit by lindenb.
the class VcfGeneEpistasis method doWork.
@Override
public int doWork(final List<String> args) {
if (this.geneBed == null) {
LOG.error("gene file bed undefined");
return -1;
}
if (this.outputFile == null) {
LOG.error("output file undefined");
return -1;
}
CloseableIterator<VariantContext> iter = null;
try {
final File vcfFile = new File(oneAndOnlyOneFile(args));
this.vcfFileReader = new VCFFileReader(vcfFile, true);
final VCFHeader header = this.vcfFileReader.getFileHeader();
final Pedigree pedigree;
if (this.pedigreeFile != null) {
pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
} else {
pedigree = new Pedigree.Parser().parse(header);
}
if (pedigree == null || pedigree.isEmpty() || !pedigree.hasAffected() || !pedigree.hasUnaffected()) {
LOG.error("empty ped or no case/ctrl");
return -1;
}
pedigree.verifyPersonsHaveUniqueNames();
for (final Pedigree.Person p : pedigree.getPersons().stream().filter(P -> P.isAffected() || P.isUnaffected()).filter(P -> header.getSampleNamesInOrder().contains(P.getId())).collect(Collectors.toSet())) {
this.id2samples.put(p.getId(), p);
}
this.vcfTools = new VcfTools(header);
List<Interval> geneList;
if (!this.geneBed.exists()) {
final Map<String, Interval> gene2interval = new HashMap<>(50000);
LOG.info("building gene file" + this.geneBed);
iter = this.vcfFileReader.iterator();
// iter = this.vcfFileReader.query("chr3",1,300_000_000);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
while (iter.hasNext()) {
final VariantContext ctx = progress.watch(iter.next());
if (!accept(ctx))
continue;
for (final String geneName : getGenes(ctx)) {
final Interval old = gene2interval.get(geneName);
if (old == null) {
gene2interval.put(geneName, new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, geneName));
LOG.info("adding " + geneName + ". number of genes: " + gene2interval.size());
} else if (!old.getContig().equals(ctx.getContig())) {
LOG.error("boum :" + geneName + ": on chrom " + ctx.getContig() + " vs " + old);
return -1;
} else {
gene2interval.put(geneName, new Interval(ctx.getContig(), Math.min(ctx.getStart(), old.getStart()), Math.max(ctx.getEnd(), old.getEnd()), false, geneName));
}
}
}
iter.close();
iter = null;
progress.finish();
geneList = new ArrayList<>(gene2interval.values());
PrintWriter pw = new PrintWriter(this.geneBed);
for (final Interval g : geneList) {
pw.println(g.getContig() + "\t" + (g.getStart() - 1) + "\t" + (g.getEnd()) + "\t" + g.getName());
}
pw.flush();
pw.close();
pw = null;
} else {
BedLineCodec codec = new BedLineCodec();
BufferedReader r = IOUtil.openFileForBufferedReading(geneBed);
geneList = r.lines().map(L -> codec.decode(L)).filter(B -> B != null).map(B -> new Interval(B.getContig(), B.getStart(), B.getEnd(), true, B.get(3))).collect(Collectors.toList());
r.close();
}
if (geneList.isEmpty()) {
LOG.error("gene List is empty");
return -1;
}
final Comparator<VariantContext> ctxSorter = VCFUtils.createTidPosRefComparator(header.getSequenceDictionary());
final Function<Interval, List<VariantContext>> loadVariants = (R) -> {
List<VariantContext> L = new ArrayList<>();
CloseableIterator<VariantContext> r = this.vcfFileReader.query(R.getContig(), R.getStart(), R.getEnd());
while (r.hasNext()) {
final VariantContext ctx = r.next();
if (!accept(ctx))
continue;
if (!getGenes(ctx).contains(R.getName()))
continue;
L.add(ctx);
}
r.close();
return L;
};
final SkatExecutor executor = this.skatFactory.build();
Double bestSkat = null;
LOG.info("number of genes : " + geneList.size());
final int list_end_index = (this.user_end_index < 0 ? geneList.size() : Math.min(geneList.size(), this.user_end_index));
for (int x = this.user_begin_index; x < list_end_index; ++x) {
final Interval intervalx = geneList.get(x);
final List<VariantContext> variantsx = loadVariants.apply(intervalx);
if (variantsx.isEmpty())
continue;
for (int y = x; y < geneList.size(); /* pas list_end_index */
++y) {
final Interval intervaly;
final List<VariantContext> merge;
if (y == x) {
// we-re testing gene 1 only
intervaly = intervalx;
merge = variantsx;
} else {
intervaly = geneList.get(y);
if (intervaly.intersects(intervalx))
continue;
final List<VariantContext> variantsy = loadVariants.apply(intervaly);
if (variantsy.isEmpty())
continue;
merge = new MergedList<>(variantsx, variantsy);
}
LOG.info("testing : [" + x + "]" + intervalx + " [" + y + "]" + intervaly + " N:" + geneList.size() + " best: " + bestSkat);
final Double skat = eval(executor, merge);
if (skat == null)
continue;
if (bestSkat == null || skat.compareTo(bestSkat) < 0) {
bestSkat = skat;
LOG.info("best " + bestSkat + " " + intervalx + " " + intervaly);
if (this.outputFile.getName().endsWith(".vcf") || this.outputFile.getName().endsWith(".vcf.gz")) {
final VCFHeader header2 = new VCFHeader(header);
header2.addMetaDataLine(new VCFHeaderLine(VcfGeneEpistasis.class.getName(), intervalx.getName() + " " + intervaly.getName() + " " + bestSkat));
final VariantContextWriter w = VCFUtils.createVariantContextWriter(outputFile);
w.writeHeader(header2);
merge.stream().sorted(ctxSorter).forEach(V -> w.add(V));
w.close();
} else {
final PrintWriter w = super.openFileOrStdoutAsPrintWriter(outputFile);
w.println(String.valueOf(bestSkat) + "\t" + intervalx.getName() + "\t" + intervaly.getName());
w.flush();
w.close();
}
}
}
}
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(this.vcfFileReader);
}
}
use of com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec in project jvarkit by lindenb.
the class Biostar77828 method doWork.
@Override
public int doWork(List<String> args) {
PrintStream pw = null;
try {
LOG.info("load BED");
BufferedReader in = super.openBufferedReader(oneFileOrNull(args));
final BedLineCodec codec = new BedLineCodec();
String line;
while ((line = in.readLine()) != null) {
if (line.isEmpty() || line.startsWith("#"))
continue;
final BedLine bedLine = codec.decode(line);
if (bedLine == null)
continue;
if (bedLine.getColumnCount() < 3)
throw new IOException("bad BED input " + bedLine);
final Segment seg = new Segment(bedLine.getContig(), bedLine.getStart() - 1, bedLine.getEnd());
if (seg.size() == 0)
continue;
this.effective_genome_size += seg.size();
this.all_segments.add(seg);
}
pw = super.openFileOrStdoutAsPrintStream(this.outputFile);
Solution best = null;
for (long generation = 0; generation < this.N_ITERATIONS; ++generation) {
if (generation % 100000 == 0)
LOG.info("generation:" + generation + "/" + this.N_ITERATIONS + " " + (int) ((generation / (double) this.N_ITERATIONS) * 100.0) + "% " + String.valueOf(best));
Solution sol = createSolution();
sol.generation = generation;
if (best == null || sol.compareTo(best) < 0) {
best = sol;
/*
if(LOG.isDebugEnabled())
{
LOG.info("%%generation:"+generation);
best.print(stderr());
}*/
}
}
if (best != null) {
best.print(pw);
}
pw.flush();
pw.close();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(pw);
}
}
Aggregations