Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
The class VcfGeneEpistasis, method doWork:
@Override
public int doWork(final List<String> args) {
if (this.geneBed == null) {
LOG.error("gene file bed undefined");
return -1;
}
if (this.outputFile == null) {
LOG.error("output file undefined");
return -1;
}
CloseableIterator<VariantContext> iter = null;
try {
final File vcfFile = new File(oneAndOnlyOneFile(args));
this.vcfFileReader = VCFReaderFactory.makeDefault().open(vcfFile.toPath(), true);
final VCFHeader header = this.vcfFileReader.getHeader();
final Pedigree pedigree;
if (this.pedigreeFile != null) {
pedigree = new Pedigree.Parser().parse(this.pedigreeFile);
} else {
pedigree = new Pedigree.Parser().parse(header);
}
if (pedigree == null || pedigree.isEmpty() || !pedigree.hasAffected() || !pedigree.hasUnaffected()) {
LOG.error("empty ped or no case/ctrl");
return -1;
}
pedigree.verifyPersonsHaveUniqueNames();
for (final Pedigree.Person p : pedigree.getPersons().stream().filter(P -> P.isAffected() || P.isUnaffected()).filter(P -> header.getSampleNamesInOrder().contains(P.getId())).collect(Collectors.toSet())) {
this.id2samples.put(p.getId(), p);
}
this.vcfTools = new VcfTools(header);
List<Interval> geneList;
if (!this.geneBed.exists()) {
final Map<String, Interval> gene2interval = new HashMap<>(50000);
LOG.info("building gene file" + this.geneBed);
iter = this.vcfFileReader.iterator();
// iter = this.vcfFileReader.query("chr3",1,300_000_000);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
while (iter.hasNext()) {
final VariantContext ctx = progress.watch(iter.next());
if (!accept(ctx))
continue;
for (final String geneName : getGenes(ctx)) {
final Interval old = gene2interval.get(geneName);
if (old == null) {
gene2interval.put(geneName, new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, geneName));
LOG.info("adding " + geneName + ". number of genes: " + gene2interval.size());
} else if (!old.getContig().equals(ctx.getContig())) {
LOG.error("boum :" + geneName + ": on chrom " + ctx.getContig() + " vs " + old);
return -1;
} else {
gene2interval.put(geneName, new Interval(ctx.getContig(), Math.min(ctx.getStart(), old.getStart()), Math.max(ctx.getEnd(), old.getEnd()), false, geneName));
}
}
}
iter.close();
iter = null;
progress.finish();
geneList = new ArrayList<>(gene2interval.values());
PrintWriter pw = new PrintWriter(this.geneBed);
for (final Interval g : geneList) {
pw.println(g.getContig() + "\t" + (g.getStart() - 1) + "\t" + (g.getEnd()) + "\t" + g.getName());
}
pw.flush();
pw.close();
pw = null;
} else {
try (BedLineReader r = new BedLineReader(geneBed)) {
geneList = r.stream().filter(B -> B != null).map(B -> new Interval(B.getContig(), B.getStart(), B.getEnd(), true, B.get(3))).collect(Collectors.toList());
}
}
if (geneList.isEmpty()) {
LOG.error("gene List is empty");
return -1;
}
final Comparator<VariantContext> ctxSorter = VCFUtils.createTidPosRefComparator(header.getSequenceDictionary());
final Function<Interval, List<VariantContext>> loadVariants = (R) -> {
List<VariantContext> L = new ArrayList<>();
CloseableIterator<VariantContext> r = this.vcfFileReader.query(R.getContig(), R.getStart(), R.getEnd());
while (r.hasNext()) {
final VariantContext ctx = r.next();
if (!accept(ctx))
continue;
if (!getGenes(ctx).contains(R.getName()))
continue;
L.add(ctx);
}
r.close();
return L;
};
final SkatExecutor executor = this.skatFactory.build();
Double bestSkat = null;
LOG.info("number of genes : " + geneList.size());
final int list_end_index = (this.user_end_index < 0 ? geneList.size() : Math.min(geneList.size(), this.user_end_index));
for (int x = this.user_begin_index; x < list_end_index; ++x) {
final Interval intervalx = geneList.get(x);
final List<VariantContext> variantsx = loadVariants.apply(intervalx);
if (variantsx.isEmpty())
continue;
for (int y = x; y < geneList.size(); /* not list_end_index */ ++y) {
final Interval intervaly;
final List<VariantContext> merge;
if (y == x) {
// we're testing a single gene only
intervaly = intervalx;
merge = variantsx;
} else {
intervaly = geneList.get(y);
if (intervaly.intersects(intervalx))
continue;
final List<VariantContext> variantsy = loadVariants.apply(intervaly);
if (variantsy.isEmpty())
continue;
merge = new MergedList<>(variantsx, variantsy);
}
LOG.info("testing : [" + x + "]" + intervalx + " [" + y + "]" + intervaly + " N:" + geneList.size() + " best: " + bestSkat);
final Double skat = eval(executor, merge);
if (skat == null)
continue;
if (bestSkat == null || skat.compareTo(bestSkat) < 0) {
bestSkat = skat;
LOG.info("best " + bestSkat + " " + intervalx + " " + intervaly);
if (this.outputFile.getName().endsWith(".vcf") || this.outputFile.getName().endsWith(".vcf.gz")) {
final VCFHeader header2 = new VCFHeader(header);
header2.addMetaDataLine(new VCFHeaderLine(VcfGeneEpistasis.class.getName(), intervalx.getName() + " " + intervaly.getName() + " " + bestSkat));
final VariantContextWriter w = VCFUtils.createVariantContextWriter(outputFile);
w.writeHeader(header2);
merge.stream().sorted(ctxSorter).forEach(V -> w.add(V));
w.close();
} else {
final PrintWriter w = super.openFileOrStdoutAsPrintWriter(outputFile);
w.println(String.valueOf(bestSkat) + "\t" + intervalx.getName() + "\t" + intervaly.getName());
w.flush();
w.close();
}
}
}
}
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(this.vcfFileReader);
}
}
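The else-branch above shows the core BedLineReader pattern of this tool: open the gene BED, stream its records, and turn column 3 into the interval name. A minimal stand-alone sketch of that pattern, assuming only the BedLineReader and BedLine calls visible above (the class and method names are hypothetical):

    import java.io.File;
    import java.util.List;
    import java.util.stream.Collectors;
    import htsjdk.samtools.util.Interval;
    import com.github.lindenb.jvarkit.bed.BedLineReader;

    public class GeneBedSketch {
        /** Load a 4-column gene BED into named Intervals, as in the else-branch of doWork. */
        static List<Interval> loadGeneBed(final File geneBed) {
            try (BedLineReader r = new BedLineReader(geneBed)) {
                return r.stream()
                        .filter(B -> B != null)
                        // column index 3 (0-based) carries the gene name
                        .map(B -> new Interval(B.getContig(), B.getStart(), B.getEnd(), true, B.get(3)))
                        .collect(Collectors.toList());
            }
        }
    }

Something like loadGeneBed(this.geneBed) would rebuild the same geneList that the pairwise SKAT loop consumes.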
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
The class VcfFisherCombinatorics, method doWork:
@Override
public int doWork(final List<String> args) {
try {
final Pedigree pedigree = new PedigreeParser().parse(this.pedigreeFile);
final IntervalTreeMap<List<GeneInfo>> geneInfoMap = new IntervalTreeMap<>();
final Predicate<Genotype> isGtCnv = G -> G.isHet() || G.isHomVar();
try (VCFIterator vcfIter = super.openVCFIterator(super.oneFileOrNull(args))) {
final VCFHeader header = vcfIter.getHeader();
final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(SequenceDictionaryUtils.extractRequired(header));
if (!header.hasGenotypingData()) {
LOG.error("No Genotype data in " + header);
return -1;
}
final Map<String, Integer> sample2index = header.getSampleNameToOffset();
final List<Integer> casesIdx = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isAffected()).map(S -> S.getId()).map(S -> sample2index.get(S)).sorted().collect(Collectors.toList());
LOG.info("cases N=" + casesIdx.size());
if (casesIdx.isEmpty()) {
LOG.error("No affected/cases sample in the input VCF");
return -1;
}
final List<Integer> controlsIdx = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isUnaffected()).map(S -> S.getId()).map(S -> sample2index.get(S)).sorted().collect(Collectors.toList());
LOG.info("controls N=" + controlsIdx.size());
if (controlsIdx.isEmpty()) {
LOG.error("No unaffected/controls sample in the input VCF");
return -1;
}
final Predicate<VariantContext> acceptCtx = V -> {
if (discard_bnd && V.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
return false;
if (discard_filtered && V.isFiltered())
return false;
if (max_af < 1.0) {
final Iterator<Integer> it = Stream.concat(controlsIdx.stream(), casesIdx.stream()).iterator();
int ac = 0;
int an = 0;
while (it.hasNext()) {
switch(V.getGenotype(it.next()).getType()) {
case HOM_VAR:
ac += 2;
an += 2;
break;
case HOM_REF:
ac += 0;
an += 2;
break;
case HET:
ac += 1;
an += 2;
break;
default:
break;
}
}
if (an == 0)
return false;
if (ac / (double) an > max_af)
return false;
}
return true;
};
try (BedLineReader br = new BedLineReader(this.bedPath)) {
while (br.hasNext()) {
final BedLine bed = br.next();
final String ctg = ctgConverter.apply(bed.getContig());
if (StringUtil.isBlank(ctg))
continue;
final GeneInfo geneInfo = new GeneInfo();
geneInfo.casesflags = new BitSet(casesIdx.size());
geneInfo.controlsflags = new BitSet(controlsIdx.size());
geneInfo.contig = ctg;
geneInfo.name = bed.getOrDefault(3, "undefined");
geneInfo.parts.add(new SimpleInterval(bed).renameContig(ctg));
final Interval key = new Interval(geneInfo);
List<GeneInfo> L = geneInfoMap.get(key);
if (L == null) {
L = new ArrayList<>();
geneInfoMap.put(key, L);
}
L.add(geneInfo);
}
}
if (geneInfoMap.isEmpty()) {
LOG.error("no gene found in " + this.bedPath);
return -1;
}
LOG.info("reading variants...");
while (vcfIter.hasNext()) {
final VariantContext ctx = vcfIter.next();
if (!acceptCtx.test(ctx))
continue;
for (List<GeneInfo> gil : geneInfoMap.getOverlapping(ctx)) {
for (GeneInfo gi : gil) {
for (int y = 0; y < casesIdx.size(); ++y) {
final Genotype gt = ctx.getGenotype(casesIdx.get(y));
if (!isGtCnv.test(gt))
continue;
gi.casesflags.set(y);
}
for (int y = 0; y < controlsIdx.size(); ++y) {
final Genotype gt = ctx.getGenotype(controlsIdx.get(y));
if (!isGtCnv.test(gt))
continue;
gi.controlsflags.set(y);
}
}
}
}
/* remove genes without any variant, count samples per gene */
for (List<GeneInfo> gil : geneInfoMap.values()) {
gil.removeIf(GI -> GI.casesflags.nextSetBit(0) == -1 && GI.controlsflags.nextSetBit(0) == -1);
}
// end remove
/* flatten the gene map into a sorted list */
final List<GeneInfo> geneList = geneInfoMap.values().stream().filter(L -> !L.isEmpty()).flatMap(L -> L.stream()).sorted().collect(Collectors.toList());
if (geneList.isEmpty()) {
LOG.error("no gene found in " + this.bedPath);
return -1;
}
LOG.info("N Genes=" + geneList.size());
Solution best = null;
for (int i = 2; i < Math.min(this.max_num_genes, geneList.size()); i++) {
LOG.info("sarting loop from " + i);
best = recursion(geneList, casesIdx, controlsIdx, new ArrayList<>(), i, best);
}
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
pw.println(best);
pw.flush();
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
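The BED-reading loop above indexes each record by its interval so that variants can later be matched with getOverlapping(). A minimal sketch of that indexing step, keeping only the gene name per interval and leaving out the ContigNameConverter and GeneInfo bookkeeping (class and method names are hypothetical):

    import java.nio.file.Path;
    import java.util.ArrayList;
    import java.util.List;
    import htsjdk.samtools.util.Interval;
    import htsjdk.samtools.util.IntervalTreeMap;
    import com.github.lindenb.jvarkit.bed.BedLineReader;

    public class BedIndexSketch {
        /** Index the gene names of a BED file by interval, for later getOverlapping() lookups. */
        static IntervalTreeMap<List<String>> indexBed(final Path bedPath) {
            final IntervalTreeMap<List<String>> map = new IntervalTreeMap<>();
            try (BedLineReader br = new BedLineReader(bedPath)) {
                br.stream().forEach(bed -> {
                    final Interval key = new Interval(bed.getContig(), bed.getStart(), bed.getEnd());
                    List<String> names = map.get(key);
                    if (names == null) {
                        names = new ArrayList<>();
                        map.put(key, names);
                    }
                    // column 3 holds the gene name, with the same fallback as above
                    names.add(bed.getOrDefault(3, "undefined"));
                });
            }
            return map;
        }
    }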
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
The class FindGVCFsBlocks, method doWork:
@SuppressWarnings("resource")
@Override
public int doWork(final List<String> args) {
Path tmpBedFile0 = null;
Path tmpBedFile1 = null;
try {
if (this.bedPath != null) {
IOUtil.assertFileIsReadable(this.bedPath);
}
final List<Path> inputs = IOUtils.unrollPaths(args);
if (inputs.isEmpty()) {
LOG.error("input missing");
return -1;
}
if (this.outputFile != null) {
final String fname = this.outputFile.getFileName().toString();
if (!fname.endsWith(FileExtensions.INTERVAL_LIST) && !fname.endsWith(FileExtensions.COMPRESSED_INTERVAL_LIST)) {
LOG.error("Output should end with " + FileExtensions.INTERVAL_LIST + " or " + FileExtensions.COMPRESSED_INTERVAL_LIST);
return -1;
}
}
if (this.tmpDir == null && this.outputFile != null) {
this.tmpDir = this.outputFile.getParent();
}
if (this.tmpDir == null) {
this.tmpDir = IOUtils.getDefaultTempDir();
}
IOUtil.assertDirectoryIsWritable(this.tmpDir);
tmpBedFile0 = Files.createTempFile(this.tmpDir, "tmp.", ".bed");
tmpBedFile1 = Files.createTempFile(this.tmpDir, "tmp.", ".bed");
SAMSequenceDictionary dict = null;
final long initMilliSec = System.currentTimeMillis();
for (int i = 0; i < inputs.size(); i++) {
final long startMilliSec = System.currentTimeMillis();
LOG.info(inputs.get(i) + " " + (i + 1) + "/" + inputs.size());
try (GVCFOrIntervalIterator r0 = openInput(inputs.get(i))) {
if (dict != null) {
SequenceUtil.assertSequenceDictionariesEqual(dict, r0.getSAMSequenceDictionary());
} else {
dict = r0.getSAMSequenceDictionary();
}
long count_variants = 0L;
try (IntervalListWriter pw = new IntervalListWriter(tmpBedFile0, dict)) {
/* first VCF, just convert to BED */
if (i == 0) {
while (r0.hasNext()) {
final Locatable loc = r0.next();
pw.add(loc);
count_variants++;
}
} else /* merge previous bed with current VCF using INFO/END */
{
Locatable start0 = null;
Locatable start1 = null;
try (IntervalListIterator r1 = new IntervalListIterator(tmpBedFile1)) {
PeekableIterator<Locatable> peek0 = new PeekableIterator<>(r0);
PeekableIterator<Locatable> peek1 = new PeekableIterator<>(r1);
while (peek0.hasNext() && peek1.hasNext()) {
final Locatable loc0 = peek0.peek();
final Locatable loc1 = peek1.peek();
if (!loc0.contigsMatch(loc1)) {
throw new IllegalStateException("unexpected: not the same contigs " + loc0 + " " + loc1);
}
if (start0 == null)
start0 = loc0;
if (start1 == null)
start1 = loc1;
final int end0 = loc0.getEnd();
final int end1 = loc1.getEnd();
if (end0 < end1) {
peek0.next();
continue;
} else if (end0 > end1) {
peek1.next();
continue;
} else {
/* end0==end1 */
pw.add(new SimpleInterval(loc0.getContig(), (Math.min(start0.getStart(), start1.getStart())), loc0.getEnd()));
count_variants++;
// consume
peek0.next();
// consume
peek1.next();
start0 = null;
start1 = null;
}
}
if (lenient_discordant_end) {
while (peek0.hasNext()) {
final Locatable loc = peek0.next();
LOG.warn("extra interval 0 : " + loc);
}
while (peek1.hasNext()) {
final Locatable loc = peek1.next();
LOG.warn("extra interval 1 : " + loc);
}
} else {
if (peek0.hasNext())
throw new IllegalStateException("peek0 has Next ? " + peek0.next() + " use --lenient ?");
if (peek1.hasNext())
throw new IllegalStateException("peek1 has Next ? " + peek1.next() + " use --lenient ?");
}
peek0.close();
peek1.close();
}
}
final long millisecPerVcf = (System.currentTimeMillis() - initMilliSec) / (i + 1L);
LOG.info("N=" + count_variants + ". That took: " + StringUtils.niceDuration(System.currentTimeMillis() - startMilliSec) + " Remains: " + StringUtils.niceDuration((inputs.size() - (i + 1)) * millisecPerVcf));
}
// end writer
Files.deleteIfExists(tmpBedFile1);
Files.move(tmpBedFile0, tmpBedFile1);
}
}
final IntervalTreeMap<Boolean> intervalTreeMap;
if (this.bedPath != null) {
try (BedLineReader blr = new BedLineReader(this.bedPath)) {
intervalTreeMap = blr.toIntervalTreeMap(BED -> Boolean.TRUE);
}
} else {
intervalTreeMap = null;
}
try (IntervalListWriter w = new IntervalListWriter(this.outputFile, dict)) {
try (IntervalListIterator r1 = new IntervalListIterator(tmpBedFile1)) {
final PeekableIterator<Locatable> peek1 = new PeekableIterator<>(r1);
while (peek1.hasNext()) {
Locatable loc = peek1.next();
if (intervalTreeMap != null && !intervalTreeMap.containsOverlapping(loc)) {
continue;
}
while (this.min_block_size > 0 && peek1.hasNext()) {
final Locatable loc2 = peek1.peek();
if (!loc2.contigsMatch(loc))
break;
if (CoordMath.getLength(loc.getStart(), loc2.getEnd()) > this.min_block_size)
break;
loc = new SimpleInterval(loc.getContig(), loc.getStart(), loc2.getEnd());
// consume loc2
peek1.next();
}
w.add(loc);
}
peek1.close();
}
Files.deleteIfExists(tmpBedFile1);
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
if (tmpBedFile0 != null)
try {
Files.deleteIfExists(tmpBedFile0);
} catch (Throwable err) {
}
if (tmpBedFile1 != null)
try {
Files.deleteIfExists(tmpBedFile1);
} catch (Throwable err) {
}
}
}
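Here BedLineReader.toIntervalTreeMap() is only used to answer containsOverlapping() queries against the optional region file (this.bedPath). A minimal sketch of that filtering idea, assuming nothing beyond the calls shown above (class and method names are hypothetical):

    import java.nio.file.Path;
    import java.util.List;
    import java.util.stream.Collectors;
    import htsjdk.samtools.util.IntervalTreeMap;
    import htsjdk.samtools.util.Locatable;
    import com.github.lindenb.jvarkit.bed.BedLineReader;

    public class BedOverlapFilterSketch {
        /** Keep only the blocks overlapping at least one record of the BED file. */
        static List<Locatable> restrictToBed(final List<Locatable> blocks, final Path bedPath) {
            final IntervalTreeMap<Boolean> treeMap;
            try (BedLineReader blr = new BedLineReader(bedPath)) {
                treeMap = blr.toIntervalTreeMap(BED -> Boolean.TRUE);
            }
            return blocks.stream()
                    .filter(loc -> treeMap.containsOverlapping(loc))
                    .collect(Collectors.toList());
        }
    }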
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
The class Biostar77828, method doWork:
@Override
public int doWork(final List<String> args) {
try {
String input = oneFileOrNull(args);
try (BedLineReader in = new BedLineReader(openBufferedReader(input), input)) {
while (in.hasNext()) {
final BedLine bedLine = in.next();
if (bedLine == null)
continue;
if (bedLine.getColumnCount() < 3)
throw new IOException("bad BED input " + bedLine);
final Interval seg = new Interval(bedLine.getContig(), bedLine.getStart(), bedLine.getEnd());
if (seg.length() == 0)
continue;
this.effective_genome_size += seg.length();
this.all_segments.add(seg);
}
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
Solution best = null;
for (long generation = 0; generation < this.N_ITERATIONS; ++generation) {
if (generation % 100000 == 0)
LOG.info("generation:" + generation + "/" + this.N_ITERATIONS + " " + (int) ((generation / (double) this.N_ITERATIONS) * 100.0) + "% " + String.valueOf(best));
Solution sol = createSolution();
sol.generation = generation;
if (best == null || sol.compareTo(best) < 0) {
best = sol;
/*
if(LOG.isDebugEnabled())
{
LOG.info("%%generation:"+generation);
best.print(stderr());
}*/
}
}
if (best != null) {
best.print(pw);
}
pw.flush();
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
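The loop above opens the BED through a Reader plus a source name and sums the interval lengths into effective_genome_size. A stand-alone sketch of the same accumulation; Files.newBufferedReader stands in for the tool's openBufferedReader helper, the stdin fallback is omitted, and class and method names are hypothetical:

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import htsjdk.samtools.util.Interval;
    import com.github.lindenb.jvarkit.bed.BedLineReader;

    public class BedSizeSketch {
        /** Sum the lengths of the BED intervals, skipping records with fewer than 3 columns. */
        static long effectiveGenomeSize(final String input) throws IOException {
            try (BufferedReader br = Files.newBufferedReader(Paths.get(input));
                 BedLineReader in = new BedLineReader(br, input)) {
                return in.stream()
                        .filter(B -> B != null && B.getColumnCount() >= 3)
                        .map(B -> new Interval(B.getContig(), B.getStart(), B.getEnd()))
                        // zero-length segments are ignored, as in doWork above
                        .filter(seg -> seg.length() > 0)
                        .mapToLong(seg -> seg.length())
                        .sum();
            }
        }
    }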
Use of com.github.lindenb.jvarkit.bed.BedLineReader in project jvarkit by lindenb.
The class Biostar178713, method doWork:
@Override
public int doWork(final List<String> args) {
if (this.outputFile == null || !outputFile.getFileName().toString().endsWith(".zip")) {
LOG.error("output file option must be declared and must en with .zip");
return -1;
}
final Set<String> inputs = IOUtils.unrollFiles(args);
final List<BedLine> bedLines = new ArrayList<>();
OutputStream fos = null;
ZipOutputStream zout = null;
try {
if (inputs.isEmpty()) {
try (BedLineReader r = new BedLineReader(stdin(), "stdin")) {
bedLines.addAll(r.stream().collect(Collectors.toList()));
}
} else
for (final String input : inputs) {
try (BedLineReader r = new BedLineReader(IOUtils.openURIForBufferedReading(input), input)) {
bedLines.addAll(r.stream().collect(Collectors.toList()));
}
}
LOG.info("sorting " + bedLines.size());
Collections.sort(bedLines, new Comparator<BedLine>() {
@Override
public int compare(BedLine o1, BedLine o2) {
int i = o1.getContig().compareTo(o2.getContig());
if (i != 0)
return i;
i = o1.getStart() - o2.getStart();
if (i != 0)
return i;
i = o1.getEnd() - o2.getEnd();
return i;
}
});
if (bedLines.isEmpty()) {
LOG.error("no bed line found");
return -1;
}
LOG.info("creating zip " + this.outputFile);
fos = Files.newOutputStream(this.outputFile);
zout = new ZipOutputStream(fos);
int chunk = 0;
while (!bedLines.isEmpty()) {
++chunk;
final ZipEntry entry = new ZipEntry("bed" + String.format("%03d", chunk) + ".bed");
LOG.info("creating " + entry.getName());
zout.putNextEntry(entry);
PrintWriter pw = new PrintWriter(zout);
BedLine prev = bedLines.get(0);
bedLines.remove(0);
pw.println(prev.join());
int n_in_entry = 1;
int i = 0;
while (i < bedLines.size()) {
final BedLine curr = bedLines.get(i);
final double distance = curr.getStart() - prev.getEnd();
if (!prev.getContig().equals(curr.getContig()) || distance > this.distancebed) {
pw.println(curr.join());
prev = curr;
bedLines.remove(i);
++n_in_entry;
} else {
++i;
}
}
pw.flush();
zout.closeEntry();
LOG.info("closing " + entry.getName() + " N=" + n_in_entry);
}
zout.finish();
zout.close();
return 0;
} catch (Throwable e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(fos);
}
}
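The snippet above collects every BED line, then sorts with an anonymous Comparator on contig, start and end before chunking into zip entries. A compact sketch of that read-and-sort step using the stream API instead; the constructor taking an InputStream and a source name is assumed from the stdin() call above, and class and method names are hypothetical:

    import java.io.InputStream;
    import java.util.List;
    import java.util.stream.Collectors;
    import com.github.lindenb.jvarkit.bed.BedLineReader;

    public class BedSortSketch {
        /** Read BED records, sort by contig, start then end, and return them as tab-joined lines. */
        static List<String> readAndSort(final InputStream in, final String sourceName) {
            try (BedLineReader r = new BedLineReader(in, sourceName)) {
                return r.stream()
                        .sorted((o1, o2) -> {
                            int i = o1.getContig().compareTo(o2.getContig());
                            if (i != 0) return i;
                            i = Integer.compare(o1.getStart(), o2.getStart());
                            if (i != 0) return i;
                            return Integer.compare(o1.getEnd(), o2.getEnd());
                        })
                        .map(B -> B.join())
                        .collect(Collectors.toList());
            }
        }
    }

For example, readAndSort(System.in, "stdin") reproduces the sorted list that doWork then splits into distance-separated zip entries.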