Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
The class MakeMiniBam, method doWork.
@Override
public int doWork(final List<String> args) {
ArchiveFactory archive = null;
int id_generator = 0;
final Set<String> outputFileNames = new HashSet<>();
try {
if (StringUtils.isBlank(this.filePrefix) || this.filePrefix.equals("now")) {
final SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyyMMdd");
this.filePrefix = simpleDateFormat.format(new Date()) + ".";
}
if (!this.filePrefix.endsWith(".")) {
this.filePrefix += ".";
}
IOUtil.assertDirectoryIsWritable(tmpDir);
final List<Path> bamFiles = IOUtils.unrollPaths(args);
if (bamFiles.isEmpty()) {
LOG.error("no bam file defined");
return -1;
}
final List<Locatable> locatables = this.intervalListProvider.enableBreakEndInterval(!disable_sv_bnd).enableSinglePoint().stream().collect(Collectors.toList());
if (locatables.isEmpty()) {
LOG.error("no position defined");
return -1;
}
final SAMFileWriterFactory swf = new SAMFileWriterFactory();
swf.setCompressionLevel(9);
swf.setCreateIndex(true);
final SamReaderFactory srf = super.createSamReaderFactory();
if (this.referencePath != null)
srf.referenceSequence(this.referencePath);
archive = ArchiveFactory.open(this.outputFile);
archive.setCompressionLevel(Deflater.NO_COMPRESSION);
for (final Path bamFile : bamFiles) {
LOG.info(bamFile.toString());
final StopWatch stopWatch = new StopWatch();
stopWatch.start();
final SamReader sr = srf.open(bamFile);
if (!sr.hasIndex()) {
sr.close();
LOG.error("file " + bamFile + " is not indexed.");
return -1;
}
final SAMFileHeader header = sr.getFileHeader();
if (!header.getSortOrder().equals(SortOrder.coordinate)) {
sr.close();
LOG.error("file " + bamFile + " is not sorted on coordinate.");
return -1;
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final Optional<String> dictLabel = SequenceDictionaryUtils.getBuildName(dict);
final String labelSuffix =
        (dictLabel.isPresent() ? "." + dictLabel.get() : "") +
        (locatables.size() == 1
                ? "." + locatables.get(0).getContig() + "_" + locatables.get(0).getStart() +
                  (locatables.get(0).getStart() == locatables.get(0).getEnd() ? "" : "_" + locatables.get(0).getEnd())
                : "");
final ContigNameConverter ctgConvert = ContigNameConverter.fromOneDictionary(dict);
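// reduce long intervals to their two 1-bp edges when a bound_edge limit is set, convert contig names to this BAM's dictionary, extend and clamp the coordinates, then merge overlapping queries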
final QueryInterval[] array = locatables.stream().flatMap(loc -> {
if (this.bound_edge < 1 || loc.getLengthOnReference() <= this.bound_edge) {
return Collections.singletonList(loc).stream();
}
return Arrays.asList(
        (Locatable) new SimpleInterval(loc.getContig(), loc.getStart(), loc.getStart()),
        (Locatable) new SimpleInterval(loc.getContig(), loc.getEnd(), loc.getEnd())
        ).stream();
}).map(LOC -> {
final String contig = ctgConvert.apply(LOC.getContig());
if (StringUtils.isBlank(contig)) {
LOG.warn("Cannot find " + LOC.getContig() + " in " + bamFile);
return null;
}
final SAMSequenceRecord ssr = dict.getSequence(contig);
if (LOC.getStart() > ssr.getSequenceLength()) {
LOG.warn("pos " + LOC + " is greater than chromosome size " + ssr.getSequenceLength() + " in " + bamFile);
return null;
}
return new QueryInterval(ssr.getSequenceIndex(), Math.max(1, LOC.getStart() - extend), Math.min(ssr.getSequenceLength(), LOC.getEnd() + extend));
}).filter(Q -> Q != null).collect(HtsCollectors.optimizedQueryIntervals());
final Path tmpBam = Files.createTempFile(this.tmpDir, "tmp.", ".bam");
final SAMFileHeader header2 = header.clone();
final SAMProgramRecord prg = header2.createProgramRecord();
prg.setProgramName(this.getProgramName());
prg.setProgramVersion(this.getGitHash());
prg.setCommandLine(this.getProgramCommandLine());
JVarkitVersion.getInstance().addMetaData(this, header2);
header2.addComment("MiniBam : Bam was " + bamFile);
try (SAMFileWriter sfw = swf.makeBAMWriter(header2, true, tmpBam)) {
if (array.length > 0) {
try (CloseableIterator<SAMRecord> ssr = sr.query(array, false)) {
while (ssr.hasNext()) {
final SAMRecord rec = ssr.next();
if (this.samRecordFilter.filterOut(rec))
continue;
rec.setAttribute(SAMTag.PG.name(), prg.getId());
sfw.addAlignment(rec);
}
}
}
}
sr.close();
final Path tmpBai = SamFiles.findIndex(tmpBam);
if (!Files.exists(tmpBai)) {
LOG.error("Cannot find tmp bai Index for " + bamFile + " " + tmpBam);
return -1;
}
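// resolve the sample name from the BAM read groups, or fall back to the file name when missing samples are allowed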
final String sampleName1 = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(null);
final String sampleName;
if (!StringUtils.isBlank(sampleName1)) {
sampleName = sampleName1;
} else if (this.allow_no_sample) {
sampleName = IOUtils.getFilenameWithoutSuffix(bamFile, 1);
LOG.warn("No Read group in " + bamFile + " using filename : " + sampleName);
} else {
throw new IllegalArgumentException("No Sample found in " + bamFile + ". Use --no-samples option ?");
}
String filename = this.filePrefix + sampleName + labelSuffix;
while (outputFileNames.contains(filename)) {
filename = this.filePrefix + sampleName + "." + (id_generator++) + labelSuffix;
}
outputFileNames.add(filename);
archive.copyTo(tmpBam, filename + FileExtensions.BAM);
archive.copyTo(tmpBai, filename + IOUtils.getFileSuffix(tmpBai));
stopWatch.stop();
LOG.info("Added " + StringUtils.niceFileSize(Files.size(tmpBam)) + "(bam) and " + StringUtils.niceFileSize(Files.size(tmpBai)) + " (Bai). " + StringUtils.niceDuration(stopWatch.getElapsedTime()));
Files.deleteIfExists(tmpBam);
Files.deleteIfExists(tmpBai);
}
if (!StringUtils.isBlank(this.userComment)) {
try (final PrintWriter pw = archive.openWriter(this.filePrefix + (this.filePrefix.endsWith(".") ? "" : ".") + "README.md")) {
pw.append(this.userComment);
pw.println();
pw.println("## BAMS");
pw.println();
for (final Path bamFile : bamFiles) pw.println(" * " + bamFile);
pw.println();
pw.println("## Date");
pw.println();
pw.println(new SimpleDateFormat("yyyyMMdd").format(new Date()));
pw.flush();
}
}
archive.close();
archive = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(archive);
}
}
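The interval handling above reduces to two steps: keep short intervals whole (or, when a bound_edge limit is set, keep only their two 1-bp edges), then extend every query by a few bases and clamp it to the chromosome bounds. A minimal, self-contained sketch of that logic; the Loc record is a hypothetical stand-in for jvarkit's SimpleInterval and htsjdk's QueryInterval, not the real classes.

import java.util.List;

public class EdgeSplitDemo {
    // hypothetical stand-in for SimpleInterval / QueryInterval (1-based, inclusive)
    record Loc(String contig, int start, int end) {
        int length() { return end - start + 1; }
    }

    /** Keep short intervals as-is; reduce long ones to their two 1-bp edges. */
    static List<Loc> splitEdges(final Loc loc, final int boundEdge) {
        if (boundEdge < 1 || loc.length() <= boundEdge) return List.of(loc);
        return List.of(new Loc(loc.contig(), loc.start(), loc.start()),
                       new Loc(loc.contig(), loc.end(), loc.end()));
    }

    /** Extend an interval by 'extend' bases and clamp it to [1, chromLength]. */
    static Loc extendAndClamp(final Loc loc, final int extend, final int chromLength) {
        return new Loc(loc.contig(),
                Math.max(1, loc.start() - extend),
                Math.min(chromLength, loc.end() + extend));
    }

    public static void main(String[] args) {
        final Loc sv = new Loc("chr1", 1_000, 2_000_000);
        System.out.println(splitEdges(sv, 10_000));                                // two 1-bp edges
        System.out.println(extendAndClamp(new Loc("chr1", 30, 60), 50, 248_956_422)); // clamped at position 1
    }
}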
Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
The class KnimeVariantHelper, method parseVariantIntervalFilters.
public Predicate<VariantContext> parseVariantIntervalFilters(final String... array) {
final Function<String, Optional<SimpleInterval>> parser = IntervalParserFactory.newInstance().make();
Predicate<VariantContext> filter = V -> false;
for (final String str : array) {
final SimpleInterval interval = parser.apply(str).orElseThrow(IntervalParserFactory.exception(str));
filter = filter.or(V -> V.overlaps(interval));
}
return filter;
}
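The method above is plain java.util.function.Predicate composition: start from an always-false predicate and OR in one overlap test per parsed interval, so an empty argument list rejects every variant. A self-contained sketch of the same pattern, with a hypothetical Span record standing in for SimpleInterval and VariantContext.

import java.util.function.Predicate;

public class PredicateOrDemo {
    // hypothetical stand-in for an interval / variant location (1-based, inclusive)
    record Span(String contig, int start, int end) {
        boolean overlaps(final Span other) {
            return contig.equals(other.contig()) && start <= other.end() && other.start() <= end;
        }
    }

    /** OR together one overlap test per interval; with no intervals, nothing passes. */
    static Predicate<Span> anyOverlap(final Span... intervals) {
        Predicate<Span> filter = v -> false;
        for (final Span interval : intervals) {
            filter = filter.or(v -> v.overlaps(interval));
        }
        return filter;
    }

    public static void main(String[] args) {
        final Predicate<Span> filter = anyOverlap(new Span("chr1", 100, 200), new Span("chr2", 5, 10));
        System.out.println(filter.test(new Span("chr1", 150, 150))); // true
        System.out.println(filter.test(new Span("chr3", 1, 1)));     // false
    }
}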
Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
The class Straw, method readHeader.
// reads the header, storing the positions of the normalization vectors and returning the master pointer
private long readHeader(final LittleEndianInputStream fin, final String intervalStr1, final String intervalStr2, final QueryInterval[] queryIntervals) throws IOException {
final long master = readHeader(fin);
Function<String, Optional<SimpleInterval>> intervalParser = IntervalParserFactory.newInstance().dictionary(this.dictionary).make();
final SimpleInterval interval1 = intervalParser.apply(intervalStr1).orElseThrow(IntervalParserFactory.exception(intervalStr1));
final SimpleInterval interval2 = intervalParser.apply(intervalStr2).orElseThrow(IntervalParserFactory.exception(intervalStr2));
queryIntervals[0] = new QueryInterval(this.dictionary.getSequenceIndex(interval1.getContig()), interval1.getStart(), interval1.getEnd());
queryIntervals[1] = new QueryInterval(this.dictionary.getSequenceIndex(interval2.getContig()), interval2.getStart(), interval2.getEnd());
return master;
}
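The last two lines only translate each contig name into its numeric index in the sequence dictionary and carry the 1-based start/end over into a QueryInterval. A small sketch of that translation, using a plain Map as a stand-in for SAMSequenceDictionary and a hypothetical QueryIdx record for QueryInterval.

import java.util.Map;

public class QueryIntervalDemo {
    record Span(String contig, int start, int end) {}
    record QueryIdx(int referenceIndex, int start, int end) {}

    /** Resolve the contig name to its dictionary index and keep the 1-based coordinates. */
    static QueryIdx toQuery(final Span span, final Map<String, Integer> contigToIndex) {
        final Integer tid = contigToIndex.get(span.contig());
        if (tid == null) throw new IllegalArgumentException("unknown contig " + span.contig());
        return new QueryIdx(tid, span.start(), span.end());
    }

    public static void main(String[] args) {
        final Map<String, Integer> dict = Map.of("chr1", 0, "chr2", 1);
        System.out.println(toQuery(new Span("chr2", 1_000, 2_000), dict)); // referenceIndex=1
    }
}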
Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
The class VcfStrechToSvg, method run.
private void run(final ArchiveFactory archive, final BedLine bed, final VCFHeader header, final VCFReader in, final PrintWriter manifest) {
LOG.info("processing " + bed);
final Set<String> limitSamples;
if (StringUtils.isBlank(this.sampleStr)) {
limitSamples = new TreeSet<>(header.getSampleNamesInOrder());
} else if (this.sampleStr.startsWith("^")) {
limitSamples = new TreeSet<>(header.getSampleNamesInOrder());
limitSamples.removeAll(Arrays.stream(this.sampleStr.substring(1).split("[, ]+")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
} else {
limitSamples = Arrays.stream(this.sampleStr.split("[, ]+")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toCollection(TreeSet::new));
}
final SAMSequenceDictionary dict = header.getSequenceDictionary();
try (CloseableIterator<VariantContext> iter = in.query(bed)) {
final List<VariantSet> L = iter.stream().filter(V -> acceptVariant(V)).map(V -> new VariantSet(V)).collect(Collectors.toCollection(ArrayList::new));
if (L.isEmpty()) {
LOG.warn("No valid variant found for \"" + bed + "\"");
return;
}
int i = 0;
while (i + 1 < L.size()) {
if (L.get(i).withinDistanceOf(L.get(i + 1), this.withinDistance)) {
L.get(i).variants.addAll(L.get(i + 1).variants);
L.remove(i + 1);
} else {
i++;
}
}
final int margin_left = 50;
final int margin_right = 10;
final double drawingAreaWidth = image_width_pixel - (margin_left + margin_right);
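// give each variant set a horizontal slice of the drawing area proportional to its length on the reference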
final int intervalLength = L.stream().mapToInt(V -> V.getLengthOnReference()).sum();
double x = 0;
for (i = 0; i < L.size(); i++) {
L.get(i).x = x;
x += (L.get(i).getLengthOnReference() / (double) intervalLength) * drawingAreaWidth;
}
for (i = 0; i < L.size(); i++) {
L.get(i).width = (i + 1 < L.size() ? L.get(i + 1).x : drawingAreaWidth) - L.get(i).x;
}
try {
final DocumentBuilderFactory db = DocumentBuilderFactory.newInstance();
final DocumentBuilder dom = db.newDocumentBuilder();
this.document = dom.newDocument();
final Element svgRoot = element("svg");
this.document.appendChild(svgRoot);
final String mainTitleStr = SequenceDictionaryUtils.getBuildName(dict).orElse("") + " " + new SimpleInterval(bed).toNiceString() + " length:" + StringUtils.niceInt(bed.getLengthOnReference());
/* SVG title */
{
final Element title = element("title");
svgRoot.appendChild(title);
title.appendChild(text(mainTitleStr));
}
/* SVG style */
{
final String gtopacity = this.dynamicParams.getOrDefault("gt.opacity", "0.7");
final Element style = element("style");
svgRoot.appendChild(style);
style.appendChild(text(".maintitle {text-anchor:middle;fill:blue} " + ".vc {stroke-width:0.5px;} " + ".transcript {stroke:black;stroke-width:1px;opacity:1;}" + ".exon {stroke:black;stroke-width:0.5px;fill:blue;opacity:1;}" + ".sample {fill:blue;font-size:7px;} " + ".samplelabel {stroke:gray;stroke-width:0.5px;font-size:" + this.dynamicParams.getOrDefault("sample.fontsize", "7") + "px;}\n" + ".coverage { fill:gray; stroke:yellow;opacity:0.2;} " + ".frame { fill:none; stroke: darkgray;} " + ".area0 {fill:white;}\n" + ".area1 {fill:floralwhite;}\n" + "circle.HOM_REF {fill:green;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "circle.HET {fill:blue;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "circle.HOM_VAR {fill:red;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "a {cursor: pointer;}\n"));
}
/* desc */
{
final Element descr = element("desc");
svgRoot.appendChild(descr);
descr.appendChild(text("Author: Pierre Lindenbaum\n" + JVarkitVersion.getInstance().getCompilationDate() + "\n" + JVarkitVersion.getInstance().getGitHash()));
}
// main title
{
final Element gtitle = element("text", mainTitleStr);
gtitle.setAttribute("class", "maintitle");
gtitle.setAttribute("x", format(this.image_width_pixel / 2.0));
gtitle.setAttribute("y", "15");
svgRoot.appendChild(wrapLoc(gtitle, bed));
}
int margin_top = 50;
double y = margin_top;
final double min_circle_radius = Double.parseDouble(this.dynamicParams.getOrDefault("gt.r1", "1"));
final double max_circle_radius = Double.parseDouble(this.dynamicParams.getOrDefault("gt.r2", "7"));
final Element main_g = element("g");
svgRoot.appendChild(main_g);
/**
* plot genes
*/
if (!this.all_genes.isEmpty()) {
final double transcript_height = 5;
final double exon_height = (transcript_height - 1);
final double save_y = y;
final Element g_genes = element("g");
g_genes.setAttribute("transform", "translate(" + margin_left + ",0)");
main_g.appendChild(g_genes);
/* loop over each vset */
for (i = 0; i < L.size(); i++) {
final VariantSet vset = L.get(i);
// find transcript in this vset
final List<Transcript> transcripts = this.all_genes.getOverlapping(vset).stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.overlaps(vset)).collect(Collectors.toList());
if (transcripts.isEmpty())
continue;
final Element g_vset = element("g");
g_vset.setAttribute("transform", "translate(" + format(vset.x) + ",0)");
g_genes.appendChild(g_vset);
// y in this vset
double y2 = save_y;
/* convert base to pixel */
final ToDoubleFunction<Integer> base2pixel = vset.createBaseToPixelFunction();
/* loop over transcripts */
for (final Transcript tr : transcripts) {
final Element g_tr = element("g");
g_tr.setAttribute("transform", "translate(0," + format(y2) + ")");
g_vset.appendChild(g_tr);
final Element line = element("line");
line.setAttribute("class", "transcript");
line.setAttribute("x1", format(Math.max(0, base2pixel.applyAsDouble(tr.getStart()))));
line.setAttribute("y1", format(transcript_height / 2.0));
line.setAttribute("x2", format(Math.min(vset.width, base2pixel.applyAsDouble(tr.getEnd()))));
line.setAttribute("y2", format(transcript_height / 2.0));
line.appendChild(element("title", tr.getId()));
g_tr.appendChild(wrapLoc(line, tr));
/* loop over exons */
for (final Exon exon : tr.getExons()) {
if (!exon.overlaps(vset))
continue;
final Element exRect = element("rect");
exRect.setAttribute("class", "exon");
final double x_start = Math.max(0, base2pixel.applyAsDouble(exon.getStart()));
exRect.setAttribute("x", format(x_start));
exRect.setAttribute("y", format(transcript_height / 2.0 - exon_height / 2.0));
final double x_end = Math.min(vset.width, base2pixel.applyAsDouble(exon.getEnd()));
exRect.setAttribute("width", format(x_end - x_start));
exRect.setAttribute("height", format(exon_height));
exRect.appendChild(element("title", exon.getName()));
g_tr.appendChild(wrapLoc(exRect, exon));
}
y2 += transcript_height + 0.5;
}
y = Math.max(y, y2);
}
y++;
}
final double sample_height = Double.parseDouble(this.dynamicParams.getOrDefault("sample.height", "25"));
final double sample_height2 = sample_height - (max_circle_radius * 2.0);
int space_between_samples = 2;
int got_n_samples = 0;
for (final String sn : header.getSampleNamesInOrder()) {
if (!limitSamples.contains(sn))
continue;
boolean got_this_sample = false;
final Element g_sample = element("g");
g_sample.setAttribute("transform", "translate(" + margin_left + "," + format(y) + ")");
/* get coverage */
final int maxCoverage;
if (this.sample2bam.containsKey(sn)) {
final CoverageFactory coverageFactory = new CoverageFactory();
try (SamReader sr = this.openSamReader(this.sample2bam.get(sn))) {
/* loop over each variant set */
for (final VariantSet vset : L) {
vset.coverage = coverageFactory.getSimpleCoverage(sr, vset, sn);
}
}
maxCoverage = L.stream().flatMapToInt(V -> V.coverage.stream()).max().orElse(0);
} else {
maxCoverage = 0;
for (final VariantSet vset : L) {
vset.coverage = null;
}
}
/* loop over each variant set */
for (i = 0; i < L.size(); i++) {
final VariantSet vset = L.get(i);
final Element g_vset = element("g");
g_vset.setAttribute("transform", "translate(" + format(vset.x) + ",0)");
g_sample.appendChild(g_vset);
/* convert base to pixel */
final ToDoubleFunction<Integer> base2pixel = vset.createBaseToPixelFunction();
// plot set length
final Element rect = element("rect");
rect.setAttribute("class", "area" + (i % 2));
rect.setAttribute("x", "0");
rect.setAttribute("y", "0");
rect.setAttribute("width", format(vset.width));
rect.setAttribute("height", format(sample_height));
if (!remove_tooltip)
rect.appendChild(element("title", vset.toString()));
g_vset.appendChild(rect);
// plot coverage
if (maxCoverage > 0 && this.sample2bam.containsKey(sn)) {
final double[] scaled = vset.coverage.scaleAverage((int) vset.width);
final StringBuilder sb = new StringBuilder();
sb.append("0," + sample_height);
for (int t = 0; t < scaled.length; t++) {
if (t > 1 && t + 1 < scaled.length && format(scaled[t - 1]).equals(format(scaled[t + 1])) && format(scaled[t - 1]).equals(format(scaled[t])))
continue;
sb.append(" ").append(t).append(",");
sb.append(format(sample_height * (1.0 - scaled[t] / maxCoverage)));
}
sb.append(" " + format(vset.width) + "," + sample_height);
final Element polyline = element("polyline");
polyline.setAttribute("class", "coverage");
polyline.setAttribute("points", sb.toString());
g_vset.appendChild(polyline);
vset.coverage = null;
}
// plot vertical line if colorTag defined
if (!StringUtils.isBlank(this.colorTag)) {
for (final VariantContext vc : vset.variants) {
if (!vc.hasAttribute(this.colorTag))
continue;
final String cssColor = vc.getAttributeAsString(this.colorTag, "");
if (StringUtils.isBlank(cssColor))
continue;
final double x0 = base2pixel.applyAsDouble(vc.getStart());
final Element line = element("line");
line.setAttribute("class", "vc");
line.setAttribute("style", "stroke:" + cssColor);
line.setAttribute("x1", format(x0));
line.setAttribute("y1", "0");
line.setAttribute("x2", format(x0));
line.setAttribute("y2", format(sample_height));
g_vset.appendChild(line);
}
}
// print all variants in this vcfset for this sample
for (final VariantContext vc : vset.variants) {
final Genotype gt = vc.getGenotype(sn);
if (gt.isNoCall())
continue;
if (hide_hom_ref && gt.isHomRef())
continue;
if (gt.hasGQ() && gt.getGQ() < this.minGQ)
continue;
final OptionalDouble alt_ratio = getAltRatio(gt);
if (!alt_ratio.isPresent())
continue;
final OptionalDouble af = getAF(vc);
final double circle_radius = min_circle_radius + (max_circle_radius - min_circle_radius) * (1.0 - af.orElse(1.0));
// HOMREF=0; HET =0.5; HOMVAR = 1;
final double gtx = base2pixel.applyAsDouble(vc.getStart());
final double gty = sample_height - (sample_height2 * alt_ratio.getAsDouble() + (sample_height - sample_height2) / 2.0);
final Element circle = element("circle");
circle.setAttribute("class", gt.getType().name());
circle.setAttribute("cx", format(gtx));
circle.setAttribute("cy", format(gty));
circle.setAttribute("r", format(circle_radius));
if (!remove_tooltip)
circle.appendChild(element("title", vc.getStart() + " " + (vc.hasID() ? vc.getID() : "") + " " + vc.getAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining("/")) + " " + gt.getType().name() + " AF=" + format(af.orElse(-1))));
g_vset.appendChild(wrapLoc(circle, vc));
got_this_sample = true;
}
}
final Element frame_sample = element("rect");
frame_sample.setAttribute("class", "frame");
frame_sample.setAttribute("x", "0");
frame_sample.setAttribute("y", "0");
frame_sample.setAttribute("width", format(drawingAreaWidth));
frame_sample.setAttribute("height", format(sample_height));
g_sample.appendChild(frame_sample);
final Element label = element("text", sn + (maxCoverage == 0 ? "" : " Max Cov. " + maxCoverage));
label.setAttribute("class", "samplelabel");
label.setAttribute("x", "0");
label.setAttribute("y", "0");
// label.setAttribute("transform", "translate("+format(-10)+","+0+") rotate(90) ");
label.setAttribute("transform", "translate(12,12)");
if (!remove_tooltip)
label.appendChild(element("title", sn));
g_sample.appendChild(label);
if (got_this_sample) {
got_n_samples++;
main_g.appendChild(g_sample);
y += sample_height + space_between_samples;
} else {
LOG.warn("no valid data for sample " + sn + " in " + bed);
}
}
// remove extra sample space
y -= space_between_samples;
svgRoot.setAttribute("width", format(this.image_width_pixel + 1));
svgRoot.setAttribute("height", format(y + 1));
if (got_n_samples == 0) {
LOG.info("no sample/genotype found for " + bed);
return;
}
// save
final Transformer tr = TransformerFactory.newInstance().newTransformer();
final String filename = bed.getContig() + "_" + bed.getStart() + "_" + bed.getEnd() + ".svg" + (this.compressed_svg ? ".gz" : "");
LOG.info("writing " + filename);
if (this.compressed_svg) {
try (final OutputStream pw = archive.openOuputStream(filename)) {
try (GZIPOutputStream gzout = new GZIPOutputStream(pw)) {
tr.transform(new DOMSource(this.document), new StreamResult(gzout));
gzout.finish();
gzout.flush();
}
pw.flush();
}
} else {
try (final PrintWriter pw = archive.openWriter(filename)) {
tr.transform(new DOMSource(this.document), new StreamResult(pw));
pw.flush();
}
}
manifest.print(bed.getContig());
manifest.print("\t");
manifest.print(bed.getStart() - 1);
manifest.print("\t");
manifest.print(bed.getEnd());
manifest.print("\t");
manifest.print(filename);
manifest.println();
} catch (final Throwable err) {
throw new RuntimeException(err);
} finally {
this.document = null;
}
}
}
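The horizontal layout in this method gives each VariantSet a slice of the drawing area proportional to its length on the reference, then maps base positions to pixels inside that slice. A minimal sketch of that proportional layout under illustrative names (Block and its methods are hypothetical, not the jvarkit API).

import java.util.List;
import java.util.function.IntToDoubleFunction;

public class ProportionalLayoutDemo {
    static final class Block {
        final int start, end;   // 1-based, inclusive reference coordinates
        double x, width;        // pixel coordinates filled in by layout()
        Block(int start, int end) { this.start = start; this.end = end; }
        int length() { return end - start + 1; }
        /** Linear base-to-pixel mapping inside this block's pixel slice. */
        IntToDoubleFunction baseToPixel() {
            return pos -> ((pos - start) / (double) length()) * width;
        }
    }

    /** Give each block a share of the drawing area proportional to its reference length. */
    static void layout(final List<Block> blocks, final double drawingAreaWidth) {
        final int total = blocks.stream().mapToInt(Block::length).sum();
        double x = 0;
        for (final Block b : blocks) {
            b.x = x;
            b.width = (b.length() / (double) total) * drawingAreaWidth;
            x += b.width;
        }
    }

    public static void main(String[] args) {
        final List<Block> blocks = List.of(new Block(100, 199), new Block(1_000, 1_299));
        layout(blocks, 800);
        System.out.println(blocks.get(0).width + " " + blocks.get(1).width);   // 200.0 600.0
        System.out.println(blocks.get(1).baseToPixel().applyAsDouble(1_150));  // 300.0, midway in the second slice
    }
}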
Use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
The class CnvSlidingWindow, method doWork.
@Override
public int doWork(final List<String> args) {
final List<Sample> samples = new ArrayList<>();
try {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
final DistanceParser distParser = new DistanceParser();
final int[] windows_array = Arrays.stream(CharSplitter.SEMICOLON.split(windowDefs)).filter(S -> !StringUtils.isBlank(S)).mapToInt(N -> distParser.applyAsInt(N)).toArray();
if (windows_array.length == 0) {
LOG.error("No window defined.");
return -1;
}
if (windows_array.length % 2 != 0) {
LOG.error("odd number of windows ? " + this.windowDefs);
return -1;
}
final List<Path> inputBams = IOUtils.unrollPaths(args);
if (inputBams.isEmpty()) {
LOG.error("input bam file missing.");
return -1;
}
final Set<String> sampleNames = new TreeSet<>();
for (final Path samFile : inputBams) {
final Sample sample = new Sample(samFile);
if (sampleNames.contains(sample.name)) {
LOG.error("duplicate sample " + sample.name);
sample.close();
return -1;
}
sampleNames.add(sample.name);
samples.add(sample);
SequenceUtil.assertSequenceDictionariesEqual(dict, sample.dict);
}
final List<Locatable> contigs = dict.getSequences().stream().filter(SR -> !SR.getSequenceName().matches(this.excludeRegex)).collect(Collectors.toCollection(ArrayList::new));
if (this.excludeBedPath != null) {
final BedLineCodec bedLineCodec = new BedLineCodec();
final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(dict);
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.excludeBedPath)) {
final List<SimpleInterval> exclude = br.lines()
        .filter(L -> !BedLine.isBedHeader(L))
        .map(L -> bedLineCodec.decode(L))
        .filter(B -> B != null && !StringUtils.isBlank(ctgConverter.apply(B.getContig())))
        .map(B -> new SimpleInterval(ctgConverter.apply(B.getContig()), B.getStart(), B.getEnd()))
        .collect(Collectors.toList());
boolean done = false;
while (!done) {
done = true;
int i = 0;
while (i < contigs.size()) {
final Locatable contig = contigs.get(i);
final Locatable overlapper = exclude.stream().filter(EX -> EX.overlaps(contig)).findAny().orElse(null);
if (overlapper != null) {
contigs.remove(i);
contigs.addAll(split(contig, overlapper));
done = false;
} else {
i++;
}
}
}
}
}
contigs.sort(new ContigDictComparator(dict).createLocatableComparator());
final Allele ref_allele = Allele.create("N", true);
final Allele dup_allele = Allele.create("<DUP>", false);
final Allele del_allele = Allele.create("<DEL>", false);
final Function<Integer, List<Allele>> cnv2allele = CNV -> {
switch(CNV) {
case 0:
return Arrays.asList(ref_allele, ref_allele);
case 1:
return Arrays.asList(ref_allele, dup_allele);
case 2:
return Arrays.asList(dup_allele, dup_allele);
case -1:
return Arrays.asList(ref_allele, del_allele);
case -2:
return Arrays.asList(del_allele, del_allele);
default:
throw new IllegalArgumentException("cnv:" + CNV);
}
};
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY);
final VCFFormatHeaderLine fmtLeftCov = new VCFFormatHeaderLine("LC", 1, VCFHeaderLineType.Float, "Left median coverage.");
final VCFFormatHeaderLine fmtMidCov = new VCFFormatHeaderLine("MC", 1, VCFHeaderLineType.Float, "Middle median coverage.");
final VCFFormatHeaderLine fmtRightCov = new VCFFormatHeaderLine("RC", 1, VCFHeaderLineType.Float, "right median coverage.");
final VCFFormatHeaderLine fmtLeftMedian = new VCFFormatHeaderLine("LM", 1, VCFHeaderLineType.Float, "Left normalized median coverage.");
final VCFFormatHeaderLine fmtMidMedian = new VCFFormatHeaderLine("MM", 1, VCFHeaderLineType.Float, "Middle normalized median coverage.");
final VCFFormatHeaderLine fmtRightMedian = new VCFFormatHeaderLine("RM", 1, VCFHeaderLineType.Float, "right normalized median coverage.");
metaData.add(fmtLeftCov);
metaData.add(fmtMidCov);
metaData.add(fmtRightCov);
metaData.add(fmtLeftMedian);
metaData.add(fmtMidMedian);
metaData.add(fmtRightMedian);
final VCFHeader header = new VCFHeader(metaData, sampleNames);
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
final VariantContextWriter vcw = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
vcw.writeHeader(header);
for (final Locatable contig : contigs) {
System.gc();
final short[] array = new short[contig.getLengthOnReference()];
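// per-base depth over this contig, one cell per reference base, saturating at Short.MAX_VALUE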
final SortingCollection<Gt> sorter = SortingCollection.newInstance(Gt.class, new GtCodec(), (A, B) -> A.compare1(B), this.sorting.getMaxRecordsInRam(), this.sorting.getTmpPaths());
for (int bam_index = 0; bam_index < samples.size(); bam_index++) {
final Sample sampleBam = samples.get(bam_index);
Arrays.fill(array, (short) 0);
try (SAMRecordIterator iter = sampleBam.samReader.queryOverlapping(contig.getContig(), contig.getStart(), contig.getEnd())) {
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
int refPos = rec.getStart();
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int i = 0; i < ce.getLength(); i++) {
final int idx = refPos - contig.getStart() + i;
if (idx < 0)
continue;
if (idx >= array.length)
break;
if (array[idx] == Short.MAX_VALUE)
continue;
array[idx]++;
}
}
refPos += ce.getLength();
}
}
}
}
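// slide each window size across the contig, comparing the middle window's median depth to its flanking medians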
for (int widx = 0; widx < windows_array.length; widx += 2) {
final int window_size = windows_array[widx + 0];
final int extend = (int) Math.ceil(window_size * this.extend);
if (extend <= 0)
continue;
if (window_size > contig.getLengthOnReference())
continue;
final int window_shift = windows_array[widx + 1];
LOG.info(contig + " " + window_size + "+-" + extend + ";" + window_shift + " " + sampleBam.name);
final Coverage leftcov = new Coverage(extend);
final Coverage rightcov = new Coverage(extend);
final Coverage leftrightcov = new Coverage(extend + extend);
final Coverage midcov = new Coverage(window_size);
for (int pos1 = contig.getStart(); pos1 + window_size + extend + extend < contig.getEnd(); pos1 += window_shift) {
leftcov.reset();
rightcov.reset();
leftrightcov.reset();
midcov.reset();
for (int x = 0; x < extend; x++) {
final int idx = pos1 - contig.getStart() + x;
leftcov.add(array[idx]);
leftrightcov.add(array[idx]);
}
final double leftMedian = leftcov.median();
if (leftMedian < this.min_depth)
continue;
for (int x = 0; x < extend; x++) {
final int idx = pos1 - contig.getStart() + extend + window_size + x;
rightcov.add(array[idx]);
leftrightcov.add(array[idx]);
}
final double rightMedian = rightcov.median();
if (rightMedian < this.min_depth)
continue;
for (int x = 0; x < window_size; x++) {
final int idx = pos1 - contig.getStart() + extend + x;
midcov.add(array[idx]);
}
final double median = leftrightcov.median();
if (rightcov.median() < this.min_depth)
continue;
for (int x = 0; x < midcov.count; x++) {
midcov.array[x] /= median;
}
for (int x = 0; x < leftcov.count; x++) {
leftcov.array[x] /= median;
}
for (int x = 0; x < rightcov.count; x++) {
rightcov.array[x] /= median;
}
final double norm_depth = midcov.median();
final int cnv = getCNVIndex(norm_depth);
if (cnv != CNV_UNDEFINED && cnv != 1 && getCNVIndex(leftcov.median()) == 1 && getCNVIndex(rightcov.median()) == 1) {
final Gt gt = new Gt();
gt.start = pos1 + extend;
gt.end = gt.start + window_size;
gt.sample_idx = bam_index;
gt.cnv = cnv;
gt.leftMedian = (float) leftMedian;
gt.midMedian = (float) median;
gt.rightMedian = (float) rightMedian;
gt.leftCov = (float) leftcov.median();
gt.midCov = (float) norm_depth;
gt.rightCov = (float) rightcov.median();
sorter.add(gt);
}
}
}
}
sorter.setDestructiveIteration(true);
try (CloseableIterator<Gt> iter = sorter.iterator()) {
final EqualRangeIterator<Gt> eq = new EqualRangeIterator<>(iter, (A, B) -> A.compare0(B));
while (eq.hasNext()) {
final List<Gt> row = eq.next();
if (row.isEmpty())
continue;
final Gt first = row.get(0);
final Set<Allele> alleles = new HashSet<>();
row.stream().flatMap(GT -> cnv2allele.apply(GT.cnv).stream()).forEach(CNV -> alleles.add(CNV));
alleles.add(ref_allele);
if (alleles.size() < 2)
continue;
final VariantContextBuilder vcb = new VariantContextBuilder(null, contig.getContig(), first.start, first.end, alleles);
vcb.attribute(VCFConstants.END_KEY, first.end);
final List<Genotype> genotypes = new ArrayList<>(samples.size());
for (final Gt gt : row) {
final GenotypeBuilder gb = new GenotypeBuilder(samples.get(gt.sample_idx).name, cnv2allele.apply(gt.cnv));
gb.attribute(fmtLeftMedian.getID(), (double) gt.leftMedian);
gb.attribute(fmtMidMedian.getID(), (double) gt.midMedian);
gb.attribute(fmtRightMedian.getID(), (double) gt.rightMedian);
gb.attribute(fmtLeftCov.getID(), (double) gt.leftCov);
gb.attribute(fmtMidCov.getID(), (double) gt.midCov);
gb.attribute(fmtRightCov.getID(), (double) gt.rightCov);
genotypes.add(gb.make());
}
vcb.genotypes(genotypes);
vcw.add(vcb.make());
}
eq.close();
}
sorter.cleanup();
}
vcw.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
samples.forEach(S -> CloserUtil.close(S));
}
}
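The sliding-window genotyping above normalizes the middle window's depth by the median of its two flanks and emits a call only when the normalized depth leaves the diploid range while both flanks stay diploid. A compact, self-contained sketch of that decision; classify() and its cutoffs are illustrative assumptions, not the actual getCNVIndex() thresholds.

import java.util.Arrays;

public class FlankNormalizedCnvDemo {
    static double median(final double[] values) {
        final double[] sorted = values.clone();
        Arrays.sort(sorted);
        final int mid = sorted.length / 2;
        return sorted.length % 2 == 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2.0;
    }

    /** Illustrative copy-number classification of a flank-normalized depth (1.0 == diploid). */
    static int classify(final double normDepth) {
        if (normDepth < 0.25) return -2;  // homozygous deletion
        if (normDepth < 0.75) return -1;  // heterozygous deletion
        if (normDepth > 1.75) return 2;   // multi-copy duplication
        if (normDepth > 1.25) return 1;   // single-copy duplication
        return 0;                         // diploid
    }

    static double[] concat(final double[] a, final double[] b) {
        final double[] out = Arrays.copyOf(a, a.length + b.length);
        System.arraycopy(b, 0, out, a.length, b.length);
        return out;
    }

    public static void main(String[] args) {
        final double[] leftFlank  = {29, 30, 31, 30, 28};
        final double[] rightFlank = {31, 30, 29, 32, 30};
        final double[] middle     = {15, 14, 16, 15, 15};
        // normalize the middle window by the combined flank median
        final double flankMedian = median(concat(leftFlank, rightFlank));
        final double normDepth = median(middle) / flankMedian;
        // call a CNV only when both flanks themselves look diploid
        if (classify(median(leftFlank) / flankMedian) == 0
                && classify(median(rightFlank) / flankMedian) == 0
                && classify(normDepth) != 0) {
            System.out.println("CNV call, normalized depth = " + normDepth); // ~0.5 -> heterozygous deletion
        }
    }
}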