use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class CoverageFactory method getSimpleCoverage.
/**
* @param reader an indexed SamReader; must not be null
* @param locCollection collection of items; must not be empty, and all positions must belong to the same contig
* @param sample sample (partition) name, or null/blank to count reads from any read group
* @return the per-base coverage over the span of locCollection (min start to max end)
*/
public SimpleCoverage getSimpleCoverage(final SamReader reader, final Collection<? extends Locatable> locCollection, final String sample) {
if (reader == null)
throw new IllegalArgumentException("reader==null");
if (!reader.hasIndex())
throw new IllegalArgumentException("SamReader is not indexed. " + reader.getResourceDescription());
if (locCollection.isEmpty())
throw new IllegalArgumentException("Empty Query");
final SAMFileHeader header = reader.getFileHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final Set<String> contigs = locCollection.stream().map(L -> L.getContig()).collect(Collectors.toSet());
if (contigs.size() != 1)
throw new IllegalArgumentException("Expected only one chromosome but got:" + String.join(",", contigs));
final String contig = contigs.iterator().next();
final int chromStart = locCollection.stream().mapToInt(L -> L.getStart()).min().orElse(0);
final int chromEnd = locCollection.stream().mapToInt(L -> L.getEnd()).max().orElse(0);
final Locatable loc = new SimpleInterval(contig, chromStart, chromEnd);
final SimpleCoverageImpl cov = new SimpleCoverageImpl(loc);
final int tid = dict.getSequenceIndex(contig);
if (tid < 0) {
return cov;
}
if (!StringUtils.isBlank(sample) && header.getReadGroups().stream().map(RG -> this.partition.apply(RG)).noneMatch(S -> sample.equals(S))) {
return cov;
}
int idx = 0;
QueryInterval[] array = new QueryInterval[locCollection.size()];
for (final Locatable item : locCollection) {
array[idx] = new QueryInterval(tid, item.getStart(), item.getEnd());
idx++;
}
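// merge overlapping/adjacent intervals into a minimal, sorted set before querying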
array = QueryInterval.optimizeIntervals(array);
try (CloseableIterator<SAMRecord> iter = reader.query(array, false)) {
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (rec.getMappingQuality() < this.mappingQuality)
continue;
if (!this.samRecordFilter.test(rec))
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
if (!StringUtils.isBlank(sample) && !SAMRecordPartition.any.equals(this.partition)) {
if (!sample.equals(this.partition.getPartion(rec)))
continue;
}
int maxEnd = loc.getEnd();
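// if the mate starts inside this read, trim this read's contribution at the mate start so the overlapping part of the pair is not counted twice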
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getReferenceIndex().equals(rec.getMateReferenceIndex())) {
int mateStart;
if (this.useClip && SAMUtils.getMateCigar(rec) != null) {
mateStart = SAMUtils.getMateUnclippedStart(rec);
} else {
mateStart = rec.getMateAlignmentStart();
}
if (rec.getStart() < mateStart && rec.getEnd() > mateStart) {
maxEnd = Math.min(maxEnd, mateStart);
}
}
if (!this.useClip) {
for (AlignmentBlock block : rec.getAlignmentBlocks()) {
if (block.getReferenceStart() > maxEnd)
break;
for (int t = 0; t < block.getLength(); t++) {
final int pos1 = block.getReferenceStart() + t;
if (pos1 < loc.getStart())
continue;
if (pos1 > maxEnd)
break;
cov.coverage[pos1 - loc.getStart()]++;
}
}
} else {
int ref1 = rec.getUnclippedStart();
for (final CigarElement ce : cigar) {
if (ref1 > maxEnd)
break;
final CigarOperator op = ce.getOperator();
switch(op) {
case P:
break;
case N:
case D:
ref1 += ce.getLength();
break;
case I:
break;
case H:
case S:
case X:
case EQ:
case M:
{
for (int i = 0; i < ce.getLength(); i++) {
final int pos1 = ref1 + i;
if (pos1 < loc.getStart())
continue;
if (pos1 > maxEnd)
break;
cov.coverage[pos1 - loc.getStart()]++;
}
ref1 += ce.getLength();
break;
}
default:
throw new IllegalStateException(op.name());
}
}
}
}
}
return cov;
}
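A minimal calling sketch for orientation (not from the source): only getSimpleCoverage itself is taken from the code above; the no-arg CoverageFactory constructor, the file name and the null sample are illustrative assumptions.
// uses htsjdk.samtools.SamReader/SamReaderFactory, java.io.File, java.util.Collections
try (final SamReader reader = SamReaderFactory.makeDefault().open(new File("sample.bam"))) { // the BAM must be indexed
final SimpleCoverage cov = new CoverageFactory().getSimpleCoverage(reader, Collections.singletonList(new SimpleInterval("chr1", 100000, 100999)), null /* null/blank sample = any read group */);
// cov spans chr1:100000-100999 with one depth value per base
}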
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class VcfLoopOverGenes method doWork.
@SuppressWarnings("resource")
@Override
public int doWork(final List<String> args) {
PrintWriter pw = null;
VCFReader vcfFileReader = null;
CloseableIterator<VariantContext> iter = null;
CloseableIterator<GeneLoc> iter2 = null;
BufferedReader br = null;
ArchiveFactory archive = null;
try {
final Path vcf = Paths.get(oneAndOnlyOneFile(args));
vcfFileReader = VCFReaderFactory.makeDefault().open(vcf, (this.geneFile != null || !StringUtil.isBlank(this.regionStr)));
this.dictionary = vcfFileReader.getHeader().getSequenceDictionary();
if (this.dictionary == null) {
throw new JvarkitException.VcfDictionaryMissing(vcf);
}
final VcfTools tools = new VcfTools(vcfFileReader.getHeader());
if (!this.prefix.isEmpty() && !this.prefix.endsWith(".")) {
this.prefix += ".";
}
if (this.geneFile == null) {
final SortingCollection<GeneLoc> sortingCollection = SortingCollection.newInstance(GeneLoc.class, new GeneLocCodec(), (A, B) -> A.compareTo(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingCollection.setDestructiveIteration(true);
if (StringUtil.isBlank(this.regionStr)) {
iter = vcfFileReader.iterator();
} else {
final SimpleInterval interval = IntervalParserFactory.newInstance().dictionary(this.dictionary).enableWholeContig().make().apply(this.regionStr).orElseThrow(IntervalParserFactory.exception(this.regionStr));
iter = vcfFileReader.query(interval.getContig(), interval.getStart(), interval.getEnd());
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(vcfFileReader.getHeader()).logger(LOG);
if (this.splitMethod.equals(SplitMethod.Annotations)) {
while (iter.hasNext()) {
final VariantContext ctx = progress.watch(iter.next());
for (final AnnPredictionParser.AnnPrediction pred : tools.getAnnPredictionParser().getPredictions(ctx)) {
if (this.snpEffNoIntergenic && pred.isIntergenicRegion()) {
continue;
}
if (!StringUtil.isBlank(pred.getGeneName())) {
sortingCollection.add(create(ctx, pred.getGeneName(), SourceType.ANN_GeneName));
}
if (!StringUtil.isBlank(pred.getGeneId())) {
sortingCollection.add(create(ctx, pred.getGeneId(), SourceType.ANN_GeneID));
}
if (!StringUtil.isBlank(pred.getFeatureId())) {
sortingCollection.add(create(ctx, pred.getFeatureId(), SourceType.ANN_FeatureID));
}
}
for (final VepPredictionParser.VepPrediction pred : tools.getVepPredictionParser().getPredictions(ctx)) {
if (!StringUtil.isBlank(pred.getGene())) {
sortingCollection.add(create(ctx, pred.getGene(), SourceType.VEP_Gene));
}
if (!StringUtil.isBlank(pred.getFeature())) {
sortingCollection.add(create(ctx, pred.getFeature(), SourceType.VEP_Feature));
}
if (!StringUtil.isBlank(pred.getSymbol())) {
sortingCollection.add(create(ctx, pred.getSymbol(), SourceType.VEP_Symbol));
}
if (!StringUtil.isBlank(pred.getHgncId())) {
sortingCollection.add(create(ctx, pred.getHgncId(), SourceType.VEP_HgncId));
}
}
}
} else if (this.splitMethod.equals(SplitMethod.VariantSlidingWindow)) {
/* split the VCF per sliding window of variants */
if (this.variantsWinCount < 1) {
LOG.error("Bad value for variantsWinCount");
return -1;
}
if (this.variantsWinShift < 1 || this.variantsWinShift > this.variantsWinCount) {
LOG.error("Bad value for variantsWinShift");
return -1;
}
final List<VariantContext> buffer = new ArrayList<>(this.variantsWinCount);
/* routine to dump the buffer into the sorting collection */
final Runnable dumpBuffer = () -> {
if (buffer.isEmpty())
return;
final String contig = buffer.get(0).getContig();
final int chromStart = buffer.stream().mapToInt(CTX -> CTX.getStart()).min().getAsInt();
// the window end is the largest variant *start* in the buffer (not the largest end)
final int chromEnd0 = buffer.stream().mapToInt(CTX -> CTX.getStart()).max().getAsInt();
// final int chromEnd1 = buffer.stream().mapToInt(CTX->CTX.getEnd()).max().getAsInt();
final String identifier = contig + "_" + String.format(NUM_FORMAT, chromStart) + "_" + String.format(NUM_FORMAT, chromEnd0);
for (final VariantContext ctx : buffer) {
sortingCollection.add(create(ctx, identifier, SourceType.SlidingVariants));
}
};
while (iter.hasNext()) {
VariantContext ctx = progress.watch(iter.next());
/* reduce the memory footprint for this context */
ctx = new VariantContextBuilder(ctx).genotypes(Collections.emptyList()).unfiltered().rmAttributes(new ArrayList<>(ctx.getAttributes().keySet())).make();
if (!buffer.isEmpty() && !buffer.get(0).getContig().equals(ctx.getContig())) {
dumpBuffer.run();
buffer.clear();
}
buffer.add(ctx);
if (buffer.size() >= this.variantsWinCount) {
dumpBuffer.run();
final int fromIndex = Math.min(this.variantsWinShift, buffer.size());
buffer.subList(0, fromIndex).clear();
}
}
dumpBuffer.run();
buffer.clear();
} else if (this.splitMethod.equals(SplitMethod.ContigSlidingWindow)) {
if (this.contigWinLength < 1) {
LOG.error("Bad value for contigWinCount");
return -1;
}
if (this.contigWinShift < 1 || this.contigWinShift > this.contigWinLength) {
LOG.error("Bad value for contigWinShift");
return -1;
}
while (iter.hasNext()) {
VariantContext ctx = progress.watch(iter.next());
/* reduce the memory footprint for this context */
ctx = new VariantContextBuilder(ctx).genotypes(Collections.emptyList()).unfiltered().rmAttributes(new ArrayList<>(ctx.getAttributes().keySet())).make();
int start = 0;
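// walk candidate windows (length contigWinLength, step contigWinShift) and copy the variant into every window that covers its start position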
while (start <= ctx.getStart()) {
if (start + this.contigWinLength >= ctx.getStart()) {
final int chromStart = start;
final int chromEnd0 = start + this.contigWinLength;
final String identifier = ctx.getContig() + "_" + String.format(NUM_FORMAT, chromStart) + "_" + String.format(NUM_FORMAT, chromEnd0);
sortingCollection.add(create(ctx, identifier, SourceType.SlidingContig));
}
start += this.contigWinShift;
}
}
} else {
throw new IllegalStateException("No such method: " + this.splitMethod);
}
sortingCollection.doneAdding();
progress.finish();
iter.close();
iter = null;
pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
iter2 = sortingCollection.iterator();
final EqualRangeIterator<GeneLoc> eqiter = new EqualRangeIterator<>(iter2, this.compareGeneName);
int geneIdentifierId = 0;
while (eqiter.hasNext()) {
final List<GeneLoc> gene = eqiter.next();
pw.print(gene.get(0).contig);
pw.print('\t');
// -1 for BED
pw.print(gene.stream().mapToInt(G -> G.start).min().getAsInt() - 1);
pw.print('\t');
pw.print(gene.stream().mapToInt(G -> G.end).max().getAsInt());
pw.print('\t');
pw.print(this.prefix + String.format("%09d", ++geneIdentifierId));
pw.print('\t');
pw.print(gene.get(0).geneName);
pw.print('\t');
pw.print(gene.get(0).sourceType);
pw.print('\t');
pw.print(gene.size());
pw.println();
}
pw.flush();
pw.close();
pw = null;
eqiter.close();
iter2.close();
iter2 = null;
sortingCollection.cleanup();
} else {
if (this.nJobs < 1) {
this.nJobs = Math.max(1, Runtime.getRuntime().availableProcessors());
LOG.info("setting njobs to " + this.nJobs);
}
final ExecutorService executorService;
final List<Future<Integer>> futureResults;
if (this.nJobs > 1) {
executorService = new ThreadPoolExecutor(this.nJobs, this.nJobs, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>());
futureResults = new ArrayList<>();
} else {
executorService = null;
futureResults = Collections.emptyList();
}
if (this.outputFile == null) {
LOG.error("When scanning a VCF with " + this.geneFile + ", an output file must be defined.");
return -1;
}
if (!this.exec.isEmpty()) {
if (this.outputFile.getName().endsWith(".zip")) {
LOG.error("Cannot execute " + this.exec + " when saving to a zip.");
return -1;
}
}
archive = ArchiveFactory.open(this.outputFile);
// when every output file will be deleted after the command, no manifest is needed
final PrintWriter manifest = this.deleteAfterCommand && !this.exec.isEmpty()
? new PrintWriter(new NullOuputStream())
: archive.openWriter(this.prefix + "manifest.txt");
br = IOUtils.openFileForBufferedReading(this.geneFile);
final BedLineCodec bedCodec = new BedLineCodec();
for (; ; ) {
if (!futureResults.isEmpty()) {
int i = 0;
while (i < futureResults.size()) {
final Future<Integer> r = futureResults.get(i);
if (r.isCancelled()) {
LOG.error("Task was canceled. Break.");
return -1;
} else if (r.isDone()) {
futureResults.remove(i);
int rez = r.get();
if (rez != 0) {
LOG.error("Task Failed (" + rez + "). Break");
}
} else {
i++;
}
}
}
final String line = br.readLine();
if (line == null)
break;
if (line.startsWith("#") || line.isEmpty())
continue;
final BedLine bedLine = bedCodec.decode(line);
if (bedLine == null)
continue;
// ID
final String geneIdentifier = bedLine.get(3);
// name
final String geneName = bedLine.get(4);
final SourceType sourceType = SourceType.valueOf(bedLine.get(5));
final String filename = geneIdentifier;
final String outputVcfName = (filename.startsWith(this.prefix) ? "" : this.prefix) + filename + ".vcf" + (this.compress ? ".gz" : "");
LOG.info(bedLine.getContig() + ":" + bedLine.getStart() + "-" + bedLine.getEnd() + " length :" + (bedLine.getEnd() - bedLine.getStart()));
if (bedLine.getEnd() - bedLine.getStart() > 1E6) {
LOG.warn("That's a large region ! " + bedLine);
}
OutputStream vcfOutputStream = null;
VariantContextWriter vw = null;
int countVariants = 0;
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(vcfFileReader.getHeader()).logger(LOG).prefix(geneName + " " + bedLine.getContig() + ":" + bedLine.getStart() + "-" + bedLine.getEnd());
iter = vcfFileReader.query(bedLine.getContig(), bedLine.getStart(), bedLine.getEnd());
while (iter.hasNext()) {
VariantContext ctx = progress.watch(iter.next());
switch(sourceType) {
case SlidingVariants:
{
// nothing
break;
}
case SlidingContig:
{
// nothing
break;
}
case ANN_GeneName:
case ANN_FeatureID:
case ANN_GeneID:
{
final List<String> preds = new ArrayList<>();
for (final AnnPredictionParser.AnnPrediction pred : tools.getAnnPredictionParser().getPredictions(ctx)) {
final String predictionIdentifier;
switch(sourceType) {
case ANN_GeneName:
predictionIdentifier = pred.getGeneName();
break;
case ANN_FeatureID:
predictionIdentifier = pred.getFeatureId();
break;
case ANN_GeneID:
predictionIdentifier = pred.getGeneId();
break;
default:
throw new IllegalStateException(bedLine.toString());
}
if (StringUtil.isBlank(predictionIdentifier))
continue;
if (!geneName.equals(predictionIdentifier))
continue;
preds.add(pred.getOriginalAttributeAsString());
}
if (preds.isEmpty()) {
ctx = null;
} else {
ctx = new VariantContextBuilder(ctx).rmAttribute(tools.getAnnPredictionParser().getTag()).attribute(tools.getAnnPredictionParser().getTag(), preds).make();
}
break;
}
case VEP_Gene:
case VEP_Feature:
case VEP_Symbol:
case VEP_HgncId:
{
final List<String> preds = new ArrayList<>();
for (final VepPredictionParser.VepPrediction pred : tools.getVepPredictions(ctx)) {
final String predictionIdentifier;
switch(sourceType) {
case VEP_Gene:
predictionIdentifier = pred.getGene();
break;
case VEP_Feature:
predictionIdentifier = pred.getFeature();
break;
case VEP_Symbol:
predictionIdentifier = pred.getSymbol();
break;
case VEP_HgncId:
predictionIdentifier = pred.getHgncId();
break;
default:
throw new IllegalStateException(bedLine.toString());
}
if (StringUtil.isBlank(predictionIdentifier))
continue;
if (!geneName.equals(predictionIdentifier))
continue;
preds.add(pred.getOriginalAttributeAsString());
}
if (preds.isEmpty()) {
ctx = null;
} else {
ctx = new VariantContextBuilder(ctx).rmAttribute(tools.getVepPredictionParser().getTag()).attribute(tools.getVepPredictionParser().getTag(), preds).make();
}
break;
}
default:
throw new IllegalStateException(bedLine.toString());
}
if (ctx == null)
continue;
if (vcfOutputStream == null) {
LOG.info(filename);
manifest.println(outputVcfName);
final VCFHeader header = new VCFHeader(vcfFileReader.getHeader());
header.addMetaDataLine(new VCFHeaderLine(VCF_HEADER_SPLITKEY, filename));
vcfOutputStream = archive.openOuputStream(outputVcfName);
vw = VCFUtils.createVariantContextWriterToOutputStream(vcfOutputStream);
vw.writeHeader(header);
}
countVariants++;
vw.add(ctx);
if (countVariants % 1000 == 0) {
LOG.info("Loading : " + geneIdentifier + " N=" + countVariants);
}
}
progress.finish();
LOG.info(geneIdentifier + " N=" + countVariants);
if (vcfOutputStream != null) {
vw.close();
vcfOutputStream.flush();
vcfOutputStream.close();
vw = null;
if (!this.exec.isEmpty()) {
final Callable<Integer> callable = () -> {
final File vcfOutFile = new File(this.outputFile, outputVcfName);
IOUtil.assertFileIsReadable(vcfOutFile);
final String vcfPath = vcfOutFile.getPath();
final StringTokenizer st = new StringTokenizer(this.exec);
final List<String> command = new ArrayList<>(1 + st.countTokens());
while (st.hasMoreTokens()) {
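// substitute the __XXX__ placeholders in the user command with values for the current gene and its extracted VCF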
String token = st.nextToken()
.replaceAll("__PREFIX__", this.prefix)
.replaceAll("__CONTIG__", bedLine.getContig())
.replaceAll("__CHROM__", bedLine.getContig())
.replaceAll("__ID__", geneIdentifier)
.replaceAll("__NAME__", geneName)
.replaceAll("__START__", String.valueOf(bedLine.getStart()))
.replaceAll("__END__", String.valueOf(bedLine.getEnd()))
.replaceAll("__SOURCE__", sourceType.name())
.replaceAll("__VCF__", vcfPath);
command.add(token);
}
LOG.info(command.stream().map(S -> "'" + S + "'").collect(Collectors.joining(" ")));
final ProcessBuilder pb = new ProcessBuilder(command);
pb.redirectErrorStream(true);
final Process p = pb.start();
final Thread stdoutThread = new Thread(() -> {
try {
InputStream in = p.getInputStream();
IOUtils.copyTo(in, stdout());
} catch (Exception err) {
LOG.error(err);
}
});
stdoutThread.start();
int exitValue = p.waitFor();
if (exitValue != 0) {
LOG.error("Command failed (" + exitValue + "):" + String.join(" ", command));
return -1;
} else {
if (deleteAfterCommand) {
if (!vcfOutFile.delete()) {
LOG.warn("Cannot delete " + vcfOutFile);
}
}
return 0;
}
};
if (executorService != null) {
final Future<Integer> rez = executorService.submit(callable);
futureResults.add(rez);
} else {
final int ret = callable.call();
if (ret != 0) {
LOG.error("Error with process (" + ret + ")");
return ret;
}
}
}
} else {
manifest.println("#" + filename);
LOG.warn("No Variant Found for " + line);
}
iter.close();
}
if (executorService != null) {
LOG.info("shutdown");
executorService.shutdown();
executorService.awaitTermination(365, TimeUnit.DAYS);
}
br.close();
br = null;
manifest.close();
archive.close();
archive = null;
LOG.info("Done");
}
vcfFileReader.close();
vcfFileReader = null;
return 0;
} catch (Exception e) {
LOG.error(e);
return -1;
} finally {
{
CloserUtil.close(iter2);
CloserUtil.close(iter);
CloserUtil.close(pw);
CloserUtil.close(vcfFileReader);
CloserUtil.close(br);
CloserUtil.close(archive);
}
}
}
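The ContigSlidingWindow arithmetic above can be checked in isolation. A tiny standalone sketch with hypothetical values (not part of the tool): a variant at position 2345 falls into every window whose start is at most 2345 and whose end reaches it.
public static void main(final String[] args) {
final int winLength = 1000, winShift = 500, pos = 2345;
for (int start = 0; start <= pos; start += winShift) {
if (start + winLength >= pos) {
System.out.println("window " + start + "-" + (start + winLength));
}
}
// prints: window 1500-2500, window 2000-3000
}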
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class Biostar78285 method doWork.
@Override
public int doWork(final List<String> args) {
if (this.gc_percent_window < 1) {
LOG.error("Bad GC% window size:" + this.gc_percent_window);
return -1;
}
final List<File> bamFiles = IOUtil.unrollFiles(args.stream().map(F -> new File(F)).collect(Collectors.toCollection(HashSet::new)), ".bam");
SAMSequenceDictionary dict = null;
final List<SamReader> samReaders = new ArrayList<>();
final List<CloseableIterator<SAMRecord>> samIterators = new ArrayList<>();
final TreeSet<String> samples = new TreeSet<>();
final String DEFAULT_PARTITION = "UNDEFINED_PARTITION";
ReferenceSequenceFile indexedFastaSequenceFile = null;
VariantContextWriter out = null;
try {
final SamReaderFactory samReaderFactory = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.LENIENT);
for (final File bamFile : bamFiles) {
LOG.info("Opening " + bamFile);
final SamReader samReader = samReaderFactory.open(bamFile);
samReaders.add(samReader);
final SAMFileHeader header = samReader.getFileHeader();
if (header == null) {
LOG.error("No header in " + bamFile);
return -1;
}
JvarkitException.BamBadSortOrder.verify(SortOrder.coordinate, header);
samples.addAll(header.getReadGroups().stream().map(RG -> this.partition.apply(RG, DEFAULT_PARTITION)).collect(Collectors.toSet()));
final SAMSequenceDictionary currDict = header.getSequenceDictionary();
if (currDict == null) {
LOG.error("SamFile doesn't contain a SAMSequenceDictionary : " + bamFile);
return -1;
}
if (dict == null) {
dict = currDict;
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, currDict)) {
LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(dict, currDict));
return -1;
}
}
if (samReaders.isEmpty()) {
LOG.error("no bam");
return -1;
}
if (dict == null) {
LOG.error("no dictionary");
return -1;
}
final QueryInterval[] intervals;
if (this.captureBed != null) {
LOG.info("Opening " + this.captureBed);
ContigNameConverter.setDefaultAliases(dict);
final List<QueryInterval> L = new ArrayList<>();
try (BedLineReader li = new BedLineReader(this.captureBed)) {
while (li.hasNext()) {
final BedLine bed = li.next();
if (bed == null)
continue;
final QueryInterval q = bed.toQueryInterval(dict);
L.add(q);
}
}
intervals = QueryInterval.optimizeIntervals(L.toArray(new QueryInterval[L.size()]));
} else {
intervals = null;
}
for (final SamReader samReader : samReaders) {
LOG.info("querying " + samReader.getResourceDescription());
final CloseableIterator<SAMRecord> iter;
if (intervals == null) {
iter = samReader.iterator();
} else {
iter = samReader.queryOverlapping(intervals);
}
samIterators.add(new FilterIterator<SAMRecord>(iter, R -> !R.getReadUnmappedFlag() && !filter.filterOut(R)));
}
if (this.refFile != null) {
indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.refFile);
final SAMSequenceDictionary refdict = SequenceDictionaryUtils.extractRequired(indexedFastaSequenceFile);
ContigNameConverter.setDefaultAliases(refdict);
if (!SequenceUtil.areSequenceDictionariesEqual(dict, refdict)) {
LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(dict, refdict));
return -1;
}
}
out = openVariantContextWriter(this.outputFile);
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY);
metaData.add(new VCFFormatHeaderLine("DF", 1, VCFHeaderLineType.Integer, "Number of Reads on plus strand"));
metaData.add(new VCFFormatHeaderLine("DR", 1, VCFHeaderLineType.Integer, "Number of Reads on minus strand"));
metaData.add(new VCFInfoHeaderLine("AVG_DP", 1, VCFHeaderLineType.Float, "Mean depth"));
metaData.add(new VCFInfoHeaderLine("MEDIAN_DP", 1, VCFHeaderLineType.Float, "Median depth"));
metaData.add(new VCFInfoHeaderLine("MIN_DP", 1, VCFHeaderLineType.Integer, "Min depth"));
metaData.add(new VCFInfoHeaderLine("MAX_DP", 1, VCFHeaderLineType.Integer, "Max depth"));
metaData.add(new VCFHeaderLine(Biostar78285.class.getSimpleName() + ".SamFilter", this.filter.toString()));
for (final Integer treshold : this.minDepthTresholds) {
metaData.add(new VCFFilterHeaderLine("DP_LT_" + treshold, "All genotypes have DP< " + treshold));
metaData.add(new VCFInfoHeaderLine("NUM_DP_LT_" + treshold, 1, VCFHeaderLineType.Integer, "Number of genotypes having DP< " + treshold));
metaData.add(new VCFInfoHeaderLine("FRACT_DP_LT_" + treshold, 1, VCFHeaderLineType.Float, "Fraction of genotypes having DP< " + treshold));
}
if (indexedFastaSequenceFile != null) {
metaData.add(new VCFInfoHeaderLine("GC_PERCENT", 1, VCFHeaderLineType.Integer, "GC% window_size:" + this.gc_percent_window));
}
final List<Allele> refAlleles = Collections.singletonList(Allele.create("N", true));
final List<Allele> NO_CALLS = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
vcfHeader.setSequenceDictionary(dict);
out.writeHeader(vcfHeader);
final SAMRecordCoordinateComparator samRecordCoordinateComparator = new SAMRecordCoordinateComparator();
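// merge the per-BAM coordinate-sorted iterators into one sorted stream, then make it peekable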
final PeekableIterator<SAMRecord> peekIter = new PeekableIterator<>(new MergingIterator<>((R1, R2) -> samRecordCoordinateComparator.fileOrderCompare(R1, R2), samIterators));
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
for (final SAMSequenceRecord ssr : dict.getSequences()) {
final IntervalTree<Boolean> capturePos;
if (intervals != null) {
if (!Arrays.stream(intervals).anyMatch(I -> I.referenceIndex == ssr.getSequenceIndex())) {
continue;
}
capturePos = new IntervalTree<>();
Arrays.stream(intervals).filter(I -> I.referenceIndex == ssr.getSequenceIndex()).forEach(I -> capturePos.put(I.start, I.end, true));
} else {
capturePos = null;
}
final GenomicSequence genomicSequence;
if (indexedFastaSequenceFile != null && indexedFastaSequenceFile.getSequenceDictionary().getSequence(ssr.getSequenceName()) != null) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, ssr.getSequenceName());
} else {
genomicSequence = null;
}
final List<SAMRecord> buffer = new ArrayList<>();
for (int ssr_pos = 1; ssr_pos <= ssr.getSequenceLength(); ++ssr_pos) {
if (capturePos != null && !capturePos.overlappers(ssr_pos, ssr_pos).hasNext())
continue;
progress.watch(ssr.getSequenceName(), ssr_pos);
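// pull reads from the merged stream into the buffer until the next read starts beyond ssr_pos or lies on a later contig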
while (peekIter.hasNext()) {
final SAMRecord rec = peekIter.peek();
if (rec.getReadUnmappedFlag()) {
// consume
peekIter.next();
continue;
}
if (this.filter.filterOut(rec)) {
// consume
peekIter.next();
continue;
}
if (rec.getReferenceIndex() < ssr.getSequenceIndex()) {
throw new IllegalStateException("should not happen");
}
if (rec.getReferenceIndex() > ssr.getSequenceIndex()) {
break;
}
if (rec.getAlignmentEnd() < ssr_pos) {
throw new IllegalStateException("should not happen");
}
if (rec.getAlignmentStart() > ssr_pos) {
break;
}
buffer.add(peekIter.next());
}
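// evict buffered reads that no longer overlap the current position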
int x = 0;
while (x < buffer.size()) {
final SAMRecord R = buffer.get(x);
if (R.getReferenceIndex() != ssr.getSequenceIndex() || R.getAlignmentEnd() < ssr_pos) {
buffer.remove(x);
} else {
x++;
}
}
final Map<String, PosInfo> count = samples.stream().map(S -> new PosInfo(S)).collect(Collectors.toMap(P -> P.sample, Function.identity()));
for (final SAMRecord rec : buffer) {
if (rec.getReferenceIndex() != ssr.getSequenceIndex())
throw new IllegalStateException("should not happen");
if (rec.getAlignmentEnd() < ssr_pos)
continue;
if (rec.getAlignmentStart() > ssr_pos)
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null)
continue;
int refpos = rec.getAlignmentStart();
final String sample = this.partition.getPartion(rec, DEFAULT_PARTITION);
for (final CigarElement ce : cigar.getCigarElements()) {
if (refpos > ssr_pos)
break;
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
if (refpos <= ssr_pos && ssr_pos <= refpos + ce.getLength()) {
final PosInfo posInfo = count.get(sample);
if (posInfo != null) {
posInfo.dp++;
if (rec.getReadNegativeStrandFlag()) {
posInfo.negative_strand++;
}
}
break;
}
}
refpos += ce.getLength();
}
}
}
final VariantContextBuilder vcb = new VariantContextBuilder();
final Set<String> filters = new HashSet<>();
vcb.chr(ssr.getSequenceName());
vcb.start(ssr_pos);
vcb.stop(ssr_pos);
if (genomicSequence == null) {
vcb.alleles(refAlleles);
} else {
vcb.alleles(Collections.singletonList(Allele.create((byte) genomicSequence.charAt(ssr_pos - 1), true)));
final GenomicSequence.GCPercent gcp = genomicSequence.getGCPercent(Math.max((ssr_pos - 1) - this.gc_percent_window, 0), Math.min(ssr_pos + this.gc_percent_window, ssr.getSequenceLength()));
if (!gcp.isEmpty()) {
vcb.attribute("GC_PERCENT", gcp.getGCPercentAsInteger());
}
}
vcb.attribute(VCFConstants.DEPTH_KEY, count.values().stream().mapToInt(S -> S.dp).sum());
vcb.genotypes(count.values().stream().map(C -> new GenotypeBuilder(C.sample, NO_CALLS).DP(C.dp).attribute("DR", C.negative_strand).attribute("DF", C.dp - C.negative_strand).make()).collect(Collectors.toList()));
for (final Integer treshold : this.minDepthTresholds) {
final int count_lt = (int) count.values().stream().filter(S -> S.dp < treshold).count();
if (count_lt == samples.size()) {
filters.add("DP_LT_" + treshold);
}
vcb.attribute("NUM_DP_LT_" + treshold, count_lt);
if (!samples.isEmpty()) {
vcb.attribute("FRACT_DP_LT_" + treshold, count_lt / (float) samples.size());
}
}
if (!samples.isEmpty()) {
final int[] array = count.values().stream().mapToInt(S -> S.dp).toArray();
vcb.attribute("AVG_DP", Percentile.average().evaluate(array).getAsDouble());
vcb.attribute("MEDIAN_DP", Percentile.median().evaluate(array).getAsDouble());
vcb.attribute("MIN_DP", (int) Percentile.min().evaluate(array).getAsDouble());
vcb.attribute("MAX_DP", (int) Percentile.max().evaluate(array).getAsDouble());
}
if (filters.isEmpty()) {
vcb.passFilters();
} else {
vcb.filters(filters);
}
out.add(vcb.make());
}
}
progress.finish();
peekIter.close();
out.close();
out = null;
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
CloserUtil.close(samIterators);
CloserUtil.close(samReaders);
CloserUtil.close(indexedFastaSequenceFile);
}
}
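The per-sample depth above reduces to a cigar walk asking whether a read has an aligned base on a given reference position. A standalone sketch of that walk (a simplification, not verbatim code from the tool):
// does this read have an aligned (M/EQ/X) base on 1-based reference position pos?
static boolean coversPosition(final SAMRecord rec, final int pos) {
int refpos = rec.getAlignmentStart();
for (final CigarElement ce : rec.getCigar().getCigarElements()) {
if (refpos > pos) break;
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases() && refpos <= pos && pos < refpos + ce.getLength()) {
return true;
}
refpos += ce.getLength(); // M/EQ/X and D/N advance the reference
}
}
return false;
}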
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class MakeMiniBam method doWork.
@Override
public int doWork(final List<String> args) {
ArchiveFactory archive = null;
int id_generator = 0;
final Set<String> outputFileNames = new HashSet<>();
try {
if (StringUtils.isBlank(this.filePrefix) || this.filePrefix.equals("now")) {
final SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyyMMdd");
this.filePrefix = simpleDateFormat.format(new Date()) + ".";
}
if (!this.filePrefix.endsWith(".")) {
this.filePrefix += ".";
}
IOUtil.assertDirectoryIsWritable(tmpDir);
final List<Path> bamFiles = IOUtils.unrollPaths(args);
if (bamFiles.isEmpty()) {
LOG.error("no bam file defined");
return -1;
}
final List<Locatable> locatables = this.intervalListProvider.enableBreakEndInterval(!disable_sv_bnd).enableSinglePoint().stream().collect(Collectors.toList());
if (locatables.isEmpty()) {
LOG.error("no position defined");
return -1;
}
final SAMFileWriterFactory swf = new SAMFileWriterFactory();
swf.setCompressionLevel(9);
swf.setCreateIndex(true);
final SamReaderFactory srf = super.createSamReaderFactory();
if (this.referencePath != null)
srf.referenceSequence(this.referencePath);
archive = ArchiveFactory.open(this.outputFile);
archive.setCompressionLevel(Deflater.NO_COMPRESSION);
for (final Path bamFile : bamFiles) {
LOG.info(bamFile.toString());
final StopWatch stopWatch = new StopWatch();
stopWatch.start();
final SamReader sr = srf.open(bamFile);
if (!sr.hasIndex()) {
sr.close();
LOG.error("file " + bamFile + " is not indexed.");
return -1;
}
final SAMFileHeader header = sr.getFileHeader();
if (!header.getSortOrder().equals(SortOrder.coordinate)) {
sr.close();
LOG.error("file " + bamFile + " is not sorted on coordinate.");
return -1;
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final Optional<String> dictLabel = SequenceDictionaryUtils.getBuildName(dict);
final String labelSuffix = (dictLabel.isPresent() ? "." + dictLabel.get() : "")
+ (locatables.size() == 1
? "." + locatables.get(0).getContig() + "_" + locatables.get(0).getStart()
+ (locatables.get(0).getStart() == locatables.get(0).getEnd() ? "" : "_" + locatables.get(0).getEnd())
: "");
final ContigNameConverter ctgConvert = ContigNameConverter.fromOneDictionary(dict);
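// intervals longer than bound_edge are reduced to their two breakpoint positions, contig names are mapped into this BAM's dictionary, each interval is extended by 'extend' bases, and the result is merged into optimized QueryIntervals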
final QueryInterval[] array = locatables.stream().flatMap(loc -> {
if (this.bound_edge < 1 || loc.getLengthOnReference() <= this.bound_edge) {
return Collections.singletonList(loc).stream();
}
return Arrays.asList((Locatable) new SimpleInterval(loc.getContig(), loc.getStart(), loc.getStart()), (Locatable) new SimpleInterval(loc.getContig(), loc.getEnd(), loc.getEnd())).stream();
}).map(LOC -> {
final String contig = ctgConvert.apply(LOC.getContig());
if (StringUtils.isBlank(contig)) {
LOG.warn("Cannot find " + LOC.getContig() + " in " + bamFile);
return null;
}
final SAMSequenceRecord ssr = dict.getSequence(contig);
if (LOC.getStart() > ssr.getSequenceLength()) {
LOG.warn("pos " + LOC + " is greater than chromosome size " + ssr.getSequenceLength() + " in " + bamFile);
return null;
}
return new QueryInterval(ssr.getSequenceIndex(), Math.max(1, LOC.getStart() - extend), Math.min(ssr.getSequenceLength(), LOC.getEnd() + extend));
}).filter(Q -> Q != null).collect(HtsCollectors.optimizedQueryIntervals());
final Path tmpBam = Files.createTempFile(this.tmpDir, "tmp.", ".bam");
final SAMFileHeader header2 = header.clone();
final SAMProgramRecord prg = header2.createProgramRecord();
prg.setProgramName(this.getProgramName());
prg.setProgramVersion(this.getGitHash());
prg.setCommandLine(this.getProgramCommandLine());
JVarkitVersion.getInstance().addMetaData(this, header2);
header2.addComment("MiniBam : Bam was " + bamFile);
try (SAMFileWriter sfw = swf.makeBAMWriter(header2, true, tmpBam)) {
if (array.length > 0) {
try (CloseableIterator<SAMRecord> ssr = sr.query(array, false)) {
while (ssr.hasNext()) {
final SAMRecord rec = ssr.next();
if (this.samRecordFilter.filterOut(rec))
continue;
rec.setAttribute(SAMTag.PG.name(), prg.getId());
sfw.addAlignment(rec);
}
}
}
}
final Path tmpBai = SamFiles.findIndex(tmpBam);
if (tmpBai == null || !Files.exists(tmpBai)) {
LOG.error("Cannot find tmp bai Index for " + bamFile + " " + tmpBam);
return -1;
}
final String sampleName1 = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(null);
final String sampleName;
if (!StringUtils.isBlank(sampleName1)) {
sampleName = sampleName1;
} else if (this.allow_no_sample) {
sampleName = IOUtils.getFilenameWithoutSuffix(bamFile, 1);
LOG.warn("No Read group in " + bamFile + " using filename : " + sampleName);
} else {
throw new IllegalArgumentException("No Sample found in " + bamFile + ". Use --no-samples option ?");
}
String filename = this.filePrefix + sampleName + labelSuffix;
while (outputFileNames.contains(filename)) {
filename = this.filePrefix + sampleName + "." + (id_generator++) + labelSuffix;
}
outputFileNames.add(filename);
archive.copyTo(tmpBam, filename + FileExtensions.BAM);
archive.copyTo(tmpBai, filename + IOUtils.getFileSuffix(tmpBai));
stopWatch.stop();
LOG.info("Added " + StringUtils.niceFileSize(Files.size(tmpBam)) + "(bam) and " + StringUtils.niceFileSize(Files.size(tmpBai)) + " (Bai). " + StringUtils.niceDuration(stopWatch.getElapsedTime()));
Files.deleteIfExists(tmpBam);
Files.deleteIfExists(tmpBai);
}
if (!StringUtils.isBlank(this.userComment)) {
try (final PrintWriter pw = archive.openWriter(this.filePrefix + (this.filePrefix.endsWith(".") ? "" : ".") + "README.md")) {
pw.append(this.userComment);
pw.println();
pw.println("## BAMS");
pw.println();
for (final Path bamFile : bamFiles) pw.println(" * " + bamFile);
pw.println();
pw.println("## Date");
pw.println();
pw.println(new SimpleDateFormat("yyyyMMdd").format(new Date()));
pw.flush();
}
}
archive.close();
archive = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(archive);
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class HmmMergeBed method doWork.
@Override
public int doWork(final List<String> args) {
PrintWriter pw = null;
try {
if (this.numTreshold < 1) {
LOG.error("bad treshold.");
return -1;
}
final Comparator<Base> locCompare;
if (this.dictPath != null) {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.dictPath);
locCompare = new ContigDictComparator(dict).createLocatableComparator();
} else {
locCompare = (A, B) -> {
int i = A.getContig().compareTo(B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
};
}
final List<Path> paths = IOUtils.unrollPaths(args);
if (this.numTreshold > paths.size()) {
LOG.error("bad treshold (> number of files).");
return -1;
}
if (paths.isEmpty()) {
LOG.error("empty input");
return -1;
}
final List<CloseableIterator<Base>> baseiterators = new ArrayList<>(paths.size());
for (final Path bedPath : paths) {
final BedLineIterator iter1 = new BedLineIterator(bedPath);
final BedLineToBaseIterator iter2 = new BedLineToBaseIterator(iter1);
baseiterators.add(iter2);
}
pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
final BaseMergerIterator bmiter = new BaseMergerIterator(baseiterators, locCompare);
while (bmiter.hasNext()) {
final BaseMerge bm = bmiter.next();
if (!bm.isValid()) {
if (this.hide_invalid)
continue;
pw.print("#");
}
pw.print(bm.contig);
pw.print('\t');
pw.print(bm.start - 1);
pw.print('\t');
pw.print(bm.end);
pw.print('\t');
pw.print(bm.getOpcode());
pw.println();
}
bmiter.close();
pw.flush();
pw.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}