use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class GroupByGene method read.
private void read(final String input) throws IOException {
SortingCollection<Call> sortingCollection = null;
VCFIterator vcfIterator = null;
try {
final BcfIteratorBuilder iterbuilder = new BcfIteratorBuilder();
vcfIterator = (input == null ? iterbuilder.open(stdin()) : iterbuilder.open(input));
final VCFHeader header = vcfIterator.getHeader();
this.contigDictComparator = new ContigDictComparator(SequenceDictionaryUtils.extractRequired(header));
sortingCollection = SortingCollection.newInstance(Call.class, new CallCodec(header), (C1, C2) -> C1.compare2(C2), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingCollection.setDestructiveIteration(true);
final GeneExtractorFactory geneExtractorFactory = new GeneExtractorFactory(header);
final List<GeneExtractorFactory.GeneExtractor> geneExtractors = geneExtractorFactory.parse(this.extractorsNames);
final List<String> sampleNames;
if (header.getSampleNamesInOrder() != null) {
sampleNames = header.getSampleNamesInOrder();
} else {
sampleNames = Collections.emptyList();
}
final Pedigree pedigree;
if (this.pedigreePath != null) {
final PedigreeParser pedParser = new PedigreeParser();
pedigree = pedParser.parse(this.pedigreePath);
} else {
pedigree = PedigreeParser.empty();
}
final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(header).logger(LOG).build();
while (vcfIterator.hasNext()) {
final VariantContext ctx = progress.apply(vcfIterator.next());
if (!ctx.isVariant())
continue;
if (ignore_filtered && ctx.isFiltered())
continue;
// simplify line
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.noID();
vcb.log10PError(VariantContext.NO_LOG10_PERROR);
vcb.unfiltered();
vcb.attributes(Collections.emptyMap());
final VariantContext ctx2 = vcb.make();
final SortingCollection<Call> finalSorter = sortingCollection;
geneExtractors.stream().flatMap(EX -> EX.apply(ctx).keySet().stream()).forEach(KG -> {
final Call c = new Call();
c.ctx = ctx2;
c.gene = new GeneName(KG.getKey(), KG.getGene(), KG.getMethod());
finalSorter.add(c);
});
}
CloserUtil.close(vcfIterator);
vcfIterator = null;
sortingCollection.doneAdding();
progress.close();
/**
* dump
*/
final Set<String> casesSamples = pedigree.getAffectedSamples().stream().map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> controlsSamples = pedigree.getUnaffectedSamples().stream().map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> maleSamples = pedigree.getSamples().stream().filter(P -> P.isMale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Set<String> femaleSamples = pedigree.getSamples().stream().filter(P -> P.isFemale()).map(P -> P.getId()).filter(ID -> sampleNames.contains(ID)).collect(Collectors.toSet());
final Predicate<Genotype> genotypeFilter = genotype -> {
if (!genotype.isAvailable())
return false;
if (!genotype.isCalled())
return false;
if (genotype.isNoCall())
return false;
if (genotype.isHomRef())
return false;
if (this.ignore_filtered_genotype && genotype.isFiltered())
return false;
return true;
};
PrintStream pw = openPathOrStdoutAsPrintStream(this.outFile);
pw.print("#chrom");
pw.print('\t');
pw.print("min.POS");
pw.print('\t');
pw.print("max.POS");
pw.print('\t');
pw.print("gene.name");
pw.print('\t');
pw.print("gene.label");
pw.print('\t');
pw.print("gene.type");
pw.print('\t');
pw.print("samples.affected");
pw.print('\t');
pw.print("count.variations");
if (this.print_positions) {
pw.print('\t');
pw.print("positions");
}
if (!casesSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.cases");
}
if (!controlsSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.controls");
}
if (!maleSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.males");
}
if (!femaleSamples.isEmpty()) {
pw.print('\t');
pw.print("pedigree.females");
}
if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
pw.print('\t');
pw.print("fisher");
}
for (final String sample : sampleNames) {
pw.print('\t');
pw.print(sample);
}
pw.println();
final CloseableIterator<Call> iter = sortingCollection.iterator();
final EqualRangeIterator<Call> eqiter = new EqualRangeIterator<>(iter, (C1, C2) -> C1.compareTo(C2));
while (eqiter.hasNext()) {
final List<Call> row = eqiter.next();
final Call first = row.get(0);
final List<VariantContext> variantList = row.stream().map(R -> R.ctx).collect(Collectors.toList());
final int minPos = variantList.stream().mapToInt(R -> R.getStart()).min().getAsInt();
final int maxPos = variantList.stream().mapToInt(R -> R.getEnd()).max().getAsInt();
final Set<String> sampleCarryingMut = new HashSet<String>();
final Counter<String> pedCasesCarryingMut = new Counter<String>();
final Counter<String> pedCtrlsCarryingMut = new Counter<String>();
final Counter<String> malesCarryingMut = new Counter<String>();
final Counter<String> femalesCarryingMut = new Counter<String>();
final Counter<String> sample2count = new Counter<String>();
for (final VariantContext ctx : variantList) {
for (final Genotype genotype : ctx.getGenotypes()) {
if (!genotypeFilter.test(genotype))
continue;
final String sampleName = genotype.getSampleName();
sample2count.incr(sampleName);
sampleCarryingMut.add(sampleName);
if (casesSamples.contains(sampleName)) {
pedCasesCarryingMut.incr(sampleName);
}
if (controlsSamples.contains(sampleName)) {
pedCtrlsCarryingMut.incr(sampleName);
}
if (maleSamples.contains(sampleName)) {
malesCarryingMut.incr(sampleName);
}
if (femaleSamples.contains(sampleName)) {
femalesCarryingMut.incr(sampleName);
}
}
}
pw.print(first.getContig());
pw.print('\t');
// convert to bed
pw.print(minPos - 1);
pw.print('\t');
pw.print(maxPos);
pw.print('\t');
pw.print(first.gene.name);
pw.print('\t');
pw.print(first.gene.label);
pw.print('\t');
pw.print(first.gene.type);
pw.print('\t');
pw.print(sampleCarryingMut.size());
pw.print('\t');
pw.print(variantList.size());
if (this.print_positions) {
pw.print('\t');
pw.print(variantList.stream().map(CTX -> String.valueOf(CTX.getStart()) + ":" + CTX.getAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining("/"))).collect(Collectors.joining(";")));
}
if (!casesSamples.isEmpty()) {
pw.print('\t');
pw.print(pedCasesCarryingMut.getCountCategories());
}
if (!controlsSamples.isEmpty()) {
pw.print('\t');
pw.print(pedCtrlsCarryingMut.getCountCategories());
}
if (!maleSamples.isEmpty()) {
pw.print('\t');
pw.print(malesCarryingMut.getCountCategories());
}
if (!femaleSamples.isEmpty()) {
pw.print('\t');
pw.print(femalesCarryingMut.getCountCategories());
}
if (this.print_fisher && !controlsSamples.isEmpty() && !casesSamples.isEmpty()) {
int count_case_mut = 0;
int count_ctrl_mut = 0;
int count_case_wild = 0;
int count_ctrl_wild = 0;
for (final String sampleName : header.getSampleNamesInOrder()) {
final boolean has_mutation = variantList.stream().map(V -> V.getGenotype(sampleName)).anyMatch(G -> G != null && genotypeFilter.test(G));
if (controlsSamples.contains(sampleName)) {
if (has_mutation) {
count_ctrl_mut++;
} else {
count_ctrl_wild++;
}
} else if (casesSamples.contains(sampleName)) {
if (has_mutation) {
count_case_mut++;
} else {
count_case_wild++;
}
}
}
final FisherExactTest fisher = FisherExactTest.compute(count_case_mut, count_case_wild, count_ctrl_mut, count_ctrl_wild);
pw.print('\t');
pw.print(fisher.getAsDouble());
}
for (final String sample : sampleNames) {
pw.print('\t');
pw.print(sample2count.count(sample));
}
pw.println();
if (pw.checkError())
break;
}
eqiter.close();
iter.close();
pw.flush();
if (this.outFile != null)
pw.close();
} finally {
CloserUtil.close(vcfIterator);
if (sortingCollection != null)
sortingCollection.cleanup();
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class ExpansionHunterMerge method doWork.
@Override
public int doWork(List<String> args) {
SortingCollection<VariantContext> sorter = null;
try {
final List<Path> inputs = IOUtils.unrollPaths(args);
if (inputs.isEmpty()) {
LOG.info("no input file.");
return -1;
}
final String REPID = "REPID";
final String RU = "RU";
final String fakeSample = "___FAKE";
SAMSequenceDictionary dict = null;
final Set<String> samples = new TreeSet<>();
final Set<VCFHeaderLine> metaData = new HashSet<>();
for (final Path path : inputs) {
try (VCFReader r = VCFReaderFactory.makeDefault().open(path, false)) {
final VCFHeader header = r.getHeader();
if (header.getInfoHeaderLine(REPID) == null) {
LOG.error("missing INFO/" + REPID);
return -1;
}
final SAMSequenceDictionary d = header.getSequenceDictionary();
if (d != null) {
if (dict == null) {
dict = d;
} else {
SequenceUtil.assertSequenceDictionariesEqual(d, dict);
}
}
if (header.getNGenotypeSamples() != 1) {
LOG.error("expected one and only one genotyped sample in " + path);
return -1;
}
final String sn = header.getSampleNamesInOrder().get(0);
if (sn.equals(fakeSample)) {
LOG.error(sn + " cannot be named " + fakeSample + " in " + path);
return -1;
}
if (samples.contains(sn)) {
LOG.error("duplicate sample " + sn + " in " + path);
return -1;
}
metaData.addAll(header.getMetaDataInInputOrder());
samples.add(sn);
}
}
final VCFHeader tmpHeader = new VCFHeader(metaData, Arrays.asList(fakeSample));
final VCFInfoHeaderLine sampleInfo = new VCFInfoHeaderLine("SRCSAMPLE", 1, VCFHeaderLineType.String, "SRC SAMPLE");
final Comparator<String> ctgComparator = dict == null ? (A, B) -> A.compareTo(B) : new ContigDictComparator(dict);
final BiFunction<VariantContext, String, String> getAtt = (V, A) -> {
final String s1 = V.getAttributeAsString(A, "");
if (StringUtils.isBlank(s1))
throw new IllegalStateException("INFO/" + A + " missing in " + V);
return s1;
};
final Comparator<VariantContext> comparator1 = (V1, V2) -> {
String s1 = V1.getContig();
String s2 = V2.getContig();
int i = ctgComparator.compare(s1, s2);
if (i != 0)
return i;
i = Integer.compare(V1.getStart(), V2.getStart());
if (i != 0)
return i;
s1 = getAtt.apply(V1, REPID);
s2 = getAtt.apply(V2, REPID);
return s1.compareTo(s2);
};
final Comparator<VariantContext> comparator2 = (V1, V2) -> {
final int i = comparator1.compare(V1, V2);
if (i != 0)
return i;
final String s1 = getAtt.apply(V1, sampleInfo.getID());
final String s2 = getAtt.apply(V2, sampleInfo.getID());
return s1.compareTo(s2);
};
tmpHeader.addMetaDataLine(sampleInfo);
sorter = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(tmpHeader, false), comparator2, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sorter.setDestructiveIteration(true);
for (final Path path : inputs) {
LOG.info("adding " + path);
try (VCFReader r = VCFReaderFactory.makeDefault().open(path, false)) {
try (CloseableIterator<VariantContext> iter = r.iterator()) {
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
final Genotype gt0 = ctx.getGenotype(0);
final Genotype gt = new GenotypeBuilder(gt0).name(fakeSample).make();
vcb.genotypes(Collections.singletonList(gt));
vcb.attribute(sampleInfo.getID(), gt0.getSampleName());
sorter.add(vcb.make());
}
}
}
}
sorter.doneAdding();
final VCFHeader outputHeader = new VCFHeader(metaData, samples);
if (dict != null)
outputHeader.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, outputHeader);
try (VariantContextWriter w = writingVariants.dictionary(dict).open(this.outputFile)) {
w.writeHeader(outputHeader);
try (CloseableIterator<VariantContext> iter = sorter.iterator()) {
final EqualRangeIterator<VariantContext> eq = new EqualRangeIterator<>(iter, comparator1);
while (eq.hasNext()) {
final List<VariantContext> calls = eq.next();
final VariantContext first = calls.get(0);
final Set<Allele> altAllelesSet = calls.stream().flatMap(V -> V.getGenotypes().stream()).flatMap(GT -> GT.getAlleles().stream()).filter(A -> !(A.isReference() || A.isNoCall())).collect(Collectors.toSet());
if (altAllelesSet.isEmpty())
continue;
final List<Allele> altAllelesList = new ArrayList<>(altAllelesSet);
final List<Allele> vcAlleles = new ArrayList<>(altAllelesList.size() + 1);
vcAlleles.add(first.getReference());
vcAlleles.addAll(altAllelesList);
final List<Genotype> genotypes = new ArrayList<>(samples.size());
for (final String sn : samples) {
final VariantContext vcs = calls.stream().filter(V -> getAtt.apply(V, sampleInfo.getID()).equals(sn)).findFirst().orElse(null);
final Genotype gt;
if (vcs == null) {
gt = GenotypeBuilder.createMissing(sn, 2);
} else {
gt = new GenotypeBuilder(vcs.getGenotype(0)).name(sn).make();
}
genotypes.add(gt);
}
final VariantContextBuilder vcb = new VariantContextBuilder(null, first.getContig(), first.getStart(), first.getEnd(), vcAlleles);
vcb.attribute(VCFConstants.END_KEY, first.getEnd());
vcb.attribute(REPID, getAtt.apply(first, REPID));
vcb.attribute(RU, getAtt.apply(first, RU));
vcb.genotypes(genotypes);
w.add(vcb.make());
}
eq.close();
}
}
sorter.cleanup();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class DepthOfCoverage method doWork.
@Override
public int doWork(final List<String> args) {
PrintWriter out = null;
if (this.auto_mask && this.faidx == null) {
LOG.error("Cannot auto mask if REF is not defined");
return -1;
}
if (this.maskBed != null && this.includeBed != null) {
LOG.error("both --mask and --bed both defined");
return -1;
}
ReferenceSequenceFile referenceSequenceFile = null;
try {
final Predicate<String> isRejectContigRegex;
if (!StringUtils.isBlank(this.skipContigExpr)) {
final Pattern pat = Pattern.compile(this.skipContigExpr);
isRejectContigRegex = S -> pat.matcher(S).matches();
} else {
isRejectContigRegex = S -> false;
}
final SamReaderFactory srf = super.createSamReaderFactory();
if (this.faidx != null) {
srf.referenceSequence(this.faidx);
srf.setUseAsyncIo(this.asyncIo);
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
}
out = super.openPathOrStdoutAsPrintWriter(this.outputFile);
out.print("#BAM\tSample\tContig\tContig-Length\tMasked-Contig-Length\tCount\tDepth\tMedian\tMin\tMax\tMaxPos");
for (RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
if (r.getMinInclusive() == null)
continue;
out.print("\t");
out.print(r.toString());
}
out.println();
for (final Path path : IOUtils.unrollPaths(args)) {
try (final SamReader sr = srf.open(path)) {
if (!sr.hasIndex()) {
LOG.error("File " + path + " is not indexed.");
return -1;
}
final SAMFileHeader header = sr.getFileHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final Set<String> rejectContigSet = dict.getSequences().stream().map(SSR -> SSR.getSequenceName()).filter(isRejectContigRegex).collect(Collectors.toCollection(HashSet::new));
rejectContigSet.addAll(dict.getSequences().stream().filter(SSR -> SSR.getSequenceLength() < this.skipContigLength).map(SSR -> SSR.getSequenceName()).collect(Collectors.toCollection(HashSet::new)));
if (!header.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
LOG.error("file is not sorted on coordinate :" + header.getSortOrder() + " " + path);
return -1;
}
final QueryInterval[] intervals;
if (this.useBamIndexFlag && this.includeBed != null) {
if (!sr.hasIndex()) {
LOG.error("Bam is not indexed. " + path);
return -1;
}
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
final List<QueryInterval> L = new ArrayList<>();
try (BedLineReader br = new BedLineReader(this.includeBed)) {
while (br.hasNext()) {
final BedLine bed = br.next();
final String ctg = contigNameConverter.apply(bed.getContig());
if (StringUtils.isBlank(ctg))
continue;
final int tid = dict.getSequenceIndex(ctg);
if (tid < 0)
continue;
L.add(new QueryInterval(tid, bed.getStart(), bed.getEnd()));
}
}
intervals = QueryInterval.optimizeIntervals(L.toArray(new QueryInterval[L.size()]));
} else {
intervals = null;
}
Integer minCov = null;
Integer maxCov = null;
ContigPos maxCovPosition = null;
long count_raw_bases = 0L;
long count_bases = 0L;
long sum_coverage = 0L;
final DiscreteMedian<Integer> discreteMedian_wg = new DiscreteMedian<>();
final Counter<RangeOfIntegers.Range> countMap_wg = new Counter<>();
final String sample = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(path.toString());
int[] coverage = null;
String prevContig = null;
BitSet mask = null;
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
try (CloseableIterator<SAMRecord> iter = intervals == null ? sr.iterator() : sr.queryOverlapping(intervals)) {
for (; ; ) {
final SAMRecord rec = iter.hasNext() ? progress.apply(iter.next()) : null;
if (rec != null) {
if (!SAMRecordDefaultFilter.accept(rec, this.mapping_quality))
continue;
if (rejectContigSet.contains(rec.getContig()))
continue;
}
if (rec == null || !rec.getContig().equals(prevContig)) {
if (coverage != null) {
// DUMP
long count_bases_ctg = 0L;
long sum_coverage_ctg = 0L;
Integer minV_ctg = null;
Integer maxV_ctg = null;
ContigPos maxPos_ctg = null;
final DiscreteMedian<Integer> discreteMedian_ctg = new DiscreteMedian<>();
final Counter<RangeOfIntegers.Range> countMap_ctg = new Counter<>();
for (int i = 0; i < coverage.length; i++) {
if (mask.get(i))
continue;
final int covi = coverage[i];
if (covi > this.max_depth)
continue;
if (minV_ctg == null || minV_ctg.intValue() > covi)
minV_ctg = covi;
if (maxV_ctg == null || maxV_ctg.intValue() < covi) {
maxV_ctg = covi;
maxPos_ctg = new ContigPos(prevContig, i + 1);
}
countMap_ctg.incr(this.summaryCov.getRange(covi));
count_bases_ctg++;
sum_coverage_ctg += covi;
discreteMedian_ctg.add(covi);
}
out.print(path);
out.print("\t");
out.print(sample);
out.print("\t");
out.print(prevContig);
out.print("\t");
out.print(coverage.length);
out.print("\t");
out.print(count_bases_ctg);
out.print("\t");
out.print(sum_coverage_ctg);
out.print("\t");
if (count_bases_ctg > 0) {
out.printf("%.2f", sum_coverage_ctg / (double) count_bases_ctg);
} else {
out.print("N/A");
}
out.print("\t");
final OptionalDouble median = discreteMedian_ctg.getMedian();
if (median.isPresent()) {
out.print(median.getAsDouble());
} else {
out.print("N/A");
}
out.print("\t");
if (minV_ctg != null) {
out.print(minV_ctg);
} else {
out.print("N/A");
}
out.print("\t");
if (maxV_ctg != null) {
out.print(maxV_ctg);
out.print("\t");
out.print(maxPos_ctg);
} else {
out.print("N/A\tN/A");
}
for (final RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
if (r.getMinInclusive() == null)
continue;
out.print("\t");
out.print(countMap_ctg.count(r));
if (!countMap_ctg.isEmpty()) {
out.print(" ");
out.printf("(%.2f%%)", (countMap_ctg.count(r) / (countMap_ctg.getTotal() * 1.0)) * 100.0);
}
}
out.println();
if (minCov == null || (minV_ctg != null && minV_ctg.compareTo(minCov) < 0))
minCov = minV_ctg;
if (maxCov == null || (maxV_ctg != null && maxV_ctg.compareTo(maxCov) > 0)) {
maxCov = maxV_ctg;
maxCovPosition = maxPos_ctg;
}
count_bases += count_bases_ctg;
sum_coverage += sum_coverage_ctg;
count_raw_bases += coverage.length;
discreteMedian_wg.add(discreteMedian_ctg);
countMap_wg.putAll(countMap_ctg);
}
coverage = null;
mask = null;
// /
System.gc();
if (rec == null)
break;
final SAMSequenceRecord ssr = Objects.requireNonNull(dict.getSequence(rec.getContig()));
coverage = new int[ssr.getSequenceLength()];
mask = new BitSet(ssr.getSequenceLength());
if (this.auto_mask && referenceSequenceFile != null) {
final byte[] refSeq = Objects.requireNonNull(referenceSequenceFile.getSequence(ssr.getSequenceName())).getBases();
for (int i = 0; i < refSeq.length; i++) {
if (AcidNucleics.isATGC(refSeq[i]))
continue;
mask.set(i);
}
}
/* read mask */
if (this.maskBed != null) {
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
try (BedLineReader br = new BedLineReader(this.maskBed)) {
while (br.hasNext()) {
final BedLine bed = br.next();
if (bed == null)
continue;
String ctg = contigNameConverter.apply(bed.getContig());
if (StringUtils.isBlank(ctg))
continue;
if (!rec.getContig().equals(ctg))
continue;
for (int p1 = bed.getStart(); p1 <= bed.getEnd() && p1 <= coverage.length; ++p1) {
mask.set(p1 - 1);
}
}
}
} else if (this.includeBed != null) {
final List<Locatable> list = new ArrayList<>();
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
try (BedLineReader br = new BedLineReader(this.includeBed)) {
while (br.hasNext()) {
final BedLine bed = br.next();
if (bed == null)
continue;
final String ctg = contigNameConverter.apply(bed.getContig());
if (StringUtils.isBlank(ctg))
continue;
if (!rec.getContig().equals(ctg))
continue;
list.add(new SimpleInterval(ctg, bed.getStart(), bed.getEnd()));
}
}
// sort on starts
Collections.sort(list, (A, B) -> Integer.compare(A.getStart(), B.getStart()));
int p1 = 1;
while (p1 <= coverage.length) {
while (!list.isEmpty() && list.get(0).getEnd() < p1) {
list.remove(0);
}
if (!list.isEmpty() && list.get(0).getStart() <= p1 && p1 <= list.get(0).getEnd()) {
++p1;
continue;
}
mask.set(p1 - 1);
p1++;
}
}
prevContig = rec.getContig();
}
int max_end1 = coverage.length;
if (!this.disable_paired_overlap_flag && rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getReferenceIndex().equals(rec.getMateReferenceIndex()) && rec.getAlignmentStart() < rec.getMateAlignmentStart() && rec.getAlignmentEnd() > rec.getMateAlignmentStart()) {
max_end1 = rec.getMateAlignmentStart() - 1;
}
for (final AlignmentBlock block : rec.getAlignmentBlocks()) {
final int pos1 = block.getReferenceStart();
final int len = block.getLength();
for (int i = 0; i < len; i++) {
if (pos1 + i - 1 >= 0 && pos1 + i <= max_end1) {
coverage[pos1 + i - 1]++;
}
}
}
}
/* end rec */
}
/* end iter */
progress.close();
out.print(path);
out.print("\t");
out.print(sample);
out.print("\t");
out.print(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
out.print("\t");
out.print(count_raw_bases);
out.print("\t");
out.print(count_bases);
out.print("\t");
out.print(sum_coverage);
out.print("\t");
if (count_bases > 0) {
out.printf("%.2f", sum_coverage / (double) count_bases);
} else {
out.print("N/A");
}
out.print("\t");
final OptionalDouble median = discreteMedian_wg.getMedian();
if (median.isPresent()) {
out.print(median.getAsDouble());
} else {
out.print("N/A");
}
out.print("\t");
if (minCov != null) {
out.print(minCov);
} else {
out.print("N/A");
}
out.print("\t");
if (maxCov != null) {
out.print(maxCov + "\t" + maxCovPosition);
} else {
out.print("N/A\tN/A");
}
for (final RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
if (r.getMinInclusive() == null)
continue;
out.print("\t");
out.print(countMap_wg.count(r));
if (!countMap_wg.isEmpty()) {
out.print(" ");
out.printf("(%.2f%%)", (countMap_wg.count(r) / (countMap_wg.getTotal() * 1.0)) * 100.0);
}
}
out.println();
}
}
out.flush();
out.close();
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(referenceSequenceFile);
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class BuildDbsnp method doWork.
@Override
public int doWork(final List<String> args) {
try {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.faidx);
if (!StringUtils.isBlank(this.limitChrom)) {
if (dict.getSequence(this.limitChrom) == null) {
throw new JvarkitException.ContigNotFoundInDictionary(this.limitChrom, dict);
}
}
final List<VCFSource> sources = new ArrayList<>(args.size());
for (int i = 0; i < args.size(); i++) {
final String s = args.get(i);
final int colon = s.indexOf(':');
if (colon <= 0) {
LOG.error("colon missing in " + s + " expected vcf-name:vcf-path.");
return -1;
}
final String vcfName = s.substring(0, colon);
if (sources.stream().anyMatch(S -> S.name.equals(vcfName))) {
LOG.error("duplicate source name: \"" + vcfName + "\".");
return -1;
}
final Path path = Paths.get(s.substring(colon + 1));
IOUtil.assertFileIsReadable(path);
sources.add(new VCFSource(vcfName, path));
}
final Pattern rsIdPattern = Pattern.compile("[rR][sS][0-9]+");
try (VariantContextWriter w = writingVariantsDelegate.dictionary(dict).open(this.outputFile)) {
final VCFHeader header = new VCFHeader();
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
w.writeHeader(header);
for (final SAMSequenceRecord ssr : dict.getSequences()) {
if (!StringUtils.isBlank(this.limitChrom) && !ssr.getContig().equals(this.limitChrom))
continue;
LOG.info(ssr.getContig());
sources.stream().forEach(SRC -> SRC.reset(ssr));
final List<CloseableIterator<Variant>> iterators = sources.stream().map(SRC -> SRC).collect(Collectors.toList());
final MergingIterator<Variant> merger = new MergingIterator<>((A, B) -> Integer.compare(A.pos, B.pos), iterators);
final EqualRangeIterator<Variant> equal_range = new EqualRangeIterator<>(merger, (A, B) -> Integer.compare(A.pos, B.pos));
while (equal_range.hasNext()) {
final List<Variant> variants0 = equal_range.next();
// all references in this set of variant
final Set<Allele> all_refs = variants0.stream().map(V -> V.alleles.get(0)).collect(Collectors.toCollection(TreeSet::new));
// loop over each ref
for (final Allele ref_allele : all_refs) {
// all variants with this ref allele
final List<Variant> variants = variants0.stream().filter(V -> V.alleles.get(0).equals(ref_allele)).collect(Collectors.toList());
final Variant variant = variants.get(0);
final Set<Allele> altSet = new HashSet<>();
variants.stream().forEach(V -> altSet.addAll(V.alleles.subList(1, V.alleles.size())));
altSet.remove(Allele.SPAN_DEL);
final List<Allele> alleles = new ArrayList<>(1 + altSet.size());
alleles.add(ref_allele);
alleles.addAll(altSet);
final VariantContextBuilder vcb = new VariantContextBuilder(null, ssr.getContig(), variant.pos, variant.pos + variant.alleles.get(0).length() - 1, alleles);
String id = variants.stream().filter(F -> // vcf spec: Semicolon-separated list of unique identifiers where available
Arrays.stream(CharSplitter.SEMICOLON.split(F.id)).anyMatch(S -> rsIdPattern.matcher(S).matches())).map(F -> F.id).findFirst().orElse(null);
// not an rs ID
if (StringUtils.isBlank(id))
id = variants.stream().map(F -> F.id + (this.add_filter_suffix && F.filtered ? "_F" : "")).findFirst().orElse(null);
vcb.id(variant.id);
w.add(vcb.make());
}
}
equal_range.close();
merger.close();
iterators.stream().forEach(R -> R.close());
}
}
sources.stream().forEach(SRC -> SRC.close());
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class BamSliceBed method scanIterator.
@Override
protected void scanIterator(final SAMFileHeader headerIn, final CloseableIterator<SAMRecord> iter, final SAMFileWriter sfw) {
final SAMRecordFactory samRecordFactory = new DefaultSAMRecordFactory();
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
final Collection<Interval> beds = bedIntervals.getOverlapping(rec);
if (beds.isEmpty())
continue;
final List<Base> align = new ArrayList<>(rec.getLengthOnReference());
int refpos = rec.getUnclippedStart();
int readpos = 0;
final byte[] bases = rec.getReadBases();
if (bases == SAMRecord.NULL_SEQUENCE)
continue;
final String quals;
final boolean qual_is_available;
if (rec.getBaseQualities() == SAMRecord.NULL_QUALS) {
quals = StringUtils.repeatCharNTimes('#', bases.length);
qual_is_available = false;
} else {
quals = rec.getBaseQualityString();
qual_is_available = true;
}
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
switch(op) {
case P:
break;
case D:
case N:
{
for (int i = 0; i < ce.getLength(); ++i) {
final Base b = new Base();
b.cigaroperator = op;
b.refpos = refpos;
align.add(b);
refpos++;
}
break;
}
case H:
/* ignore hard clip */
{
refpos += ce.getLength();
break;
}
case S:
case X:
case EQ:
case M:
{
for (int i = 0; i < ce.getLength(); ++i) {
final Base b = new Base();
b.refpos = refpos;
b.cigaroperator = op;
b.readbase = bases[readpos];
b.readqual = quals.charAt(readpos);
b.readpos = readpos;
align.add(b);
readpos++;
refpos++;
}
break;
}
case I:
{
for (int i = 0; i < ce.getLength(); ++i) {
final Base b = new Base();
b.cigaroperator = op;
b.readbase = bases[readpos];
b.readqual = quals.charAt(readpos);
b.readpos = readpos;
align.add(b);
readpos++;
}
break;
}
default:
throw new IllegalStateException();
}
}
for (final Interval bed : beds) {
final List<Base> copy = new ArrayList<>(align);
/* trim 5 prime */
int trim = -1;
for (int x = 0; x < copy.size(); x++) {
final Base b = copy.get(x);
if (b.cigaroperator.isAlignment() && b.refpos >= bed.getStart()) {
trim = x;
break;
}
}
if (trim > 0) {
final List<Base> subList = copy.subList(0, trim);
if (use_clip) {
for (Base b : subList) {
if (b.readpos != -1) {
b.cigaroperator = CigarOperator.SOFT_CLIP;
}
}
// N D
subList.removeIf(B -> B.readpos == -1);
} else {
subList.clear();
}
}
/* trim 3 prime */
trim = -1;
for (int x = copy.size() - 1; x >= 0; x--) {
final Base b = copy.get(x);
if (b.cigaroperator.isAlignment() && b.refpos <= bed.getEnd()) {
trim = x;
break;
}
}
if (trim != -1) {
final List<Base> subList = copy.subList(trim + 1, copy.size());
if (use_clip) {
for (Base b : subList) {
if (b.readpos != -1) {
b.cigaroperator = CigarOperator.SOFT_CLIP;
}
}
// N D
subList.removeIf(B -> B.readpos == -1);
} else {
subList.clear();
}
}
if (copy.stream().noneMatch(P -> P.cigaroperator.isAlignment()))
continue;
if (copy.isEmpty())
continue;
int nrefpos = copy.stream().filter(B -> B.refpos != -1).filter(B -> !B.cigaroperator.isClipping()).mapToInt(B -> B.refpos).findFirst().orElse(-1);
final SAMRecord newrec = samRecordFactory.createSAMRecord(sfw.getFileHeader());
newrec.setReadName(rec.getReadName() + "#" + bed.getContig() + ":" + bed.getStart() + ":" + bed.getEnd());
newrec.setMappingQuality(SAMRecord.UNKNOWN_MAPPING_QUALITY);
newrec.setAlignmentStart(nrefpos);
newrec.setReadString(copy.stream().filter(B -> B.readbase != NO_BASE).map(B -> String.valueOf((char) B.readbase)).collect(Collectors.joining()));
if (qual_is_available) {
newrec.setBaseQualityString(copy.stream().filter(B -> B.readqual != NO_QUAL).map(B -> String.valueOf(B.readqual)).collect(Collectors.joining()));
} else {
newrec.setBaseQualities(SAMRecord.NULL_QUALS);
}
newrec.setReadUnmappedFlag(false);
newrec.setReadNegativeStrandFlag(rec.getReadNegativeStrandFlag());
newrec.setReferenceName(rec.getReferenceName());
newrec.setCigar(Cigar.fromCigarOperators(copy.stream().map(B -> B.cigaroperator).collect(Collectors.toList())));
if (rec.hasAttribute("RG")) {
newrec.setAttribute("RG", rec.getAttribute("RG"));
}
for (final String att : this.keepAttributes) {
if (rec.hasAttribute(att)) {
newrec.setAttribute(att, rec.getAttribute(att));
}
}
newrec.setAttribute("PG", this.spr.getId());
sfw.addAlignment(newrec);
}
// end for interval
}
// end while
}
Aggregations