use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class Biostar489074 method processInput.
@Override
protected int processInput(final SAMFileHeader header, final CloseableIterator<SAMRecord> iter0) {
VariantContextWriter out = null;
GenomicSequence genome = null;
SortingCollection<Call> sorting = null;
try {
this.indexedFastaRef = ReferenceSequenceFileFactory.getReferenceSequenceFile(getRequiredReferencePath());
if (!(header.getSortOrder().equals(SAMFileHeader.SortOrder.unsorted) || header.getSortOrder().equals(SAMFileHeader.SortOrder.queryname))) {
getLogger().error("input should be sorted with 'samtools sort -n' or 'samtools collate' but got " + header.getSortOrder());
return -1;
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
this.writingVariantsDelegate.dictionary(dict);
sorting = SortingCollection.newInstance(Call.class, new CallCodec(), (A, B) -> A.compare2(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sorting.setDestructiveIteration(true);
final List<String> samples = header.getReadGroups().stream().map(R -> this.groupBy.apply(R)).filter(S -> !StringUtils.isBlank(S)).sorted().collect(Collectors.toSet()).stream().collect(Collectors.toList());
final Map<String, Integer> rgid2idx = new HashMap<>();
for (final SAMReadGroupRecord rg : header.getReadGroups()) {
final String sn = this.groupBy.apply(rg);
if (StringUtils.isBlank(sn))
continue;
int idx = samples.indexOf(sn);
if (idx == -1)
continue;
rgid2idx.put(rg.getId(), idx);
}
try (CloseableIterator<List<SAMRecord>> iter = new EqualIterator<>(iter0, (A, B) -> A.getReadName().compareTo(B.getReadName()))) {
while (iter.hasNext()) {
final List<SAMRecord> buffer = iter.next();
int read1_idx = -1;
int read2_idx = -1;
for (int i = 0; i < buffer.size(); i++) {
final SAMRecord rec = buffer.get(i);
if (!rec.getReadPairedFlag())
continue;
if (rec.getReadUnmappedFlag())
continue;
if (rec.getMateUnmappedFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getReadBases() == SAMRecord.NULL_SEQUENCE)
continue;
if (rec.getFirstOfPairFlag()) {
read1_idx = i;
} else if (rec.getSecondOfPairFlag()) {
read2_idx = i;
}
}
if (read1_idx == -1 || read2_idx == -1 || read1_idx == read2_idx)
continue;
final SAMRecord rec1a = buffer.get(read1_idx);
final SAMRecord rec2a = buffer.get(read2_idx);
if (!rec1a.overlaps(rec2a))
continue;
final int chromStart = Math.max(rec1a.getStart(), rec2a.getStart());
final int chromEnd = Math.min(rec1a.getEnd(), rec2a.getEnd());
if (chromStart > chromEnd)
continue;
final Integer sampleid = rgid2idx.get(rec1a.getReadGroup().getId());
if (sampleid == null)
continue;
if (genome == null || !genome.contigsMatch(rec1a)) {
genome = new GenomicSequence(this.indexedFastaRef, rec1a.getContig());
}
final Set<Call> calls1 = new HashSet<>();
final Set<Call> calls2 = new HashSet<>();
for (int side = 0; side < 2; side++) {
final SAMRecord rec = (side == 0 ? rec1a : rec2a);
final Set<Call> calls = (side == 0 ? calls1 : calls2);
final byte[] bases = rec.getReadBases();
for (AlignmentBlock ab : rec.getAlignmentBlocks()) {
for (int n = 0; n < ab.getLength(); ++n) {
final int ref = ab.getReferenceStart() + n;
if (ref < chromStart)
continue;
if (ref > chromEnd)
break;
if (ref < 0 || ref >= genome.length())
continue;
// 0 based
final byte refBase = (byte) Character.toUpperCase(genome.charAt(ref - 1));
if (!AcidNucleics.isATGC(refBase))
continue;
// 0 based
final byte readBase = (byte) Character.toUpperCase(bases[(ab.getReadStart() - 1) + n]);
if (readBase == refBase)
continue;
final Call call = new Call();
call.sampleid = sampleid;
call.tid = rec1a.getReferenceIndex().intValue();
call.ref = refBase;
call.alt = readBase;
call.pos = ref;
calls.add(call);
}
}
}
calls1.retainAll(calls2);
if (calls1.isEmpty())
continue;
for (final Call c : calls1) {
sorting.add(c);
}
}
sorting.doneAdding();
out = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_COUNT_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_NUMBER_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_FREQUENCY_KEY);
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY);
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
final VCFHeader header2 = new VCFHeader(metaData, samples);
header2.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header2);
out.writeHeader(header2);
try (CloseableIterator<Call> iter1 = sorting.iterator()) {
try (EqualRangeIterator<Call> iter2 = new EqualRangeIterator<>(iter1, (A, B) -> A.compare1(B))) {
while (iter2.hasNext()) {
final List<Call> calls = iter2.next();
if (calls.isEmpty())
continue;
final Call first = calls.get(0);
final Set<Allele> altAllelesSet = calls.stream().map(A -> Allele.create(A.alt, false)).collect(Collectors.toSet());
final List<Allele> altAllelesList = new ArrayList<>(altAllelesSet);
final List<Allele> vcAlleles = new ArrayList<>(altAllelesList.size() + 1);
vcAlleles.add(Allele.create(first.ref, true));
vcAlleles.addAll(altAllelesList);
final List<Genotype> genotypes = new ArrayList<>(samples.size());
int DP = 0;
for (int i = 0; i < samples.size(); i++) {
final String sn = samples.get(i);
final Counter<Allele> allele2count = new Counter<>();
for (Call c : calls) {
if (c.sampleid != i)
continue;
allele2count.incr(Allele.create(c.alt, false));
}
Genotype gt;
if (allele2count.isEmpty()) {
gt = GenotypeBuilder.createMissing(sn, this.ploidy);
} else {
final int[] array = new int[vcAlleles.size()];
for (int j = 0; j < vcAlleles.size(); j++) {
array[j] = (int) allele2count.count(vcAlleles.get(j));
}
final GenotypeBuilder gb = new GenotypeBuilder(sn, new ArrayList<>(allele2count.keySet()));
gb.AD(array);
gb.DP((int) allele2count.getTotal());
DP += (int) allele2count.getTotal();
gt = gb.make();
}
genotypes.add(gt);
}
final VariantContextBuilder vcb = new VariantContextBuilder(null, dict.getSequence(first.tid).getContig(), first.pos, first.pos, vcAlleles);
vcb.attribute(VCFConstants.DEPTH_KEY, DP);
vcb.genotypes(genotypes);
out.add(vcb.make());
}
}
}
}
sorting.cleanup();
out.close();
out = null;
out = null;
this.indexedFastaRef.close();
this.indexedFastaRef = null;
return 0;
} catch (final Throwable err) {
getLogger().error(err);
return -1;
} finally {
CloserUtil.close(out);
CloserUtil.close(this.indexedFastaRef);
;
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class Biostar352930 method doWork.
@Override
public int doWork(final List<String> args) {
SamReader samReader = null;
SAMFileWriter samFileWriter = null;
try {
long count_rescued_seq = 0L;
long count_rescued_qual = 0L;
final SamReaderFactory srf = super.createSamReaderFactory().validationStringency(ValidationStringency.SILENT);
final String input = oneFileOrNull(args);
if (input == null) {
samReader = srf.open(SamInputResource.of(stdin()));
} else {
samReader = srf.open(SamInputResource.of(input));
}
final SAMFileHeader header = samReader.getFileHeader();
if (header.getSortOrder() != SAMFileHeader.SortOrder.queryname) {
LOG.error("Expected SAM input to be sorted on " + SAMFileHeader.SortOrder.queryname + " but got " + header.getSortOrder());
return -1;
}
JVarkitVersion.getInstance().addMetaData(this, header);
samFileWriter = this.writingBamArgs.openSAMFileWriter(this.outputFile, header, true);
final CloseableIterator<SAMRecord> iter = samReader.iterator();
final Comparator<SAMRecord> readNameCmp = ReadNameSortMethod.picard.get();
final Iterator<List<SAMRecord>> eq_range = new EqualRangeIterator<>(iter, readNameCmp);
final Predicate<SAMRecord> noCigarOrNoHardClip = R -> R.getCigar() == null || R.getCigar().getCigarElements().stream().noneMatch(C -> C.getOperator().equals(CigarOperator.H));
while (eq_range.hasNext()) {
final List<SAMRecord> records = eq_range.next();
for (int choice = 0; choice < 3; ++choice) {
final Predicate<SAMRecord> selectRead;
switch(choice) {
case 0:
selectRead = R -> !R.getReadPairedFlag();
break;
case 1:
selectRead = R -> R.getReadPairedFlag() && R.getFirstOfPairFlag();
break;
case 2:
selectRead = R -> R.getReadPairedFlag() && R.getSecondOfPairFlag();
break;
default:
throw new IllegalStateException();
}
// get the strand+ SEQUENCE string
final String unclippedseq = records.stream().filter(selectRead).filter(noCigarOrNoHardClip).filter(R -> !R.getReadString().equals(SAMRecord.NULL_SEQUENCE_STRING)).map(R -> {
if (R.getReadNegativeStrandFlag()) {
return SequenceUtil.reverseComplement(R.getReadString());
} else {
return R.getReadString();
}
}).findFirst().orElse(null);
// get the strand+ QUAL string
final String unclippedqual = records.stream().filter(selectRead).filter(noCigarOrNoHardClip).filter(R -> !R.getBaseQualityString().equals(SAMRecord.NULL_QUALS_STRING)).map(R -> {
if (R.getReadNegativeStrandFlag()) {
return new StringBuilder(R.getBaseQualityString()).reverse().toString();
} else {
return R.getBaseQualityString();
}
}).findFirst().orElse(null);
// loop over the record and fix
for (final SAMRecord rec : records) {
if (!selectRead.test(rec))
continue;
String clippedseq = unclippedseq;
String clippedqual = unclippedqual;
if (rec.getReadUnmappedFlag() && rec.getCigar() != null && rec.getCigar().numCigarElements() > 1) {
final Cigar cigar = rec.getCigar();
CigarElement ce = cigar.getLastCigarElement();
if (ce.getOperator().equals(CigarOperator.H)) {
clippedseq = clippedseq.substring(0, clippedseq.length() - ce.getLength());
clippedqual = clippedqual.substring(0, clippedqual.length() - ce.getLength());
}
ce = cigar.getFirstCigarElement();
if (ce.getOperator().equals(CigarOperator.H)) {
clippedseq = clippedseq.substring(ce.getLength());
clippedqual = clippedqual.substring(ce.getLength());
}
}
if (!StringUtil.isBlank(clippedseq) && rec.getReadString().equals(SAMRecord.NULL_SEQUENCE_STRING)) {
if (rec.getReadNegativeStrandFlag()) {
rec.setReadString(SequenceUtil.reverseComplement(clippedseq));
} else {
rec.setReadString(clippedseq);
}
++count_rescued_seq;
}
if (!StringUtil.isBlank(clippedqual) && rec.getBaseQualityString().equals(SAMRecord.NULL_QUALS_STRING)) {
if (rec.getReadNegativeStrandFlag()) {
rec.setBaseQualityString(new StringBuilder(clippedqual).reverse().toString());
} else {
rec.setBaseQualityString(clippedqual);
}
++count_rescued_qual;
}
}
}
for (final SAMRecord R : records) samFileWriter.addAlignment(R);
}
CloserUtil.close(eq_range);
iter.close();
samFileWriter.close();
samFileWriter = null;
samReader.close();
samReader = null;
LOG.info("Done : rescued seq: " + count_rescued_seq + " rescued qual: " + count_rescued_qual);
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(samReader);
CloserUtil.close(samFileWriter);
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class VCFShuffle method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator in, final VariantContextWriter out) {
SortingCollection<RLine> shuffled = null;
try {
final Random random = new Random(this.seed);
final VCFHeader header = in.getHeader();
final VCFEncoder vcfEncoder = new VCFEncoder(header, false, false);
shuffled = SortingCollection.newInstance(RLine.class, new RLineCodec(), (o1, o2) -> {
final int i = Long.compare(o1.rand, o2.rand);
if (i != 0)
return i;
return o1.line.compareTo(o2.line);
}, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
shuffled.setDestructiveIteration(true);
while (in.hasNext()) {
final RLine rLine = new RLine();
rLine.rand = random.nextLong();
rLine.line = vcfEncoder.encode(in.next());
shuffled.add(rLine);
}
shuffled.doneAdding();
JVarkitVersion.getInstance().addMetaData(this, header);
out.writeHeader(header);
final VCFCodec vcfCodec = new VCFCodec();
vcfCodec.setVCFHeader(header, VCFHeaderVersion.VCF4_3);
try (final CloseableIterator<RLine> iter = shuffled.iterator()) {
while (iter.hasNext()) {
final VariantContext ctx = vcfCodec.decode(iter.next().line);
out.add(ctx);
}
}
shuffled.cleanup();
shuffled = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
if (shuffled != null)
shuffled.cleanup();
CloserUtil.close(shuffled);
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class SamReadLengthDistribution method openSamIterator.
/**
* create input SAMRecord iterator
*/
private CloseableIterator<SAMRecord> openSamIterator(final SamReader sr) {
if (this.regionFiles != null) {
this.regionFiles.dictionary(SequenceDictionaryUtils.extractRequired(sr.getFileHeader()));
if (!sr.hasIndex()) {
final List<Interval> L = this.regionFiles.stream().map(x -> new Interval(x)).collect(Collectors.toList());
final SAMRecordIterator st0 = sr.iterator();
return new FilteringSamIterator(st0, new IntervalFilter(L, sr.getFileHeader()));
} else {
return sr.query(this.regionFiles.optimizedQueryIntervals(), false);
}
}
return sr.iterator();
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class BamStage method reloadData.
@Override
void reloadData() {
updateStatusBar(AlertType.NONE, "");
final int max_items = super.maxItemsLimitSpinner.getValue();
final List<SAMRecord> L = new ArrayList<SAMRecord>(max_items);
final String location = this.gotoField.getText().trim();
final CloseableIterator<SAMRecord> iter;
final java.util.function.Predicate<SAMRecord> recFilter = makeFlagPredicate();
Long canvasFirstRecordGenomicIndex = null;
Long canvasLastRecordGenomicIndex = null;
try {
if (location.isEmpty()) {
iter = this.getBamFile().iterator();
} else if (location.equalsIgnoreCase("unmapped")) {
iter = this.getBamFile().queryUnmapped();
} else {
final Interval interval = parseInterval(location);
if (interval == null) {
iter = null;
} else {
iter = this.getBamFile().iterator(interval.getContig(), interval.getStart(), interval.getEnd());
}
}
} catch (final IOException err) {
err.printStackTrace();
JfxNgs.showExceptionDialog(this, err);
return;
}
Optional<BamJavascripFilter> bamjsfilter = Optional.empty();
if (this.owner.javascriptCompiler.isPresent() && !this.javascriptArea.getText().trim().isEmpty()) {
try {
bamjsfilter = Optional.of(new BamJavascripFilter(this.getBamFile().getHeader(), Optional.of(this.owner.javascriptCompiler.get().compile(this.javascriptArea.getText()))));
} catch (final Exception err) {
LOG.warning(err.getMessage());
updateStatusBar(AlertType.ERROR, err);
bamjsfilter = Optional.empty();
}
}
final Map<ContigPos, Pileup> pos2pileup = new TreeMap<>();
final Function<ContigPos, Pileup> getpileup = new Function<ContigPos, Pileup>() {
@Override
public Pileup apply(ContigPos t) {
Pileup p = pos2pileup.get(t);
if (p == null) {
p = new Pileup(t.contig, t.position);
pos2pileup.put(t, p);
}
return p;
}
};
int count_items = 0;
while (iter != null && iter.hasNext() && count_items < max_items) {
final SAMRecord rec = iter.next();
++count_items;
if (bamjsfilter.isPresent()) {
if (bamjsfilter.get().eval(rec) == null)
continue;
}
if (!recFilter.test(rec))
continue;
L.add(rec);
/* get bounds for canvas genmic browser */
if (!rec.getReadUnmappedFlag() && rec.getCigar() != null) {
final long endIndex = convertContigPosToGenomicIndex(rec.getContig(), rec.getUnclippedEnd());
if (canvasFirstRecordGenomicIndex == null) {
canvasFirstRecordGenomicIndex = convertContigPosToGenomicIndex(rec.getContig(), rec.getUnclippedStart());
canvasLastRecordGenomicIndex = endIndex;
} else if (canvasLastRecordGenomicIndex < endIndex) {
canvasLastRecordGenomicIndex = endIndex;
}
}
/* FILL pileup */
if (!rec.getReadUnmappedFlag() && rec.getCigar() != null) {
int refpos = rec.getUnclippedStart();
int readpos = 0;
final byte[] bases = rec.getReadBases();
final byte[] quals = rec.getOriginalBaseQualities();
/**
* function getting the ith base
*/
final Function<Integer, Character> getBaseAt = new Function<Integer, Character>() {
@Override
public Character apply(Integer readPos) {
char c;
if (bases == null || bases.length <= readPos) {
return '?';
} else {
c = (char) bases[readPos];
}
c = (rec.getReadNegativeStrandFlag() ? Character.toLowerCase(c) : Character.toUpperCase(c));
return c;
}
};
/**
* function getting the ith base quality
*/
final Function<Integer, Character> getQualAt = new Function<Integer, Character>() {
@Override
public Character apply(Integer readPos) {
char c;
if (quals == null || quals.length <= readPos) {
return '#';
} else {
c = (char) quals[readPos];
}
return c;
}
};
for (final CigarElement ce : rec.getCigar()) {
switch(ce.getOperator()) {
case P:
break;
case N:
case D:
{
refpos += ce.getLength();
break;
}
case H:
{
for (int i = 0; i < ce.getLength(); ++i) {
final Pileup p = getpileup.apply(new ContigPos(rec.getContig(), refpos));
p.watch('-', '#', ce.getOperator());
++refpos;
}
break;
}
case I:
{
for (int i = 0; i < ce.getLength(); ++i) {
final Pileup p = getpileup.apply(new ContigPos(rec.getContig(), refpos));
p.watch('<', getQualAt.apply(readpos), ce.getOperator());
readpos++;
}
break;
}
case S:
for (int i = 0; i < ce.getLength(); ++i) {
final Pileup p = getpileup.apply(new ContigPos(rec.getContig(), refpos));
p.watch('-', getQualAt.apply(readpos), ce.getOperator());
++readpos;
++refpos;
}
break;
case EQ:
case X:
case M:
for (int i = 0; i < ce.getLength(); ++i) {
final Pileup p = getpileup.apply(new ContigPos(rec.getContig(), refpos));
p.watch(getBaseAt.apply(readpos), getQualAt.apply(readpos), ce.getOperator());
++readpos;
++refpos;
}
break;
}
}
}
}
if (iter != null)
iter.close();
this.canvasScrolVInCoverage.setMin(0);
final int max_depth = pos2pileup.values().stream().map(P -> P.depth()).max((A, B) -> (A.compareTo(B))).orElse(0);
this.canvasScrolVInCoverage.setMax(max_depth + 1);
this.canvasScrolVInCoverage.setValue(0);
this.recordTable.getItems().setAll(L);
this.pileupTable.getItems().setAll(pos2pileup.values());
if (canvasFirstRecordGenomicIndex != null && canvasLastRecordGenomicIndex != null && canvasFirstRecordGenomicIndex.longValue() < canvasLastRecordGenomicIndex.longValue()) {
this.canvasScrollHGenomicLoc.setMin(canvasFirstRecordGenomicIndex);
this.canvasScrollHGenomicLoc.setMax(canvasLastRecordGenomicIndex);
// this.canvasScrollHGenomicLoc.setUnitIncrement(1);
// this.canvasScrollHGenomicLoc.setBlockIncrement(Math.max(1.0,Math.min( canvasLastRecordGenomicIndex-canvasFirstRecordGenomicIndex, this.canvas.getWidth())));
this.canvasScrollHGenomicLoc.setValue(canvasFirstRecordGenomicIndex);
} else {
this.canvasScrollHGenomicLoc.setMin(0);
this.canvasScrollHGenomicLoc.setMax(0);
this.canvasScrollHGenomicLoc.setValue(0);
this.canvasScrollHGenomicLoc.setBlockIncrement(0);
this.canvasScrollHGenomicLoc.setUnitIncrement(1);
}
if (!this.recordTable.getItems().isEmpty()) {
final int last_index = this.recordTable.getItems().size() - 1;
if (!this.recordTable.getItems().get(0).getReadUnmappedFlag() && !this.recordTable.getItems().get(last_index).getReadUnmappedFlag()) {
super.seqDictionaryCanvas.setItemsInterval(new ContigPos(this.recordTable.getItems().get(0).getContig(), this.recordTable.getItems().get(0).getStart()), new ContigPos(this.recordTable.getItems().get(last_index).getContig(), this.recordTable.getItems().get(last_index).getEnd()));
if (this.recordTable.getItems().get(0).getContig().equals(this.recordTable.getItems().get(last_index).getContig())) {
this.gotoField.setText(this.recordTable.getItems().get(0).getContig() + ":" + this.recordTable.getItems().get(0).getStart() + "-" + this.recordTable.getItems().get(last_index).getEnd());
}
} else {
super.seqDictionaryCanvas.setItemsInterval(null, null);
}
} else {
super.seqDictionaryCanvas.setItemsInterval(null, null);
}
/* show hide columns for paired end data if no paired data found */
{
final boolean contains_paired = this.recordTable.getItems().stream().anyMatch(S -> S.getReadPairedFlag());
for (final TableColumn<SAMRecord, ?> tc : this.pairedEndColumns) tc.setVisible(contains_paired);
}
repaintCanvas();
}
Aggregations