use of com.github.lindenb.jvarkit.iterator.EqualIterator in project jvarkit by lindenb.
the class Biostar489074 method processInput.
@Override
protected int processInput(final SAMFileHeader header, final CloseableIterator<SAMRecord> iter0) {
VariantContextWriter out = null;
GenomicSequence genome = null;
SortingCollection<Call> sorting = null;
try {
this.indexedFastaRef = ReferenceSequenceFileFactory.getReferenceSequenceFile(getRequiredReferencePath());
if (!(header.getSortOrder().equals(SAMFileHeader.SortOrder.unsorted) || header.getSortOrder().equals(SAMFileHeader.SortOrder.queryname))) {
getLogger().error("input should be sorted with 'samtools sort -n' or 'samtools collate' but got " + header.getSortOrder());
return -1;
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
this.writingVariantsDelegate.dictionary(dict);
sorting = SortingCollection.newInstance(Call.class, new CallCodec(), (A, B) -> A.compare2(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sorting.setDestructiveIteration(true);
final List<String> samples = header.getReadGroups().stream().map(R -> this.groupBy.apply(R)).filter(S -> !StringUtils.isBlank(S)).sorted().collect(Collectors.toSet()).stream().collect(Collectors.toList());
final Map<String, Integer> rgid2idx = new HashMap<>();
for (final SAMReadGroupRecord rg : header.getReadGroups()) {
final String sn = this.groupBy.apply(rg);
if (StringUtils.isBlank(sn))
continue;
int idx = samples.indexOf(sn);
if (idx == -1)
continue;
rgid2idx.put(rg.getId(), idx);
}
try (CloseableIterator<List<SAMRecord>> iter = new EqualIterator<>(iter0, (A, B) -> A.getReadName().compareTo(B.getReadName()))) {
while (iter.hasNext()) {
final List<SAMRecord> buffer = iter.next();
int read1_idx = -1;
int read2_idx = -1;
for (int i = 0; i < buffer.size(); i++) {
final SAMRecord rec = buffer.get(i);
if (!rec.getReadPairedFlag())
continue;
if (rec.getReadUnmappedFlag())
continue;
if (rec.getMateUnmappedFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getReadBases() == SAMRecord.NULL_SEQUENCE)
continue;
if (rec.getFirstOfPairFlag()) {
read1_idx = i;
} else if (rec.getSecondOfPairFlag()) {
read2_idx = i;
}
}
if (read1_idx == -1 || read2_idx == -1 || read1_idx == read2_idx)
continue;
final SAMRecord rec1a = buffer.get(read1_idx);
final SAMRecord rec2a = buffer.get(read2_idx);
if (!rec1a.overlaps(rec2a))
continue;
final int chromStart = Math.max(rec1a.getStart(), rec2a.getStart());
final int chromEnd = Math.min(rec1a.getEnd(), rec2a.getEnd());
if (chromStart > chromEnd)
continue;
final Integer sampleid = rgid2idx.get(rec1a.getReadGroup().getId());
if (sampleid == null)
continue;
if (genome == null || !genome.contigsMatch(rec1a)) {
genome = new GenomicSequence(this.indexedFastaRef, rec1a.getContig());
}
final Set<Call> calls1 = new HashSet<>();
final Set<Call> calls2 = new HashSet<>();
for (int side = 0; side < 2; side++) {
final SAMRecord rec = (side == 0 ? rec1a : rec2a);
final Set<Call> calls = (side == 0 ? calls1 : calls2);
final byte[] bases = rec.getReadBases();
for (AlignmentBlock ab : rec.getAlignmentBlocks()) {
for (int n = 0; n < ab.getLength(); ++n) {
final int ref = ab.getReferenceStart() + n;
if (ref < chromStart)
continue;
if (ref > chromEnd)
break;
if (ref < 0 || ref >= genome.length())
continue;
// 0 based
final byte refBase = (byte) Character.toUpperCase(genome.charAt(ref - 1));
if (!AcidNucleics.isATGC(refBase))
continue;
// 0 based
final byte readBase = (byte) Character.toUpperCase(bases[(ab.getReadStart() - 1) + n]);
if (readBase == refBase)
continue;
final Call call = new Call();
call.sampleid = sampleid;
call.tid = rec1a.getReferenceIndex().intValue();
call.ref = refBase;
call.alt = readBase;
call.pos = ref;
calls.add(call);
}
}
}
calls1.retainAll(calls2);
if (calls1.isEmpty())
continue;
for (final Call c : calls1) {
sorting.add(c);
}
}
sorting.doneAdding();
out = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_COUNT_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_NUMBER_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_FREQUENCY_KEY);
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY);
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
final VCFHeader header2 = new VCFHeader(metaData, samples);
header2.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header2);
out.writeHeader(header2);
try (CloseableIterator<Call> iter1 = sorting.iterator()) {
try (EqualRangeIterator<Call> iter2 = new EqualRangeIterator<>(iter1, (A, B) -> A.compare1(B))) {
while (iter2.hasNext()) {
final List<Call> calls = iter2.next();
if (calls.isEmpty())
continue;
final Call first = calls.get(0);
final Set<Allele> altAllelesSet = calls.stream().map(A -> Allele.create(A.alt, false)).collect(Collectors.toSet());
final List<Allele> altAllelesList = new ArrayList<>(altAllelesSet);
final List<Allele> vcAlleles = new ArrayList<>(altAllelesList.size() + 1);
vcAlleles.add(Allele.create(first.ref, true));
vcAlleles.addAll(altAllelesList);
final List<Genotype> genotypes = new ArrayList<>(samples.size());
int DP = 0;
for (int i = 0; i < samples.size(); i++) {
final String sn = samples.get(i);
final Counter<Allele> allele2count = new Counter<>();
for (Call c : calls) {
if (c.sampleid != i)
continue;
allele2count.incr(Allele.create(c.alt, false));
}
Genotype gt;
if (allele2count.isEmpty()) {
gt = GenotypeBuilder.createMissing(sn, this.ploidy);
} else {
final int[] array = new int[vcAlleles.size()];
for (int j = 0; j < vcAlleles.size(); j++) {
array[j] = (int) allele2count.count(vcAlleles.get(j));
}
final GenotypeBuilder gb = new GenotypeBuilder(sn, new ArrayList<>(allele2count.keySet()));
gb.AD(array);
gb.DP((int) allele2count.getTotal());
DP += (int) allele2count.getTotal();
gt = gb.make();
}
genotypes.add(gt);
}
final VariantContextBuilder vcb = new VariantContextBuilder(null, dict.getSequence(first.tid).getContig(), first.pos, first.pos, vcAlleles);
vcb.attribute(VCFConstants.DEPTH_KEY, DP);
vcb.genotypes(genotypes);
out.add(vcb.make());
}
}
}
}
sorting.cleanup();
out.close();
out = null;
out = null;
this.indexedFastaRef.close();
this.indexedFastaRef = null;
return 0;
} catch (final Throwable err) {
getLogger().error(err);
return -1;
} finally {
CloserUtil.close(out);
CloserUtil.close(this.indexedFastaRef);
;
}
}
use of com.github.lindenb.jvarkit.iterator.EqualIterator in project jvarkit by lindenb.
the class BamToFastq method processInput.
@Override
protected int processInput(final SAMFileHeader header, final CloseableIterator<SAMRecord> iter0) {
final Comparator<SAMRecord> queryNameComparator = (A, B) -> A.getReadName().compareTo(B.getReadName());
SortingCollection<SAMRecord> sortingSAMRecord = null;
final ArrayList<SAMRecord> buffer = new ArrayList<>(50_000);
final CountIn<FastqRecord> singleCounter = new CountIn<>("single-end");
final CountIn<FastqRecord> unpairedCounter = new CountIn<>("unpaired");
final CountIn<FastqRecord> pairedCounter = new CountIn<>("paired");
final CountIn<SAMRecord> sortingCounter = new CountIn<>("sorting");
final PeekableIterator<SAMRecord> iter = new PeekableIterator<>(iter0);
try {
if (!SAMFileHeader.SortOrder.coordinate.equals(header.getSortOrder())) {
LOG.error("Input is not sorted on coordinate. got : " + header.getSortOrder());
return -1;
}
if (singleFastq != null)
FastqUtils.validateFastqFilename(singleFastq);
if (unpairedFile1 != null)
FastqUtils.validateFastqFilename(unpairedFile1);
if (unpairedFile2 != null)
FastqUtils.validateFastqFilename(unpairedFile2);
try (FastqWriter singleEndWriter = this.singleFastq == null ? new NullFastqWriter() : new BasicFastqWriter(this.singleFastq);
FastqWriter unpairedWriter1 = this.unpairedFile1 == null ? new NullFastqWriter() : new BasicFastqWriter(this.unpairedFile1);
FastqWriter unpairedWriter2 = this.unpairedFile2 == null ? new NullFastqWriter() : new BasicFastqWriter(this.unpairedFile2);
FastqPairedWriter R1R2writer = openFastqPairedWriter()) {
sortingSAMRecord = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(header), queryNameComparator, sortingCollection.getMaxRecordsInRam(), sortingCollection.getTmpPaths());
sortingSAMRecord.setDestructiveIteration(true);
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.isSecondaryOrSupplementary())
continue;
if (!rec.getReadPairedFlag()) {
singleEndWriter.write(singleCounter.apply(toFastq(rec)));
continue;
}
if ((rec.getReadUnmappedFlag() || rec.getMateUnmappedFlag()) && iter.hasNext()) {
final SAMRecord rec2 = iter.peek();
if (!rec2.isSecondaryOrSupplementary() && queryNameComparator.compare(rec, rec2) == 0) {
if (rec2.getFirstOfPairFlag() && rec.getSecondOfPairFlag()) {
// consumme
iter.next();
R1R2writer.write(pairedCounter.apply(toFastq(rec2)), pairedCounter.apply(toFastq(rec)));
continue;
} else if (rec.getFirstOfPairFlag() && rec2.getSecondOfPairFlag()) {
// consumme
iter.next();
R1R2writer.write(pairedCounter.apply(toFastq(rec)), pairedCounter.apply(toFastq(rec2)));
continue;
}
}
}
if (rec.getReadUnmappedFlag() || rec.getMateUnmappedFlag() || !rec.getReferenceName().equals(rec.getMateReferenceName()) || Math.abs(rec.getInferredInsertSize()) > this.distance) {
sortingSAMRecord.add(sortingCounter.apply(rec));
continue;
}
while (!buffer.isEmpty() && !buffer.get(0).getReferenceName().equals(rec.getReferenceName())) {
sortingSAMRecord.add(sortingCounter.apply(buffer.remove(0)));
}
while (!buffer.isEmpty() && (rec.getAlignmentStart() - buffer.get(0).getAlignmentStart()) > this.distance) {
sortingSAMRecord.add(sortingCounter.apply(buffer.remove(0)));
}
if (rec.getAlignmentStart() < rec.getMateAlignmentStart()) {
buffer.add(rec);
continue;
}
SAMRecord mate = null;
int i = 0;
while (i < buffer.size()) {
final SAMRecord rec2 = buffer.get(i);
if (queryNameComparator.compare(rec2, rec) == 0) {
mate = rec2;
buffer.remove(i);
break;
}
if (rec2.getAlignmentStart() > rec.getMateAlignmentStart()) {
break;
}
++i;
}
if (mate == null) {
(rec.getFirstOfPairFlag() ? unpairedWriter1 : unpairedWriter2).write(unpairedCounter.apply(toFastq(rec)));
} else if (mate.getFirstOfPairFlag() && rec.getSecondOfPairFlag()) {
R1R2writer.write(pairedCounter.apply(toFastq(mate)), pairedCounter.apply(toFastq(rec)));
} else if (rec.getFirstOfPairFlag() && mate.getSecondOfPairFlag()) {
R1R2writer.write(pairedCounter.apply(toFastq(rec)), pairedCounter.apply(toFastq(mate)));
} else {
(rec.getFirstOfPairFlag() ? unpairedWriter1 : unpairedWriter2).write(unpairedCounter.apply(toFastq(rec)));
(mate.getFirstOfPairFlag() ? unpairedWriter1 : unpairedWriter2).write(unpairedCounter.apply(toFastq(mate)));
}
}
// end while
for (final SAMRecord rec : buffer) {
sortingSAMRecord.add(sortingCounter.apply(rec));
}
buffer.clear();
sortingSAMRecord.doneAdding();
try (CloseableIterator<SAMRecord> iter2 = sortingSAMRecord.iterator()) {
try (EqualIterator<SAMRecord> eq = new EqualIterator<>(iter2, queryNameComparator)) {
while (eq.hasNext()) {
final List<SAMRecord> L = eq.next();
if (L.size() == 2) {
if (L.get(0).getFirstOfPairFlag() && L.get(1).getSecondOfPairFlag()) {
R1R2writer.write(pairedCounter.apply(toFastq(L.get(0))), pairedCounter.apply(toFastq(L.get(1))));
} else if (L.get(1).getFirstOfPairFlag() && L.get(0).getSecondOfPairFlag()) {
R1R2writer.write(pairedCounter.apply(toFastq(L.get(1))), pairedCounter.apply(toFastq(L.get(0))));
} else {
(L.get(0).getFirstOfPairFlag() ? unpairedWriter1 : unpairedWriter2).write(unpairedCounter.apply(toFastq(L.get(0))));
(L.get(1).getFirstOfPairFlag() ? unpairedWriter1 : unpairedWriter2).write(unpairedCounter.apply(toFastq(L.get(1))));
}
} else {
for (SAMRecord rec2 : L) {
(rec2.getFirstOfPairFlag() ? unpairedWriter1 : unpairedWriter2).write(unpairedCounter.apply(toFastq(rec2)));
}
}
}
}
}
}
sortingSAMRecord.cleanup();
unpairedCounter.log();
singleCounter.log();
pairedCounter.log();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
iter.close();
}
}
use of com.github.lindenb.jvarkit.iterator.EqualIterator in project jvarkit by lindenb.
the class BamPhased01 method scanIterator.
@Override
protected void scanIterator(final SAMFileHeader headerIn, final CloseableIterator<SAMRecord> iter0, final SAMFileWriter sfw) {
if (this.paired_mode) {
try (EqualIterator<SAMRecord> iter = new EqualIterator<>(iter0, (A, B) -> A.getReadName().compareTo(B.getReadName()))) {
while (iter.hasNext()) {
final LinkedList<SAMRecord> buffer = new LinkedList<>(iter.next());
SAMRecord R1 = null;
SAMRecord R2 = null;
while (!buffer.isEmpty()) {
final SAMRecord rec = buffer.pop();
if (rec.getReadUnmappedFlag()) {
failingSAMRecord(rec, sfw);
} else if (!rec.getReadPairedFlag() || rec.isSecondaryOrSupplementary()) {
scanVariants(Collections.singletonList(rec), sfw);
} else if (R1 == null && rec.getFirstOfPairFlag()) {
R1 = rec;
} else if (R2 == null && rec.getSecondOfPairFlag()) {
R2 = rec;
} else {
failingSAMRecord(rec, sfw);
}
}
if (R1 != null && R2 != null) {
if (R1.contigsMatch(R2)) {
scanVariants(Arrays.asList(R1, R2), sfw);
} else {
scanVariants(Collections.singletonList(R1), sfw);
scanVariants(Collections.singletonList(R2), sfw);
}
} else if (R1 != null && R2 == null) {
scanVariants(Collections.singletonList(R1), sfw);
} else if (R2 != null && R1 == null) {
scanVariants(Collections.singletonList(R2), sfw);
}
}
}
} else {
while (iter0.hasNext()) {
final SAMRecord rec = iter0.next();
if (rec.getReadUnmappedFlag()) {
failingSAMRecord(rec, sfw);
continue;
}
scanVariants(Collections.singletonList(rec), sfw);
}
}
}
use of com.github.lindenb.jvarkit.iterator.EqualIterator in project jvarkit by lindenb.
the class Biostar480685 method doWork.
@Override
public int doWork(final List<String> args) {
SamReader in = null;
SAMFileWriter out = null;
try {
final SamReaderFactory srf = super.createSamReaderFactory();
if (this.faidx != null) {
srf.referenceSequence(this.faidx);
writingBamArgs.setReferencePath(this.faidx);
}
final String input = oneFileOrNull(args);
if (input == null) {
in = srf.open(SamInputResource.of(stdin()));
} else {
in = srf.open(SamInputResource.of(input));
}
final SAMFileHeader header = in.getFileHeader();
if (!(header.getSortOrder().equals(SAMFileHeader.SortOrder.unsorted) || header.getSortOrder().equals(SAMFileHeader.SortOrder.queryname))) {
LOG.error("input should be sorted with 'samtools sort -n' or 'samtools collate' but got " + header.getSortOrder());
return -1;
}
final ReadClipper clipper = new ReadClipper();
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
final SAMProgramRecord prg = header.createProgramRecord();
prg.setCommandLine(this.getProgramCommandLine());
prg.setProgramName(this.getProgramName());
prg.setProgramVersion(this.getGitHash());
JVarkitVersion.getInstance().addMetaData(this, header);
out = this.writingBamArgs.openSamWriter(this.outputFile, header, true);
try (CloseableIterator<List<SAMRecord>> iter = new EqualIterator<>(in.iterator(), (A, B) -> A.getReadName().compareTo(B.getReadName()))) {
while (iter.hasNext()) {
final List<SAMRecord> buffer = iter.next();
int read1_idx = -1;
int read2_idx = -1;
for (int i = 0; i < buffer.size(); i++) {
final SAMRecord rec = buffer.get(i);
if (!rec.getReadPairedFlag())
continue;
if (rec.getReadUnmappedFlag())
continue;
if (rec.getMateUnmappedFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getFirstOfPairFlag()) {
read1_idx = i;
} else if (rec.getSecondOfPairFlag()) {
read2_idx = i;
}
}
if (read1_idx == -1 || read2_idx == -1 || read1_idx == read2_idx)
continue;
final SAMRecord rec1a = buffer.get(read1_idx);
final SAMRecord rec2a = buffer.get(read2_idx);
if (!rec1a.overlaps(rec2a))
continue;
final int chromStart = Math.max(rec1a.getStart(), rec2a.getStart());
final int chromEnd = Math.min(rec1a.getEnd(), rec2a.getEnd());
if (chromStart > chromEnd)
continue;
final SimpleInterval rgn = new SimpleInterval(rec1a.getContig(), chromStart, chromEnd);
final SAMRecord rec1b = clipper.clip(rec1a, rgn);
if (rec1b == null || rec1b.getReadUnmappedFlag())
continue;
final SAMRecord rec2b = clipper.clip(rec2a, rgn);
if (rec2b == null || rec2b.getReadUnmappedFlag())
continue;
rec1b.setAttribute("PG", prg.getId());
rec2b.setAttribute("PG", prg.getId());
rec1b.setAlignmentStart(chromStart);
rec1b.setMateAlignmentStart(rec2b.getAlignmentStart());
rec2b.setAlignmentStart(chromStart);
rec2b.setMateAlignmentStart(rec1b.getAlignmentStart());
rec1b.setAttribute("MC", rec2b.getCigarString());
rec2b.setAttribute("MC", rec1b.getCigarString());
rec1b.setAttribute("NM", null);
rec2b.setAttribute("NM", null);
buffer.set(read1_idx, rec1b);
buffer.set(read2_idx, rec2b);
for (SAMRecord rec : buffer) {
out.addAlignment(rec);
}
}
}
in.close();
in = null;
out.close();
out = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(in);
CloserUtil.close(out);
}
}
use of com.github.lindenb.jvarkit.iterator.EqualIterator in project jvarkit by lindenb.
the class SetFileTools method fromBed.
private int fromBed(final List<String> args) throws IOException {
final String input = oneFileOrNull(args);
final Function<BedLine, String> bed2name = bed -> {
if (bed.getColumnCount() < 4)
throw new IllegalArgumentException("Expected 4 columns but got " + bed);
return bed.get(3);
};
try (BufferedReader br = super.openBufferedReader(input)) {
try (BedLineReader blr = new BedLineReader(br, input)) {
blr.setValidationStringency(validationStringency);
blr.setContigNameConverter(this.theCtgConverter);
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
final EqualIterator<BedLine> iter = new EqualIterator<BedLine>(blr.stream().iterator(), (A, B) -> bed2name.apply(A).compareTo(bed2name.apply(B)));
while (iter.hasNext()) {
final List<BedLine> lines = iter.next();
final List<Locatable> L = sortAndMerge(lines);
pw.print(bed2name.apply(lines.get(0)));
for (int i = 0; i < L.size(); i++) {
pw.print(i == 0 ? "\t" : ",");
final Locatable rec = L.get(i);
pw.print(noChr(rec.getContig()));
pw.print(":");
pw.print(rec.getStart());
pw.print("-");
pw.print(rec.getEnd());
}
pw.println();
}
iter.close();
pw.flush();
}
}
}
return 0;
}
Aggregations