use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gridss by PapenfussLab.
the class Manual method assembly_scoring_should_be_symmetrical.
// @Test
@Category(Hg19Tests.class)
public void assembly_scoring_should_be_symmetrical() throws FileNotFoundException {
File sam = new File("W:/debug/test.bam");
File ref = Hg19Tests.findHg19Reference();
ProcessingContext pc = new ProcessingContext(getFSContext(), ref, new SynchronousReferenceLookupAdapter(new IndexedFastaSequenceFile(ref)), null, getConfig());
SAMEvidenceSource ses = new SAMEvidenceSource(pc, sam, null, 0);
List<SAMRecord> reads = getRecords(new File("W:/debug/test.bam.gridss.working/test.bam.sv.bam"));
List<SplitReadEvidence> list = ses.iterator().stream().filter(e -> e instanceof SplitReadEvidence).map(e -> (SplitReadEvidence) e).filter(e -> e.getSAMRecord().getReadName().equals("variant.chr12-art1000553")).collect(Collectors.toList());
for (SplitReadEvidence e : list) {
e.getBreakpointQual();
}
}
use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gridss by PapenfussLab.
the class ExtractSVReadsTest method regression_should_extract_split_read_alignments_as_group.
// @Test
@Category(Hg38Tests.class)
public void regression_should_extract_split_read_alignments_as_group() throws IOException {
File ref = Hg38Tests.findHg38Reference();
ReferenceLookup lookup = new SynchronousReferenceLookupAdapter(new IndexedFastaSequenceFile(ref));
Files.copy(new File("src/test/resources/sa.split/test1.sam"), input);
ExtractSVReads extract = new ExtractSVReads();
// new ProcessingContext(getFSContext(), ref, lookup, null, getConfig());
extract.setReference(lookup);
extract.MIN_CLIP_LENGTH = 4;
extract.INSERT_SIZE_METRICS = new File("src/test/resources/sa.split/test.insert_size_metrics");
extract.READ_PAIR_CONCORDANCE_METHOD = ReadPairConcordanceMethod.PERCENTAGE;
extract.OUTPUT = output;
extract.INPUT = input;
try (SamReader reader = SamReaderFactory.make().open(input)) {
extract.setup(reader.getFileHeader(), input);
}
List<SAMRecord> records = getRecords(input);
List<Boolean> result = records.stream().map(r -> extract.shouldExtract(ImmutableList.of(r), lookup)[0]).collect(Collectors.toList());
/*
for (int i = 0; i < records.size(); i++) {
SAMRecord r = records.get(i);
System.out.print(r.getSupplementaryAlignmentFlag() ? "S" : " ");
System.out.print(r.getFirstOfPairFlag() ? "1" : "2");
System.out.print(" Extracted=" + (result.get(i) ? "y" : "n"));
System.out.print(" HasConcPair=" + (ExtractSVReads.hasReadPairingConsistentWithReference(extract.getReadPairConcordanceCalculator(), ImmutableList.of(r)) ? "y" : "n"));
boolean[] ra = ExtractSVReads.hasReadAlignmentConsistentWithReference(ImmutableList.of(r));
System.out.print(" HasConcRead=" + (ra[0] ? "y" : "n") + (ra[1] ? "y" : "n"));
System.out.println(" " + new ChimericAlignment(r).toString());
}
*/
Assert.assertEquals(records.stream().map(r -> r.getSecondOfPairFlag()).collect(Collectors.toList()), result);
lookup.close();
}
use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project gridss by PapenfussLab.
the class SynchronousReferenceLookupAdapterTest method multi_threaded_read.
@Test
@Category(Hg19Tests.class)
public void multi_threaded_read() throws Exception {
File ref = Hg19Tests.findHg19Reference();
IndexedFastaSequenceFile indexed = new IndexedFastaSequenceFile(ref);
SynchronousReferenceLookupAdapter a = new SynchronousReferenceLookupAdapter(indexed);
e = null;
Thread[] th = new Thread[16];
for (int t = 0; t < th.length; t++) {
th[t] = new Thread(() -> {
try {
Random rng = new Random();
for (int i = 0; i < 4096; i++) {
int pos = rng.nextInt(100000000);
assertEquals(a.getBase(indexed.getSequenceDictionary().getSequence("chr4").getSequenceIndex(), pos), indexed.getSubsequenceAt("chr4", pos, pos).getBases()[0]);
}
} catch (Exception ex) {
e = ex;
}
});
th[t].start();
}
for (int t = 0; t < th.length; t++) {
th[t].join();
}
a.close();
if (e != null)
throw e;
}
use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project jvarkit by lindenb.
the class LocalRealignReads method doWork.
@Override
public int doWork(List<String> args) {
if (this.faidxFile == null) {
LOG.error("REFerence file missing;");
return -1;
}
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
SamReader samReader = null;
SAMFileWriter w = null;
SAMRecordIterator iter = null;
GenomicSequence genomicSequence = null;
LongestCommonSequence matrix = new LongestCommonSequence();
try {
indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
samReader = openSamReader(oneFileOrNull(args));
final SAMFileHeader header1 = samReader.getFileHeader();
final SAMFileHeader header2 = header1.clone();
header2.setSortOrder(SortOrder.unsorted);
w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
iter = samReader.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
w.addAlignment(rec);
continue;
}
final Cigar cigar = rec.getCigar();
final int nCigarElement = cigar.numCigarElements();
if (nCigarElement < 2) {
w.addAlignment(rec);
continue;
}
final CigarElement ce5 = cigar.getCigarElement(0);
final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
w.addAlignment(rec);
continue;
}
if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
final CharSequence readseq = rec.getReadString();
/**
* short invert
*/
if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}
*/
}
if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}*/
}
/* other */
List<LongestCommonSequence.Hit> hits = new ArrayList<>();
align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
if (hits.size() < 2000)
continue;
for (LongestCommonSequence.Hit hit : hits) {
int readstart = 0;
boolean in_M = false;
for (int i = 0; i < nCigarElement; ++i) {
final CigarElement ce = cigar.getCigarElement(i);
if (ce.getOperator().consumesReadBases()) {
int readend = readstart + ce.getLength();
if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
in_M = true;
break;
}
}
readstart = readend;
}
}
if (in_M)
continue;
int align_size = hit.size();
System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
System.err.print("REF :");
for (int i = 0; i < align_size; ++i) {
System.err.print(genomicSequence.charAt(hit.getStartX() + i));
}
System.err.println();
System.err.print("READ :");
for (int i = 0; i < align_size; ++i) {
System.err.print(readseq.charAt(hit.getStartY() + i));
}
System.err.println();
}
System.err.println();
}
progress.finish();
return RETURN_OK;
} catch (Exception e) {
return wrapException(e);
} finally {
genomicSequence = null;
CloserUtil.close(iter);
CloserUtil.close(samReader);
CloserUtil.close(w);
CloserUtil.close(indexedFastaSequenceFile);
matrix = null;
}
}
use of htsjdk.samtools.reference.IndexedFastaSequenceFile in project jvarkit by lindenb.
the class MapUniProtFeatures method doWork.
@Override
public int doWork(List<String> args) {
PrintWriter pw = null;
try {
JAXBContext jc = JAXBContext.newInstance("org.uniprot");
Unmarshaller uniprotUnmarshaller = jc.createUnmarshaller();
Marshaller uniprotMarshaller = jc.createMarshaller();
pw = super.openFileOrStdoutAsPrintWriter(OUT);
LOG.info("read " + REF);
this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(REF);
if (this.indexedFastaSequenceFile.getSequenceDictionary() == null) {
LOG.error("fasta sequence is not indexed");
return -1;
}
LOG.info("readubf " + kgUri);
String line;
Pattern tab = Pattern.compile("[\t]");
BufferedReader r = IOUtils.openURIForBufferedReading(kgUri);
while ((line = r.readLine()) != null) {
String[] tokens = tab.split(line);
KnownGene kg = new KnownGene();
kg.setName(tokens[0]);
kg.setChrom(tokens[1]);
kg.setStrand(tokens[2].charAt(0));
kg.setTxStart(Integer.parseInt(tokens[3]));
kg.setTxEnd(Integer.parseInt(tokens[4]));
kg.setCdsStart(Integer.parseInt(tokens[5]));
kg.setCdsEnd(Integer.parseInt(tokens[6]));
kg.setExonBounds(Integer.parseInt(tokens[7]), tokens[8], tokens[9]);
List<KnownGene> L = prot2genes.get(tokens[10]);
if (L == null) {
L = new ArrayList<KnownGene>();
prot2genes.put(tokens[10], L);
}
if (indexedFastaSequenceFile.getSequenceDictionary().getSequence(kg.getContig()) == null) {
LOG.info("ignoring " + line);
continue;
}
L.add(kg);
}
r.close();
if (OUT != null) {
LOG.info("opening " + OUT);
pw = new PrintWriter(OUT);
}
LOG.info("knownGenes: " + this.prot2genes.size());
XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
xmlInputFactory.setXMLResolver(new XMLResolver() {
@Override
public Object resolveEntity(String arg0, String arg1, String arg2, String arg3) throws XMLStreamException {
LOG.info("resolveEntity:" + arg0 + "/" + arg1 + "/" + arg2);
return null;
}
});
// SortingCollection<UBed> mappedFeatures=SortingCollection.newInstance(UBed.class, new UBedCodec(),new UBedCmp(),super.MAX_RECORDS_IN_RAM);
// mappedFeatures.setDestructiveIteration(true);
LOG.info("Scanning " + UNIPROT);
Reader fr = IOUtils.openURIForBufferedReading(UNIPROT);
XMLEventReader rx = xmlInputFactory.createXMLEventReader(fr);
final QName uEntry = new QName(UNIPROT_NS, "entry");
GenomicSequence genomic = null;
while (rx.hasNext()) {
XMLEvent evt = rx.peek();
if (!(evt.isStartElement() && evt.asStartElement().getName().equals(uEntry))) {
rx.next();
continue;
}
JAXBElement<Entry> jaxbElement = uniprotUnmarshaller.unmarshal(rx, Entry.class);
Entry entry = jaxbElement.getValue();
if (entry.getFeature().isEmpty())
continue;
List<KnownGene> genes = null;
for (String acn : entry.getAccession()) {
genes = prot2genes.get(acn);
if (genes != null)
break;
}
if (genes == null)
continue;
for (KnownGene kg : genes) {
if (genomic == null || !genomic.getChrom().equals(kg.getChromosome())) {
genomic = new GenomicSequence(this.indexedFastaSequenceFile, kg.getChromosome());
}
KnownGene.Peptide pep = kg.getCodingRNA(genomic).getPeptide();
int sameSequenceLength = 0;
while (sameSequenceLength < entry.getSequence().getValue().length() && sameSequenceLength < pep.length()) {
if (Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength)) != Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength))) {
break;
}
sameSequenceLength++;
}
if (sameSequenceLength != pep.length()) {
System.err.println("Not Same sequence " + kg.getName() + " strand " + kg.getStrand() + " ok-up-to-" + sameSequenceLength);
System.err.println("P:" + pep.toString() + " " + pep.length());
System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
if (pep.toString().contains("?")) {
System.err.println("RNA:" + pep.getCodingRNA().toString());
}
}
if (sameSequenceLength == 0)
continue;
for (FeatureType feat : entry.getFeature()) {
if (feat.getType() == null || feat.getType().isEmpty())
continue;
LocationType locType = feat.getLocation();
if (locType == null)
continue;
int pepStart, pepEnd;
if (locType.getPosition() != null && locType.getPosition().getPosition() != null) {
pepEnd = locType.getPosition().getPosition().intValue();
pepStart = pepEnd - 1;
} else if (locType.getBegin() != null && locType.getEnd() != null && locType.getBegin().getPosition() != null && locType.getEnd().getPosition() != null) {
pepStart = locType.getBegin().getPosition().intValue() - 1;
pepEnd = locType.getEnd().getPosition().intValue();
} else {
continue;
}
if (pepEnd >= sameSequenceLength)
continue;
List<Integer> genomicPos = new ArrayList<Integer>();
while (pepStart < pepEnd) {
if (pepStart >= pep.length()) {
System.err.println();
System.err.println("P:" + pep.toString() + " " + pep.length() + " " + kg.getStrand());
System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
}
int[] codon = pep.convertToGenomicCoordinates(pepStart);
pepStart++;
for (int gP : codon) {
if (gP == -1) {
uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
LOG.error("error in genomoc for (" + pepStart + "/" + pepEnd + "):" + entry.getName());
System.exit(-1);
}
genomicPos.add(gP);
}
}
Collections.sort(genomicPos);
UBed ubed = new UBed();
ubed.tid = this.indexedFastaSequenceFile.getSequenceDictionary().getSequenceIndex(genomic.getChrom());
int k0 = 0;
while (k0 < genomicPos.size()) {
Range range = new Range(genomicPos.get(k0), genomicPos.get(k0) + 1);
k0++;
while (k0 < genomicPos.size() && range.end == genomicPos.get(k0)) {
range = new Range(range.start, range.end + 1);
++k0;
}
ubed.positions.add(range);
}
ubed.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
ubed.name = feat.getType();
ubed.print(pw);
}
}
}
rx.close();
fr.close();
LOG.info("End scan uniprot");
/*
mappedFeatures.doneAdding();
CloseableIterator<UBed> iter=mappedFeatures.iterator();
while(iter.hasNext())
{
UBed ubed=iter.next();
ubed.print();
}
mappedFeatures.cleanup();
*/
} catch (Exception err) {
err.printStackTrace();
if (OUT != null)
OUT.deleteOnExit();
return -1;
} finally {
pw.flush();
pw.close();
CloserUtil.close(this.indexedFastaSequenceFile);
}
return 0;
}
Aggregations