use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.
the class VcfSimulator method doWork.
@Override
public int doWork(final List<String> args) {
if (this.indexedFastaSequenceFile == null) {
LOG.error("Reference is undefined");
return -1;
}
if (!args.isEmpty()) {
LOG.error("too many arguments");
return -1;
}
if (this.randomSeed == -1L) {
this.random = new Random(System.currentTimeMillis());
} else {
this.random = new Random(this.randomSeed);
}
if (this.numSamples < 0) {
this.numSamples = 1 + this.random.nextInt(10);
}
while (this.samples.size() < numSamples) this.samples.add("SAMPLE" + (1 + this.samples.size()));
VariantContextWriter writer = null;
PrintStream pw = null;
try {
final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, "GT", "DP");
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, "AF", "AN", "AC", "DP");
VariantAttributesRecalculator calc = new VariantAttributesRecalculator();
final VCFHeader header = new VCFHeader(metaData, this.samples);
header.setSequenceDictionary(this.indexedFastaSequenceFile.getSequenceDictionary());
calc.setHeader(header);
pw = super.openFileOrStdoutAsPrintStream(this.outputFile);
writer = VCFUtils.createVariantContextWriterToOutputStream(pw);
writer.writeHeader(header);
long countVariantsSoFar = 0;
for (; ; ) {
if (pw.checkError())
break;
if (this.numberOfVariants >= 0 && countVariantsSoFar >= this.numberOfVariants)
break;
for (final SAMSequenceRecord ssr : this.indexedFastaSequenceFile.getSequenceDictionary().getSequences()) {
if (pw.checkError())
break;
final GenomicSequence genomic = new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName());
for (int pos = 1; pos <= ssr.getSequenceLength(); ++pos) {
if (pw.checkError())
break;
char REF = Character.toUpperCase(genomic.charAt(pos - 1));
if (REF == 'N')
continue;
char ALT = 'N';
switch(REF) {
case 'A':
ALT = "TGC".charAt(random.nextInt(3));
break;
case 'T':
ALT = "AGC".charAt(random.nextInt(3));
break;
case 'G':
ALT = "TAC".charAt(random.nextInt(3));
break;
case 'C':
ALT = "TGA".charAt(random.nextInt(3));
break;
default:
ALT = 'N';
}
if (ALT == 'N')
continue;
final Allele refAllele = Allele.create((byte) REF, true);
Allele altAllele = Allele.create((byte) ALT, false);
final VariantContextBuilder cb = new VariantContextBuilder();
cb.chr(genomic.getChrom());
cb.start(pos);
cb.stop(pos);
List<Genotype> genotypes = new ArrayList<Genotype>(samples.size());
for (String sample : samples) {
final Allele a1 = (random.nextBoolean() ? refAllele : altAllele);
final Allele a2 = (random.nextBoolean() ? refAllele : altAllele);
GenotypeBuilder gb = new GenotypeBuilder(sample, Arrays.asList(a1, a2));
if (random.nextBoolean()) {
gb = new GenotypeBuilder(sample, Arrays.asList(a1, a2));
gb.DP(1 + random.nextInt(50));
}
genotypes.add(gb.make());
}
cb.genotypes(genotypes);
cb.alleles(Arrays.asList(refAllele, altAllele));
writer.add(calc.apply(cb.make()));
countVariantsSoFar++;
}
}
}
writer.close();
writer = null;
pw.flush();
pw.close();
pw = null;
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(this.indexedFastaSequenceFile);
CloserUtil.close(pw);
CloserUtil.close(writer);
}
}
use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.
the class LocalRealignReads method doWork.
@Override
public int doWork(List<String> args) {
if (this.faidxFile == null) {
LOG.error("REFerence file missing;");
return -1;
}
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
SamReader samReader = null;
SAMFileWriter w = null;
SAMRecordIterator iter = null;
GenomicSequence genomicSequence = null;
LongestCommonSequence matrix = new LongestCommonSequence();
try {
indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
samReader = openSamReader(oneFileOrNull(args));
final SAMFileHeader header1 = samReader.getFileHeader();
final SAMFileHeader header2 = header1.clone();
header2.setSortOrder(SortOrder.unsorted);
w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
iter = samReader.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
w.addAlignment(rec);
continue;
}
final Cigar cigar = rec.getCigar();
final int nCigarElement = cigar.numCigarElements();
if (nCigarElement < 2) {
w.addAlignment(rec);
continue;
}
final CigarElement ce5 = cigar.getCigarElement(0);
final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
w.addAlignment(rec);
continue;
}
if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
final CharSequence readseq = rec.getReadString();
/**
* short invert
*/
if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}
*/
}
if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}*/
}
/* other */
List<LongestCommonSequence.Hit> hits = new ArrayList<>();
align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
if (hits.size() < 2000)
continue;
for (LongestCommonSequence.Hit hit : hits) {
int readstart = 0;
boolean in_M = false;
for (int i = 0; i < nCigarElement; ++i) {
final CigarElement ce = cigar.getCigarElement(i);
if (ce.getOperator().consumesReadBases()) {
int readend = readstart + ce.getLength();
if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
in_M = true;
break;
}
}
readstart = readend;
}
}
if (in_M)
continue;
int align_size = hit.size();
System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
System.err.print("REF :");
for (int i = 0; i < align_size; ++i) {
System.err.print(genomicSequence.charAt(hit.getStartX() + i));
}
System.err.println();
System.err.print("READ :");
for (int i = 0; i < align_size; ++i) {
System.err.print(readseq.charAt(hit.getStartY() + i));
}
System.err.println();
}
System.err.println();
}
progress.finish();
return RETURN_OK;
} catch (Exception e) {
return wrapException(e);
} finally {
genomicSequence = null;
CloserUtil.close(iter);
CloserUtil.close(samReader);
CloserUtil.close(w);
CloserUtil.close(indexedFastaSequenceFile);
matrix = null;
}
}
use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.
the class MapUniProtFeatures method doWork.
@Override
public int doWork(List<String> args) {
PrintWriter pw = null;
try {
JAXBContext jc = JAXBContext.newInstance("org.uniprot");
Unmarshaller uniprotUnmarshaller = jc.createUnmarshaller();
Marshaller uniprotMarshaller = jc.createMarshaller();
pw = super.openFileOrStdoutAsPrintWriter(OUT);
LOG.info("read " + REF);
this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(REF);
if (this.indexedFastaSequenceFile.getSequenceDictionary() == null) {
LOG.error("fasta sequence is not indexed");
return -1;
}
LOG.info("readubf " + kgUri);
String line;
Pattern tab = Pattern.compile("[\t]");
BufferedReader r = IOUtils.openURIForBufferedReading(kgUri);
while ((line = r.readLine()) != null) {
String[] tokens = tab.split(line);
KnownGene kg = new KnownGene();
kg.setName(tokens[0]);
kg.setChrom(tokens[1]);
kg.setStrand(tokens[2].charAt(0));
kg.setTxStart(Integer.parseInt(tokens[3]));
kg.setTxEnd(Integer.parseInt(tokens[4]));
kg.setCdsStart(Integer.parseInt(tokens[5]));
kg.setCdsEnd(Integer.parseInt(tokens[6]));
kg.setExonBounds(Integer.parseInt(tokens[7]), tokens[8], tokens[9]);
List<KnownGene> L = prot2genes.get(tokens[10]);
if (L == null) {
L = new ArrayList<KnownGene>();
prot2genes.put(tokens[10], L);
}
if (indexedFastaSequenceFile.getSequenceDictionary().getSequence(kg.getContig()) == null) {
LOG.info("ignoring " + line);
continue;
}
L.add(kg);
}
r.close();
if (OUT != null) {
LOG.info("opening " + OUT);
pw = new PrintWriter(OUT);
}
LOG.info("knownGenes: " + this.prot2genes.size());
XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
xmlInputFactory.setXMLResolver(new XMLResolver() {
@Override
public Object resolveEntity(String arg0, String arg1, String arg2, String arg3) throws XMLStreamException {
LOG.info("resolveEntity:" + arg0 + "/" + arg1 + "/" + arg2);
return null;
}
});
// SortingCollection<UBed> mappedFeatures=SortingCollection.newInstance(UBed.class, new UBedCodec(),new UBedCmp(),super.MAX_RECORDS_IN_RAM);
// mappedFeatures.setDestructiveIteration(true);
LOG.info("Scanning " + UNIPROT);
Reader fr = IOUtils.openURIForBufferedReading(UNIPROT);
XMLEventReader rx = xmlInputFactory.createXMLEventReader(fr);
final QName uEntry = new QName(UNIPROT_NS, "entry");
GenomicSequence genomic = null;
while (rx.hasNext()) {
XMLEvent evt = rx.peek();
if (!(evt.isStartElement() && evt.asStartElement().getName().equals(uEntry))) {
rx.next();
continue;
}
JAXBElement<Entry> jaxbElement = uniprotUnmarshaller.unmarshal(rx, Entry.class);
Entry entry = jaxbElement.getValue();
if (entry.getFeature().isEmpty())
continue;
List<KnownGene> genes = null;
for (String acn : entry.getAccession()) {
genes = prot2genes.get(acn);
if (genes != null)
break;
}
if (genes == null)
continue;
for (KnownGene kg : genes) {
if (genomic == null || !genomic.getChrom().equals(kg.getChromosome())) {
genomic = new GenomicSequence(this.indexedFastaSequenceFile, kg.getChromosome());
}
KnownGene.Peptide pep = kg.getCodingRNA(genomic).getPeptide();
int sameSequenceLength = 0;
while (sameSequenceLength < entry.getSequence().getValue().length() && sameSequenceLength < pep.length()) {
if (Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength)) != Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength))) {
break;
}
sameSequenceLength++;
}
if (sameSequenceLength != pep.length()) {
System.err.println("Not Same sequence " + kg.getName() + " strand " + kg.getStrand() + " ok-up-to-" + sameSequenceLength);
System.err.println("P:" + pep.toString() + " " + pep.length());
System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
if (pep.toString().contains("?")) {
System.err.println("RNA:" + pep.getCodingRNA().toString());
}
}
if (sameSequenceLength == 0)
continue;
for (FeatureType feat : entry.getFeature()) {
if (feat.getType() == null || feat.getType().isEmpty())
continue;
LocationType locType = feat.getLocation();
if (locType == null)
continue;
int pepStart, pepEnd;
if (locType.getPosition() != null && locType.getPosition().getPosition() != null) {
pepEnd = locType.getPosition().getPosition().intValue();
pepStart = pepEnd - 1;
} else if (locType.getBegin() != null && locType.getEnd() != null && locType.getBegin().getPosition() != null && locType.getEnd().getPosition() != null) {
pepStart = locType.getBegin().getPosition().intValue() - 1;
pepEnd = locType.getEnd().getPosition().intValue();
} else {
continue;
}
if (pepEnd >= sameSequenceLength)
continue;
List<Integer> genomicPos = new ArrayList<Integer>();
while (pepStart < pepEnd) {
if (pepStart >= pep.length()) {
System.err.println();
System.err.println("P:" + pep.toString() + " " + pep.length() + " " + kg.getStrand());
System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
}
int[] codon = pep.convertToGenomicCoordinates(pepStart);
pepStart++;
for (int gP : codon) {
if (gP == -1) {
uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
LOG.error("error in genomoc for (" + pepStart + "/" + pepEnd + "):" + entry.getName());
System.exit(-1);
}
genomicPos.add(gP);
}
}
Collections.sort(genomicPos);
UBed ubed = new UBed();
ubed.tid = this.indexedFastaSequenceFile.getSequenceDictionary().getSequenceIndex(genomic.getChrom());
int k0 = 0;
while (k0 < genomicPos.size()) {
Range range = new Range(genomicPos.get(k0), genomicPos.get(k0) + 1);
k0++;
while (k0 < genomicPos.size() && range.end == genomicPos.get(k0)) {
range = new Range(range.start, range.end + 1);
++k0;
}
ubed.positions.add(range);
}
ubed.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
ubed.name = feat.getType();
ubed.print(pw);
}
}
}
rx.close();
fr.close();
LOG.info("End scan uniprot");
/*
mappedFeatures.doneAdding();
CloseableIterator<UBed> iter=mappedFeatures.iterator();
while(iter.hasNext())
{
UBed ubed=iter.next();
ubed.print();
}
mappedFeatures.cleanup();
*/
} catch (Exception err) {
err.printStackTrace();
if (OUT != null)
OUT.deleteOnExit();
return -1;
} finally {
pw.flush();
pw.close();
CloserUtil.close(this.indexedFastaSequenceFile);
}
return 0;
}
use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.
the class VcfToBam method run.
private void run(VcfIterator vcfIterator) throws IOException {
long id_generator = 0L;
SAMFileWriter samFileWriter = null;
final VCFHeader header = vcfIterator.getHeader();
SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict == null)
throw new IOException("Sequence Dictionary missing in VCF");
if (!SequenceUtil.areSequenceDictionariesEqual(dict, indexedFastaSequenceFile.getSequenceDictionary())) {
LOG.warning("REF/VCF: not the same Sequence dictonary " + dict + " " + indexedFastaSequenceFile.getSequenceDictionary());
}
final SAMFileHeader samHeader = new SAMFileHeader();
samHeader.setSequenceDictionary(dict);
for (final String sample : new TreeSet<>(header.getSampleNamesInOrder())) {
final SAMReadGroupRecord rg = new SAMReadGroupRecord(sample);
rg.setSample(sample);
rg.setLibrary(sample);
rg.setDescription(sample);
rg.setLibrary("illumina");
samHeader.addReadGroup(rg);
}
samHeader.addComment("Generated with " + getProgramCommandLine());
samHeader.setSortOrder(SortOrder.unsorted);
samFileWriter = this.writingBamArgs.setReferenceFile(this.faidx).openSAMFileWriter(this.outputFile, samHeader, true);
/* looping over sequences */
for (final SAMSequenceRecord ssr : dict.getSequences()) {
final LinkedList<VariantContext> variantBuffer = new LinkedList<>();
GenomicSequence genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName());
int x = 1;
while (x + this.fragmentSize <= ssr.getSequenceLength()) {
// pop on left
while (!variantBuffer.isEmpty() && (variantBuffer.getFirst().getStart() + this.fragmentSize * 2) < x) {
variantBuffer.removeFirst();
}
// refill buffer
while (vcfIterator.hasNext()) {
// buffer is already by enough
if (!variantBuffer.isEmpty() && variantBuffer.getLast().getStart() > x + 2 * fragmentSize) {
break;
}
final VariantContext ctx = vcfIterator.peek();
if (ctx == null)
break;
if (ctx.isIndel() || !ctx.isVariant()) {
LOG.warning("Cannot handle " + ctx.getReference() + "/" + ctx.getAlternateAlleles());
// consumme
vcfIterator.next();
continue;
}
final SAMSequenceRecord variantContig = dict.getSequence(ctx.getContig());
if (variantContig == null)
throw new IOException("Unknown contig " + ctx.getContig());
if (variantContig.getSequenceIndex() < ssr.getSequenceIndex()) {
// just consumme !
// https://github.com/lindenb/jvarkit/issues/86#issuecomment-326986654
// consumme
vcfIterator.next();
continue;
} else if (variantContig.getSequenceIndex() > ssr.getSequenceIndex()) {
break;
} else {
variantBuffer.add(vcfIterator.next());
}
}
if (variantBuffer.isEmpty()) {
LOG.info("no variant ?");
// no variant on this chromosome
break;
}
if (x < variantBuffer.getFirst().getStart() - 2 * fragmentSize) {
x = variantBuffer.getFirst().getStart() - 2 * fragmentSize;
}
for (int depth = 0; depth < 1; ++depth) {
for (String sample : header.getSampleNamesInOrder()) {
// loop over genomic strand
for (int g_strand = 0; g_strand < 2; ++g_strand) {
SAMRecord[] records = { new SAMRecord(samHeader), new SAMRecord(samHeader) };
String readName = String.format("%010d", ++id_generator);
for (int R = 0; R < 2; ++R) {
records[R].setReferenceName(ssr.getSequenceName());
records[R].setReadPairedFlag(true);
records[R].setReadName(readName);
records[R].setMappingQuality(60);
records[R].setProperPairFlag(true);
records[R].setMateReferenceName(ssr.getSequenceName());
records[R].setMateUnmappedFlag(false);
records[R].setReadUnmappedFlag(false);
records[R].setAttribute("RG", sample);
records[R].setReadNegativeStrandFlag(R == 1);
StringBuilder sequence = new StringBuilder(this.readSize);
int readLen = 0;
int refPos1 = (R == 0 ? x : x + this.fragmentSize - this.readSize);
records[R].setAlignmentStart(refPos1);
List<CigarElement> cigarElements = new ArrayList<>(this.readSize);
int NM = 0;
while (readLen < this.readSize) {
String base = null;
VariantContext overlap = null;
for (VariantContext vc : variantBuffer) {
int d = vc.getStart() - (refPos1 + readLen);
if (d == 0) {
overlap = vc;
break;
}
if (d > 0)
break;
}
if (overlap != null) {
Genotype genotype = overlap.getGenotype(sample);
if (genotype.isCalled() && !genotype.isMixed()) {
List<Allele> alleles = genotype.getAlleles();
if (alleles.size() != 2)
throw new RuntimeException("Not a diploid organism.");
Allele allele = null;
if (genotype.isPhased()) {
allele = alleles.get(g_strand);
} else {
allele = alleles.get(Math.random() < 0.5 ? 0 : 1);
}
if (allele.isSymbolic())
throw new RuntimeException("Cannot handle symbolic alleles");
if (allele.isReference()) {
cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.EQ));
} else if (allele.getBaseString().length() < overlap.getReference().getBaseString().length()) {
cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.D));
NM++;
} else if (allele.getBaseString().length() > overlap.getReference().getBaseString().length()) {
cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.I));
NM++;
} else // same size
{
cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.X));
NM++;
}
base = allele.getBaseString().toLowerCase();
}
}
if (base == null) {
base = String.valueOf(genomicSequence.charAt(refPos1 - 1 + readLen));
cigarElements.add(new CigarElement(1, CigarOperator.EQ));
}
sequence.append(base);
readLen += base.length();
}
while (sequence.length() > this.readSize) sequence.deleteCharAt(sequence.length() - 1);
records[R].setReadString(sequence.toString());
StringBuilder qual = new StringBuilder(sequence.length());
for (int i = 0; i < sequence.length(); ++i) qual.append("I");
records[R].setBaseQualityString(qual.toString());
// cigar
int c = 0;
while (c + 1 < cigarElements.size()) {
CigarElement c0 = cigarElements.get(c);
CigarElement c1 = cigarElements.get(c + 1);
if (c0.getOperator().equals(c1.getOperator())) {
cigarElements.set(c, new CigarElement(c0.getLength() + c1.getLength(), c0.getOperator()));
cigarElements.remove(c + 1);
} else {
++c;
}
}
records[R].setCigar(new Cigar(cigarElements));
records[R].setAttribute("NM", NM);
}
if (Math.random() < 0.5) {
records[0].setFirstOfPairFlag(true);
records[0].setSecondOfPairFlag(false);
records[1].setFirstOfPairFlag(false);
records[1].setSecondOfPairFlag(true);
} else {
records[0].setFirstOfPairFlag(false);
records[0].setSecondOfPairFlag(true);
records[1].setFirstOfPairFlag(true);
records[1].setSecondOfPairFlag(false);
}
records[0].setMateAlignmentStart(records[1].getAlignmentStart());
records[1].setMateAlignmentStart(records[0].getAlignmentStart());
records[0].setInferredInsertSize(records[1].getAlignmentStart() - records[0].getAlignmentStart());
records[1].setInferredInsertSize(records[0].getAlignmentStart() - records[1].getAlignmentStart());
samFileWriter.addAlignment(records[0]);
samFileWriter.addAlignment(records[1]);
}
}
}
++x;
}
}
samFileWriter.close();
}
use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.
the class NoZeroVariationVCF method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
final VCFHeader header = in.getHeader();
if (in.hasNext()) {
LOG.info("found a variant in VCF.");
VCFUtils.copyHeaderAndVariantsTo(in, out);
} else {
LOG.info("no a variant in VCF. Creating a fake Variant");
header.addMetaDataLine(new VCFFilterHeaderLine("FAKESNP", "Fake SNP created because vcf input was empty."));
VCFFormatHeaderLine gtHeaderLine = header.getFormatHeaderLine(VCFConstants.GENOTYPE_KEY);
if (gtHeaderLine == null) {
LOG.info("Adding GT to header");
header.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
}
gtHeaderLine = header.getFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY);
if (gtHeaderLine == null) {
LOG.info("Adding GQ to header");
header.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality"));
}
out.writeHeader(header);
SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
// choose random chrom, best is 'random' , but not 1...23,X,Y, etc...
String chrom = dict.getSequence(0).getSequenceName();
for (final SAMSequenceRecord ssr : dict.getSequences()) {
String ssn = ssr.getSequenceName();
if (ssn.contains("_")) {
chrom = ssn;
break;
}
}
for (final SAMSequenceRecord ssr : dict.getSequences()) {
String ssn = ssr.getSequenceName();
if (ssn.toLowerCase().contains("random")) {
chrom = ssn;
break;
}
if (ssn.toLowerCase().contains("gl")) {
chrom = ssn;
break;
}
}
GenomicSequence gseq = new GenomicSequence(this.indexedFastaSequenceFile, chrom);
char ref = 'N';
char alt = 'N';
int POS = 0;
for (POS = 0; POS < gseq.length(); ++POS) {
ref = Character.toUpperCase(gseq.charAt(POS));
if (ref == 'N')
continue;
switch(ref) {
case 'A':
alt = 'T';
break;
case 'T':
alt = 'G';
break;
case 'G':
alt = 'C';
break;
case 'C':
alt = 'A';
break;
default:
break;
}
if (alt == 'N')
continue;
break;
}
if (alt == 'N')
throw new RuntimeException("found only N");
VariantContextBuilder vcb = new VariantContextBuilder();
Allele a1 = Allele.create((byte) ref, true);
Allele a2 = Allele.create((byte) alt, false);
List<Allele> la1a2 = new ArrayList<Allele>(2);
List<Genotype> genotypes = new ArrayList<Genotype>(header.getSampleNamesInOrder().size());
la1a2.add(a1);
la1a2.add(a2);
vcb.chr(gseq.getChrom());
vcb.start(POS + 1);
vcb.stop(POS + 1);
vcb.filter("FAKESNP");
vcb.alleles(la1a2);
vcb.log10PError(-0.1);
for (String sample : header.getSampleNamesInOrder()) {
final GenotypeBuilder gb = new GenotypeBuilder(sample);
if (genotypes.isEmpty()) {
gb.DP(1);
gb.GQ(1);
gb.alleles(la1a2);
gb.noAD();
gb.noPL();
genotypes.add(gb.make());
} else {
genotypes.add(GenotypeBuilder.createMissing(sample, 2));
}
}
vcb.genotypes(genotypes);
vcb.noID();
out.add(vcb.make());
}
return 0;
}
Aggregations