Use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
The class ExtendReferenceWithReads, method doWork.
@Override
public int doWork(List<String> args) {
    if (this.faidx == null) {
        LOG.error("No REF defined");
        return -1;
    }
    this.samReaders.clear();
    PrintStream out = null;
    try {
        this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(faidx);
        SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
        if (dict == null) {
            LOG.error("Reference file is missing a sequence dictionary (use picard)");
            return -1;
        }
        final SamReaderFactory srf = super.createSamReaderFactory();
        srf.setOption(Option.CACHE_FILE_BASED_INDEXES, true);
        for (final String uri : IOUtils.unrollFiles(args)) {
            LOG.info("opening BAM " + uri);
            final SamReader sr = srf.open(SamInputResource.of(uri));
            /* doesn't work with remote ?? */
            if (!sr.hasIndex()) {
                LOG.error("file " + uri + " is not indexed");
                return -1;
            }
            final SAMFileHeader header = sr.getFileHeader();
            if (!header.getSortOrder().equals(SortOrder.coordinate)) {
                LOG.error("file " + uri + " not sorted on coordinate but " + header.getSortOrder());
                return -1;
            }
            final SAMSequenceDictionary dict2 = header.getSequenceDictionary();
            if (dict2 == null) {
                LOG.error("file " + uri + " needs a sequence dictionary");
                return -1;
            }
            samReaders.add(sr);
        }
        if (samReaders.isEmpty()) {
            LOG.error("No BAM defined");
            return -1;
        }
        out = super.openFileOrStdoutAsPrintStream(this.outputFile);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            final GenomicSequence genomic = new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName());
            int chromStart = 0;
            int nPrinted = 0;
            out.print(">");
            out.print(ssr.getSequenceName());
            for (final Rescue side : Rescue.values()) {
                switch (side) {
                    case LEFT:
                        /* look before position 0 of the chromosome */
                        {
                            final Map<Integer, Counter<Byte>> pos2bases = scanRegion(ssr, -1, -1, side);
                            int newstart = 0;
                            for (final Integer pos : pos2bases.keySet()) {
                                if (pos >= 0)
                                    continue;
                                newstart = Math.min(newstart, pos);
                            }
                            while (newstart < 0) {
                                final Counter<Byte> count = pos2bases.get(newstart);
                                if (nPrinted % 60 == 0)
                                    out.println();
                                out.print(consensus(count));
                                newstart++;
                                ++nPrinted;
                            }
                            break;
                        }
                    case RIGHT:
                        /* look after position > length(chromosome) */
                        {
                            final Map<Integer, Counter<Byte>> pos2bases = this.scanRegion(ssr, -1, -1, side);
                            int newend = ssr.getSequenceLength();
                            for (final Integer pos : pos2bases.keySet()) {
                                if (pos < ssr.getSequenceLength())
                                    continue;
                                newend = Math.max(newend, pos + 1);
                            }
                            for (int i = ssr.getSequenceLength(); i < newend; i++) {
                                final Counter<Byte> count = pos2bases.get(i);
                                if (nPrinted % 60 == 0)
                                    out.println();
                                out.print(consensus(count));
                                ++nPrinted;
                            }
                            break;
                        }
                    case CENTER:
                        /* 0 to chromosome-length */
                        {
                            chromStart = 0;
                            while (chromStart < genomic.length()) {
                                final char base = Character.toUpperCase(genomic.charAt(chromStart));
                                if (base != 'N') {
                                    progress.watch(ssr.getSequenceName(), chromStart);
                                    if (nPrinted % 60 == 0)
                                        out.println();
                                    out.print(base);
                                    ++chromStart;
                                    ++nPrinted;
                                    continue;
                                }
                                int chromEnd = chromStart + 1;
                                while (chromEnd < genomic.length() && Character.toUpperCase(genomic.charAt(chromEnd)) == 'N') {
                                    ++chromEnd;
                                }
                                /* short N runs are copied as-is; only longer gaps are rescued from the reads */
                                if (chromEnd - chromStart < this.minLenNNNNContig) {
                                    while (chromStart < chromEnd) {
                                        progress.watch(ssr.getSequenceName(), chromStart);
                                        if (nPrinted % 60 == 0)
                                            out.println();
                                        out.print(base);
                                        ++chromStart;
                                        ++nPrinted;
                                    }
                                    continue;
                                }
                                final Map<Integer, Counter<Byte>> pos2bases = scanRegion(ssr, chromStart, chromEnd, side);
                                while (chromStart < chromEnd) {
                                    final Counter<Byte> count = pos2bases.get(chromStart);
                                    if (nPrinted % 60 == 0)
                                        out.println();
                                    if (count == null) {
                                        out.print('N');
                                    } else {
                                        out.print(consensus(count));
                                    }
                                    ++chromStart;
                                    ++nPrinted;
                                }
                            }
                            break;
                        }
                }
                // end switch side
            }
            out.println();
        }
        progress.finish();
        out.flush();
        out.close();
        return RETURN_OK;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(this.indexedFastaSequenceFile);
        for (final SamReader r : samReaders) CloserUtil.close(r);
        samReaders.clear();
    }
}
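The consensus(..) helper called throughout this method is not part of the excerpt. Below is a minimal sketch of one plausible implementation, assuming jvarkit's Counter class exposes isEmpty() and getMostFrequent() (method names assumed here, not confirmed by the excerpt): emit the most frequent observed base, lower-cased so rescued bases stay distinguishable from the reference.

/* Hypothetical sketch only; the real jvarkit helper may differ. */
private char consensus(final Counter<Byte> count) {
    // no read covered this position: emit an 'n'
    if (count == null || count.isEmpty())
        return 'n';
    // most frequent observed base, lower-cased to mark it as rescued
    return Character.toLowerCase((char) count.getMostFrequent().byteValue());
}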
Use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
The class BamToFastq, method doWork.
@Override
public int doWork(List<String> args) {
    SamReader sfr = null;
    SortingCollection<MappedFastq> fastqCollection = null;
    try {
        boolean found_single = false;
        boolean found_paired = false;
        long non_primary_alignmaned_flag = 0L;
        sfr = super.openSamReader(oneFileOrNull(args));
        fastqCollection = SortingCollection.newInstance(MappedFastq.class, new MappedFastqCodec(), new MappedFastqComparator(), this.maxRecordsInRam, this.tmpDir.toPath());
        fastqCollection.setDestructiveIteration(true);
        SAMRecordIterator iter = sfr.iterator();
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(sfr.getFileHeader().getSequenceDictionary());
        while (iter.hasNext()) {
            final SAMRecord rec = progress.watch(iter.next());
            if (rec.isSecondaryOrSupplementary()) {
                // warn only once, on the first non-primary record
                if (non_primary_alignmaned_flag == 0) {
                    LOG.warn("SKIPPING NON-PRIMARY " + (non_primary_alignmaned_flag + 1) + " ALIGNMENTS");
                }
                non_primary_alignmaned_flag++;
                continue;
            }
            MappedFastq m = new MappedFastq();
            m.name = rec.getReadName();
            if (m.name == null)
                m.name = "";
            m.seq = rec.getReadString();
            if (m.seq.equals(SAMRecord.NULL_SEQUENCE_STRING))
                m.seq = "";
            m.qual = rec.getBaseQualityString();
            if (m.qual.equals(SAMRecord.NULL_QUALS_STRING))
                m.qual = "";
            if (!rec.getReadUnmappedFlag() && rec.getReadNegativeStrandFlag()) {
                m.seq = AcidNucleics.reverseComplement(m.seq);
                m.qual = new StringBuilder(m.qual).reverse().toString();
            }
            if (m.seq.length() != m.qual.length()) {
                LOG.error("length(seq)!=length(qual) in " + m.name);
                continue;
            }
            if (m.seq.isEmpty() && m.qual.isEmpty()) {
                m.seq = "N";
                m.qual = "#";
            }
            if (rec.getReadPairedFlag()) {
                found_paired = true;
                if (found_single) {
                    sfr.close();
                    throw new RuntimeException("input is a mix of paired/single reads");
                }
                m.side = (byte) (rec.getSecondOfPairFlag() ? 2 : 1);
            } else {
                found_single = true;
                if (found_paired) {
                    sfr.close();
                    throw new RuntimeException("input is a mix of paired/single reads");
                }
                m.side = (byte) 0;
            }
            fastqCollection.add(m);
        }
        iter.close();
        CloserUtil.close(iter);
        CloserUtil.close(sfr);
        progress.finish();
        fastqCollection.doneAdding();
        LOG.info("Done reading.");
        if (found_paired) {
            FastqWriter fqw1 = null;
            FastqWriter fqw2 = null;
            if (forwardFile != null) {
                LOG.info("Writing to " + forwardFile);
                fqw1 = new BasicFastqWriter(forwardFile);
            } else {
                LOG.info("Writing to stdout");
                fqw1 = new BasicFastqWriter(new PrintStream(stdout()));
            }
            if (reverseFile != null) {
                LOG.info("Writing to " + reverseFile);
                fqw2 = new BasicFastqWriter(reverseFile);
            } else {
                LOG.info("Writing to interlaced stdout");
                fqw2 = fqw1;
            }
            List<MappedFastq> row = new ArrayList<MappedFastq>();
            CloseableIterator<MappedFastq> r = fastqCollection.iterator();
            for (;;) {
                MappedFastq curr = null;
                if (r.hasNext())
                    curr = r.next();
                // flush the current group when the read name changes or input ends
                if (curr == null || (!row.isEmpty() && !row.get(0).name.equals(curr.name))) {
                    if (!row.isEmpty()) {
                        if (row.size() > 2) {
                            LOG.warn("WTF :" + row);
                        }
                        boolean found_F = false;
                        boolean found_R = false;
                        for (MappedFastq m : row) {
                            switch ((int) m.side) {
                                case 1:
                                    if (found_F)
                                        throw new RuntimeException("two forward reads found for " + row.get(0).name);
                                    found_F = true;
                                    echo(fqw1, m);
                                    break;
                                case 2:
                                    if (found_R)
                                        throw new RuntimeException("two reverse reads found for " + row.get(0).name);
                                    found_R = true;
                                    echo(fqw2, m);
                                    break;
                                default:
                                    throw new IllegalStateException("uh???");
                            }
                        }
                        if (!found_F) {
                            if (this.repair_missing_read) {
                                LOG.warn("forward not found for " + row.get(0));
                                MappedFastq pad = new MappedFastq();
                                pad.side = (byte) 1;
                                pad.name = row.get(0).name;
                                pad.seq = "N";
                                pad.qual = "#";
                                echo(fqw1, pad);
                            } else {
                                throw new RuntimeException("forward not found for " + row);
                            }
                        }
                        if (!found_R) {
                            if (repair_missing_read) {
                                LOG.warn("reverse not found for " + row.get(0));
                                MappedFastq pad = new MappedFastq();
                                pad.side = (byte) 2;
                                pad.name = row.get(0).name;
                                pad.seq = "N";
                                pad.qual = "#";
                                echo(fqw2, pad);
                            } else {
                                throw new RuntimeException("reverse not found for " + row);
                            }
                        }
                    }
                    if (curr == null)
                        break;
                    row.clear();
                }
                row.add(curr);
            }
            r.close();
            fqw1.close();
            fqw2.close();
        } else if (found_single) {
            FastqWriter fqw1 = null;
            if (forwardFile != null) {
                LOG.info("Writing to " + forwardFile);
                fqw1 = new BasicFastqWriter(forwardFile);
            } else {
                LOG.info("Writing to stdout");
                fqw1 = new BasicFastqWriter(new PrintStream(stdout()));
            }
            final CloseableIterator<MappedFastq> r = fastqCollection.iterator();
            while (r.hasNext()) {
                echo(fqw1, r.next());
            }
            r.close();
            fqw1.close();
        }
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        if (fastqCollection != null)
            fastqCollection.cleanup();
    }
}
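The echo(..) helper that writes a MappedFastq record is also absent from this excerpt. A minimal sketch as it might slot into BamToFastq, assuming htsjdk's four-argument FastqRecord constructor and a /1, /2 read-name suffix for the paired sides (the suffix convention is an assumption, not taken from the source):

/* Hypothetical sketch only; the real jvarkit helper may differ. */
private void echo(final FastqWriter fqw, final MappedFastq m) {
    // side 0 = single-end; 1/2 = first/second of pair
    final String name = (m.side == 0 ? m.name : m.name + "/" + (int) m.side);
    // FastqRecord(readName, bases, qualityHeaderPrefix, qualities)
    fqw.write(new FastqRecord(name, m.seq, "", m.qual));
}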
Use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
The class BamTreePack, method scan.
private void scan(final SamReader sfr) {
    super.bindings.put("header", sfr.getFileHeader());
    final SAMRecordIterator iter = sfr.iterator();
    final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(sfr.getFileHeader());
    while (iter.hasNext()) {
        final SAMRecord rec = progress.watch(iter.next());
        super.bindings.put("record", rec);
        super.nodeFactoryChain.watch(rootNode, rec);
    }
    progress.finish();
}
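BamTreePack.scan shows the class at its most distilled: build the tracker from the header, wrap each record in watch(..), and call finish() when iteration ends. The following self-contained sketch shows that pattern against a placeholder path, input.bam:

import java.io.File;

import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;

public class ProgressDemo {
    public static void main(final String[] args) throws Exception {
        // "input.bam" is a placeholder path for this sketch
        try (final SamReader reader = SamReaderFactory.makeDefault().open(new File("input.bam"));
             final SAMRecordIterator iter = reader.iterator()) {
            final SAMSequenceDictionaryProgress progress =
                    new SAMSequenceDictionaryProgress(reader.getFileHeader());
            while (iter.hasNext()) {
                // watch(..) returns its argument, so it drops into an existing loop
                final SAMRecord rec = progress.watch(iter.next());
                // ... process rec ...
            }
            // log the final record count
            progress.finish();
        }
    }
}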
Use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
The class VcfToSql, method read.
private void read(File filename) throws IOException {
    /* insert ATGC */
    this.alleleTable.insert(outputWriter, null, "A");
    this.alleleTable.insert(outputWriter, null, "C");
    this.alleleTable.insert(outputWriter, null, "G");
    this.alleleTable.insert(outputWriter, null, "T");
    /* insert this sample */
    this.vcfFileTable.insert(outputWriter, null, filename);
    final SelectStmt vcffile_id = new SelectStmt(this.vcfFileTable);
    final Map<String, SelectStmt> sample2sampleid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> filter2filterid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> chrom2chromId = new HashMap<String, SelectStmt>();
    final VcfIterator r = VCFUtils.createVcfIteratorFromFile(filename);
    final VCFHeader header = r.getHeader();
    /* parse samples */
    for (final String sampleName : header.getSampleNamesInOrder()) {
        this.sampleTable.insert(outputWriter, null, sampleName);
        SelectStmt sample_id = new SelectStmt(this.sampleTable, "name", sampleName);
        sample2sampleid.put(sampleName, sample_id);
        this.sample2fileTable.insert(outputWriter, null, vcffile_id, sample_id);
    }
    /* parse filters */
    for (final VCFFilterHeaderLine filter : header.getFilterLines()) {
        this.filterTable.insert(outputWriter, null, vcffile_id, filter.getID(), filter.getValue());
        filter2filterid.put(filter.getID(), new SelectStmt(this.filterTable, "name", filter.getID()));
    }
    filter2filterid.put(VCFConstants.PASSES_FILTERS_v4, new SelectStmt(this.filterTable, "name", VCFConstants.PASSES_FILTERS_v4));
    final SAMSequenceDictionary dict = header.getSequenceDictionary();
    if (dict == null) {
        throw new RuntimeException("dictionary missing in VCF");
    }
    /* parse sequence dict */
    for (final SAMSequenceRecord ssr : dict.getSequences()) {
        this.chromosomeTable.insert(outputWriter, null, vcffile_id, ssr.getSequenceName(), ssr.getSequenceLength());
        chrom2chromId.put(ssr.getSequenceName(), new SelectStmt(this.chromosomeTable, "name", ssr.getSequenceName()));
    }
    VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
    SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
    int nVariants = 0;
    while (r.hasNext()) {
        if (this.outputWriter.checkError())
            break;
        VariantContext var = progress.watch(r.next());
        ++nVariants;
        /* insert ref allele */
        this.alleleTable.insert(outputWriter, null, var.getReference().getBaseString());
        /* insert variant */
        this.variantTable.insert(outputWriter, null, vcffile_id, nVariants,
                chrom2chromId.get(var.getContig()), var.getStart(),
                (var.hasID() ? var.getID() : null),
                new SelectStmt(this.alleleTable, "bases", var.getReference().getBaseString()),
                (var.hasLog10PError() ? var.getPhredScaledQual() : null));
        SelectStmt variant_id = new SelectStmt(variantTable);
        /* insert alternate alleles */
        for (Allele alt : var.getAlternateAlleles()) {
            /* insert alt allele */
            this.alleleTable.insert(outputWriter, null, alt.getBaseString());
            this.variant2altTable.insert(outputWriter, null, variant_id, new SelectStmt(this.alleleTable, "bases", alt.getBaseString()));
        }
        /* insert filters */
        for (final String filter : var.getFilters()) {
            if (filter2filterid.get(filter) == null) {
                throw new IOException("VCF Error: filter " + filter + " is not defined in the VCF header.");
            }
            this.variant2filters.insert(outputWriter, null, variant_id, filter2filterid.get(filter));
        }
        if (!this.ignore_info) {
            for (final VepPrediction pred : vepPredictionParser.getPredictions(var)) {
                /*
                vepPrediction.insert(outputWriter, null, variant_id,
                        pred.getEnsemblGene(), pred.getEnsemblTranscript(),
                        pred.getEnsemblProtein(), pred.getSymbol());
                SelectStmt pred_id = new SelectStmt(vepPrediction);
                for (SequenceOntologyTree.Term t : pred.getSOTerms()) {
                    String term = t.getAcn().replace(':', '_');
                    // for bioportal compatibility
                    soTermTable.insert(outputWriter, null, term, t.getAcn());
                    SelectStmt term_id = new SelectStmt(soTermTable, "acn", term);
                    vepPrediction2so.insert(outputWriter, null, pred_id, term_id);
                }
                */
            }
        }
        /* insert genotypes */
        for (final String sampleName : sample2sampleid.keySet()) {
            final Genotype g = var.getGenotype(sampleName);
            if (!g.isAvailable() || g.isNoCall())
                continue;
            genotypeTable.insert(outputWriter, null, variant_id, sample2sampleid.get(sampleName),
                    g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(0).getBaseString()) : null,
                    g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(1).getBaseString()) : null,
                    g.hasDP() ? g.getDP() : null,
                    g.hasGQ() ? g.getGQ() : null);
        }
    }
    r.close();
}
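Note that watch(..) is overloaded: these excerpts pass it SAMRecord objects, a contig name plus position (ExtendReferenceWithReads), and, as here, VariantContext objects. A minimal VCF-side sketch of the same pattern, using htsjdk's VCFFileReader on a placeholder, indexed input.vcf.gz:

import java.io.File;

import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress;

import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;

public class VcfProgressDemo {
    public static void main(final String[] args) throws Exception {
        // "input.vcf.gz" is a placeholder; the file needs an index and ##contig header lines
        try (final VCFFileReader reader = new VCFFileReader(new File("input.vcf.gz"))) {
            final SAMSequenceDictionaryProgress progress =
                    new SAMSequenceDictionaryProgress(reader.getFileHeader().getSequenceDictionary());
            for (final VariantContext ctx : reader) {
                progress.watch(ctx);
                // ... process ctx ...
            }
            progress.finish();
        }
    }
}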
Use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
The class Vcf2Xml, method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator iterin, final VariantContextWriter out) {
    final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(iterin.getHeader());
    out.writeHeader(iterin.getHeader());
    while (iterin.hasNext()) {
        out.add(progress.watch(iterin.next()));
    }
    progress.finish();
    return 0;
}