use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class ConvertBamChromosomes method doWork.
@Override
public int doWork(List<String> args) {
SamReader sfr = null;
SAMFileWriter sfw = null;
final Set<String> unmappedChromosomes = new HashSet<>();
try {
final ContigNameConverter customMapping = ContigNameConverter.fromFile(mappingFile);
customMapping.setOnNotFound(this.onNotFound);
sfr = super.openSamReader(oneFileOrNull(args));
SAMFileHeader header1 = sfr.getFileHeader();
if (header1 == null) {
LOG.error("File header missing");
return -1;
}
SAMFileHeader header2 = header1.clone();
// create new sequence dict
final SAMSequenceDictionary dict1 = header1.getSequenceDictionary();
if (dict1 == null) {
LOG.error("Sequence dict missing");
return -1;
}
final List<SAMSequenceRecord> ssrs = new ArrayList<SAMSequenceRecord>(dict1.size());
for (int i = 0; i < dict1.size(); ++i) {
SAMSequenceRecord ssr = dict1.getSequence(i);
String newName = customMapping.apply(ssr.getSequenceName());
if (newName == null) {
// unknown chromosome: remember it for the final report, then skip it
unmappedChromosomes.add(ssr.getSequenceName());
continue;
}
ssr = new SAMSequenceRecord(newName, ssr.getSequenceLength());
ssrs.add(ssr);
}
final SAMSequenceDictionary dict2 = new SAMSequenceDictionary(ssrs);
header2.setSequenceDictionary(dict2);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict1);
sfw = this.writingBamArgs.openSAMFileWriter(output, header2, true);
long num_ignored = 0L;
SAMRecordIterator iter = sfr.iterator();
while (iter.hasNext()) {
SAMRecord rec1 = iter.next();
progress.watch(rec1);
String newName1 = null;
String newName2 = null;
if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getReferenceName())) {
newName1 = customMapping.apply(rec1.getReferenceName());
}
if (rec1.getReadPairedFlag() && !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getMateReferenceName())) {
newName2 = customMapping.apply(rec1.getMateReferenceName());
}
rec1.setHeader(header2);
if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getReferenceName())) {
if (newName1 == null) {
++num_ignored;
continue;
}
rec1.setReferenceName(newName1);
}
if (rec1.getReadPairedFlag() && !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getMateReferenceName())) {
if (newName2 == null) {
++num_ignored;
continue;
}
rec1.setMateReferenceName(newName2);
}
sfw.addAlignment(rec1);
}
if (!unmappedChromosomes.isEmpty()) {
LOG.warning("Unmapped chromosomes: " + unmappedChromosomes);
}
LOG.warning("num ignored read:" + num_ignored);
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(sfr);
CloserUtil.close(sfw);
}
}
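All of the snippets on this page share the same progress-logging pattern. Here is a minimal, self-contained sketch of that pattern, assuming only htsjdk and the jvarkit class named above (the input file and the processing step are illustrative placeholders):

import java.io.File;
import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;

public class ProgressSketch {
    public static void main(final String[] args) throws Exception {
        try (SamReader reader = SamReaderFactory.makeDefault().open(new File(args[0]))) {
            // the monitor logs throughput against the positions in the sequence dictionary
            final SAMSequenceDictionaryProgress progress =
                new SAMSequenceDictionaryProgress(reader.getFileHeader().getSequenceDictionary());
            try (SAMRecordIterator iter = reader.iterator()) {
                while (iter.hasNext()) {
                    final SAMRecord rec = progress.watch(iter.next());
                    // ... process rec ...
                }
            }
            progress.finish();
        }
    }
}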
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class CoverageNormalizer method run.
private int run(SamReader sfr) throws IOException {
SAMSequenceDictionary dictionary = sfr.getFileHeader().getSequenceDictionary();
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dictionary);
File tmpFile1 = File.createTempFile("cov_", ".dat.gz", this.writingSortingCollection.getTmpDirectories().get(0));
tmpFile1.deleteOnExit();
LOG.info("Opening tmp File " + tmpFile1);
GZIPOutputStream gos = null;
DataInputStream dis = null;
SortingCollection<Float> median = null;
try {
gos = new GZIPOutputStream(new FileOutputStream(tmpFile1));
DataOutputStream daos = new DataOutputStream(gos);
SAMRecordIterator iter = sfr.iterator();
int curr_tid = -1;
short[] array = null;
float minCov = Float.MAX_VALUE;
float maxCov = 0;
int[] num_written = new int[dictionary.size()];
Arrays.fill(num_written, 0);
for (; ; ) {
SAMRecord rec = null;
if (iter.hasNext()) {
rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
progress.watch(rec);
if (rec.isSecondaryOrSupplementary())
continue;
}
if (rec == null || curr_tid != rec.getReferenceIndex()) {
if (curr_tid != -1) {
LOG.info("Writing data for chromosome " + dictionary.getSequence(curr_tid).getSequenceName());
for (int i = 0; i + window_size <= array.length; i += window_shift) {
float sum = 0;
for (int j = 0; j < window_size; ++j) {
sum += array[i + j];
}
float v = sum / window_size;
daos.writeFloat(v);
minCov = Math.min(minCov, v);
maxCov = Math.max(maxCov, v);
num_written[curr_tid]++;
}
LOG.info("End writing data N=" + num_written[curr_tid]);
}
if (rec == null)
break;
curr_tid = rec.getReferenceIndex();
SAMSequenceRecord ssr = dictionary.getSequence(curr_tid);
LOG.info("allocate " + ssr.getSequenceLength() + " for " + ssr.getSequenceName());
array = null;
System.gc();
array = new short[ssr.getSequenceLength()];
LOG.info("done: allocate.");
Arrays.fill(array, (short) 0);
}
// getAlignmentStart()/getAlignmentEnd() are 1-based inclusive; the array is 0-based
for (int i = rec.getAlignmentStart() - 1; i < rec.getAlignmentEnd() && i < array.length; ++i) {
array[i] = (short) Math.min((int) Short.MAX_VALUE, 1 + (int) array[i]);
}
}
array = null;
LOG.info("Closing BAM");
CloserUtil.close(sfr);
LOG.info("Closing " + tmpFile1);
daos.flush();
gos.finish();
gos.flush();
gos.close();
// normalize to [0,1] using min/max, then find the median value
long nWritten = 0L;
median = SortingCollection.newInstance(Float.class, new FloatCodec(), new FloatCmp(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
median.setDestructiveIteration(true);
dis = new DataInputStream(new GZIPInputStream(new FileInputStream(tmpFile1)));
for (int n_items : num_written) {
for (int i = 0; i < n_items; ++i) {
float v = dis.readFloat();
if (v < min_coverage)
continue;
v = (float) ((v - minCov) / (double) (maxCov - minCov));
median.add(v);
++nWritten;
}
}
median.doneAdding();
CloserUtil.close(dis);
// get median
float median_value = 0f;
long half = nWritten / 2L;
CloseableIterator<Float> iterFloat = median.iterator();
while (iterFloat.hasNext() && half > 0) {
median_value = iterFloat.next();
--half;
if (half <= 0) {
LOG.info("median = " + median_value);
break;
}
}
CloserUtil.close(iterFloat);
median.cleanup();
median = null;
progress = new SAMSequenceDictionaryProgress(dictionary);
// dump data
dis = new DataInputStream(new GZIPInputStream(new FileInputStream(tmpFile1)));
for (int chrom_id = 0; chrom_id < num_written.length; ++chrom_id) {
int n_items = num_written[chrom_id];
int i = 0;
Float value_start = null;
while (i < n_items) {
if (value_start == null) {
value_start = dis.readFloat();
}
int j = i + 1;
Float value_end = null;
while (j < n_items) {
value_end = dis.readFloat();
if (value_start.intValue() == value_end.intValue()) {
++j;
} else {
break;
}
}
progress.watch(chrom_id, i * window_shift);
if (value_start >= min_coverage) {
System.out.print(dictionary.getSequence(chrom_id).getSequenceName());
System.out.print('\t');
System.out.print(i * window_shift);
System.out.print('\t');
System.out.print((j - 1) * window_shift + window_size);
System.out.print('\t');
System.out.print((float) ((value_start - minCov) / (double) (maxCov - minCov)));
System.out.print('\t');
System.out.print(median_value);
System.out.println();
if (System.out.checkError())
break;
}
i = j;
if (value_end == null)
break;
value_start = value_end;
}
}
CloserUtil.close(dis);
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(gos);
CloserUtil.close(dis);
if (tmpFile1 != null)
tmpFile1.delete();
if (median != null)
median.cleanup();
}
}
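The inner loop above computes a sliding-window mean of per-base depth. Here is a standalone sketch of just that computation, assuming array holds the per-base counts of one contig (capped at Short.MAX_VALUE as in the snippet) and window_size/window_shift are positive; names mirror the snippet:

// returns one mean depth per window of window_size bases, stepping by window_shift
static float[] windowMeans(final short[] array, final int window_size, final int window_shift) {
    final int n = array.length < window_size ? 0 : (array.length - window_size) / window_shift + 1;
    final float[] means = new float[n];
    for (int w = 0, i = 0; i + window_size <= array.length; i += window_shift, ++w) {
        float sum = 0f;
        for (int j = 0; j < window_size; ++j) sum += array[i + j];
        means[w] = sum / window_size;
    }
    return means;
}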
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class ReferenceToVCF method doWork.
@Override
public int doWork(List<String> args) {
if (this.bedFile != null) {
if (limitBed == null)
limitBed = new IntervalTreeMap<Boolean>();
try {
Pattern tab = Pattern.compile("[\t]");
BufferedReader r = IOUtils.openFileForBufferedReading(this.bedFile);
String line;
while ((line = r.readLine()) != null) {
if (BedLine.isBedHeader(line))
continue;
if (line.startsWith("#") || line.isEmpty())
continue;
String[] tokens = tab.split(line, 4);
// BED is 0-based half-open; convert to a 1-based closed Interval
limitBed.put(new Interval(tokens[0], 1 + Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2])), true);
}
CloserUtil.close(r);
} catch (Exception err) {
LOG.error(err);
return -1;
}
}
Random random = new Random(0L);
VariantContextWriter out = null;
try {
final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(oneAndOnlyOneFile(args)));
SAMSequenceDictionary dict = fasta.getSequenceDictionary();
out = super.openVariantContextWriter(this.outputFile);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
VCFHeader header = new VCFHeader();
header.setSequenceDictionary(dict);
out.writeHeader(header);
final List<List<Allele>> combination = new ArrayList<List<Allele>>(4);
// always keep REF as first allele please
combination.add(Arrays.asList(Allele.create("A", true), Allele.create("C", false), Allele.create("G", false), Allele.create("T", false)));
combination.add(Arrays.asList(Allele.create("C", true), Allele.create("A", false), Allele.create("G", false), Allele.create("T", false)));
combination.add(Arrays.asList(Allele.create("G", true), Allele.create("A", false), Allele.create("C", false), Allele.create("T", false)));
combination.add(Arrays.asList(Allele.create("T", true), Allele.create("A", false), Allele.create("C", false), Allele.create("G", false)));
for (SAMSequenceRecord ssr : dict.getSequences()) {
GenomicSequence genome = new GenomicSequence(fasta, ssr.getSequenceName());
if (this.limitBed != null) {
Interval interval = new Interval(ssr.getSequenceName(), 1, genome.length());
if (!this.limitBed.containsOverlapping(interval))
continue;
}
for (int n = 0; n < genome.length(); ++n) {
progress.watch(ssr.getSequenceIndex(), n);
List<Allele> alleles = null;
byte ref = (byte) genome.charAt(n);
switch(ref) {
case 'a':
case 'A':
alleles = combination.get(0);
break;
case 'c':
case 'C':
alleles = combination.get(1);
break;
case 'g':
case 'G':
alleles = combination.get(2);
break;
case 't':
case 'T':
alleles = combination.get(3);
break;
default:
break;
}
if (alleles == null)
continue;
if (this.limitBed != null) {
Interval interval = new Interval(ssr.getSequenceName(), n + 1, n + 1);
if (!this.limitBed.containsOverlapping(interval))
continue;
}
if (!disjoint_alts) {
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(ssr.getSequenceName());
vcb.start(n + 1);
vcb.stop(n + 1);
vcb.alleles(alleles);
out.add(vcb.make());
} else {
for (int a = 1; a < 4; ++a) {
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(ssr.getSequenceName());
vcb.start(n + 1);
vcb.stop(n + 1);
// index 0 is always REF
vcb.alleles(Arrays.asList(alleles.get(0), alleles.get(a)));
out.add(vcb.make());
}
}
if (insertion_size > 0 && n + 1 < genome.length()) {
alleles = new ArrayList<Allele>(2);
// REFERENCE
alleles.add(Allele.create("" + genome.charAt(n) + genome.charAt(n + 1), true));
StringBuilder sb = new StringBuilder(insertion_size + 2);
sb.append(genome.charAt(n));
for (int n2 = 0; n2 < insertion_size; ++n2) {
switch(random.nextInt(4)) {
case 0:
sb.append('A');
break;
case 1:
sb.append('C');
break;
case 2:
sb.append('G');
break;
case 3:
sb.append('T');
break;
}
}
sb.append(genome.charAt(n + 1));
alleles.add(Allele.create(sb.toString(), false));
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(ssr.getSequenceName());
vcb.start(n + 1);
vcb.alleles(alleles);
vcb.computeEndFromAlleles(alleles, n + 1);
out.add(vcb.make());
}
if (deletion_size > 0 && n + deletion_size + 1 < genome.length()) {
alleles = new ArrayList<Allele>(2);
// REF
StringBuilder sb = new StringBuilder(deletion_size + 2);
sb.append(genome.charAt(n));
int lastpos = n + 1;
for (int n2 = 0; n2 < deletion_size; ++n2, lastpos++) {
sb.append(genome.charAt(lastpos));
}
sb.append(genome.charAt(lastpos));
alleles.add(Allele.create(sb.toString(), true));
alleles.add(Allele.create("" + genome.charAt(n) + genome.charAt(lastpos), false));
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(ssr.getSequenceName());
vcb.start(n + 1);
vcb.alleles(alleles);
vcb.computeEndFromAlleles(alleles, n + 1);
out.add(vcb.make());
}
if (out.checkError())
break;
}
if (out.checkError())
break;
}
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
}
}
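The disjoint_alts branch above emits one bi-allelic record per ALT allele. A compact sketch of that step, assuming htsjdk's VariantContextBuilder as used in the snippet (the method and variable names are illustrative):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;

// emit one bi-allelic SNV per ALT at 1-based position pos; index 0 of alleles is always REF
static List<VariantContext> biallelicSnvsAt(final String contig, final int pos, final List<Allele> alleles) {
    final List<VariantContext> out = new ArrayList<>(alleles.size() - 1);
    for (int a = 1; a < alleles.size(); ++a) {
        out.add(new VariantContextBuilder()
            .chr(contig)
            .start(pos)
            .stop(pos)
            .alleles(Arrays.asList(alleles.get(0), alleles.get(a)))
            .make());
    }
    return out;
}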
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class SamMaskAlignedBases method doWork.
@Override
public int doWork(List<String> args) {
final byte RESET_CHAR = (byte) 'N';
final byte RESET_QUAL = (byte) SAMUtils.fastqToPhred('#');
long nRecords = 0L;
long nRecordMasked = 0L;
long nBasesMasked = 0L;
long nBases = 0L;
SAMRecordIterator iter = null;
SamReader sfr = null;
SAMFileWriter sfw = null;
try {
sfr = openSamReader(oneFileOrNull(args));
final SAMFileHeader header1 = sfr.getFileHeader();
if (header1 == null) {
LOG.error("File header missing");
return -1;
}
final SAMFileHeader header2 = header1.clone();
header2.addComment(getProgramName() + ":" + getVersion() + ":" + getProgramCommandLine());
header2.setSortOrder(SortOrder.unsorted);
sfw = this.writingBamArgs.openSAMFileWriter(outputFile, header2, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
iter = sfr.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
++nRecords;
nBases += rec.getReadLength();
if (rec.getReadUnmappedFlag()) {
SAMUtils.makeReadUnmapped(rec);
sfw.addAlignment(rec);
continue;
}
if (rec.isSecondaryOrSupplementary()) {
// secondary/supplementary alignments are silently dropped from the output
continue;
}
final Cigar cigar = rec.getCigar();
byte[] bases = rec.getReadBases();
byte[] quals = rec.getBaseQualities();
if (cigar == null || cigar.isEmpty()) {
SAMUtils.makeReadUnmapped(rec);
sfw.addAlignment(rec);
continue;
}
int readpos = 0;
for (final CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
if (op.consumesReadBases()) {
if (op.consumesReferenceBases()) {
for (int x = 0; x < ce.getLength(); ++x) {
if (bases != null)
bases[readpos + x] = RESET_CHAR;
if (quals != null)
quals[readpos + x] = RESET_QUAL;
++nBasesMasked;
}
}
readpos += ce.getLength();
}
}
++nRecordMasked;
SAMUtils.makeReadUnmapped(rec);
sfw.addAlignment(rec);
}
iter.close();
sfw.close();
progress.finish();
LOG.info("done : reads masked " + nRecordMasked + "/" + nRecords + " Bases masked:" + nBasesMasked + "/" + nBases);
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(sfr);
CloserUtil.close(sfw);
}
}
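The masking loop above is a generic CIGAR walk over the read offsets that are aligned to the reference. A reusable sketch of that walk, assuming htsjdk's Cigar API as used in the snippet:

import java.util.function.IntConsumer;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;

// invoke consumer for each 0-based read offset whose operator consumes both read and reference bases
static void forEachAlignedReadOffset(final SAMRecord rec, final IntConsumer consumer) {
    final Cigar cigar = rec.getCigar();
    if (cigar == null) return;
    int readpos = 0;
    for (final CigarElement ce : cigar) {
        final CigarOperator op = ce.getOperator();
        if (op.consumesReadBases()) {
            if (op.consumesReferenceBases()) {
                for (int x = 0; x < ce.getLength(); ++x) consumer.accept(readpos + x);
            }
            readpos += ce.getLength();
        }
    }
}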
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class SamRetrieveSeqAndQual method doWork.
@Override
public int doWork(List<String> args) {
FastqReader[] fastqReaders = null;
SamReader samReader = null;
SAMFileWriter samWriter = null;
SAMRecordIterator iter = null;
try {
if (fastqFin == null) {
LOG.error("undefined fastq file");
return -1;
} else {
LOG.info("opening " + fastqFin);
FastqReader r1 = new FastqReader(fastqFin);
if (fastqRin == null) {
fastqReaders = new FastqReader[] { r1 };
} else {
LOG.info("opening " + fastqRin);
FastqReader r2 = new FastqReader(fastqRin);
fastqReaders = new FastqReader[] { r1, r2 };
}
}
samReader = super.openSamReader(oneFileOrNull(args));
final SAMFileHeader.SortOrder sortOrder = samReader.getFileHeader().getSortOrder();
if (sortOrder == null) {
LOG.warning("undefined sort order read are in the sam order");
} else if (!sortOrder.equals(SAMFileHeader.SortOrder.queryname)) {
LOG.error("Bad Sort Order. Sort this input on read name");
return -1;
}
SAMFileHeader header = samReader.getFileHeader().clone();
SAMProgramRecord prg = header.createProgramRecord();
prg.setCommandLine(this.getProgramCommandLine());
prg.setProgramName(this.getProgramName());
prg.setProgramVersion(this.getVersion());
samWriter = writingBamArgs.openSAMFileWriter(bamOut, header, true);
iter = samReader.iterator();
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
FastqRecord[] currFastq = new FastqRecord[] { null, null };
while (iter.hasNext()) {
SAMRecord rec = progress.watch(iter.next());
String readName = rec.getReadName();
int fastq_index = 0;
if (rec.getReadPairedFlag()) {
if (fastqReaders.length != 2) {
LOG.error("Not paired but number of fastq!=2");
return -1;
}
fastq_index = (rec.getFirstOfPairFlag() ? 0 : 1);
} else {
if (fastqReaders.length != 1) {
LOG.error("Not paired but number of fastq!=1");
return -1;
}
fastq_index = 0;
}
if (sortOrder == SAMFileHeader.SortOrder.queryname) {
while (currFastq[fastq_index] == null || normalizeFastqName(currFastq[fastq_index].getReadName()).compareTo(readName) < 0) {
if (!fastqReaders[fastq_index].hasNext()) {
LOG.error("Read Missing for " + readName);
return -1;
}
currFastq[fastq_index] = fastqReaders[fastq_index].next();
if (normalizeFastqName(currFastq[fastq_index].getReadName()).compareTo(readName) > 0) {
LOG.error("Read Missing for " + readName);
return -1;
}
}
} else {
if (!fastqReaders[fastq_index].hasNext()) {
LOG.error("Read Missing for " + readName);
return -1;
}
currFastq[fastq_index] = fastqReaders[fastq_index].next();
}
if (normalizeFastqName(currFastq[fastq_index].getReadName()).compareTo(readName) != 0) {
LOG.error("Read Missing/Error for " + readName + " current:" + currFastq[fastq_index].getReadName());
return -1;
}
String fastqBases = currFastq[fastq_index].getReadString();
String fastqQuals = currFastq[fastq_index].getBaseQualityString();
/* handle orientation */
if (!rec.getReadUnmappedFlag() && rec.getReadNegativeStrandFlag()) {
fastqBases = AcidNucleics.reverseComplement(fastqBases);
fastqQuals = new StringBuilder(fastqQuals).reverse().toString();
}
/* remove hard clip */
Cigar cigar = rec.getCigar();
if (cigar != null) {
List<CigarElement> ceList = cigar.getCigarElements();
if (!ceList.isEmpty()) {
CigarElement ce = ceList.get(ceList.size() - 1);
if (ce.getOperator() == CigarOperator.HARD_CLIP) {
fastqBases = fastqBases.substring(0, fastqBases.length() - ce.getLength());
fastqQuals = fastqQuals.substring(0, fastqQuals.length() - ce.getLength());
}
ce = ceList.get(0);
if (ce.getOperator() == CigarOperator.HARD_CLIP) {
fastqBases = fastqBases.substring(ce.getLength());
fastqQuals = fastqQuals.substring(ce.getLength());
}
}
}
rec.setBaseQualityString(fastqQuals);
rec.setReadString(fastqBases);
samWriter.addAlignment(rec);
}
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
if (fastqReaders != null)
for (FastqReader r : fastqReaders) CloserUtil.close(r);
CloserUtil.close(iter);
CloserUtil.close(samReader);
CloserUtil.close(samWriter);
}
}
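The hard-clip handling above must trim bases and qualities together, from both ends of the read. A standalone sketch of that trimming, assuming htsjdk's Cigar classes as used in the snippet (the two-element return is an illustrative convenience):

import java.util.List;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;

// drop leading/trailing hard-clipped lengths from bases and quals, keeping the two strings in sync
static String[] trimHardClips(final Cigar cigar, String bases, String quals) {
    if (cigar != null && !cigar.getCigarElements().isEmpty()) {
        final List<CigarElement> ceList = cigar.getCigarElements();
        final CigarElement last = ceList.get(ceList.size() - 1);
        if (last.getOperator() == CigarOperator.HARD_CLIP) {
            bases = bases.substring(0, bases.length() - last.getLength());
            quals = quals.substring(0, quals.length() - last.getLength());
        }
        final CigarElement first = ceList.get(0);
        if (first.getOperator() == CigarOperator.HARD_CLIP) {
            bases = bases.substring(first.getLength());
            quals = quals.substring(first.getLength());
        }
    }
    return new String[] { bases, quals };
}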