Use of htsjdk.samtools.SAMRecordComparator in the project jvarkit by lindenb: the doWork method of the class OneThousandBam. Two standalone sketches of the underlying htsjdk patterns follow the listing.
@Override
public int doWork(final List<String> args) {
    SAMFileWriter sfw = null;
    QueryInterval[] queryIntervalArray = null;
    try {
        if (this.bedFile == null && !this.exitAfterBaiDownload) {
            LOG.error("bed file is missing");
            return -1;
        }
        IOUtil.assertDirectoryIsWritable(this.baiCacheDir);
        if (args.isEmpty()) {
            LOG.error("no urls");
            return -1;
        } else if (args.size() == 1 && args.get(0).endsWith(".list")) {
            // a single argument ending in ".list" is a file of URLs, one per line, '#' starts a comment
            try (Stream<String> lines = Files.lines(Paths.get(args.get(0)))) {
                lines.filter(L -> !StringUtils.isBlank(L))
                     .filter(L -> !L.startsWith("#"))
                     .map(L -> new Sample1KG(L))
                     .forEach(S -> this.samples.add(S));
            }
        } else {
            args.stream().map(L -> new Sample1KG(L)).forEach(S -> this.samples.add(S));
        }
        if (this.samples.isEmpty()) {
            LOG.error("no sample defined");
            return -1;
        }
        // download the .bai index of every sample into the local cache directory
        for (int x = 0; x < this.samples.size(); x++) {
            LOG.debug("bai " + (x + 1) + "/" + this.samples.size());
            this.samples.get(x).downloadBaiToCache();
        }
        if (this.exitAfterBaiDownload) {
            LOG.info("Bai downloaded");
            return 0;
        }
        for (final Sample1KG sample : this.samples) {
            LOG.debug("get sam file header for " + sample.url);
            try (final SamReader sr = sample.open()) {
                final SAMFileHeader header = sr.getFileHeader();
                if (!header.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
                    throw new RuntimeIOException(sample.url + " is not sorted");
                }
                sample.sampleName = header.getReadGroups().stream()
                        .map(RG -> RG.getSample())
                        .filter(S -> !StringUtils.isBlank(S))
                        .findFirst()
                        .orElseThrow(() -> new IllegalArgumentException("Cannot find sample in " + sample.url));
                final SAMSequenceDictionary dict = header.getSequenceDictionary();
                if (this.dict == null) {
                    // first sample seen: adopt its dictionary and load the BED file as query intervals
                    LOG.debug("read bed file " + this.bedFile);
                    final List<QueryInterval> queryIntervals = new ArrayList<>();
                    this.dict = dict;
                    final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(this.dict);
                    final BedLineCodec codec = new BedLineCodec();
                    try (BufferedReader br = IOUtils.openPathForBufferedReading(this.bedFile)) {
                        String line;
                        while ((line = br.readLine()) != null) {
                            final BedLine bed = codec.decode(line);
                            if (bed == null) continue;
                            final String newCtg = ctgConverter.apply(bed.getContig());
                            if (StringUtils.isBlank(newCtg)) {
                                throw new JvarkitException.ContigNotFoundInDictionary(bed.getContig(), this.dict);
                            }
                            final int tid = this.dict.getSequenceIndex(newCtg);
                            queryIntervals.add(new QueryInterval(tid, bed.getStart(), bed.getEnd()));
                        }
                    }
                    // sort the intervals and merge the overlapping ones
                    queryIntervalArray = QueryInterval.optimizeIntervals(queryIntervals.toArray(new QueryInterval[queryIntervals.size()]));
                } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, this.dict)) {
                    throw new JvarkitException.DictionariesAreNotTheSame(dict, this.dict);
                }
            }
        }
        if (queryIntervalArray == null || queryIntervalArray.length == 0) {
            LOG.error("no query interval defined");
            return -1;
        }
        final SAMRecordComparator samRecordComparator = new SAMRecordCoordinateComparator();
        // build the output header: the shared dictionary plus one read group per sample
        final SAMFileHeader ouSamFileHeader = new SAMFileHeader(this.dict);
        ouSamFileHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        for (final Sample1KG sample : this.samples) {
            sample.readGroupRecord = new SAMReadGroupRecord(sample.sampleName);
            sample.readGroupRecord.setSample(sample.sampleName);
            sample.readGroupRecord.setLibrary(sample.sampleName);
            sample.readGroupRecord.setAttribute("URL", sample.url);
            ouSamFileHeader.addReadGroup(sample.readGroupRecord);
        }
        final SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
        sfwf.setCreateIndex(false);
        sfwf.setCreateMd5File(false);
        if (this.outputFile == null) {
            sfw = sfwf.makeBAMWriter(ouSamFileHeader, true, stdout());
        } else {
            sfw = sfwf.makeSAMOrBAMWriter(ouSamFileHeader, true, this.outputFile);
        }
        int interval_index = 0;
        while (interval_index < queryIntervalArray.length) {
            LOG.debug("interval_index= " + interval_index);
            /* current interval fetched from the BAMs; may be extended below */
            QueryInterval interval = queryIntervalArray[interval_index];
            LOG.debug("interval= " + interval);
            final List<Path> tmpFiles = new ArrayList<>(this.samples.size());
            boolean done = false;
            while (!done) {
                done = true;
                for (final Path p : tmpFiles) Files.delete(p);
                tmpFiles.clear();
                for (final Sample1KG sample : this.samples) {
                    if (!done) break;
                    // retry forever until the download succeeds
                    for (;;) {
                        // try to download the data of this sample for the current interval
                        final Path tmpBam = Files.createTempFile(this.baiCacheDir, "tmp.", ".bam");
                        SamReader srf = null;
                        SAMFileWriter rgnWriter = null;
                        SAMRecordIterator samIter = null;
                        try {
                            srf = sample.open();
                            // LOG.debug("query= " + sample.sampleName + " " + interval);
                            samIter = srf.query(new QueryInterval[] { interval }, false);
                            rgnWriter = sfwf.makeBAMWriter(ouSamFileHeader, true, tmpBam);
                            while (samIter.hasNext()) {
                                final SAMRecord rec = samIter.next();
                                if (rec.isSecondaryOrSupplementary()) continue;
                                if (rec.getReadUnmappedFlag()) continue;
                                if (rec.getAlignmentEnd() > interval.end && /* the read goes beyond the current interval */
                                        interval_index + 1 < queryIntervalArray.length && /* there is a next interval */
                                        interval.referenceIndex == queryIntervalArray[interval_index + 1].referenceIndex && /* on the same chromosome */
                                        queryIntervalArray[interval_index + 1].start <= rec.getAlignmentEnd()) {
                                    // the read reaches into the next interval: merge the two and restart the downloads
                                    interval = new QueryInterval(interval.referenceIndex, interval.start, queryIntervalArray[interval_index + 1].end);
                                    interval_index++;
                                    LOG.info("extending interval to " + interval);
                                    done = false;
                                    break;
                                }
                                rec.setAttribute(SAMTag.RG.name(), sample.readGroupRecord.getId());
                                rgnWriter.addAlignment(rec);
                            }
                            samIter.close();
                            samIter = null;
                            srf.close();
                            srf = null;
                            rgnWriter.close();
                            rgnWriter = null;
                            // LOG.debug("donequery= " + sample.url);
                        } catch (final Exception err) {
                            LOG.error(err);
                            Files.delete(tmpBam);
                            continue;
                        } finally {
                            CloserUtil.close(samIter);
                            CloserUtil.close(srf);
                            CloserUtil.close(rgnWriter);
                        }
                        tmpFiles.add(tmpBam);
                        break;
                    }
                    if (!done) break;
                } // end of download for each sample
                if (!done) continue;
            } // end of while(!done)
// LOG.info("merging "+interval);
// merge everything
final List<SamReader> chunkReaders = new ArrayList<>(samples.size());
final List<CloseableIterator<SAMRecord>> chunkIterators = new ArrayList<>(samples.size());
for (final Path tmpPath : tmpFiles) {
final SamReader tmpReader = samReaderFactory.open(tmpPath);
chunkReaders.add(tmpReader);
chunkIterators.add(tmpReader.iterator());
}
final MergingIterator<SAMRecord> merger = new MergingIterator<>(samRecordComparator, chunkIterators);
while (merger.hasNext()) {
sfw.addAlignment(merger.next());
}
merger.close();
CloserUtil.close(chunkIterators);
CloserUtil.close(chunkReaders);
// cleanup
for (final Path p : tmpFiles) Files.delete(p);
interval_index++;
}
sfw.close();
return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        // make sure the output writer is closed even on the error path
        CloserUtil.close(sfw);
    }
}
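
The merge at the end of each interval is where the SAMRecordComparator actually earns its keep: the per-sample chunk BAMs, each coordinate-sorted, are interleaved by a MergingIterator driven by a SAMRecordCoordinateComparator. Below is a minimal standalone sketch of that pattern; the file names chunk1.bam, chunk2.bam and merged.bam are hypothetical, and the inputs are assumed to be coordinate-sorted BAMs sharing one sequence dictionary.

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordCoordinateComparator;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.MergingIterator;

public class MergeSortedBams {
    public static void main(final String[] args) throws Exception {
        // hypothetical inputs: coordinate-sorted BAMs sharing one dictionary
        final File[] inputs = { new File("chunk1.bam"), new File("chunk2.bam") };
        final File output = new File("merged.bam");

        final SamReaderFactory readerFactory = SamReaderFactory.makeDefault();
        final List<SamReader> readers = new ArrayList<>();
        final List<CloseableIterator<SAMRecord>> iterators = new ArrayList<>();
        for (final File f : inputs) {
            final SamReader reader = readerFactory.open(f);
            readers.add(reader);
            iterators.add(reader.iterator());
        }

        // the comparator defines the merge order; every input must already be sorted by it
        final SAMRecordCoordinateComparator comparator = new SAMRecordCoordinateComparator();
        // reuse the first header for the output; real code would merge headers and read groups
        final SAMFileHeader header = readers.get(0).getFileHeader();
        header.setSortOrder(SAMFileHeader.SortOrder.coordinate);

        try (SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, true, output);
                MergingIterator<SAMRecord> merger = new MergingIterator<>(comparator, iterators)) {
            // k-way merge: always emits the smallest record among the heads of all iterators
            while (merger.hasNext()) {
                writer.addAlignment(merger.next());
            }
        }
        for (final SamReader reader : readers) {
            reader.close();
        }
    }
}

MergingIterator does not re-sort anything, it only interleaves already-sorted streams; that is why doWork rejects any remote BAM whose header does not declare coordinate sort order.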
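
The BED-loading step relies on QueryInterval.optimizeIntervals to turn the raw interval list into the sorted, non-overlapping array that SamReader.query(...) expects. A small self-contained sketch of that behavior, with a hypothetical dictionary and coordinates:

import htsjdk.samtools.QueryInterval;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;

public class OptimizeIntervalsDemo {
    public static void main(final String[] args) {
        // hypothetical dictionary with a single contig
        final SAMSequenceDictionary dict = new SAMSequenceDictionary();
        dict.addSequence(new SAMSequenceRecord("chr1", 250_000_000));
        final int tid = dict.getSequenceIndex("chr1");

        // unsorted, partly overlapping intervals, as they might come out of a BED file
        final QueryInterval[] raw = {
            new QueryInterval(tid, 500, 1000),
            new QueryInterval(tid, 100, 600),
            new QueryInterval(tid, 2000, 3000)
        };

        // sorts the intervals and merges the overlapping ones
        final QueryInterval[] optimized = QueryInterval.optimizeIntervals(raw);
        for (final QueryInterval qi : optimized) {
            // expected output: 0:100-1000 then 0:2000-3000
            System.out.println(qi.referenceIndex + ":" + qi.start + "-" + qi.end);
        }
    }
}

Even after optimization, a single read can extend past the end of one interval into the next, which is exactly the case the interval-extension branch inside the download loop handles.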