Use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in the jvarkit project by lindenb: class KnimeVariantHelper, method processVcfMulti.
/**
 * Process a VCF file: every variant is passed through {@code fun}, which may
 * return zero or more replacement variants; the results are written to a new,
 * indexed VCF file in the working directory.
 *
 * <p>Writing can be aborted externally by creating a "STOP" file in the
 * working directory; the partially-written output is then deleted.</p>
 *
 * @param vcfIn input VCF file path or URL (must not be null)
 * @param fun   mapping from one input variant to a list of output variants;
 *              a {@code null} return emits nothing for that variant
 * @return the path of the output VCF file
 * @throws IOException on I/O failure, or if the emergency STOP file exists
 */
public String processVcfMulti(final String vcfIn, final Function<VariantContext, List<VariantContext>> fun) throws IOException {
    this.lastVariantCount = 0;
    if (vcfIn == null) {
        final String msg = "Vcf Input URI/File is null.";
        LOG.error(msg);
        throw new IllegalArgumentException(msg);
    }
    File outVcfFile = null;
    File outVcfIndexFile = null;
    // emergency-stop marker: if this file appears, processing aborts mid-run
    final File STOP_FILE = new File(this.workfingDirectory, "STOP");
    if (STOP_FILE.exists()) {
        final String msg = "There is a stop file in " + STOP_FILE;
        LOG.error(msg);
        throw new IOException(msg);
    }
    boolean fail_flag = false;
    VcfIterator iter = null;
    VariantContextWriter variantContextWriter = null;
    try {
        IOUtil.assertDirectoryIsReadable(this.workfingDirectory);
        IOUtil.assertDirectoryIsWritable(this.workfingDirectory);
        if (!IOUtil.isUrl(vcfIn)) {
            IOUtil.assertFileIsReadable(new File(vcfIn));
        }
        // decide whether the output is bgzipped (.vcf.gz) or plain (.vcf):
        // either forced by configuration, or inherited from the input suffix
        final String extension;
        if (this.forceSuffix.equals(ForceSuffix.ForceTabix)) {
            extension = ".vcf.gz";
        } else if (this.forceSuffix.equals(ForceSuffix.ForceTribble)) {
            extension = ".vcf";
        } else if (vcfIn.endsWith(".gz")) {
            extension = ".vcf.gz";
        } else {
            extension = ".vcf";
        }
        final String filename = this.createOutputFile(vcfIn, extension);
        final String indexFilename;
        // FIX: bgzipped VCFs are indexed with tabix (.tbi) and plain VCFs with
        // tribble (.idx); the two cases were previously swapped, so the deleted
        // index file on failure never matched the index actually written.
        if (extension.endsWith(".gz")) {
            indexFilename = filename + TabixUtils.STANDARD_INDEX_EXTENSION;
        } else {
            indexFilename = filename + Tribble.STANDARD_INDEX_EXTENSION;
        }
        outVcfFile = new File(filename);
        outVcfIndexFile = new File(indexFilename);
        LOG.info("opening " + vcfIn);
        iter = VCFUtils.createVcfIterator(vcfIn);
        super.init(iter.getHeader());
        // clone the header only if extra meta-data lines must be injected
        final VCFHeader vcfHeader2;
        if (this.getExtraVcfHeaderLines().isEmpty()) {
            vcfHeader2 = iter.getHeader();
        } else {
            vcfHeader2 = new VCFHeader(iter.getHeader());
            for (final VCFHeaderLine extra : this.getExtraVcfHeaderLines()) {
                vcfHeader2.addMetaDataLine(extra);
            }
            // clear vcf header lines now they've been added to the header.
            this.getExtraVcfHeaderLines().clear();
        }
        final SAMSequenceDictionary dict = this.getHeader().getSequenceDictionary();
        if (dict == null) {
            final String msg = "There is no dictionary (##contig lines) in " + vcfIn + " but they are required.";
            LOG.error(msg);
            throw new IllegalArgumentException(msg);
        }
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        progress.setLogPrefix(this.filePrefix);
        LOG.info("writing " + outVcfFile + ". Emergency stop file is " + STOP_FILE);
        variantContextWriter = this.variantContextWriterBuilder.setOutputFile(outVcfFile).setReferenceDictionary(dict).build();
        long lastTick = System.currentTimeMillis();
        variantContextWriter.writeHeader(vcfHeader2);
        while (iter.hasNext()) {
            final VariantContext ctx = progress.watch(iter.next());
            final List<VariantContext> array = fun.apply(ctx);
            if (array != null) {
                for (final VariantContext ctx2 : array) {
                    variantContextWriter.add(ctx2);
                    this.lastVariantCount++;
                }
            }
            // check for the STOP file at most every 10 seconds
            final long now = System.currentTimeMillis();
            if ((now - lastTick) > 10 * 1000) {
                lastTick = now;
                if (STOP_FILE.exists()) {
                    LOG.warn("STOP FILE detected " + STOP_FILE + " Aborting.");
                    fail_flag = true;
                    break;
                }
            }
        }
        progress.finish();
        iter.close();
        iter = null;
        variantContextWriter.close();
        variantContextWriter = null;
        return outVcfFile.getPath();
    } catch (final Exception err) {
        fail_flag = true;
        LOG.error(err);
        throw new IOException(err);
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(variantContextWriter);
        // on abort or error, remove the partial output and its index
        if (fail_flag) {
            if (outVcfFile != null && outVcfFile.exists()) {
                LOG.warn("deleting " + outVcfFile);
                outVcfFile.delete();
            }
            if (outVcfIndexFile != null && outVcfIndexFile.exists()) {
                LOG.warn("deleting " + outVcfIndexFile);
                outVcfIndexFile.delete();
            }
        }
    }
}
Use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in the jvarkit project by lindenb: class CompareBamAndBuild, method doWork.
/**
 * Compare the read placements of exactly two indexed BAM files mapped on two
 * different genome builds, using the lift-over chain file to relate them.
 * Records are grouped by read name through an external sorting collection,
 * then each read is reported as EQ/NE across the two inputs.
 *
 * @param args exactly two BAM file paths
 * @return RETURN_OK on success, -1 on error
 */
@Override
public int doWork(final List<String> args) {
    PrintWriter out = null;
    SortingCollection<Match> database = null;
    SamReader samFileReader = null;
    CloseableIterator<Match> matchIter = null;
    if (this.chainFile == null) {
        LOG.error("Chain file is not defined Option");
        return -1;
    }
    if (args.size() != 2) {
        LOG.error("Illegal number of arguments. Expected two indexed BAMS.");
        return -1;
    }
    try {
        LOG.info("load chain file");
        this.liftOver = new LiftOver(this.chainFile);
        // external sorter: groups all records by read name across both files
        database = SortingCollection.newInstance(Match.class, new MatchCodec(), new MatchOrdererInSortingCollection(), this.maxRecordsInRam, this.tmpDir.toPath());
        database.setDestructiveIteration(true);
        for (int currentSamFileIndex = 0; currentSamFileIndex < 2; currentSamFileIndex++) {
            final File samFile = new File(args.get(currentSamFileIndex));
            LOG.info("read " + samFile);
            this.bamFiles[currentSamFileIndex] = samFile;
            samFileReader = CompareBamAndBuild.this.openSamReader(samFile.getPath());
            final SAMSequenceDictionary dict = samFileReader.getFileHeader().getSequenceDictionary();
            this.sequenceDictionaries[currentSamFileIndex] = dict;
            if (dict.isEmpty()) {
                samFileReader.close();
                LOG.error("Empty Dict in " + samFile);
                return -1;
            }
            // optional restriction to a user-supplied genomic interval
            final Interval interval;
            if (REGION != null) {
                final IntervalParser intervalParser = new IntervalParser(dict);
                interval = intervalParser.parse(this.REGION);
                if (interval == null) {
                    samFileReader.close();
                    LOG.error("Cannot parse " + REGION + " (bad syntax or not in dictionary)");
                    return -1;
                }
            } else {
                interval = null;
            }
            final SAMRecordIterator iter;
            if (interval == null) {
                iter = samFileReader.iterator();
            } else {
                iter = samFileReader.queryOverlapping(interval.getContig(), interval.getStart(), interval.getEnd());
            }
            final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
            while (iter.hasNext()) {
                final SAMRecord rec = progress.watch(iter.next());
                // secondary/supplementary alignments would duplicate the read
                if (rec.isSecondaryOrSupplementary())
                    continue;
                final Match m = new Match();
                m.flag = rec.getFlags();
                m.readName = rec.getReadName();
                m.firstBamFile = currentSamFileIndex == 0;
                if (rec.getReadUnmappedFlag()) {
                    m.tid = -1;
                    m.pos = -1;
                } else {
                    m.tid = rec.getReferenceIndex();
                    m.pos = rec.getAlignmentStart();
                }
                database.add(m);
            }
            iter.close();
            samFileReader.close();
            samFileReader = null;
            LOG.info("Close " + samFile);
        }
        database.doneAdding();
        LOG.info("Writing results....");
        out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        // compute the differences for each read
        out.print("#READ-Name\tCOMPARE");
        for (final File f : this.bamFiles) {
            out.print("\t" + f);
        }
        out.println();
        /* create an array of set<Match>, one set per input BAM */
        final MatchComparator match_comparator = new MatchComparator();
        final List<Set<Match>> matches = new ArrayList<Set<CompareBamAndBuild.Match>>(2);
        while (matches.size() < 2) {
            matches.add(new TreeSet<CompareBamAndBuild.Match>(match_comparator));
        }
        matchIter = database.iterator();
        String currReadName = null;
        int curr_num_in_pair = -1;
        // stream the sorted matches; flush a report line whenever the
        // (read name, pair index) group changes or input is exhausted
        for (; ; ) {
            final Match nextMatch;
            if (matchIter.hasNext()) {
                nextMatch = matchIter.next();
            } else {
                nextMatch = null;
            }
            if (nextMatch == null || (currReadName != null && !currReadName.equals(nextMatch.readName)) || (curr_num_in_pair != -1 && curr_num_in_pair != nextMatch.indexInPair())) {
                if (currReadName != null) {
                    out.print(currReadName);
                    if (curr_num_in_pair > 0) {
                        out.print("/");
                        out.print(curr_num_in_pair);
                    }
                    out.print("\t");
                    if (same(matches.get(0), matches.get(1))) {
                        out.print("EQ");
                    } else {
                        out.print("NE");
                    }
                    for (int x = 0; x < 2; ++x) {
                        out.print("\t");
                        print(out, matches.get(x));
                    }
                    out.println();
                }
                if (nextMatch == null)
                    break;
                for (final Set<Match> set : matches) set.clear();
            }
            currReadName = nextMatch.readName;
            curr_num_in_pair = nextMatch.indexInPair();
            matches.get(nextMatch.firstBamFile ? 0 : 1).add(nextMatch);
        }
        matchIter.close();
        matchIter = null;
        out.flush();
        out.close();
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        // FIX: previously the iterator, reader and sorting collection were only
        // released on the success path, leaking temp files / handles on error
        CloserUtil.close(matchIter);
        CloserUtil.close(samFileReader);
        if (database != null)
            database.cleanup();
        CloserUtil.close(out);
        this.liftOver = null;
    }
}
Use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in the jvarkit project by lindenb: class CompareBams, method doWork.
/**
 * Compare read placements across two or more indexed BAM files. For each read
 * (and pair index), reports EQ/NE for every pair of inputs plus the raw
 * placements per file. Records are grouped by read name via an external
 * sorting collection.
 *
 * @param args the BAM file paths to compare (at least two)
 * @return RETURN_OK on success, -1 on error
 */
@Override
public int doWork(final List<String> args) {
    this.IN.addAll(args.stream().map(S -> new File(S)).collect(Collectors.toList()));
    SortingCollection<Match> database = null;
    SamReader samFileReader = null;
    CloseableIterator<Match> iter = null;
    try {
        if (this.IN.size() < 2) {
            LOG.error("Need more bams please");
            return -1;
        }
        // external sorter: groups records by read name across all inputs
        database = SortingCollection.newInstance(Match.class, new MatchCodec(), new MatchOrderer(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        this.samSequenceDictAreTheSame = true;
        database.setDestructiveIteration(true);
        for (int currentSamFileIndex = 0; currentSamFileIndex < this.IN.size(); currentSamFileIndex++) {
            final File samFile = this.IN.get(currentSamFileIndex);
            LOG.info("Opening " + samFile);
            samFileReader = super.createSamReaderFactory().open(samFile);
            final SAMSequenceDictionary dict = samFileReader.getFileHeader().getSequenceDictionary();
            if (dict == null || dict.isEmpty()) {
                LOG.error("Empty Dict in " + samFile);
                return -1;
            }
            // differing dictionaries are tolerated but make comparison slower
            if (!this.sequenceDictionaries.isEmpty() && !SequenceUtil.areSequenceDictionariesEqual(this.sequenceDictionaries.get(0), dict)) {
                this.samSequenceDictAreTheSame = false;
                LOG.warn("FOOL !! THE SEQUENCE DICTIONARIES ARE **NOT** THE SAME. I will try to compare anyway but it will be slower.");
            }
            this.sequenceDictionaries.add(dict);
            // optional restriction to a user-supplied genomic interval
            final Optional<Interval> interval;
            if (REGION != null && !REGION.trim().isEmpty()) {
                final IntervalParser dix = new IntervalParser(dict);
                interval = Optional.ofNullable(dix.parse(REGION));
                if (!interval.isPresent()) {
                    LOG.error("Cannot parse " + REGION + " (bad syntax or not in dictionary)");
                    return -1;
                }
            } else {
                interval = Optional.empty();
            }
            SAMRecordIterator it = null;
            if (!interval.isPresent()) {
                it = samFileReader.iterator();
            } else {
                it = samFileReader.queryOverlapping(interval.get().getContig(), interval.get().getStart(), interval.get().getEnd());
            }
            final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
            while (it.hasNext()) {
                final SAMRecord rec = progress.watch(it.next());
                // mapped reads must pass mapq and not be secondary/supplementary;
                // unmapped reads are always kept so missing reads are reported
                if (!rec.getReadUnmappedFlag()) {
                    if (rec.getMappingQuality() < this.min_mapq)
                        continue;
                    if (rec.isSecondaryOrSupplementary())
                        continue;
                }
                final Match m = new Match();
                if (rec.getReadPairedFlag()) {
                    m.num_in_pair = (rec.getFirstOfPairFlag() ? 1 : 2);
                } else {
                    m.num_in_pair = 0;
                }
                m.readName = rec.getReadName();
                m.bamIndex = currentSamFileIndex;
                m.flag = rec.getFlags();
                m.cigar = rec.getCigarString();
                if (m.cigar == null)
                    m.cigar = "";
                if (rec.getReadUnmappedFlag()) {
                    m.tid = -1;
                    m.pos = -1;
                } else {
                    m.tid = rec.getReferenceIndex();
                    m.pos = rec.getAlignmentStart();
                }
                database.add(m);
            }
            it.close();
            samFileReader.close();
            samFileReader = null;
            LOG.info("Close " + samFile);
        }
        database.doneAdding();
        LOG.info("Writing results....");
        this.out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        // compute the differences for each read: header first, one column per
        // (x,y) pair of inputs, then one column per input
        this.out.print("#READ-Name\t");
        for (int x = 0; x < this.IN.size(); ++x) {
            for (int y = x + 1; y < this.IN.size(); ++y) {
                if (!(x == 0 && y == 1))
                    this.out.print("|");
                this.out.print(IN.get(x));
                this.out.print(" ");
                this.out.print(IN.get(y));
            }
        }
        for (int x = 0; x < this.IN.size(); ++x) {
            this.out.print("\t" + IN.get(x));
        }
        this.out.println();
        /* create an array of set<Match>, one set per input BAM */
        final MatchComparator match_comparator = new MatchComparator();
        final List<Set<Match>> matches = new ArrayList<Set<CompareBams.Match>>(this.IN.size());
        while (matches.size() < this.IN.size()) {
            matches.add(new TreeSet<CompareBams.Match>(match_comparator));
        }
        iter = database.iterator();
        String currReadName = null;
        int curr_num_in_pair = -1;
        // stream the sorted matches; flush a report line whenever the
        // (read name, pair index) group changes or input is exhausted
        for (; ; ) {
            Match nextMatch = null;
            if (iter.hasNext()) {
                nextMatch = iter.next();
            }
            if (nextMatch == null || (currReadName != null && !currReadName.equals(nextMatch.readName)) || (curr_num_in_pair != -1 && curr_num_in_pair != nextMatch.num_in_pair)) {
                if (currReadName != null) {
                    this.out.print(currReadName);
                    if (curr_num_in_pair > 0) {
                        this.out.print("/");
                        this.out.print(curr_num_in_pair);
                    }
                    this.out.print("\t");
                    for (int x = 0; x < this.IN.size(); ++x) {
                        final Set<Match> first = matches.get(x);
                        for (int y = x + 1; y < this.IN.size(); ++y) {
                            if (!(x == 0 && y == 1))
                                this.out.print("|");
                            final Set<Match> second = matches.get(y);
                            if (same(first, second)) {
                                this.out.print("EQ");
                            } else {
                                this.out.print("NE");
                            }
                        }
                    }
                    for (int x = 0; x < this.IN.size(); ++x) {
                        this.out.print("\t");
                        print(matches.get(x), sequenceDictionaries.get(x));
                    }
                    this.out.println();
                }
                if (nextMatch == null)
                    break;
                for (final Set<Match> set : matches) set.clear();
            }
            currReadName = nextMatch.readName;
            curr_num_in_pair = nextMatch.num_in_pair;
            matches.get(nextMatch.bamIndex).add(nextMatch);
            // stop early if the output stream failed (e.g. broken pipe)
            if (this.out.checkError())
                break;
        }
        iter.close();
        this.out.flush();
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        // FIX: also close the match iterator on the error path (it was
        // previously only closed after a successful run)
        CloserUtil.close(iter);
        if (database != null)
            database.cleanup();
        CloserUtil.close(samFileReader);
        CloserUtil.close(this.out);
        this.out = null;
    }
}
Use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in the jvarkit project by lindenb: class VcfBiomart, method doVcfToVcf.
/**
 * Annotate each variant by querying an Ensembl Biomart web service: the XML
 * query (a DOM template whose contig/start/end filter nodes are updated per
 * variant) is posted over HTTP, and the tab-delimited response rows are folded
 * into a single multi-valued INFO attribute.
 *
 * @param inputName name of the input (for logging; unused here)
 * @param iter      the source of variants
 * @param out       the writer receiving annotated variants
 * @return 0 on success
 * @throws RuntimeIOException wrapping any failure (I/O, HTTP, XML)
 */
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator iter, final VariantContextWriter out) {
    HttpGet httpGet = null;
    final Pattern tab = Pattern.compile("[\t]");
    try {
        final TransformerFactory factory = TransformerFactory.newInstance();
        final Transformer transformer = factory.newTransformer();
        // transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        final VCFHeader header = iter.getHeader();
        final StringBuilder desc = new StringBuilder("Biomart query. Format: ");
        desc.append(this.attributes.stream().map(S -> this.printLabels ? S + "|" + S : S).collect(Collectors.joining("|")));
        header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
        header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
        header.addMetaDataLine(new VCFInfoHeaderLine(this.TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, desc.toString()));
        out.writeHeader(header);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
        while (iter.hasNext()) {
            final VariantContext ctx = progress.watch(iter.next());
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            vcb.rmAttribute(this.TAG);
            // point the shared DOM query at this variant's coordinates
            this.filterColumnContig.set(ctx.getContig());
            this.filterColumnStart.set(String.valueOf(ctx.getStart()));
            this.filterColumnEnd.set(String.valueOf(ctx.getEnd()));
            final StringWriter domToStr = new StringWriter();
            transformer.transform(new DOMSource(this.domQuery), new StreamResult(domToStr));
            final URIBuilder builder = new URIBuilder(this.serviceUrl);
            builder.addParameter("query", domToStr.toString());
            // System.err.println("\nwget -O - 'http://grch37.ensembl.org/biomart/martservice?query="+escapedQuery+"'\n");
            // escapedQuery = URLEncoder.encode(escapedQuery,"UTF-8");
            httpGet = new HttpGet(builder.build());
            final CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
            final int responseCode = httpResponse.getStatusLine().getStatusCode();
            if (responseCode != 200) {
                // FIX: release the HTTP response before failing, otherwise the
                // pooled connection leaks on every non-200 answer
                CloserUtil.close(httpResponse);
                throw new RuntimeIOException("Response code was not 200. Detected response was " + responseCode);
            }
            InputStream response = httpResponse.getEntity().getContent();
            if (this.teeResponse) {
                response = new TeeInputStream(response, stderr(), false);
            }
            // each response row becomes one "a|b|c"-style INFO value;
            // LinkedHashSet deduplicates while preserving row order
            final BufferedReader br = new BufferedReader(new InputStreamReader(response));
            final Set<String> infoAtts = br.lines().filter(L -> !StringUtil.isBlank(L)).filter(L -> !L.equals("[success]")).map(L -> tab.split(L)).map(T -> {
                final StringBuilder sb = new StringBuilder();
                for (int i = 0; i < this.attributes.size(); i++) {
                    if (i > 0)
                        sb.append("|");
                    if (this.printLabels)
                        sb.append(escapeInfo(this.attributes.get(i))).append("|");
                    sb.append(i < T.length ? escapeInfo(T[i]) : "");
                }
                return sb.toString();
            }).collect(Collectors.toCollection(LinkedHashSet::new));
            CloserUtil.close(br);
            CloserUtil.close(response);
            CloserUtil.close(httpResponse);
            if (!infoAtts.isEmpty()) {
                vcb.attribute(this.TAG, new ArrayList<>(infoAtts));
            }
            out.add(vcb.make());
        }
        progress.finish();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        throw new RuntimeIOException(err);
    }
}
Use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in the jvarkit project by lindenb: class VcfEnsemblVepRest, method doVcfToVcf.
// Annotate variants with the Ensembl VEP REST service. Variants are buffered
// up to `batchSize`, submitted as one REST query per batch, and the returned
// predictions are written back as INFO attributes (one per configured tag).
// Returns RETURN_OK on success, -1 on any error.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator vcfIn, final VariantContextWriter out) {
try {
final VCFHeader header = vcfIn.getHeader();
// +1 so the flush-triggering variant fits before the batch is sent
final List<VariantContext> buffer = new ArrayList<>(this.batchSize + 1);
final VCFHeader h2 = new VCFHeader(header);
addMetaData(h2);
// declare one multi-valued String INFO line per output tag
for (final String tag : this.outputTags) {
h2.addMetaDataLine(new VCFInfoHeaderLine(tag, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "__CUSTOM_DESCRIPTION__" + tag));
}
out.writeHeader(h2);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
for (; ; ) {
// ctx stays null when the input is exhausted; that forces a final flush
VariantContext ctx = null;
if (vcfIn.hasNext()) {
buffer.add((ctx = progress.watch(vcfIn.next())));
}
// flush when the batch is full or at end of input
if (ctx == null || buffer.size() >= this.batchSize) {
if (!buffer.isEmpty()) {
// one REST call for the whole batch; keyed by the per-variant input string
final Map<String, EnsVepPrediction> input2pred = vep(buffer);
for (final VariantContext ctx2 : buffer) {
final String inputStr = createInputContext(ctx2);
final EnsVepPrediction pred = input2pred.get(inputStr);
final VariantContextBuilder vcb = new VariantContextBuilder(ctx2);
// drop any pre-existing values for the managed tags before re-annotating
for (final String tag : this.outputTags) {
vcb.rmAttribute(tag);
}
if (pred == null) {
LOG.info("No Annotation found for " + inputStr);
out.add(vcb.make());
continue;
}
for (final String tag : this.outputTags) {
final Set<String> info = pred.tag2infoLines.get(tag);
if (info == null || info.isEmpty())
continue;
vcb.attribute(tag, new ArrayList<>(info));
}
out.add(vcb.make());
}
// end of loop over variants
}
// end of if buffer is not empty
if (ctx == null)
break;
buffer.clear();
}
}
progress.finish();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
}
}
Aggregations