use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
the class BedCluster method doWork.
@Override
public int doWork(final List<String> args) {
final BedLineCodec codec = new BedLineCodec();
ArchiveFactory archiveFactory = null;
PrintWriter manifest = null;
if (save_as_interval_list && this.refFaidx == null) {
LOG.error("REF must be specified when saving as interval list");
return -1;
}
if (this.number_of_jobs > 0 && this.consecutive_flags) {
LOG.error("--consecutive cannot be set when --jobs is used.");
return -1;
}
if (this.number_of_jobs < 1 && this.long_length_per_bin < 1L) {
LOG.error("at least --jobs or --size must be specified.");
return -1;
}
if (this.number_of_jobs > 0 && this.long_length_per_bin > 0) {
LOG.error(" --jobs OR --size must be specified. Not both.");
return -1;
}
try {
final SAMSequenceDictionary dict;
final ContigNameConverter converter;
final Comparator<String> contigComparator;
final Comparator<SimpleInterval> intervalComparator;
if (this.refFaidx == null) {
dict = null;
converter = ContigNameConverter.getIdentity();
contigComparator = (A, B) -> A.compareTo(B);
intervalComparator = this.defaultIntervalCmp;
} else {
dict = SequenceDictionaryUtils.extractRequired(this.refFaidx);
converter = ContigNameConverter.fromOneDictionary(dict);
final ContigDictComparator cmp2 = new ContigDictComparator(dict);
contigComparator = cmp2;
intervalComparator = cmp2.createLocatableComparator();
}
archiveFactory = ArchiveFactory.open(this.outputFile);
if (!this.save_as_interval_list && this.do_compress && archiveFactory.isTarOrZipArchive()) {
archiveFactory.setCompressionLevel(0);
}
manifest = new PrintWriter(this.manifestFile == null ? new NullOuputStream() : IOUtils.openPathForWriting(this.manifestFile));
manifest.print("#");
if (this.group_by_contig) {
manifest.print("chrom");
manifest.print("\t");
manifest.print("start");
manifest.print("\t");
manifest.print("end");
manifest.print("\t");
}
manifest.print("filename");
manifest.print("\t");
manifest.print("number_of_records");
manifest.print("\t");
manifest.print("sum-length");
manifest.print("\t");
manifest.print("avg-length");
manifest.print("\t");
manifest.print("stddev-size");
manifest.println();
try (BufferedReader br = super.openBufferedReader(oneFileOrNull(args))) {
final Stream<SimpleInterval> st1 = br.lines().filter(L -> !BedLine.isBedHeader(L)).map(L -> codec.decode(L)).filter(B -> B != null).filter(B -> !StringUtils.isBlank(converter.apply(B.getContig()))).map(B -> new SimpleInterval(converter.apply(B.getContig()), B.getStart(), B.getEnd()));
;
if (this.group_by_contig) {
final Map<String, List<SimpleInterval>> contig2lines = new TreeMap<>(contigComparator);
contig2lines.putAll(st1.collect(Collectors.groupingBy(B -> B.getContig())));
for (final String ctg : contig2lines.keySet()) {
apply_cluster(manifest, archiveFactory, contig2lines.get(ctg), intervalComparator, dict);
}
} else {
apply_cluster(manifest, archiveFactory, st1.collect(Collectors.toList()), intervalComparator, dict);
}
}
manifest.flush();
manifest.close();
archiveFactory.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(manifest);
CloserUtil.close(archiveFactory);
}
}
use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
the class GtfUpstreamOrf method doWork.
@Override
public int doWork(final List<String> args) {
GtfReader gtfReader = null;
PrintWriter pw = null;
try {
this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
this.refCtgNameConverter = ContigNameConverter.fromOneDictionary(refDict);
final ContigDictComparator ctgDictComparator = new ContigDictComparator(refDict);
final String input = oneFileOrNull(args);
gtfReader = input == null ? new GtfReader(stdin()) : new GtfReader(input);
gtfReader.setContigNameConverter(this.refCtgNameConverter);
final List<Gene> genes = gtfReader.getAllGenes().stream().filter(G -> G.hasStrand()).sorted((A, B) -> {
int i = ctgDictComparator.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
}).collect(Collectors.toList());
gtfReader.close();
gtfReader = null;
pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
for (final KozakSequence.Strength f : KozakSequence.Strength.values()) {
pw.println("#kozak." + f.name() + "=" + kozakStrengthToScore(f));
}
final String gtfSource = getProgramName().toLowerCase();
for (final SAMSequenceRecord ssr : refDict.getSequences()) {
pw.println("##contig " + ssr.getSequenceName() + ": length:" + ssr.getSequenceLength());
}
pw.println("#" + gtfSource + ":" + JVarkitVersion.getInstance().toString());
if (!StringUtils.isBlank(input)) {
pw.println("#gtf:" + input);
}
final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().dictionary(refDict).logger(LOG).build();
for (final Gene gene : genes) {
progress.apply(gene);
/* new reference sequence */
if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(gene.getContig())) {
this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, gene.getContig());
}
final List<RNASequence> rnas = gene.getTranscripts().stream().filter(T -> T.isCoding()).filter(T -> T.hasStrand()).filter(T -> T.hasCodonStartDefined()).filter(T -> T.getTranscriptUTR5().isPresent()).map(T -> new RNASequence(T)).collect(Collectors.toList());
if (rnas.isEmpty())
continue;
final Set<OpenReadingFrame> orfs = rnas.stream().flatMap(R -> R.getUpstreamOpenReadingFrames().stream()).collect(Collectors.toSet());
if (orfs.isEmpty())
continue;
boolean gene_printed = false;
for (final OpenReadingFrame uORF : orfs) {
/* is there any other RNA containing this uORF ?*/
if (rnas.stream().filter(other -> !other.getTranscript().getId().equals(uORF.mRNA.getTranscript().getId())).anyMatch(other -> {
// other must have atg in frame and ATG before the observed one
final int other_atg0 = other.getATG0InRNA();
if (uORF.in_rna_atg0 % 3 != other_atg0 % 3)
return false;
return uORF.in_rna_atg0 >= other_atg0;
}))
continue;
final String transcript_id = uORF.getTranscript().getId() + ".uorf" + (1 + uORF.in_rna_atg0);
if (!gene_printed) {
gene_printed = true;
pw.print(gene.getContig());
pw.print("\t");
// source
pw.print(gtfSource);
pw.print("\t");
pw.print("gene");
pw.print("\t");
// start
pw.print(gene.getStart());
pw.print("\t");
// end
pw.print(gene.getEnd());
pw.print("\t");
// score
pw.print(".");
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print(".");
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.println();
}
// TRANSCRIPT
final UTR utr_5_prime = uORF.getTranscript().getTranscriptUTR5().get();
pw.print(gene.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("transcript");
pw.print("\t");
if (gene.isPositiveStrand()) {
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_atg0] + 1);
pw.print("\t");
// pw.print(transcriptSequence.mrnaIndex0ToBase[uORF.in_rna_stop0]+1);
if (uORF.in_rna_stop0 == NPOS) {
pw.print(uORF.mRNA.getTranscript().getTxEnd());
} else {
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_stop0] + 1);
}
} else {
if (uORF.in_rna_stop0 == NPOS) {
pw.print(uORF.mRNA.getTranscript().getTxStart());
} else {
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_stop0] + 1);
}
pw.print("\t");
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_atg0] + 1);
}
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print("0");
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.print(keyvalue("transcript_biotype", "uORF"));
pw.print(keyvalue("kozak-seq", uORF.kozak.getString()));
pw.print(keyvalue("kozak-strength", uORF.kozak.getStrength()));
pw.print(keyvalue("translation", uORF.peptide));
pw.print(keyvalue("uORF-atg-in-frame-with-transcript-atg", uORF.uorf_atg_in_frame));
pw.print(keyvalue("utr", utr_5_prime.toString() + " " + utr_5_prime.getStart() + "-" + utr_5_prime.getEnd()));
pw.println();
// Exon
for (final Exon exon : uORF.getTranscript().getExons()) {
pw.print(exon.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("exon");
pw.print("\t");
pw.print(exon.getStart());
pw.print("\t");
pw.print(exon.getEnd());
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(exon.getStrand());
pw.print("\t");
// phase
pw.print(0);
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.println();
}
final List<Interval> startBlocks = uORF.mRNA.getCodonBlocks(uORF.in_rna_atg0, uORF.in_rna_atg0 + 1, uORF.in_rna_atg0 + 2);
final List<Interval> stopBlocks = uORF.in_rna_stop0 != NPOS ? uORF.mRNA.getCodonBlocks(uORF.in_rna_stop0, uORF.in_rna_stop0 + 1, uORF.in_rna_stop0 + 2) : Collections.emptyList();
// CDS
if (!stopBlocks.isEmpty()) {
final int cdsStart = startBlocks.stream().mapToInt(B -> B.getStart()).min().orElseThrow(IllegalStateException::new);
final int cdsEnd = stopBlocks.stream().mapToInt(B -> B.getEnd()).max().orElseThrow(IllegalStateException::new);
for (final Exon exon : uORF.getTranscript().getExons()) {
if (exon.getEnd() < cdsStart)
continue;
if (exon.getStart() > cdsEnd)
break;
pw.print(exon.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("CDS");
pw.print("\t");
pw.print(Math.max(cdsStart, exon.getStart()));
pw.print("\t");
pw.print(Math.min(cdsEnd, exon.getEnd()));
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(exon.getStrand());
pw.print("\t");
// phase
pw.print(uORF.getFrameAt(Math.max(cdsStart, exon.getStart())));
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.println();
}
}
// CODON START
for (final Interval startc : startBlocks) {
PARANOID.assertLe(startc.getStart(), startc.getEnd());
pw.print(startc.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("start_codon");
pw.print("\t");
pw.print(startc.getStart());
pw.print("\t");
pw.print(startc.getEnd());
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print(uORF.getFrameAt(startc.getStart()));
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.print(keyvalue("distance-mrna-atg", uORF.mRNA.getATG0InRNA() - uORF.in_rna_atg0));
pw.print(keyvalue("pos0-in-mrna", uORF.in_rna_atg0));
pw.print(keyvalue("spliced", startBlocks.size() > 1));
pw.println();
}
// CODON END
for (final Interval stopc : stopBlocks) /* might be empty */
{
PARANOID.assertLe(stopc.getStart(), stopc.getEnd());
pw.print(stopc.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("stop_codon");
pw.print("\t");
pw.print(stopc.getStart());
pw.print("\t");
pw.print(stopc.getEnd());
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print(uORF.getFrameAt(stopc.getStart()));
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.print(keyvalue("spliced", stopBlocks.size() > 1));
pw.println();
}
}
}
progress.close();
pw.flush();
pw.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(this.indexedFastaSequenceFile);
}
}
use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
the class VcfBurdenGtf method runBurden.
@Override
protected void runBurden(PrintWriter pw, VCFReader vcfReader, VariantContextWriter vcw) throws IOException {
final SAMSequenceDictionary vcfDict = SequenceDictionaryUtils.extractRequired(vcfReader.getHeader());
final List<Gene> all_genes;
try (GtfReader gtfReader = new GtfReader(this.gtfFile)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(vcfDict));
all_genes = gtfReader.getAllGenes().stream().filter(G -> StringUtil.isBlank(this.intergenic_contig) || this.intergenic_contig.equals("*") || this.intergenic_contig.equals(G.getContig())).sorted(new ContigDictComparator(vcfDict).createLocatableComparator()).collect(Collectors.toCollection(ArrayList::new));
}
pw.print("#chrom");
pw.print("\t");
pw.print("start0");
pw.print("\t");
pw.print("end");
pw.print("\t");
pw.print("name");
pw.print("\t");
pw.print("length");
pw.print("\t");
pw.print("gene");
pw.print("\t");
pw.print("type");
pw.print("\t");
pw.print("strand");
pw.print("\t");
pw.print("transcript");
pw.print("\t");
pw.print("gene-id");
pw.print("\t");
pw.print("intervals");
pw.print("\t");
pw.print("p-value");
pw.print("\t");
pw.print("affected_alt");
pw.print("\t");
pw.print("affected_hom");
pw.print("\t");
pw.print("unaffected_alt");
pw.print("\t");
pw.print("unaffected_hom");
pw.print("\t");
pw.print("variants.count");
pw.println();
final List<SimpleInterval> all_intergenic = new ArrayList<>();
if (!StringUtil.isBlank(this.intergenic_contig)) {
for (final SAMSequenceRecord ssr : vcfDict.getSequences()) {
if (!(this.intergenic_contig.equals("*") || this.intergenic_contig.equals(ssr.getSequenceName())))
continue;
final BitSet filled = new BitSet(ssr.getSequenceLength() + 2);
all_genes.stream().filter(G -> G.getContig().equals(ssr.getSequenceName())).forEach(G -> filled.set(G.getStart(), 1 + /* bit set is 0 based */
Math.min(G.getEnd(), ssr.getSequenceLength())));
int i = 1;
while (i < ssr.getSequenceLength()) {
if (filled.get(i)) {
i++;
continue;
}
int j = i;
while (j < ssr.getSequenceLength() && !filled.get(j)) {
j++;
}
all_intergenic.add(new SimpleInterval(ssr.getSequenceName(), i, j));
i = j + 1;
}
all_genes.removeIf(G -> G.getContig().equals(ssr.getSequenceName()));
}
all_genes.clear();
}
final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
/* run genes */
for (final Gene gene : all_genes) {
progress.apply(gene);
final IntervalTree<VariantContext> intervalTree = new IntervalTree<>();
vcfReader.query(gene).stream().filter(V -> accept(V)).forEach(V -> intervalTree.put(V.getStart(), V.getEnd(), V));
if (intervalTree.size() == 0)
continue;
for (final Transcript transcript : gene.getTranscripts()) {
final List<SubPartOfTranscript> parts = new ArrayList<>();
parts.addAll(transcript.getExons().stream().map(R -> new SubPartOfTranscript(R)).collect(Collectors.toList()));
parts.addAll(transcript.getIntrons().stream().map(R -> new SubPartOfTranscript(R)).collect(Collectors.toList()));
final int intron_window_size = 1000;
final int intron_window_shift = 500;
for (final Intron intron : transcript.getIntrons()) {
if (intron.getLengthOnReference() <= intron_window_size)
continue;
int start_pos = intron.getStart();
while (start_pos + intron_window_size <= intron.getEnd()) {
int xend = Math.min(intron.getEnd(), start_pos + intron_window_size - 1);
int xstart = xend - intron_window_size - 1;
parts.add(new SubPartOfTranscript(transcript, intron.getName() + ".Sliding", Collections.singletonList(new SimpleInterval(intron.getContig(), xstart, xend))));
start_pos += intron_window_shift;
}
}
for (final UTR utr : transcript.getUTRs()) {
parts.add(new SubPartOfTranscript(transcript, utr.getName(), utr.getIntervals()));
}
if (transcript.getExonCount() > 1) {
parts.add(new SubPartOfTranscript(transcript, "AllExons", transcript.getExons().stream().map(E -> E.toInterval()).collect(Collectors.toList())));
}
if (transcript.hasCodonStartDefined() && transcript.hasCodonStopDefined() && transcript.getAllCds().size() > 1) {
parts.add(new SubPartOfTranscript(transcript, "AllCds", transcript.getAllCds().stream().map(E -> E.toInterval()).collect(Collectors.toList())));
}
final int L = transcript.getTranscriptLength();
final int[] index2genomic = new int[L];
int pos = 0;
for (final Exon exon : transcript.getExons()) {
for (int i = exon.getStart(); i <= exon.getEnd(); i++) {
index2genomic[pos] = i;
pos++;
}
}
final int window_size = 200;
final int window_shift = 100;
int array_index = 0;
while (array_index < index2genomic.length) {
final List<Locatable> intervals = new ArrayList<>();
int prev_pos = -1;
int start_pos = index2genomic[array_index];
int i = 0;
while (i < window_size && array_index + i < index2genomic.length) {
final int curr_pos = index2genomic[array_index + i];
if (i > 0 && prev_pos + 1 != curr_pos) {
intervals.add(new SimpleInterval(transcript.getContig(), start_pos, prev_pos));
start_pos = curr_pos;
}
prev_pos = curr_pos;
i++;
}
intervals.add(new SimpleInterval(transcript.getContig(), start_pos, prev_pos));
parts.add(new SubPartOfTranscript(transcript, "Sliding", intervals));
array_index += window_shift;
}
for (final SubPartOfTranscript part : parts) {
final List<VariantContext> variants = new ArrayList<>();
for (final Locatable loc : part.intervals) {
Iterator<IntervalTree.Node<VariantContext>> iter = intervalTree.overlappers(loc.getStart(), loc.getEnd());
while (iter.hasNext()) variants.add(iter.next().getValue());
}
if (variants.isEmpty())
continue;
final FisherResult fisher = runFisher(variants);
if (fisher.p_value > this.fisherTreshold)
continue;
if (vcw != null) {
for (final VariantContext ctx : variants) {
vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(part.label)).make());
}
}
pw.print(part.getContig());
pw.print("\t");
pw.print(part.getStart() - 1);
pw.print("\t");
pw.print(part.getEnd());
pw.print("\t");
pw.print(part.label);
pw.print("\t");
pw.print(part.getLengthOnReference());
pw.print("\t");
pw.print(transcript.getProperties().getOrDefault("gene_name", "."));
pw.print("\t");
pw.print(transcript.getProperties().getOrDefault("transcript_type", "."));
pw.print("\t");
pw.print(gene.getStrand());
pw.print("\t");
pw.print(transcript.getId());
pw.print("\t");
pw.print(gene.getId());
pw.print("\t");
pw.print(part.intervals.stream().map(R -> String.valueOf(R.getStart()) + "-" + R.getEnd()).collect(Collectors.joining(";")));
pw.print("\t");
pw.print(fisher.p_value);
pw.print("\t");
pw.print(fisher.affected_alt);
pw.print("\t");
pw.print(fisher.affected_hom);
pw.print("\t");
pw.print(fisher.unaffected_alt);
pw.print("\t");
pw.print(fisher.unaffected_hom);
pw.print("\t");
pw.print(variants.size());
pw.println();
}
}
}
progress.close();
final ProgressFactory.Watcher<SimpleInterval> progress2 = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
/**
* scan intergenics ...
*/
for (final SimpleInterval intergenic : all_intergenic) {
progress2.apply(intergenic);
final int intergenic_window_size = 2000;
final int intergenic_window_shifr = 100;
final List<SimpleInterval> parts = new ArrayList<>();
if (intergenic.getLengthOnReference() <= intergenic_window_size)
continue;
int start_pos = intergenic.getStart();
while (start_pos + intergenic_window_size <= intergenic.getEnd()) {
int xend = Math.min(intergenic.getEnd(), start_pos + intergenic_window_size - 1);
int xstart = xend - intergenic_window_size - 1;
parts.add(new SimpleInterval(intergenic.getContig(), xstart, xend));
start_pos += intergenic_window_shifr;
}
for (final SimpleInterval part : parts) {
final List<VariantContext> variants = vcfReader.query(part).stream().filter(V -> accept(V)).collect(Collectors.toList());
if (variants.isEmpty())
continue;
final FisherResult fisher = runFisher(variants);
if (fisher.p_value > this.fisherTreshold)
continue;
final String label = "intergenic_" + part.getStart() + "_" + part.getEnd();
if (vcw != null) {
for (final VariantContext ctx : variants) {
vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(label)).make());
}
}
pw.print(part.getContig());
pw.print("\t");
pw.print(part.getStart() - 1);
pw.print("\t");
pw.print(part.getEnd());
pw.print("\t");
pw.print(label);
pw.print("\t");
pw.print(part.getLengthOnReference());
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print("intergenic");
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print("" + part.getStart() + "-" + part.getEnd());
pw.print("\t");
pw.print(fisher.p_value);
pw.print("\t");
pw.print(fisher.affected_alt);
pw.print("\t");
pw.print(fisher.affected_hom);
pw.print("\t");
pw.print(fisher.unaffected_alt);
pw.print("\t");
pw.print(fisher.unaffected_hom);
pw.print("\t");
pw.print(variants.size());
pw.println();
}
}
progress2.close();
}
use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
the class HmmMergeBed method doWork.
@Override
public int doWork(final List<String> args) {
PrintWriter pw = null;
try {
if (this.numTreshold < 1) {
LOG.error("bad treshold.");
return -1;
}
final Comparator<Base> locCompare;
if (this.dictPath != null) {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.dictPath);
locCompare = new ContigDictComparator(dict).createLocatableComparator();
} else {
locCompare = (A, B) -> {
int i = A.getContig().compareTo(B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
};
}
final List<Path> paths = IOUtils.unrollPaths(args);
if (this.numTreshold > paths.size()) {
LOG.error("bad treshold (> number of files).");
return -1;
}
if (paths.isEmpty()) {
LOG.error("empty input");
return -1;
}
final List<CloseableIterator<Base>> baseiterators = new ArrayList<>(paths.size());
for (final Path bedPath : paths) {
final BedLineIterator iter1 = new BedLineIterator(bedPath);
final BedLineToBaseIterator iter2 = new BedLineToBaseIterator(iter1);
baseiterators.add(iter2);
}
pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
final BaseMergerIterator bmiter = new BaseMergerIterator(baseiterators, locCompare);
while (bmiter.hasNext()) {
final BaseMerge bm = bmiter.next();
if (!bm.isValid()) {
if (this.hide_invalid)
continue;
pw.print("#");
}
pw.print(bm.contig);
pw.print('\t');
pw.print(bm.start - 1);
pw.print('\t');
pw.print(bm.end);
pw.print('\t');
pw.print(bm.getOpcode());
pw.println();
}
bmiter.close();
pw.flush();
pw.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
use of com.github.lindenb.jvarkit.util.samtools.ContigDictComparator in project jvarkit by lindenb.
the class IbdToVcf method loadMarkers.
private void loadMarkers() throws IOException {
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.markerBedPath)) {
String line;
while ((line = br.readLine()) != null) {
if (StringUtils.isBlank(line) || line.startsWith("#"))
continue;
final String[] tokens = CharSplitter.TAB.split(line);
if (tokens.length != 4)
throw new JvarkitException.TokenErrors(4, tokens);
final String contig = tokens[0];
final SAMSequenceRecord ssr = this.dictionary.getSequence(tokens[0]);
if (ssr == null)
throw new JvarkitException.ContigNotFoundInDictionary(contig, this.dictionary);
int start = Integer.parseInt(tokens[1]);
if (start < 1 || start > ssr.getLengthOnReference()) {
throw new IndexOutOfBoundsException("pos 0<" + start + "<" + ssr.getSequenceLength() + " in " + line);
}
final String name = tokens[3];
final Marker marker = new Marker(ssr.getSequenceIndex(), start + 1, name);
this.all_markers.add(marker);
}
}
if (this.all_markers.isEmpty())
throw new IOException("Not enough markers in " + this.markerBedPath);
Collections.sort(this.all_markers, new ContigDictComparator(this.dictionary).createLocatableComparator());
for (int i = 0; i < this.all_markers.size(); i++) {
this.all_markers.get(i).index = i;
}
LOG.info("markers N=" + this.all_markers.size());
}
Aggregations