Use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.
The class ContigNameConverter, method fromPath: builds a contig-name converter from a two-column, tab-delimited mapping file.
public static ContigNameConverter fromPath(final Path mappingFile) {
    IOUtil.assertFileIsReadable(mappingFile);
    final MapBasedContigNameConverter mapper = new MapBasedContigNameConverter();
    mapper.name = mappingFile.getFileName().toString();
    BufferedReader in = null;
    try {
        final CharSplitter tab = CharSplitter.TAB;
        in = IOUtils.openPathForBufferedReading(mappingFile);
        String line;
        while ((line = in.readLine()) != null) {
            if (StringUtils.isBlank(line) || line.startsWith("#"))
                continue;
            final String[] tokens = tab.split(line);
            if (tokens.length != 2 || StringUtils.isBlank(tokens[0]) || StringUtils.isBlank(tokens[1])) {
                in.close();
                in = null;
                throw new IOException("Bad mapping line: \"" + line + "\" in " + mappingFile);
            }
            tokens[0] = tokens[0].trim();
            tokens[1] = tokens[1].trim();
            if (mapper.map.containsKey(tokens[0])) {
                in.close();
                throw new IOException("Mapping defined twice for: \"" + tokens[0] + "\"");
            }
            mapper.map.put(tokens[0], tokens[1]);
        }
        return mapper;
    } catch (final IOException err) {
        throw new RuntimeIOException(err);
    } finally {
        CloserUtil.close(in);
    }
}
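A minimal usage sketch (the file name and contig names below are invented, not from the source). The mapping file holds tab-separated pairs, one per line, with '#' starting a comment:

import java.nio.file.Paths;

// hypothetical mapping file "mapping.txt":
//   chr1<TAB>1
//   chrM<TAB>MT
final ContigNameConverter converter = ContigNameConverter.fromPath(Paths.get("mapping.txt"));
final String newName = converter.apply("chr1"); // -> "1"; unmapped names yield null (as checked in doWork below)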
Use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.
The class ConvertBamChromosomes, method readMappingFile: reads the same two-column, tab-delimited format and validates each pair against the input and (optional) output sequence dictionaries.
private Map<String, String> readMappingFile(final SAMSequenceDictionary dictIn, final SAMSequenceDictionary dictOut) throws IOException {
    final Map<String, String> mapper = new LinkedHashMap<>();
    BufferedReader in = null;
    final CharSplitter tab = CharSplitter.TAB;
    try {
        in = IOUtils.openPathForBufferedReading(this.mappingFile);
        String line;
        while ((line = in.readLine()) != null) {
            if (StringUtil.isBlank(line) || line.startsWith("#"))
                continue;
            final String[] tokens = tab.split(line);
            if (tokens.length != 2 || StringUtil.isBlank(tokens[0]) || StringUtil.isBlank(tokens[1])) {
                in.close();
                in = null;
                throw new IOException("Bad mapping line: \"" + line + "\"");
            }
            tokens[0] = tokens[0].trim();
            tokens[1] = tokens[1].trim();
            if (tokens[0].equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) || tokens[1].equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
                in.close();
                throw new IOException("Bad name for: " + line);
            }
            if (dictIn.getSequence(tokens[0]) == null) {
                LOG.warn("Input dictionary doesn't contain " + tokens[0] + ", skipping");
                continue;
            }
            if (dictOut != null && dictOut.getSequence(tokens[1]) == null) {
                LOG.warn("Output dictionary doesn't contain " + tokens[1] + ", skipping");
                continue;
            }
            if (mapper.containsKey(tokens[0])) {
                in.close();
                throw new IOException("Mapping defined twice for src \"" + tokens[0] + "\"");
            }
            if (mapper.values().contains(tokens[1])) {
                throw new IOException("Mapping defined twice for dest \"" + tokens[1] + "\"");
            }
            mapper.put(tokens[0], tokens[1]);
        }
        return mapper;
    } finally {
        CloserUtil.close(in);
    }
}
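For reference, a sketch of what CharSplitter.TAB.split does to one mapping line (the line content is invented):

final String[] tokens = CharSplitter.TAB.split("chr1\t1");
// tokens.length == 2, tokens[0] is "chr1", tokens[1] is "1"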
Use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.
The class ConvertBamChromosomes, method doWork: rewrites the reference names of each SAM record, including the mate reference and the SA (supplementary alignments) attribute.
@Override
public int doWork(final List<String> args) {
    final CharSplitter SEMICOLON_PAT = CharSplitter.SEMICOLON;
    final CharSplitter COMMA_PAT = CharSplitter.COMMA;
    if (this.dictSource == null && this.mappingFile == null) {
        LOG.error("Both dict source and mapping file are undefined");
        return -1;
    }
    SamReader sr = null;
    SAMFileWriter sfw = null;
    try {
        final SamReaderFactory srf = super.createSamReaderFactory();
        if (this.faidx != null)
            srf.referenceSequence(this.faidx);
        final String input = oneFileOrNull(args);
        if (input == null) {
            sr = srf.open(SamInputResource.of(stdin()));
        } else {
            sr = srf.open(SamInputResource.of(input));
        }
        final SAMFileHeader inHeader = sr.getFileHeader();
        if (inHeader == null) {
            LOG.error("File header missing");
            return -1;
        }
        final SAMSequenceDictionary dictIn = SequenceDictionaryUtils.extractRequired(inHeader);
        if (dictIn.isEmpty()) {
            LOG.error("Input sequence dictionary is empty");
            return -1;
        }
        final SAMFileHeader outHeader = inHeader.clone();
        // create the new sequence dictionary
        SAMSequenceDictionary dictOut = null;
        Map<String, String> contigMap = null;
        if (this.dictSource != null) {
            dictOut = SAMSequenceDictionaryExtractor.extractDictionary(this.dictSource);
        }
        if (this.mappingFile != null) {
            contigMap = this.readMappingFile(dictIn, dictOut);
        }
        if (dictOut == null && contigMap != null) {
            // no target dictionary given: derive one from the mapping, keeping input lengths and attributes
            final List<SAMSequenceRecord> ssrs = new ArrayList<>(dictIn.size());
            for (final String ci : contigMap.keySet()) {
                final SAMSequenceRecord ssr1 = dictIn.getSequence(ci);
                if (ssr1 == null)
                    continue;
                final SAMSequenceRecord ssr2 = new SAMSequenceRecord(contigMap.get(ci), ssr1.getSequenceLength());
                for (final Map.Entry<String, String> atts : ssr1.getAttributes()) {
                    ssr2.setAttribute(atts.getKey(), atts.getValue());
                }
                ssrs.add(ssr2);
            }
            dictOut = new SAMSequenceDictionary(ssrs);
            this.mapper = ContigNameConverter.fromMap(contigMap);
        } else if (dictOut != null && contigMap == null) {
            this.mapper = ContigNameConverter.fromDictionaries(dictIn, dictOut);
        }
        if (this.mapper == null || dictOut == null) {
            LOG.error("Illegal state: mapper==null or dictOut==null");
            return -1;
        }
        outHeader.setSequenceDictionary(dictOut);
        // check the order of the dictionaries: do we need to change the sort order?
        int prev_out_index = -1;
        boolean output_is_sorted = true;
        for (int i = 0; i < dictIn.size(); i++) {
            final String si = dictIn.getSequence(i).getSequenceName();
            final String so = this.mapper.apply(si);
            if (so == null)
                continue;
            final int o_index = dictOut.getSequenceIndex(so);
            if (o_index < prev_out_index) {
                output_is_sorted = false;
                LOG.info("output will NOT be sorted");
                break;
            }
            prev_out_index = o_index;
        }
        if (!output_is_sorted) {
            outHeader.setSortOrder(SortOrder.unsorted);
        }
        sfw = this.writingBamArgs.openSamWriter(this.output, outHeader, true);
        long num_ignored = 0L;
        final SAMRecordIterator iter = sr.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec1 = iter.next();
            String newName1 = null;
            String newName2 = null;
            if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getReferenceName())) {
                newName1 = convert(rec1.getReferenceName(), dictOut);
            }
            if (rec1.getReadPairedFlag() && !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getMateReferenceName())) {
                newName2 = convert(rec1.getMateReferenceName(), dictOut);
            }
            rec1.setHeader(outHeader);
            if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getReferenceName())) {
                if (newName1 == null) {
                    ++num_ignored;
                    continue;
                }
                rec1.setReferenceName(newName1);
            }
            if (rec1.getReadPairedFlag() && !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(rec1.getMateReferenceName())) {
                if (newName2 == null) {
                    ++num_ignored;
                    continue;
                }
                rec1.setMateReferenceName(newName2);
            }
            // rewrite the contigs inside the SA (supplementary alignments) attribute
            if (rec1.hasAttribute(SAMTag.SA.name())) {
                final Object sa = rec1.getStringAttribute(SAMTag.SA.name());
                if (sa != null && (sa instanceof String)) {
                    final String[] tokens = SEMICOLON_PAT.split(String.class.cast(sa));
                    final List<String> L = new ArrayList<>(tokens.length);
                    for (final String semiColonStr : tokens) {
                        final String[] commaStrs = COMMA_PAT.split(semiColonStr);
                        final String newctg = convert(commaStrs[0], dictOut);
                        if (newctg == null)
                            continue;
                        commaStrs[0] = newctg;
                        L.add(String.join(",", commaStrs));
                    }
                    if (L.isEmpty()) {
                        rec1.setAttribute(SAMTag.SA.name(), null);
                    } else {
                        rec1.setAttribute(SAMTag.SA.name(), String.join(";", L));
                    }
                }
            }
            sfw.addAlignment(rec1);
        }
        if (!this.unmappedChromosomes.isEmpty()) {
            LOG.warning("Unmapped chromosomes: " + unmappedChromosomes);
        }
        LOG.warning("Number of ignored reads: " + num_ignored);
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(sr);
        CloserUtil.close(sfw);
    }
}
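The SA attribute rewritten above stores supplementary alignments as semicolon-separated records of comma-separated fields (rname,pos,strand,CIGAR,mapQ,NM). A hedged sketch of that splitting, with an invented SA value and a simple stand-in for convert():

import java.util.ArrayList;
import java.util.List;

final String sa = "chr1,100,+,50M,60,0;chr2,200,-,50M,60,1";
final List<String> kept = new ArrayList<>();
for (final String aln : CharSplitter.SEMICOLON.split(sa)) {
    final String[] fields = CharSplitter.COMMA.split(aln);
    fields[0] = fields[0].replace("chr", ""); // stand-in for convert(fields[0], dictOut)
    kept.add(String.join(",", fields));
}
final String newSa = String.join(";", kept); // "1,100,+,50M,60,0;2,200,-,50M,60,1"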
Use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.
The class ConvertBedChromosomes, method doWork: converts the chromosome column(s) of a delimited text file such as BED.
@SuppressWarnings("resource")
protected int doWork(final BufferedReader in, PrintStream out) throws IOException {
    String line;
    final CharSplitter tab = CharSplitter.of(this.delimStr);
    final Pattern headerRegex = Pattern.compile(this.ignoreLinesPattern);
    while ((line = in.readLine()) != null) {
        // pass header lines through unchanged
        final Matcher match = headerRegex.matcher(line);
        if (match.find() && match.start() == 0) {
            out.println(line);
            continue;
        }
        boolean ok = true;
        final String[] tokens = tab.split(line);
        for (final int chromColumn0 : this.chromColumns0) {
            if (chromColumn0 < 0 || chromColumn0 >= tokens.length) {
                ok = false;
                continue;
            }
            final String chrom = this.customMapping.apply(tokens[chromColumn0]);
            if (StringUtils.isBlank(chrom)) {
                if (this.unmappedChromosomes.add(tokens[chromColumn0])) {
                    LOG.warn("cannot convert " + tokens[chromColumn0]);
                }
                ok = false;
                continue;
            }
            tokens[chromColumn0] = chrom;
        }
        if (!ok) {
            switch (this.onNotFound) {
                case RAISE_EXCEPTION:
                    throw new IOException("Cannot convert chrom in: " + line);
                case SKIP:
                    continue;
                case RETURN_ORIGINAL:
                    break;
            }
        }
        out.println(String.join(String.valueOf(tab.getDelimiter()), tokens));
    }
    out.flush();
    return 0;
}
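A hedged sketch of the per-line conversion above, assuming a tab delimiter and a converter that merely strips the "chr" prefix (both invented stand-ins for this.delimStr and this.customMapping):

final CharSplitter tab = CharSplitter.TAB; // stand-in for CharSplitter.of(this.delimStr)
final String[] tokens = tab.split("chr1\t100\t200\tmy_feature");
tokens[0] = tokens[0].replace("chr", ""); // stand-in for this.customMapping.apply(...)
// String.join("\t", tokens) -> "1\t100\t200\tmy_feature"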
Use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.
The class BimToVcf, method doWork: converts a PLINK .bim file to VCF, using the indexed reference sequence to recover the REF allele.
@Override
public int doWork(final List<String> args) {
    VariantContextWriter w = null;
    BufferedReader r = null;
    ReferenceSequenceFile faidx = null;
    GenomicSequence genomic = null;
    long number_non_snvs = 0L;
    try {
        faidx = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(faidx);
        this.writingVariants.dictionary(dict);
        r = super.openBufferedReader(oneFileOrNull(args));
        final Set<VCFHeaderLine> headerLines = new HashSet<>();
        final VCFInfoHeaderLine morgan = new VCFInfoHeaderLine("MORGAN", 1, VCFHeaderLineType.Float, "Centimorgan");
        final VCFInfoHeaderLine svtype = new VCFInfoHeaderLine("SVTYPE", 1, VCFHeaderLineType.String, "Variation type");
        VCFStandardHeaderLines.addStandardInfoLines(headerLines, false, "");
        // super.addMetaData(headerLines);
        headerLines.add(morgan);
        headerLines.add(svtype);
        final List<String> genotypeSampleNames = Collections.emptyList();
        final VCFHeader header = new VCFHeader(headerLines, genotypeSampleNames);
        header.setSequenceDictionary(dict);
        w = this.writingVariants.open(this.outputFile);
        w.writeHeader(header);
        final CharSplitter tab = CharSplitter.TAB;
        String line;
        final Predicate<String> iupacATGC = S -> AcidNucleics.isATGC(S);
        // PLINK numeric chromosome codes and the contig names they map to: 23=X, 24=Y, 26=MT
        final List<String> convertCtg = Arrays.asList("23", "X", "23", "chrX", "24", "Y", "24", "chrY", "26", "MT", "26", "chrM");
        while ((line = r.readLine()) != null) {
            final String[] tokens = tab.split(line);
            if (tokens.length != 6) {
                throw new JvarkitException.TokenErrors(6, tokens);
            }
            Allele a1 = null;
            Allele a2 = null;
            Allele ref = null;
            final String contig = tokens[0];
            SAMSequenceRecord ssr = dict.getSequence(contig);
            for (int i = 0; ssr == null && i + 1 < convertCtg.size(); i += 2) {
                if (!convertCtg.get(i).equals(contig))
                    continue;
                ssr = dict.getSequence(convertCtg.get(i + 1));
            }
            // PLINK code 25 is the X/Y pseudo-autosomal region: there is no single contig for it, skip
            if (ssr == null && contig.equals("25")) {
                LOG.warn("ignoring " + line);
                ++number_non_snvs;
                continue;
            }
            if (ssr == null) {
                throw new JvarkitException.ContigNotFoundInDictionary(contig, dict);
            }
            if (genomic == null || !ssr.getSequenceName().equals(genomic.getChrom())) {
                genomic = new GenomicSequence(faidx, ssr.getSequenceName());
            }
            int pos1 = Integer.parseInt(tokens[3]);
            // "0" means a missing allele in the .bim format
            if (tokens[4].equals("0"))
                tokens[4] = tokens[5];
            if (tokens[5].equals("0"))
                tokens[5] = tokens[4];
            final VariantContextBuilder vcb = new VariantContextBuilder();
            vcb.chr(ssr.getSequenceName());
            vcb.attribute(morgan.getID(), Float.parseFloat(tokens[2]));
            if (iupacATGC.test(tokens[4]) && iupacATGC.test(tokens[5])) {
                if (tokens[4].length() != 1 || tokens[5].length() != 1) {
                    LOG.warn("Skipping " + line + " because I only handle SNVs.");
                    number_non_snvs++;
                    continue;
                }
                final String refBase = String.valueOf(genomic.charAt(pos1 - 1));
                ref = Allele.create(refBase, true);
                a1 = refBase.equalsIgnoreCase(tokens[4]) ? ref : Allele.create(tokens[4], false);
                a2 = refBase.equalsIgnoreCase(tokens[5]) ? ref : Allele.create(tokens[5], false);
                final String type;
                if (a1.isReference() && a2.isReference()) {
                    type = "NOVARIATION";
                } else if (tokens[4].length() < tokens[5].length()) {
                    type = "INS";
                } else if (tokens[4].length() > tokens[5].length()) {
                    type = "DEL";
                } else {
                    type = "SNV";
                }
                vcb.attribute(svtype.getID(), type);
            } else if ((tokens[4].equals("-") && iupacATGC.test(tokens[5])) || (tokens[5].equals("-") && iupacATGC.test(tokens[4]))) {
                // deletion: shift left and prepend the reference base
                pos1--;
                final String refBase = String.valueOf(genomic.charAt(pos1 - 1));
                a1 = Allele.create(refBase, false);
                ref = Allele.create(refBase + tokens[tokens[4].equals("-") ? 5 : 4], true);
                a2 = a1;
                vcb.attribute(svtype.getID(), "DEL");
            } else if (tokens[4].equals("-") && tokens[5].equals("-")) {
                // deletion of a single reference base: shift left
                pos1--;
                final String refBase = String.valueOf(genomic.charAt(pos1 - 1));
                a1 = Allele.create(refBase, false);
                ref = Allele.create(refBase + genomic.charAt(pos1), true);
                a2 = a1;
                vcb.attribute(svtype.getID(), "DEL");
            } else {
                LOG.warn("Skipping " + line + ".");
                number_non_snvs++;
                continue;
            }
            final Set<Allele> alleles = new HashSet<>();
            alleles.add(ref);
            alleles.add(a1);
            alleles.add(a2);
            vcb.start(pos1);
            vcb.stop(pos1 + ref.length() - 1);
            if (!tokens[1].isEmpty())
                vcb.id(tokens[1]);
            vcb.alleles(alleles);
            w.add(vcb.make());
        }
        r.close();
        r = null;
        w.close();
        w = null;
        if (number_non_snvs > 0)
            LOG.warn("Number of lines skipped: " + number_non_snvs);
        return 0;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(faidx);
        CloserUtil.close(w);
        CloserUtil.close(r);
    }
}
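For context, each .bim line carries six tab-separated fields: chromosome code, variant ID, genetic distance in (centi)morgans, base-pair position, allele 1 and allele 2. A sketch with an invented line:

final String[] tokens = CharSplitter.TAB.split("1\trs123\t0.0\t12345\tA\tG");
// tokens[0]="1" contig, tokens[1]="rs123" ID, tokens[2]="0.0" MORGAN,
// tokens[3]="12345" position, tokens[4]="A", tokens[5]="G" alleles ("0" would mean missing)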