use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.
the class VcfRenameSamples method parseNames.
private void parseNames(final Path f) throws IOException {
final CharSplitter tab = CharSplitter.TAB;
final BufferedReader in = IOUtils.openPathForBufferedReading(f);
String line;
while ((line = in.readLine()) != null) {
if (line.startsWith("#"))
continue;
if (StringUtil.isBlank(line))
continue;
final String[] tokens = tab.split(line);
if (tokens.length != 2)
throw new IOException("Expected two columns in \"" + line + "\" : " + f);
tokens[0] = tokens[0].trim();
tokens[1] = tokens[1].trim();
if (StringUtil.isBlank(tokens[0]))
throw new IOException("Empty src-name in \"" + line + "\" : " + f);
if (StringUtil.isBlank(tokens[1]))
throw new IOException("Empty dest-name in \"" + line + "\" : " + f);
if (tokens[0].equals(tokens[1])) {
LOG.warning("Same src/dest in in \"" + line + "\" : " + f);
continue;
}
if (this.oldNameToNewName.containsKey(tokens[0])) {
final String dest = this.oldNameToNewName.get(tokens[0]);
if (dest.equals(tokens[1])) {
LOG.warning(tokens[0] + " -> " + tokens[1] + " defined twice");
continue;
} else {
throw new IOException("Empty src-name was first defined as " + dest + " and now as " + tokens[1] + " in \"" + line + "\" : " + f);
}
}
this.oldNameToNewName.put(tokens[0], tokens[1]);
}
in.close();
}
use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.
the class BackLocate method run.
private void run(final PrintStream out, final BufferedReader in) throws IOException {
final CharSplitter tab = CharSplitter.TAB;
String line;
while ((line = in.readLine()) != null) {
if (line.startsWith("#") || StringUtils.isBlank(line))
continue;
final String[] tokens = JvarkitException.TokenErrors.atLeast(2, tab.split(line, 3));
final String geneName = tokens[0].trim();
if (StringUtils.isBlank(geneName))
throw new IOException("Bad line. No gene in " + geneName);
final String mut = tokens[1].trim();
int x0 = 0, x1 = 0;
while (x1 < mut.length() && Character.isLetter(mut.charAt(x1))) {
++x1;
}
String substr = mut.substring(x0, x1);
final AminoAcids.AminoAcid aa1 = substr.length() == 1 ? AminoAcids.getAminoAcidFromOneLetterCode(substr.charAt(0)) : AminoAcids.getAminoAcidFromThreeLettersCode(substr);
if (aa1 == null)
throw new JvarkitException.UserError("Bad mutation " + mut + " (cannot parse left AA)");
x0 = x1;
while (x1 < mut.length() && Character.isDigit(mut.charAt(x1))) {
++x1;
}
substr = mut.substring(x0, x1);
if (!StringUtils.isInteger(substr))
throw new JvarkitException.UserError("Bad mutation " + mut + " (cannot parse position)");
final int position1 = Integer.parseInt(substr);
if (position1 == 0)
throw new IOException("Bad position in protein (" + substr + ") in " + line);
substr = mut.substring(x1);
final AminoAcids.AminoAcid aa2 = substr.length() == 1 ? AminoAcids.getAminoAcidFromOneLetterCode(substr.charAt(0)) : AminoAcids.getAminoAcidFromThreeLettersCode(substr);
if (aa2 == null)
throw new JvarkitException.UserError("Bad mutation " + mut + " (cannot parse right AA)");
final List<Transcript> transcripts = this.name2transcripts.get(geneName.toUpperCase());
if (transcripts == null || transcripts.isEmpty()) {
LOG.warn("no transcript found for " + geneName);
continue;
}
for (final Transcript transcript : transcripts) {
backLocate(out, transcript, geneName, aa1, aa2, position1, tokens.length > 2 ? tokens[2] : ".");
}
}
}
use of com.github.lindenb.jvarkit.lang.CharSplitter in project jvarkit by lindenb.
the class Biostar214299 method doWork.
@Override
public int doWork(final List<String> args) {
if (this.positionFile == null) {
LOG.error("position File is not defined.");
return -1;
}
final String UNAFFECTED_SAMPLE = "UNAFFECTED";
final String AMBIGOUS_SAMPLE = "AMBIGOUS";
final String UNMAPPED = "UNMAPPED";
SamReader sfr = null;
SAMFileWriter sfw = null;
final IntervalTreeMap<Position> positionsTreeMap = new IntervalTreeMap<>();
final Set<String> samples = new HashSet<>();
try {
sfr = openSamReader(oneFileOrNull(args));
final SAMFileHeader header = sfr.getFileHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.positionFile)) {
String line;
final CharSplitter tab = CharSplitter.TAB;
while ((line = br.readLine()) != null) {
if (StringUtils.isBlank(line) || line.startsWith("#"))
continue;
final String[] tokens = tab.split(line);
if (tokens.length < 4) {
LOG.error("Not enough columns in " + line);
return -1;
}
final String contig = tokens[0];
if (dict.getSequence(contig) == null) {
LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(contig, dict));
return -1;
}
final int refpos = Integer.parseInt(tokens[1]);
final Interval interval = new Interval(contig, refpos, refpos);
Position position = positionsTreeMap.get(interval);
if (position == null) {
position = new Position();
// position.contig = contig;
position.refpos = refpos;
positionsTreeMap.put(interval, position);
}
final String bases = tokens[2].toUpperCase();
if (bases.length() != 1 || !bases.matches("[ATGC]")) {
LOG.error("in " + line + " bases should be one letter and ATGC");
return -1;
}
if (position.base2sample.containsKey(bases.charAt(0))) {
LOG.error("in " + line + " bases already defined for this position");
return -1;
}
final String sampleName = tokens[3].trim();
if (sampleName.isEmpty()) {
LOG.error("sample name cannot be empty");
return -1;
}
samples.add(sampleName);
position.base2sample.put(bases.charAt(0), sampleName);
}
} catch (final IOException err) {
LOG.error(err);
return -1;
}
if (samples.contains(UNAFFECTED_SAMPLE)) {
LOG.error("Sample cannot be named " + UNAFFECTED_SAMPLE);
return -1;
}
if (samples.contains(AMBIGOUS_SAMPLE)) {
LOG.error("Sample cannot be named " + AMBIGOUS_SAMPLE);
return -1;
}
if (samples.contains(UNMAPPED)) {
LOG.error("Sample cannot be named " + UNMAPPED);
return -1;
}
samples.add(UNAFFECTED_SAMPLE);
samples.add(AMBIGOUS_SAMPLE);
samples.add(UNMAPPED);
final SAMFileHeader newHeader = new SAMFileHeader();
newHeader.setSortOrder(header.getSortOrder());
newHeader.setSequenceDictionary(dict);
newHeader.addComment("generated with " + getProgramName() + " " + getVersion() + " Pierre Lindenbaum : " + getProgramCommandLine());
/* create groups */
for (final String sample : samples) {
final SAMReadGroupRecord rg = new SAMReadGroupRecord(sample);
rg.setSample(sample);
rg.setLibrary(sample);
newHeader.addReadGroup(rg);
}
sfw = this.writingBamArgs.setReferencePath(this.reference).openSamWriter(this.outputFile, newHeader, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
final SAMRecordIterator iter = sfr.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
rec.setAttribute("RG", null);
if (rec.getReadUnmappedFlag()) {
rec.setAttribute("RG", UNMAPPED);
sfw.addAlignment(rec);
continue;
}
final Cigar cigar = rec.getCigar();
final Collection<Position> snps = positionsTreeMap.getContained(new Interval(rec.getContig(), rec.getUnclippedStart(), rec.getUnclippedEnd()));
if (snps == null || snps.isEmpty()) {
rec.setAttribute("RG", UNAFFECTED_SAMPLE);
sfw.addAlignment(rec);
continue;
}
final Map<Integer, Position> index2pos = snps.stream().collect(Collectors.toMap(P -> P.refpos, P -> P));
final Set<String> selectedSamples = new HashSet<>();
final byte[] bases = rec.getReadBases();
if (bases == null || bases.equals(SAMRecord.NULL_SEQUENCE)) {
LOG.error("Bases missing in read " + rec);
return -1;
}
int refPos1 = rec.getUnclippedStart();
int readPos0 = 0;
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
final boolean consummeReadBaseOrSoftClip = op.consumesReadBases() || op.equals(CigarOperator.S);
if (op.consumesReferenceBases() && consummeReadBaseOrSoftClip) {
for (int i = 0; i < ce.getLength(); ++i) {
final int nowRefPos1 = (refPos1 + i);
final int nowReadPos0 = (readPos0 + i);
final Position position = index2pos.get(nowRefPos1);
if (position == null)
continue;
if (nowReadPos0 >= bases.length)
continue;
final char base = (char) Character.toUpperCase(bases[nowReadPos0]);
final String sample = position.base2sample.get(base);
if (sample == null)
continue;
selectedSamples.add(sample);
index2pos.remove(nowRefPos1);
if (index2pos.isEmpty())
break;
}
}
if (op.consumesReferenceBases() || op.isClipping()) {
refPos1 += ce.getLength();
}
if (consummeReadBaseOrSoftClip) {
readPos0 += ce.getLength();
}
}
if (selectedSamples.isEmpty()) {
rec.setAttribute("RG", UNAFFECTED_SAMPLE);
} else if (selectedSamples.size() == 1) {
rec.setAttribute("RG", selectedSamples.iterator().next());
} else {
rec.setAttribute("RG", AMBIGOUS_SAMPLE);
}
sfw.addAlignment(rec);
}
progress.finish();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(sfr);
CloserUtil.close(sfw);
}
}
Aggregations