use of beast.evolution.datatype.UserDataType in project beast2 by CompEvol.
the class NexusParser method processCharstatelabelsTokens.
private ArrayList<UserDataType> processCharstatelabelsTokens(ArrayList<String> tokens, int[] maxNumberOfStates) throws IOException {
ArrayList<UserDataType> charDescriptions = new ArrayList<>();
final int CHAR_NR = 0, CHAR_NAME = 1, STATES = 2;
int mode = CHAR_NR;
int charNumber = -1;
String charName = "";
ArrayList<String> states = new ArrayList<>();
for (String token : tokens) {
switch(mode) {
case CHAR_NR:
charNumber = Integer.parseInt(token);
mode = CHAR_NAME;
break;
case CHAR_NAME:
if (token.equals("/")) {
mode = STATES;
} else if (token.equals(",")) {
if (charNumber > charDescriptions.size() + 1) {
throw new IOException("Character descriptions should go in the ascending order and there " + "should not be any description missing.");
}
charDescriptions.add(new UserDataType(charName, states));
maxNumberOfStates[0] = Math.max(maxNumberOfStates[0], states.size());
charNumber = -1;
charName = "";
states = new ArrayList<>();
mode = CHAR_NR;
} else {
charName = token;
}
break;
case STATES:
if (token.equals(",")) {
if (charNumber > charDescriptions.size() + 1) {
throw new IOException("Character descriptions should go in the ascending order and there " + "should not be any description missing.");
}
charDescriptions.add(new UserDataType(charName, states));
maxNumberOfStates[0] = Math.max(maxNumberOfStates[0], states.size());
charNumber = -1;
charName = "";
states = new ArrayList<>();
mode = CHAR_NR;
} else {
states.add(token);
}
default:
break;
}
}
return charDescriptions;
}
use of beast.evolution.datatype.UserDataType in project beast2 by CompEvol.
the class NexusParser method parseDataBlock.
// parseCalibrations
/**
* parse data block and create Alignment *
*/
public Alignment parseDataBlock(final BufferedReader fin) throws IOException {
final Alignment alignment = new Alignment();
String str;
int taxonCount = -1;
int charCount = -1;
int totalCount = 4;
String missing = "?";
String gap = "-";
// indicates character matches the one in the first sequence
String matchChar = null;
do {
str = nextLine(fin);
// dimensions ntax=12 nchar=898;
if (str.toLowerCase().contains("dimensions")) {
str = getNextDataBlock(str, fin);
final String character = getAttValue("nchar", str);
if (character == null) {
throw new IOException("nchar attribute expected (e.g. 'dimensions char=123') expected, not " + str);
}
charCount = Integer.parseInt(character);
final String taxa = getAttValue("ntax", str);
if (taxa != null) {
taxonCount = Integer.parseInt(taxa);
}
} else if (str.toLowerCase().contains("format")) {
str = getNextDataBlock(str, fin);
// format datatype=dna interleave=no gap=-;
final String dataTypeName = getAttValue("datatype", str);
final String symbols;
if (getAttValue("symbols", str) == null) {
symbols = getAttValue("symbols", str);
} else {
symbols = getAttValue("symbols", str).replaceAll("\\s", "");
}
if (dataTypeName == null) {
Log.warning.println("Warning: expected datatype (e.g. something like 'format datatype=dna;') not '" + str + "' Assuming integer dataType");
alignment.dataTypeInput.setValue("integer", alignment);
if (symbols != null && (symbols.equals("01") || symbols.equals("012"))) {
totalCount = symbols.length();
}
} else if (dataTypeName.toLowerCase().equals("rna") || dataTypeName.toLowerCase().equals("dna") || dataTypeName.toLowerCase().equals("nucleotide")) {
alignment.dataTypeInput.setValue("nucleotide", alignment);
totalCount = 4;
} else if (dataTypeName.toLowerCase().equals("aminoacid") || dataTypeName.toLowerCase().equals("protein")) {
alignment.dataTypeInput.setValue("aminoacid", alignment);
totalCount = 20;
} else if (dataTypeName.toLowerCase().equals("standard")) {
alignment.dataTypeInput.setValue("standard", alignment);
totalCount = symbols.length();
// if (symbols == null || symbols.equals("01")) {
// alignment.dataTypeInput.setValue("binary", alignment);
// totalCount = 2;
// } else {
// alignment.dataTypeInput.setValue("standard", alignment);
// totalCount = symbols.length();
// }
} else if (dataTypeName.toLowerCase().equals("binary")) {
alignment.dataTypeInput.setValue("binary", alignment);
totalCount = 2;
} else {
alignment.dataTypeInput.setValue("integer", alignment);
if (symbols != null && (symbols.equals("01") || symbols.equals("012"))) {
totalCount = symbols.length();
}
}
final String missingChar = getAttValue("missing", str);
if (missingChar != null) {
missing = missingChar;
}
final String gapChar = getAttValue("gap", str);
if (gapChar != null) {
gap = gapChar;
}
matchChar = getAttValue("matchchar", str);
}
} while (!str.trim().toLowerCase().startsWith("matrix") && !str.toLowerCase().contains("charstatelabels"));
if (alignment.dataTypeInput.get().equals("standard")) {
StandardData type = new StandardData();
type.setInputValue("nrOfStates", totalCount);
// type.setInputValue("symbols", symbols);
type.initAndValidate();
alignment.setInputValue("userDataType", type);
}
// reading CHARSTATELABELS block
if (str.toLowerCase().contains("charstatelabels")) {
if (!alignment.dataTypeInput.get().equals("standard")) {
throw new IllegalArgumentException("If CHARSTATELABELS block is specified then DATATYPE has to be Standard");
}
StandardData standardDataType = (StandardData) alignment.userDataTypeInput.get();
int[] maxNumberOfStates = new int[] { 0 };
ArrayList<String> tokens = readInCharstatelablesTokens(fin);
ArrayList<UserDataType> charDescriptions = processCharstatelabelsTokens(tokens, maxNumberOfStates);
// while (true) {
// str = nextLine(fin);
// if (str.contains(";")) {
// break;
// }
// String[] strSplit = str.split("/");
// ArrayList<String> states = new ArrayList<>();
//
// if (strSplit.length < 2) {
// charDescriptions.add(new UserDataType(strSplit[0], states));
// continue;
// }
//
// String stateStr = strSplit[1];
//
// //add a comma at the end of the string if the last non-whitespace character is not a comma or all the
// // characters are whitespaces in the string. Also remove whitespaces at the end of the string.
// for (int i=stateStr.length()-1; i>=0; i--) {
// if (!Character.isWhitespace(stateStr.charAt(i))) {
// if (stateStr.charAt(i-1) != ',') {
// stateStr = stateStr.substring(0, i)+",";
// break;
// }
// }
// if (i==0) {
// stateStr = stateStr.substring(0, i)+",";
// }
// }
// if (stateStr.isEmpty()) {
// stateStr = stateStr+",";
// }
//
// final int WAITING=0, WORD=1, PHRASE_IN_QUOTES=2;
// int mode =WAITING; //0 waiting for non-space letter, 1 reading a word; 2 reading a phrase in quotes
// int begin =0, end;
//
// for (int i=0; i< stateStr.length(); i++) {
// switch (mode) {
// case WAITING:
// while (stateStr.charAt(i) == ' ') {
// i++;
// }
// mode = stateStr.charAt(i) == '\'' ? PHRASE_IN_QUOTES : WORD;
// begin = i;
// break;
// case WORD:
// end = stateStr.indexOf(" ", begin) != -1 ? stateStr.indexOf(" ", begin) : stateStr.indexOf(",", begin);
// states.add(stateStr.substring(begin, end));
// i=end;
// mode = WAITING;
// break;
// case PHRASE_IN_QUOTES:
// end = begin;
// do {
// end = stateStr.indexOf("'", end+2);
// } while (stateStr.charAt(end+1) == '\'' || end == -1);
// if (end == -1) {
// Log.info.println("Incorrect description in charstatelabels. Single quote found in line ");
// }
// end++;
// states.add(stateStr.substring(begin, end));
// i=end;
// mode=WAITING;
// break;
// default:
// break;
// }
// }
// // oldTODO make strSplit[0] look nicer (remove whitespaces and may be numbers at the beginning)
// charDescriptions.add(new UserDataType(strSplit[0], states));
// maxNumberOfStates = Math.max(maxNumberOfStates, states.size());
// }
standardDataType.setInputValue("charstatelabels", charDescriptions);
standardDataType.setInputValue("nrOfStates", Math.max(maxNumberOfStates[0], totalCount));
standardDataType.initAndValidate();
for (UserDataType dataType : standardDataType.charStateLabelsInput.get()) {
dataType.initAndValidate();
}
}
// skipping before MATRIX block
while (!str.toLowerCase().contains(("matrix"))) {
str = nextLine(fin);
}
// read character data
// Use string builder for efficiency
final Map<String, StringBuilder> seqMap = new HashMap<>();
final List<String> taxa = new ArrayList<>();
String prevTaxon = null;
int seqLen = 0;
while (true) {
str = nextLine(fin);
int start = 0, end;
final String taxon;
while (Character.isWhitespace(str.charAt(start))) {
start++;
}
if (str.charAt(start) == '\'' || str.charAt(start) == '\"') {
final char c = str.charAt(start);
start++;
end = start;
while (str.charAt(end) != c) {
end++;
}
taxon = str.substring(start, end);
seqLen = 0;
end++;
} else {
end = start;
while (end < str.length() && !Character.isWhitespace(str.charAt(end))) {
end++;
}
if (end < str.length()) {
taxon = str.substring(start, end);
seqLen = 0;
} else if ((prevTaxon == null || seqLen == charCount) && end == str.length()) {
taxon = str.substring(start, end);
seqLen = 0;
} else {
taxon = prevTaxon;
if (taxon == null) {
throw new IOException("Could not recognise taxon");
}
end = start;
}
}
prevTaxon = taxon;
String data = str.substring(end);
for (int k = 0; k < data.length(); k++) {
if (!Character.isWhitespace(data.charAt(k))) {
seqLen++;
}
}
// Do this once outside loop- save on multiple regex compilations
// data = data.replaceAll("\\s", "");
// String [] strs = str.split("\\s+");
// String taxon = strs[0];
// for (int k = 1; k < strs.length - 1; k++) {
// taxon += strs[k];
// }
// taxon = taxon.replaceAll("'", "");
// Log.warning.println(taxon);
// String data = strs[strs.length - 1];
data = data.replaceAll(";", "");
if (data.trim().length() > 0) {
if (seqMap.containsKey(taxon)) {
seqMap.put(taxon, seqMap.get(taxon).append(data));
} else {
seqMap.put(taxon, new StringBuilder(data));
taxa.add(taxon);
}
}
if (str.contains(";")) {
break;
}
}
if (taxonCount > 0 && taxa.size() > taxonCount) {
throw new IOException("Wrong number of taxa. Perhaps a typo in one of the taxa: " + taxa);
}
HashSet<String> sortedAmbiguities = new HashSet<>();
for (final String taxon : taxa) {
if (!taxonListContains(taxon)) {
taxonList.add(new Taxon(taxon));
}
final StringBuilder bsData = seqMap.get(taxon);
String data = bsData.toString().replaceAll("\\s", "");
seqMap.put(taxon, new StringBuilder(data));
// collect all ambiguities in the sequence
List<String> ambiguities = new ArrayList<>();
Matcher m = Pattern.compile("\\{(.*?)\\}").matcher(data);
while (m.find()) {
int mLength = m.group().length();
ambiguities.add(m.group().substring(1, mLength - 1));
}
// sort elements of ambiguity sets
String data_without_ambiguities = data.replaceAll("\\{(.*?)\\}", "?");
for (String amb : ambiguities) {
List<Integer> ambInt = new ArrayList<>();
for (int i = 0; i < amb.length(); i++) {
char c = amb.charAt(i);
if (c >= '0' && c <= '9') {
ambInt.add(Integer.parseInt(amb.charAt(i) + ""));
} else {
// ignore
if (data != data_without_ambiguities) {
Log.warning.println("Ambiguity found in " + taxon + " that is treated as missing value");
}
data = data_without_ambiguities;
}
}
Collections.sort(ambInt);
String ambStr = "";
for (int i = 0; i < ambInt.size(); i++) {
ambStr += Integer.toString(ambInt.get(i));
}
sortedAmbiguities.add(ambStr);
}
// check the length of the sequence (treat ambiguity sets as single characters)
if (data_without_ambiguities.length() != charCount) {
throw new IOException(str + "\nExpected sequence of length " + charCount + " instead of " + data.length() + " for taxon " + taxon);
}
// map to standard missing and gap chars
data = data.replace(missing.charAt(0), DataType.MISSING_CHAR);
data = data.replace(gap.charAt(0), DataType.GAP_CHAR);
// resolve matching char, if any
if (matchChar != null && data.contains(matchChar)) {
final char cMatchChar = matchChar.charAt(0);
final String baseData = seqMap.get(taxa.get(0)).toString();
for (int i = 0; i < data.length(); i++) {
if (data.charAt(i) == cMatchChar) {
final char cReplaceChar = baseData.charAt(i);
data = data.substring(0, i) + cReplaceChar + (i + 1 < data.length() ? data.substring(i + 1) : "");
}
}
}
// Using Alignment as Map gives problems when producing XML:
// Sequence names are used as attribute names, producing very readable XML
// However, since attribute names cannot start with a number or contain
// special characters (like ":" or "]") but sequence names do contain them
// on occasion, it is more robust to create a Sequence object for each
// sequence where the taxon name is stored as an XML attribute values
// that do not have the attribute name restrictions.
// if (alignment.dataTypeInput.get().equals("nucleotide") ||
// alignment.dataTypeInput.get().equals("binary") ||
// alignment.dataTypeInput.get().equals("aminoacid") ) {
// alignment.setInputValue(taxon, data);
// } else {
final Sequence sequence = new Sequence();
sequence.init(totalCount, taxon, data);
sequence.setID(generateSequenceID(taxon));
alignment.sequenceInput.setValue(sequence, alignment);
// }
}
if (alignment.dataTypeInput.get().equals("standard")) {
// convert sortedAmbiguities to a whitespace separated string of ambiguities
String ambiguitiesStr = "";
for (String amb : sortedAmbiguities) {
ambiguitiesStr += amb + " ";
}
if (ambiguitiesStr.length() > 0) {
ambiguitiesStr = ambiguitiesStr.substring(0, ambiguitiesStr.length() - 1);
}
alignment.userDataTypeInput.get().initByName("ambiguities", ambiguitiesStr);
}
alignment.initAndValidate();
if (taxonCount > 0 && taxonCount != alignment.getTaxonCount()) {
throw new IOException("dimensions block says there are " + taxonCount + " taxa, but there were " + alignment.getTaxonCount() + " taxa found");
}
return alignment;
}
use of beast.evolution.datatype.UserDataType in project beast2 by CompEvol.
the class TreeLikelihoodTest method testSDolloLikelihood.
@Test
public void testSDolloLikelihood() throws Exception {
UserDataType dataType = new UserDataType();
dataType.initByName("states", 2, "codeMap", "0=1, 1=0, ?=0 1, -=0 1");
Alignment data = new Alignment();
Sequence German_ST = new Sequence("German_ST", BEASTTestCase.German_ST.dataInput.get());
Sequence Dutch_List = new Sequence("Dutch_List", BEASTTestCase.Dutch_List.dataInput.get());
;
Sequence English_ST = new Sequence("English_ST", BEASTTestCase.English_ST.dataInput.get());
;
Sequence French = new Sequence("French", BEASTTestCase.French.dataInput.get());
;
Sequence Italian = new Sequence("Italian", BEASTTestCase.Italian.dataInput.get());
;
Sequence Spanish = new Sequence("Spanish", BEASTTestCase.Spanish.dataInput.get());
;
data.initByName("sequence", German_ST, "sequence", Dutch_List, "sequence", English_ST, "sequence", French, "sequence", Italian, "sequence", Spanish, "userDataType", dataType);
Tree tree = BEASTTestCase.getTree(data, "((English_ST:0.22743347188019544,(German_ST:0.10557648379843088,Dutch_List:0.10557648379843088):0.12185698808176457):1.5793160946109988,(Spanish:0.11078392189606047,(Italian:0.10119772534558173,French:0.10119772534558173):0.009586196550478737):1.6959656445951337)");
RealParameter frequencies = new RealParameter("1 0");
Frequencies freqs = new Frequencies();
freqs.initByName("frequencies", frequencies);
RealParameter deathprob = new RealParameter("1.7");
MutationDeathModel SDollo = new MutationDeathModel();
SDollo.initByName("deathprob", deathprob, "frequencies", freqs);
SiteModel siteModel = new SiteModel();
siteModel.initByName("mutationRate", "1.0", "gammaCategoryCount", 1, "substModel", SDollo);
TreeLikelihood likelihood = newTreeLikelihood();
likelihood.initByName("data", data, "tree", tree, "siteModel", siteModel);
double logP = 0;
likelihood.initByName("data", data, "tree", tree, "siteModel", siteModel, "useAmbiguities", true);
logP = likelihood.calculateLogP();
// beast1 xml gives -3551.6436
assertEquals(logP, -3551.6436270344648, BEASTTestCase.PRECISION);
}
Aggregations