Search in sources :

Example 1 with StandardData

use of beast.evolution.datatype.StandardData in project beast2 by CompEvol.

the class NexusParser method parseDataBlock.

// parseCalibrations
/**
 * parse data block and create Alignment *
 */
public Alignment parseDataBlock(final BufferedReader fin) throws IOException {
    final Alignment alignment = new Alignment();
    String str;
    int taxonCount = -1;
    int charCount = -1;
    int totalCount = 4;
    String missing = "?";
    String gap = "-";
    // indicates character matches the one in the first sequence
    String matchChar = null;
    do {
        str = nextLine(fin);
        // dimensions ntax=12 nchar=898;
        if (str.toLowerCase().contains("dimensions")) {
            str = getNextDataBlock(str, fin);
            final String character = getAttValue("nchar", str);
            if (character == null) {
                throw new IOException("nchar attribute expected (e.g. 'dimensions char=123') expected, not " + str);
            }
            charCount = Integer.parseInt(character);
            final String taxa = getAttValue("ntax", str);
            if (taxa != null) {
                taxonCount = Integer.parseInt(taxa);
            }
        } else if (str.toLowerCase().contains("format")) {
            str = getNextDataBlock(str, fin);
            // format datatype=dna interleave=no gap=-;
            final String dataTypeName = getAttValue("datatype", str);
            final String symbols;
            if (getAttValue("symbols", str) == null) {
                symbols = getAttValue("symbols", str);
            } else {
                symbols = getAttValue("symbols", str).replaceAll("\\s", "");
            }
            if (dataTypeName == null) {
                Log.warning.println("Warning: expected datatype (e.g. something like 'format datatype=dna;') not '" + str + "' Assuming integer dataType");
                alignment.dataTypeInput.setValue("integer", alignment);
                if (symbols != null && (symbols.equals("01") || symbols.equals("012"))) {
                    totalCount = symbols.length();
                }
            } else if (dataTypeName.toLowerCase().equals("rna") || dataTypeName.toLowerCase().equals("dna") || dataTypeName.toLowerCase().equals("nucleotide")) {
                alignment.dataTypeInput.setValue("nucleotide", alignment);
                totalCount = 4;
            } else if (dataTypeName.toLowerCase().equals("aminoacid") || dataTypeName.toLowerCase().equals("protein")) {
                alignment.dataTypeInput.setValue("aminoacid", alignment);
                totalCount = 20;
            } else if (dataTypeName.toLowerCase().equals("standard")) {
                alignment.dataTypeInput.setValue("standard", alignment);
                totalCount = symbols.length();
            // if (symbols == null || symbols.equals("01")) {
            // alignment.dataTypeInput.setValue("binary", alignment);
            // totalCount = 2;
            // }  else {
            // alignment.dataTypeInput.setValue("standard", alignment);
            // totalCount = symbols.length();
            // }
            } else if (dataTypeName.toLowerCase().equals("binary")) {
                alignment.dataTypeInput.setValue("binary", alignment);
                totalCount = 2;
            } else {
                alignment.dataTypeInput.setValue("integer", alignment);
                if (symbols != null && (symbols.equals("01") || symbols.equals("012"))) {
                    totalCount = symbols.length();
                }
            }
            final String missingChar = getAttValue("missing", str);
            if (missingChar != null) {
                missing = missingChar;
            }
            final String gapChar = getAttValue("gap", str);
            if (gapChar != null) {
                gap = gapChar;
            }
            matchChar = getAttValue("matchchar", str);
        }
    } while (!str.trim().toLowerCase().startsWith("matrix") && !str.toLowerCase().contains("charstatelabels"));
    if (alignment.dataTypeInput.get().equals("standard")) {
        StandardData type = new StandardData();
        type.setInputValue("nrOfStates", totalCount);
        // type.setInputValue("symbols", symbols);
        type.initAndValidate();
        alignment.setInputValue("userDataType", type);
    }
    // reading CHARSTATELABELS block
    if (str.toLowerCase().contains("charstatelabels")) {
        if (!alignment.dataTypeInput.get().equals("standard")) {
            throw new IllegalArgumentException("If CHARSTATELABELS block is specified then DATATYPE has to be Standard");
        }
        StandardData standardDataType = (StandardData) alignment.userDataTypeInput.get();
        int[] maxNumberOfStates = new int[] { 0 };
        ArrayList<String> tokens = readInCharstatelablesTokens(fin);
        ArrayList<UserDataType> charDescriptions = processCharstatelabelsTokens(tokens, maxNumberOfStates);
        // while (true) {
        // str = nextLine(fin);
        // if (str.contains(";")) {
        // break;
        // }
        // String[] strSplit = str.split("/");
        // ArrayList<String> states = new ArrayList<>();
        // 
        // if (strSplit.length < 2) {
        // charDescriptions.add(new UserDataType(strSplit[0], states));
        // continue;
        // }
        // 
        // String stateStr = strSplit[1];
        // 
        // //add a comma at the end of the string if the last non-whitespace character is not a comma or all the
        // // characters are whitespaces in the string. Also remove whitespaces at the end of the string.
        // for (int i=stateStr.length()-1; i>=0; i--) {
        // if (!Character.isWhitespace(stateStr.charAt(i))) {
        // if (stateStr.charAt(i-1) != ',') {
        // stateStr = stateStr.substring(0, i)+",";
        // break;
        // }
        // }
        // if (i==0) {
        // stateStr = stateStr.substring(0, i)+",";
        // }
        // }
        // if (stateStr.isEmpty()) {
        // stateStr = stateStr+",";
        // }
        // 
        // final int WAITING=0, WORD=1, PHRASE_IN_QUOTES=2;
        // int mode =WAITING; //0 waiting for non-space letter, 1 reading a word; 2 reading a phrase in quotes
        // int begin =0, end;
        // 
        // for (int i=0; i< stateStr.length(); i++) {
        // switch (mode) {
        // case WAITING:
        // while (stateStr.charAt(i) == ' ') {
        // i++;
        // }
        // mode = stateStr.charAt(i) == '\'' ? PHRASE_IN_QUOTES : WORD;
        // begin = i;
        // break;
        // case WORD:
        // end = stateStr.indexOf(" ", begin) != -1 ? stateStr.indexOf(" ", begin) : stateStr.indexOf(",", begin);
        // states.add(stateStr.substring(begin, end));
        // i=end;
        // mode = WAITING;
        // break;
        // case PHRASE_IN_QUOTES:
        // end = begin;
        // do {
        // end = stateStr.indexOf("'", end+2);
        // } while (stateStr.charAt(end+1) == '\'' || end == -1);
        // if (end == -1) {
        // Log.info.println("Incorrect description in charstatelabels. Single quote found in line ");
        // }
        // end++;
        // states.add(stateStr.substring(begin, end));
        // i=end;
        // mode=WAITING;
        // break;
        // default:
        // break;
        // }
        // }
        // // oldTODO make strSplit[0] look nicer (remove whitespaces and may be numbers at the beginning)
        // charDescriptions.add(new UserDataType(strSplit[0], states));
        // maxNumberOfStates = Math.max(maxNumberOfStates, states.size());
        // }
        standardDataType.setInputValue("charstatelabels", charDescriptions);
        standardDataType.setInputValue("nrOfStates", Math.max(maxNumberOfStates[0], totalCount));
        standardDataType.initAndValidate();
        for (UserDataType dataType : standardDataType.charStateLabelsInput.get()) {
            dataType.initAndValidate();
        }
    }
    // skipping before MATRIX block
    while (!str.toLowerCase().contains(("matrix"))) {
        str = nextLine(fin);
    }
    // read character data
    // Use string builder for efficiency
    final Map<String, StringBuilder> seqMap = new HashMap<>();
    final List<String> taxa = new ArrayList<>();
    String prevTaxon = null;
    int seqLen = 0;
    while (true) {
        str = nextLine(fin);
        int start = 0, end;
        final String taxon;
        while (Character.isWhitespace(str.charAt(start))) {
            start++;
        }
        if (str.charAt(start) == '\'' || str.charAt(start) == '\"') {
            final char c = str.charAt(start);
            start++;
            end = start;
            while (str.charAt(end) != c) {
                end++;
            }
            taxon = str.substring(start, end);
            seqLen = 0;
            end++;
        } else {
            end = start;
            while (end < str.length() && !Character.isWhitespace(str.charAt(end))) {
                end++;
            }
            if (end < str.length()) {
                taxon = str.substring(start, end);
                seqLen = 0;
            } else if ((prevTaxon == null || seqLen == charCount) && end == str.length()) {
                taxon = str.substring(start, end);
                seqLen = 0;
            } else {
                taxon = prevTaxon;
                if (taxon == null) {
                    throw new IOException("Could not recognise taxon");
                }
                end = start;
            }
        }
        prevTaxon = taxon;
        String data = str.substring(end);
        for (int k = 0; k < data.length(); k++) {
            if (!Character.isWhitespace(data.charAt(k))) {
                seqLen++;
            }
        }
        // Do this once outside loop- save on multiple regex compilations
        // data = data.replaceAll("\\s", "");
        // String [] strs = str.split("\\s+");
        // String taxon = strs[0];
        // for (int k = 1; k < strs.length - 1; k++) {
        // taxon += strs[k];
        // }
        // taxon = taxon.replaceAll("'", "");
        // Log.warning.println(taxon);
        // String data = strs[strs.length - 1];
        data = data.replaceAll(";", "");
        if (data.trim().length() > 0) {
            if (seqMap.containsKey(taxon)) {
                seqMap.put(taxon, seqMap.get(taxon).append(data));
            } else {
                seqMap.put(taxon, new StringBuilder(data));
                taxa.add(taxon);
            }
        }
        if (str.contains(";")) {
            break;
        }
    }
    if (taxonCount > 0 && taxa.size() > taxonCount) {
        throw new IOException("Wrong number of taxa. Perhaps a typo in one of the taxa: " + taxa);
    }
    HashSet<String> sortedAmbiguities = new HashSet<>();
    for (final String taxon : taxa) {
        if (!taxonListContains(taxon)) {
            taxonList.add(new Taxon(taxon));
        }
        final StringBuilder bsData = seqMap.get(taxon);
        String data = bsData.toString().replaceAll("\\s", "");
        seqMap.put(taxon, new StringBuilder(data));
        // collect all ambiguities in the sequence
        List<String> ambiguities = new ArrayList<>();
        Matcher m = Pattern.compile("\\{(.*?)\\}").matcher(data);
        while (m.find()) {
            int mLength = m.group().length();
            ambiguities.add(m.group().substring(1, mLength - 1));
        }
        // sort elements of ambiguity sets
        String data_without_ambiguities = data.replaceAll("\\{(.*?)\\}", "?");
        for (String amb : ambiguities) {
            List<Integer> ambInt = new ArrayList<>();
            for (int i = 0; i < amb.length(); i++) {
                char c = amb.charAt(i);
                if (c >= '0' && c <= '9') {
                    ambInt.add(Integer.parseInt(amb.charAt(i) + ""));
                } else {
                    // ignore
                    if (data != data_without_ambiguities) {
                        Log.warning.println("Ambiguity found in " + taxon + " that is treated as missing value");
                    }
                    data = data_without_ambiguities;
                }
            }
            Collections.sort(ambInt);
            String ambStr = "";
            for (int i = 0; i < ambInt.size(); i++) {
                ambStr += Integer.toString(ambInt.get(i));
            }
            sortedAmbiguities.add(ambStr);
        }
        // check the length of the sequence (treat ambiguity sets as single characters)
        if (data_without_ambiguities.length() != charCount) {
            throw new IOException(str + "\nExpected sequence of length " + charCount + " instead of " + data.length() + " for taxon " + taxon);
        }
        // map to standard missing and gap chars
        data = data.replace(missing.charAt(0), DataType.MISSING_CHAR);
        data = data.replace(gap.charAt(0), DataType.GAP_CHAR);
        // resolve matching char, if any
        if (matchChar != null && data.contains(matchChar)) {
            final char cMatchChar = matchChar.charAt(0);
            final String baseData = seqMap.get(taxa.get(0)).toString();
            for (int i = 0; i < data.length(); i++) {
                if (data.charAt(i) == cMatchChar) {
                    final char cReplaceChar = baseData.charAt(i);
                    data = data.substring(0, i) + cReplaceChar + (i + 1 < data.length() ? data.substring(i + 1) : "");
                }
            }
        }
        // Using Alignment as Map gives problems when producing XML:
        // Sequence names are used as attribute names, producing very readable XML
        // However, since attribute names cannot start with a number or contain
        // special characters (like ":" or "]") but sequence names do contain them
        // on occasion, it is more robust to create a Sequence object for each
        // sequence where the taxon name is stored as an XML attribute values
        // that do not have the attribute name restrictions.
        // if (alignment.dataTypeInput.get().equals("nucleotide") ||
        // alignment.dataTypeInput.get().equals("binary")  ||
        // alignment.dataTypeInput.get().equals("aminoacid") ) {
        // alignment.setInputValue(taxon, data);
        // } else {
        final Sequence sequence = new Sequence();
        sequence.init(totalCount, taxon, data);
        sequence.setID(generateSequenceID(taxon));
        alignment.sequenceInput.setValue(sequence, alignment);
    // }
    }
    if (alignment.dataTypeInput.get().equals("standard")) {
        // convert sortedAmbiguities to a whitespace separated string of ambiguities
        String ambiguitiesStr = "";
        for (String amb : sortedAmbiguities) {
            ambiguitiesStr += amb + " ";
        }
        if (ambiguitiesStr.length() > 0) {
            ambiguitiesStr = ambiguitiesStr.substring(0, ambiguitiesStr.length() - 1);
        }
        alignment.userDataTypeInput.get().initByName("ambiguities", ambiguitiesStr);
    }
    alignment.initAndValidate();
    if (taxonCount > 0 && taxonCount != alignment.getTaxonCount()) {
        throw new IOException("dimensions block says there are " + taxonCount + " taxa, but there were " + alignment.getTaxonCount() + " taxa found");
    }
    return alignment;
}
Also used : Matcher(java.util.regex.Matcher) UserDataType(beast.evolution.datatype.UserDataType) StandardData(beast.evolution.datatype.StandardData)

Aggregations

StandardData (beast.evolution.datatype.StandardData)1 UserDataType (beast.evolution.datatype.UserDataType)1 Matcher (java.util.regex.Matcher)1