Search in sources :

Example 1 with CountedOutcome

Use of com.joliciel.talismane.utils.CountedOutcome in project jochre by urieli.

The class MostLikelyWordChooser, method getFrequency.

/**
 * Same as {@link #getFrequency(LetterSequence)}, but can either apply to
 * the guessed word or to the real word from the training corpus.
 * <p>
 * The method works in three steps:
 * <ol>
 * <li>It enumerates candidate groupings ("possibilities") of the letter
 * sequence's subsequences, in which a dash or other mid-word, start-word or
 * end-word punctuation may either stand alone or be merged with its
 * neighbours.</li>
 * <li>Each candidate is scored with the lowest frequency found among its
 * subsequences (punctuation is ignored when the candidate also contains at
 * least one non-punctuation subsequence). A word with no frequency entry
 * counts as 0 if the linguistics object considers it possible, and as -1
 * otherwise.</li>
 * <li>Among the candidates with the highest score, the one containing the
 * single longest guessed word is selected.</li>
 * </ol>
 * Side effects: the selected candidate replaces the subsequences of
 * {@code letterSequence}; its hyphen subsequence and (if the sequence is
 * split) its hyphenated string are updated; word frequencies are set on
 * every scored subsequence and appended to
 * {@code letterSequence.getWordFrequencies()}.
 * <p>
 * NOTE(review): {@code guessedWord} only affects which word is used while
 * building the candidates; the frequency lookup and the length tie-break
 * always read {@code getGuessedWord()} - confirm this is intended.
 *
 * @param letterSequence
 *            the letter sequence to analyse; it is modified as described
 *            above
 * @param guessedWord
 *            if true, applies to the guessed word
 * @return the highest score among all candidates, i.e. the largest of the
 *         per-candidate minimum frequencies
 */
public int getFrequency(LetterSequence letterSequence, boolean guessedWord) {
    List<LetterSequence> subsequences = letterSequence.getSubsequences();
    List<List<LetterSequence>> possibilities = new ArrayList<>();
    possibilities.add(new ArrayList<LetterSequence>());
    // index, within the full letter sequence, of the last letter of the
    // subsequence currently being processed
    int lastIndex = -1;
    for (int i = 0; i < subsequences.size(); i++) {
        LetterSequence subsequence = subsequences.get(i);
        lastIndex += subsequence.getLetters().size();
        String word = guessedWord ? subsequence.getGuessedWord() : subsequence.getRealWord();
        List<List<LetterSequence>> newPossibilities = new ArrayList<>();
        for (List<LetterSequence> possibility : possibilities) {
            if (possibility.size() > 0) {
                // has this subsequence already been processed ?
                // (i.e. does the candidate already end on the same shape,
                // because an earlier step merged this subsequence in)
                LetterSequence lastSequence = possibility.get(possibility.size() - 1);
                Shape lastShape = lastSequence.getUnderlyingShapeSequence().get(lastSequence.getUnderlyingShapeSequence().size() - 1).getShape();
                Shape myLastShape = subsequence.getUnderlyingShapeSequence().get(subsequence.getUnderlyingShapeSequence().size() - 1).getShape();
                if (lastShape.equals(myLastShape)) {
                    newPossibilities.add(possibility);
                    continue;
                }
            }
            // stays true when the subsequence must simply be appended to the
            // current candidate
            boolean addWord = true;
            if (subsequence.isPunctation()) {
                if (word.equals("-") || midWordPunctuation.contains(word) || startWordPunctuation.contains(word) || endWordPunctuation.contains(word)) {
                    LetterSequence prevSequence = possibility.size() == 0 ? null : possibility.get(possibility.size() - 1);
                    LetterSequence nextSequence = i == subsequences.size() - 1 ? null : subsequences.get(i + 1);
                    LetterSequence prevCurrentSequence = new LetterSequence(prevSequence, subsequence);
                    LetterSequence currentNextSequence = new LetterSequence(subsequence, nextSequence);
                    LetterSequence prevCurrentNextSequence = new LetterSequence(prevCurrentSequence, nextSequence);
                    if (word.equals("-")) {
                        if (prevSequence == null && nextSequence == null) {
                            // lone dash: carry the candidate over unchanged
                            newPossibilities.add(possibility);
                        } else if (prevSequence == null) {
                            // leading dash: separate, or attached to next
                            List<LetterSequence> newPoss = new ArrayList<>();
                            newPoss.add(subsequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>();
                            newPoss.add(currentNextSequence);
                            newPossibilities.add(newPoss);
                        } else if (nextSequence == null) {
                            // trailing dash: separate, or attached to previous
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.add(subsequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentSequence);
                            newPossibilities.add(newPoss);
                        } else {
                            // dash between two subsequences:
                            // separate from both neighbours
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.add(subsequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            // attached to next only
                            newPoss = new ArrayList<>(possibility);
                            newPoss.add(currentNextSequence);
                            newPossibilities.add(newPoss);
                            // attached to previous only
                            newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentSequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            // attached to both
                            newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentNextSequence);
                            newPossibilities.add(newPoss);
                            // add skipped dash possibility
                            if (lastIndex == letterSequence.getEndOfLineHyphenIndex()) {
                                subsequence.setHyphenSubsequence(subsequence);
                                prevCurrentNextSequence.setHyphenSubsequence(subsequence);
                                prevCurrentSequence.setHyphenSubsequence(subsequence);
                                currentNextSequence.setHyphenSubsequence(subsequence);
                                LetterSequence prevNextSequence = new LetterSequence(prevCurrentSequence, nextSequence);
                                prevNextSequence.setHyphenSubsequence(subsequence);
                                prevNextSequence.setSoftHyphen(true);
                                newPoss = new ArrayList<>(possibility);
                                newPoss.remove(newPoss.size() - 1);
                                newPoss.add(prevNextSequence);
                                newPossibilities.add(newPoss);
                            }
                        }
                        addWord = false;
                    }
                    if (midWordPunctuation.contains(word)) {
                        if (prevSequence != null && nextSequence != null) {
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentNextSequence);
                            newPossibilities.add(newPoss);
                            addWord = false;
                        }
                    }
                    if (startWordPunctuation.contains(word)) {
                        if (nextSequence != null && !subsequences.get(subsequences.size() - 1).getGuessedWord().equals(word)) {
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.add(currentNextSequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>(possibility);
                            newPoss.add(subsequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            addWord = false;
                        }
                    }
                    if (endWordPunctuation.contains(word) && !subsequences.get(0).getGuessedWord().equals(word)) {
                        if (prevSequence != null) {
                            // addWord is left untouched here, so the
                            // stand-alone variant is still added below
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentSequence);
                            newPossibilities.add(newPoss);
                        }
                    }
                }
            }
            if (addWord) {
                possibility.add(subsequence);
                newPossibilities.add(possibility);
            }
        }
        possibilities = newPossibilities;
        // cap the number of candidates; any remaining subsequences are then
        // left out of the candidates
        if (possibilities.size() > 1000) {
            break;
        }
    }
    // candidates grouped by score (their lowest word frequency)
    TreeMap<Integer, List<List<LetterSequence>>> freqPossibilityMap = new TreeMap<>();
    for (List<LetterSequence> possibility : possibilities) {
        boolean hasWords = false;
        for (LetterSequence subsequence : possibility) {
            if (!subsequence.isPunctation()) {
                hasWords = true;
                break;
            }
        }
        int minFreq = Integer.MAX_VALUE;
        for (LetterSequence subsequence : possibility) {
            String word = subsequence.getGuessedWord();
            List<CountedOutcome<String>> frequencies = this.linguistics.getFrequencies(word);
            if (frequencies == null || frequencies.isEmpty()) {
                // No entry for this word: record a placeholder of -1 if the
                // word is impossible, 0 otherwise. A fresh list is used so
                // that a null result is tolerated and the list returned by
                // the linguistics object is never modified.
                int placeholderCount = this.linguistics.isWordPossible(word) ? 0 : -1;
                frequencies = new ArrayList<>();
                frequencies.add(new CountedOutcome<>(word, placeholderCount));
            }
            subsequence.setWordFrequencies(frequencies);
            letterSequence.getWordFrequencies().add(frequencies.get(0));
            int freq = frequencies.get(0).getCount();
            // punctuation only counts towards the score when the candidate
            // contains nothing but punctuation
            if (subsequence.isPunctation() && hasWords) {
                continue;
            }
            if (freq < minFreq) {
                minFreq = freq;
            }
        }
        List<List<LetterSequence>> possibilitiesAtFreq = freqPossibilityMap.get(minFreq);
        if (possibilitiesAtFreq == null) {
            possibilitiesAtFreq = new ArrayList<>();
            freqPossibilityMap.put(minFreq, possibilitiesAtFreq);
        }
        possibilitiesAtFreq.add(possibility);
    }
    // Out of all of the sub-sequences possibilities giving the max
    // frequency in the lexicon
    // we choose the one containing the single longest word to populate the
    // subsequences for this letter sequence
    // and select its hyphenated content.
    // Thus if both halves of an existing hyphenated word also happen to
    // exist independently as words in the lexicon,
    // we'll still take the longer hyphenated word.
    List<List<LetterSequence>> maxFreqPossibilities = freqPossibilityMap.lastEntry().getValue();
    List<LetterSequence> maxLengthList = null;
    int maxLengthForList = -1;
    for (List<LetterSequence> possibility : maxFreqPossibilities) {
        int maxLength = 0;
        for (LetterSequence subsequence : possibility) {
            String word = subsequence.getGuessedWord();
            if (word.length() > maxLength) {
                maxLength = word.length();
            }
        }
        // strict comparison: on equal length the earliest candidate wins
        if (maxLength > maxLengthForList) {
            maxLengthList = possibility;
            maxLengthForList = maxLength;
        }
    }
    int frequency = freqPossibilityMap.lastEntry().getKey();
    letterSequence.setSubsequences(maxLengthList);
    // construct the hyphenated string out of the subsequences directly
    // surrounding the hyphen
    // making sure to leave out any opening and closing punctuation
    StringBuilder hyphenatedString = new StringBuilder();
    boolean foundFirstWord = false;
    // punctuation seen since the last word; only flushed when another word
    // follows, so trailing punctuation is dropped
    StringBuilder punctuationString = new StringBuilder();
    for (LetterSequence subsequence : maxLengthList) {
        if (subsequence.getHyphenSubsequence() != null) {
            letterSequence.setHyphenSubsequence(subsequence.getHyphenSubsequence());
        }
        if (!foundFirstWord && !subsequence.isPunctation()) {
            foundFirstWord = true;
        }
        if (foundFirstWord && subsequence.isPunctation()) {
            punctuationString.append(subsequence.getGuessedWord());
        } else if (foundFirstWord) {
            hyphenatedString.append(punctuationString);
            punctuationString.setLength(0);
            hyphenatedString.append(subsequence.getGuessedWord());
        }
    }
    if (letterSequence.isSplit()) {
        String hyphenated = hyphenatedString.toString();
        letterSequence.setHyphenatedString(hyphenated);
        for (LetterSequence subsequence : maxLengthList) {
            subsequence.setHyphenatedString(hyphenated);
        }
    }
    return frequency;
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) Shape(com.joliciel.jochre.graphics.Shape) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) CountedOutcome(com.joliciel.talismane.utils.CountedOutcome) ArrayList(java.util.ArrayList) List(java.util.List)

Aggregations

Shape (com.joliciel.jochre.graphics.Shape)1 LetterSequence (com.joliciel.jochre.letterGuesser.LetterSequence)1 CountedOutcome (com.joliciel.talismane.utils.CountedOutcome)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 TreeMap (java.util.TreeMap)1