Use of com.joliciel.talismane.utils.CountedOutcome in project jochre by urieli.
Class MostLikelyWordChooser, method getFrequency.
/**
 * Same as {@link #getFrequency(LetterSequence)}, but can either apply to
 * the guessed word or to the real word from the training corpus.
 * <p>
 * The method works in three phases:
 * <ol>
 * <li>It walks the subsequences of the letter sequence and builds a list of
 * "possibilities", each being one way of keeping punctuation subsequences
 * separate from, or merged with, their neighbouring subsequences.</li>
 * <li>For each possibility it looks up the frequency of every subsequence
 * and keeps the lowest one (punctuation is ignored when the possibility
 * contains at least one non-punctuation subsequence).</li>
 * <li>Among the possibilities sharing the highest such frequency, it keeps
 * the first one containing the longest single word.</li>
 * </ol>
 * Side effects: the chosen possibility replaces the subsequences of
 * {@code letterSequence}; word frequencies are stored on every evaluated
 * subsequence and appended to {@code letterSequence.getWordFrequencies()};
 * the hyphen subsequence is copied to {@code letterSequence} when one of the
 * chosen subsequences carries it; and, if {@code letterSequence.isSplit()},
 * the hyphenated string is set on the letter sequence and on each chosen
 * subsequence.
 *
 * @param letterSequence
 *          the letter sequence whose subsequences are combined and scored
 * @param guessedWord
 *          if true, applies to the guessed word
 * @return the highest "minimum frequency" found among all possibilities;
 *         a word with no frequency entry counts as 0 if the linguistics
 *         consider it possible, and as -1 otherwise
 */
public int getFrequency(LetterSequence letterSequence, boolean guessedWord) {
  List<LetterSequence> subsequences = letterSequence.getSubsequences();
  List<List<LetterSequence>> possibilities = new ArrayList<>();
  possibilities.add(new ArrayList<LetterSequence>());

  // Running total of the letters in the subsequences seen so far, minus
  // one; compared below with the end-of-line hyphen index.
  int lastIndex = -1;
  for (int i = 0; i < subsequences.size(); i++) {
    LetterSequence subsequence = subsequences.get(i);
    lastIndex += subsequence.getLetters().size();
    String word = guessedWord ? subsequence.getGuessedWord() : subsequence.getRealWord();

    List<List<LetterSequence>> newPossibilities = new ArrayList<>();
    for (List<LetterSequence> possibility : possibilities) {
      if (possibility.size() > 0) {
        // has this subsequence already been processed ?
        // (a previous iteration may already have appended it as the "next"
        // sequence, in which case both end on the same shape)
        LetterSequence lastSequence = possibility.get(possibility.size() - 1);
        Shape lastShape = lastSequence.getUnderlyingShapeSequence().get(lastSequence.getUnderlyingShapeSequence().size() - 1).getShape();
        Shape myLastShape = subsequence.getUnderlyingShapeSequence().get(subsequence.getUnderlyingShapeSequence().size() - 1).getShape();
        if (lastShape.equals(myLastShape)) {
          newPossibilities.add(possibility);
          continue;
        }
      }

      // Stays true unless one of the punctuation rules below has already
      // produced every variant needed for this subsequence.
      boolean addWord = true;
      if (subsequence.isPunctation()) {
        boolean isDash = word.equals("-");
        if (isDash || midWordPunctuation.contains(word) || startWordPunctuation.contains(word) || endWordPunctuation.contains(word)) {
          LetterSequence prevSequence = possibility.size() == 0 ? null : possibility.get(possibility.size() - 1);
          LetterSequence nextSequence = i == subsequences.size() - 1 ? null : subsequences.get(i + 1);
          // Merged variants: previous+current, current+next, previous+current+next.
          LetterSequence prevCurrentSequence = new LetterSequence(prevSequence, subsequence);
          LetterSequence currentNextSequence = new LetterSequence(subsequence, nextSequence);
          LetterSequence prevCurrentNextSequence = new LetterSequence(prevCurrentSequence, nextSequence);

          if (isDash) {
            if (prevSequence == null && nextSequence == null) {
              // a lone dash: keep the possibility unchanged
              newPossibilities.add(possibility);
            } else if (prevSequence == null) {
              // dash at the start: either separate from, or merged with, the next sequence
              List<LetterSequence> newPoss = new ArrayList<>();
              newPoss.add(subsequence);
              newPoss.add(nextSequence);
              newPossibilities.add(newPoss);

              newPoss = new ArrayList<>();
              newPoss.add(currentNextSequence);
              newPossibilities.add(newPoss);
            } else if (nextSequence == null) {
              // dash at the end: either separate from, or merged with, the previous sequence
              List<LetterSequence> newPoss = new ArrayList<>(possibility);
              newPoss.add(subsequence);
              newPossibilities.add(newPoss);

              newPoss = new ArrayList<>(possibility);
              newPoss.remove(newPoss.size() - 1);
              newPoss.add(prevCurrentSequence);
              newPossibilities.add(newPoss);
            } else {
              // dash in the middle: four ways of attaching it to its neighbours
              List<LetterSequence> newPoss = new ArrayList<>(possibility);
              newPoss.add(subsequence);
              newPoss.add(nextSequence);
              newPossibilities.add(newPoss);

              newPoss = new ArrayList<>(possibility);
              newPoss.add(currentNextSequence);
              newPossibilities.add(newPoss);

              newPoss = new ArrayList<>(possibility);
              newPoss.remove(newPoss.size() - 1);
              newPoss.add(prevCurrentSequence);
              newPoss.add(nextSequence);
              newPossibilities.add(newPoss);

              newPoss = new ArrayList<>(possibility);
              newPoss.remove(newPoss.size() - 1);
              newPoss.add(prevCurrentNextSequence);
              newPossibilities.add(newPoss);

              // add skipped dash possibility
              if (lastIndex == letterSequence.getEndOfLineHyphenIndex()) {
                subsequence.setHyphenSubsequence(subsequence);
                prevCurrentNextSequence.setHyphenSubsequence(subsequence);
                prevCurrentSequence.setHyphenSubsequence(subsequence);
                currentNextSequence.setHyphenSubsequence(subsequence);

                // Built from the same parts as prevCurrentNextSequence, but
                // flagged as a soft hyphen.
                LetterSequence prevNextSequence = new LetterSequence(prevCurrentSequence, nextSequence);
                prevNextSequence.setHyphenSubsequence(subsequence);
                prevNextSequence.setSoftHyphen(true);

                newPoss = new ArrayList<>(possibility);
                newPoss.remove(newPoss.size() - 1);
                newPoss.add(prevNextSequence);
                newPossibilities.add(newPoss);
              }
            }
            addWord = false;
          }

          if (midWordPunctuation.contains(word)) {
            if (prevSequence != null && nextSequence != null) {
              List<LetterSequence> newPoss = new ArrayList<>(possibility);
              newPoss.remove(newPoss.size() - 1);
              newPoss.add(prevCurrentNextSequence);
              newPossibilities.add(newPoss);
              addWord = false;
            }
          }

          if (startWordPunctuation.contains(word)) {
            // NOTE(review): this compares against getGuessedWord() even when
            // guessedWord is false - confirm this is intended.
            if (nextSequence != null && !subsequences.get(subsequences.size() - 1).getGuessedWord().equals(word)) {
              List<LetterSequence> newPoss = new ArrayList<>(possibility);
              newPoss.add(currentNextSequence);
              newPossibilities.add(newPoss);

              newPoss = new ArrayList<>(possibility);
              newPoss.add(subsequence);
              newPoss.add(nextSequence);
              newPossibilities.add(newPoss);
              addWord = false;
            }
          }

          // NOTE(review): as above, getGuessedWord() is used regardless of
          // the guessedWord flag - confirm this is intended.
          if (endWordPunctuation.contains(word) && !subsequences.get(0).getGuessedWord().equals(word)) {
            if (prevSequence != null) {
              // addWord is deliberately left untouched here, so the
              // unmerged variant is still added below.
              List<LetterSequence> newPoss = new ArrayList<>(possibility);
              newPoss.remove(newPoss.size() - 1);
              newPoss.add(prevCurrentSequence);
              newPossibilities.add(newPoss);
            }
          }
        }
      }

      if (addWord) {
        // The copies above were taken before this in-place append, so they
        // are not affected by it.
        possibility.add(subsequence);
        newPossibilities.add(possibility);
      }
    }

    possibilities = newPossibilities;
    // Stop expanding once more than 1000 possibilities have accumulated;
    // any remaining subsequences are then left unprocessed.
    if (possibilities.size() > 1000) {
      break;
    }
  }

  // Group the possibilities by the lowest frequency found among their
  // subsequences.
  TreeMap<Integer, List<List<LetterSequence>>> freqPossibilityMap = new TreeMap<>();
  for (List<LetterSequence> possibility : possibilities) {
    boolean hasWords = false;
    for (LetterSequence subsequence : possibility) {
      if (!subsequence.isPunctation()) {
        hasWords = true;
        break;
      }
    }

    int minFreq = Integer.MAX_VALUE;
    for (LetterSequence subsequence : possibility) {
      // NOTE(review): the guessed word is always used here, whatever the
      // value of the guessedWord parameter - confirm this is intended.
      String word = subsequence.getGuessedWord();

      List<CountedOutcome<String>> frequencies = this.linguistics.getFrequencies(word);
      if (frequencies == null || frequencies.isEmpty()) {
        // No frequency entry: -1 marks an impossible word, 0 an unknown but
        // possible one. A fresh list is used so that the list returned by
        // the linguistics is never modified.
        int fallbackCount = this.linguistics.isWordPossible(word) ? 0 : -1;
        frequencies = new ArrayList<>();
        frequencies.add(new CountedOutcome<>(word, fallbackCount));
      }

      subsequence.setWordFrequencies(frequencies);
      letterSequence.getWordFrequencies().add(frequencies.get(0));
      int freq = frequencies.get(0).getCount();

      // Punctuation only counts towards the minimum when the possibility
      // contains nothing but punctuation.
      if (subsequence.isPunctation() && hasWords) {
        continue;
      }
      if (freq < minFreq) {
        minFreq = freq;
      }
    }

    List<List<LetterSequence>> possibilitiesAtFreq = freqPossibilityMap.get(minFreq);
    if (possibilitiesAtFreq == null) {
      possibilitiesAtFreq = new ArrayList<>();
      freqPossibilityMap.put(minFreq, possibilitiesAtFreq);
    }
    possibilitiesAtFreq.add(possibility);
  }

  // Out of all of the sub-sequences possibilities giving the max
  // frequency in the lexicon
  // we choose the one containing the single longest word to populate the
  // subsequences for this letter sequence
  // and select its hyphenated content.
  // Thus if both halves of an existing hyphenated word also happen to
  // exist independently as words in the lexicon,
  // we'll still take the longer hyphenated word.
  List<List<LetterSequence>> maxFreqPossibilities = freqPossibilityMap.lastEntry().getValue();
  List<LetterSequence> maxLengthList = null;
  int maxLengthForList = -1;
  for (List<LetterSequence> possibility : maxFreqPossibilities) {
    int maxLength = 0;
    for (LetterSequence subsequence : possibility) {
      String word = subsequence.getGuessedWord();
      if (word.length() > maxLength) {
        maxLength = word.length();
      }
    }
    // strict comparison: on a tie the earliest possibility is kept
    if (maxLength > maxLengthForList) {
      maxLengthList = possibility;
      maxLengthForList = maxLength;
    }
  }

  int frequency = freqPossibilityMap.lastEntry().getKey();
  letterSequence.setSubsequences(maxLengthList);

  // construct the hyphenated string out of the subsequences directly
  // surrounding the hyphen
  // making sure to leave out any opening and closing punctuation
  StringBuilder hyphenatedBuilder = new StringBuilder();
  // Punctuation seen after the first word; only flushed once another word
  // follows, so trailing punctuation is dropped.
  StringBuilder pendingPunctuation = new StringBuilder();
  boolean foundFirstWord = false;
  for (LetterSequence subsequence : maxLengthList) {
    if (subsequence.getHyphenSubsequence() != null) {
      letterSequence.setHyphenSubsequence(subsequence.getHyphenSubsequence());
    }
    if (!foundFirstWord && !subsequence.isPunctation()) {
      foundFirstWord = true;
    }
    if (foundFirstWord && subsequence.isPunctation()) {
      pendingPunctuation.append(subsequence.getGuessedWord());
    } else if (foundFirstWord) {
      hyphenatedBuilder.append(pendingPunctuation);
      pendingPunctuation.setLength(0);
      hyphenatedBuilder.append(subsequence.getGuessedWord());
    }
  }

  if (letterSequence.isSplit()) {
    String hyphenatedString = hyphenatedBuilder.toString();
    letterSequence.setHyphenatedString(hyphenatedString);
    for (LetterSequence subsequence : maxLengthList) {
      subsequence.setHyphenatedString(hyphenatedString);
    }
  }
  return frequency;
}
Aggregations