use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.
the class LetterFeatureTester method testFeatures.
/**
 * Evaluates every supplied feature against a single shape.
 * <p>
 * The context is built without any letter history (the history reference is
 * {@code null}), and each feature is checked with its own freshly created
 * {@link RuntimeEnvironment}. The results of the checks are discarded.
 *
 * @param shapeInSequence
 *            the shape wrapped in the context handed to each feature
 * @param features
 *            the features to check, in the set's iteration order
 */
void testFeatures(ShapeInSequence shapeInSequence, Set<LetterFeature<?>> features) {
    // Kept as a typed local so the two-argument constructor is selected exactly as before.
    final LetterSequence noHistory = null;
    final LetterGuesserContext guesserContext = new LetterGuesserContext(shapeInSequence, noHistory);

    for (LetterFeature<?> candidate : features) {
        // A new environment per feature: nothing is shared between checks.
        candidate.check(guesserContext, new RuntimeEnvironment());
    }
}
use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.
the class NgramFeature method checkInternal.
/**
 * Builds the string of the {@code n - 1} letters preceding the current shape,
 * where {@code n} is the outcome of {@code nFeature}.
 * <p>
 * When the context carries a history, the letters are read from the end of
 * that history; otherwise they are read from the shapes preceding the current
 * shape in its group. Positions for which no preceding letter exists are
 * filled with {@code SPACE}.
 *
 * @param context
 *            supplies the current shape and, optionally, the letter history
 * @param env
 *            the runtime environment passed on to {@code nFeature}
 * @return the result generated from the n-gram string, or {@code null} if
 *         {@code nFeature} yields no result
 */
@Override
public FeatureResult<String> checkInternal(LetterGuesserContext context, RuntimeEnvironment env) {
    FeatureResult<Integer> nResult = nFeature.check(context, env);
    if (nResult == null) {
        return null;
    }

    int n = nResult.getOutcome();
    int lettersWanted = n - 1;
    String ngram = "";
    Shape shape = context.getShapeInSequence().getShape();
    LetterSequence history = context.getHistory();

    // Walk backwards from the current position, prepending one letter per step.
    for (int back = 0; back < lettersWanted; back++) {
        String letter;
        if (history != null) {
            // analysis: read from the tail of the letters guessed so far
            letter = history.getLetters().size() > back
                    ? history.getLetters().get(history.getLetters().size() - back - 1)
                    : SPACE;
        } else {
            // training: read the letters of the preceding shapes in the same group
            letter = shape.getIndex() > back
                    ? shape.getGroup().getShapes().get(shape.getIndex() - back - 1).getLetter()
                    : SPACE;
        }
        ngram = letter + ngram;
    }

    return this.generateResult(ngram);
}
use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.
the class MostLikelyWordChooser method getFrequency.
/**
 * Same as {@link #getFrequency(LetterSequence)}, but can either apply to
 * the guessed word or to the real word from the training corpus.
 * <p>
 * The subsequences of the letter sequence are recombined into a list of
 * "possibilities": alternative ways of attaching punctuation (a dash, or
 * anything in the mid-word, start-of-word or end-of-word punctuation sets) to
 * the neighbouring subsequences. Each possibility is then scored by the
 * lowest lexicon frequency among its subsequences, and the possibilities with
 * the highest such score are retained. Among those, the one containing the
 * single longest word is selected.
 * <p>
 * Side effects: word frequencies are set on every scored subsequence and
 * appended to the word frequencies of {@code letterSequence}; the
 * subsequences of {@code letterSequence} are replaced by the selected
 * possibility; its hyphen subsequence is updated if the selected possibility
 * carries one; and, if the sequence is split, the hyphenated string is set on
 * the sequence and on each selected subsequence.
 * <p>
 * NOTE(review): the flag only selects which word is compared against the
 * punctuation sets while building possibilities. The scoring and selection
 * passes always read {@code getGuessedWord()} - confirm this is intended.
 *
 * @param letterSequence
 *            the sequence to analyse; updated in place as described above
 * @param guessedWord
 *            if true, applies to the guessed word
 * @return the highest score found over all possibilities, where the score of
 *         a possibility is the lowest frequency among its scored subsequences
 */
public int getFrequency(LetterSequence letterSequence, boolean guessedWord) {
    int frequency = 0;
    List<LetterSequence> subsequences = letterSequence.getSubsequences();

    // Start from a single empty possibility and grow it subsequence by subsequence.
    List<List<LetterSequence>> possibilities = new ArrayList<>();
    possibilities.add(new ArrayList<LetterSequence>());

    // Cumulative letter count minus one, i.e. the index of the last letter
    // covered once the current subsequence has been counted.
    int lastIndex = -1;
    for (int i = 0; i < subsequences.size(); i++) {
        LetterSequence subsequence = subsequences.get(i);
        lastIndex += subsequence.getLetters().size();
        String word = null;
        if (guessedWord) {
            word = subsequence.getGuessedWord();
        } else {
            word = subsequence.getRealWord();
        }
        List<List<LetterSequence>> newPossibilities = new ArrayList<>();
        for (List<LetterSequence> possibility : possibilities) {
            if (!possibility.isEmpty()) {
                // has this subsequence already been processed ?
                // (a previous step may have appended it together with its predecessor)
                LetterSequence lastSequence = possibility.get(possibility.size() - 1);
                Shape lastShape = lastSequence.getUnderlyingShapeSequence().get(lastSequence.getUnderlyingShapeSequence().size() - 1).getShape();
                Shape myLastShape = subsequence.getUnderlyingShapeSequence().get(subsequence.getUnderlyingShapeSequence().size() - 1).getShape();
                if (lastShape.equals(myLastShape)) {
                    newPossibilities.add(possibility);
                    continue;
                }
            }
            boolean addWord = true;
            if (subsequence.isPunctation()) {
                if (word.equals("-") || midWordPunctuation.contains(word) || startWordPunctuation.contains(word) || endWordPunctuation.contains(word)) {
                    LetterSequence prevSequence = possibility.isEmpty() ? null : possibility.get(possibility.size() - 1);
                    LetterSequence nextSequence = i == subsequences.size() - 1 ? null : subsequences.get(i + 1);
                    // Candidate merges of the punctuation with its neighbours.
                    // NOTE(review): these are built even when a neighbour is null -
                    // presumably the LetterSequence constructor tolerates that; confirm.
                    LetterSequence prevCurrentSequence = new LetterSequence(prevSequence, subsequence);
                    LetterSequence currentNextSequence = new LetterSequence(subsequence, nextSequence);
                    LetterSequence prevCurrentNextSequence = new LetterSequence(prevCurrentSequence, nextSequence);
                    if (word.equals("-")) {
                        if (prevSequence == null && nextSequence == null) {
                            // a lone dash: the possibility is carried over without it
                            newPossibilities.add(possibility);
                        } else if (prevSequence == null) {
                            // leading dash: either separate from, or attached to, the next subsequence
                            List<LetterSequence> newPoss = new ArrayList<>();
                            newPoss.add(subsequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>();
                            newPoss.add(currentNextSequence);
                            newPossibilities.add(newPoss);
                        } else if (nextSequence == null) {
                            // trailing dash: either separate from, or attached to, the previous subsequence
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.add(subsequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentSequence);
                            newPossibilities.add(newPoss);
                        } else {
                            // dash between two subsequences: separate, attached to the next,
                            // attached to the previous, or joining both
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.add(subsequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>(possibility);
                            newPoss.add(currentNextSequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentSequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentNextSequence);
                            newPossibilities.add(newPoss);
                            // add skipped dash possibility
                            if (lastIndex == letterSequence.getEndOfLineHyphenIndex()) {
                                subsequence.setHyphenSubsequence(subsequence);
                                prevCurrentNextSequence.setHyphenSubsequence(subsequence);
                                prevCurrentSequence.setHyphenSubsequence(subsequence);
                                currentNextSequence.setHyphenSubsequence(subsequence);
                                LetterSequence prevNextSequence = new LetterSequence(prevCurrentSequence, nextSequence);
                                prevNextSequence.setHyphenSubsequence(subsequence);
                                prevNextSequence.setSoftHyphen(true);
                                newPoss = new ArrayList<>(possibility);
                                newPoss.remove(newPoss.size() - 1);
                                newPoss.add(prevNextSequence);
                                newPossibilities.add(newPoss);
                            }
                        }
                        addWord = false;
                    }
                    if (midWordPunctuation.contains(word)) {
                        if (prevSequence != null && nextSequence != null) {
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentNextSequence);
                            newPossibilities.add(newPoss);
                            addWord = false;
                        }
                    }
                    if (startWordPunctuation.contains(word)) {
                        if (nextSequence != null && !subsequences.get(subsequences.size() - 1).getGuessedWord().equals(word)) {
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.add(currentNextSequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>(possibility);
                            newPoss.add(subsequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            addWord = false;
                        }
                    }
                    if (endWordPunctuation.contains(word) && !subsequences.get(0).getGuessedWord().equals(word)) {
                        if (prevSequence != null) {
                            // addWord is left true here, so the unattached variant is added below as well
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentSequence);
                            newPossibilities.add(newPoss);
                        }
                    }
                }
            }
            if (addWord) {
                possibility.add(subsequence);
                newPossibilities.add(possibility);
            }
        }
        possibilities = newPossibilities;
        // stop expanding once the number of alternatives exceeds 1000
        if (possibilities.size() > 1000) {
            break;
        }
    }

    // Group the possibilities by score (lowest frequency among their scored subsequences).
    TreeMap<Integer, List<List<LetterSequence>>> freqPossibilityMap = new TreeMap<>();
    for (List<LetterSequence> possibility : possibilities) {
        boolean hasWords = false;
        for (LetterSequence subsequence : possibility) {
            if (!subsequence.isPunctation()) {
                hasWords = true;
                break;
            }
        }
        int minFreq = Integer.MAX_VALUE;
        for (LetterSequence subsequence : possibility) {
            String word = subsequence.getGuessedWord();
            int freq = 0;
            List<CountedOutcome<String>> frequencies = this.linguistics.getFrequencies(word);
            // Guard against a null lookup result before inspecting it, so that the
            // zero-frequency fallback below is actually reachable.
            if (frequencies != null && frequencies.isEmpty()) {
                // check whether word is impossible
                // NOTE(review): this adds to the list returned by the lexicon lookup -
                // confirm that list is mutable and not shared.
                if (!this.linguistics.isWordPossible(word)) {
                    frequencies.add(new CountedOutcome<>(word, -1));
                }
            }
            if (frequencies != null && !frequencies.isEmpty()) {
                subsequence.setWordFrequencies(frequencies);
                letterSequence.getWordFrequencies().add(frequencies.get(0));
                freq = frequencies.get(0).getCount();
            } else {
                // unknown (or missing) word: record it with a frequency of zero
                frequencies = new ArrayList<>();
                frequencies.add(new CountedOutcome<>(word, 0));
                freq = 0;
                subsequence.setWordFrequencies(frequencies);
                letterSequence.getWordFrequencies().add(frequencies.get(0));
            }
            // punctuation does not count towards the score when real words are present
            if (subsequence.isPunctation() && hasWords) {
                continue;
            }
            if (freq < minFreq) {
                minFreq = freq;
            }
        }
        List<List<LetterSequence>> possibilitiesAtFreq = freqPossibilityMap.get(minFreq);
        if (possibilitiesAtFreq == null) {
            possibilitiesAtFreq = new ArrayList<>();
            freqPossibilityMap.put(minFreq, possibilitiesAtFreq);
        }
        possibilitiesAtFreq.add(possibility);
    }

    // Out of all of the sub-sequences possibilities giving the max
    // frequency in the lexicon
    // we choose the one containing the single longest word to populate the
    // subsequences for this letter sequence
    // and select its hyphenated content.
    // Thus if both halves of an existing hyphenated word also happen to
    // exist independently as words in the lexicon,
    // we'll still take the longer hyphenated word.
    List<List<LetterSequence>> maxFreqPossibilities = freqPossibilityMap.lastEntry().getValue();
    List<LetterSequence> maxLengthList = null;
    int maxLengthForList = -1;
    for (List<LetterSequence> possibility : maxFreqPossibilities) {
        int maxLength = 0;
        for (LetterSequence subsequence : possibility) {
            String word = subsequence.getGuessedWord();
            if (word.length() > maxLength) {
                maxLength = word.length();
            }
        }
        // strict comparison: on a tie the earliest possibility is kept
        if (maxLength > maxLengthForList) {
            maxLengthList = possibility;
            maxLengthForList = maxLength;
        }
    }
    frequency = freqPossibilityMap.lastEntry().getKey();
    letterSequence.setSubsequences(maxLengthList);

    // construct the hyphenated string out of the subsequences directly
    // surrounding the hyphen
    // making sure to leave out any opening and closing punctuation
    String hyphenatedString = "";
    boolean foundFirstWord = false;
    String punctuationString = "";
    for (LetterSequence subsequence : maxLengthList) {
        if (subsequence.getHyphenSubsequence() != null) {
            letterSequence.setHyphenSubsequence(subsequence.getHyphenSubsequence());
        }
        if (!foundFirstWord && !subsequence.isPunctation()) {
            foundFirstWord = true;
        }
        if (foundFirstWord && subsequence.isPunctation()) {
            // buffered: only emitted if another word follows, which drops closing punctuation
            punctuationString += subsequence.getGuessedWord();
        } else if (foundFirstWord) {
            hyphenatedString += punctuationString;
            punctuationString = "";
            hyphenatedString += subsequence.getGuessedWord();
        }
    }
    if (letterSequence.isSplit()) {
        letterSequence.setHyphenatedString(hyphenatedString);
        for (LetterSequence subsequence : maxLengthList) {
            subsequence.setHyphenatedString(hyphenatedString);
        }
    }
    return frequency;
}
use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.
the class BeamSearchImageAnalyser method analyseInternal.
/**
 * Guesses the letters of every non-skipped group of shapes in an image.
 * <p>
 * For each group a beam search is run over the candidate shape sequences: a
 * map of heaps keyed by horizontal position is expanded, lowest position
 * first, until the heap whose key equals the group's width is reached. The
 * best sequence is then either the head of that final heap or, when a
 * {@code mostLikelyWordChooser} is set, the sequence it chooses. With a word
 * chooser, the last group of a row (other than the paragraph's last row)
 * whose candidates include one ending in "-" is held over and resolved
 * together with the next analysed group.
 * <p>
 * Observers are notified at image start, at the end of each beam search, at
 * the start and end of each guessed sequence, and at image end. Progress is
 * reported to {@code currentMonitor} when one is set.
 *
 * @param image
 *            the image whose paragraphs, rows and groups are analysed
 */
public void analyseInternal(JochreImage image) {
LOG.debug("Analysing image " + image.getId());
if (currentMonitor != null) {
currentMonitor.setCurrentAction("imageMonitor.analysingImage", new Object[] { image.getPage().getIndex() });
}
for (LetterGuessObserver observer : observers) {
observer.onImageStart(image);
}
// a negative total means it has not been set yet: take it from this image
if (totalShapeCount < 0)
totalShapeCount = image.getShapeCount();
for (Paragraph paragraph : image.getParagraphs()) {
LOG.debug("Analysing paragraph " + paragraph.getIndex() + " (id=" + paragraph.getId() + ")");
// holdover state is local to a paragraph: candidates (and their group) from a
// row-final group that may end in a hyphen, awaiting the next analysed group
List<LetterSequence> holdoverSequences = null;
GroupOfShapes holdoverGroup = null;
for (RowOfShapes row : paragraph.getRows()) {
LOG.debug("Analysing row " + row.getIndex() + " (id=" + row.getId() + ")");
for (GroupOfShapes group : row.getGroups()) {
if (group.isSkip()) {
LOG.debug("Skipping group " + group.getIndex() + " (id=" + group.getId() + ")");
continue;
}
LOG.debug("Analysing group " + group.getIndex() + " (id=" + group.getId() + ")");
// the beam search is complete once a heap keyed by this width is reached
int width = group.getRight() - group.getLeft() + 1;
List<ShapeSequence> shapeSequences = null;
if (boundaryDetector != null) {
shapeSequences = boundaryDetector.findBoundaries(group);
} else {
// no boundary detector: use a single sequence made of the group's shapes as they are
shapeSequences = new ArrayList<>();
ShapeSequence shapeSequence = new ShapeSequence();
for (Shape shape : group.getShapes()) shapeSequence.addShape(shape);
shapeSequences.add(shapeSequence);
}
// Perform a beam search to guess the most likely sequence for this word.
// Heaps are keyed by the position reached within the group; the TreeMap
// lets the lowest position be expanded first.
TreeMap<Integer, PriorityQueue<LetterSequence>> heaps = new TreeMap<>();
// prime a starter heap with the n best shape boundary analyses for this group
PriorityQueue<LetterSequence> starterHeap = new PriorityQueue<>(1);
for (ShapeSequence shapeSequence : shapeSequences) {
LetterSequence emptySequence = new LetterSequence(shapeSequence, jochreSession);
starterHeap.add(emptySequence);
}
heaps.put(0, starterHeap);
// NOTE(review): finalHeap stays null if the heaps run out before any heap keyed
// by `width` is polled (e.g. if no guess passes minOutcomeWeight); the
// finalHeap.isEmpty() call further down would then fail - confirm this cannot happen.
PriorityQueue<LetterSequence> finalHeap = null;
while (heaps.size() > 0) {
Entry<Integer, PriorityQueue<LetterSequence>> heapEntry = heaps.pollFirstEntry();
if (LOG.isTraceEnabled())
LOG.trace("heap for index: " + heapEntry.getKey().intValue() + ", width: " + width);
if (heapEntry.getKey().intValue() == width) {
finalHeap = heapEntry.getValue();
break;
}
PriorityQueue<LetterSequence> previousHeap = heapEntry.getValue();
// limit the breadth to K
int maxSequences = previousHeap.size() > this.beamWidth ? this.beamWidth : previousHeap.size();
for (int j = 0; j < maxSequences; j++) {
LetterSequence history = previousHeap.poll();
ShapeInSequence shapeInSequence = history.getNextShape();
Shape shape = shapeInSequence.getShape();
if (LOG.isTraceEnabled()) {
LOG.trace("Sequence " + history + ", shape: " + shape);
}
LogUtils.logMemory(LOG);
// position reached after this shape, measured from the side of the group
// where reading starts (left for left-to-right, right otherwise)
int position = 0;
if (jochreSession.getLinguistics().isLeftToRight()) {
position = shape.getRight() - group.getLeft() + 1;
} else {
position = group.getRight() - shape.getLeft() + 1;
}
// get or create the heap collecting all sequences that reach this position
PriorityQueue<LetterSequence> heap = heaps.get(position);
if (heap == null) {
heap = new PriorityQueue<>();
heaps.put(position, heap);
}
// the guesses are read back from the shape just below
letterGuesser.guessLetter(shapeInSequence, history);
// heap sort: extend the history by each sufficiently probable guess
for (Decision letterGuess : shape.getLetterGuesses()) {
// leave out very low probability outcomes
if (letterGuess.getProbability() > this.minOutcomeWeight) {
LetterSequence sequence = new LetterSequence(history);
sequence.getLetters().add(letterGuess.getOutcome());
sequence.addDecision(letterGuess);
heap.add(sequence);
}
// weight big enough to include
}
// next letter guess for this shape
}
// next history in heap
}
// any more heaps?
// find best sequence
LetterSequence bestSequence = null;
boolean isHoldover = false;
// keep at most beamWidth candidates, in the order the final heap yields them
List<LetterSequence> finalSequences = new ArrayList<>();
for (int i = 0; i < this.beamWidth; i++) {
if (finalHeap.isEmpty())
break;
finalSequences.add(finalHeap.poll());
}
if (this.mostLikelyWordChooser == null) {
// most likely sequence is on top of the last heap
// NOTE(review): get(0) throws if the final heap was empty - confirm it never is
bestSequence = finalSequences.get(0);
} else {
// get most likely sequence using lexicon
if (holdoverSequences != null) {
// we have a holdover from the previous row
// ending with a dash
bestSequence = this.mostLikelyWordChooser.chooseMostLikelyWord(finalSequences, holdoverSequences, this.beamWidth);
} else {
// check if this is the last group on the row
// and could end with
// a dash
boolean shouldBeHeldOver = false;
// only row-final groups qualify, and never on the paragraph's last row
if (group.getIndex() == row.getGroups().size() - 1 && row.getIndex() < paragraph.getRows().size() - 1) {
for (LetterSequence letterSequence : finalSequences) {
if (letterSequence.toString().endsWith("-")) {
shouldBeHeldOver = true;
break;
}
}
}
if (shouldBeHeldOver) {
// defer the decision: bestSequence stays null and nothing is assigned for now
holdoverSequences = finalSequences;
holdoverGroup = group;
isHoldover = true;
} else {
// simplest case: no holdover
bestSequence = this.mostLikelyWordChooser.chooseMostLikelyWord(finalSequences, this.beamWidth);
}
}
// have we holdover sequences?
}
if (!isHoldover) {
for (LetterGuessObserver observer : observers) {
observer.onBeamSearchEnd(bestSequence, finalSequences, holdoverSequences);
}
}
// assign letter
if (!isHoldover) {
for (LetterGuessObserver observer : observers) {
observer.onStartSequence(bestSequence);
}
if (holdoverGroup == null) {
group.setBestLetterSequence(bestSequence);
} else {
// split bestSequence by group: one part goes to the held-over group,
// the other to the current group
List<LetterSequence> sequencesByGroup = bestSequence.splitByGroup();
for (LetterSequence sequenceByGroup : sequencesByGroup) {
if (sequenceByGroup.getGroups().get(0).equals(holdoverGroup))
holdoverGroup.setBestLetterSequence(sequenceByGroup);
else if (sequenceByGroup.getGroups().get(0).equals(group))
group.setBestLetterSequence(sequenceByGroup);
}
// the holdover has been resolved
holdoverSequences = null;
holdoverGroup = null;
}
// letters and underlying shapes are paired by position
int i = 0;
for (ShapeInSequence shapeInSequence : bestSequence.getUnderlyingShapeSequence()) {
String bestOutcome = bestSequence.getLetters().get(i);
this.assignLetter(shapeInSequence, bestOutcome);
i++;
}
for (LetterGuessObserver observer : observers) {
observer.onGuessSequence(bestSequence);
}
}
// progress counts the group's shapes even when the group has been held over
this.shapeCount += group.getShapes().size();
if (this.currentMonitor != null) {
double progress = (double) shapeCount / (double) totalShapeCount;
LOG.debug("progress: " + progress);
currentMonitor.setPercentComplete(progress);
}
}
// next group
}
// next row
}
for (LetterGuessObserver observer : observers) {
observer.onImageEnd();
}
}
Aggregations