Search in sources:

Example 6 with LetterSequence

use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.

the class LetterFeatureTester method testFeatures.

/**
 * Runs every supplied feature once against a guesser context built from the
 * given shape, with a null letter history.
 *
 * @param shapeInSequence
 *            the shape wrapped in the context handed to each feature
 * @param features
 *            the features to check
 */
void testFeatures(ShapeInSequence shapeInSequence, Set<LetterFeature<?>> features) {
    // the context is deliberately built without any history
    final LetterSequence noHistory = null;
    final LetterGuesserContext guesserContext = new LetterGuesserContext(shapeInSequence, noHistory);
    for (LetterFeature<?> candidate : features) {
        // every feature gets its own fresh runtime environment
        candidate.check(guesserContext, new RuntimeEnvironment());
    }
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) LetterGuesserContext(com.joliciel.jochre.letterGuesser.LetterGuesserContext) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)

Example 7 with LetterSequence

use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.

the class NgramFeature method checkInternal.

/**
 * Builds the string of the {@code n - 1} letters preceding the current
 * shape, where {@code n} is the outcome of {@code nFeature}. Positions for
 * which no preceding letter exists are filled with {@code SPACE}.
 *
 * @param context
 *            supplies the current shape and, when available, the history
 * @param env
 *            the runtime environment passed on to {@code nFeature}
 * @return the result generated from the n-gram string, or null when
 *         {@code nFeature} yields no result
 */
@Override
public FeatureResult<String> checkInternal(LetterGuesserContext context, RuntimeEnvironment env) {
    FeatureResult<Integer> sizeResult = nFeature.check(context, env);
    if (sizeResult == null) {
        // without a value for n there is nothing to build
        return null;
    }
    int lettersNeeded = sizeResult.getOutcome() - 1;
    Shape currentShape = context.getShapeInSequence().getShape();
    LetterSequence guessedSoFar = context.getHistory();
    String prefix = "";
    int offset = 0;
    while (offset < lettersNeeded) {
        String previous;
        if (guessedSoFar == null) {
            // training: read the letters of the preceding shapes in the group
            if (currentShape.getIndex() <= offset) {
                previous = SPACE;
            } else {
                GroupOfShapes owner = currentShape.getGroup();
                previous = owner.getShapes().get(currentShape.getIndex() - offset - 1).getLetter();
            }
        } else {
            // analysis: read the letters from the current history
            if (guessedSoFar.getLetters().size() <= offset) {
                previous = SPACE;
            } else {
                previous = guessedSoFar.getLetters().get(guessedSoFar.getLetters().size() - offset - 1);
            }
        }
        // walking backwards, so each letter found goes in front
        prefix = previous + prefix;
        offset++;
    }
    return this.generateResult(prefix);
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) Shape(com.joliciel.jochre.graphics.Shape) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes)

Example 8 with LetterSequence

use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.

the class MostLikelyWordChooser method getFrequency.

/**
 * Same as {@link #getFrequency(LetterSequence)}, but can either apply to
 * the guessed word or to the real word from the training corpus.
 * <p>
 * The subsequences of the letter sequence are recombined into alternative
 * "possibilities" (punctuation attached to or detached from its
 * neighbours). Each possibility is scored by the lowest frequency among its
 * subsequences, punctuation being ignored when the possibility also
 * contains words. Among the possibilities with the highest score, the one
 * containing the single longest word is stored back on the letter sequence
 * through {@code setSubsequences}.
 * <p>
 * Note that {@code guessedWord} only selects which word form is compared
 * against the punctuation sets while building the possibilities; the
 * frequency lookup itself always uses {@code getGuessedWord()}.
 *
 * @param letterSequence
 *            the sequence to score; its subsequences, word frequencies,
 *            hyphen subsequence and hyphenated string may be updated
 * @param guessedWord
 *            if true, applies to the guessed word
 * @return the highest score found among the possibilities
 */
public int getFrequency(LetterSequence letterSequence, boolean guessedWord) {
    int frequency = 0;
    List<LetterSequence> subsequences = letterSequence.getSubsequences();
    List<List<LetterSequence>> possibilities = new ArrayList<>();
    possibilities.add(new ArrayList<LetterSequence>());
    // index of the last letter of the current subsequence within the whole sequence
    int lastIndex = -1;
    for (int i = 0; i < subsequences.size(); i++) {
        LetterSequence subsequence = subsequences.get(i);
        lastIndex += subsequence.getLetters().size();
        String word = null;
        if (guessedWord) {
            word = subsequence.getGuessedWord();
        } else {
            word = subsequence.getRealWord();
        }
        List<List<LetterSequence>> newPossibilities = new ArrayList<>();
        for (List<LetterSequence> possibility : possibilities) {
            if (possibility.size() > 0) {
                // has this subsequence already been processed ?
                // (it has if the possibility already ends on the same shape)
                LetterSequence lastSequence = possibility.get(possibility.size() - 1);
                Shape lastShape = lastSequence.getUnderlyingShapeSequence().get(lastSequence.getUnderlyingShapeSequence().size() - 1).getShape();
                Shape myLastShape = subsequence.getUnderlyingShapeSequence().get(subsequence.getUnderlyingShapeSequence().size() - 1).getShape();
                if (lastShape.equals(myLastShape)) {
                    newPossibilities.add(possibility);
                    continue;
                }
            }
            boolean addWord = true;
            if (subsequence.isPunctation()) {
                if (word.equals("-") || midWordPunctuation.contains(word) || startWordPunctuation.contains(word) || endWordPunctuation.contains(word)) {
                    LetterSequence prevSequence = possibility.size() == 0 ? null : possibility.get(possibility.size() - 1);
                    LetterSequence nextSequence = i == subsequences.size() - 1 ? null : subsequences.get(i + 1);
                    LetterSequence prevCurrentSequence = new LetterSequence(prevSequence, subsequence);
                    LetterSequence currentNextSequence = new LetterSequence(subsequence, nextSequence);
                    LetterSequence prevCurrentNextSequence = new LetterSequence(prevCurrentSequence, nextSequence);
                    if (word.equals("-")) {
                        if (prevSequence == null && nextSequence == null) {
                            // a dash on its own: nothing to combine it with
                            newPossibilities.add(possibility);
                        } else if (prevSequence == null) {
                            // leading dash: separate from, or attached to, what follows
                            List<LetterSequence> newPoss = new ArrayList<>();
                            newPoss.add(subsequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>();
                            newPoss.add(currentNextSequence);
                            newPossibilities.add(newPoss);
                        } else if (nextSequence == null) {
                            // trailing dash: separate from, or attached to, what precedes
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.add(subsequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentSequence);
                            newPossibilities.add(newPoss);
                        } else {
                            // dash between two subsequences:
                            // prev | dash | next
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.add(subsequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            // prev | dash+next
                            newPoss = new ArrayList<>(possibility);
                            newPoss.add(currentNextSequence);
                            newPossibilities.add(newPoss);
                            // prev+dash | next
                            newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentSequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            // prev+dash+next
                            newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentNextSequence);
                            newPossibilities.add(newPoss);
                            // add skipped dash possibility
                            if (lastIndex == letterSequence.getEndOfLineHyphenIndex()) {
                                subsequence.setHyphenSubsequence(subsequence);
                                prevCurrentNextSequence.setHyphenSubsequence(subsequence);
                                prevCurrentSequence.setHyphenSubsequence(subsequence);
                                currentNextSequence.setHyphenSubsequence(subsequence);
                                // prev+next with the dash left out: built from prevSequence
                                // (not prevCurrentSequence), otherwise this would merely
                                // duplicate prevCurrentNextSequence
                                LetterSequence prevNextSequence = new LetterSequence(prevSequence, nextSequence);
                                prevNextSequence.setHyphenSubsequence(subsequence);
                                prevNextSequence.setSoftHyphen(true);
                                newPoss = new ArrayList<>(possibility);
                                newPoss.remove(newPoss.size() - 1);
                                newPoss.add(prevNextSequence);
                                newPossibilities.add(newPoss);
                            }
                        }
                        addWord = false;
                    }
                    if (midWordPunctuation.contains(word)) {
                        if (prevSequence != null && nextSequence != null) {
                            // mid-word punctuation joins its two neighbours into one word
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentNextSequence);
                            newPossibilities.add(newPoss);
                            addWord = false;
                        }
                    }
                    if (startWordPunctuation.contains(word)) {
                        if (nextSequence != null && !subsequences.get(subsequences.size() - 1).getGuessedWord().equals(word)) {
                            // word-start punctuation: attached to, or separate from, what follows
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.add(currentNextSequence);
                            newPossibilities.add(newPoss);
                            newPoss = new ArrayList<>(possibility);
                            newPoss.add(subsequence);
                            newPoss.add(nextSequence);
                            newPossibilities.add(newPoss);
                            addWord = false;
                        }
                    }
                    if (endWordPunctuation.contains(word) && !subsequences.get(0).getGuessedWord().equals(word)) {
                        if (prevSequence != null) {
                            // word-end punctuation attached to what precedes; addWord is
                            // left untouched, so the detached variant is added below too
                            List<LetterSequence> newPoss = new ArrayList<>(possibility);
                            newPoss.remove(newPoss.size() - 1);
                            newPoss.add(prevCurrentSequence);
                            newPossibilities.add(newPoss);
                        }
                    }
                }
            }
            if (addWord) {
                possibility.add(subsequence);
                newPossibilities.add(possibility);
            }
        }
        possibilities = newPossibilities;
        // stop expanding once the number of possibilities exceeds 1000
        if (possibilities.size() > 1000) {
            break;
        }
    }
    // possibilities grouped by score, so that the highest score is the last entry
    TreeMap<Integer, List<List<LetterSequence>>> freqPossibilityMap = new TreeMap<>();
    for (List<LetterSequence> possibility : possibilities) {
        boolean hasWords = false;
        for (LetterSequence subsequence : possibility) {
            if (!subsequence.isPunctation()) {
                hasWords = true;
                break;
            }
        }
        int minFreq = Integer.MAX_VALUE;
        for (LetterSequence subsequence : possibility) {
            String word = subsequence.getGuessedWord();
            List<CountedOutcome<String>> frequencies = this.linguistics.getFrequencies(word);
            if (frequencies == null || frequencies.isEmpty()) {
                // No frequency known for this word: record -1 if the word is
                // impossible, 0 otherwise. A fresh list is used so that the list
                // returned by the lexicon is never modified.
                int count = this.linguistics.isWordPossible(word) ? 0 : -1;
                frequencies = new ArrayList<>();
                frequencies.add(new CountedOutcome<>(word, count));
            }
            subsequence.setWordFrequencies(frequencies);
            letterSequence.getWordFrequencies().add(frequencies.get(0));
            int freq = frequencies.get(0).getCount();
            // punctuation does not count towards the score when there are words
            if (subsequence.isPunctation() && hasWords) {
                continue;
            }
            if (freq < minFreq)
                minFreq = freq;
        }
        List<List<LetterSequence>> possibilitiesAtFreq = freqPossibilityMap.get(minFreq);
        if (possibilitiesAtFreq == null) {
            possibilitiesAtFreq = new ArrayList<>();
            freqPossibilityMap.put(minFreq, possibilitiesAtFreq);
        }
        possibilitiesAtFreq.add(possibility);
    }
    // Out of all of the sub-sequences possibilities giving the max
    // frequency in the lexicon
    // we choose the one containing the single longest word to populate the
    // subsequences for this letter sequence
    // and select its hyphenated content.
    // Thus if both halves of an existing hyphenated word also happen to
    // exist independently as words in the lexicon,
    // we'll still take the longer hyphenated word.
    List<List<LetterSequence>> maxFreqPossibilities = freqPossibilityMap.lastEntry().getValue();
    List<LetterSequence> maxLengthList = null;
    int maxLengthForList = -1;
    for (List<LetterSequence> possibility : maxFreqPossibilities) {
        int maxLength = 0;
        for (LetterSequence subsequence : possibility) {
            String word = subsequence.getGuessedWord();
            if (word.length() > maxLength)
                maxLength = word.length();
        }
        if (maxLength > maxLengthForList) {
            maxLengthList = possibility;
            maxLengthForList = maxLength;
        }
    }
    frequency = freqPossibilityMap.lastEntry().getKey();
    letterSequence.setSubsequences(maxLengthList);
    // construct the hyphenated string out of the subsequences directly
    // surrounding the hyphen
    // making sure to leave out any opening and closing punctuation
    String hyphenatedString = "";
    boolean foundFirstWord = false;
    String punctuationString = "";
    for (LetterSequence subsequence : maxLengthList) {
        if (subsequence.getHyphenSubsequence() != null) {
            letterSequence.setHyphenSubsequence(subsequence.getHyphenSubsequence());
        }
        if (!foundFirstWord && !subsequence.isPunctation())
            foundFirstWord = true;
        if (foundFirstWord && subsequence.isPunctation()) {
            // held back: only appended if another word follows
            punctuationString += subsequence.getGuessedWord();
        } else if (foundFirstWord) {
            hyphenatedString += punctuationString;
            punctuationString = "";
            hyphenatedString += subsequence.getGuessedWord();
        }
    }
    if (letterSequence.isSplit()) {
        letterSequence.setHyphenatedString(hyphenatedString);
        for (LetterSequence subsequence : maxLengthList) {
            subsequence.setHyphenatedString(hyphenatedString);
        }
    }
    return frequency;
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) Shape(com.joliciel.jochre.graphics.Shape) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) CountedOutcome(com.joliciel.talismane.utils.CountedOutcome) ArrayList(java.util.ArrayList) List(java.util.List)

Example 9 with LetterSequence

use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.

the class BeamSearchImageAnalyser method analyseInternal.

/**
 * Guesses the letters of every non-skipped group of shapes in an image.
 * <p>
 * For each group, a beam search is run over the letter guesses of its
 * shapes, the best resulting letter sequence is chosen (through
 * {@code mostLikelyWordChooser} when one is set, otherwise the top of the
 * final heap), the letters are assigned to the shapes, and the registered
 * observers are notified. When the last group of a row (other than the
 * paragraph's last row) may end with a dash, its sequences are held over
 * and resolved together with the next analysed group.
 *
 * @param image
 *            the image whose paragraphs, rows and groups are analysed
 */
public void analyseInternal(JochreImage image) {
    LOG.debug("Analysing image " + image.getId());
    if (currentMonitor != null) {
        currentMonitor.setCurrentAction("imageMonitor.analysingImage", new Object[] { image.getPage().getIndex() });
    }
    for (LetterGuessObserver observer : observers) {
        observer.onImageStart(image);
    }
    // a negative total means it has not been set yet: use this image's shape count
    if (totalShapeCount < 0)
        totalShapeCount = image.getShapeCount();
    for (Paragraph paragraph : image.getParagraphs()) {
        LOG.debug("Analysing paragraph " + paragraph.getIndex() + " (id=" + paragraph.getId() + ")");
        // holdover state is reset for every paragraph
        List<LetterSequence> holdoverSequences = null;
        GroupOfShapes holdoverGroup = null;
        for (RowOfShapes row : paragraph.getRows()) {
            LOG.debug("Analysing row " + row.getIndex() + " (id=" + row.getId() + ")");
            for (GroupOfShapes group : row.getGroups()) {
                if (group.isSkip()) {
                    LOG.debug("Skipping group " + group.getIndex() + " (id=" + group.getId() + ")");
                    continue;
                }
                LOG.debug("Analysing group " + group.getIndex() + " (id=" + group.getId() + ")");
                // the heap keyed by this width holds the sequences covering the whole group
                int width = group.getRight() - group.getLeft() + 1;
                List<ShapeSequence> shapeSequences = null;
                if (boundaryDetector != null) {
                    shapeSequences = boundaryDetector.findBoundaries(group);
                } else {
                    // no boundary detector: use the group's shapes as a single sequence
                    shapeSequences = new ArrayList<>();
                    ShapeSequence shapeSequence = new ShapeSequence();
                    for (Shape shape : group.getShapes()) shapeSequence.addShape(shape);
                    shapeSequences.add(shapeSequence);
                }
                // Perform a beam search to guess the most likely sequence
                // for this word. Heaps are keyed by the position reached
                // within the group.
                TreeMap<Integer, PriorityQueue<LetterSequence>> heaps = new TreeMap<>();
                // prime a starter heap with the n best shape boundary
                // analyses for this group
                PriorityQueue<LetterSequence> starterHeap = new PriorityQueue<>(1);
                for (ShapeSequence shapeSequence : shapeSequences) {
                    LetterSequence emptySequence = new LetterSequence(shapeSequence, jochreSession);
                    starterHeap.add(emptySequence);
                }
                heaps.put(0, starterHeap);
                // NOTE(review): stays null if no heap keyed by width is ever
                // reached, in which case finalHeap.isEmpty() below would throw
                // - confirm this cannot happen
                PriorityQueue<LetterSequence> finalHeap = null;
                while (heaps.size() > 0) {
                    // always continue from the heap with the smallest position
                    Entry<Integer, PriorityQueue<LetterSequence>> heapEntry = heaps.pollFirstEntry();
                    if (LOG.isTraceEnabled())
                        LOG.trace("heap for index: " + heapEntry.getKey().intValue() + ", width: " + width);
                    if (heapEntry.getKey().intValue() == width) {
                        // the whole group is covered: this is the final heap
                        finalHeap = heapEntry.getValue();
                        break;
                    }
                    PriorityQueue<LetterSequence> previousHeap = heapEntry.getValue();
                    // limit the breadth to K
                    int maxSequences = previousHeap.size() > this.beamWidth ? this.beamWidth : previousHeap.size();
                    for (int j = 0; j < maxSequences; j++) {
                        LetterSequence history = previousHeap.poll();
                        ShapeInSequence shapeInSequence = history.getNextShape();
                        Shape shape = shapeInSequence.getShape();
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Sequence " + history + ", shape: " + shape);
                        }
                        LogUtils.logMemory(LOG);
                        // position reached once this shape is included, measured
                        // from the group's starting edge in reading direction
                        int position = 0;
                        if (jochreSession.getLinguistics().isLeftToRight()) {
                            position = shape.getRight() - group.getLeft() + 1;
                        } else {
                            position = group.getRight() - shape.getLeft() + 1;
                        }
                        PriorityQueue<LetterSequence> heap = heaps.get(position);
                        if (heap == null) {
                            heap = new PriorityQueue<>();
                            heaps.put(position, heap);
                        }
                        letterGuesser.guessLetter(shapeInSequence, history);
                        // heap sort: extend the history with each guess for this shape
                        for (Decision letterGuess : shape.getLetterGuesses()) {
                            // leave out very low probability outcomes
                            if (letterGuess.getProbability() > this.minOutcomeWeight) {
                                LetterSequence sequence = new LetterSequence(history);
                                sequence.getLetters().add(letterGuess.getOutcome());
                                sequence.addDecision(letterGuess);
                                heap.add(sequence);
                            }
                            // weight big enough to include
                        }
                        // next letter guess for this shape
                    }
                    // next history in heap
                }
                // any more heaps?
                // find best sequence
                LetterSequence bestSequence = null;
                boolean isHoldover = false;
                // take at most beamWidth sequences from the top of the final heap
                List<LetterSequence> finalSequences = new ArrayList<>();
                for (int i = 0; i < this.beamWidth; i++) {
                    if (finalHeap.isEmpty())
                        break;
                    finalSequences.add(finalHeap.poll());
                }
                if (this.mostLikelyWordChooser == null) {
                    // most likely sequence is on top of the last heap
                    // NOTE(review): get(0) throws if the final heap was empty
                    // - confirm this cannot happen
                    bestSequence = finalSequences.get(0);
                } else {
                    // get most likely sequence using lexicon
                    if (holdoverSequences != null) {
                        // we have a holdover from the previous row
                        // ending with a dash
                        bestSequence = this.mostLikelyWordChooser.chooseMostLikelyWord(finalSequences, holdoverSequences, this.beamWidth);
                    } else {
                        // check if this is the last group on the row
                        // and could end with a dash
                        boolean shouldBeHeldOver = false;
                        if (group.getIndex() == row.getGroups().size() - 1 && row.getIndex() < paragraph.getRows().size() - 1) {
                            for (LetterSequence letterSequence : finalSequences) {
                                if (letterSequence.toString().endsWith("-")) {
                                    shouldBeHeldOver = true;
                                    break;
                                }
                            }
                        }
                        if (shouldBeHeldOver) {
                            // defer the decision until the next group has been analysed
                            holdoverSequences = finalSequences;
                            holdoverGroup = group;
                            isHoldover = true;
                        } else {
                            // simplest case: no holdover
                            bestSequence = this.mostLikelyWordChooser.chooseMostLikelyWord(finalSequences, this.beamWidth);
                        }
                    }
                    // have we holdover sequences?
                }
                if (!isHoldover) {
                    for (LetterGuessObserver observer : observers) {
                        observer.onBeamSearchEnd(bestSequence, finalSequences, holdoverSequences);
                    }
                }
                // assign letter
                if (!isHoldover) {
                    for (LetterGuessObserver observer : observers) {
                        observer.onStartSequence(bestSequence);
                    }
                    if (holdoverGroup == null) {
                        group.setBestLetterSequence(bestSequence);
                    } else {
                        // split bestSequence by group, giving the held-over group
                        // and the current group their own part
                        List<LetterSequence> sequencesByGroup = bestSequence.splitByGroup();
                        for (LetterSequence sequenceByGroup : sequencesByGroup) {
                            if (sequenceByGroup.getGroups().get(0).equals(holdoverGroup))
                                holdoverGroup.setBestLetterSequence(sequenceByGroup);
                            else if (sequenceByGroup.getGroups().get(0).equals(group))
                                group.setBestLetterSequence(sequenceByGroup);
                        }
                        // the holdover has now been resolved
                        holdoverSequences = null;
                        holdoverGroup = null;
                    }
                    // the i-th letter of the best sequence goes to the i-th underlying shape
                    int i = 0;
                    for (ShapeInSequence shapeInSequence : bestSequence.getUnderlyingShapeSequence()) {
                        String bestOutcome = bestSequence.getLetters().get(i);
                        this.assignLetter(shapeInSequence, bestOutcome);
                        i++;
                    }
                    for (LetterGuessObserver observer : observers) {
                        observer.onGuessSequence(bestSequence);
                    }
                }
                // progress is counted in shapes, including those of held-over groups
                this.shapeCount += group.getShapes().size();
                if (this.currentMonitor != null) {
                    double progress = (double) shapeCount / (double) totalShapeCount;
                    LOG.debug("progress: " + progress);
                    currentMonitor.setPercentComplete(progress);
                }
            }
            // next group
        }
        // next row
    }
    for (LetterGuessObserver observer : observers) {
        observer.onImageEnd();
    }
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) Shape(com.joliciel.jochre.graphics.Shape) ArrayList(java.util.ArrayList) RowOfShapes(com.joliciel.jochre.graphics.RowOfShapes) PriorityQueue(java.util.PriorityQueue) TreeMap(java.util.TreeMap) Decision(com.joliciel.talismane.machineLearning.Decision) Paragraph(com.joliciel.jochre.graphics.Paragraph) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) ShapeSequence(com.joliciel.jochre.boundaries.ShapeSequence) ShapeInSequence(com.joliciel.jochre.boundaries.ShapeInSequence)

Aggregations

LetterSequence (com.joliciel.jochre.letterGuesser.LetterSequence)9 ArrayList (java.util.ArrayList)5 GroupOfShapes (com.joliciel.jochre.graphics.GroupOfShapes)4 Shape (com.joliciel.jochre.graphics.Shape)3 ShapeInSequence (com.joliciel.jochre.boundaries.ShapeInSequence)2 Paragraph (com.joliciel.jochre.graphics.Paragraph)2 RowOfShapes (com.joliciel.jochre.graphics.RowOfShapes)2 IOException (java.io.IOException)2 List (java.util.List)2 PriorityQueue (java.util.PriorityQueue)2 TreeMap (java.util.TreeMap)2 ShapeSequence (com.joliciel.jochre.boundaries.ShapeSequence)1 Linguistics (com.joliciel.jochre.lang.Linguistics)1 LetterGuesserContext (com.joliciel.jochre.letterGuesser.LetterGuesserContext)1 Decision (com.joliciel.talismane.machineLearning.Decision)1 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)1 CountedOutcome (com.joliciel.talismane.utils.CountedOutcome)1 BufferedWriter (java.io.BufferedWriter)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Writer (java.io.Writer)1