Search in sources :

Example 1 with ShapeSequence

use of com.joliciel.jochre.boundaries.ShapeSequence in project jochre by urieli.

the class JochreLetterEventStream method getNextGroup.

/**
 * Moves on to the next group supplied by {@code groupReader}, resetting the
 * current shape position.
 * <p>
 * The shape index is set back to 0 and the current shape sequence is cleared.
 * If the reader has another group, a new shape sequence is built for it
 * (either the first analysis returned by the boundary detector, or simply the
 * group's own shapes when no detector is configured) and a fresh
 * {@code LetterSequence} history is started on top of it. If the reader is
 * exhausted, the shape sequence stays {@code null} and the history is left
 * untouched.
 */
void getNextGroup() {
    shapeIndex = 0;
    shapeSequence = null;
    if (!groupReader.hasNext()) {
        // nothing left to read: callers see a null shape sequence
        return;
    }
    GroupOfShapes nextGroup = groupReader.next();
    if (boundaryDetector == null) {
        // no detector available: take the group's shapes as they are
        shapeSequence = new ShapeSequence();
        for (Shape member : nextGroup.getShapes()) {
            shapeSequence.addShape(member);
        }
    } else {
        // the detector is expected to supply the correct splits and merges;
        // only its first proposed sequence is used
        shapeSequence = boundaryDetector.findBoundaries(nextGroup).get(0);
    }
    history = new LetterSequence(shapeSequence, jochreSession);
}
Also used : Shape(com.joliciel.jochre.graphics.Shape) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) ShapeSequence(com.joliciel.jochre.boundaries.ShapeSequence)

Example 2 with ShapeSequence

use of com.joliciel.jochre.boundaries.ShapeSequence in project jochre by urieli.

the class LetterSequence method splitByGroup.

/**
 * For a letter sequence covering more than one group, split this letter
 * sequence into one sequence per group.
 * <p>
 * When {@link #isSplit()} is false the returned list contains only this
 * sequence. Otherwise:
 * <ol>
 * <li>one letter sequence per group is obtained, either by reusing
 * {@code groupSequences} when it is non-null, or by walking the letters and
 * their underlying shapes and starting a new sequence each time the shape's
 * group changes;</li>
 * <li>this sequence's subsequences are redistributed to the per-group
 * sequences, after expanding any subsequence that carries an end-of-line
 * hyphen into its own subsequences;</li>
 * <li>the first word frequency of each redistributed subsequence (when it has
 * any) is appended to the word frequencies of the owning per-group
 * sequence;</li>
 * <li>this sequence's hyphen subsequence (if any) is set on the first
 * per-group sequence, and this sequence's score and adjusted score are copied
 * onto every per-group sequence.</li>
 * </ol>
 * Note that when {@code groupSequences} is non-null, the returned list is that
 * same list instance, not a copy.
 *
 * @return the per-group letter sequences, or a single-element list containing
 *         this sequence if it is not split
 */
public List<LetterSequence> splitByGroup() {
    List<LetterSequence> letterSequences = new ArrayList<LetterSequence>();
    if (this.isSplit()) {
        // lookup used further down to attach subsequences to the sequence of their group
        Map<GroupOfShapes, LetterSequence> groupToLetterSequenceMap = new HashMap<GroupOfShapes, LetterSequence>();
        if (groupSequences != null) {
            // per-group sequences already exist: reuse them, keyed by their first group
            letterSequences = groupSequences;
            for (LetterSequence letterSequence : letterSequences) {
                groupToLetterSequenceMap.put(letterSequence.getGroups().get(0), letterSequence);
            }
        } else {
            // build the per-group sequences by walking letters and shapes in parallel
            List<String> currentLetters = new ArrayList<String>();
            ShapeSequence currentShapes = new ShapeSequence();
            GroupOfShapes currentGroup = this.getGroups().get(0);
            for (int i = 0; i < this.letters.size(); i++) {
                String letter = this.letters.get(i);
                Shape shape = this.underlyingShapeSequence.get(i).getShape();
                if (!currentGroup.equals(shape.getGroup())) {
                    // group boundary reached: close the sequence accumulated so far
                    LetterSequence letterSequence = new LetterSequence(currentShapes, currentLetters, jochreSession);
                    // these scores are assigned again for every sequence at the end of the method
                    letterSequence.setScore(this.getScore());
                    letterSequence.setAdjustedScore(this.getAdjustedScore());
                    groupToLetterSequenceMap.put(currentGroup, letterSequence);
                    letterSequences.add(letterSequence);
                    currentLetters = new ArrayList<String>();
                    currentShapes = new ShapeSequence();
                    currentGroup = shape.getGroup();
                }
                currentShapes.addShape(shape);
                currentLetters.add(letter);
            }
            // close the sequence for the last group
            if (currentLetters.size() > 0) {
                LetterSequence letterSequence = new LetterSequence(currentShapes, currentLetters, jochreSession);
                letterSequence.setScore(this.getScore());
                letterSequence.setAdjustedScore(this.getAdjustedScore());
                groupToLetterSequenceMap.put(currentGroup, letterSequence);
                letterSequences.add(letterSequence);
            }
        }
        GroupOfShapes currentGroup = this.getGroups().get(0);
        // flatten: a subsequence with an end-of-line hyphen is replaced by its own subsequences
        List<LetterSequence> newSubsequences = new ArrayList<LetterSequence>();
        for (LetterSequence subsequence : this.getSubsequences()) {
            if (subsequence.getHyphenSubsequence() != null) {
                // subsequence contains end-of-line hyphen
                // break it up into several subsequences
                List<LetterSequence> subsequencesByGroup = subsequence.getSubsequences();
                // the first piece inherits the hyphen subsequence of the whole
                LetterSequence firstSubsequence = subsequencesByGroup.get(0);
                firstSubsequence.setHyphenSubsequence(subsequence.getHyphenSubsequence());
                newSubsequences.addAll(subsequencesByGroup);
                // every piece keeps the hyphenated string of the subsequence it came from
                for (LetterSequence subsubsequence : subsequencesByGroup) {
                    subsubsequence.setHyphenatedString(subsequence.getHyphenatedString());
                }
            } else {
                newSubsequences.add(subsequence);
            }
        }
        // assign my subsequences to the correct group
        List<LetterSequence> currentSubsequences = new ArrayList<LetterSequence>();
        for (LetterSequence subsequence : newSubsequences) {
            if (!subsequence.getGroups().get(0).equals(currentGroup)) {
                // group changed: hand the accumulated subsequences to the previous group's sequence
                // NOTE(review): the map lookup returns null if currentGroup has no entry,
                // which would make the next line throw - confirm this cannot happen
                LetterSequence currentSequence = groupToLetterSequenceMap.get(currentGroup);
                currentSequence.setSubsequences(currentSubsequences);
                for (LetterSequence oneSubsequence : currentSubsequences) {
                    // only the first word frequency of each subsequence is carried over
                    if (oneSubsequence.getWordFrequencies().size() > 0) {
                        currentSequence.getWordFrequencies().add(oneSubsequence.getWordFrequencies().get(0));
                    }
                }
                currentSubsequences = new ArrayList<LetterSequence>();
                currentGroup = subsequence.getGroups().get(0);
            }
            currentSubsequences.add(subsequence);
        }
        // hand the remaining subsequences to the last group's sequence
        if (currentSubsequences.size() > 0) {
            LetterSequence currentSequence = groupToLetterSequenceMap.get(currentGroup);
            currentSequence.setSubsequences(currentSubsequences);
            for (LetterSequence oneSubsequence : currentSubsequences) {
                if (oneSubsequence.getWordFrequencies().size() > 0) {
                    currentSequence.getWordFrequencies().add(oneSubsequence.getWordFrequencies().get(0));
                }
            }
        }
        // the hyphen subsequence of the whole sequence goes to the first per-group sequence
        if (this.getHyphenSubsequence() != null)
            letterSequences.get(0).setHyphenSubsequence(this.getHyphenSubsequence());
        // every per-group sequence carries the scores of the whole sequence
        for (LetterSequence letterSequence : letterSequences) {
            letterSequence.setScore(this.getScore());
            letterSequence.setAdjustedScore(this.getAdjustedScore());
        }
    } else {
        // not split: this sequence is its own single per-group sequence
        letterSequences.add(this);
    }
    return letterSequences;
}
Also used : Shape(com.joliciel.jochre.graphics.Shape) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) ShapeSequence(com.joliciel.jochre.boundaries.ShapeSequence)

Example 3 with ShapeSequence

use of com.joliciel.jochre.boundaries.ShapeSequence in project jochre by urieli.

the class OriginalShapeLetterAssigner method onGuessSequence.

/**
 * Maps the letters guessed for a (possibly split or merged) shape sequence
 * back onto the original shapes they were derived from.
 * <p>
 * The underlying shape sequence is walked in order, together with the original
 * shapes of each of its members. Consecutive members that refer to the same
 * original shape are collected, and each time a different original shape
 * appears, the collected members are passed to {@code assignLetter} for the
 * original shape that has just been completed. The final original shape is
 * flushed after the walk.
 * <p>
 * A member of the sequence may relate to original shapes in several ways:
 * one-to-one, sharing an original shape with the previous and/or next member
 * (the original shape was split), or spanning two or more original shapes
 * (original shapes were merged), as well as combinations of these. In every
 * case, reaching a new original shape is the moment the previous one is
 * complete.
 *
 * @param letterSequence the guessed sequence whose underlying shapes are
 *                       traced back to their original shapes
 */
@Override
public void onGuessSequence(LetterSequence letterSequence) {
    stillValid = true;
    // original shape currently being collected for, and the sequence members covering it
    Shape openOriginal = null;
    List<ShapeInSequence> coveringShapes = new ArrayList<ShapeInSequence>();
    for (ShapeInSequence guessed : letterSequence.getUnderlyingShapeSequence()) {
        for (Shape original : guessed.getOriginalShapes()) {
            boolean sameOriginal = original.equals(openOriginal);
            if (!sameOriginal) {
                // a different original shape starts here, so the previous one is complete
                if (openOriginal != null) {
                    this.assignLetter(openOriginal, coveringShapes);
                }
                openOriginal = original;
                coveringShapes = new ArrayList<ShapeInSequence>();
            }
            coveringShapes.add(guessed);
        }
    }
    // flush whatever was collected for the last original shape
    if (openOriginal != null) {
        this.assignLetter(openOriginal, coveringShapes);
    }
}
Also used : Shape(com.joliciel.jochre.graphics.Shape) ShapeSequence(com.joliciel.jochre.boundaries.ShapeSequence) ArrayList(java.util.ArrayList) ShapeInSequence(com.joliciel.jochre.boundaries.ShapeInSequence)

Example 4 with ShapeSequence

use of com.joliciel.jochre.boundaries.ShapeSequence in project jochre by urieli.

the class LetterSequence method getSubsequences.

/**
 * Breaks this sequence into alternating runs of ordinary letters and
 * punctuation, each run being returned as its own sequence.
 * <p>
 * A letter belongs to a punctuation run when it fully matches
 * {@code PUNCTUATION}. If the sequence contains no punctuation (or nothing
 * but punctuation), the result holds a single subsequence covering all
 * letters. The result is computed on the first call only and stored in
 * {@code subsequences}; later calls return the stored list.
 * <p>
 * Side effect: once the end-of-line hyphen index has been reached, each
 * punctuation run that is closed is registered through
 * {@code setHyphenSubsequence}.
 *
 * @return the letter and punctuation subsequences, in order
 */
public List<LetterSequence> getSubsequences() {
    if (subsequences != null) {
        return subsequences;
    }
    subsequences = new ArrayList<LetterSequence>();
    List<String> runLetters = new ArrayList<String>();
    ShapeSequence runShapes = new ShapeSequence();
    boolean inPunctuation = false;
    // NOTE(review): this flag is never cleared once set, so a punctuation run
    // located after the hyphen also replaces the hyphen subsequence - confirm
    // that this is intended
    boolean expectEndOfLineHyphen = false;
    for (int i = 0; i < this.letters.size(); i++) {
        String letter = this.letters.get(i);
        ShapeInSequence shape = this.underlyingShapeSequence.get(i);
        if (i == this.getEndOfLineHyphenIndex()) {
            expectEndOfLineHyphen = true;
        }
        boolean punctuationHere = PUNCTUATION.matcher(letter).matches();
        if (punctuationHere != inPunctuation && !runLetters.isEmpty()) {
            // switching between letters and punctuation: close the current run
            LetterSequence finishedRun = this.getSubsequence(runShapes, runLetters);
            if (inPunctuation) {
                // the run being closed consisted of punctuation
                finishedRun.setPunctation(true);
                if (expectEndOfLineHyphen) {
                    this.setHyphenSubsequence(finishedRun);
                }
            }
            subsequences.add(finishedRun);
            runLetters = new ArrayList<String>();
            runShapes = new ShapeSequence();
        }
        inPunctuation = punctuationHere;
        runLetters.add(letter);
        runShapes.addShape(shape.getShape());
    }
    if (!runLetters.isEmpty()) {
        // close the trailing run, whichever kind it is
        LetterSequence lastRun = this.getSubsequence(runShapes, runLetters);
        lastRun.setPunctation(inPunctuation);
        if (inPunctuation && expectEndOfLineHyphen) {
            this.setHyphenSubsequence(lastRun);
        }
        subsequences.add(lastRun);
    }
    return subsequences;
}
Also used : ArrayList(java.util.ArrayList) ShapeSequence(com.joliciel.jochre.boundaries.ShapeSequence) ShapeInSequence(com.joliciel.jochre.boundaries.ShapeInSequence)

Example 5 with ShapeSequence

use of com.joliciel.jochre.boundaries.ShapeSequence in project jochre by urieli.

the class BeamSearchImageAnalyser method analyseInternal.

/**
 * Guesses the letters of every non-skipped group of shapes in an image, using
 * a beam search per group, and notifies the registered observers along the
 * way.
 * <p>
 * For each group, candidate shape sequences are taken from the boundary
 * detector (or, without a detector, a single sequence made of the group's own
 * shapes). Candidate letter sequences are kept in heaps keyed by the
 * horizontal position reached inside the group; the heap whose key equals the
 * group width holds the complete candidates. At most {@code beamWidth}
 * candidates are expanded per heap, and letter guesses whose probability does
 * not exceed {@code minOutcomeWeight} are dropped.
 * <p>
 * The best candidate is the top of the final heap when no
 * {@code mostLikelyWordChooser} is set, otherwise the chooser's pick. With a
 * chooser, the last group of a row (other than the paragraph's last row) is
 * held over when any of its candidates ends with "-", and is resolved together
 * with the next analysed group; the chosen sequence is then split by group.
 *
 * @param image the image whose paragraphs, rows and groups are analysed
 */
public void analyseInternal(JochreImage image) {
    LOG.debug("Analysing image " + image.getId());
    if (currentMonitor != null) {
        currentMonitor.setCurrentAction("imageMonitor.analysingImage", new Object[] { image.getPage().getIndex() });
    }
    for (LetterGuessObserver observer : observers) {
        observer.onImageStart(image);
    }
    // a negative total means it has not been set yet: fall back to this image's shape count
    if (totalShapeCount < 0)
        totalShapeCount = image.getShapeCount();
    for (Paragraph paragraph : image.getParagraphs()) {
        LOG.debug("Analysing paragraph " + paragraph.getIndex() + " (id=" + paragraph.getId() + ")");
        // holdover state is local to a paragraph: it never carries over to the next one
        List<LetterSequence> holdoverSequences = null;
        GroupOfShapes holdoverGroup = null;
        for (RowOfShapes row : paragraph.getRows()) {
            LOG.debug("Analysing row " + row.getIndex() + " (id=" + row.getId() + ")");
            for (GroupOfShapes group : row.getGroups()) {
                if (group.isSkip()) {
                    LOG.debug("Skipping group " + group.getIndex() + " (id=" + group.getId() + ")");
                    continue;
                }
                LOG.debug("Analysing group " + group.getIndex() + " (id=" + group.getId() + ")");
                // heap key at which a candidate has covered the whole group
                int width = group.getRight() - group.getLeft() + 1;
                List<ShapeSequence> shapeSequences = null;
                if (boundaryDetector != null) {
                    shapeSequences = boundaryDetector.findBoundaries(group);
                } else {
                    // simply add this group's shapes
                    shapeSequences = new ArrayList<>();
                    ShapeSequence shapeSequence = new ShapeSequence();
                    for (Shape shape : group.getShapes()) shapeSequence.addShape(shape);
                    shapeSequences.add(shapeSequence);
                }
                // Perform a beam search to guess the most likely sequence
                // for this
                // word
                // heaps are keyed by the position reached within the group and polled in ascending key order
                TreeMap<Integer, PriorityQueue<LetterSequence>> heaps = new TreeMap<>();
                // prime a starter heap with the n best shape boundary
                // analyses for
                // this group
                PriorityQueue<LetterSequence> starterHeap = new PriorityQueue<>(1);
                for (ShapeSequence shapeSequence : shapeSequences) {
                    LetterSequence emptySequence = new LetterSequence(shapeSequence, jochreSession);
                    starterHeap.add(emptySequence);
                }
                heaps.put(0, starterHeap);
                // NOTE(review): stays null if no heap with key == width is ever polled,
                // in which case finalHeap.isEmpty() below would throw - confirm this cannot happen
                PriorityQueue<LetterSequence> finalHeap = null;
                while (heaps.size() > 0) {
                    Entry<Integer, PriorityQueue<LetterSequence>> heapEntry = heaps.pollFirstEntry();
                    if (LOG.isTraceEnabled())
                        LOG.trace("heap for index: " + heapEntry.getKey().intValue() + ", width: " + width);
                    if (heapEntry.getKey().intValue() == width) {
                        // this heap holds candidates spanning the whole group: search is over
                        finalHeap = heapEntry.getValue();
                        break;
                    }
                    PriorityQueue<LetterSequence> previousHeap = heapEntry.getValue();
                    // limit the breadth to K
                    int maxSequences = previousHeap.size() > this.beamWidth ? this.beamWidth : previousHeap.size();
                    for (int j = 0; j < maxSequences; j++) {
                        LetterSequence history = previousHeap.poll();
                        ShapeInSequence shapeInSequence = history.getNextShape();
                        Shape shape = shapeInSequence.getShape();
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Sequence " + history + ", shape: " + shape);
                        }
                        LogUtils.logMemory(LOG);
                        // position reached once this shape is consumed, measured from the
                        // side of the group where reading starts
                        int position = 0;
                        if (jochreSession.getLinguistics().isLeftToRight()) {
                            position = shape.getRight() - group.getLeft() + 1;
                        } else {
                            position = group.getRight() - shape.getLeft() + 1;
                        }
                        // get or create the heap collecting candidates that end at this position
                        PriorityQueue<LetterSequence> heap = heaps.get(position);
                        if (heap == null) {
                            heap = new PriorityQueue<>();
                            heaps.put(position, heap);
                        }
                        // the guesses are read back from the shape just below
                        letterGuesser.guessLetter(shapeInSequence, history);
                        // heap sort
                        for (Decision letterGuess : shape.getLetterGuesses()) {
                            // leave out very low probability outcomes
                            if (letterGuess.getProbability() > this.minOutcomeWeight) {
                                // extend a copy of the history with this guess
                                LetterSequence sequence = new LetterSequence(history);
                                sequence.getLetters().add(letterGuess.getOutcome());
                                sequence.addDecision(letterGuess);
                                heap.add(sequence);
                            }
                        // weight big enough to include
                        }
                    // next letter guess for this shape
                    }
                // next history in heap
                }
                // any more heaps?
                // find best sequence
                LetterSequence bestSequence = null;
                boolean isHoldover = false;
                // keep at most beamWidth complete candidates, in heap order
                List<LetterSequence> finalSequences = new ArrayList<>();
                for (int i = 0; i < this.beamWidth; i++) {
                    if (finalHeap.isEmpty())
                        break;
                    finalSequences.add(finalHeap.poll());
                }
                if (this.mostLikelyWordChooser == null) {
                    // most likely sequence is on top of the last heap
                    // NOTE(review): get(0) throws if finalSequences is empty - confirm
                    // that the final heap always holds at least one candidate
                    bestSequence = finalSequences.get(0);
                } else {
                    // get most likely sequence using lexicon
                    if (holdoverSequences != null) {
                        // we have a holdover from the previous row
                        // ending with a dash
                        bestSequence = this.mostLikelyWordChooser.chooseMostLikelyWord(finalSequences, holdoverSequences, this.beamWidth);
                    } else {
                        // check if this is the last group on the row
                        // and could end with
                        // a dash
                        boolean shouldBeHeldOver = false;
                        if (group.getIndex() == row.getGroups().size() - 1 && row.getIndex() < paragraph.getRows().size() - 1) {
                            for (LetterSequence letterSequence : finalSequences) {
                                if (letterSequence.toString().endsWith("-")) {
                                    shouldBeHeldOver = true;
                                    break;
                                }
                            }
                        }
                        if (shouldBeHeldOver) {
                            // defer the decision: this group is resolved together with the next analysed group
                            holdoverSequences = finalSequences;
                            holdoverGroup = group;
                            isHoldover = true;
                        } else {
                            // simplest case: no holdover
                            bestSequence = this.mostLikelyWordChooser.chooseMostLikelyWord(finalSequences, this.beamWidth);
                        }
                    }
                // have we holdover sequences?
                }
                if (!isHoldover) {
                    for (LetterGuessObserver observer : observers) {
                        observer.onBeamSearchEnd(bestSequence, finalSequences, holdoverSequences);
                    }
                }
                // assign letter
                if (!isHoldover) {
                    for (LetterGuessObserver observer : observers) {
                        observer.onStartSequence(bestSequence);
                    }
                    if (holdoverGroup == null) {
                        group.setBestLetterSequence(bestSequence);
                    } else {
                        // split bestSequence by group
                        List<LetterSequence> sequencesByGroup = bestSequence.splitByGroup();
                        for (LetterSequence sequenceByGroup : sequencesByGroup) {
                            if (sequenceByGroup.getGroups().get(0).equals(holdoverGroup))
                                holdoverGroup.setBestLetterSequence(sequenceByGroup);
                            else if (sequenceByGroup.getGroups().get(0).equals(group))
                                group.setBestLetterSequence(sequenceByGroup);
                        }
                        // the holdover has been consumed
                        holdoverSequences = null;
                        holdoverGroup = null;
                    }
                    // pair each underlying shape with the letter at the same index
                    int i = 0;
                    for (ShapeInSequence shapeInSequence : bestSequence.getUnderlyingShapeSequence()) {
                        String bestOutcome = bestSequence.getLetters().get(i);
                        this.assignLetter(shapeInSequence, bestOutcome);
                        i++;
                    }
                    for (LetterGuessObserver observer : observers) {
                        observer.onGuessSequence(bestSequence);
                    }
                }
                // progress counts the group's shapes even when the group was held over
                this.shapeCount += group.getShapes().size();
                if (this.currentMonitor != null) {
                    double progress = (double) shapeCount / (double) totalShapeCount;
                    LOG.debug("progress: " + progress);
                    currentMonitor.setPercentComplete(progress);
                }
            }
        // next group
        }
    // next row
    }
    for (LetterGuessObserver observer : observers) {
        observer.onImageEnd();
    }
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) Shape(com.joliciel.jochre.graphics.Shape) ArrayList(java.util.ArrayList) RowOfShapes(com.joliciel.jochre.graphics.RowOfShapes) PriorityQueue(java.util.PriorityQueue) TreeMap(java.util.TreeMap) Decision(com.joliciel.talismane.machineLearning.Decision) Paragraph(com.joliciel.jochre.graphics.Paragraph) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) ShapeSequence(com.joliciel.jochre.boundaries.ShapeSequence) ShapeInSequence(com.joliciel.jochre.boundaries.ShapeInSequence)

Aggregations

ShapeSequence (com.joliciel.jochre.boundaries.ShapeSequence): 6, Shape (com.joliciel.jochre.graphics.Shape): 5, ShapeInSequence (com.joliciel.jochre.boundaries.ShapeInSequence): 4, ArrayList (java.util.ArrayList): 4, GroupOfShapes (com.joliciel.jochre.graphics.GroupOfShapes): 3, Paragraph (com.joliciel.jochre.graphics.Paragraph): 1, RowOfShapes (com.joliciel.jochre.graphics.RowOfShapes): 1, LetterSequence (com.joliciel.jochre.letterGuesser.LetterSequence): 1, Decision (com.joliciel.talismane.machineLearning.Decision): 1, HashMap (java.util.HashMap): 1, PriorityQueue (java.util.PriorityQueue): 1, TreeMap (java.util.TreeMap): 1