use of com.joliciel.jochre.boundaries.ShapeSequence in project jochre by urieli.
the class JochreLetterEventStream method getNextGroup.
/**
 * Moves on to the next group provided by the group reader and resets the
 * per-group state.
 * <p>
 * The shape sequence is cleared and the shape index is set back to 0 on every
 * call. If the reader has no more groups, nothing else happens: the shape
 * sequence stays {@code null} and the history is left as it was.
 */
void getNextGroup() {
  shapeSequence = null;
  shapeIndex = 0;
  if (!groupReader.hasNext()) {
    return;
  }

  GroupOfShapes nextGroup = groupReader.next();
  if (boundaryDetector == null) {
    // no boundary detector: use the group's shapes exactly as they stand
    shapeSequence = new ShapeSequence();
    for (Shape groupShape : nextGroup.getShapes()) {
      shapeSequence.addShape(groupShape);
    }
  } else {
    // the boundary detector is expected to provide the correct splits and
    // merges; only the first sequence it returns is used
    shapeSequence = boundaryDetector.findBoundaries(nextGroup).get(0);
  }

  // start a fresh letter history on top of the new shape sequence
  history = new LetterSequence(shapeSequence, jochreSession);
}
use of com.joliciel.jochre.boundaries.ShapeSequence in project jochre by urieli.
the class LetterSequence method splitByGroup.
/**
 * For a letter sequence covering two groups, split this letter sequence into
 * one sequence per group.
 * <p>
 * A sequence that is not split is returned as the single element of a new
 * list. Otherwise each per-group sequence receives this sequence's score and
 * adjusted score, the subsequences belonging to its group, and the first word
 * frequency of each of those subsequences. The hyphen subsequence of this
 * sequence, if any, is copied onto the first per-group sequence.
 *
 * @return the per-group letter sequences, in group order
 */
public List<LetterSequence> splitByGroup() {
  if (!this.isSplit()) {
    List<LetterSequence> unsplit = new ArrayList<LetterSequence>();
    unsplit.add(this);
    return unsplit;
  }

  // step 1: one letter sequence per group, indexed by that group
  List<LetterSequence> perGroup;
  Map<GroupOfShapes, LetterSequence> sequenceForGroup = new HashMap<GroupOfShapes, LetterSequence>();
  if (groupSequences != null) {
    // the per-group sequences are already known: reuse them as they are
    perGroup = groupSequences;
    for (LetterSequence groupSequence : perGroup) {
      sequenceForGroup.put(groupSequence.getGroups().get(0), groupSequence);
    }
  } else {
    // cut the letters wherever the shape's group changes
    perGroup = new ArrayList<LetterSequence>();
    List<String> pendingLetters = new ArrayList<String>();
    ShapeSequence pendingShapes = new ShapeSequence();
    GroupOfShapes pendingGroup = this.getGroups().get(0);
    int position = 0;
    for (String letter : this.letters) {
      Shape shape = this.underlyingShapeSequence.get(position).getShape();
      position++;
      GroupOfShapes shapeGroup = shape.getGroup();
      if (!pendingGroup.equals(shapeGroup)) {
        LetterSequence finished = this.buildGroupSequence(pendingShapes, pendingLetters);
        sequenceForGroup.put(pendingGroup, finished);
        perGroup.add(finished);
        pendingLetters = new ArrayList<String>();
        pendingShapes = new ShapeSequence();
        pendingGroup = shape.getGroup();
      }
      pendingShapes.addShape(shape);
      pendingLetters.add(letter);
    }
    if (!pendingLetters.isEmpty()) {
      LetterSequence finished = this.buildGroupSequence(pendingShapes, pendingLetters);
      sequenceForGroup.put(pendingGroup, finished);
      perGroup.add(finished);
    }
  }

  // step 2: flatten the subsequences, breaking up any that carry an
  // end-of-line hyphen into their own subsequences
  GroupOfShapes activeGroup = this.getGroups().get(0);
  List<LetterSequence> flattened = new ArrayList<LetterSequence>();
  for (LetterSequence subsequence : this.getSubsequences()) {
    if (subsequence.getHyphenSubsequence() == null) {
      flattened.add(subsequence);
      continue;
    }
    List<LetterSequence> pieces = subsequence.getSubsequences();
    pieces.get(0).setHyphenSubsequence(subsequence.getHyphenSubsequence());
    flattened.addAll(pieces);
    for (LetterSequence piece : pieces) {
      piece.setHyphenatedString(subsequence.getHyphenatedString());
    }
  }

  // step 3: hand each run of subsequences to the sequence of its group
  List<LetterSequence> activeSubsequences = new ArrayList<LetterSequence>();
  for (LetterSequence candidate : flattened) {
    GroupOfShapes candidateGroup = candidate.getGroups().get(0);
    if (!candidateGroup.equals(activeGroup)) {
      this.attachSubsequences(sequenceForGroup.get(activeGroup), activeSubsequences);
      activeSubsequences = new ArrayList<LetterSequence>();
      activeGroup = candidateGroup;
    }
    activeSubsequences.add(candidate);
  }
  if (!activeSubsequences.isEmpty()) {
    this.attachSubsequences(sequenceForGroup.get(activeGroup), activeSubsequences);
  }

  // step 4: propagate the hyphen subsequence and the scores
  if (this.getHyphenSubsequence() != null) {
    perGroup.get(0).setHyphenSubsequence(this.getHyphenSubsequence());
  }
  for (LetterSequence groupSequence : perGroup) {
    groupSequence.setScore(this.getScore());
    groupSequence.setAdjustedScore(this.getAdjustedScore());
  }
  return perGroup;
}

/** Creates a per-group letter sequence carrying this sequence's score and adjusted score. */
private LetterSequence buildGroupSequence(ShapeSequence groupShapes, List<String> groupLetters) {
  LetterSequence groupSequence = new LetterSequence(groupShapes, groupLetters, jochreSession);
  groupSequence.setScore(this.getScore());
  groupSequence.setAdjustedScore(this.getAdjustedScore());
  return groupSequence;
}

/** Sets the subsequences of a per-group sequence and copies over the first word frequency of each one. */
private void attachSubsequences(LetterSequence groupSequence, List<LetterSequence> groupSubsequences) {
  groupSequence.setSubsequences(groupSubsequences);
  for (LetterSequence groupSubsequence : groupSubsequences) {
    if (groupSubsequence.getWordFrequencies().size() > 0) {
      groupSequence.getWordFrequencies().add(groupSubsequence.getWordFrequencies().get(0));
    }
  }
}
use of com.joliciel.jochre.boundaries.ShapeSequence in project jochre by urieli.
the class OriginalShapeLetterAssigner method onGuessSequence.
/**
 * Walks the shapes underlying the guessed letter sequence and, for every run
 * of consecutive entries that refer to the same original shape, passes that
 * original shape together with the run to {@code assignLetter}.
 * <p>
 * A shape in the sequence is added to the run once per original shape it
 * lists, so it may appear in several runs (or several times in one run).
 *
 * @param letterSequence the guessed letter sequence
 */
@Override
public void onGuessSequence(LetterSequence letterSequence) {
  stillValid = true;

  Shape currentOriginal = null;
  List<ShapeInSequence> shapesForCurrentOriginal = new ArrayList<ShapeInSequence>();
  for (ShapeInSequence sequenceShape : letterSequence.getUnderlyingShapeSequence()) {
    // A shape in the sequence relates to the original shapes in one of these ways:
    // 1) 1-to-1 with an original shape (A from original shape A)
    // 2) shares an original shape with the previous shape (B from original shape AB)
    // 3) shares an original shape with the next shape (A from original shape AB)
    // 4) shares an original shape with both neighbours (B from original shape ABC)
    // 5) built from two original shapes (A from original shapes |A A|)
    // 6) built from three original shapes (A from original shapes |A * A|)
    // 7) shares with the previous shape and has 2+ original shapes (A from |A A|B)
    // 8) shares with the next shape and has 2+ original shapes (B from A|B B|)
    // So whenever a new original shape is reached, it either coincides with
    // the border of the previous shape or it does not.
    for (Shape original : sequenceShape.getOriginalShapes()) {
      boolean reachedNewOriginal = !original.equals(currentOriginal);
      if (reachedNewOriginal) {
        // flush the letters collected for the original shape just completed
        if (currentOriginal != null) {
          this.assignLetter(currentOriginal, shapesForCurrentOriginal);
        }
        currentOriginal = original;
        shapesForCurrentOriginal = new ArrayList<ShapeInSequence>();
      }
      shapesForCurrentOriginal.add(sequenceShape);
    }
  }

  // flush the last original shape, if there was any at all
  if (currentOriginal != null) {
    this.assignLetter(currentOriginal, shapesForCurrentOriginal);
  }
}
use of com.joliciel.jochre.boundaries.ShapeSequence in project jochre by urieli.
the class LetterSequence method getSubsequences.
/**
 * If this sequence contains any punctuation, returns individual sequences
 * representing letters and punctuation. Otherwise, returns the original
 * sequence.
 * <p>
 * The result is computed on the first call and cached in {@code subsequences}.
 * Consecutive letters matching {@code PUNCTUATION} form one punctuation
 * subsequence; consecutive non-matching letters form one non-punctuation
 * subsequence. The punctuation run that is closed first after the end-of-line
 * hyphen index has been reached is recorded as this sequence's hyphen
 * subsequence.
 *
 * @return the cached list of subsequences, in letter order
 */
public List<LetterSequence> getSubsequences() {
  if (subsequences == null) {
    subsequences = new ArrayList<LetterSequence>();
    List<String> currentLetters = new ArrayList<String>();
    ShapeSequence currentShapes = new ShapeSequence();
    boolean inPunctuation = false;
    boolean expectEndOfLineHyphen = false;
    for (int i = 0; i < this.letters.size(); i++) {
      String letter = this.letters.get(i);
      ShapeInSequence shape = this.underlyingShapeSequence.get(i);
      if (i == this.getEndOfLineHyphenIndex()) {
        expectEndOfLineHyphen = true;
      }
      if (PUNCTUATION.matcher(letter).matches()) {
        if (!inPunctuation && currentLetters.size() > 0) {
          // a run of ordinary letters ends here
          LetterSequence subsequence = this.getSubsequence(currentShapes, currentLetters);
          subsequences.add(subsequence);
          currentLetters = new ArrayList<String>();
          currentShapes = new ShapeSequence();
        }
        inPunctuation = true;
      } else {
        if (inPunctuation && currentLetters.size() > 0) {
          // a run of punctuation ends here
          LetterSequence subsequence = this.getSubsequence(currentShapes, currentLetters);
          subsequence.setPunctation(true);
          if (expectEndOfLineHyphen) {
            this.setHyphenSubsequence(subsequence);
            // The hyphen run has been found: clear the flag so that a later
            // punctuation run (e.g. a trailing comma on the continuation
            // word) does not replace it.
            expectEndOfLineHyphen = false;
          }
          subsequences.add(subsequence);
          currentLetters = new ArrayList<String>();
          currentShapes = new ShapeSequence();
        }
        inPunctuation = false;
      }
      currentLetters.add(letter);
      currentShapes.addShape(shape.getShape());
    }
    if (currentLetters.size() > 0) {
      // whatever is left over forms the final subsequence
      LetterSequence subsequence = this.getSubsequence(currentShapes, currentLetters);
      subsequence.setPunctation(inPunctuation);
      if (inPunctuation && expectEndOfLineHyphen) {
        this.setHyphenSubsequence(subsequence);
      }
      subsequences.add(subsequence);
    }
  }
  return subsequences;
}
use of com.joliciel.jochre.boundaries.ShapeSequence in project jochre by urieli.
the class BeamSearchImageAnalyser method analyseInternal.
/**
 * Analyses every group of shapes in the image, paragraph by paragraph and row
 * by row, choosing a best letter sequence for each group by beam search and
 * notifying the registered observers along the way. Groups flagged as skipped
 * are ignored.
 * <p>
 * For each group:
 * <ul>
 * <li>the candidate shape sequences come from the boundary detector if there
 * is one; otherwise a single sequence is built from the group's shapes;</li>
 * <li>a beam search extends letter sequences one shape at a time, keeping one
 * heap per reached position; at most {@code beamWidth} sequences are expanded
 * per heap, and guesses whose probability does not exceed
 * {@code minOutcomeWeight} are dropped;</li>
 * <li>the best sequence is either the first of the final heap or, if a
 * most-likely-word chooser is set, the one it selects;</li>
 * <li>with a chooser, the candidates of the last group of a row (other than
 * the last row of the paragraph) are held over to the next analysed group when
 * any of them ends with "-"; the two are then chosen together and the result
 * is split by group.</li>
 * </ul>
 *
 * @param image the image to analyse
 */
public void analyseInternal(JochreImage image) {
LOG.debug("Analysing image " + image.getId());
if (currentMonitor != null) {
currentMonitor.setCurrentAction("imageMonitor.analysingImage", new Object[] { image.getPage().getIndex() });
}
for (LetterGuessObserver observer : observers) {
observer.onImageStart(image);
}
// a negative total is replaced by this image's shape count
// (presumably negative means "not set yet" - confirm where the field is initialised)
if (totalShapeCount < 0)
totalShapeCount = image.getShapeCount();
for (Paragraph paragraph : image.getParagraphs()) {
LOG.debug("Analysing paragraph " + paragraph.getIndex() + " (id=" + paragraph.getId() + ")");
// holdover state is kept per paragraph: candidates of a group that may end
// with an end-of-line hyphen, and the group they belong to
List<LetterSequence> holdoverSequences = null;
GroupOfShapes holdoverGroup = null;
for (RowOfShapes row : paragraph.getRows()) {
LOG.debug("Analysing row " + row.getIndex() + " (id=" + row.getId() + ")");
for (GroupOfShapes group : row.getGroups()) {
if (group.isSkip()) {
LOG.debug("Skipping group " + group.getIndex() + " (id=" + group.getId() + ")");
continue;
}
LOG.debug("Analysing group " + group.getIndex() + " (id=" + group.getId() + ")");
// group width, counting both the left and the right edge
int width = group.getRight() - group.getLeft() + 1;
List<ShapeSequence> shapeSequences = null;
if (boundaryDetector != null) {
shapeSequences = boundaryDetector.findBoundaries(group);
} else {
// simply add this group's shapes
shapeSequences = new ArrayList<>();
ShapeSequence shapeSequence = new ShapeSequence();
for (Shape shape : group.getShapes()) shapeSequence.addShape(shape);
shapeSequences.add(shapeSequence);
}
// Perform a beam search to guess the most likely sequence
// for this
// word
// heaps are keyed by the position reached so far (see "position" below);
// the TreeMap lets us always continue from the lowest position
TreeMap<Integer, PriorityQueue<LetterSequence>> heaps = new TreeMap<>();
// prime a starter heap with the n best shape boundary
// analyses for
// this group
PriorityQueue<LetterSequence> starterHeap = new PriorityQueue<>(1);
for (ShapeSequence shapeSequence : shapeSequences) {
LetterSequence emptySequence = new LetterSequence(shapeSequence, jochreSession);
starterHeap.add(emptySequence);
}
heaps.put(0, starterHeap);
// NOTE(review): finalHeap stays null if no heap keyed at "width" is ever
// polled (e.g. the heaps run out first); finalHeap.isEmpty() below would
// then throw a NullPointerException - confirm whether this can happen
PriorityQueue<LetterSequence> finalHeap = null;
while (heaps.size() > 0) {
Entry<Integer, PriorityQueue<LetterSequence>> heapEntry = heaps.pollFirstEntry();
if (LOG.isTraceEnabled())
LOG.trace("heap for index: " + heapEntry.getKey().intValue() + ", width: " + width);
// a heap keyed at the full width holds sequences covering the whole group
if (heapEntry.getKey().intValue() == width) {
finalHeap = heapEntry.getValue();
break;
}
PriorityQueue<LetterSequence> previousHeap = heapEntry.getValue();
// limit the breadth to K
int maxSequences = previousHeap.size() > this.beamWidth ? this.beamWidth : previousHeap.size();
for (int j = 0; j < maxSequences; j++) {
// poll() takes the head according to the queue's ordering of LetterSequence
// (presumably best score first - confirm in LetterSequence)
LetterSequence history = previousHeap.poll();
ShapeInSequence shapeInSequence = history.getNextShape();
Shape shape = shapeInSequence.getShape();
if (LOG.isTraceEnabled()) {
LOG.trace("Sequence " + history + ", shape: " + shape);
}
LogUtils.logMemory(LOG);
// distance from the group's starting edge to the far edge of this shape,
// measured in reading direction; it equals "width" when the shape ends
// at the group's far edge
int position = 0;
if (jochreSession.getLinguistics().isLeftToRight()) {
position = shape.getRight() - group.getLeft() + 1;
} else {
position = group.getRight() - shape.getLeft() + 1;
}
PriorityQueue<LetterSequence> heap = heaps.get(position);
if (heap == null) {
heap = new PriorityQueue<>();
heaps.put(position, heap);
}
// the guesses are read back from the shape just below
letterGuesser.guessLetter(shapeInSequence, history);
// heap sort
for (Decision letterGuess : shape.getLetterGuesses()) {
// leave out very low probability outcomes
if (letterGuess.getProbability() > this.minOutcomeWeight) {
// extend a copy of the history with this guess
LetterSequence sequence = new LetterSequence(history);
sequence.getLetters().add(letterGuess.getOutcome());
sequence.addDecision(letterGuess);
heap.add(sequence);
}
// weight big enough to include
}
// next letter guess for this shape
}
// next history in heap
}
// any more heaps?
// find best sequence
LetterSequence bestSequence = null;
boolean isHoldover = false;
// keep at most beamWidth sequences from the final heap, in poll order
List<LetterSequence> finalSequences = new ArrayList<>();
for (int i = 0; i < this.beamWidth; i++) {
if (finalHeap.isEmpty())
break;
finalSequences.add(finalHeap.poll());
}
if (this.mostLikelyWordChooser == null) {
// most likely sequence is on top of the last heap
// NOTE(review): get(0) throws if finalSequences is empty - confirm that
// the final heap can never be empty here
bestSequence = finalSequences.get(0);
} else {
// get most likely sequence using lexicon
if (holdoverSequences != null) {
// we have a holdover from the previous row
// ending with a dash
bestSequence = this.mostLikelyWordChooser.chooseMostLikelyWord(finalSequences, holdoverSequences, this.beamWidth);
} else {
// check if this is the last group on the row
// and could end with
// a dash
boolean shouldBeHeldOver = false;
if (group.getIndex() == row.getGroups().size() - 1 && row.getIndex() < paragraph.getRows().size() - 1) {
for (LetterSequence letterSequence : finalSequences) {
if (letterSequence.toString().endsWith("-")) {
shouldBeHeldOver = true;
break;
}
}
}
if (shouldBeHeldOver) {
// defer the decision: bestSequence stays null for this group and the
// candidates are combined with those of the next analysed group
holdoverSequences = finalSequences;
holdoverGroup = group;
isHoldover = true;
} else {
// simplest case: no holdover
bestSequence = this.mostLikelyWordChooser.chooseMostLikelyWord(finalSequences, this.beamWidth);
}
}
// have we holdover sequences?
}
if (!isHoldover) {
for (LetterGuessObserver observer : observers) {
observer.onBeamSearchEnd(bestSequence, finalSequences, holdoverSequences);
}
}
// assign letter
if (!isHoldover) {
for (LetterGuessObserver observer : observers) {
observer.onStartSequence(bestSequence);
}
if (holdoverGroup == null) {
group.setBestLetterSequence(bestSequence);
} else {
// split bestSequence by group
List<LetterSequence> sequencesByGroup = bestSequence.splitByGroup();
for (LetterSequence sequenceByGroup : sequencesByGroup) {
if (sequenceByGroup.getGroups().get(0).equals(holdoverGroup))
holdoverGroup.setBestLetterSequence(sequenceByGroup);
else if (sequenceByGroup.getGroups().get(0).equals(group))
group.setBestLetterSequence(sequenceByGroup);
}
// the holdover has been consumed
holdoverSequences = null;
holdoverGroup = null;
}
// letters and underlying shapes are paired up by index
int i = 0;
for (ShapeInSequence shapeInSequence : bestSequence.getUnderlyingShapeSequence()) {
String bestOutcome = bestSequence.getLetters().get(i);
this.assignLetter(shapeInSequence, bestOutcome);
i++;
}
for (LetterGuessObserver observer : observers) {
observer.onGuessSequence(bestSequence);
}
}
// progress is counted in shapes, held-over groups included
this.shapeCount += group.getShapes().size();
if (this.currentMonitor != null) {
double progress = (double) shapeCount / (double) totalShapeCount;
LOG.debug("progress: " + progress);
currentMonitor.setPercentComplete(progress);
}
}
// next group
}
// next row
}
for (LetterGuessObserver observer : observers) {
observer.onImageEnd();
}
}
Aggregations