Search in sources :

Example 1 with LetterSequence

use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.

the class LexiconErrorWriter method onBeamSearchEnd.

/**
 * Stores the beams produced by a finished beam search and records whether
 * they contain the correct word.
 * <p>
 * The flag {@code beamContainsRightWord} ends up {@code true} only when some
 * final sequence has a guessed word equal to its real word and, if holdover
 * sequences were supplied, some holdover sequence does as well.
 *
 * @param bestSequence
 *            the sequence selected by the search (not read here)
 * @param finalSequences
 *            the final beam; must not be null
 * @param holdoverSequences
 *            the holdover beam; may be null or empty
 */
@Override
public void onBeamSearchEnd(LetterSequence bestSequence, List<LetterSequence> finalSequences, List<LetterSequence> holdoverSequences) {
    this.beamContainsRightWord = false;
    this.finalSequences = finalSequences;
    this.holdoverSequences = holdoverSequences;

    boolean foundInFinalBeam = false;
    for (LetterSequence candidate : finalSequences) {
        if (candidate.getRealWord().equals(candidate.getGuessedWord())) {
            foundInFinalBeam = true;
            break;
        }
    }
    if (!foundInFinalBeam) {
        // nothing correct in the final beam: the flag stays false
        return;
    }

    if (holdoverSequences == null || holdoverSequences.size() == 0) {
        // no holdovers to check, so the final beam alone decides
        this.beamContainsRightWord = true;
        return;
    }

    // with holdovers present, one of them has to be correct too
    for (LetterSequence holdover : holdoverSequences) {
        if (holdover.getRealWord().equals(holdover.getGuessedWord())) {
            this.beamContainsRightWord = true;
            return;
        }
    }
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence)

Example 2 with LetterSequence

use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.

the class LexiconErrorWriter method onGuessSequence.

/**
 * Writes the guessed sequence to the relevant CSV writers and updates the
 * error statistics.
 * <p>
 * Up to three rows are written per call: one to {@code allWordWriter}, one to
 * {@code allErrorWriter} when the guessed word differs from the real word, and
 * one to the writer matching the known/unknown and error/correct combination.
 * Statistics are updated during the third pass only.
 *
 * @param bestSequence
 *            the sequence that was guessed; its first group is used to locate
 *            the word in the document
 * @throws RuntimeException
 *             wrapping any {@link IOException} raised while writing
 */
@Override
public void onGuessSequence(LetterSequence bestSequence) {
    try {
        // Frequency reported by the word chooser; remains 0 when no word chooser is set.
        int realFrequency = 0;
        if (wordChooser != null)
            realFrequency = wordChooser.getFrequency(bestSequence, false);
        // error: guessed word differs from the real word
        boolean error = !bestSequence.getRealWord().equals(bestSequence.getGuessedWord());
        // known: the word chooser reported a positive frequency
        boolean known = realFrequency > 0;
        // NOTE(review): "[" and "|" in the real sequence presumably mark segmentation problems - confirm
        boolean badSeg = bestSequence.getRealSequence().contains("[") || bestSequence.getRealSequence().contains("|");
        // Pass 0: all-words writer. Pass 1: all-errors writer (errors only).
        // Pass 2: update statistics and pick the known/unknown x error/correct writer.
        for (int i = 0; i < 3; i++) {
            Writer writer = null;
            if (i == 0) {
                writer = allWordWriter;
            } else if (i == 1) {
                if (error)
                    writer = allErrorWriter;
                else
                    continue;
            } else {
                // index into bestSequence.getLetters(), advanced in step with the shape loop below
                int j = 0;
                // Statistics to update: the global group, the current document,
                // and every document group that contains the current document's id.
                // NOTE(review): values returned by errorMap.get(...) are not null-checked here.
                List<ErrorStatistics> statList = new ArrayList<LexiconErrorWriter.ErrorStatistics>();
                statList.add(errorMap.get(ALL_GROUP));
                statList.add(errorMap.get(currentDoc.getName()));
                for (String docGroupName : documentGroups.keySet()) {
                    if (documentGroups.get(docGroupName).contains(currentDoc.getId()))
                        statList.add(errorMap.get(docGroupName));
                }
                if (beamContainsRightWord) {
                    if (error) {
                        for (ErrorStatistics stats : statList) stats.answerInBeamErrorCount++;
                    } else {
                        for (ErrorStatistics stats : statList) stats.answerInBeamCorrectCount++;
                    }
                    // reset the flag once it has been counted
                    beamContainsRightWord = false;
                }
                Linguistics linguistics = jochreSession.getLinguistics();
                // Letter-level statistics: compare each shape's letter with the guess at the same position.
                for (ShapeInSequence shapeInSequence : bestSequence.getUnderlyingShapeSequence()) {
                    String letterGuess = bestSequence.getLetters().get(j++);
                    String letter = shapeInSequence.getShape().getLetter();
                    // A letter counts as badly segmented when it contains "|", is empty,
                    // or has several characters without being a listed dual-character letter.
                    boolean badSegLetter = letter.contains("|") || letter.length() == 0 || (letter.length() > 1 && !linguistics.getDualCharacterLetters().contains(letter));
                    if (letter.equals(letterGuess)) {
                        if (known) {
                            for (ErrorStatistics stats : statList) stats.knownWordCorrectLetterCount++;
                        } else {
                            for (ErrorStatistics stats : statList) stats.unknownWordCorrectLetterCount++;
                        }
                        if (badSegLetter) {
                            for (ErrorStatistics stats : statList) stats.badSegCorrectLetterCount++;
                        } else {
                            for (ErrorStatistics stats : statList) stats.goodSegCorrectLetterCount++;
                        }
                    } else {
                        if (known) {
                            for (ErrorStatistics stats : statList) stats.knownWordErrorLetterCount++;
                        } else {
                            for (ErrorStatistics stats : statList) stats.unknownWordErrorLetterCount++;
                        }
                        if (badSegLetter) {
                            for (ErrorStatistics stats : statList) stats.badSegErrorLetterCount++;
                        } else {
                            for (ErrorStatistics stats : statList) stats.goodSegErrorLetterCount++;
                        }
                    }
                }
                // Word-level statistics by known/unknown; this also selects the writer for this pass.
                if (error && known) {
                    for (ErrorStatistics stats : statList) stats.knownWordErrorCount++;
                    writer = knownWordErrorWriter;
                } else if (error && !known) {
                    for (ErrorStatistics stats : statList) stats.unknownWordErrorCount++;
                    writer = unknownWordErrorWriter;
                } else if (!error && known) {
                    for (ErrorStatistics stats : statList) stats.knownWordCorrectCount++;
                    writer = knownWordCorrectWriter;
                } else if (!error && !known) {
                    for (ErrorStatistics stats : statList) stats.unknownWordCorrectCount++;
                    writer = unknownWordCorrectWriter;
                }
                // Word-level statistics by segmentation quality.
                if (error) {
                    if (badSeg) {
                        for (ErrorStatistics stats : statList) stats.badSegErrorCount++;
                    } else {
                        for (ErrorStatistics stats : statList) stats.goodSegErrorCount++;
                    }
                } else {
                    if (badSeg) {
                        for (ErrorStatistics stats : statList) stats.badSegCorrectCount++;
                    } else {
                        for (ErrorStatistics stats : statList) stats.goodSegCorrectCount++;
                    }
                }
            }
            // Row layout: real sequence, real word, guessed sequence, guessed word, ...
            writer.write(CSV.format(bestSequence.getRealSequence()));
            writer.write(CSV.format(bestSequence.getRealWord()));
            writer.write(CSV.format(bestSequence.getGuessedSequence()));
            writer.write(CSV.format(bestSequence.getGuessedWord()));
            // ... known/error flags, written only for the two "all" writers (passes 0 and 1) ...
            if (i < 2) {
                writer.write(CSV.format(known ? 1 : 0));
                writer.write(CSV.format(error ? 1 : 0));
            }
            // ... the two frequencies, then the location of the sequence's first group.
            writer.write(CSV.format(realFrequency));
            writer.write(CSV.format(bestSequence.getFrequency()));
            GroupOfShapes group = bestSequence.getGroups().get(0);
            writer.write(CSV.format(group.getRow().getParagraph().getImage().getPage().getDocument().getName()));
            writer.write(CSV.format(group.getRow().getParagraph().getImage().getPage().getIndex()));
            writer.write(CSV.format(group.getRow().getParagraph().getIndex()));
            writer.write(CSV.format(group.getRow().getIndex()));
            writer.write(CSV.format(group.getIndex()));
            writer.write(CSV.format(group.getId()));
            // Optionally append the stored beams: final sequences, an empty cell, then holdover sequences.
            if (this.includeBeam) {
                if (finalSequences != null) {
                    for (LetterSequence sequence : finalSequences) {
                        writer.write(CSV.format(sequence.getGuessedSequence()));
                        writer.write(CSV.format(sequence.getScore()));
                        writer.write(CSV.format(sequence.getAdjustedScore()));
                    }
                }
                writer.write(CSV.format(""));
                if (holdoverSequences != null) {
                    for (LetterSequence sequence : holdoverSequences) {
                        writer.write(CSV.format(sequence.getGuessedSequence()));
                        writer.write(CSV.format(sequence.getScore()));
                        writer.write(CSV.format(sequence.getAdjustedScore()));
                    }
                }
            }
            writer.write("\n");
            writer.flush();
        }
    } catch (IOException e) {
        // log, then rethrow unchecked with the original exception as cause
        LOG.error("Failed to write to LexiconErrorWriter", e);
        throw new RuntimeException(e);
    }
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) IOException(java.io.IOException) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) Linguistics(com.joliciel.jochre.lang.Linguistics) ArrayList(java.util.ArrayList) List(java.util.List) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) ShapeInSequence(com.joliciel.jochre.boundaries.ShapeInSequence)

Example 3 with LetterSequence

use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.

the class GroupOfShapes method getSubsequences.

/**
 * Returns the subsequences of this group's best letter sequence.
 *
 * @return the list supplied by the best letter sequence, or a new empty list
 *         when this group has no best letter sequence
 */
public List<LetterSequence> getSubsequences() {
    LetterSequence best = this.getBestLetterSequence();
    if (best == null) {
        return new ArrayList<>();
    }
    return best.getSubsequences();
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) ArrayList(java.util.ArrayList)

Example 4 with LetterSequence

use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.

the class MostLikelyWordChooser method chooseMostLikelyWord.

/**
 * Selects the most likely letter sequence for a word that may straddle a line
 * break, given the holdover candidates from the previous row and the
 * candidates for the first heap of the current row.
 * <p>
 * Only the first {@code n} holdover candidates are classified by whether
 * their text ends with a dash. Each dash-terminated holdover is joined with
 * every current-row candidate, and at most {@code n} of these combinations
 * are retained. If none of the classified holdovers lacks a final dash, the
 * best combination is returned. Otherwise the best combination competes with
 * the pairing of the best holdover and the best current-row candidate, the
 * pairing's score being scaled by the frequency adjustment of the rarer of
 * the two words.
 *
 * @param heap
 *            the current row's first heap
 * @param holdoverHeap
 *            the previous row's holdover heap, at least some of whose
 *            sequences end with a dash
 * @param n
 *            the number of sequences to consider in each heap
 * @return a letter sequence covering both heaps, either as a combined word
 *         (with a dash in the middle) or as two separate words
 */
public LetterSequence chooseMostLikelyWord(List<LetterSequence> heap, List<LetterSequence> holdoverHeap, int n) {
    List<LetterSequence> holdoverWithDash = new ArrayList<>(n);
    List<LetterSequence> holdoverWithoutDash = new ArrayList<>(n);

    // classify the first n holdover candidates by their final character
    int remaining = n;
    for (LetterSequence holdover : holdoverHeap) {
        if (remaining <= 0) {
            break;
        }
        remaining--;
        if (holdover.toString().endsWith("-")) {
            holdoverWithDash.add(holdover);
        } else {
            holdoverWithoutDash.add(holdover);
        }
    }

    // join each dash-terminated holdover with every candidate of the current row
    PriorityQueue<LetterSequence> combinedHeap = new PriorityQueue<>();
    for (LetterSequence dashed : holdoverWithDash) {
        // locate the last dash letter so that it can be marked as the end-of-line hyphen
        int hyphenIndex = dashed.getLetters().size() - 1;
        while (hyphenIndex >= 0 && !dashed.getLetters().get(hyphenIndex).equals("-")) {
            hyphenIndex--;
        }
        if (hyphenIndex >= 0) {
            dashed.setEndOfLineHyphenIndex(hyphenIndex);
        }
        for (LetterSequence nextRowCandidate : heap) {
            combinedHeap.add(new LetterSequence(dashed, nextRowCandidate));
        }
    }

    // retain at most n combinations, in the order the queue hands them out
    List<LetterSequence> combinedSequences = new ArrayList<>();
    while (combinedSequences.size() < n && !combinedHeap.isEmpty()) {
        combinedSequences.add(combinedHeap.poll());
    }

    LetterSequence combinedBest = this.chooseMostLikelyWord(combinedSequences, n);
    if (holdoverWithoutDash.size() == 0) {
        // every classified holdover ends with a dash, so the two rows must be combined
        return combinedBest;
    }

    // Some holdovers end with a dash and others do not, so the combined reading
    // has to be compared with the two rows read separately. The separate reading
    // is taken from the complete holdover heap rather than only from the dashless
    // holdovers: restricting it to dashless ones biased the outcome towards them,
    // especially where one of the halves is itself a common word (e.g. der in
    // Yiddish).
    LetterSequence holdoverBest = this.chooseMostLikelyWord(holdoverHeap, n);
    LetterSequence nextRowBest = this.chooseMostLikelyWord(heap, n);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Best combined: " + combinedBest.toString() + ". Adjusted score: " + combinedBest.getAdjustedScore());
        LOG.debug("Best seq1 separate: " + holdoverBest.toString() + ". Adjusted score: " + holdoverBest.getAdjustedScore());
        LOG.debug("Best seq2 separate: " + nextRowBest.toString() + ". Adjusted score: " + nextRowBest.getAdjustedScore());
    }

    // To make the separate reading comparable with the combined one, score the
    // two separate sequences as a single sequence and scale that score by the
    // frequency adjustment of the less frequent word.
    LetterSequence separateSequence = new LetterSequence(holdoverBest, nextRowBest);
    int minFrequency;
    if (holdoverBest.getFrequency() < nextRowBest.getFrequency()) {
        minFrequency = holdoverBest.getFrequency();
    } else {
        minFrequency = nextRowBest.getFrequency();
    }
    double freqLog = this.getFrequencyAdjustment(minFrequency);
    double separateAdjustedScore = separateSequence.getScore() * freqLog;
    separateSequence.setAdjustedScore(separateAdjustedScore);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Best separate: " + separateSequence.toString() + ". Score: " + separateSequence.getScore() + ". Freq: " + minFrequency + ". Adjusted: " + freqLog + ". Adjusted score: " + separateSequence.getAdjustedScore());
    }

    LetterSequence chosen;
    if (combinedBest.getAdjustedScore() > separateAdjustedScore) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Using combined sequence");
        }
        chosen = combinedBest;
    } else {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Using separate sequences");
        }
        // a fresh pairing is returned, not the scored separateSequence above
        chosen = new LetterSequence(holdoverBest, nextRowBest);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Best with holdover: " + chosen.toString());
    }
    return chosen;
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) ArrayList(java.util.ArrayList) PriorityQueue(java.util.PriorityQueue)

Example 5 with LetterSequence

use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.

the class UnknownWordListWriter method onImageComplete.

/**
 * Writes out, one per line, every word of the image whose recorded frequency
 * count is zero.
 * <p>
 * Junk paragraphs and groups without a best letter sequence are skipped. The
 * writer is flushed after each word.
 *
 * @param image
 *            the image whose paragraphs, rows and groups are scanned
 * @throws RuntimeException
 *             wrapping any {@link IOException} raised while writing
 */
@Override
public void onImageComplete(JochreImage image) {
    try {
        for (Paragraph paragraph : image.getParagraphs()) {
            if (paragraph.isJunk()) {
                continue;
            }
            for (RowOfShapes row : paragraph.getRows()) {
                for (GroupOfShapes group : row.getGroups()) {
                    if (group.getBestLetterSequence() == null) {
                        continue;
                    }
                    for (LetterSequence part : group.getBestLetterSequence().getSubsequences()) {
                        for (CountedOutcome<String> counted : part.getWordFrequencies()) {
                            if (counted.getCount() != 0) {
                                continue;
                            }
                            writer.write(counted.getOutcome() + "\n");
                            writer.flush();
                        }
                    }
                }
            }
        }
    } catch (IOException ioException) {
        LOG.error("Failed to write to UnknownWordListWriter", ioException);
        throw new RuntimeException(ioException);
    }
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) RowOfShapes(com.joliciel.jochre.graphics.RowOfShapes) IOException(java.io.IOException) Paragraph(com.joliciel.jochre.graphics.Paragraph)

Aggregations

LetterSequence (com.joliciel.jochre.letterGuesser.LetterSequence): 9, ArrayList (java.util.ArrayList): 5, GroupOfShapes (com.joliciel.jochre.graphics.GroupOfShapes): 4, Shape (com.joliciel.jochre.graphics.Shape): 3, ShapeInSequence (com.joliciel.jochre.boundaries.ShapeInSequence): 2, Paragraph (com.joliciel.jochre.graphics.Paragraph): 2, RowOfShapes (com.joliciel.jochre.graphics.RowOfShapes): 2, IOException (java.io.IOException): 2, List (java.util.List): 2, PriorityQueue (java.util.PriorityQueue): 2, TreeMap (java.util.TreeMap): 2, ShapeSequence (com.joliciel.jochre.boundaries.ShapeSequence): 1, Linguistics (com.joliciel.jochre.lang.Linguistics): 1, LetterGuesserContext (com.joliciel.jochre.letterGuesser.LetterGuesserContext): 1, Decision (com.joliciel.talismane.machineLearning.Decision): 1, RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment): 1, CountedOutcome (com.joliciel.talismane.utils.CountedOutcome): 1, BufferedWriter (java.io.BufferedWriter): 1, OutputStreamWriter (java.io.OutputStreamWriter): 1, Writer (java.io.Writer): 1