Search in sources:

Example 1 with Linguistics

use of com.joliciel.jochre.lang.Linguistics in project jochre by urieli.

the class LexiconErrorWriter method onGuessSequence.

@Override
public void onGuessSequence(LetterSequence bestSequence) {
    try {
        int realFrequency = 0;
        if (wordChooser != null)
            realFrequency = wordChooser.getFrequency(bestSequence, false);
        boolean error = !bestSequence.getRealWord().equals(bestSequence.getGuessedWord());
        boolean known = realFrequency > 0;
        boolean badSeg = bestSequence.getRealSequence().contains("[") || bestSequence.getRealSequence().contains("|");
        for (int i = 0; i < 3; i++) {
            Writer writer = null;
            if (i == 0) {
                writer = allWordWriter;
            } else if (i == 1) {
                if (error)
                    writer = allErrorWriter;
                else
                    continue;
            } else {
                int j = 0;
                List<ErrorStatistics> statList = new ArrayList<LexiconErrorWriter.ErrorStatistics>();
                statList.add(errorMap.get(ALL_GROUP));
                statList.add(errorMap.get(currentDoc.getName()));
                for (String docGroupName : documentGroups.keySet()) {
                    if (documentGroups.get(docGroupName).contains(currentDoc.getId()))
                        statList.add(errorMap.get(docGroupName));
                }
                if (beamContainsRightWord) {
                    if (error) {
                        for (ErrorStatistics stats : statList) stats.answerInBeamErrorCount++;
                    } else {
                        for (ErrorStatistics stats : statList) stats.answerInBeamCorrectCount++;
                    }
                    beamContainsRightWord = false;
                }
                Linguistics linguistics = jochreSession.getLinguistics();
                for (ShapeInSequence shapeInSequence : bestSequence.getUnderlyingShapeSequence()) {
                    String letterGuess = bestSequence.getLetters().get(j++);
                    String letter = shapeInSequence.getShape().getLetter();
                    boolean badSegLetter = letter.contains("|") || letter.length() == 0 || (letter.length() > 1 && !linguistics.getDualCharacterLetters().contains(letter));
                    if (letter.equals(letterGuess)) {
                        if (known) {
                            for (ErrorStatistics stats : statList) stats.knownWordCorrectLetterCount++;
                        } else {
                            for (ErrorStatistics stats : statList) stats.unknownWordCorrectLetterCount++;
                        }
                        if (badSegLetter) {
                            for (ErrorStatistics stats : statList) stats.badSegCorrectLetterCount++;
                        } else {
                            for (ErrorStatistics stats : statList) stats.goodSegCorrectLetterCount++;
                        }
                    } else {
                        if (known) {
                            for (ErrorStatistics stats : statList) stats.knownWordErrorLetterCount++;
                        } else {
                            for (ErrorStatistics stats : statList) stats.unknownWordErrorLetterCount++;
                        }
                        if (badSegLetter) {
                            for (ErrorStatistics stats : statList) stats.badSegErrorLetterCount++;
                        } else {
                            for (ErrorStatistics stats : statList) stats.goodSegErrorLetterCount++;
                        }
                    }
                }
                if (error && known) {
                    for (ErrorStatistics stats : statList) stats.knownWordErrorCount++;
                    writer = knownWordErrorWriter;
                } else if (error && !known) {
                    for (ErrorStatistics stats : statList) stats.unknownWordErrorCount++;
                    writer = unknownWordErrorWriter;
                } else if (!error && known) {
                    for (ErrorStatistics stats : statList) stats.knownWordCorrectCount++;
                    writer = knownWordCorrectWriter;
                } else if (!error && !known) {
                    for (ErrorStatistics stats : statList) stats.unknownWordCorrectCount++;
                    writer = unknownWordCorrectWriter;
                }
                if (error) {
                    if (badSeg) {
                        for (ErrorStatistics stats : statList) stats.badSegErrorCount++;
                    } else {
                        for (ErrorStatistics stats : statList) stats.goodSegErrorCount++;
                    }
                } else {
                    if (badSeg) {
                        for (ErrorStatistics stats : statList) stats.badSegCorrectCount++;
                    } else {
                        for (ErrorStatistics stats : statList) stats.goodSegCorrectCount++;
                    }
                }
            }
            writer.write(CSV.format(bestSequence.getRealSequence()));
            writer.write(CSV.format(bestSequence.getRealWord()));
            writer.write(CSV.format(bestSequence.getGuessedSequence()));
            writer.write(CSV.format(bestSequence.getGuessedWord()));
            if (i < 2) {
                writer.write(CSV.format(known ? 1 : 0));
                writer.write(CSV.format(error ? 1 : 0));
            }
            writer.write(CSV.format(realFrequency));
            writer.write(CSV.format(bestSequence.getFrequency()));
            GroupOfShapes group = bestSequence.getGroups().get(0);
            writer.write(CSV.format(group.getRow().getParagraph().getImage().getPage().getDocument().getName()));
            writer.write(CSV.format(group.getRow().getParagraph().getImage().getPage().getIndex()));
            writer.write(CSV.format(group.getRow().getParagraph().getIndex()));
            writer.write(CSV.format(group.getRow().getIndex()));
            writer.write(CSV.format(group.getIndex()));
            writer.write(CSV.format(group.getId()));
            if (this.includeBeam) {
                if (finalSequences != null) {
                    for (LetterSequence sequence : finalSequences) {
                        writer.write(CSV.format(sequence.getGuessedSequence()));
                        writer.write(CSV.format(sequence.getScore()));
                        writer.write(CSV.format(sequence.getAdjustedScore()));
                    }
                }
                writer.write(CSV.format(""));
                if (holdoverSequences != null) {
                    for (LetterSequence sequence : holdoverSequences) {
                        writer.write(CSV.format(sequence.getGuessedSequence()));
                        writer.write(CSV.format(sequence.getScore()));
                        writer.write(CSV.format(sequence.getAdjustedScore()));
                    }
                }
            }
            writer.write("\n");
            writer.flush();
        }
    } catch (IOException e) {
        LOG.error("Failed to write to LexiconErrorWriter", e);
        throw new RuntimeException(e);
    }
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) IOException(java.io.IOException) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) Linguistics(com.joliciel.jochre.lang.Linguistics) ArrayList(java.util.ArrayList) List(java.util.List) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) ShapeInSequence(com.joliciel.jochre.boundaries.ShapeInSequence)

Example 2 with Linguistics

use of com.joliciel.jochre.lang.Linguistics in project jochre by urieli.

the class GraphicsDao method findShapesToSplit.

/**
 * Return a list of all shapes that need to be split.
 */
public List<Shape> findShapesToSplit(Locale locale) {
    NamedParameterJdbcTemplate jt = new NamedParameterJdbcTemplate(this.getDataSource());
    Linguistics linguistics = this.jochreSession.getLinguistics();
    Set<String> dualCharacterLetters = linguistics.getDualCharacterLetters();
    String sql = "SELECT " + SELECT_SHAPE + ", count(split_id) as the_count FROM ocr_shape" + " LEFT JOIN ocr_split on shape_id = split_shape_id" + " LEFT JOIN ocr_group ON shape_group_id = group_id" + " LEFT JOIN ocr_row ON group_row_id = row_id" + " LEFT JOIN ocr_paragraph ON row_paragraph_id = paragraph_id" + " LEFT JOIN ocr_image ON paragraph_image_id = image_id" + " WHERE length(shape_letter)>1" + " AND shape_letter not like '%|'" + " AND shape_letter not like '|%'";
    if (dualCharacterLetters.size() > 0)
        sql += " AND shape_letter not in (:dual_character_letters)";
    sql += " AND image_imgstatus_id in (:image_imgstatus_id)" + " GROUP BY " + SELECT_SHAPE + " ORDER BY the_count, shape_letter, shape_id";
    MapSqlParameterSource paramSource = new MapSqlParameterSource();
    if (dualCharacterLetters.size() > 0)
        paramSource.addValue("dual_character_letters", linguistics.getDualCharacterLetters());
    List<Integer> imageStatusList = new ArrayList<>();
    imageStatusList.add(ImageStatus.TRAINING_VALIDATED.getId());
    imageStatusList.add(ImageStatus.TRAINING_HELD_OUT.getId());
    imageStatusList.add(ImageStatus.TRAINING_TEST.getId());
    paramSource.addValue("image_imgstatus_id", imageStatusList);
    LOG.debug(sql);
    logParameters(paramSource);
    List<Shape> shapes = jt.query(sql, paramSource, new ShapeMapper());
    return shapes;
}
Also used : MapSqlParameterSource(org.springframework.jdbc.core.namedparam.MapSqlParameterSource) NamedParameterJdbcTemplate(org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate) Linguistics(com.joliciel.jochre.lang.Linguistics) ArrayList(java.util.ArrayList)

Example 3 with Linguistics

use of com.joliciel.jochre.lang.Linguistics in project jochre by urieli.

the class GroupOfShapes method getWordForIndex.

/**
 * Returns the word excluding opening and closing punctuation.
 */
public String getWordForIndex() {
    Linguistics linguistics = jochreSession.getLinguistics();
    String word = this.getWord();
    int wordStart = 0;
    for (int i = 0; i < word.length(); i++) {
        wordStart = i;
        char c = word.charAt(i);
        if (linguistics.getPunctuation().contains(c)) {
            continue;
        }
        break;
    }
    int wordEnd = word.length() - 1;
    for (int i = word.length() - 1; i >= 0; i--) {
        wordEnd = i;
        char c = word.charAt(i);
        if (linguistics.getPunctuation().contains(c)) {
            continue;
        }
        break;
    }
    wordEnd += 1;
    if (wordStart > wordEnd)
        wordStart = wordEnd;
    String wordForIndex = word.substring(wordStart, wordEnd);
    return wordForIndex;
}
Also used : Linguistics(com.joliciel.jochre.lang.Linguistics)

Example 4 with Linguistics

use of com.joliciel.jochre.lang.Linguistics in project jochre by urieli.

the class LetterSequence method getRealSequence.

/**
 * A string representation of the real sequence behind this letter sequence
 * (including split letters and inkspots).
 */
public String getRealSequence() {
    if (realSequence == null) {
        Linguistics linguistics = jochreSession.getLinguistics();
        StringBuilder realWordBuilder = new StringBuilder();
        Shape lastShape = null;
        for (ShapeInSequence shapeInSequence : this.getUnderlyingShapeSequence()) {
            for (Shape originalShape : shapeInSequence.getOriginalShapes()) {
                if (!originalShape.equals(lastShape)) {
                    String letter = originalShape.getLetter();
                    if (letter.length() == 0)
                        realWordBuilder.append("[]");
                    else if (letter.length() > 1 && !linguistics.getDualCharacterLetters().contains(letter))
                        realWordBuilder.append("[" + letter + "]");
                    else
                        realWordBuilder.append(letter);
                }
                lastShape = originalShape;
            }
        }
        realSequence = realWordBuilder.toString();
    }
    return realSequence;
}
Also used : Shape(com.joliciel.jochre.graphics.Shape) Linguistics(com.joliciel.jochre.lang.Linguistics) ShapeInSequence(com.joliciel.jochre.boundaries.ShapeInSequence)

Example 5 with Linguistics

use of com.joliciel.jochre.lang.Linguistics in project jochre by urieli.

the class LetterSequence method getGuessedSequence.

/**
 * A string representation of the guessed sequence behind this letter
 * sequence (including split letters and inkspots).
 */
public String getGuessedSequence() {
    if (guessedSequence == null) {
        Linguistics linguistics = jochreSession.getLinguistics();
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < letters.size(); i++) {
            String letter = letters.get(i);
            if (i == this.endOfLineHyphenIndex) {
                if (this.softHyphen) {
                    continue;
                }
            }
            if (letter.length() == 0)
                builder.append("[]");
            else if (letter.length() > 1 && !linguistics.getDualCharacterLetters().contains(letter))
                builder.append("[" + letter + "]");
            else
                builder.append(letter);
        }
        guessedSequence = builder.toString();
    }
    return guessedSequence;
}
Also used : Linguistics(com.joliciel.jochre.lang.Linguistics)

Aggregations

Linguistics (com.joliciel.jochre.lang.Linguistics): 5, ShapeInSequence (com.joliciel.jochre.boundaries.ShapeInSequence): 2, ArrayList (java.util.ArrayList): 2, GroupOfShapes (com.joliciel.jochre.graphics.GroupOfShapes): 1, Shape (com.joliciel.jochre.graphics.Shape): 1, LetterSequence (com.joliciel.jochre.letterGuesser.LetterSequence): 1, BufferedWriter (java.io.BufferedWriter): 1, IOException (java.io.IOException): 1, OutputStreamWriter (java.io.OutputStreamWriter): 1, Writer (java.io.Writer): 1, List (java.util.List): 1, MapSqlParameterSource (org.springframework.jdbc.core.namedparam.MapSqlParameterSource): 1, NamedParameterJdbcTemplate (org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate): 1