use of com.joliciel.jochre.lang.Linguistics in project jochre by urieli.
the class LexiconErrorWriter method onGuessSequence.
/**
 * Records the outcome of one guessed letter sequence: writes a CSV row to the
 * relevant output writers and updates the error statistics.
 * <p>
 * The method makes three passes over the same sequence:
 * <ol>
 * <li>pass 0 always writes a row to {@code allWordWriter};</li>
 * <li>pass 1 writes a row to {@code allErrorWriter}, but only when the guessed
 * word differs from the real word;</li>
 * <li>pass 2 updates the {@code ErrorStatistics} counters and writes a row to
 * exactly one of the four known/unknown x error/correct writers.</li>
 * </ol>
 * Rows written in passes 0 and 1 carry two extra cells (known flag, error
 * flag) that the pass 2 rows do not have.
 *
 * @param bestSequence
 *            the letter sequence whose real and guessed words are compared
 * @throws RuntimeException
 *             wrapping any {@link IOException} raised while writing
 */
@Override
public void onGuessSequence(LetterSequence bestSequence) {
try {
// Frequency of the real word; stays 0 when no word chooser is configured.
int realFrequency = 0;
if (wordChooser != null)
realFrequency = wordChooser.getFrequency(bestSequence, false);
// A word is an "error" when the guessed word differs from the real word,
// and "known" when its frequency is strictly positive.
boolean error = !bestSequence.getRealWord().equals(bestSequence.getGuessedWord());
boolean known = realFrequency > 0;
// Bad segmentation at word level: the real sequence contains a "[" or a "|".
boolean badSeg = bestSequence.getRealSequence().contains("[") || bestSequence.getRealSequence().contains("|");
// Three passes: 0 = all words, 1 = all errors, 2 = statistics + category file.
for (int i = 0; i < 3; i++) {
Writer writer = null;
if (i == 0) {
writer = allWordWriter;
} else if (i == 1) {
if (error)
writer = allErrorWriter;
else
// Correct words are not written to the error file.
continue;
} else {
// j indexes the guessed letters in parallel with the underlying shapes.
int j = 0;
// Collect every statistics bucket this word contributes to: the global
// bucket, the current document's bucket, and each document group that
// contains the current document's id.
// NOTE(review): assumes errorMap already holds an entry for each of these
// keys; a missing entry would add null and fail on the first increment.
List<ErrorStatistics> statList = new ArrayList<LexiconErrorWriter.ErrorStatistics>();
statList.add(errorMap.get(ALL_GROUP));
statList.add(errorMap.get(currentDoc.getName()));
for (String docGroupName : documentGroups.keySet()) {
if (documentGroups.get(docGroupName).contains(currentDoc.getId()))
statList.add(errorMap.get(docGroupName));
}
// Count words flagged as having the right answer in the beam, then clear
// the flag so it is not counted again.
if (beamContainsRightWord) {
if (error) {
for (ErrorStatistics stats : statList) stats.answerInBeamErrorCount++;
} else {
for (ErrorStatistics stats : statList) stats.answerInBeamCorrectCount++;
}
beamContainsRightWord = false;
}
Linguistics linguistics = jochreSession.getLinguistics();
// Letter-level statistics: compare each shape's letter with the guess at
// the same position.
// NOTE(review): assumes getLetters() has at least as many entries as the
// underlying shape sequence - confirm.
for (ShapeInSequence shapeInSequence : bestSequence.getUnderlyingShapeSequence()) {
String letterGuess = bestSequence.getLetters().get(j++);
String letter = shapeInSequence.getShape().getLetter();
// Bad segmentation at letter level: the letter contains "|", is empty, or
// has several characters without being a registered dual-character letter.
boolean badSegLetter = letter.contains("|") || letter.length() == 0 || (letter.length() > 1 && !linguistics.getDualCharacterLetters().contains(letter));
if (letter.equals(letterGuess)) {
if (known) {
for (ErrorStatistics stats : statList) stats.knownWordCorrectLetterCount++;
} else {
for (ErrorStatistics stats : statList) stats.unknownWordCorrectLetterCount++;
}
if (badSegLetter) {
for (ErrorStatistics stats : statList) stats.badSegCorrectLetterCount++;
} else {
for (ErrorStatistics stats : statList) stats.goodSegCorrectLetterCount++;
}
} else {
if (known) {
for (ErrorStatistics stats : statList) stats.knownWordErrorLetterCount++;
} else {
for (ErrorStatistics stats : statList) stats.unknownWordErrorLetterCount++;
}
if (badSegLetter) {
for (ErrorStatistics stats : statList) stats.badSegErrorLetterCount++;
} else {
for (ErrorStatistics stats : statList) stats.goodSegErrorLetterCount++;
}
}
}
// Word-level statistics by known/unknown; the four branches are exhaustive,
// so writer is always assigned here.
if (error && known) {
for (ErrorStatistics stats : statList) stats.knownWordErrorCount++;
writer = knownWordErrorWriter;
} else if (error && !known) {
for (ErrorStatistics stats : statList) stats.unknownWordErrorCount++;
writer = unknownWordErrorWriter;
} else if (!error && known) {
for (ErrorStatistics stats : statList) stats.knownWordCorrectCount++;
writer = knownWordCorrectWriter;
} else if (!error && !known) {
for (ErrorStatistics stats : statList) stats.unknownWordCorrectCount++;
writer = unknownWordCorrectWriter;
}
// Word-level statistics by segmentation quality.
if (error) {
if (badSeg) {
for (ErrorStatistics stats : statList) stats.badSegErrorCount++;
} else {
for (ErrorStatistics stats : statList) stats.goodSegErrorCount++;
}
} else {
if (badSeg) {
for (ErrorStatistics stats : statList) stats.badSegCorrectCount++;
} else {
for (ErrorStatistics stats : statList) stats.goodSegCorrectCount++;
}
}
}
// Row cells, in order: real sequence, real word, guessed sequence, guessed word.
writer.write(CSV.format(bestSequence.getRealSequence()));
writer.write(CSV.format(bestSequence.getRealWord()));
writer.write(CSV.format(bestSequence.getGuessedSequence()));
writer.write(CSV.format(bestSequence.getGuessedWord()));
// Only the all-words and all-errors files carry the known/error flags; in
// the category files they are implied by the file itself.
if (i < 2) {
writer.write(CSV.format(known ? 1 : 0));
writer.write(CSV.format(error ? 1 : 0));
}
writer.write(CSV.format(realFrequency));
writer.write(CSV.format(bestSequence.getFrequency()));
// Location cells are taken from the first group of the sequence: document
// name, page index, paragraph index, row index, group index, group id.
// NOTE(review): assumes the sequence has at least one group - confirm.
GroupOfShapes group = bestSequence.getGroups().get(0);
writer.write(CSV.format(group.getRow().getParagraph().getImage().getPage().getDocument().getName()));
writer.write(CSV.format(group.getRow().getParagraph().getImage().getPage().getIndex()));
writer.write(CSV.format(group.getRow().getParagraph().getIndex()));
writer.write(CSV.format(group.getRow().getIndex()));
writer.write(CSV.format(group.getIndex()));
writer.write(CSV.format(group.getId()));
// Optional beam dump: guessed sequence, score and adjusted score for each
// final sequence, then an empty cell, then the same triple for each
// holdover sequence.
if (this.includeBeam) {
if (finalSequences != null) {
for (LetterSequence sequence : finalSequences) {
writer.write(CSV.format(sequence.getGuessedSequence()));
writer.write(CSV.format(sequence.getScore()));
writer.write(CSV.format(sequence.getAdjustedScore()));
}
}
writer.write(CSV.format(""));
if (holdoverSequences != null) {
for (LetterSequence sequence : holdoverSequences) {
writer.write(CSV.format(sequence.getGuessedSequence()));
writer.write(CSV.format(sequence.getScore()));
writer.write(CSV.format(sequence.getAdjustedScore()));
}
}
}
// End the row and flush after every row.
writer.write("\n");
writer.flush();
}
} catch (IOException e) {
// Log, then rethrow unchecked with the original exception as the cause.
LOG.error("Failed to write to LexiconErrorWriter", e);
throw new RuntimeException(e);
}
}
use of com.joliciel.jochre.lang.Linguistics in project jochre by urieli.
the class GraphicsDao method findShapesToSplit.
/**
 * Finds all shapes that need to be split.
 * <p>
 * A shape is selected when its letter is longer than one character, neither
 * starts nor ends with {@code |}, is not one of the dual-character letters
 * known to the current linguistics, and belongs to an image whose status is
 * training-validated, training-held-out or training-test. Results are ordered
 * by number of existing splits, then by letter, then by shape id.
 *
 * @param locale
 *            not used by this method
 * @return the matching shapes, as mapped by {@code ShapeMapper}
 */
public List<Shape> findShapesToSplit(Locale locale) {
  Set<String> dualLetters = this.jochreSession.getLinguistics().getDualCharacterLetters();
  boolean hasDualLetters = !dualLetters.isEmpty();
  MapSqlParameterSource params = new MapSqlParameterSource();

  StringBuilder query = new StringBuilder("SELECT ");
  query.append(SELECT_SHAPE);
  query.append(", count(split_id) as the_count FROM ocr_shape");
  query.append(" LEFT JOIN ocr_split on shape_id = split_shape_id");
  query.append(" LEFT JOIN ocr_group ON shape_group_id = group_id");
  query.append(" LEFT JOIN ocr_row ON group_row_id = row_id");
  query.append(" LEFT JOIN ocr_paragraph ON row_paragraph_id = paragraph_id");
  query.append(" LEFT JOIN ocr_image ON paragraph_image_id = image_id");
  query.append(" WHERE length(shape_letter)>1");
  query.append(" AND shape_letter not like '%|'");
  query.append(" AND shape_letter not like '|%'");

  // The exclusion clause and its parameter are only added when there is
  // something to exclude.
  if (hasDualLetters) {
    query.append(" AND shape_letter not in (:dual_character_letters)");
    params.addValue("dual_character_letters", dualLetters);
  }

  query.append(" AND image_imgstatus_id in (:image_imgstatus_id)");
  query.append(" GROUP BY ").append(SELECT_SHAPE);
  query.append(" ORDER BY the_count, shape_letter, shape_id");

  // Only images in one of the three training statuses are considered.
  List<Integer> trainingStatusIds = new ArrayList<>();
  trainingStatusIds.add(ImageStatus.TRAINING_VALIDATED.getId());
  trainingStatusIds.add(ImageStatus.TRAINING_HELD_OUT.getId());
  trainingStatusIds.add(ImageStatus.TRAINING_TEST.getId());
  params.addValue("image_imgstatus_id", trainingStatusIds);

  String sql = query.toString();
  LOG.debug(sql);
  logParameters(params);

  NamedParameterJdbcTemplate jt = new NamedParameterJdbcTemplate(this.getDataSource());
  return jt.query(sql, params, new ShapeMapper());
}
use of com.joliciel.jochre.lang.Linguistics in project jochre by urieli.
the class GroupOfShapes method getWordForIndex.
/**
 * Returns this group's word with leading and trailing punctuation removed,
 * where punctuation is whatever the current linguistics reports.
 * <p>
 * Edge cases: an empty word yields an empty string; a word made of a single
 * punctuation character is returned unchanged; a word made of two or more
 * punctuation characters yields an empty string.
 */
public String getWordForIndex() {
  Linguistics linguistics = jochreSession.getLinguistics();
  String word = this.getWord();
  int length = word.length();

  // Skip leading punctuation, but never move past the last character.
  int start = 0;
  while (start < length - 1 && linguistics.getPunctuation().contains(word.charAt(start))) {
    start++;
  }

  // Skip trailing punctuation; "end" is exclusive and never drops below 1
  // for a non-empty word.
  int end = length;
  while (end > 1 && linguistics.getPunctuation().contains(word.charAt(end - 1))) {
    end--;
  }

  // When the word is all punctuation the two scans cross; clamp so that the
  // substring is empty rather than invalid.
  return word.substring(Math.min(start, end), end);
}
use of com.joliciel.jochre.lang.Linguistics in project jochre by urieli.
the class LetterSequence method getRealSequence.
/**
 * Builds, on first call, and then returns the cached string form of the real
 * sequence behind this letter sequence (including split letters and
 * inkspots).
 * <p>
 * Each original shape contributes its letter once, even when it is listed
 * several times in a row. A letter is written as-is when it is a single
 * character or a known dual-character letter; an empty letter is written as
 * {@code []} and any other multi-character letter is wrapped in square
 * brackets.
 */
public String getRealSequence() {
  if (realSequence != null) {
    return realSequence;
  }

  Linguistics linguistics = jochreSession.getLinguistics();
  StringBuilder sb = new StringBuilder();
  Shape previous = null;
  for (ShapeInSequence shapeInSequence : this.getUnderlyingShapeSequence()) {
    for (Shape current : shapeInSequence.getOriginalShapes()) {
      boolean repeated = current.equals(previous);
      previous = current;
      if (repeated) {
        // Same original shape as the one just handled: already written.
        continue;
      }

      String letter = current.getLetter();
      boolean plain = letter.length() == 1
          || (letter.length() > 1 && linguistics.getDualCharacterLetters().contains(letter));
      if (plain) {
        sb.append(letter);
      } else {
        // Covers both the empty letter ("[]") and unrecognised
        // multi-character letters ("[xyz]").
        sb.append('[').append(letter).append(']');
      }
    }
  }

  realSequence = sb.toString();
  return realSequence;
}
use of com.joliciel.jochre.lang.Linguistics in project jochre by urieli.
the class LetterSequence method getGuessedSequence.
/**
 * Builds, on first call, and then returns the cached string form of the
 * guessed sequence behind this letter sequence (including split letters and
 * inkspots).
 * <p>
 * The letter at the end-of-line hyphen index is left out when the hyphen is
 * soft. A letter is written as-is when it is a single character or a known
 * dual-character letter; an empty letter is written as {@code []} and any
 * other multi-character letter is wrapped in square brackets.
 */
public String getGuessedSequence() {
  if (guessedSequence != null) {
    return guessedSequence;
  }

  Linguistics linguistics = jochreSession.getLinguistics();
  StringBuilder sb = new StringBuilder();
  for (int pos = 0; pos < letters.size(); pos++) {
    String letter = letters.get(pos);

    boolean droppedSoftHyphen = pos == this.endOfLineHyphenIndex && this.softHyphen;
    if (!droppedSoftHyphen) {
      boolean plain = letter.length() == 1
          || (letter.length() > 1 && linguistics.getDualCharacterLetters().contains(letter));
      if (plain) {
        sb.append(letter);
      } else {
        // Covers both the empty letter ("[]") and unrecognised
        // multi-character letters ("[xyz]").
        sb.append('[').append(letter).append(']');
      }
    }
  }

  guessedSequence = sb.toString();
  return guessedSequence;
}
Aggregations