use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.
the class LexiconErrorWriter method onBeamSearchEnd.
/**
 * Records the beams produced by the search that just ended and works out
 * whether the correct word was present in them.
 * <p>
 * The flag {@code beamContainsRightWord} ends up {@code true} when some final
 * sequence has a guessed word equal to its real word and, if a non-empty
 * holdover beam was supplied, some holdover sequence does too.
 *
 * @param bestSequence
 *            the sequence selected by the search (not read here)
 * @param finalSequences
 *            the final beam; stored on this writer and scanned
 * @param holdoverSequences
 *            the holdover beam, possibly {@code null} or empty; stored on
 *            this writer and scanned only when the final beam matched
 */
@Override
public void onBeamSearchEnd(LetterSequence bestSequence, List<LetterSequence> finalSequences, List<LetterSequence> holdoverSequences) {
  beamContainsRightWord = false;
  this.finalSequences = finalSequences;
  this.holdoverSequences = holdoverSequences;

  boolean rightWordFound = hasCorrectlyGuessedSequence(finalSequences);
  boolean holdoverPresent = holdoverSequences != null && !holdoverSequences.isEmpty();
  if (rightWordFound && holdoverPresent) {
    // a holdover beam exists: it must contain a correct guess as well
    rightWordFound = hasCorrectlyGuessedSequence(holdoverSequences);
  }
  beamContainsRightWord = rightWordFound;
}

/**
 * Tells whether at least one of the given sequences has a guessed word equal
 * to its real word.
 */
private static boolean hasCorrectlyGuessedSequence(List<LetterSequence> candidates) {
  for (LetterSequence candidate : candidates) {
    if (candidate.getRealWord().equals(candidate.getGuessedWord())) {
      return true;
    }
  }
  return false;
}
use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.
the class LexiconErrorWriter method onGuessSequence.
/**
 * Reports a guessed sequence: writes one formatted line per applicable output
 * writer and updates the error statistics.
 * <p>
 * The method makes three passes over the same sequence:
 * <ul>
 * <li>pass 0 writes to {@code allWordWriter};</li>
 * <li>pass 1 writes to {@code allErrorWriter}, but only when the guessed word
 * differs from the real word;</li>
 * <li>pass 2 updates the statistics and writes to one of the four
 * known/unknown x error/correct writers.</li>
 * </ul>
 * Each line is terminated with "\n" and the writer is flushed after it.
 *
 * @param bestSequence
 *            the sequence to report
 * @throws RuntimeException
 *             wrapping any {@link IOException} raised while writing
 */
@Override
public void onGuessSequence(LetterSequence bestSequence) {
try {
// Frequency stays 0 when no word chooser is configured.
int realFrequency = 0;
if (wordChooser != null)
realFrequency = wordChooser.getFrequency(bestSequence, false);
// error: the guessed word is not the real word.
boolean error = !bestSequence.getRealWord().equals(bestSequence.getGuessedWord());
// known: the frequency obtained above is strictly positive.
boolean known = realFrequency > 0;
// badSeg: the real sequence contains a "[" or "|" marker.
boolean badSeg = bestSequence.getRealSequence().contains("[") || bestSequence.getRealSequence().contains("|");
for (int i = 0; i < 3; i++) {
Writer writer = null;
if (i == 0) {
writer = allWordWriter;
} else if (i == 1) {
// the all-errors output only receives wrongly guessed words
if (error)
writer = allErrorWriter;
else
continue;
} else {
// Pass 2: update statistics, then choose the category writer.
// j walks bestSequence.getLetters() in step with the underlying shapes.
int j = 0;
// Statistics to update: the ALL_GROUP entry, the current document's entry,
// and the entry of every document group containing the current document id.
// NOTE(review): errorMap.get(...) results are not null-checked here - presumably
// every key is registered before this is called; confirm against the caller.
List<ErrorStatistics> statList = new ArrayList<LexiconErrorWriter.ErrorStatistics>();
statList.add(errorMap.get(ALL_GROUP));
statList.add(errorMap.get(currentDoc.getName()));
for (String docGroupName : documentGroups.keySet()) {
if (documentGroups.get(docGroupName).contains(currentDoc.getId()))
statList.add(errorMap.get(docGroupName));
}
// Consume the beamContainsRightWord flag: count it, then reset it.
if (beamContainsRightWord) {
if (error) {
for (ErrorStatistics stats : statList) stats.answerInBeamErrorCount++;
} else {
for (ErrorStatistics stats : statList) stats.answerInBeamCorrectCount++;
}
beamContainsRightWord = false;
}
Linguistics linguistics = jochreSession.getLinguistics();
// Letter-level counts: compare each shape's letter with the guessed letter
// at the same position.
for (ShapeInSequence shapeInSequence : bestSequence.getUnderlyingShapeSequence()) {
// NOTE(review): assumes getLetters() has at least as many entries as there
// are underlying shapes - confirm.
String letterGuess = bestSequence.getLetters().get(j++);
String letter = shapeInSequence.getShape().getLetter();
// A letter counts as badly segmented when it contains "|", is empty, or has
// several characters without being one of the dual-character letters.
boolean badSegLetter = letter.contains("|") || letter.length() == 0 || (letter.length() > 1 && !linguistics.getDualCharacterLetters().contains(letter));
if (letter.equals(letterGuess)) {
if (known) {
for (ErrorStatistics stats : statList) stats.knownWordCorrectLetterCount++;
} else {
for (ErrorStatistics stats : statList) stats.unknownWordCorrectLetterCount++;
}
if (badSegLetter) {
for (ErrorStatistics stats : statList) stats.badSegCorrectLetterCount++;
} else {
for (ErrorStatistics stats : statList) stats.goodSegCorrectLetterCount++;
}
} else {
if (known) {
for (ErrorStatistics stats : statList) stats.knownWordErrorLetterCount++;
} else {
for (ErrorStatistics stats : statList) stats.unknownWordErrorLetterCount++;
}
if (badSegLetter) {
for (ErrorStatistics stats : statList) stats.badSegErrorLetterCount++;
} else {
for (ErrorStatistics stats : statList) stats.goodSegErrorLetterCount++;
}
}
}
// Word-level counts by error/known; the four cases are exhaustive, so
// writer is always assigned on this pass.
if (error && known) {
for (ErrorStatistics stats : statList) stats.knownWordErrorCount++;
writer = knownWordErrorWriter;
} else if (error && !known) {
for (ErrorStatistics stats : statList) stats.unknownWordErrorCount++;
writer = unknownWordErrorWriter;
} else if (!error && known) {
for (ErrorStatistics stats : statList) stats.knownWordCorrectCount++;
writer = knownWordCorrectWriter;
} else if (!error && !known) {
for (ErrorStatistics stats : statList) stats.unknownWordCorrectCount++;
writer = unknownWordCorrectWriter;
}
// Word-level counts by error/segmentation quality.
if (error) {
if (badSeg) {
for (ErrorStatistics stats : statList) stats.badSegErrorCount++;
} else {
for (ErrorStatistics stats : statList) stats.goodSegErrorCount++;
}
} else {
if (badSeg) {
for (ErrorStatistics stats : statList) stats.badSegCorrectCount++;
} else {
for (ErrorStatistics stats : statList) stats.goodSegCorrectCount++;
}
}
}
// Output line: real sequence, real word, guessed sequence, guessed word.
writer.write(CSV.format(bestSequence.getRealSequence()));
writer.write(CSV.format(bestSequence.getRealWord()));
writer.write(CSV.format(bestSequence.getGuessedSequence()));
writer.write(CSV.format(bestSequence.getGuessedWord()));
// The known/error flags are only written on passes 0 and 1; on pass 2 the
// choice of writer already encodes them.
if (i < 2) {
writer.write(CSV.format(known ? 1 : 0));
writer.write(CSV.format(error ? 1 : 0));
}
writer.write(CSV.format(realFrequency));
writer.write(CSV.format(bestSequence.getFrequency()));
// Location fields are taken from the first group of the sequence: document name,
// page index, paragraph index, row index, group index, group id.
GroupOfShapes group = bestSequence.getGroups().get(0);
writer.write(CSV.format(group.getRow().getParagraph().getImage().getPage().getDocument().getName()));
writer.write(CSV.format(group.getRow().getParagraph().getImage().getPage().getIndex()));
writer.write(CSV.format(group.getRow().getParagraph().getIndex()));
writer.write(CSV.format(group.getRow().getIndex()));
writer.write(CSV.format(group.getIndex()));
writer.write(CSV.format(group.getId()));
// Optional beam dump: (guessed sequence, score, adjusted score) for each final
// sequence, an empty field as separator, then the same triple for each holdover.
if (this.includeBeam) {
if (finalSequences != null) {
for (LetterSequence sequence : finalSequences) {
writer.write(CSV.format(sequence.getGuessedSequence()));
writer.write(CSV.format(sequence.getScore()));
writer.write(CSV.format(sequence.getAdjustedScore()));
}
}
writer.write(CSV.format(""));
if (holdoverSequences != null) {
for (LetterSequence sequence : holdoverSequences) {
writer.write(CSV.format(sequence.getGuessedSequence()));
writer.write(CSV.format(sequence.getScore()));
writer.write(CSV.format(sequence.getAdjustedScore()));
}
}
}
writer.write("\n");
writer.flush();
}
} catch (IOException e) {
// Log, then rethrow unchecked with the original exception as cause.
LOG.error("Failed to write to LexiconErrorWriter", e);
throw new RuntimeException(e);
}
}
use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.
the class GroupOfShapes method getSubsequences.
/**
 * Returns the subsequences of this group's best letter sequence.
 *
 * @return the list supplied by the best letter sequence, or a new empty list
 *         when this group has no best letter sequence
 */
public List<LetterSequence> getSubsequences() {
  LetterSequence best = this.getBestLetterSequence();
  if (best == null) {
    return new ArrayList<>();
  }
  return best.getSubsequences();
}
use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.
the class MostLikelyWordChooser method chooseMostLikelyWord.
/**
 * Chooses the most likely reading across a row break from two heaps: the
 * holdover from the previous row (some of whose sequences end with a dash)
 * and the first heap of the current row.
 * <p>
 * Only the first {@code n} holdover sequences are examined. Each of those
 * ending with "-" is joined with every sequence of the current row's heap,
 * and the best of the top {@code n} joined candidates is selected. If none of
 * the examined holdovers lacks a trailing dash, that joined candidate is
 * returned directly; otherwise it is compared, by adjusted score, against the
 * best holdover and best current-row sequences taken separately.
 *
 * @param heap
 *            the current row's first heap
 * @param holdoverHeap
 *            the previous row's holdover heap, at least some of whose
 *            sequences end with a dash
 * @param n
 *            the number of sequences to consider in each heap
 * @return a letter sequence covering both heaps, either as a combined word
 *         (with a dash in the middle) or as two separate words
 */
public LetterSequence chooseMostLikelyWord(List<LetterSequence> heap, List<LetterSequence> holdoverHeap, int n) {
  // Split the first n holdovers according to whether they end with a dash.
  List<LetterSequence> dashedHoldovers = new ArrayList<>(n);
  List<LetterSequence> plainHoldovers = new ArrayList<>(n);
  int examined = 0;
  for (LetterSequence candidate : holdoverHeap) {
    if (examined >= n) {
      break;
    }
    List<LetterSequence> target = candidate.toString().endsWith("-") ? dashedHoldovers : plainHoldovers;
    target.add(candidate);
    examined++;
  }

  // Join every dashed holdover with every current-row candidate.
  PriorityQueue<LetterSequence> joinedQueue = new PriorityQueue<>();
  for (LetterSequence dashed : dashedHoldovers) {
    // locate the last "-" letter: it is the end-of-line hyphen to be skipped
    int position = dashed.getLetters().size() - 1;
    while (position >= 0 && !dashed.getLetters().get(position).equals("-")) {
      position--;
    }
    if (position >= 0) {
      dashed.setEndOfLineHyphenIndex(position);
    }
    for (LetterSequence nextRowCandidate : heap) {
      joinedQueue.add(new LetterSequence(dashed, nextRowCandidate));
    }
  }

  // Keep at most n joined candidates, in queue order.
  List<LetterSequence> topJoined = new ArrayList<>();
  while (topJoined.size() < n && !joinedQueue.isEmpty()) {
    topJoined.add(joinedQueue.poll());
  }

  LetterSequence bestJoined = this.chooseMostLikelyWord(topJoined, n);
  if (plainHoldovers.isEmpty()) {
    // every examined holdover ends with a dash, so the two rows must be combined
    return bestJoined;
  }

  // Some holdovers end with a dash and others do not: the joined reading has
  // to be weighed against the two rows read separately.
  // An earlier version only considered the dash-less holdovers at this point,
  // which skewed results towards the dash-less reading, especially when
  // sequence 1 or sequence 2 is itself a common word (e.g. der in Yiddish).
  // The whole holdover heap is therefore considered instead.
  LetterSequence bestHoldover = this.chooseMostLikelyWord(holdoverHeap, n);
  LetterSequence bestNextRow = this.chooseMostLikelyWord(heap, n);
  boolean debug = LOG.isDebugEnabled();
  if (debug) {
    LOG.debug("Best combined: " + bestJoined.toString() + ". Adjusted score: " + bestJoined.getAdjustedScore());
    LOG.debug("Best seq1 separate: " + bestHoldover.toString() + ". Adjusted score: " + bestHoldover.getAdjustedScore());
    LOG.debug("Best seq2 separate: " + bestNextRow.toString() + ". Adjusted score: " + bestNextRow.getAdjustedScore());
  }

  // To make the separate reading comparable with the joined one, score the
  // two best separate sequences as a single sequence (per the original note,
  // a geometric mean over their shapes - the scoring itself lives in
  // LetterSequence) and adjust it for the lower-frequency word.
  LetterSequence separate = new LetterSequence(bestHoldover, bestNextRow);
  int minFrequency = Math.min(bestHoldover.getFrequency(), bestNextRow.getFrequency());
  double freqLog = this.getFrequencyAdjustment(minFrequency);
  double separateAdjusted = separate.getScore() * freqLog;
  separate.setAdjustedScore(separateAdjusted);
  if (debug) {
    LOG.debug("Best separate: " + separate.toString() + ". Score: " + separate.getScore() + ". Freq: " + minFrequency + ". Adjusted: " + freqLog + ". Adjusted score: " + separate.getAdjustedScore());
  }

  LetterSequence chosen;
  if (bestJoined.getAdjustedScore() > separateAdjusted) {
    if (debug) {
      LOG.debug("Using combined sequence");
    }
    chosen = bestJoined;
  } else {
    if (debug) {
      LOG.debug("Using separate sequences");
    }
    chosen = new LetterSequence(bestHoldover, bestNextRow);
  }
  if (debug) {
    LOG.debug("Best with holdover: " + chosen.toString());
  }
  return chosen;
}
use of com.joliciel.jochre.letterGuesser.LetterSequence in project jochre by urieli.
the class UnknownWordListWriter method onImageComplete.
/**
 * Writes, one per line, every word of the image whose frequency count is
 * zero.
 * <p>
 * Junk paragraphs and groups without a best letter sequence are skipped. For
 * the remaining groups, each word frequency of each subsequence of the best
 * letter sequence is inspected; outcomes with a count of 0 are written
 * followed by "\n", and the writer is flushed after each one.
 *
 * @param image
 *            the image whose analysis has completed
 * @throws RuntimeException
 *             wrapping any {@link IOException} raised while writing
 */
@Override
public void onImageComplete(JochreImage image) {
  try {
    for (Paragraph paragraph : image.getParagraphs()) {
      if (paragraph.isJunk()) {
        continue;
      }
      for (RowOfShapes row : paragraph.getRows()) {
        for (GroupOfShapes group : row.getGroups()) {
          LetterSequence best = group.getBestLetterSequence();
          if (best == null) {
            continue;
          }
          for (LetterSequence part : best.getSubsequences()) {
            for (CountedOutcome<String> counted : part.getWordFrequencies()) {
              if (counted.getCount() != 0) {
                continue;
              }
              writer.write(counted.getOutcome() + "\n");
              writer.flush();
            }
          }
        }
      }
    }
  } catch (IOException ioe) {
    LOG.error("Failed to write to UnknownWordListWriter", ioe);
    throw new RuntimeException(ioe);
  }
}
Aggregations