Use of com.joliciel.jochre.graphics.JochreCorpusImageReader in project jochre by urieli.
From the class CorpusLexiconBuilder, method buildLexicon.
/**
 * Builds a lexicon by counting the words found in the training corpus.
 * <p>
 * Every image selected by the current criteria is read, and each group of
 * shapes which is not flagged as a broken word contributes its letters as
 * text. That text is split into words by the session's linguistics, and each
 * word is counted in the lexicon. A group without any letters is counted as
 * an empty entry.
 * <p>
 * When the last word of the last group on a row ends with a dash, the word is
 * held over and glued onto the first word of the next counted group in the
 * same paragraph (with the dash kept only for a hard hyphen); an empty entry
 * is counted in its place.
 *
 * @return the lexicon containing the counted entries
 */
public TextFileLexicon buildLexicon() {
  TextFileLexicon lexicon = new TextFileLexicon();
  JochreCorpusImageReader corpusReader = new JochreCorpusImageReader(jochreSession);
  corpusReader.setSelectionCriteria(criteria);

  while (corpusReader.hasNext()) {
    JochreImage page = corpusReader.next();
    for (Paragraph paragraph : page.getParagraphs()) {
      // The held-over part of a hyphenated word never crosses a paragraph
      // boundary: otherwise a dash on the last row of a page could get
      // something like a page number glued onto the word.
      String carriedOver = null;
      for (RowOfShapes row : paragraph.getRows()) {
        for (GroupOfShapes group : row.getGroups()) {
          if (group.isBrokenWord()) {
            continue;
          }

          // concatenate the letters of all shapes which have one
          StringBuilder letters = new StringBuilder();
          for (Shape shape : group.getShapes()) {
            if (shape.getLetter() != null) {
              letters.append(shape.getLetter());
            }
          }
          if (letters.length() == 0) {
            lexicon.incrementEntry("");
            continue;
          }

          List<String> tokens = jochreSession.getLinguistics().splitText(letters.toString());
          int lastToken = tokens.size() - 1;
          for (int t = 0; t <= lastToken; t++) {
            String token = tokens.get(t);

            // first word of the group: prepend whatever was held over
            if (t == 0 && carriedOver != null && carriedOver.length() > 0) {
              token = carriedOver + token;
              carriedOver = null;
            }

            // last word of the last group on the row, ending in a dash:
            // hold it over instead of counting it now
            if (t == lastToken && group.getIndex() == row.getGroups().size() - 1 && token.endsWith("-")) {
              carriedOver = group.isHardHyphen() ? token : token.substring(0, token.length() - 1);
              token = "";
            }

            lexicon.incrementEntry(token);
          }
        }
      }
    }
  }
  return lexicon;
}
Use of com.joliciel.jochre.graphics.JochreCorpusImageReader in project jochre by urieli.
From the class Jochre, method doCommandEvaluate.
/**
 * Evaluate a given letter guessing model.
 * <p>
 * The session's letter model is run over the images selected by the criteria,
 * writing an error log ({@code <baseName>_errors.txt}), lexicon errors, and
 * f-scores ({@code <baseName>[_Reconstruct]_fscores.csv}, opened in append
 * mode) into the output directory. The base name is the model file's name up
 * to its first dot, followed by the suffix.
 *
 * @param criteria
 *          the criteria used to select the evaluation corpus
 * @param outputDir
 *          the directory in which the output files are created
 * @param wordChooser
 *          the word chooser handed to the image analyser and the lexicon
 *          error writer
 * @param reconstructLetters
 *          if true, boundaries are detected letter-by-letter using the
 *          training corpus splitter and merger; otherwise the original
 *          boundaries are used
 * @param save
 *          passed on to the letter assigner's {@code setSave}
 * @param suffix
 *          appended to the model base name when naming output files
 * @param includeBeam
 *          passed on to the lexicon error writer's {@code setIncludeBeam}
 * @param observers
 *          additional observers to attach to the image processor; may be
 *          null
 * @throws IOException
 *           if the error or f-score output cannot be opened, written or
 *           closed
 */
public void doCommandEvaluate(CorpusSelectionCriteria criteria, File outputDir, MostLikelyWordChooser wordChooser, boolean reconstructLetters, boolean save, String suffix, boolean includeBeam, List<DocumentObserver> observers) throws IOException {
  ClassificationModel letterModel = jochreSession.getLetterModel();
  List<String> letterFeatureDescriptors = letterModel.getFeatureDescriptors();
  LetterFeatureParser letterFeatureParser = new LetterFeatureParser();
  Set<LetterFeature<?>> letterFeatures = letterFeatureParser.getLetterFeatureSet(letterFeatureDescriptors);
  LetterGuesser letterGuesser = new LetterGuesser(letterFeatures, letterModel.getDecisionMaker());

  String baseName = letterModelBaseName(jochreSession.getLetterModelPath()) + suffix;

  BoundaryDetector boundaryDetector = null;
  if (reconstructLetters) {
    ShapeSplitter splitter = new TrainingCorpusShapeSplitter(jochreSession);
    ShapeMerger merger = new TrainingCorpusShapeMerger();
    boundaryDetector = new LetterByLetterBoundaryDetector(splitter, merger, jochreSession);
  } else {
    boundaryDetector = new OriginalBoundaryDetector();
  }

  ImageAnalyser evaluator = new BeamSearchImageAnalyser(boundaryDetector, letterGuesser, wordChooser, jochreSession);

  FScoreObserver fScoreObserver = null;
  LetterValidator letterValidator = new ComponentCharacterValidator(jochreSession);
  if (reconstructLetters) {
    OriginalShapeLetterAssigner originalShapeLetterAssigner = new OriginalShapeLetterAssigner();
    originalShapeLetterAssigner.setEvaluate(true);
    originalShapeLetterAssigner.setSave(save);
    originalShapeLetterAssigner.setLetterValidator(letterValidator);
    fScoreObserver = originalShapeLetterAssigner;
  } else {
    LetterAssigner letterAssigner = new LetterAssigner();
    letterAssigner.setSave(save);
    evaluator.addObserver(letterAssigner);
    fScoreObserver = new SimpleLetterFScoreObserver(letterValidator, jochreSession);
  }
  evaluator.addObserver(fScoreObserver);

  ErrorLogger errorLogger = new ErrorLogger(jochreSession);
  File errorFile = new File(outputDir, baseName + "_errors.txt");
  // start from a fresh error file: any previous one is removed before the
  // new one is opened
  errorFile.delete();

  // The error writer is opened in try-with-resources so that it is closed
  // even if setting up the remaining observers or processing fails.
  try (Writer errorWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(errorFile, true), "UTF8"))) {
    errorLogger.setErrorWriter(errorWriter);
    evaluator.addObserver(errorLogger);

    LexiconErrorWriter lexiconErrorWriter = new LexiconErrorWriter(outputDir, baseName, wordChooser, jochreSession);
    if (documentGroups != null)
      lexiconErrorWriter.setDocumentGroups(documentGroups);
    lexiconErrorWriter.setIncludeBeam(includeBeam);
    lexiconErrorWriter.setDocumentNames(findDocumentNames(criteria));
    evaluator.addObserver(lexiconErrorWriter);

    JochreCorpusImageProcessor imageProcessor = new JochreCorpusImageProcessor(criteria, jochreSession);
    imageProcessor.addObserver(evaluator);
    if (observers != null) {
      for (DocumentObserver observer : observers)
        imageProcessor.addObserver(observer);
    }
    imageProcessor.process();
  }

  LOG.debug("F-score for " + jochreSession.getLetterModelPath() + ": " + fScoreObserver.getFScoreCalculator().getTotalFScore());

  String modelFileName = baseName;
  if (reconstructLetters)
    modelFileName += "_Reconstruct";
  File fscoreFile = new File(outputDir, modelFileName + "_fscores.csv");
  // Closing the writer flushes the buffered scores to disk; the file is
  // opened in append mode so existing content is preserved.
  try (Writer fscoreWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fscoreFile, true), jochreSession.getCsvEncoding()))) {
    fScoreObserver.getFScoreCalculator().writeScoresToCSV(fscoreWriter);
  }
}

/**
 * Derive the base name used for output files from the letter model path: the
 * file name (after the last '/' or '\') cut at its first dot, or the whole
 * file name if it contains no dot.
 */
private static String letterModelBaseName(String modelPath) {
  int lastSeparator = Math.max(modelPath.lastIndexOf('/'), modelPath.lastIndexOf('\\'));
  String fileName = modelPath.substring(lastSeparator + 1);
  int firstDot = fileName.indexOf('.');
  return firstDot >= 0 ? fileName.substring(0, firstDot) : fileName;
}

/**
 * Find the names of all documents containing images selected by the image
 * statuses, image id and document id(s) of the given criteria, in
 * alphabetical order.
 */
private List<String> findDocumentNames(CorpusSelectionCriteria criteria) {
  Set<String> documentNameSet = new TreeSet<>();
  JochreCorpusImageReader imageReader = new JochreCorpusImageReader(jochreSession);
  CorpusSelectionCriteria docCriteria = new CorpusSelectionCriteria();
  docCriteria.setImageStatusesToInclude(criteria.getImageStatusesToInclude());
  docCriteria.setImageId(criteria.getImageId());
  docCriteria.setDocumentId(criteria.getDocumentId());
  docCriteria.setDocumentIds(criteria.getDocumentIds());
  imageReader.setSelectionCriteria(docCriteria);

  JochreDocument currentDoc = null;
  while (imageReader.hasNext()) {
    JochreImage image = imageReader.next();
    // only look up the name when the document changes between images
    if (!image.getPage().getDocument().equals(currentDoc)) {
      currentDoc = image.getPage().getDocument();
      documentNameSet.add(currentDoc.getName());
    }
  }
  return new ArrayList<>(documentNameSet);
}
Aggregations