Search in sources :

Example 1 with JochreCorpusImageReader

use of com.joliciel.jochre.graphics.JochreCorpusImageReader in project jochre by urieli.

The method buildLexicon of the class CorpusLexiconBuilder.

/**
 * Walks every image selected from the training corpus and counts the words
 * it contains in a freshly created lexicon.
 * <p>
 * Groups flagged as broken words are ignored. A group without any letters is
 * counted as the empty entry. A word ending in a dash in the last group of a
 * row is not counted as-is: it is carried over and prefixed to the first
 * word of the next counted group of the same paragraph, and the empty entry
 * is counted in its place.
 *
 * @return the lexicon holding the counted entries
 */
public TextFileLexicon buildLexicon() {
    TextFileLexicon result = new TextFileLexicon();
    JochreCorpusImageReader reader = new JochreCorpusImageReader(jochreSession);
    reader.setSelectionCriteria(criteria);

    while (reader.hasNext()) {
        JochreImage currentImage = reader.next();
        for (Paragraph para : currentImage.getParagraphs()) {
            // The carried-over fragment is scoped to a single paragraph, so
            // that a dash on the last row of a page never gets glued to
            // unrelated text (e.g. a page number) from the next paragraph.
            String pending = null;
            for (RowOfShapes line : para.getRows()) {
                for (GroupOfShapes shapeGroup : line.getGroups()) {
                    if (shapeGroup.isBrokenWord()) {
                        continue;
                    }

                    // Assemble the group's text from the letters of its shapes.
                    StringBuilder letters = new StringBuilder();
                    for (Shape shape : shapeGroup.getShapes()) {
                        if (shape.getLetter() != null) {
                            letters.append(shape.getLetter());
                        }
                    }
                    if (letters.length() == 0) {
                        result.incrementEntry("");
                        continue;
                    }

                    List<String> tokens = jochreSession.getLinguistics().splitText(letters.toString());
                    int lastPosition = tokens.size() - 1;
                    int position = 0;
                    for (String token : tokens) {
                        boolean isFirstToken = position == 0;
                        boolean isLastToken = position == lastPosition;
                        position++;

                        String entry = token;
                        if (isFirstToken && pending != null && !pending.isEmpty()) {
                            // Complete the fragment carried over from the previous row.
                            entry = pending + entry;
                            pending = null;
                        }
                        if (isLastToken && shapeGroup.getIndex() == line.getGroups().size() - 1 && entry.endsWith("-")) {
                            // Dash at the end of a row: a hard hyphen stays part of
                            // the word, any other dash is stripped before carrying over.
                            pending = shapeGroup.isHardHyphen() ? entry : entry.substring(0, entry.length() - 1);
                            entry = "";
                        }
                        result.incrementEntry(entry);
                    }
                }
            }
        }
    }
    return result;
}
Also used : JochreCorpusImageReader(com.joliciel.jochre.graphics.JochreCorpusImageReader) JochreImage(com.joliciel.jochre.graphics.JochreImage) Shape(com.joliciel.jochre.graphics.Shape) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) RowOfShapes(com.joliciel.jochre.graphics.RowOfShapes) Paragraph(com.joliciel.jochre.graphics.Paragraph)

Example 2 with JochreCorpusImageReader

use of com.joliciel.jochre.graphics.JochreCorpusImageReader in project jochre by urieli.

The method doCommandEvaluate of the class Jochre.

/**
 * Evaluate a given letter guessing model.
 * <p>
 * Runs the session's letter model over the images selected by
 * {@code criteria} and writes two files into {@code outputDir}: an error log
 * ({@code <baseName>_errors.txt}) and the f-scores
 * ({@code <baseName>[_Reconstruct]_fscores.csv}). The base name is the
 * letter model path up to its first dot, stripped of any leading directories,
 * followed by {@code suffix}. The error log is deleted and recreated on each
 * run; the f-score file is opened in append mode.
 *
 * @param criteria
 *          the criteria used to select the evaluation corpus
 * @param outputDir
 *          the directory in which the error and f-score files are written
 * @param wordChooser
 *          handed to the beam search analyser and to the lexicon error writer
 * @param reconstructLetters
 *          if true, letter boundaries are detected letter by letter using the
 *          training-corpus splitter and merger, and "_Reconstruct" is added to
 *          the f-score file name; if false, the original boundaries are used
 * @param save
 *          passed to the letter assigner's {@code setSave}
 *          (presumably whether the guessed letters get persisted - TODO confirm)
 * @param suffix
 *          appended to the base name used for the output files
 * @param includeBeam
 *          passed to the lexicon error writer's {@code setIncludeBeam}
 * @param observers
 *          additional observers registered on the image processor
 * @throws IOException
 *           if one of the output files cannot be opened, written or closed
 */
public void doCommandEvaluate(CorpusSelectionCriteria criteria, File outputDir, MostLikelyWordChooser wordChooser, boolean reconstructLetters, boolean save, String suffix, boolean includeBeam, List<DocumentObserver> observers) throws IOException {
    ClassificationModel letterModel = jochreSession.getLetterModel();
    List<String> letterFeatureDescriptors = letterModel.getFeatureDescriptors();
    LetterFeatureParser letterFeatureParser = new LetterFeatureParser();
    Set<LetterFeature<?>> letterFeatures = letterFeatureParser.getLetterFeatureSet(letterFeatureDescriptors);
    LetterGuesser letterGuesser = new LetterGuesser(letterFeatures, letterModel.getDecisionMaker());

    // Base name = model path cut at its first dot, without leading directories.
    // A path without any dot is used whole instead of failing in substring().
    String letterModelPath = jochreSession.getLetterModelPath();
    int firstDot = letterModelPath.indexOf(".");
    String baseName = firstDot >= 0 ? letterModelPath.substring(0, firstDot) : letterModelPath;
    if (baseName.lastIndexOf("/") > 0) {
        baseName = baseName.substring(baseName.lastIndexOf("/") + 1);
    }
    baseName += suffix;

    BoundaryDetector boundaryDetector = null;
    if (reconstructLetters) {
        ShapeSplitter splitter = new TrainingCorpusShapeSplitter(jochreSession);
        ShapeMerger merger = new TrainingCorpusShapeMerger();
        boundaryDetector = new LetterByLetterBoundaryDetector(splitter, merger, jochreSession);
    } else {
        boundaryDetector = new OriginalBoundaryDetector();
    }

    ImageAnalyser evaluator = new BeamSearchImageAnalyser(boundaryDetector, letterGuesser, wordChooser, jochreSession);
    FScoreObserver fScoreObserver = null;
    LetterValidator letterValidator = new ComponentCharacterValidator(jochreSession);
    if (reconstructLetters) {
        OriginalShapeLetterAssigner originalShapeLetterAssigner = new OriginalShapeLetterAssigner();
        originalShapeLetterAssigner.setEvaluate(true);
        originalShapeLetterAssigner.setSave(save);
        originalShapeLetterAssigner.setLetterValidator(letterValidator);
        fScoreObserver = originalShapeLetterAssigner;
    } else {
        LetterAssigner letterAssigner = new LetterAssigner();
        letterAssigner.setSave(save);
        evaluator.addObserver(letterAssigner);
        fScoreObserver = new SimpleLetterFScoreObserver(letterValidator, jochreSession);
    }
    evaluator.addObserver(fScoreObserver);

    ErrorLogger errorLogger = new ErrorLogger(jochreSession);
    File errorFile = new File(outputDir, baseName + "_errors.txt");
    // Start from a fresh error log on every run.
    errorFile.delete();

    // The error writer is opened in a try-with-resources so that it is closed
    // even if setting up the remaining observers or scanning the corpus fails.
    try (Writer errorWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(errorFile, true), "UTF8"))) {
        errorLogger.setErrorWriter(errorWriter);
        evaluator.addObserver(errorLogger);

        LexiconErrorWriter lexiconErrorWriter = new LexiconErrorWriter(outputDir, baseName, wordChooser, jochreSession);
        if (documentGroups != null) {
            lexiconErrorWriter.setDocumentGroups(documentGroups);
        }
        lexiconErrorWriter.setIncludeBeam(includeBeam);

        // find all document names (alphabetical ordering)
        Set<String> documentNameSet = new TreeSet<>();
        JochreCorpusImageReader imageReader1 = new JochreCorpusImageReader(jochreSession);
        CorpusSelectionCriteria docCriteria = new CorpusSelectionCriteria();
        docCriteria.setImageStatusesToInclude(criteria.getImageStatusesToInclude());
        docCriteria.setImageId(criteria.getImageId());
        docCriteria.setDocumentId(criteria.getDocumentId());
        docCriteria.setDocumentIds(criteria.getDocumentIds());
        imageReader1.setSelectionCriteria(docCriteria);
        JochreDocument currentDoc = null;
        while (imageReader1.hasNext()) {
            JochreImage image = imageReader1.next();
            if (!image.getPage().getDocument().equals(currentDoc)) {
                currentDoc = image.getPage().getDocument();
                documentNameSet.add(currentDoc.getName());
            }
        }
        List<String> documentNames = new ArrayList<>(documentNameSet);
        lexiconErrorWriter.setDocumentNames(documentNames);
        evaluator.addObserver(lexiconErrorWriter);

        JochreCorpusImageProcessor imageProcessor = new JochreCorpusImageProcessor(criteria, jochreSession);
        imageProcessor.addObserver(evaluator);
        for (DocumentObserver observer : observers) {
            imageProcessor.addObserver(observer);
        }
        imageProcessor.process();
    }

    LOG.debug("F-score for " + letterModelPath + ": " + fScoreObserver.getFScoreCalculator().getTotalFScore());

    String modelFileName = baseName;
    if (reconstructLetters) {
        modelFileName += "_Reconstruct";
    }
    File fscoreFile = new File(outputDir, modelFileName + "_fscores.csv");
    // Close (and thereby flush) the f-score writer; previously it was left open,
    // so buffered scores could be lost and the file handle leaked.
    try (Writer fscoreWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fscoreFile, true), jochreSession.getCsvEncoding()))) {
        fScoreObserver.getFScoreCalculator().writeScoresToCSV(fscoreWriter);
    }
}
Also used : LetterByLetterBoundaryDetector(com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector) OriginalShapeLetterAssigner(com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) BeamSearchImageAnalyser(com.joliciel.jochre.analyser.BeamSearchImageAnalyser) ImageAnalyser(com.joliciel.jochre.analyser.ImageAnalyser) TrainingCorpusShapeMerger(com.joliciel.jochre.boundaries.TrainingCorpusShapeMerger) LexiconErrorWriter(com.joliciel.jochre.lexicon.LexiconErrorWriter) ArrayList(java.util.ArrayList) JochreDocument(com.joliciel.jochre.doc.JochreDocument) BufferedWriter(java.io.BufferedWriter) JochreCorpusImageReader(com.joliciel.jochre.graphics.JochreCorpusImageReader) LetterValidator(com.joliciel.jochre.letterGuesser.LetterValidator) JochreCorpusImageProcessor(com.joliciel.jochre.graphics.JochreCorpusImageProcessor) LetterFeature(com.joliciel.jochre.letterGuesser.features.LetterFeature) TreeSet(java.util.TreeSet) LetterFeatureParser(com.joliciel.jochre.letterGuesser.features.LetterFeatureParser) RecursiveShapeSplitter(com.joliciel.jochre.boundaries.RecursiveShapeSplitter) TrainingCorpusShapeSplitter(com.joliciel.jochre.boundaries.TrainingCorpusShapeSplitter) ShapeSplitter(com.joliciel.jochre.boundaries.ShapeSplitter) JochreImage(com.joliciel.jochre.graphics.JochreImage) CorpusSelectionCriteria(com.joliciel.jochre.graphics.CorpusSelectionCriteria) OriginalBoundaryDetector(com.joliciel.jochre.boundaries.OriginalBoundaryDetector) BoundaryDetector(com.joliciel.jochre.boundaries.BoundaryDetector) LetterByLetterBoundaryDetector(com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector) DeterministicBoundaryDetector(com.joliciel.jochre.boundaries.DeterministicBoundaryDetector) OriginalShapeLetterAssigner(com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) LetterAssigner(com.joliciel.jochre.analyser.LetterAssigner) DocumentObserver(com.joliciel.jochre.doc.DocumentObserver) SimpleLetterFScoreObserver(com.joliciel.jochre.analyser.SimpleLetterFScoreObserver) 
LetterGuesser(com.joliciel.jochre.letterGuesser.LetterGuesser) ErrorLogger(com.joliciel.jochre.analyser.ErrorLogger) SimpleLetterFScoreObserver(com.joliciel.jochre.analyser.SimpleLetterFScoreObserver) FScoreObserver(com.joliciel.jochre.analyser.FScoreObserver) OriginalBoundaryDetector(com.joliciel.jochre.boundaries.OriginalBoundaryDetector) TrainingCorpusShapeMerger(com.joliciel.jochre.boundaries.TrainingCorpusShapeMerger) ShapeMerger(com.joliciel.jochre.boundaries.ShapeMerger) FileOutputStream(java.io.FileOutputStream) BeamSearchImageAnalyser(com.joliciel.jochre.analyser.BeamSearchImageAnalyser) OutputStreamWriter(java.io.OutputStreamWriter) TrainingCorpusShapeSplitter(com.joliciel.jochre.boundaries.TrainingCorpusShapeSplitter) ComponentCharacterValidator(com.joliciel.jochre.letterGuesser.ComponentCharacterValidator) File(java.io.File) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) UnknownWordListWriter(com.joliciel.jochre.lexicon.UnknownWordListWriter) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) LexiconErrorWriter(com.joliciel.jochre.lexicon.LexiconErrorWriter) OutputStreamWriter(java.io.OutputStreamWriter)

Aggregations

JochreCorpusImageReader (com.joliciel.jochre.graphics.JochreCorpusImageReader)2 JochreImage (com.joliciel.jochre.graphics.JochreImage)2 BeamSearchImageAnalyser (com.joliciel.jochre.analyser.BeamSearchImageAnalyser)1 ErrorLogger (com.joliciel.jochre.analyser.ErrorLogger)1 FScoreObserver (com.joliciel.jochre.analyser.FScoreObserver)1 ImageAnalyser (com.joliciel.jochre.analyser.ImageAnalyser)1 LetterAssigner (com.joliciel.jochre.analyser.LetterAssigner)1 OriginalShapeLetterAssigner (com.joliciel.jochre.analyser.OriginalShapeLetterAssigner)1 SimpleLetterFScoreObserver (com.joliciel.jochre.analyser.SimpleLetterFScoreObserver)1 BoundaryDetector (com.joliciel.jochre.boundaries.BoundaryDetector)1 DeterministicBoundaryDetector (com.joliciel.jochre.boundaries.DeterministicBoundaryDetector)1 LetterByLetterBoundaryDetector (com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector)1 OriginalBoundaryDetector (com.joliciel.jochre.boundaries.OriginalBoundaryDetector)1 RecursiveShapeSplitter (com.joliciel.jochre.boundaries.RecursiveShapeSplitter)1 ShapeMerger (com.joliciel.jochre.boundaries.ShapeMerger)1 ShapeSplitter (com.joliciel.jochre.boundaries.ShapeSplitter)1 TrainingCorpusShapeMerger (com.joliciel.jochre.boundaries.TrainingCorpusShapeMerger)1 TrainingCorpusShapeSplitter (com.joliciel.jochre.boundaries.TrainingCorpusShapeSplitter)1 DocumentObserver (com.joliciel.jochre.doc.DocumentObserver)1 JochreDocument (com.joliciel.jochre.doc.JochreDocument)1