Search in sources :

Example 11 with JochreDocument

use of com.joliciel.jochre.doc.JochreDocument in project jochre by urieli.

the class Jochre method doCommandEvaluate.

/**
 * Evaluate a given letter guessing model.
 * <p>
 * Runs a beam-search image analyser over the images selected by
 * {@code criteria}, writes an error log and lexicon error output into
 * {@code outputDir}, and finally appends the f-scores to a CSV file in the
 * same directory. Output file names are derived from the letter model path of
 * the current session (directory and everything from the first "." removed)
 * followed by {@code suffix}.
 *
 * @param criteria
 *          the criteria used to select the evaluation corpus
 * @param outputDir
 *          the directory in which the error file and the f-score CSV file are
 *          created
 * @param wordChooser
 *          the word chooser handed to the image analyser and to the lexicon
 *          error writer
 * @param reconstructLetters
 *          if true, letter boundaries are re-detected with a letter-by-letter
 *          boundary detector and scored via an
 *          {@code OriginalShapeLetterAssigner}; if false, the original
 *          boundaries are used and scoring is done by a
 *          {@code SimpleLetterFScoreObserver}
 * @param save
 *          passed to the letter assigner's {@code setSave}
 * @param suffix
 *          appended to the base name used for all output files
 * @param includeBeam
 *          passed to the lexicon error writer's {@code setIncludeBeam}
 * @param observers
 *          additional observers registered on the corpus image processor
 * @throws IOException
 *           if the error file or the f-score file cannot be opened, written
 *           or closed
 */
public void doCommandEvaluate(CorpusSelectionCriteria criteria, File outputDir, MostLikelyWordChooser wordChooser, boolean reconstructLetters, boolean save, String suffix, boolean includeBeam, List<DocumentObserver> observers) throws IOException {
    ClassificationModel letterModel = jochreSession.getLetterModel();
    List<String> letterFeatureDescriptors = letterModel.getFeatureDescriptors();
    LetterFeatureParser letterFeatureParser = new LetterFeatureParser();
    Set<LetterFeature<?>> letterFeatures = letterFeatureParser.getLetterFeatureSet(letterFeatureDescriptors);
    LetterGuesser letterGuesser = new LetterGuesser(letterFeatures, letterModel.getDecisionMaker());

    // Derive the base name for output files from the model path. A path
    // without any "." is used as-is instead of failing in substring().
    String letterModelPath = jochreSession.getLetterModelPath();
    int firstDot = letterModelPath.indexOf(".");
    String baseName = firstDot >= 0 ? letterModelPath.substring(0, firstDot) : letterModelPath;
    if (baseName.lastIndexOf("/") > 0)
        baseName = baseName.substring(baseName.lastIndexOf("/") + 1);
    baseName += suffix;

    BoundaryDetector boundaryDetector = null;
    if (reconstructLetters) {
        ShapeSplitter splitter = new TrainingCorpusShapeSplitter(jochreSession);
        ShapeMerger merger = new TrainingCorpusShapeMerger();
        boundaryDetector = new LetterByLetterBoundaryDetector(splitter, merger, jochreSession);
    } else {
        boundaryDetector = new OriginalBoundaryDetector();
    }
    ImageAnalyser evaluator = new BeamSearchImageAnalyser(boundaryDetector, letterGuesser, wordChooser, jochreSession);
    FScoreObserver fScoreObserver = null;
    LetterValidator letterValidator = new ComponentCharacterValidator(jochreSession);
    if (reconstructLetters) {
        OriginalShapeLetterAssigner originalShapeLetterAssigner = new OriginalShapeLetterAssigner();
        originalShapeLetterAssigner.setEvaluate(true);
        originalShapeLetterAssigner.setSave(save);
        originalShapeLetterAssigner.setLetterValidator(letterValidator);
        fScoreObserver = originalShapeLetterAssigner;
    } else {
        LetterAssigner letterAssigner = new LetterAssigner();
        letterAssigner.setSave(save);
        evaluator.addObserver(letterAssigner);
        fScoreObserver = new SimpleLetterFScoreObserver(letterValidator, jochreSession);
    }
    evaluator.addObserver(fScoreObserver);

    ErrorLogger errorLogger = new ErrorLogger(jochreSession);
    File errorFile = new File(outputDir, baseName + "_errors.txt");
    // Start from a fresh error file on every evaluation run.
    errorFile.delete();
    // try-with-resources: the error writer is closed even if the set-up below
    // or the processing itself throws.
    try (Writer errorWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(errorFile, true), "UTF8"))) {
        errorLogger.setErrorWriter(errorWriter);
        evaluator.addObserver(errorLogger);
        LexiconErrorWriter lexiconErrorWriter = new LexiconErrorWriter(outputDir, baseName, wordChooser, jochreSession);
        if (documentGroups != null)
            lexiconErrorWriter.setDocumentGroups(documentGroups);
        lexiconErrorWriter.setIncludeBeam(includeBeam);
        // find all document names (alphabetical ordering)
        Set<String> documentNameSet = new TreeSet<>();
        JochreCorpusImageReader imageReader1 = new JochreCorpusImageReader(jochreSession);
        CorpusSelectionCriteria docCriteria = new CorpusSelectionCriteria();
        docCriteria.setImageStatusesToInclude(criteria.getImageStatusesToInclude());
        docCriteria.setImageId(criteria.getImageId());
        docCriteria.setDocumentId(criteria.getDocumentId());
        docCriteria.setDocumentIds(criteria.getDocumentIds());
        imageReader1.setSelectionCriteria(docCriteria);
        JochreDocument currentDoc = null;
        while (imageReader1.hasNext()) {
            JochreImage image = imageReader1.next();
            if (!image.getPage().getDocument().equals(currentDoc)) {
                currentDoc = image.getPage().getDocument();
                documentNameSet.add(currentDoc.getName());
            }
        }
        List<String> documentNames = new ArrayList<>(documentNameSet);
        lexiconErrorWriter.setDocumentNames(documentNames);
        evaluator.addObserver(lexiconErrorWriter);
        JochreCorpusImageProcessor imageProcessor = new JochreCorpusImageProcessor(criteria, jochreSession);
        imageProcessor.addObserver(evaluator);
        for (DocumentObserver observer : observers) imageProcessor.addObserver(observer);
        imageProcessor.process();
    }

    LOG.debug("F-score for " + jochreSession.getLetterModelPath() + ": " + fScoreObserver.getFScoreCalculator().getTotalFScore());
    String modelFileName = baseName;
    if (reconstructLetters)
        modelFileName += "_Reconstruct";
    File fscoreFile = new File(outputDir, modelFileName + "_fscores.csv");
    // The f-score file is opened in append mode; closing the writer guarantees
    // the buffered CSV content is flushed to disk and the handle is released.
    try (Writer fscoreWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fscoreFile, true), jochreSession.getCsvEncoding()))) {
        fScoreObserver.getFScoreCalculator().writeScoresToCSV(fscoreWriter);
    }
}
Also used : LetterByLetterBoundaryDetector(com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector) OriginalShapeLetterAssigner(com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) BeamSearchImageAnalyser(com.joliciel.jochre.analyser.BeamSearchImageAnalyser) ImageAnalyser(com.joliciel.jochre.analyser.ImageAnalyser) TrainingCorpusShapeMerger(com.joliciel.jochre.boundaries.TrainingCorpusShapeMerger) LexiconErrorWriter(com.joliciel.jochre.lexicon.LexiconErrorWriter) ArrayList(java.util.ArrayList) JochreDocument(com.joliciel.jochre.doc.JochreDocument) BufferedWriter(java.io.BufferedWriter) JochreCorpusImageReader(com.joliciel.jochre.graphics.JochreCorpusImageReader) LetterValidator(com.joliciel.jochre.letterGuesser.LetterValidator) JochreCorpusImageProcessor(com.joliciel.jochre.graphics.JochreCorpusImageProcessor) LetterFeature(com.joliciel.jochre.letterGuesser.features.LetterFeature) TreeSet(java.util.TreeSet) LetterFeatureParser(com.joliciel.jochre.letterGuesser.features.LetterFeatureParser) RecursiveShapeSplitter(com.joliciel.jochre.boundaries.RecursiveShapeSplitter) TrainingCorpusShapeSplitter(com.joliciel.jochre.boundaries.TrainingCorpusShapeSplitter) ShapeSplitter(com.joliciel.jochre.boundaries.ShapeSplitter) JochreImage(com.joliciel.jochre.graphics.JochreImage) CorpusSelectionCriteria(com.joliciel.jochre.graphics.CorpusSelectionCriteria) OriginalBoundaryDetector(com.joliciel.jochre.boundaries.OriginalBoundaryDetector) BoundaryDetector(com.joliciel.jochre.boundaries.BoundaryDetector) LetterByLetterBoundaryDetector(com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector) DeterministicBoundaryDetector(com.joliciel.jochre.boundaries.DeterministicBoundaryDetector) OriginalShapeLetterAssigner(com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) LetterAssigner(com.joliciel.jochre.analyser.LetterAssigner) DocumentObserver(com.joliciel.jochre.doc.DocumentObserver) SimpleLetterFScoreObserver(com.joliciel.jochre.analyser.SimpleLetterFScoreObserver) 
LetterGuesser(com.joliciel.jochre.letterGuesser.LetterGuesser) ErrorLogger(com.joliciel.jochre.analyser.ErrorLogger) SimpleLetterFScoreObserver(com.joliciel.jochre.analyser.SimpleLetterFScoreObserver) FScoreObserver(com.joliciel.jochre.analyser.FScoreObserver) OriginalBoundaryDetector(com.joliciel.jochre.boundaries.OriginalBoundaryDetector) TrainingCorpusShapeMerger(com.joliciel.jochre.boundaries.TrainingCorpusShapeMerger) ShapeMerger(com.joliciel.jochre.boundaries.ShapeMerger) FileOutputStream(java.io.FileOutputStream) BeamSearchImageAnalyser(com.joliciel.jochre.analyser.BeamSearchImageAnalyser) OutputStreamWriter(java.io.OutputStreamWriter) TrainingCorpusShapeSplitter(com.joliciel.jochre.boundaries.TrainingCorpusShapeSplitter) ComponentCharacterValidator(com.joliciel.jochre.letterGuesser.ComponentCharacterValidator) File(java.io.File) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) UnknownWordListWriter(com.joliciel.jochre.lexicon.UnknownWordListWriter) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) LexiconErrorWriter(com.joliciel.jochre.lexicon.LexiconErrorWriter) OutputStreamWriter(java.io.OutputStreamWriter)

Example 12 with JochreDocument

use of com.joliciel.jochre.doc.JochreDocument in project jochre by urieli.

the class Jochre method doCommandUpdateImages.

/**
 * Update the images in an existing Jochre document.
 * <p>
 * The document is loaded by id, then the given PDF file is run through a
 * {@code PdfDocumentProcessor} with a {@code PdfImageUpdater} for that
 * document. Only file names ending in ".pdf" (case-insensitive) are accepted.
 *
 * @param filename
 *          the PDF file containing the images
 * @param docId
 *          the id of the document to update
 * @param pages
 *          the pages to process, empty means all
 * @throws RuntimeException
 *           if {@code filename} is empty, {@code docId} is negative, or the
 *           file name does not end in ".pdf"
 */
public void doCommandUpdateImages(String filename, int docId, Set<Integer> pages) {
    // Argument validation first: both arguments are mandatory.
    if (filename.isEmpty()) {
        throw new RuntimeException("Missing argument: file");
    }
    if (docId < 0) {
        throw new RuntimeException("Missing argument: docId");
    }

    // The document is loaded before the extension check, as in the original flow.
    JochreDocument document = DocumentDao.getInstance(jochreSession).loadJochreDocument(docId);

    boolean isPdf = filename.toLowerCase().endsWith(".pdf");
    if (!isPdf) {
        throw new RuntimeException("Unrecognised file extension");
    }

    File sourcePdf = new File(filename);
    PdfImageUpdater imageUpdater = new PdfImageUpdater(document);
    PdfDocumentProcessor processor = new PdfDocumentProcessor(sourcePdf, pages, imageUpdater);
    processor.process();
}
Also used : PdfDocumentProcessor(com.joliciel.jochre.pdf.PdfDocumentProcessor) DocumentDao(com.joliciel.jochre.doc.DocumentDao) JochreDocument(com.joliciel.jochre.doc.JochreDocument) File(java.io.File)

Aggregations

JochreDocument (com.joliciel.jochre.doc.JochreDocument)12 JochrePage (com.joliciel.jochre.doc.JochrePage)8 JochreImage (com.joliciel.jochre.graphics.JochreImage)6 Shape (com.joliciel.jochre.graphics.Shape)5 ArrayList (java.util.ArrayList)5 GroupOfShapes (com.joliciel.jochre.graphics.GroupOfShapes)4 Paragraph (com.joliciel.jochre.graphics.Paragraph)4 RowOfShapes (com.joliciel.jochre.graphics.RowOfShapes)4 Test (org.junit.Test)4 JochreSession (com.joliciel.jochre.JochreSession)3 File (java.io.File)3 DocumentDao (com.joliciel.jochre.doc.DocumentDao)2 DocumentObserver (com.joliciel.jochre.doc.DocumentObserver)2 Config (com.typesafe.config.Config)2 StringWriter (java.io.StringWriter)2 BeamSearchImageAnalyser (com.joliciel.jochre.analyser.BeamSearchImageAnalyser)1 ErrorLogger (com.joliciel.jochre.analyser.ErrorLogger)1 FScoreObserver (com.joliciel.jochre.analyser.FScoreObserver)1 ImageAnalyser (com.joliciel.jochre.analyser.ImageAnalyser)1 LetterAssigner (com.joliciel.jochre.analyser.LetterAssigner)1