Search in sources :

Example 6 with DocumentObserver

use of com.joliciel.jochre.doc.DocumentObserver in project jochre by urieli.

the class Jochre method doCommandAnalyse.

/**
 * Full analysis, including merge, split and letter guessing.
 * <p>
 * The source is dispatched on its name: a name ending in {@code .pdf} goes
 * through a {@code PdfDocumentProcessor}; a name ending in {@code .png},
 * {@code .jpg}, {@code .jpeg} or {@code .gif}, or a directory, goes through an
 * {@code ImageDocumentExtractor}.
 *
 * @param sourceFile
 *          the PDF file, image file or directory to analyse
 * @param wordChooser
 *          the word chooser handed to the beam-search image analyser
 * @param pages
 *          the pages to process, empty means all
 * @param observers
 *          document observers to attach to the document generator; may be
 *          null or empty, in which case none are attached
 * @param imageObservers
 *          image observers to attach to the PDF processor (only used for PDF
 *          sources); may be null or empty
 * @throws JochreException
 *           if the source file does not exist
 * @throws RuntimeException
 *           if the source is not a directory and has no recognised extension
 */
public void doCommandAnalyse(File sourceFile, MostLikelyWordChooser wordChooser, Set<Integer> pages, List<DocumentObserver> observers, List<PdfImageObserver> imageObservers) throws IOException {
    // Fail fast: no point assembling the analysis pipeline for a missing file.
    if (!sourceFile.exists())
        throw new JochreException("The file " + sourceFile.getPath() + " does not exist");

    ClassificationModel letterModel = jochreSession.getLetterModel();
    List<String> letterFeatureDescriptors = letterModel.getFeatureDescriptors();
    LetterFeatureParser letterFeatureParser = new LetterFeatureParser();
    Set<LetterFeature<?>> letterFeatures = letterFeatureParser.getLetterFeatureSet(letterFeatureDescriptors);
    LetterGuesser letterGuesser = new LetterGuesser(letterFeatures, letterModel.getDecisionMaker());

    // With both a split and a merge model available, shapes can be re-segmented;
    // otherwise the original shape boundaries are used as-is.
    BoundaryDetector boundaryDetector;
    LetterGuessObserver letterGuessObserver;
    if (jochreSession.getSplitModel() != null && jochreSession.getMergeModel() != null) {
        boundaryDetector = new DeterministicBoundaryDetector(jochreSession.getSplitModel(), jochreSession.getMergeModel(), jochreSession);
        OriginalShapeLetterAssigner shapeLetterAssigner = new OriginalShapeLetterAssigner();
        shapeLetterAssigner.setEvaluate(false);
        shapeLetterAssigner.setSingleLetterMethod(false);
        letterGuessObserver = shapeLetterAssigner;
    } else {
        boundaryDetector = new OriginalBoundaryDetector();
        letterGuessObserver = new LetterAssigner();
    }

    ImageAnalyser analyser = new BeamSearchImageAnalyser(boundaryDetector, letterGuesser, wordChooser, jochreSession);
    analyser.addObserver(letterGuessObserver);

    JochreDocumentGenerator documentGenerator = new JochreDocumentGenerator(sourceFile.getName(), "", jochreSession);
    documentGenerator.addDocumentObserver(analyser);
    // Callers pass null when no output format was requested.
    if (observers != null) {
        for (DocumentObserver observer : observers) {
            documentGenerator.addDocumentObserver(observer);
        }
    }

    // Locale-independent lower-casing so that extension matching does not
    // depend on the default locale (e.g. the Turkish dotless i).
    String lowerCaseName = sourceFile.getName().toLowerCase(java.util.Locale.ROOT);
    if (lowerCaseName.endsWith(".pdf")) {
        PdfDocumentProcessor pdfDocumentProcessor = new PdfDocumentProcessor(sourceFile, pages, documentGenerator);
        if (imageObservers != null) {
            for (PdfImageObserver imageObserver : imageObservers) {
                pdfDocumentProcessor.addImageObserver(imageObserver);
            }
        }
        pdfDocumentProcessor.process();
    } else if (lowerCaseName.endsWith(".png") || lowerCaseName.endsWith(".jpg") || lowerCaseName.endsWith(".jpeg") || lowerCaseName.endsWith(".gif") || sourceFile.isDirectory()) {
        // Single images and directories are both handled by the image extractor.
        ImageDocumentExtractor extractor = new ImageDocumentExtractor(sourceFile, documentGenerator);
        extractor.extractDocument();
    } else {
        throw new RuntimeException("Unrecognised file extension");
    }
}
Also used : PdfImageObserver(com.joliciel.jochre.utils.pdf.PdfImageObserver) OriginalShapeLetterAssigner(com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) BeamSearchImageAnalyser(com.joliciel.jochre.analyser.BeamSearchImageAnalyser) ImageAnalyser(com.joliciel.jochre.analyser.ImageAnalyser) OriginalBoundaryDetector(com.joliciel.jochre.boundaries.OriginalBoundaryDetector) BoundaryDetector(com.joliciel.jochre.boundaries.BoundaryDetector) LetterByLetterBoundaryDetector(com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector) DeterministicBoundaryDetector(com.joliciel.jochre.boundaries.DeterministicBoundaryDetector) DeterministicBoundaryDetector(com.joliciel.jochre.boundaries.DeterministicBoundaryDetector) OriginalShapeLetterAssigner(com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) LetterAssigner(com.joliciel.jochre.analyser.LetterAssigner) DocumentObserver(com.joliciel.jochre.doc.DocumentObserver) LetterGuesser(com.joliciel.jochre.letterGuesser.LetterGuesser) LetterGuessObserver(com.joliciel.jochre.analyser.LetterGuessObserver) JochreDocumentGenerator(com.joliciel.jochre.doc.JochreDocumentGenerator) PdfDocumentProcessor(com.joliciel.jochre.pdf.PdfDocumentProcessor) OriginalBoundaryDetector(com.joliciel.jochre.boundaries.OriginalBoundaryDetector) JochreException(com.joliciel.jochre.utils.JochreException) LetterFeature(com.joliciel.jochre.letterGuesser.features.LetterFeature) BeamSearchImageAnalyser(com.joliciel.jochre.analyser.BeamSearchImageAnalyser) LetterFeatureParser(com.joliciel.jochre.letterGuesser.features.LetterFeatureParser) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) ImageDocumentExtractor(com.joliciel.jochre.doc.ImageDocumentExtractor)

Example 7 with DocumentObserver

use of com.joliciel.jochre.doc.DocumentObserver in project jochre by urieli.

the class Jochre method doCommandEvaluate.

/**
 * Evaluate a given letter guessing model.
 * <p>
 * Writes {@code [baseName]_errors.txt} and
 * {@code [baseName][_Reconstruct]_fscores.csv} into {@code outputDir}, where
 * {@code baseName} is the letter model file name (without directories and
 * without anything from its first dot onwards) followed by {@code suffix}.
 *
 * @param criteria
 *          the criteria used to select the evaluation corpus
 * @param outputDir
 *          the directory in which the error and f-score files are written
 * @param wordChooser
 *          the word chooser handed to the beam-search image analyser and the
 *          lexicon error writer
 * @param reconstructLetters
 *          if true, boundaries are detected letter by letter using the
 *          training-corpus splitter and merger; otherwise the original
 *          boundaries are used
 * @param save
 *          passed on to the letter assigner via {@code setSave}
 * @param suffix
 *          appended to the base name of all output files
 * @param includeBeam
 *          passed on to the lexicon error writer via {@code setIncludeBeam}
 * @param observers
 *          additional observers to attach to the image processor; may be null
 *          or empty, in which case none are attached
 */
public void doCommandEvaluate(CorpusSelectionCriteria criteria, File outputDir, MostLikelyWordChooser wordChooser, boolean reconstructLetters, boolean save, String suffix, boolean includeBeam, List<DocumentObserver> observers) throws IOException {
    ClassificationModel letterModel = jochreSession.getLetterModel();
    List<String> letterFeatureDescriptors = letterModel.getFeatureDescriptors();
    LetterFeatureParser letterFeatureParser = new LetterFeatureParser();
    Set<LetterFeature<?>> letterFeatures = letterFeatureParser.getLetterFeatureSet(letterFeatureDescriptors);
    LetterGuesser letterGuesser = new LetterGuesser(letterFeatures, letterModel.getDecisionMaker());

    // Strip the directory part first, then the extension, so that a dot in a
    // directory name or a model path without any dot cannot break the result.
    String baseName = jochreSession.getLetterModelPath();
    int lastSlash = baseName.lastIndexOf('/');
    if (lastSlash >= 0)
        baseName = baseName.substring(lastSlash + 1);
    int firstDot = baseName.indexOf('.');
    if (firstDot >= 0)
        baseName = baseName.substring(0, firstDot);
    baseName += suffix;

    BoundaryDetector boundaryDetector;
    if (reconstructLetters) {
        ShapeSplitter splitter = new TrainingCorpusShapeSplitter(jochreSession);
        ShapeMerger merger = new TrainingCorpusShapeMerger();
        boundaryDetector = new LetterByLetterBoundaryDetector(splitter, merger, jochreSession);
    } else {
        boundaryDetector = new OriginalBoundaryDetector();
    }

    ImageAnalyser evaluator = new BeamSearchImageAnalyser(boundaryDetector, letterGuesser, wordChooser, jochreSession);
    FScoreObserver fScoreObserver;
    LetterValidator letterValidator = new ComponentCharacterValidator(jochreSession);
    if (reconstructLetters) {
        OriginalShapeLetterAssigner originalShapeLetterAssigner = new OriginalShapeLetterAssigner();
        originalShapeLetterAssigner.setEvaluate(true);
        originalShapeLetterAssigner.setSave(save);
        originalShapeLetterAssigner.setLetterValidator(letterValidator);
        fScoreObserver = originalShapeLetterAssigner;
    } else {
        LetterAssigner letterAssigner = new LetterAssigner();
        letterAssigner.setSave(save);
        evaluator.addObserver(letterAssigner);
        fScoreObserver = new SimpleLetterFScoreObserver(letterValidator, jochreSession);
    }
    evaluator.addObserver(fScoreObserver);

    ErrorLogger errorLogger = new ErrorLogger(jochreSession);
    File errorFile = new File(outputDir, baseName + "_errors.txt");
    // Start from a fresh error file on every run.
    errorFile.delete();

    // The error writer stays open for the whole processing run and is closed
    // even if setting up the remaining observers fails.
    try (Writer errorWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(errorFile, true), "UTF8"))) {
        errorLogger.setErrorWriter(errorWriter);
        evaluator.addObserver(errorLogger);

        LexiconErrorWriter lexiconErrorWriter = new LexiconErrorWriter(outputDir, baseName, wordChooser, jochreSession);
        if (documentGroups != null)
            lexiconErrorWriter.setDocumentGroups(documentGroups);
        lexiconErrorWriter.setIncludeBeam(includeBeam);

        // find all document names (alphabetical ordering)
        Set<String> documentNameSet = new TreeSet<>();
        JochreCorpusImageReader imageReader1 = new JochreCorpusImageReader(jochreSession);
        CorpusSelectionCriteria docCriteria = new CorpusSelectionCriteria();
        docCriteria.setImageStatusesToInclude(criteria.getImageStatusesToInclude());
        docCriteria.setImageId(criteria.getImageId());
        docCriteria.setDocumentId(criteria.getDocumentId());
        docCriteria.setDocumentIds(criteria.getDocumentIds());
        imageReader1.setSelectionCriteria(docCriteria);
        JochreDocument currentDoc = null;
        while (imageReader1.hasNext()) {
            JochreImage image = imageReader1.next();
            if (!image.getPage().getDocument().equals(currentDoc)) {
                currentDoc = image.getPage().getDocument();
                documentNameSet.add(currentDoc.getName());
            }
        }
        List<String> documentNames = new ArrayList<>(documentNameSet);
        lexiconErrorWriter.setDocumentNames(documentNames);
        evaluator.addObserver(lexiconErrorWriter);

        JochreCorpusImageProcessor imageProcessor = new JochreCorpusImageProcessor(criteria, jochreSession);
        imageProcessor.addObserver(evaluator);
        // Callers pass null when no output format was requested.
        if (observers != null) {
            for (DocumentObserver observer : observers) {
                imageProcessor.addObserver(observer);
            }
        }
        imageProcessor.process();
    }

    LOG.debug("F-score for " + jochreSession.getLetterModelPath() + ": " + fScoreObserver.getFScoreCalculator().getTotalFScore());

    String modelFileName = baseName;
    if (reconstructLetters)
        modelFileName += "_Reconstruct";
    File fscoreFile = new File(outputDir, modelFileName + "_fscores.csv");
    // Opened in append mode; closing the writer guarantees the scores are flushed.
    try (Writer fscoreWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fscoreFile, true), jochreSession.getCsvEncoding()))) {
        fScoreObserver.getFScoreCalculator().writeScoresToCSV(fscoreWriter);
    }
}
Also used : LetterByLetterBoundaryDetector(com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector) OriginalShapeLetterAssigner(com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) BeamSearchImageAnalyser(com.joliciel.jochre.analyser.BeamSearchImageAnalyser) ImageAnalyser(com.joliciel.jochre.analyser.ImageAnalyser) TrainingCorpusShapeMerger(com.joliciel.jochre.boundaries.TrainingCorpusShapeMerger) LexiconErrorWriter(com.joliciel.jochre.lexicon.LexiconErrorWriter) ArrayList(java.util.ArrayList) JochreDocument(com.joliciel.jochre.doc.JochreDocument) BufferedWriter(java.io.BufferedWriter) JochreCorpusImageReader(com.joliciel.jochre.graphics.JochreCorpusImageReader) LetterValidator(com.joliciel.jochre.letterGuesser.LetterValidator) JochreCorpusImageProcessor(com.joliciel.jochre.graphics.JochreCorpusImageProcessor) LetterFeature(com.joliciel.jochre.letterGuesser.features.LetterFeature) TreeSet(java.util.TreeSet) LetterFeatureParser(com.joliciel.jochre.letterGuesser.features.LetterFeatureParser) RecursiveShapeSplitter(com.joliciel.jochre.boundaries.RecursiveShapeSplitter) TrainingCorpusShapeSplitter(com.joliciel.jochre.boundaries.TrainingCorpusShapeSplitter) ShapeSplitter(com.joliciel.jochre.boundaries.ShapeSplitter) JochreImage(com.joliciel.jochre.graphics.JochreImage) CorpusSelectionCriteria(com.joliciel.jochre.graphics.CorpusSelectionCriteria) OriginalBoundaryDetector(com.joliciel.jochre.boundaries.OriginalBoundaryDetector) BoundaryDetector(com.joliciel.jochre.boundaries.BoundaryDetector) LetterByLetterBoundaryDetector(com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector) DeterministicBoundaryDetector(com.joliciel.jochre.boundaries.DeterministicBoundaryDetector) OriginalShapeLetterAssigner(com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) LetterAssigner(com.joliciel.jochre.analyser.LetterAssigner) DocumentObserver(com.joliciel.jochre.doc.DocumentObserver) SimpleLetterFScoreObserver(com.joliciel.jochre.analyser.SimpleLetterFScoreObserver) 
LetterGuesser(com.joliciel.jochre.letterGuesser.LetterGuesser) ErrorLogger(com.joliciel.jochre.analyser.ErrorLogger) SimpleLetterFScoreObserver(com.joliciel.jochre.analyser.SimpleLetterFScoreObserver) FScoreObserver(com.joliciel.jochre.analyser.FScoreObserver) OriginalBoundaryDetector(com.joliciel.jochre.boundaries.OriginalBoundaryDetector) TrainingCorpusShapeMerger(com.joliciel.jochre.boundaries.TrainingCorpusShapeMerger) ShapeMerger(com.joliciel.jochre.boundaries.ShapeMerger) FileOutputStream(java.io.FileOutputStream) BeamSearchImageAnalyser(com.joliciel.jochre.analyser.BeamSearchImageAnalyser) OutputStreamWriter(java.io.OutputStreamWriter) TrainingCorpusShapeSplitter(com.joliciel.jochre.boundaries.TrainingCorpusShapeSplitter) ComponentCharacterValidator(com.joliciel.jochre.letterGuesser.ComponentCharacterValidator) File(java.io.File) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) UnknownWordListWriter(com.joliciel.jochre.lexicon.UnknownWordListWriter) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) LexiconErrorWriter(com.joliciel.jochre.lexicon.LexiconErrorWriter) OutputStreamWriter(java.io.OutputStreamWriter)

Example 8 with DocumentObserver

use of com.joliciel.jochre.doc.DocumentObserver in project jochre by urieli.

the class Jochre method execute.

/**
 * Parses the argument map and runs the command named by its {@code command}
 * entry. Any unknown argument name or unknown command results in a
 * {@link RuntimeException}.
 * <p>
 * Usage (* indicates optional):<br/>
 * Jochre load [filename] [isoLanguageCode] [firstPage]* [lastPage]*<br/>
 * Loads a file (pdf or image) and segments it into letters. The analysed
 * version is stored in the persistent store. Writes [filename].xml to the same
 * location, to enable the user to indicate the text to associate with this
 * file.<br/>
 * Jochre extract [filename] [outputDirectory] [firstPage]* [lastPage]*<br/>
 * Extracts images from a pdf file.<br/>
 *
 * @param argMap
 *          argument names mapped to their string values; the
 *          {@code logConfigFile} entry, if present, is removed from the map
 * @throws Exception
 *           any exception raised while running the command is logged and
 *           rethrown
 */
public void execute(Map<String, String> argMap) throws Exception {
    if (argMap.size() == 0) {
        System.out.println("See jochre wiki for usage");
        return;
    }
    String logConfigPath = argMap.get("logConfigFile");
    if (logConfigPath != null) {
        argMap.remove("logConfigFile");
        JochreLogUtils.configureLogging(logConfigPath);
    }
    String command = "";
    String inFilePath = "";
    String inDirPath = null;
    String userFriendlyName = "";
    String outputDirPath = null;
    String outputFilePath = null;
    int firstPage = -1;
    int lastPage = -1;
    Set<Integer> pages = Collections.emptySet();
    int shapeId = -1;
    int docId = -1;
    int imageId = 0;
    int userId = -1;
    int imageCount = 0;
    int multiplier = 0;
    boolean save = false;
    ImageStatus[] imageSet = null;
    boolean reconstructLetters = false;
    int excludeImageId = 0;
    int crossValidationSize = -1;
    int includeIndex = -1;
    int excludeIndex = -1;
    Set<Integer> documentSet = null;
    String suffix = "";
    String docGroupPath = null;
    boolean includeBeam = false;
    List<OutputFormat> outputFormats = new ArrayList<>();
    String docSelectionPath = null;
    List<String> featureDescriptors = null;
    boolean includeDate = false;
    for (Entry<String, String> argMapEntry : argMap.entrySet()) {
        String argName = argMapEntry.getKey();
        String argValue = argMapEntry.getValue();
        if (argName.equals("command"))
            command = argValue;
        else if (argName.equals("file"))
            inFilePath = argValue;
        else if (argName.equals("name"))
            userFriendlyName = argValue;
        else if (argName.equals("first"))
            firstPage = Integer.parseInt(argValue);
        else if (argName.equals("last"))
            lastPage = Integer.parseInt(argValue);
        else if (argName.equals("pages")) {
            // Page list syntax: comma-separated numbers and closed ranges, e.g. "1,3-5,8".
            // The split keeps the delimiters as separate tokens.
            final String WITH_DELIMITER = "((?<=%1$s)|(?=%1$s))";
            final Pattern numberPattern = Pattern.compile("\\d+");
            final String[] parts = argValue.split(String.format(WITH_DELIMITER, "[\\-,]"));
            // number: last number read and not yet added (-1 if none)
            int number = -1;
            // inRange: a "-" was read and the upper bound is still expected
            boolean inRange = false;
            final Set<Integer> myPages = new HashSet<>();
            for (String part : parts) {
                if (numberPattern.matcher(part).matches()) {
                    int lowerBound = number;
                    number = Integer.parseInt(part);
                    if (inRange) {
                        if (lowerBound > number)
                            throw new IllegalArgumentException("Lower bound (" + lowerBound + ") greater than upper bound (" + number + "): " + argValue);
                        IntStream.rangeClosed(lowerBound, number).forEach(i -> myPages.add(i));
                        number = -1;
                        inRange = false;
                    }
                } else if (part.equals(",")) {
                    if (number >= 0)
                        myPages.add(number);
                    number = -1;
                } else if (part.equals("-")) {
                    if (inRange)
                        throw new IllegalArgumentException("Unable to parse pages (unclosed range): " + argValue);
                    if (number < 0)
                        throw new IllegalArgumentException("Range without lower bound: " + argValue);
                    inRange = true;
                } else {
                    throw new IllegalArgumentException("Unable to parse pages - unexpected character '" + part + "': " + argValue);
                }
            }
            if (inRange) {
                throw new IllegalArgumentException("Unable to parse pages (unclosed range): " + argValue);
            }
            if (number >= 0)
                myPages.add(number);
            pages = myPages;
        } else if (argName.equals("inDir"))
            inDirPath = argValue;
        else if (argName.equals("outDir"))
            outputDirPath = argValue;
        else if (argName.equals("outputFile"))
            outputFilePath = argValue;
        else if (argName.equals("save"))
            save = (argValue.equals("true"));
        else if (argName.equals("shapeId"))
            shapeId = Integer.parseInt(argValue);
        else if (argName.equals("imageId"))
            imageId = Integer.parseInt(argValue);
        else if (argName.equals("docId"))
            docId = Integer.parseInt(argValue);
        else if (argName.equals("userId"))
            userId = Integer.parseInt(argValue);
        else if (argName.equals("imageCount"))
            imageCount = Integer.parseInt(argValue);
        else if (argName.equals("multiplier"))
            multiplier = Integer.parseInt(argValue);
        else if (argName.equals("imageStatus")) {
            String[] statusCodes = argValue.split(",");
            Set<ImageStatus> imageStasuses = new HashSet<>();
            for (String statusCode : statusCodes) {
                if (statusCode.equals("heldOut"))
                    imageStasuses.add(ImageStatus.TRAINING_HELD_OUT);
                else if (statusCode.equals("test"))
                    imageStasuses.add(ImageStatus.TRAINING_TEST);
                else if (statusCode.equals("training"))
                    imageStasuses.add(ImageStatus.TRAINING_VALIDATED);
                else if (statusCode.equals("all")) {
                    imageStasuses.add(ImageStatus.TRAINING_VALIDATED);
                    imageStasuses.add(ImageStatus.TRAINING_HELD_OUT);
                    imageStasuses.add(ImageStatus.TRAINING_TEST);
                } else
                    throw new RuntimeException("Unknown imageSet: " + statusCode);
            }
            imageSet = new ImageStatus[imageStasuses.size()];
            int i = 0;
            for (ImageStatus imageStatus : imageStasuses) {
                imageSet[i++] = imageStatus;
            }
        } else if (argName.equals("reconstructLetters"))
            reconstructLetters = (argValue.equals("true"));
        else if (argName.equals("excludeImageId"))
            excludeImageId = Integer.parseInt(argValue);
        else if (argName.equals("crossValidationSize"))
            crossValidationSize = Integer.parseInt(argValue);
        else if (argName.equals("includeIndex"))
            includeIndex = Integer.parseInt(argValue);
        else if (argName.equals("excludeIndex"))
            excludeIndex = Integer.parseInt(argValue);
        else if (argName.equals("docSet")) {
            String[] docIdArray = argValue.split(",");
            documentSet = new HashSet<>();
            for (String docIdString : docIdArray) {
                int oneId = Integer.parseInt(docIdString);
                documentSet.add(oneId);
            }
        } else if (argName.equals("docSelection")) {
            docSelectionPath = argValue;
        } else if (argName.equals("docGroupFile"))
            docGroupPath = argValue;
        else if (argName.equals("suffix"))
            suffix = argValue;
        else if (argName.equals("includeBeam"))
            includeBeam = argValue.equalsIgnoreCase("true");
        else if (argName.equals("outputFormat")) {
            outputFormats = new ArrayList<>();
            String[] outputFormatStrings = argValue.split(",");
            for (String outputFormatString : outputFormatStrings) {
                outputFormats.add(OutputFormat.valueOf(outputFormatString));
            }
            if (outputFormats.size() == 0)
                throw new JochreException("At least one outputFormat required.");
        } else if (argName.equals("features")) {
            // One feature descriptor per line of the given file.
            featureDescriptors = new ArrayList<>();
            InputStream featureFile = new FileInputStream(new File(argValue));
            try (Scanner scanner = new Scanner(new BufferedReader(new InputStreamReader(featureFile, "UTF-8")))) {
                while (scanner.hasNextLine()) {
                    String descriptor = scanner.nextLine();
                    featureDescriptors.add(descriptor);
                    LOG.debug(descriptor);
                }
            }
        } else if (argName.equals("includeDate")) {
            includeDate = argValue.equalsIgnoreCase("true");
        } else {
            throw new RuntimeException("Unknown argument: " + argName);
        }
    }
    // An explicit "pages" argument wins; otherwise build the set from first/last.
    if (pages.isEmpty() && (firstPage >= 0 || lastPage >= 0)) {
        if (firstPage < 0)
            firstPage = 0;
        if (lastPage < 0)
            lastPage = config.getInt("jochre.pdf.max-page");
        pages = IntStream.rangeClosed(firstPage, lastPage).boxed().collect(Collectors.toSet());
    }
    long startTime = System.currentTimeMillis();
    try {
        this.setUserId(userId);
        CorpusSelectionCriteria criteria = new CorpusSelectionCriteria();
        if (docSelectionPath != null) {
            File docSelectionFile = new File(docSelectionPath);
            // try-with-resources: the file is released even if loading fails
            try (Scanner scanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(docSelectionFile), jochreSession.getEncoding())))) {
                criteria.loadSelection(scanner);
            }
        } else {
            criteria.setImageId(imageId);
            criteria.setImageCount(imageCount);
            if (imageSet != null)
                criteria.setImageStatusesToInclude(imageSet);
            criteria.setExcludeImageId(excludeImageId);
            criteria.setCrossValidationSize(crossValidationSize);
            criteria.setIncludeIndex(includeIndex);
            criteria.setExcludeIndex(excludeIndex);
            criteria.setDocumentId(docId);
            criteria.setDocumentIds(documentSet);
        }
        if (LOG.isDebugEnabled())
            LOG.debug(criteria.getAttributes().toString());
        if (docGroupPath != null) {
            // Each line has the form groupName=id1,id2,...
            File docGroupFile = new File(docGroupPath);
            try (Scanner scanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(docGroupFile), jochreSession.getEncoding())))) {
                while (scanner.hasNextLine()) {
                    String line = scanner.nextLine();
                    if (line.trim().isEmpty())
                        continue;
                    int equalsPos = line.indexOf('=');
                    if (equalsPos < 0)
                        throw new JochreException("Invalid line in docGroupFile (expected groupName=id1,id2,...): " + line);
                    String groupName = line.substring(0, equalsPos);
                    String[] ids = line.substring(equalsPos + 1).split(",");
                    Set<Integer> idSet = new HashSet<>();
                    for (String idString : ids) {
                        idSet.add(Integer.parseInt(idString));
                    }
                    documentGroups.put(groupName, idSet);
                }
            }
        }
        MostLikelyWordChooser wordChooser = new MostLikelyWordChooser(jochreSession);
        File outputDir = null;
        File outputFile = null;
        if (outputDirPath != null) {
            outputDir = new File(outputDirPath);
        } else if (outputFilePath != null) {
            outputFile = new File(outputFilePath);
            outputDir = outputFile.getParentFile();
        }
        if (outputDir != null)
            outputDir.mkdirs();
        // Both lists stay null when no output format was requested.
        List<DocumentObserver> observers = null;
        List<PdfImageObserver> imageObservers = null;
        // analyseFolder builds its own observers, one set per PDF file.
        if (outputFormats.size() > 0 && !command.equals("analyseFolder")) {
            if (outputDir == null) {
                throw new JochreException("Either outputDir our outputFile are required with outputFormats");
            }
            String baseName = null;
            if (userFriendlyName != null && userFriendlyName.length() > 0) {
                baseName = userFriendlyName;
            } else if (inFilePath != null && inFilePath.length() > 0) {
                File inFile = new File(inFilePath);
                baseName = this.getBaseName(inFile);
            }
            observers = this.getObservers(outputFormats, baseName, outputDir, includeDate);
            imageObservers = this.getImageObservers(outputFormats, baseName, outputDir);
        }
        if (userFriendlyName.length() == 0)
            userFriendlyName = inFilePath;
        if (command.equals("segment")) {
            this.doCommandSegment(inFilePath, userFriendlyName, outputDir, save, pages);
        } else if (command.equals("extract")) {
            this.doCommandExtractImages(inFilePath, outputDir, pages);
        } else if (command.equals("updateImages")) {
            this.doCommandUpdateImages(inFilePath, docId, pages);
        } else if (command.equals("applyFeatures")) {
            this.doCommandApplyFeatures(imageId, shapeId, featureDescriptors);
        } else if (command.equals("train")) {
            this.doCommandTrain(featureDescriptors, criteria, reconstructLetters);
        } else if (command.equals("evaluate") || command.equals("evaluateComplex")) {
            this.doCommandEvaluate(criteria, outputDir, wordChooser, reconstructLetters, save, suffix, includeBeam, observers);
        } else if (command.equals("evaluateFull")) {
            this.doCommandEvaluateFull(criteria, save, outputDir, wordChooser, suffix, observers);
        } else if (command.equals("analyse")) {
            this.doCommandAnalyse(criteria, wordChooser, observers);
        } else if (command.equals("transform")) {
            this.doCommandTransform(criteria, observers, imageObservers);
        } else if (command.equals("trainSplits")) {
            this.doCommandTrainSplits(featureDescriptors, criteria);
        } else if (command.equals("evaluateSplits")) {
            this.doCommandEvaluateSplits(criteria);
        } else if (command.equals("trainMerge")) {
            this.doCommandTrainMerge(featureDescriptors, multiplier, criteria);
        } else if (command.equals("evaluateMerge")) {
            this.doCommandEvaluateMerge(criteria);
        } else if (command.equals("logImage")) {
            this.doCommandLogImage(shapeId);
        } else if (command.equals("testFeature")) {
            this.doCommandTestFeature(shapeId);
        } else if (command.equals("serializeLexicon")) {
            if (outputDir == null) {
                throw new JochreException("Either outputDir our outputFile are required for " + command);
            }
            File inputFile = new File(inFilePath);
            if (inputFile.isDirectory()) {
                File[] lexiconFiles = inputFile.listFiles();
                // listFiles() returns null on an I/O error
                if (lexiconFiles == null)
                    throw new JochreException("Unable to list files in directory: " + inputFile.getPath());
                for (File oneLexFile : lexiconFiles) {
                    LOG.debug(oneLexFile.getName() + ": " + ", size: " + oneLexFile.length());
                    TextFileLexicon lexicon = new TextFileLexicon(oneLexFile, jochreSession.getEncoding());
                    // File name up to its first dot, or the whole name if it has none.
                    String lexiconName = oneLexFile.getName();
                    int dotPos = lexiconName.indexOf('.');
                    String baseName = dotPos >= 0 ? lexiconName.substring(0, dotPos) : lexiconName;
                    File lexiconFile = new File(outputDir, baseName + ".obj");
                    lexicon.serialize(lexiconFile);
                }
            } else {
                LOG.debug(inFilePath + ": " + inputFile.exists() + ", size: " + inputFile.length());
                TextFileLexicon lexicon = new TextFileLexicon(inputFile, jochreSession.getEncoding());
                // Strip the directory part first, then the extension, so that a dot
                // in a directory name or a path without any dot cannot break this.
                String baseName = inFilePath;
                int lastSlash = baseName.lastIndexOf('/');
                if (lastSlash >= 0)
                    baseName = baseName.substring(lastSlash + 1);
                int dotPos = baseName.indexOf('.');
                if (dotPos >= 0)
                    baseName = baseName.substring(0, dotPos);
                File lexiconFile = outputFile;
                if (lexiconFile == null)
                    lexiconFile = new File(outputDir, baseName + ".obj");
                lexicon.serialize(lexiconFile);
            }
        } else if (command.equals("analyseFolder")) {
            if (inDirPath == null)
                throw new JochreException("inDir is required for " + command);
            File inDir = new File(inDirPath);
            File[] pdfFiles = inDir.listFiles(new FilenameFilter() {

                @Override
                public boolean accept(File dir, String name) {
                    // locale-independent lower-casing for extension matching
                    return (name.toLowerCase(java.util.Locale.ROOT).endsWith(".pdf"));
                }
            });
            // listFiles() returns null if inDir is not a directory or on an I/O error
            if (pdfFiles == null)
                throw new JochreException("Unable to list files in directory: " + inDir.getPath());
            Arrays.sort(pdfFiles);
            for (File pdfFile : pdfFiles) {
                LOG.info("Analysing file: " + pdfFile.getAbsolutePath());
                try {
                    // Analyse into a working directory next to the PDF, then move
                    // the PDF and all analysis results into the output directory.
                    String baseName = this.getBaseName(pdfFile);
                    File analysisDir = new File(inDir, baseName);
                    analysisDir.mkdirs();
                    List<DocumentObserver> pdfObservers = this.getObservers(outputFormats, baseName, analysisDir, includeDate);
                    List<PdfImageObserver> pdfImageObservers = this.getImageObservers(outputFormats, baseName, analysisDir);
                    this.doCommandAnalyse(pdfFile, wordChooser, pages, pdfObservers, pdfImageObservers);
                    File pdfOutputDir = new File(outputDir, baseName);
                    pdfOutputDir.mkdirs();
                    File targetFile = new File(pdfOutputDir, pdfFile.getName());
                    Files.move(pdfFile.toPath(), targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
                    File[] analysisFiles = analysisDir.listFiles();
                    for (File analysisFile : analysisFiles) {
                        targetFile = new File(pdfOutputDir, analysisFile.getName());
                        Files.move(analysisFile.toPath(), targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
                    }
                    Files.delete(analysisDir.toPath());
                } catch (Exception e) {
                    // log errors, but continue processing
                    LOG.error("Error processing file: " + pdfFile.getAbsolutePath(), e);
                }
            }
        } else if (command.equals("analyseFile")) {
            File pdfFile = new File(inFilePath);
            this.doCommandAnalyse(pdfFile, wordChooser, pages, observers, imageObservers);
        } else if (command.equals("findSplits")) {
            GraphicsDao graphicsDao = GraphicsDao.getInstance(jochreSession);
            List<Shape> shapesToSplit = graphicsDao.findShapesToSplit(jochreSession.getLocale());
            for (Shape shape : shapesToSplit) {
                LOG.info(shape.toString());
            }
        } else {
            throw new RuntimeException("Unknown command: " + command);
        }
    } catch (Exception e) {
        LOG.error("An error occurred while running Jochre", e);
        throw e;
    } finally {
        long duration = System.currentTimeMillis() - startTime;
        LOG.info("Duration (ms):" + duration);
    }
    LOG.info("#### finished #####");
}
Also used : Arrays(java.util.Arrays) SplitCandidateFinder(com.joliciel.jochre.boundaries.SplitCandidateFinder) OriginalBoundaryDetector(com.joliciel.jochre.boundaries.OriginalBoundaryDetector) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) JochreMergeEventStream(com.joliciel.jochre.boundaries.JochreMergeEventStream) Map(java.util.Map) LetterGuesser(com.joliciel.jochre.letterGuesser.LetterGuesser) Set(java.util.Set) CorpusSelectionCriteria(com.joliciel.jochre.graphics.CorpusSelectionCriteria) LetterFeatureTester(com.joliciel.jochre.letterGuesser.features.LetterFeatureTester) JochreCorpusGroupReader(com.joliciel.jochre.graphics.JochreCorpusGroupReader) TextGetter(com.joliciel.jochre.output.TextGetter) JochrePage(com.joliciel.jochre.doc.JochrePage) LetterFeature(com.joliciel.jochre.letterGuesser.features.LetterFeature) SplitEvaluator(com.joliciel.jochre.boundaries.SplitEvaluator) UnknownWordListWriter(com.joliciel.jochre.lexicon.UnknownWordListWriter) JochreCorpusShapeReader(com.joliciel.jochre.graphics.JochreCorpusShapeReader) FilenameFilter(java.io.FilenameFilter) OutcomeEqualiserEventStream(com.joliciel.talismane.machineLearning.OutcomeEqualiserEventStream) TreeSet(java.util.TreeSet) StandardCopyOption(java.nio.file.StandardCopyOption) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) ClassificationEventStream(com.joliciel.talismane.machineLearning.ClassificationEventStream) JochreDocumentGenerator(com.joliciel.jochre.doc.JochreDocumentGenerator) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon) TextExporter(com.joliciel.jochre.output.TextExporter) Files(java.nio.file.Files) Config(com.typesafe.config.Config) BufferedWriter(java.io.BufferedWriter) BeamSearchImageAnalyser(com.joliciel.jochre.analyser.BeamSearchImageAnalyser) TextFormat(com.joliciel.jochre.output.TextGetter.TextFormat) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) 
InputStreamReader(java.io.InputStreamReader) JochreXMLExporter(com.joliciel.jochre.output.JochreXMLExporter) File(java.io.File) PdfImageObserver(com.joliciel.jochre.utils.pdf.PdfImageObserver) BoundaryDetector(com.joliciel.jochre.boundaries.BoundaryDetector) JochreCorpusImageReader(com.joliciel.jochre.graphics.JochreCorpusImageReader) MetaDataExporter(com.joliciel.jochre.output.MetaDataExporter) LetterFeatureParser(com.joliciel.jochre.letterGuesser.features.LetterFeatureParser) BufferedReader(java.io.BufferedReader) JochreLetterEventStream(com.joliciel.jochre.letterGuesser.JochreLetterEventStream) ClassificationModelTrainer(com.joliciel.talismane.machineLearning.ClassificationModelTrainer) LetterByLetterBoundaryDetector(com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector) FScoreCalculator(com.joliciel.jochre.stats.FScoreCalculator) SourceFileProcessor(com.joliciel.jochre.doc.SourceFileProcessor) PdfImageVisitor(com.joliciel.jochre.utils.pdf.PdfImageVisitor) LoggerFactory(org.slf4j.LoggerFactory) Scanner(java.util.Scanner) RecursiveShapeSplitter(com.joliciel.jochre.boundaries.RecursiveShapeSplitter) GraphicsDao(com.joliciel.jochre.graphics.GraphicsDao) ImageStatus(com.joliciel.jochre.graphics.ImageStatus) MostLikelyWordChooser(com.joliciel.jochre.lexicon.MostLikelyWordChooser) PdfImageSaver(com.joliciel.jochre.pdf.PdfImageSaver) ComponentCharacterValidator(com.joliciel.jochre.letterGuesser.ComponentCharacterValidator) DocumentDao(com.joliciel.jochre.doc.DocumentDao) MergeEvaluator(com.joliciel.jochre.boundaries.MergeEvaluator) TrainingCorpusShapeMerger(com.joliciel.jochre.boundaries.TrainingCorpusShapeMerger) JochreLogUtils(com.joliciel.jochre.utils.JochreLogUtils) LetterGuessObserver(com.joliciel.jochre.analyser.LetterGuessObserver) BufferedImage(java.awt.image.BufferedImage) ErrorLogger(com.joliciel.jochre.analyser.ErrorLogger) Shape(com.joliciel.jochre.graphics.Shape) 
TrainingCorpusShapeSplitter(com.joliciel.jochre.boundaries.TrainingCorpusShapeSplitter) ImageDocumentExtractor(com.joliciel.jochre.doc.ImageDocumentExtractor) Collectors(java.util.stream.Collectors) List(java.util.List) MergeFeatureParser(com.joliciel.jochre.boundaries.features.MergeFeatureParser) LetterValidator(com.joliciel.jochre.letterGuesser.LetterValidator) Writer(java.io.Writer) Entry(java.util.Map.Entry) ModelTrainerFactory(com.joliciel.talismane.machineLearning.ModelTrainerFactory) PdfDocumentProcessor(com.joliciel.jochre.pdf.PdfDocumentProcessor) ShapeFeature(com.joliciel.jochre.graphics.features.ShapeFeature) Pattern(java.util.regex.Pattern) ImageAnalyser(com.joliciel.jochre.analyser.ImageAnalyser) OriginalShapeLetterAssigner(com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) MergeFeature(com.joliciel.jochre.boundaries.features.MergeFeature) DocumentObserver(com.joliciel.jochre.doc.DocumentObserver) JochreException(com.joliciel.jochre.utils.JochreException) AbbyyFineReader8Exporter(com.joliciel.jochre.output.AbbyyFineReader8Exporter) IntStream(java.util.stream.IntStream) JochreDocument(com.joliciel.jochre.doc.JochreDocument) JochreCorpusImageProcessor(com.joliciel.jochre.graphics.JochreCorpusImageProcessor) HashMap(java.util.HashMap) JochrePageByPageExporter(com.joliciel.jochre.output.JochrePageByPageExporter) JochreSplitEventStream(com.joliciel.jochre.boundaries.JochreSplitEventStream) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) HashSet(java.util.HashSet) User(com.joliciel.jochre.security.User) LexiconErrorWriter(com.joliciel.jochre.lexicon.LexiconErrorWriter) SplitFeature(com.joliciel.jochre.boundaries.features.SplitFeature) SplitFeatureParser(com.joliciel.jochre.boundaries.features.SplitFeatureParser) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult) OutputStreamWriter(java.io.OutputStreamWriter) ConfigFactory(com.typesafe.config.ConfigFactory) 
DeterministicBoundaryDetector(com.joliciel.jochre.boundaries.DeterministicBoundaryDetector) CorpusLexiconBuilder(com.joliciel.jochre.lexicon.CorpusLexiconBuilder) Logger(org.slf4j.Logger) SecurityDao(com.joliciel.jochre.security.SecurityDao) FileInputStream(java.io.FileInputStream) JochreImage(com.joliciel.jochre.graphics.JochreImage) SimpleLetterFScoreObserver(com.joliciel.jochre.analyser.SimpleLetterFScoreObserver) LetterAssigner(com.joliciel.jochre.analyser.LetterAssigner) VerticalElongationFeature(com.joliciel.jochre.graphics.features.VerticalElongationFeature) ShapeMerger(com.joliciel.jochre.boundaries.ShapeMerger) FScoreObserver(com.joliciel.jochre.analyser.FScoreObserver) ShapeSplitter(com.joliciel.jochre.boundaries.ShapeSplitter) Collections(java.util.Collections) AltoXMLExporter(com.joliciel.jochre.output.AltoXMLExporter) InputStream(java.io.InputStream) Scanner(java.util.Scanner) Set(java.util.Set) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Shape(com.joliciel.jochre.graphics.Shape) ImageStatus(com.joliciel.jochre.graphics.ImageStatus) ArrayList(java.util.ArrayList) FilenameFilter(java.io.FilenameFilter) JochreException(com.joliciel.jochre.utils.JochreException) GraphicsDao(com.joliciel.jochre.graphics.GraphicsDao) HashSet(java.util.HashSet) PdfImageObserver(com.joliciel.jochre.utils.pdf.PdfImageObserver) Pattern(java.util.regex.Pattern) InputStreamReader(java.io.InputStreamReader) CorpusSelectionCriteria(com.joliciel.jochre.graphics.CorpusSelectionCriteria) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) DocumentObserver(com.joliciel.jochre.doc.DocumentObserver) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) JochreException(com.joliciel.jochre.utils.JochreException) BufferedReader(java.io.BufferedReader) MostLikelyWordChooser(com.joliciel.jochre.lexicon.MostLikelyWordChooser) File(java.io.File)

Aggregations

DocumentObserver (com.joliciel.jochre.doc.DocumentObserver) 8, BeamSearchImageAnalyser (com.joliciel.jochre.analyser.BeamSearchImageAnalyser) 5, ImageAnalyser (com.joliciel.jochre.analyser.ImageAnalyser) 5, OriginalShapeLetterAssigner (com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) 5, JochreCorpusImageProcessor (com.joliciel.jochre.graphics.JochreCorpusImageProcessor) 5, LetterGuesser (com.joliciel.jochre.letterGuesser.LetterGuesser) 5, LetterFeature (com.joliciel.jochre.letterGuesser.features.LetterFeature) 5, LetterFeatureParser (com.joliciel.jochre.letterGuesser.features.LetterFeatureParser) 5, ClassificationModel (com.joliciel.talismane.machineLearning.ClassificationModel) 5, LetterAssigner (com.joliciel.jochre.analyser.LetterAssigner) 4, BoundaryDetector (com.joliciel.jochre.boundaries.BoundaryDetector) 4, DeterministicBoundaryDetector (com.joliciel.jochre.boundaries.DeterministicBoundaryDetector) 4, LetterByLetterBoundaryDetector (com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector) 4, OriginalBoundaryDetector (com.joliciel.jochre.boundaries.OriginalBoundaryDetector) 4, ErrorLogger (com.joliciel.jochre.analyser.ErrorLogger) 3, RecursiveShapeSplitter (com.joliciel.jochre.boundaries.RecursiveShapeSplitter) 3, ShapeMerger (com.joliciel.jochre.boundaries.ShapeMerger) 3, ShapeSplitter (com.joliciel.jochre.boundaries.ShapeSplitter) 3, TrainingCorpusShapeMerger (com.joliciel.jochre.boundaries.TrainingCorpusShapeMerger) 3, TrainingCorpusShapeSplitter (com.joliciel.jochre.boundaries.TrainingCorpusShapeSplitter) 3