Search in sources :

Example 51 with Shape

use of com.joliciel.jochre.graphics.Shape in project jochre by urieli.

the class Jochre method execute.

/**
 * Runs a single Jochre command described by a map of named arguments.
 * <p>
 * If the map is empty, a pointer to the wiki is printed and nothing else
 * happens. Otherwise each entry is parsed into a local setting (an unknown
 * argument name raises a {@link RuntimeException}) and the action named by the
 * <code>command</code> argument is dispatched to the matching
 * <code>doCommand...</code> method.
 * </p>
 * <p>
 * Commands recognised by this method: segment, extract, updateImages,
 * applyFeatures, train, evaluate, evaluateComplex, evaluateFull, analyse,
 * transform, trainSplits, evaluateSplits, trainMerge, evaluateMerge, logImage,
 * testFeature, serializeLexicon, analyseFolder, analyseFile, findSplits.
 * </p>
 * <p>
 * Pages may be given either through <code>pages</code> (comma-separated
 * numbers and closed ranges such as <code>1,3-5</code>) or through
 * <code>first</code>/<code>last</code>; the latter are only used when
 * <code>pages</code> yields no page.
 * </p>
 *
 * @param argMap
 *            argument names mapped to their raw string values; the
 *            <code>logConfigFile</code> entry, if present, is removed from
 *            this map
 * @throws Exception
 *             on any failure while parsing arguments or running the command;
 *             failures occurring while running the command are logged before
 *             being rethrown
 */
public void execute(Map<String, String> argMap) throws Exception {
    if (argMap.size() == 0) {
        System.out.println("See jochre wiki for usage");
        return;
    }
    // Logging configuration is consumed first so that it is not reported as an
    // unknown argument by the parsing loop below.
    String logConfigPath = argMap.get("logConfigFile");
    if (logConfigPath != null) {
        argMap.remove("logConfigFile");
        JochreLogUtils.configureLogging(logConfigPath);
    }
    String command = "";
    String inFilePath = "";
    String inDirPath = null;
    String userFriendlyName = "";
    String outputDirPath = null;
    String outputFilePath = null;
    int firstPage = -1;
    int lastPage = -1;
    Set<Integer> pages = Collections.emptySet();
    int shapeId = -1;
    int docId = -1;
    int imageId = 0;
    int userId = -1;
    int imageCount = 0;
    int multiplier = 0;
    boolean save = false;
    ImageStatus[] imageSet = null;
    boolean reconstructLetters = false;
    int excludeImageId = 0;
    int crossValidationSize = -1;
    int includeIndex = -1;
    int excludeIndex = -1;
    Set<Integer> documentSet = null;
    String suffix = "";
    String docGroupPath = null;
    boolean includeBeam = false;
    List<OutputFormat> outputFormats = new ArrayList<>();
    String docSelectionPath = null;
    List<String> featureDescriptors = null;
    boolean includeDate = false;
    for (Entry<String, String> argMapEntry : argMap.entrySet()) {
        String argName = argMapEntry.getKey();
        String argValue = argMapEntry.getValue();
        if (argName.equals("command"))
            command = argValue;
        else if (argName.equals("file"))
            inFilePath = argValue;
        else if (argName.equals("name"))
            userFriendlyName = argValue;
        else if (argName.equals("first"))
            firstPage = Integer.parseInt(argValue);
        else if (argName.equals("last"))
            lastPage = Integer.parseInt(argValue);
        else if (argName.equals("pages")) {
            // Split on '-' and ',' while keeping the delimiters as separate
            // tokens (zero-width look-behind / look-ahead).
            final String WITH_DELIMITER = "((?<=%1$s)|(?=%1$s))";
            final Pattern numberPattern = Pattern.compile("\\d+");
            final String[] parts = argValue.split(String.format(WITH_DELIMITER, "[\\-,]"));
            // "number" holds the last number read that has not yet been added
            // to the set; -1 means there is none pending.
            int number = -1;
            boolean inRange = false;
            final Set<Integer> myPages = new HashSet<>();
            for (String part : parts) {
                if (numberPattern.matcher(part).matches()) {
                    int lowerBound = number;
                    number = Integer.parseInt(part);
                    if (inRange) {
                        if (lowerBound > number)
                            throw new IllegalArgumentException("Lower bound (" + lowerBound + ") greater than upper bound (" + number + "): " + argValue);
                        IntStream.rangeClosed(lowerBound, number).forEach(i -> myPages.add(i));
                        number = -1;
                        inRange = false;
                    }
                } else if (part.equals(",")) {
                    if (number >= 0)
                        myPages.add(number);
                    number = -1;
                } else if (part.equals("-")) {
                    if (inRange)
                        throw new IllegalArgumentException("Unable to parse pages (unclosed range): " + argValue);
                    if (number < 0)
                        throw new IllegalArgumentException("Range without lower bound: " + argValue);
                    inRange = true;
                } else {
                    throw new IllegalArgumentException("Unable to parse pages - unexpected character '" + part + "': " + argValue);
                }
            }
            if (inRange) {
                throw new IllegalArgumentException("Unable to parse pages (unclosed range): " + argValue);
            }
            if (number >= 0)
                myPages.add(number);
            pages = myPages;
        } else if (argName.equals("inDir"))
            inDirPath = argValue;
        else if (argName.equals("outDir"))
            outputDirPath = argValue;
        else if (argName.equals("outputFile"))
            outputFilePath = argValue;
        else if (argName.equals("save"))
            save = (argValue.equals("true"));
        else if (argName.equals("shapeId"))
            shapeId = Integer.parseInt(argValue);
        else if (argName.equals("imageId"))
            imageId = Integer.parseInt(argValue);
        else if (argName.equals("docId"))
            docId = Integer.parseInt(argValue);
        else if (argName.equals("userId"))
            userId = Integer.parseInt(argValue);
        else if (argName.equals("imageCount"))
            imageCount = Integer.parseInt(argValue);
        else if (argName.equals("multiplier"))
            multiplier = Integer.parseInt(argValue);
        else if (argName.equals("imageStatus")) {
            String[] statusCodes = argValue.split(",");
            Set<ImageStatus> imageStatuses = new HashSet<>();
            for (String statusCode : statusCodes) {
                if (statusCode.equals("heldOut"))
                    imageStatuses.add(ImageStatus.TRAINING_HELD_OUT);
                else if (statusCode.equals("test"))
                    imageStatuses.add(ImageStatus.TRAINING_TEST);
                else if (statusCode.equals("training"))
                    imageStatuses.add(ImageStatus.TRAINING_VALIDATED);
                else if (statusCode.equals("all")) {
                    imageStatuses.add(ImageStatus.TRAINING_VALIDATED);
                    imageStatuses.add(ImageStatus.TRAINING_HELD_OUT);
                    imageStatuses.add(ImageStatus.TRAINING_TEST);
                } else
                    throw new RuntimeException("Unknown imageSet: " + statusCode);
            }
            imageSet = imageStatuses.toArray(new ImageStatus[0]);
        } else if (argName.equals("reconstructLetters"))
            reconstructLetters = (argValue.equals("true"));
        else if (argName.equals("excludeImageId"))
            excludeImageId = Integer.parseInt(argValue);
        else if (argName.equals("crossValidationSize"))
            crossValidationSize = Integer.parseInt(argValue);
        else if (argName.equals("includeIndex"))
            includeIndex = Integer.parseInt(argValue);
        else if (argName.equals("excludeIndex"))
            excludeIndex = Integer.parseInt(argValue);
        else if (argName.equals("docSet")) {
            String[] docIdArray = argValue.split(",");
            documentSet = new HashSet<>();
            for (String docIdString : docIdArray) {
                int oneId = Integer.parseInt(docIdString);
                documentSet.add(oneId);
            }
        } else if (argName.equals("docSelection")) {
            docSelectionPath = argValue;
        } else if (argName.equals("docGroupFile"))
            docGroupPath = argValue;
        else if (argName.equals("suffix"))
            suffix = argValue;
        else if (argName.equals("includeBeam"))
            includeBeam = argValue.equalsIgnoreCase("true");
        else if (argName.equals("outputFormat")) {
            outputFormats = new ArrayList<>();
            String[] outputFormatStrings = argValue.split(",");
            for (String outputFormatString : outputFormatStrings) {
                outputFormats.add(OutputFormat.valueOf(outputFormatString));
            }
            if (outputFormats.size() == 0)
                throw new JochreException("At least one outputFormat required.");
        } else if (argName.equals("features")) {
            // One feature descriptor per line; the stream itself is a managed
            // resource so it is released even if wrapping it fails.
            featureDescriptors = new ArrayList<>();
            try (InputStream featureFile = new FileInputStream(new File(argValue));
                    Scanner scanner = new Scanner(new BufferedReader(new InputStreamReader(featureFile, "UTF-8")))) {
                while (scanner.hasNextLine()) {
                    String descriptor = scanner.nextLine();
                    featureDescriptors.add(descriptor);
                    LOG.debug(descriptor);
                }
            }
        } else if (argName.equals("includeDate")) {
            includeDate = argValue.equalsIgnoreCase("true");
        } else {
            throw new RuntimeException("Unknown argument: " + argName);
        }
    }
    // first/last are only a fallback for when no explicit page set was given.
    if (pages.isEmpty() && (firstPage >= 0 || lastPage >= 0)) {
        if (firstPage < 0)
            firstPage = 0;
        if (lastPage < 0)
            lastPage = config.getInt("jochre.pdf.max-page");
        pages = IntStream.rangeClosed(firstPage, lastPage).boxed().collect(Collectors.toSet());
    }
    long startTime = System.currentTimeMillis();
    try {
        this.setUserId(userId);
        CorpusSelectionCriteria criteria = new CorpusSelectionCriteria();
        if (docSelectionPath != null) {
            // A selection file replaces all individual selection arguments.
            File docSelectionFile = new File(docSelectionPath);
            try (InputStream docSelectionStream = new FileInputStream(docSelectionFile);
                    Scanner scanner = new Scanner(new BufferedReader(new InputStreamReader(docSelectionStream, jochreSession.getEncoding())))) {
                criteria.loadSelection(scanner);
            }
        } else {
            criteria.setImageId(imageId);
            criteria.setImageCount(imageCount);
            if (imageSet != null)
                criteria.setImageStatusesToInclude(imageSet);
            criteria.setExcludeImageId(excludeImageId);
            criteria.setCrossValidationSize(crossValidationSize);
            criteria.setIncludeIndex(includeIndex);
            criteria.setExcludeIndex(excludeIndex);
            criteria.setDocumentId(docId);
            criteria.setDocumentIds(documentSet);
        }
        if (LOG.isDebugEnabled())
            LOG.debug(criteria.getAttributes().toString());
        if (docGroupPath != null) {
            // Each line has the form groupName=id1,id2,...
            File docGroupFile = new File(docGroupPath);
            try (InputStream docGroupStream = new FileInputStream(docGroupFile);
                    Scanner scanner = new Scanner(new BufferedReader(new InputStreamReader(docGroupStream, jochreSession.getEncoding())))) {
                while (scanner.hasNextLine()) {
                    String line = scanner.nextLine();
                    int equalsPos = line.indexOf('=');
                    String groupName = line.substring(0, equalsPos);
                    String[] ids = line.substring(equalsPos + 1).split(",");
                    Set<Integer> idSet = new HashSet<>();
                    for (String idString : ids) {
                        idSet.add(Integer.parseInt(idString));
                    }
                    documentGroups.put(groupName, idSet);
                }
            }
        }
        MostLikelyWordChooser wordChooser = new MostLikelyWordChooser(jochreSession);
        File outputDir = null;
        File outputFile = null;
        if (outputDirPath != null) {
            outputDir = new File(outputDirPath);
        } else if (outputFilePath != null) {
            outputFile = new File(outputFilePath);
            outputDir = outputFile.getParentFile();
        }
        if (outputDir != null)
            outputDir.mkdirs();
        List<DocumentObserver> observers = null;
        List<PdfImageObserver> imageObservers = null;
        // analyseFolder builds its own observers per pdf file further down.
        if (outputFormats.size() > 0 && !command.equals("analyseFolder")) {
            if (outputDir == null) {
                throw new JochreException("Either outputDir our outputFile are required with outputFormats");
            }
            String baseName = null;
            if (userFriendlyName != null && userFriendlyName.length() > 0) {
                baseName = userFriendlyName;
            } else if (inFilePath != null && inFilePath.length() > 0) {
                File inFile = new File(inFilePath);
                baseName = this.getBaseName(inFile);
            }
            observers = this.getObservers(outputFormats, baseName, outputDir, includeDate);
            imageObservers = this.getImageObservers(outputFormats, baseName, outputDir);
        }
        if (userFriendlyName.length() == 0)
            userFriendlyName = inFilePath;
        if (command.equals("segment")) {
            this.doCommandSegment(inFilePath, userFriendlyName, outputDir, save, pages);
        } else if (command.equals("extract")) {
            this.doCommandExtractImages(inFilePath, outputDir, pages);
        } else if (command.equals("updateImages")) {
            this.doCommandUpdateImages(inFilePath, docId, pages);
        } else if (command.equals("applyFeatures")) {
            this.doCommandApplyFeatures(imageId, shapeId, featureDescriptors);
        } else if (command.equals("train")) {
            this.doCommandTrain(featureDescriptors, criteria, reconstructLetters);
        } else if (command.equals("evaluate") || command.equals("evaluateComplex")) {
            this.doCommandEvaluate(criteria, outputDir, wordChooser, reconstructLetters, save, suffix, includeBeam, observers);
        } else if (command.equals("evaluateFull")) {
            this.doCommandEvaluateFull(criteria, save, outputDir, wordChooser, suffix, observers);
        } else if (command.equals("analyse")) {
            this.doCommandAnalyse(criteria, wordChooser, observers);
        } else if (command.equals("transform")) {
            this.doCommandTransform(criteria, observers, imageObservers);
        } else if (command.equals("trainSplits")) {
            this.doCommandTrainSplits(featureDescriptors, criteria);
        } else if (command.equals("evaluateSplits")) {
            this.doCommandEvaluateSplits(criteria);
        } else if (command.equals("trainMerge")) {
            this.doCommandTrainMerge(featureDescriptors, multiplier, criteria);
        } else if (command.equals("evaluateMerge")) {
            this.doCommandEvaluateMerge(criteria);
        } else if (command.equals("logImage")) {
            this.doCommandLogImage(shapeId);
        } else if (command.equals("testFeature")) {
            this.doCommandTestFeature(shapeId);
        } else if (command.equals("serializeLexicon")) {
            if (outputDir == null) {
                throw new JochreException("Either outputDir our outputFile are required for " + command);
            }
            File inputFile = new File(inFilePath);
            if (inputFile.isDirectory()) {
                File[] lexiconFiles = inputFile.listFiles();
                // listFiles returns null when the directory cannot be read.
                if (lexiconFiles == null) {
                    throw new JochreException("Unable to list files in directory: " + inputFile.getAbsolutePath());
                }
                for (File oneLexFile : lexiconFiles) {
                    LOG.debug(oneLexFile.getName() + ": " + ", size: " + oneLexFile.length());
                    TextFileLexicon lexicon = new TextFileLexicon(oneLexFile, jochreSession.getEncoding());
                    // Base name = file name up to its first dot, or the whole
                    // name when it has no dot at all.
                    String lexFileName = oneLexFile.getName();
                    int dotPos = lexFileName.indexOf(".");
                    String baseName = dotPos >= 0 ? lexFileName.substring(0, dotPos) : lexFileName;
                    File lexiconFile = new File(outputDir, baseName + ".obj");
                    lexicon.serialize(lexiconFile);
                }
            } else {
                LOG.debug(inFilePath + ": " + inputFile.exists() + ", size: " + inputFile.length());
                TextFileLexicon lexicon = new TextFileLexicon(inputFile, jochreSession.getEncoding());
                // Strip the directory part first so that a dot inside a
                // directory name (e.g. "./lex/words.txt") cannot truncate the
                // base name, then cut at the first dot if there is one.
                String fileName = inFilePath.substring(inFilePath.lastIndexOf("/") + 1);
                int dotPos = fileName.indexOf(".");
                String baseName = dotPos >= 0 ? fileName.substring(0, dotPos) : fileName;
                File lexiconFile = outputFile;
                if (lexiconFile == null)
                    lexiconFile = new File(outputDir, baseName + ".obj");
                lexicon.serialize(lexiconFile);
            }
        } else if (command.equals("analyseFolder")) {
            if (inDirPath == null) {
                throw new JochreException("inDir is required for " + command);
            }
            File inDir = new File(inDirPath);
            File[] pdfFiles = inDir.listFiles(new FilenameFilter() {

                @Override
                public boolean accept(File dir, String name) {
                    return (name.toLowerCase().endsWith(".pdf"));
                }
            });
            // listFiles returns null when inDir is not a readable directory.
            if (pdfFiles == null) {
                throw new JochreException("Unable to list pdf files in directory: " + inDir.getAbsolutePath());
            }
            Arrays.sort(pdfFiles);
            for (File pdfFile : pdfFiles) {
                LOG.info("Analysing file: " + pdfFile.getAbsolutePath());
                try {
                    // Analyse into a working directory next to the pdf, then
                    // move the pdf and all analysis results to the output
                    // directory and remove the working directory.
                    String baseName = this.getBaseName(pdfFile);
                    File analysisDir = new File(inDir, baseName);
                    analysisDir.mkdirs();
                    List<DocumentObserver> pdfObservers = this.getObservers(outputFormats, baseName, analysisDir, includeDate);
                    List<PdfImageObserver> pdfImageObservers = this.getImageObservers(outputFormats, baseName, analysisDir);
                    this.doCommandAnalyse(pdfFile, wordChooser, pages, pdfObservers, pdfImageObservers);
                    File pdfOutputDir = new File(outputDir, baseName);
                    pdfOutputDir.mkdirs();
                    File targetFile = new File(pdfOutputDir, pdfFile.getName());
                    Files.move(pdfFile.toPath(), targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
                    File[] analysisFiles = analysisDir.listFiles();
                    for (File analysisFile : analysisFiles) {
                        targetFile = new File(pdfOutputDir, analysisFile.getName());
                        Files.move(analysisFile.toPath(), targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
                    }
                    Files.delete(analysisDir.toPath());
                } catch (Exception e) {
                    // log errors, but continue processing
                    LOG.error("Error processing file: " + pdfFile.getAbsolutePath(), e);
                }
            }
        } else if (command.equals("analyseFile")) {
            File pdfFile = new File(inFilePath);
            this.doCommandAnalyse(pdfFile, wordChooser, pages, observers, imageObservers);
        } else if (command.equals("findSplits")) {
            GraphicsDao graphicsDao = GraphicsDao.getInstance(jochreSession);
            List<Shape> shapesToSplit = graphicsDao.findShapesToSplit(jochreSession.getLocale());
            for (Shape shape : shapesToSplit) {
                LOG.info(shape.toString());
            }
        } else {
            throw new RuntimeException("Unknown command: " + command);
        }
    } catch (Exception e) {
        LOG.error("An error occurred while running Jochre", e);
        throw e;
    } finally {
        long duration = System.currentTimeMillis() - startTime;
        LOG.info("Duration (ms):" + duration);
    }
    LOG.info("#### finished #####");
}
Also used : Arrays(java.util.Arrays) SplitCandidateFinder(com.joliciel.jochre.boundaries.SplitCandidateFinder) OriginalBoundaryDetector(com.joliciel.jochre.boundaries.OriginalBoundaryDetector) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) JochreMergeEventStream(com.joliciel.jochre.boundaries.JochreMergeEventStream) Map(java.util.Map) LetterGuesser(com.joliciel.jochre.letterGuesser.LetterGuesser) Set(java.util.Set) CorpusSelectionCriteria(com.joliciel.jochre.graphics.CorpusSelectionCriteria) LetterFeatureTester(com.joliciel.jochre.letterGuesser.features.LetterFeatureTester) JochreCorpusGroupReader(com.joliciel.jochre.graphics.JochreCorpusGroupReader) TextGetter(com.joliciel.jochre.output.TextGetter) JochrePage(com.joliciel.jochre.doc.JochrePage) LetterFeature(com.joliciel.jochre.letterGuesser.features.LetterFeature) SplitEvaluator(com.joliciel.jochre.boundaries.SplitEvaluator) UnknownWordListWriter(com.joliciel.jochre.lexicon.UnknownWordListWriter) JochreCorpusShapeReader(com.joliciel.jochre.graphics.JochreCorpusShapeReader) FilenameFilter(java.io.FilenameFilter) OutcomeEqualiserEventStream(com.joliciel.talismane.machineLearning.OutcomeEqualiserEventStream) TreeSet(java.util.TreeSet) StandardCopyOption(java.nio.file.StandardCopyOption) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) ClassificationEventStream(com.joliciel.talismane.machineLearning.ClassificationEventStream) JochreDocumentGenerator(com.joliciel.jochre.doc.JochreDocumentGenerator) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon) TextExporter(com.joliciel.jochre.output.TextExporter) Files(java.nio.file.Files) Config(com.typesafe.config.Config) BufferedWriter(java.io.BufferedWriter) BeamSearchImageAnalyser(com.joliciel.jochre.analyser.BeamSearchImageAnalyser) TextFormat(com.joliciel.jochre.output.TextGetter.TextFormat) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) 
InputStreamReader(java.io.InputStreamReader) JochreXMLExporter(com.joliciel.jochre.output.JochreXMLExporter) File(java.io.File) PdfImageObserver(com.joliciel.jochre.utils.pdf.PdfImageObserver) BoundaryDetector(com.joliciel.jochre.boundaries.BoundaryDetector) JochreCorpusImageReader(com.joliciel.jochre.graphics.JochreCorpusImageReader) MetaDataExporter(com.joliciel.jochre.output.MetaDataExporter) LetterFeatureParser(com.joliciel.jochre.letterGuesser.features.LetterFeatureParser) BufferedReader(java.io.BufferedReader) JochreLetterEventStream(com.joliciel.jochre.letterGuesser.JochreLetterEventStream) ClassificationModelTrainer(com.joliciel.talismane.machineLearning.ClassificationModelTrainer) LetterByLetterBoundaryDetector(com.joliciel.jochre.boundaries.LetterByLetterBoundaryDetector) FScoreCalculator(com.joliciel.jochre.stats.FScoreCalculator) SourceFileProcessor(com.joliciel.jochre.doc.SourceFileProcessor) PdfImageVisitor(com.joliciel.jochre.utils.pdf.PdfImageVisitor) LoggerFactory(org.slf4j.LoggerFactory) Scanner(java.util.Scanner) RecursiveShapeSplitter(com.joliciel.jochre.boundaries.RecursiveShapeSplitter) GraphicsDao(com.joliciel.jochre.graphics.GraphicsDao) ImageStatus(com.joliciel.jochre.graphics.ImageStatus) MostLikelyWordChooser(com.joliciel.jochre.lexicon.MostLikelyWordChooser) PdfImageSaver(com.joliciel.jochre.pdf.PdfImageSaver) ComponentCharacterValidator(com.joliciel.jochre.letterGuesser.ComponentCharacterValidator) DocumentDao(com.joliciel.jochre.doc.DocumentDao) MergeEvaluator(com.joliciel.jochre.boundaries.MergeEvaluator) TrainingCorpusShapeMerger(com.joliciel.jochre.boundaries.TrainingCorpusShapeMerger) JochreLogUtils(com.joliciel.jochre.utils.JochreLogUtils) LetterGuessObserver(com.joliciel.jochre.analyser.LetterGuessObserver) BufferedImage(java.awt.image.BufferedImage) ErrorLogger(com.joliciel.jochre.analyser.ErrorLogger) Shape(com.joliciel.jochre.graphics.Shape) 
TrainingCorpusShapeSplitter(com.joliciel.jochre.boundaries.TrainingCorpusShapeSplitter) ImageDocumentExtractor(com.joliciel.jochre.doc.ImageDocumentExtractor) Collectors(java.util.stream.Collectors) List(java.util.List) MergeFeatureParser(com.joliciel.jochre.boundaries.features.MergeFeatureParser) LetterValidator(com.joliciel.jochre.letterGuesser.LetterValidator) Writer(java.io.Writer) Entry(java.util.Map.Entry) ModelTrainerFactory(com.joliciel.talismane.machineLearning.ModelTrainerFactory) PdfDocumentProcessor(com.joliciel.jochre.pdf.PdfDocumentProcessor) ShapeFeature(com.joliciel.jochre.graphics.features.ShapeFeature) Pattern(java.util.regex.Pattern) ImageAnalyser(com.joliciel.jochre.analyser.ImageAnalyser) OriginalShapeLetterAssigner(com.joliciel.jochre.analyser.OriginalShapeLetterAssigner) MergeFeature(com.joliciel.jochre.boundaries.features.MergeFeature) DocumentObserver(com.joliciel.jochre.doc.DocumentObserver) JochreException(com.joliciel.jochre.utils.JochreException) AbbyyFineReader8Exporter(com.joliciel.jochre.output.AbbyyFineReader8Exporter) IntStream(java.util.stream.IntStream) JochreDocument(com.joliciel.jochre.doc.JochreDocument) JochreCorpusImageProcessor(com.joliciel.jochre.graphics.JochreCorpusImageProcessor) HashMap(java.util.HashMap) JochrePageByPageExporter(com.joliciel.jochre.output.JochrePageByPageExporter) JochreSplitEventStream(com.joliciel.jochre.boundaries.JochreSplitEventStream) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) HashSet(java.util.HashSet) User(com.joliciel.jochre.security.User) LexiconErrorWriter(com.joliciel.jochre.lexicon.LexiconErrorWriter) SplitFeature(com.joliciel.jochre.boundaries.features.SplitFeature) SplitFeatureParser(com.joliciel.jochre.boundaries.features.SplitFeatureParser) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult) OutputStreamWriter(java.io.OutputStreamWriter) ConfigFactory(com.typesafe.config.ConfigFactory) 
DeterministicBoundaryDetector(com.joliciel.jochre.boundaries.DeterministicBoundaryDetector) CorpusLexiconBuilder(com.joliciel.jochre.lexicon.CorpusLexiconBuilder) Logger(org.slf4j.Logger) SecurityDao(com.joliciel.jochre.security.SecurityDao) FileInputStream(java.io.FileInputStream) JochreImage(com.joliciel.jochre.graphics.JochreImage) SimpleLetterFScoreObserver(com.joliciel.jochre.analyser.SimpleLetterFScoreObserver) LetterAssigner(com.joliciel.jochre.analyser.LetterAssigner) VerticalElongationFeature(com.joliciel.jochre.graphics.features.VerticalElongationFeature) ShapeMerger(com.joliciel.jochre.boundaries.ShapeMerger) FScoreObserver(com.joliciel.jochre.analyser.FScoreObserver) ShapeSplitter(com.joliciel.jochre.boundaries.ShapeSplitter) Collections(java.util.Collections) AltoXMLExporter(com.joliciel.jochre.output.AltoXMLExporter) InputStream(java.io.InputStream) Scanner(java.util.Scanner) Set(java.util.Set) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Shape(com.joliciel.jochre.graphics.Shape) ImageStatus(com.joliciel.jochre.graphics.ImageStatus) ArrayList(java.util.ArrayList) FilenameFilter(java.io.FilenameFilter) JochreException(com.joliciel.jochre.utils.JochreException) GraphicsDao(com.joliciel.jochre.graphics.GraphicsDao) HashSet(java.util.HashSet) PdfImageObserver(com.joliciel.jochre.utils.pdf.PdfImageObserver) Pattern(java.util.regex.Pattern) InputStreamReader(java.io.InputStreamReader) CorpusSelectionCriteria(com.joliciel.jochre.graphics.CorpusSelectionCriteria) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) DocumentObserver(com.joliciel.jochre.doc.DocumentObserver) TextFileLexicon(com.joliciel.jochre.lexicon.TextFileLexicon) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) JochreException(com.joliciel.jochre.utils.JochreException) BufferedReader(java.io.BufferedReader) MostLikelyWordChooser(com.joliciel.jochre.lexicon.MostLikelyWordChooser) File(java.io.File)

Example 52 with Shape

use of com.joliciel.jochre.graphics.Shape in project jochre by urieli.

the class BadGuessCollector method onFinish.

@Override
public void onFinish() {
    // For every shape id collected for analysis: reload the shape, log the
    // letter it carries next to the outcome stored against its id, then write
    // the shape's image to the log.
    for (int id : shapeIdsToAnalyse.keySet()) {
        final Shape loadedShape = this.graphicsDao.loadShape(id);
        final String storedOutcome = shapeIdsToAnalyse.get(id);

        final String header = "### Shape " + loadedShape;
        LOG.debug(header);

        final String comparison = "Expected: " + loadedShape.getLetter() + " Guessed: " + storedOutcome;
        LOG.debug(comparison);

        loadedShape.writeImageToLog();
    }
}
Also used : Shape(com.joliciel.jochre.graphics.Shape)

Example 53 with Shape

use of com.joliciel.jochre.graphics.Shape in project jochre by urieli.

the class DeterministicBoundaryDetector method findBoundaries.

@Override
public List<ShapeSequence> findBoundaries(GroupOfShapes group) {
    // A single sequence is grown shape by shape; for each shape we first
    // decide whether to split it, then whether to merge its first piece with
    // the last shape already in the sequence.
    ShapeSequence accumulated = new ShapeSequence();
    for (Shape current : group.getShapes()) {
        // Size of the shape relative to its x-height: only shapes that are
        // wide and tall enough are submitted to the splitter.
        double widthRatio = (double) current.getWidth() / (double) current.getXHeight();
        double heightRatio = (double) current.getHeight() / (double) current.getXHeight();
        boolean trySplit = this.shapeSplitter != null && widthRatio >= minWidthRatioForSplit && heightRatio >= minHeightRatioForSplit;

        // --- split decision ---
        ShapeSequence chosenSplit = null;
        boolean keepWhole = true;
        if (trySplit) {
            double topScore = 0;
            for (ShapeSequence candidate : shapeSplitter.split(current)) {
                if (candidate.getScore() > topScore) {
                    chosenSplit = candidate;
                    topScore = candidate.getScore();
                }
            }
            // the best candidate is only retained if it is convincing enough
            keepWhole = topScore < minProbabilityForDecision;
        }
        if (keepWhole) {
            // fall back to a sequence holding just the unsplit shape
            ShapeSequence solo = new ShapeSequence();
            solo.addShape(current);
            chosenSplit = solo;
        }

        // --- merge decision ---
        ShapeInSequence tail = null;
        Shape tailShape = null;
        if (accumulated.size() > 0) {
            tail = accumulated.get(accumulated.size() - 1);
            tailShape = tail.getShape();
        }
        ShapeInSequence head = chosenSplit.get(0);
        Shape headShape = head.getShape();

        double mergeProb = 0;
        if (this.shapeMerger != null && tailShape != null) {
            // the size checks use the pair (previous shape, current unsplit shape)
            ShapePair pair = new ShapePair(tailShape, current);
            double pairWidthRatio = (double) pair.getWidth() / (double) pair.getXHeight();
            double pairDistanceRatio = (double) pair.getInnerDistance() / (double) pair.getXHeight();
            if (pairWidthRatio <= maxWidthRatioForMerge && pairDistanceRatio <= maxDistanceRatioForMerge) {
                mergeProb = shapeMerger.checkMerge(tailShape, headShape);
            }
        }

        if (mergeProb > minProbabilityForDecision) {
            // Replace the last shape of the sequence by the merged shape,
            // remembering the original shapes on both sides of the merge.
            Shape merged = shapeMerger.merge(tailShape, headShape);
            accumulated.remove(accumulated.size() - 1);
            List<Shape> mergeSources = new ArrayList<Shape>();
            mergeSources.addAll(tail.getOriginalShapes());
            mergeSources.addAll(head.getOriginalShapes());
            accumulated.addShape(merged, mergeSources);

            // the first split piece went into the merge: append only the rest
            boolean skipNext = true;
            for (ShapeInSequence piece : chosenSplit) {
                if (skipNext) {
                    skipNext = false;
                    continue;
                }
                accumulated.add(piece);
            }

            accumulated.addDecision(new Decision(MergeOutcome.DO_MERGE.name(), mergeProb));
            for (Decision splitDecision : chosenSplit.getDecisions()) {
                accumulated.addDecision(splitDecision);
            }
        } else {
            // a merge was evaluated but rejected: record that decision too
            if (mergeProb > 0) {
                accumulated.addDecision(new Decision(MergeOutcome.DO_NOT_MERGE.name(), 1 - mergeProb));
            }
            for (Decision splitDecision : chosenSplit.getDecisions()) {
                accumulated.addDecision(splitDecision);
            }
            for (ShapeInSequence piece : chosenSplit) {
                accumulated.add(piece);
            }
        }
    }

    // the result always holds exactly one sequence
    List<ShapeSequence> sequences = new ArrayList<ShapeSequence>();
    sequences.add(accumulated);
    return sequences;
}
Also used : Shape(com.joliciel.jochre.graphics.Shape) ArrayList(java.util.ArrayList) Decision(com.joliciel.talismane.machineLearning.Decision)

Example 54 with Shape

use of com.joliciel.jochre.graphics.Shape in project jochre by urieli.

the class MergeEvaluator method evaluate.

/**
 * Runs {@code shapeMerger} over every pair of consecutive shapes in each group
 * delivered by {@code groupReader} and tallies the outcomes.
 * <p>
 * For each pair the first label passed to the calculator is "YES" or "NO"
 * depending on whether the pair's letters say it should be merged (first letter
 * starts with "|" and the second is empty or ends with "|", or the second ends
 * with "|" and the first is empty). The second label is "YES"/"NO" according to
 * whether the merge probability reaches {@code minProbabilityForDecision}; pairs
 * exceeding the width or distance limits are never submitted to the merger and
 * are tallied as "WIDE" (when a merge was expected) or "NO".
 *
 * @param groupReader source of the groups whose shapes are evaluated
 * @param shapeMerger the merger whose decisions are being scored
 * @return the calculator holding all tallied label pairs
 */
public FScoreCalculator<String> evaluate(JochreCorpusGroupReader groupReader, ShapeMerger shapeMerger) {
    LOG.debug("evaluate");
    final FScoreCalculator<String> results = new FScoreCalculator<>();
    while (groupReader.hasNext()) {
        final GroupOfShapes currentGroup = groupReader.next();
        Shape predecessor = null;
        for (Shape current : currentGroup.getShapes()) {
            if (predecessor == null) {
                // first shape of the group: nothing to pair it with yet
                predecessor = current;
                continue;
            }

            final ShapePair pair = new ShapePair(predecessor, current);

            // both ratios stay at zero unless the pair reports a positive x-height
            final double xHeight = pair.getXHeight();
            final double widthRatio = xHeight > 0 ? (double) pair.getWidth() / xHeight : 0;
            final double distanceRatio = xHeight > 0 ? (double) pair.getInnerDistance() / xHeight : 0;

            // derive the expected answer from the letters attached to the two shapes
            final String firstLetter = pair.getFirstShape().getLetter();
            final String secondLetter = pair.getSecondShape().getLetter();
            final boolean shouldMerge;
            if (firstLetter.startsWith("|")) {
                shouldMerge = secondLetter.isEmpty() || secondLetter.endsWith("|");
            } else {
                shouldMerge = secondLetter.endsWith("|") && firstLetter.isEmpty();
            }

            if (LOG.isTraceEnabled()) {
                LOG.trace(pair.toString());
                LOG.trace("widthRatio: " + widthRatio);
                LOG.trace("distanceRatio: " + distanceRatio);
                LOG.trace("shouldMerge: " + shouldMerge);
            }

            final String expected = shouldMerge ? "YES" : "NO";
            final boolean withinLimits = widthRatio <= maxWidthRatio && distanceRatio <= maxDistanceRatio;
            if (!withinLimits) {
                // pair is too wide (or too far apart): the merger is not consulted
                LOG.trace("too wide");
                results.increment(expected, shouldMerge ? "WIDE" : "NO");
            } else {
                final double mergeProb = shapeMerger.checkMerge(predecessor, current);
                results.increment(expected, mergeProb >= minProbabilityForDecision ? "YES" : "NO");
            }

            predecessor = current;
        }
        // next shape
    }
    // next group
    return results;
}
Also used : Shape(com.joliciel.jochre.graphics.Shape) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) FScoreCalculator(com.joliciel.jochre.stats.FScoreCalculator)

Example 55 with Shape

use of com.joliciel.jochre.graphics.Shape in project jochre by urieli.

the class RecursiveShapeSplitter method split.

/**
 * Recursively enumerates candidate ways of cutting {@code shape} into a sequence
 * of narrower shapes, each returned sequence carrying a single combined decision.
 * <p>
 * When the shape's width / x-height ratio is below {@code minWidthRatio}, or
 * {@code depth} has reached {@code maxDepth}, the result is a single sequence
 * holding the shape unsplit (with no decision attached). Otherwise every
 * candidate from {@code splitCandidateFinder} is weighted via {@code shouldSplit},
 * a synthetic "no split" candidate (position -1) is added, and for every real
 * split the two halves are split recursively and their results combined pairwise.
 *
 * @param shape the shape to examine at this recursion level
 * @param depth current recursion depth (also controls the trace-log padding)
 * @param originalShape passed unchanged to recursive calls and recorded alongside
 *        any shape that is kept unsplit
 * @param leftToRight if true the left half's sequence precedes the right half's
 *        when two sub-sequences are combined, otherwise the order is reversed
 * @return the retained candidate sequences: every single-shape sequence plus the
 *         first {@code beamWidth} sequences in the sorted set's iteration order
 */
List<ShapeSequence> split(Shape shape, int depth, Shape originalShape, boolean leftToRight) {
    // trace-log prefix: one dash plus one per recursion level, then a space
    String padding = "-";
    for (int i = 0; i < depth; i++) padding += "-";
    padding += " ";
    if (LOG.isTraceEnabled()) {
        LOG.trace(padding + "Splitting shape: " + shape.getLeft() + " , " + shape.getRight());
        LOG.trace(padding + "depth: " + depth);
    }
    List<ShapeSequence> shapeSequences = new ArrayList<ShapeSequence>();
    // check if shape is wide enough to bother with
    // NOTE(review): both operands are cast to double, so an x-height of 0 gives
    // Infinity/NaN here rather than an exception - confirm that is acceptable
    double widthRatio = (double) shape.getWidth() / (double) shape.getXHeight();
    if (LOG.isTraceEnabled())
        LOG.trace(padding + "widthRatio: " + widthRatio);
    if (widthRatio < minWidthRatio || depth >= maxDepth) {
        // recursion stops: return the shape as-is, with no decision attached
        if (LOG.isTraceEnabled())
            LOG.trace(padding + "too narrow or too deep");
        ShapeSequence shapeSequence = new ShapeSequence();
        shapeSequence.addShape(shape, originalShape);
        shapeSequences.add(shapeSequence);
    } else {
        List<Split> splitCandidates = this.splitCandidateFinder.findSplitCandidates(shape);
        // sorted collections: iteration order is whatever ShapeSequence and
        // WeightedOutcome define (presumably best first - TODO confirm)
        TreeSet<ShapeSequence> myShapeSequences = new TreeSet<ShapeSequence>();
        TreeSet<WeightedOutcome<Split>> weightedSplits = new TreeSet<WeightedOutcome<Split>>();
        for (Split splitCandidate : splitCandidates) {
            double splitProb = this.shouldSplit(splitCandidate);
            WeightedOutcome<Split> weightedSplit = new WeightedOutcome<Split>(splitCandidate, splitProb);
            weightedSplits.add(weightedSplit);
        }
        // weight of the first split in the set (0 when there are no candidates);
        // presumably the highest split probability - verify WeightedOutcome ordering
        double maxSplitProb = 0.0;
        if (weightedSplits.size() > 0)
            maxSplitProb = weightedSplits.first().getWeight();
        // the no-split option gets the complement of that weight; maxSplitProb then
        // becomes the larger of the two and is used below as the normaliser
        double noSplitProb = 1 - maxSplitProb;
        if (noSplitProb > maxSplitProb)
            maxSplitProb = noSplitProb;
        // synthetic candidate standing for "do not split"; recognised below by its
        // negative position
        Split noSplit = new Split(shape, jochreSession);
        noSplit.setPosition(-1);
        WeightedOutcome<Split> weightedNoSplit = new WeightedOutcome<Split>(noSplit, noSplitProb);
        weightedSplits.add(weightedNoSplit);
        // topCandidate is true only during the first iteration of the loop below;
        // topCandidateWeight is fixed during that iteration and scales every later
        // candidate
        boolean topCandidate = true;
        double topCandidateWeight = 1.0;
        for (WeightedOutcome<Split> weightedSplit : weightedSplits) {
            Split splitCandidate = weightedSplit.getOutcome();
            double splitProb = weightedSplit.getWeight();
            if (LOG.isTraceEnabled())
                LOG.trace(padding + "splitCandidate: left=" + splitCandidate.getShape().getLeft() + ", pos=" + splitCandidate.getPosition() + ", initial prob: " + splitProb);
            if (LOG.isTraceEnabled()) {
                if (topCandidate) {
                    LOG.trace(padding + "topCandidate");
                }
            }
            if (splitCandidate.getPosition() < 0) {
                // This is the no-split candidate
                if (topCandidate)
                    topCandidateWeight = 1.0;
                ShapeSequence shapeSequence = new ShapeSequence();
                shapeSequence.addShape(shape, originalShape);
                // normalised weight, scaled by the weight established by the first
                // candidate (1.0 when the no-split candidate itself came first)
                double prob = (splitProb / maxSplitProb) * topCandidateWeight;
                if (LOG.isTraceEnabled())
                    LOG.trace(padding + "noSplit prob=(" + splitProb + " / " + maxSplitProb + ") * " + topCandidateWeight + " = " + prob);
                Decision decision = new Decision(SplitOutcome.DO_NOT_SPLIT.name(), prob);
                shapeSequence.addDecision(decision);
                myShapeSequences.add(shapeSequence);
            } else {
                // a proper split
                // left half ends at the split column, right half starts one column after it
                Shape leftShape = shape.getJochreImage().getShape(shape.getLeft(), shape.getTop(), shape.getLeft() + splitCandidate.getPosition(), shape.getBottom());
                Shape rightShape = shape.getJochreImage().getShape(shape.getLeft() + splitCandidate.getPosition() + 1, shape.getTop(), shape.getRight(), shape.getBottom());
                // for each split recursively try to split it again up to depth of m
                // Note: m=2 is probably enough, since we're not expecting more than 4
                // letters per shape (3 splits)
                List<ShapeSequence> leftShapeSequences = this.split(leftShape, depth + 1, originalShape, leftToRight);
                List<ShapeSequence> rightShapeSequences = this.split(rightShape, depth + 1, originalShape, leftToRight);
                if (topCandidate) {
                    // find the no-split sequence in each sub-sequence
                    // (the first returned sequence containing exactly one shape)
                    ShapeSequence noSplitLeft = null;
                    for (ShapeSequence leftShapeSequence : leftShapeSequences) {
                        if (leftShapeSequence.size() == 1) {
                            noSplitLeft = leftShapeSequence;
                            break;
                        }
                    }
                    ShapeSequence noSplitRight = null;
                    for (ShapeSequence rightShapeSequence : rightShapeSequences) {
                        if (rightShapeSequence.size() == 1) {
                            noSplitRight = rightShapeSequence;
                            break;
                        }
                    }
                    // we should be guaranteed to find a noSplitLeft and noSplitRight
                    // since a no-split candidate is always returned
                    // NOTE(review): there is no null check - this relies on the sorted
                    // set never discarding the single-shape sequence as a duplicate;
                    // confirm against ShapeSequence's ordering
                    topCandidateWeight = noSplitLeft.getScore() * noSplitRight.getScore();
                    if (LOG.isTraceEnabled())
                        LOG.trace(padding + "topCandidateWeight=" + noSplitLeft.getScore() + " *" + noSplitRight.getScore() + " = " + topCandidateWeight);
                }
                // combine every left alternative with every right alternative
                for (ShapeSequence leftShapeSequence : leftShapeSequences) {
                    for (ShapeSequence rightShapeSequence : rightShapeSequences) {
                        ShapeSequence newSequence = null;
                        if (leftToRight)
                            newSequence = new ShapeSequence(leftShapeSequence, rightShapeSequence);
                        else
                            newSequence = new ShapeSequence(rightShapeSequence, leftShapeSequence);
                        if (LOG.isTraceEnabled()) {
                            StringBuilder sb = new StringBuilder();
                            for (ShapeInSequence splitShape : newSequence) {
                                sb.append("(" + splitShape.getShape().getLeft() + "," + splitShape.getShape().getRight() + ") ");
                            }
                            LOG.trace(padding + sb.toString());
                        }
                        // collapse the decisions already on the combined sequence
                        // (presumably inherited from both halves - TODO confirm) into
                        // one product, then replace them with a single decision below
                        double totalProb = 1.0;
                        for (Decision decision : newSequence.getDecisions()) {
                            totalProb = totalProb * decision.getProbability();
                        }
                        newSequence.getDecisions().clear();
                        double prob = 0.0;
                        if (topCandidate) {
                            // the first candidate is not scaled by topCandidateWeight
                            prob = totalProb * (splitProb / maxSplitProb);
                            if (LOG.isTraceEnabled())
                                LOG.trace(padding + "prob=" + totalProb + " * (" + splitProb + " / " + maxSplitProb + ") = " + prob);
                        } else {
                            prob = totalProb * (splitProb / maxSplitProb) * topCandidateWeight;
                            if (LOG.isTraceEnabled())
                                LOG.trace(padding + "prob=" + totalProb + " * (" + splitProb + " / " + maxSplitProb + ") * " + topCandidateWeight + " = " + prob);
                        }
                        Decision decision = new Decision(SplitOutcome.DO_SPLIT.name(), prob);
                        newSequence.addDecision(decision);
                        myShapeSequences.add(newSequence);
                    }
                }
            }
            topCandidate = false;
        }
        int i = 0;
        for (ShapeSequence shapeSequence : myShapeSequences) {
            // keep any single-shape (unsplit) sequence whatever its rank, plus the
            // first beamWidth sequences in the set's iteration order (presumably
            // the most probable ones - TODO confirm)
            if (shapeSequence.size() == 1 || i < beamWidth) {
                shapeSequences.add(shapeSequence);
            }
            i++;
        }
    }
    return shapeSequences;
}
Also used : Shape(com.joliciel.jochre.graphics.Shape) ArrayList(java.util.ArrayList) WeightedOutcome(com.joliciel.talismane.utils.WeightedOutcome) Decision(com.joliciel.talismane.machineLearning.Decision) TreeSet(java.util.TreeSet)

Aggregations

Shape (com.joliciel.jochre.graphics.Shape): 74, ArrayList (java.util.ArrayList): 22, GroupOfShapes (com.joliciel.jochre.graphics.GroupOfShapes): 14, JochreImage (com.joliciel.jochre.graphics.JochreImage): 13, Paragraph (com.joliciel.jochre.graphics.Paragraph): 9, RowOfShapes (com.joliciel.jochre.graphics.RowOfShapes): 9, Decision (com.joliciel.talismane.machineLearning.Decision): 8, Test (org.junit.Test): 8, JochreSession (com.joliciel.jochre.JochreSession): 7, JochrePage (com.joliciel.jochre.doc.JochrePage): 7, Config (com.typesafe.config.Config): 7, TreeSet (java.util.TreeSet): 7, JochreDocument (com.joliciel.jochre.doc.JochreDocument): 6, BufferedImage (java.awt.image.BufferedImage): 6, ShapeInSequence (com.joliciel.jochre.boundaries.ShapeInSequence): 5, ShapeSequence (com.joliciel.jochre.boundaries.ShapeSequence): 5, GraphicsDao (com.joliciel.jochre.graphics.GraphicsDao): 5, RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment): 5, SplitFeature (com.joliciel.jochre.boundaries.features.SplitFeature): 4, JochreException (com.joliciel.jochre.utils.JochreException): 4