Search in sources:

Example 1 with GroupOfShapes

use of com.joliciel.jochre.graphics.GroupOfShapes in project jochre by urieli.

From the class SegmentationTest, method testSegmentation.

/**
 * End-to-end check of {@link Segmenter} on a bundled sample image.
 *
 * <p>The test loads the high-threshold test configuration, reads the sample
 * image from the test resources, segments it, assigns the expected letters to
 * the resulting shapes, logs a handful of shape features, and finally asserts
 * the number of rows, the number of groups and shapes in the first rows, and
 * the number of shapes in the first group of the first two rows.
 *
 * <p>Four samples are described, but only sample 1 is currently executed (the
 * others are skipped by the guard at the top of the loop).
 *
 * @throws Exception if the image cannot be read or segmentation fails
 */
@Test
public void testSegmentation() throws Exception {
    // TODO: Note currently this requires high thresholds to work
    // Need to decide if this is valid in general, or only for these samples
    System.setProperty("config.file", "src/test/resources/testHighThresholds.conf");
    ConfigFactory.invalidateCaches();
    Config config = ConfigFactory.load();
    JochreSession jochreSession = new JochreSession(config);
    boolean writePixelsToLog = true;
    for (int imageNumber = 1; imageNumber <= 4; imageNumber++) {
        // Only the first sample is exercised; the remaining definitions are
        // kept so they can be re-enabled by removing this guard.
        if (imageNumber != 1)
            continue;
        String imageName = "";
        String suffix = "";
        String text = "";
        String fileName = "";
        String userFileName;
        // Expected counts for the current sample.
        int rowCount = 2;
        int shapeCountRow1;
        int shapeCountRow2;
        int groupCountRow1;
        int groupCountRow2;
        int groupCountRow3 = 0;
        int shapeCountRow1Group1;
        int shapeCountRow2Group1;
        if (imageNumber == 1) {
            imageName = "MotlPeysiDemKhazns2RowsShort2";
            suffix = "jpg";
            text = "איך געה מיט אייך קיינער אין דער וועלט";
            fileName = "MotlPeysiDemKhazns2RowsShort2.pdf";
            userFileName = "Motl Peysi Dem Khazns";
            shapeCountRow1 = 13;
            shapeCountRow2 = 17;
            groupCountRow1 = 4;
            groupCountRow2 = 4;
            shapeCountRow1Group1 = 3;
            shapeCountRow2Group1 = 6;
        } else if (imageNumber == 2) {
            imageName = "MegileLiderZeresh";
            suffix = "png";
            text = "זרש, די מכשפה, װאָס שעלט ווי אַ מגפה";
            fileName = "MegileLiderManger.pdf";
            userFileName = "Megile Lider";
            shapeCountRow1 = 12;
            shapeCountRow2 = 17;
            groupCountRow1 = 3;
            groupCountRow2 = 5;
            shapeCountRow1Group1 = 4;
            shapeCountRow2Group1 = 4;
        } else if (imageNumber == 3) {
            imageName = "MendeleMoykherSforimVol1_41_0Excerpt";
            suffix = "png";
            text = "ער הייסט יאַנקיל, בעריל,";
            fileName = "MendeleMoykherSforimVol1_41_0.png";
            userFileName = "MendeleMoykherSforimVol1_41_0";
            shapeCountRow1 = 20;
            shapeCountRow2 = 0;
            groupCountRow1 = 4;
            groupCountRow2 = 0;
            shapeCountRow1Group1 = 2;
            shapeCountRow2Group1 = 0;
        } else {
            imageName = "JoinedLetterTest";
            suffix = "png";
            text = "Joined Letter Test";
            fileName = "JoinedLetterTest.png";
            userFileName = "JoinedLetterTest";
            // NOTE(review): rowCount stays 2 although a third-row group count
            // is supplied below - confirm which is intended before enabling
            // this sample.
            rowCount = 2;
            shapeCountRow1 = 23;
            shapeCountRow2 = 23;
            groupCountRow1 = 4;
            groupCountRow2 = 4;
            groupCountRow3 = 5;
            shapeCountRow1Group1 = 6;
            shapeCountRow2Group1 = 5;
        }
        LOG.debug("######### imageName: " + imageName);
        // String fileName = "data/Zelmenyaners3Words.gif";
        // ImageIO.read does not close the stream it is given, so close it
        // here once the image has been decoded.
        BufferedImage image;
        try (InputStream imageFileStream = getClass().getResourceAsStream("/com/joliciel/jochre/test/resources/" + imageName + "." + suffix)) {
            assertNotNull(imageFileStream);
            image = ImageIO.read(imageFileStream);
        }
        JochreDocument doc = new JochreDocument(jochreSession);
        doc.setFileName(fileName);
        doc.setName(userFileName);
        JochrePage page = doc.newPage();
        SourceImage sourceImage = page.newJochreImage(image, imageName);
        sourceImage.setWhiteGapFillFactor(5);
        sourceImage.setImageStatus(ImageStatus.AUTO_NEW);
        if (writePixelsToLog) {
            // Dump the whole image as an x/o grid, with a column ruler on top.
            LOG.debug("i012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789");
            for (int y = 0; y < sourceImage.getHeight(); y++) {
                String line = "" + y;
                for (int x = 0; x < sourceImage.getWidth(); x++) {
                    if (sourceImage.isPixelBlack(x, y, sourceImage.getBlackThreshold()))
                        line += "x";
                    else
                        line += "o";
                }
                LOG.debug(line);
            }
        }
        Segmenter segmenter = new Segmenter(sourceImage, jochreSession);
        segmenter.segment();
        if (segmenter.isDrawSegmentation()) {
            // Keep a visual record of the segmentation in the temp directory.
            BufferedImage segmentedImage = segmenter.getSegmentedImage();
            File tempDir = new File(System.getProperty("java.io.tmpdir"));
            ImageIO.write(segmentedImage, "PNG", new File(tempDir, imageName + "_seg.png"));
        }
        JochreImage jochreImage = sourceImage;
        int i = 0;
        boolean firstShape = true;
        int midPixelFirstShape = 0;
        int midPixelFirstShapeRaw = 0;
        // Pass 1: log every shape row by row and remember pixel (3,3) of the
        // very first shape for comparison in pass 3.
        for (Paragraph paragraph : jochreImage.getParagraphs()) {
            for (RowOfShapes row : paragraph.getRows()) {
                int j = 0;
                LOG.debug("============= Row " + i + " ================");
                for (Shape shape : row.getShapes()) {
                    LOG.debug("Shape (" + i + "," + j + "). Left = " + shape.getLeft() + ". Top = " + shape.getTop() + ". Right = " + shape.getRight() + ". Bottom = " + shape.getBottom() + ". Group: " + shape.getGroup().getIndex());
                    if (firstShape) {
                        midPixelFirstShape = shape.getPixel(3, 3);
                        midPixelFirstShapeRaw = shape.getRawPixel(3, 3);
                        firstShape = false;
                    }
                    if (writePixelsToLog) {
                        for (int y = 0; y < shape.getHeight(); y++) {
                            // Prefix the mean line with "M" and the base line with "B".
                            String line = "";
                            if (y == shape.getMeanLine())
                                line += "M";
                            else if (y == shape.getBaseLine())
                                line += "B";
                            else
                                line += y;
                            for (int x = 0; x < shape.getWidth(); x++) {
                                if (shape.isPixelBlack(x, y, sourceImage.getBlackThreshold()))
                                    line += "x";
                                else
                                    line += "o";
                            }
                            LOG.debug(line);
                        }
                    }
                    j++;
                } // next shape
                i++;
            } // next row
        } // next paragraph
        // Pass 2: walk the expected text in step with the shapes and assign
        // one letter (plus a following diacritic, if any) to each shape.
        i = 0;
        for (Paragraph paragraph : jochreImage.getParagraphs()) {
            for (RowOfShapes row : paragraph.getRows()) {
                for (GroupOfShapes group : row.getGroups()) {
                    for (Shape shape : group.getShapes()) {
                        if (i < text.length()) {
                            String letter = text.substring(i, i + 1);
                            String nextLetter = "";
                            if (i + 1 < text.length())
                                nextLetter = text.substring(i + 1, i + 2);
                            if (nextLetter.equals("ָֹ") || nextLetter.equals("ַ")) {
                                letter += nextLetter;
                                i++;
                            }
                            LOG.debug("Letter: " + letter);
                            shape.setLetter(letter);
                        }
                        i++;
                    }
                    // to skip the space
                    i++;
                    LOG.debug("Space");
                } // next group
            } // next row
        } // next paragraph
        List<ShapeFeature<?>> features = new ArrayList<ShapeFeature<?>>();
        features.add(new VerticalElongationFeature());
        features.add(new VerticalSizeFeature());
        features.add(new TouchesBaseLineFeature());
        features.add(new TouchesMeanLineFeature());
        features.add(new EmptyCentreFeature());
        i = 0;
        DecimalFormat df = new DecimalFormat("0.00");
        firstShape = true;
        int totalRowCount = 0;
        // Pass 3: log shapes group by group, run the features, and assert the
        // expected row/group/shape counts.
        for (Paragraph paragraph : jochreImage.getParagraphs()) {
            for (RowOfShapes row : paragraph.getRows()) {
                totalRowCount++;
                LOG.debug("============= Row " + i + " ================");
                int j = 0;
                for (GroupOfShapes group : row.getGroups()) {
                    for (Shape shape : group.getShapes()) {
                        LOG.debug("============= Shape (" + i + "," + j + ") ================");
                        LOG.debug("Left = " + shape.getLeft() + ". Top = " + shape.getTop() + ". Right = " + shape.getRight() + ". Bottom = " + shape.getBottom());
                        LOG.debug("Letter " + shape.getLetter());
                        if (firstShape) {
                            // The first shape reached through the groups must
                            // report the same pixel values as in pass 1.
                            LOG.debug("mid pixel: " + midPixelFirstShape);
                            assertEquals(midPixelFirstShape, shape.getPixel(3, 3));
                            LOG.debug("mid pixel raw: " + midPixelFirstShapeRaw);
                            assertEquals(midPixelFirstShapeRaw, shape.getRawPixel(3, 3));
                            firstShape = false;
                        }
                        if (writePixelsToLog) {
                            for (int y = 0; y < shape.getHeight(); y++) {
                                String line = "";
                                if (y == shape.getMeanLine())
                                    line += "M";
                                else if (y == shape.getBaseLine())
                                    line += "B";
                                else
                                    line += y;
                                for (int x = 0; x < shape.getWidth(); x++) {
                                    if (shape.isPixelBlack(x, y, sourceImage.getBlackThreshold()))
                                        line += "x";
                                    else
                                        line += "o";
                                }
                                LOG.debug(line);
                            }
                        }
                        double[][] totals = shape.getBrightnessBySection(5, 5, 1, SectionBrightnessMeasurementMethod.RAW);
                        LOG.debug("Brightness counts");
                        // totals is indexed [x][y]; print it one y-row per line.
                        for (int y = 0; y < totals[0].length; y++) {
                            String line = "";
                            for (int x = 0; x < totals.length; x++) {
                                line += df.format(totals[x][y]) + "\t";
                            }
                            LOG.debug(line);
                        }
                        // Feature results are only logged, not asserted.
                        for (ShapeFeature<?> feature : features) {
                            RuntimeEnvironment env = new RuntimeEnvironment();
                            FeatureResult<?> outcome = feature.check(shape, env);
                            LOG.debug(outcome.toString());
                        }
                    }
                    // Only the first group of rows 0 and 1 has an expected size.
                    if (i == 0) {
                        if (j == 0)
                            assertEquals(shapeCountRow1Group1, group.getShapes().size());
                    } else if (i == 1) {
                        if (j == 0)
                            assertEquals(shapeCountRow2Group1, group.getShapes().size());
                    }
                    j++;
                }
                if (i == 0)
                    assertEquals(groupCountRow1, row.getGroups().size());
                else if (i == 1)
                    assertEquals(groupCountRow2, row.getGroups().size());
                else if (i == 2)
                    assertEquals(groupCountRow3, row.getGroups().size());
                if (i == 0)
                    assertEquals(shapeCountRow1, row.getShapes().size());
                else if (i == 1)
                    assertEquals(shapeCountRow2, row.getShapes().size());
                i++;
            } // next row
        } // next paragraph
        assertEquals(rowCount, totalRowCount);
    } // next test image
    LOG.debug("************** Finished ***********");
}
Also used : ShapeFeature(com.joliciel.jochre.graphics.features.ShapeFeature) Shape(com.joliciel.jochre.graphics.Shape) SourceImage(com.joliciel.jochre.graphics.SourceImage) Config(com.typesafe.config.Config) DecimalFormat(java.text.DecimalFormat) ArrayList(java.util.ArrayList) VerticalSizeFeature(com.joliciel.jochre.graphics.features.VerticalSizeFeature) JochreDocument(com.joliciel.jochre.doc.JochreDocument) TouchesBaseLineFeature(com.joliciel.jochre.graphics.features.TouchesBaseLineFeature) BufferedImage(java.awt.image.BufferedImage) JochreSession(com.joliciel.jochre.JochreSession) EmptyCentreFeature(com.joliciel.jochre.graphics.features.EmptyCentreFeature) JochreImage(com.joliciel.jochre.graphics.JochreImage) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) TouchesMeanLineFeature(com.joliciel.jochre.graphics.features.TouchesMeanLineFeature) InputStream(java.io.InputStream) RowOfShapes(com.joliciel.jochre.graphics.RowOfShapes) VerticalElongationFeature(com.joliciel.jochre.graphics.features.VerticalElongationFeature) Segmenter(com.joliciel.jochre.graphics.Segmenter) JochrePage(com.joliciel.jochre.doc.JochrePage) Paragraph(com.joliciel.jochre.graphics.Paragraph) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) File(java.io.File) Test(org.junit.Test)

Example 2 with GroupOfShapes

use of com.joliciel.jochre.graphics.GroupOfShapes in project jochre by urieli.

From the class TextGetterImplTest, method testGetTextFontSizes.

/**
 * Feeds {@code TextGetter} (XHTML format) a single row of four one-letter
 * groups with x-heights 10, 20, 10 and 5 and checks that the second group is
 * wrapped in {@code <big>} and the fourth in {@code <small>}.
 */
@Test
public void testGetTextFontSizes() {
    // One row of four groups, each holding a single shape. The group and its
    // shape share the same x-height.
    final String[] letters = { "A", "B", "C", "D" };
    final int[] xHeights = { 10, 20, 10, 5 };
    final List<GroupOfShapes> wordGroups = new ArrayList<>();
    for (int k = 0; k < letters.length; k++) {
        final Shape glyph = mock(Shape.class);
        when(glyph.getLetter()).thenReturn(letters[k]);
        when(glyph.getXHeight()).thenReturn(xHeights[k]);

        final List<Shape> glyphs = new ArrayList<>();
        glyphs.add(glyph);

        final GroupOfShapes wordGroup = mock(GroupOfShapes.class);
        when(wordGroup.getShapes()).thenReturn(glyphs);
        when(wordGroup.getXHeight()).thenReturn(xHeights[k]);
        wordGroups.add(wordGroup);
    }

    // Row -> paragraph -> image, each level holding exactly one child.
    final RowOfShapes onlyRow = mock(RowOfShapes.class);
    when(onlyRow.getGroups()).thenReturn(wordGroups);
    final List<RowOfShapes> rowList = new ArrayList<>();
    rowList.add(onlyRow);

    final Paragraph onlyParagraph = mock(Paragraph.class);
    when(onlyParagraph.getRows()).thenReturn(rowList);
    final List<Paragraph> paragraphList = new ArrayList<>();
    paragraphList.add(onlyParagraph);

    // Image -> page -> document; the document is stubbed as left-to-right.
    final JochreDocument document = mock(JochreDocument.class);
    when(document.isLeftToRight()).thenReturn(true);
    final JochrePage owningPage = mock(JochrePage.class);
    when(owningPage.getDocument()).thenReturn(document);
    final JochreImage mockImage = mock(JochreImage.class);
    when(mockImage.getPage()).thenReturn(owningPage);
    when(mockImage.getParagraphs()).thenReturn(paragraphList);

    final StringWriter output = new StringWriter();
    final TextGetter getter = new TextGetter(output, TextFormat.XHTML);
    getter.onImageComplete(mockImage);

    final String rendered = output.toString();
    LOG.debug(rendered);
    // NOTE(review): the expected markup carries dir="rtl" although the
    // document is stubbed as left-to-right - presumably because
    // isLeftToRight() is not stubbed on the image mock itself; confirm
    // against TextGetter.
    assertEquals("<p dir=\"rtl\">A <big>B </big>C <small>D </small></p>", rendered);
}
Also used : JochreImage(com.joliciel.jochre.graphics.JochreImage) Shape(com.joliciel.jochre.graphics.Shape) ArrayList(java.util.ArrayList) RowOfShapes(com.joliciel.jochre.graphics.RowOfShapes) JochreDocument(com.joliciel.jochre.doc.JochreDocument) JochrePage(com.joliciel.jochre.doc.JochrePage) Paragraph(com.joliciel.jochre.graphics.Paragraph) StringWriter(java.io.StringWriter) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) Test(org.junit.Test)

Example 3 with GroupOfShapes

use of com.joliciel.jochre.graphics.GroupOfShapes in project jochre by urieli.

From the class TextGetterImplTest, method testGetText.

/**
 * Feeds {@code TextGetter} (plain-text format) one group of nine shapes in a
 * right-to-left document and checks the merged output: a double comma becomes
 * a low quotation mark, matching split-letter halves are joined, and
 * non-matching halves are kept with their {@code |} markers.
 */
@Test
public void testGetText() {
    // Letters of the nine shapes, in reading order. A leading "|" marks the
    // first half of a split letter and a trailing "|" marks the second half.
    final String[] letters = { ",", ",", "|אַ", "אַ|", "|m", "m|", "|ש", "ע|", "," };
    final List<Shape> glyphs = new ArrayList<>();
    for (String letter : letters) {
        final Shape glyph = mock(Shape.class);
        when(glyph.getLetter()).thenReturn(letter);
        glyphs.add(glyph);
    }

    // Single group -> single row -> single paragraph.
    final GroupOfShapes onlyGroup = mock(GroupOfShapes.class);
    when(onlyGroup.getShapes()).thenReturn(glyphs);
    when(onlyGroup.getXHeight()).thenReturn(10);
    final List<GroupOfShapes> groupList = new ArrayList<>();
    groupList.add(onlyGroup);

    final RowOfShapes onlyRow = mock(RowOfShapes.class);
    when(onlyRow.getGroups()).thenReturn(groupList);
    final List<RowOfShapes> rowList = new ArrayList<>();
    rowList.add(onlyRow);

    final Paragraph onlyParagraph = mock(Paragraph.class);
    when(onlyParagraph.getRows()).thenReturn(rowList);
    final List<Paragraph> paragraphList = new ArrayList<>();
    paragraphList.add(onlyParagraph);

    // Image -> page -> document; the document is stubbed as right-to-left.
    final JochreDocument document = mock(JochreDocument.class);
    when(document.isLeftToRight()).thenReturn(false);
    final JochrePage owningPage = mock(JochrePage.class);
    when(owningPage.getDocument()).thenReturn(document);
    final JochreImage mockImage = mock(JochreImage.class);
    when(mockImage.getPage()).thenReturn(owningPage);
    when(mockImage.getParagraphs()).thenReturn(paragraphList);

    final StringWriter output = new StringWriter();
    final TextGetter getter = new TextGetter(output, TextFormat.PLAIN);
    getter.onImageComplete(mockImage);

    final String rendered = output.toString();
    LOG.debug(rendered);
    assertEquals("„אַm|שע|, \n", rendered);
}
Also used : JochreImage(com.joliciel.jochre.graphics.JochreImage) Shape(com.joliciel.jochre.graphics.Shape) ArrayList(java.util.ArrayList) RowOfShapes(com.joliciel.jochre.graphics.RowOfShapes) JochreDocument(com.joliciel.jochre.doc.JochreDocument) JochrePage(com.joliciel.jochre.doc.JochrePage) Paragraph(com.joliciel.jochre.graphics.Paragraph) StringWriter(java.io.StringWriter) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) Test(org.junit.Test)

Example 4 with GroupOfShapes

use of com.joliciel.jochre.graphics.GroupOfShapes in project jochre by urieli.

From the class TextGetter, method onImageComplete.

/**
 * Writes the letters of every non-junk paragraph of {@code image} to this
 * object's {@code writer}.
 *
 * <p>For each paragraph the groups of each row are turned into words:
 * <ul>
 * <li>a shape letter starting with {@code |} followed by one ending with
 * {@code |} is merged into a single letter when both halves match, and kept
 * verbatim (markers included) otherwise;</li>
 * <li>two consecutive {@code ,} shapes become {@code „} and two consecutive
 * {@code '} shapes become {@code “};</li>
 * <li>a {@code -} that is the last shape of the last group of a row (other
 * than the paragraph's last row) is dropped; if a lexicon is available and
 * knows the hyphenated form, the dash is re-inserted before the next
 * word.</li>
 * </ul>
 *
 * <p>In XHTML format each paragraph is wrapped in {@code <p>} and groups whose
 * x-height differs enough from the mean shape x-height are wrapped in
 * {@code <big>} / {@code <small>}; otherwise each paragraph ends with a
 * newline.
 *
 * @param image the image whose paragraphs are written out
 * @throws RuntimeException wrapping any {@link IOException} raised while
 *         writing
 */
@Override
public void onImageComplete(JochreImage image) {
    try {
        // A group counts as bigger/smaller when its x-height is at least 15%
        // above/below the mean x-height.
        double minRatioBiggerFont = 1.15;
        double maxRatioSmallerFont = 0.85;
        double meanXHeight = 0;
        if (textFormat.equals(TextFormat.XHTML)) {
            // Mean x-height over all shapes of all non-junk paragraphs.
            Mean xHeightMean = new Mean();
            for (Paragraph paragraph : image.getParagraphs()) {
                if (!paragraph.isJunk()) {
                    for (RowOfShapes row : paragraph.getRows()) {
                        for (GroupOfShapes group : row.getGroups()) {
                            for (Shape shape : group.getShapes()) {
                                xHeightMean.increment(shape.getXHeight());
                            }
                        }
                    }
                }
            }
            meanXHeight = xHeightMean.getResult();
        }
        String paragraphString = "<p>";
        if (!image.isLeftToRight())
            paragraphString = "<p dir=\"rtl\">";
        for (Paragraph paragraph : image.getParagraphs()) {
            if (!paragraph.isJunk()) {
                if (textFormat.equals(TextFormat.XHTML))
                    writer.append(paragraphString);
                // Offset in paragraphText -> true for a step towards a bigger
                // font, false for a step towards a smaller one.
                // NOTE(review): only one step can be stored per offset, so a
                // direct big-to-small (or small-to-big) switch appears to
                // close the old tag without opening the new one - confirm.
                Map<Integer, Boolean> fontSizeChanges = new TreeMap<Integer, Boolean>();
                // -1 = small, 0 = normal, 1 = big
                int currentFontSize = 0;
                StringBuilder paragraphText = new StringBuilder();
                String lastWord = "";
                boolean lastRowEndedWithHyphen = false;
                for (RowOfShapes row : paragraph.getRows()) {
                    for (GroupOfShapes group : row.getGroups()) {
                        boolean endOfRowHyphen = false;
                        if (textFormat.equals(TextFormat.XHTML)) {
                            double ratio = group.getXHeight() / meanXHeight;
                            if (ratio >= minRatioBiggerFont) {
                                if (currentFontSize <= 0)
                                    fontSizeChanges.put(paragraphText.length(), true);
                                currentFontSize = 1;
                            } else if (ratio <= maxRatioSmallerFont) {
                                if (currentFontSize >= 0)
                                    fontSizeChanges.put(paragraphText.length(), false);
                                currentFontSize = -1;
                            } else if (currentFontSize != 0) {
                                // Back to normal size: undo the previous step.
                                if (currentFontSize > 0)
                                    fontSizeChanges.put(paragraphText.length(), false);
                                else if (currentFontSize < 0)
                                    fontSizeChanges.put(paragraphText.length(), true);
                                currentFontSize = 0;
                            }
                        }
                        // sb holds the finished part of the word;
                        // currentSequence holds shapes whose meaning depends
                        // on the next shape (split-letter half, "," or "'").
                        StringBuilder sb = new StringBuilder();
                        StringBuilder currentSequence = new StringBuilder();
                        for (Shape shape : group.getShapes()) {
                            String letter = shape.getLetter();
                            if (letter.startsWith("|")) {
                                // beginning of a gehakte letter
                                currentSequence.append(shape.getLetter());
                                continue;
                            } else if (letter.endsWith("|")) {
                                // end of a gehakte letter
                                if (currentSequence.length() > 0 && currentSequence.charAt(0) == '|') {
                                    String letter1 = currentSequence.toString().substring(1);
                                    String letter2 = letter.substring(0, letter.length() - 1);
                                    if (letter1.equals(letter2)) {
                                        letter = letter1;
                                    } else {
                                        letter = currentSequence.toString() + letter;
                                    }
                                    currentSequence = new StringBuilder();
                                }
                            }
                            if (letter.equals(",")) {
                                // could be ",," = "„"
                                if (currentSequence.length() > 0 && currentSequence.charAt(0) == ',') {
                                    sb.append("„");
                                    currentSequence = new StringBuilder();
                                } else {
                                    currentSequence.append(shape.getLetter());
                                }
                            } else if (letter.equals("'")) {
                                // could be "''" = "“"
                                if (currentSequence.length() > 0 && currentSequence.charAt(0) == '\'') {
                                    sb.append("“");
                                    currentSequence = new StringBuilder();
                                } else {
                                    currentSequence.append(shape.getLetter());
                                }
                            } else if (letter.equals("-")) {
                                if (shape.getIndex() == group.getShapes().size() - 1 && group.getIndex() == row.getGroups().size() - 1 && row.getIndex() != paragraph.getRows().size() - 1) {
                                    // do nothing - dash at the end of the
                                    // line
                                    // we'll assume for now these dashes are
                                    // always supposed to disappear
                                    // though of course they could be used
                                    // in the place of a real mid-word dash
                                    endOfRowHyphen = true;
                                } else {
                                    // Emit pending punctuation first so that
                                    // it stays in front of the dash. A pending
                                    // split-letter half is left alone so it
                                    // can still be matched with its other
                                    // half.
                                    if (currentSequence.length() > 0 && currentSequence.charAt(0) != '|') {
                                        sb.append(currentSequence);
                                        currentSequence = new StringBuilder();
                                    }
                                    sb.append(shape.getLetter());
                                }
                            } else {
                                sb.append(currentSequence);
                                currentSequence = new StringBuilder();
                                // generalise this
                                if (letter.equals(",,")) {
                                    sb.append("„");
                                } else if (letter.equals("''")) {
                                    sb.append("“");
                                } else {
                                    sb.append(letter);
                                }
                            }
                        } // next shape
                        sb.append(currentSequence);
                        String word = sb.toString();
                        if (endOfRowHyphen) {
                            lastRowEndedWithHyphen = true;
                            endOfRowHyphen = false;
                        } else if (lastRowEndedWithHyphen) {
                            // First word after a dropped end-of-row dash: keep
                            // the dash only if the lexicon knows the
                            // hyphenated form.
                            if (lexicon != null) {
                                String hyphenatedWord = lastWord + "-" + word;
                                int frequency = lexicon.getFrequency(hyphenatedWord);
                                LOG.debug("hyphenatedWord: " + hyphenatedWord + ", Frequency: " + frequency);
                                if (frequency > 0) {
                                    paragraphText.append("-");
                                }
                            }
                            lastRowEndedWithHyphen = false;
                        }
                        lastWord = word;
                        paragraphText.append(word);
                        // No space after a word split across rows.
                        if (!lastRowEndedWithHyphen)
                            paragraphText.append(' ');
                    } // next group
                } // next row
                String paragraphStr = paragraphText.toString();
                // When font tags must be inserted, render into a buffer first
                // and copy it to the real writer piece by piece.
                Writer currentWriter = writer;
                boolean haveFontSizes = fontSizeChanges.size() > 0;
                if (haveFontSizes) {
                    currentWriter = new StringWriter();
                }
                if (image.getPage().getDocument().isLeftToRight()) {
                    currentWriter.append(paragraphText);
                } else {
                    this.appendBidiText(paragraphStr, currentWriter);
                }
                if (haveFontSizes) {
                    currentFontSize = 0;
                    String text = currentWriter.toString();
                    int currentIndex = 0;
                    // TreeMap iteration yields the offsets in ascending order.
                    for (Map.Entry<Integer, Boolean> change : fontSizeChanges.entrySet()) {
                        int fontSizeChange = change.getKey();
                        boolean isBigger = change.getValue();
                        writer.append(text.substring(currentIndex, fontSizeChange));
                        if (isBigger) {
                            if (currentFontSize == 0) {
                                writer.append("<big>");
                                currentFontSize++;
                            } else if (currentFontSize < 0) {
                                writer.append("</small>");
                                currentFontSize++;
                            }
                        } else {
                            if (currentFontSize == 0) {
                                writer.append("<small>");
                                currentFontSize--;
                            } else if (currentFontSize > 0) {
                                writer.append("</big>");
                                currentFontSize--;
                            }
                        }
                        currentIndex = fontSizeChange;
                    }
                    writer.append(text.substring(currentIndex));
                    // Close whatever tag is still open at the paragraph end.
                    if (currentFontSize > 0) {
                        writer.append("</big>");
                    } else if (currentFontSize < 0) {
                        writer.append("</small>");
                    }
                }
                if (textFormat.equals(TextFormat.XHTML))
                    writer.append("</p>");
                else
                    writer.append('\n');
                writer.flush();
            } // paragraph.isJunk()?
        } // next paragraph
    } catch (IOException e) {
        LOG.error("Failed writing to " + this.getClass().getSimpleName(), e);
        throw new RuntimeException(e);
    }
}
Also used : Mean(org.apache.commons.math.stat.descriptive.moment.Mean) Shape(com.joliciel.jochre.graphics.Shape) RowOfShapes(com.joliciel.jochre.graphics.RowOfShapes) IOException(java.io.IOException) TreeMap(java.util.TreeMap) Paragraph(com.joliciel.jochre.graphics.Paragraph) StringWriter(java.io.StringWriter) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) StringWriter(java.io.StringWriter) Writer(java.io.Writer)

Example 5 with GroupOfShapes

use of com.joliciel.jochre.graphics.GroupOfShapes in project jochre by urieli.

From the class LexiconErrorWriter, method onGuessSequence.

/**
 * Reports one evaluated letter sequence to the CSV writers and updates the
 * error statistics.
 * <p>
 * Work is done in a fixed order:
 * <ol>
 * <li>a row is written to {@code allWordWriter};</li>
 * <li>if the guessed word differs from the real word, a row is written to
 * {@code allErrorWriter};</li>
 * <li>the statistics for {@code ALL_GROUP}, the current document and every
 * document group containing the current document's id are updated, and a row
 * is written to the writer matching the known/unknown and error/correct
 * combination.</li>
 * </ol>
 *
 * @param bestSequence the sequence whose real and guessed words are compared
 * @throws RuntimeException wrapping any {@link IOException} raised while writing
 */
@Override
public void onGuessSequence(LetterSequence bestSequence) {
    try {
        final int frequency = wordChooser == null ? 0 : wordChooser.getFrequency(bestSequence, false);
        final boolean wrong = !bestSequence.getRealWord().equals(bestSequence.getGuessedWord());
        // A word counts as "known" when the word chooser reports a positive frequency.
        final boolean inLexicon = frequency > 0;
        final boolean badSeg = bestSequence.getRealSequence().contains("[") || bestSequence.getRealSequence().contains("|");

        // Step 1: every word goes to the "all words" file.
        writeSequenceRow(allWordWriter, bestSequence, true, inLexicon, wrong, frequency);

        // Step 2: wrong guesses additionally go to the "all errors" file.
        if (wrong) {
            writeSequenceRow(allErrorWriter, bestSequence, true, inLexicon, wrong, frequency);
        }

        // Step 3: collect every statistics bucket this word contributes to.
        List<ErrorStatistics> affected = new ArrayList<LexiconErrorWriter.ErrorStatistics>();
        affected.add(errorMap.get(ALL_GROUP));
        affected.add(errorMap.get(currentDoc.getName()));
        for (String groupName : documentGroups.keySet()) {
            if (documentGroups.get(groupName).contains(currentDoc.getId())) {
                affected.add(errorMap.get(groupName));
            }
        }

        // The beam flag is consumed here: count it once, then reset it.
        if (beamContainsRightWord) {
            for (ErrorStatistics stats : affected) {
                if (wrong) {
                    stats.answerInBeamErrorCount++;
                } else {
                    stats.answerInBeamCorrectCount++;
                }
            }
            beamContainsRightWord = false;
        }

        // Letter-level counters: guessed letters are read by position, in
        // step with the underlying shapes.
        Linguistics linguistics = jochreSession.getLinguistics();
        int letterIndex = 0;
        for (ShapeInSequence shapeInSequence : bestSequence.getUnderlyingShapeSequence()) {
            String guessedLetter = bestSequence.getLetters().get(letterIndex);
            letterIndex++;
            String realLetter = shapeInSequence.getShape().getLetter();

            // Badly segmented letter: contains "|", is empty, or has several
            // characters without being a declared dual-character letter.
            boolean badSegLetter;
            if (realLetter.contains("|") || realLetter.length() == 0) {
                badSegLetter = true;
            } else {
                badSegLetter = realLetter.length() > 1 && !linguistics.getDualCharacterLetters().contains(realLetter);
            }

            boolean letterCorrect = realLetter.equals(guessedLetter);

            for (ErrorStatistics stats : affected) {
                if (letterCorrect) {
                    if (inLexicon) {
                        stats.knownWordCorrectLetterCount++;
                    } else {
                        stats.unknownWordCorrectLetterCount++;
                    }
                } else {
                    if (inLexicon) {
                        stats.knownWordErrorLetterCount++;
                    } else {
                        stats.unknownWordErrorLetterCount++;
                    }
                }
            }
            for (ErrorStatistics stats : affected) {
                if (letterCorrect) {
                    if (badSegLetter) {
                        stats.badSegCorrectLetterCount++;
                    } else {
                        stats.goodSegCorrectLetterCount++;
                    }
                } else {
                    if (badSegLetter) {
                        stats.badSegErrorLetterCount++;
                    } else {
                        stats.goodSegErrorLetterCount++;
                    }
                }
            }
        }

        // Word-level counters by known/unknown, plus the matching output file.
        Writer categoryWriter;
        if (wrong) {
            for (ErrorStatistics stats : affected) {
                if (inLexicon) {
                    stats.knownWordErrorCount++;
                } else {
                    stats.unknownWordErrorCount++;
                }
            }
            if (inLexicon) {
                categoryWriter = knownWordErrorWriter;
            } else {
                categoryWriter = unknownWordErrorWriter;
            }
        } else {
            for (ErrorStatistics stats : affected) {
                if (inLexicon) {
                    stats.knownWordCorrectCount++;
                } else {
                    stats.unknownWordCorrectCount++;
                }
            }
            if (inLexicon) {
                categoryWriter = knownWordCorrectWriter;
            } else {
                categoryWriter = unknownWordCorrectWriter;
            }
        }

        // Word-level counters by segmentation quality.
        for (ErrorStatistics stats : affected) {
            if (wrong) {
                if (badSeg) {
                    stats.badSegErrorCount++;
                } else {
                    stats.goodSegErrorCount++;
                }
            } else {
                if (badSeg) {
                    stats.badSegCorrectCount++;
                } else {
                    stats.goodSegCorrectCount++;
                }
            }
        }

        // The per-category files omit the known/error flag columns.
        writeSequenceRow(categoryWriter, bestSequence, false, inLexicon, wrong, frequency);
    } catch (IOException e) {
        LOG.error("Failed to write to LexiconErrorWriter", e);
        throw new RuntimeException(e);
    }
}

/**
 * Writes a single CSV row describing the sequence to the given writer and
 * flushes it.
 *
 * @param writer destination of the row
 * @param bestSequence the sequence being reported
 * @param withFlags whether to include the known (1/0) and error (1/0) columns
 * @param known value of the known column
 * @param error value of the error column
 * @param realFrequency frequency obtained for the sequence by the caller
 * @throws IOException if the writer fails
 */
private void writeSequenceRow(Writer writer, LetterSequence bestSequence, boolean withFlags, boolean known, boolean error, int realFrequency) throws IOException {
    writer.write(CSV.format(bestSequence.getRealSequence()));
    writer.write(CSV.format(bestSequence.getRealWord()));
    writer.write(CSV.format(bestSequence.getGuessedSequence()));
    writer.write(CSV.format(bestSequence.getGuessedWord()));
    if (withFlags) {
        writer.write(CSV.format(known ? 1 : 0));
        writer.write(CSV.format(error ? 1 : 0));
    }
    writer.write(CSV.format(realFrequency));
    writer.write(CSV.format(bestSequence.getFrequency()));

    // Location columns are taken from the first group of the sequence.
    GroupOfShapes firstGroup = bestSequence.getGroups().get(0);
    writer.write(CSV.format(firstGroup.getRow().getParagraph().getImage().getPage().getDocument().getName()));
    writer.write(CSV.format(firstGroup.getRow().getParagraph().getImage().getPage().getIndex()));
    writer.write(CSV.format(firstGroup.getRow().getParagraph().getIndex()));
    writer.write(CSV.format(firstGroup.getRow().getIndex()));
    writer.write(CSV.format(firstGroup.getIndex()));
    writer.write(CSV.format(firstGroup.getId()));

    if (this.includeBeam) {
        if (finalSequences != null) {
            for (LetterSequence candidate : finalSequences) {
                writer.write(CSV.format(candidate.getGuessedSequence()));
                writer.write(CSV.format(candidate.getScore()));
                writer.write(CSV.format(candidate.getAdjustedScore()));
            }
        }
        // An empty cell separates the final sequences from the holdovers.
        writer.write(CSV.format(""));
        if (holdoverSequences != null) {
            for (LetterSequence candidate : holdoverSequences) {
                writer.write(CSV.format(candidate.getGuessedSequence()));
                writer.write(CSV.format(candidate.getScore()));
                writer.write(CSV.format(candidate.getAdjustedScore()));
            }
        }
    }
    writer.write("\n");
    writer.flush();
}
Also used : LetterSequence(com.joliciel.jochre.letterGuesser.LetterSequence) IOException(java.io.IOException) GroupOfShapes(com.joliciel.jochre.graphics.GroupOfShapes) Linguistics(com.joliciel.jochre.lang.Linguistics) ArrayList(java.util.ArrayList) List(java.util.List) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) ShapeInSequence(com.joliciel.jochre.boundaries.ShapeInSequence)

Aggregations

GroupOfShapes (com.joliciel.jochre.graphics.GroupOfShapes): 18
Shape (com.joliciel.jochre.graphics.Shape): 14
Paragraph (com.joliciel.jochre.graphics.Paragraph): 10
RowOfShapes (com.joliciel.jochre.graphics.RowOfShapes): 10
ArrayList (java.util.ArrayList): 9
JochreImage (com.joliciel.jochre.graphics.JochreImage): 6
ShapeInSequence (com.joliciel.jochre.boundaries.ShapeInSequence): 4
JochreDocument (com.joliciel.jochre.doc.JochreDocument): 4
JochrePage (com.joliciel.jochre.doc.JochrePage): 4
LetterSequence (com.joliciel.jochre.letterGuesser.LetterSequence): 4
Test (org.junit.Test): 4
ShapeSequence (com.joliciel.jochre.boundaries.ShapeSequence): 3
IOException (java.io.IOException): 3
StringWriter (java.io.StringWriter): 3
List (java.util.List): 3
JochreSession (com.joliciel.jochre.JochreSession): 2
Config (com.typesafe.config.Config): 2
Writer (java.io.Writer): 2
TreeMap (java.util.TreeMap): 2
GraphicsDao (com.joliciel.jochre.graphics.GraphicsDao): 1