Use of com.joliciel.jochre.doc.JochrePage in project jochre by urieli.
Class SegmentationTest, method testSegmentation.
/**
 * Segments a bundled sample image and checks the resulting structure against hard-coded
 * expectations: total row count, groups per row, shapes per row, and shapes in the first group
 * of the first two rows.
 *
 * <p>Along the way the image and every shape are dumped to the debug log, letters from the
 * expected text are assigned to the shapes, and a few shape features are evaluated (their
 * outcomes are only logged, not asserted).
 *
 * <p>Expectations are defined for four sample images, but every image number other than 1 is
 * currently skipped.
 *
 * @throws Exception if loading the image or running the segmentation fails
 */
@Test
public void testSegmentation() throws Exception {
  // TODO: Note currently this requires high thresholds to work
  // Need to decide if this is valid in general, or only for these samples
  System.setProperty("config.file", "src/test/resources/testHighThresholds.conf");
  ConfigFactory.invalidateCaches();
  Config config = ConfigFactory.load();
  JochreSession jochreSession = new JochreSession(config);
  boolean writePixelsToLog = true;

  for (int imageNumber = 1; imageNumber <= 4; imageNumber++) {
    // Only the first sample image is currently exercised.
    if (imageNumber != 1)
      continue;

    String imageName = "";
    String suffix = "";
    String text = "";
    String fileName = "";
    String userFileName;
    int rowCount = 2;
    int shapeCountRow1;
    int shapeCountRow2;
    int groupCountRow1;
    int groupCountRow2;
    int groupCountRow3 = 0;
    int shapeCountRow1Group1;
    int shapeCountRow2Group1;
    if (imageNumber == 1) {
      imageName = "MotlPeysiDemKhazns2RowsShort2";
      suffix = "jpg";
      text = "איך געה מיט אייך קיינער אין דער וועלט";
      fileName = "MotlPeysiDemKhazns2RowsShort2.pdf";
      userFileName = "Motl Peysi Dem Khazns";
      shapeCountRow1 = 13;
      shapeCountRow2 = 17;
      groupCountRow1 = 4;
      groupCountRow2 = 4;
      shapeCountRow1Group1 = 3;
      shapeCountRow2Group1 = 6;
    } else if (imageNumber == 2) {
      imageName = "MegileLiderZeresh";
      suffix = "png";
      text = "זרש, די מכשפה, װאָס שעלט ווי אַ מגפה";
      fileName = "MegileLiderManger.pdf";
      userFileName = "Megile Lider";
      shapeCountRow1 = 12;
      shapeCountRow2 = 17;
      groupCountRow1 = 3;
      groupCountRow2 = 5;
      shapeCountRow1Group1 = 4;
      shapeCountRow2Group1 = 4;
    } else if (imageNumber == 3) {
      imageName = "MendeleMoykherSforimVol1_41_0Excerpt";
      suffix = "png";
      text = "ער הייסט יאַנקיל, בעריל,";
      fileName = "MendeleMoykherSforimVol1_41_0.png";
      userFileName = "MendeleMoykherSforimVol1_41_0";
      shapeCountRow1 = 20;
      shapeCountRow2 = 0;
      groupCountRow1 = 4;
      groupCountRow2 = 0;
      shapeCountRow1Group1 = 2;
      shapeCountRow2Group1 = 0;
    } else {
      imageName = "JoinedLetterTest";
      suffix = "png";
      text = "Joined Letter Test";
      fileName = "JoinedLetterTest.png";
      userFileName = "JoinedLetterTest";
      rowCount = 2;
      shapeCountRow1 = 23;
      shapeCountRow2 = 23;
      groupCountRow1 = 4;
      groupCountRow2 = 4;
      groupCountRow3 = 5;
      shapeCountRow1Group1 = 6;
      shapeCountRow2Group1 = 5;
    }

    LOG.debug("######### imageName: " + imageName);
    // String fileName = "data/Zelmenyaners3Words.gif";

    // ImageIO.read does not close the stream it is handed, so it is closed here once the
    // image has been decoded.
    BufferedImage image;
    try (InputStream imageFileStream = getClass().getResourceAsStream("/com/joliciel/jochre/test/resources/" + imageName + "." + suffix)) {
      assertNotNull(imageFileStream);
      image = ImageIO.read(imageFileStream);
    }

    JochreDocument doc = new JochreDocument(jochreSession);
    doc.setFileName(fileName);
    doc.setName(userFileName);
    JochrePage page = doc.newPage();
    SourceImage sourceImage = page.newJochreImage(image, imageName);
    sourceImage.setWhiteGapFillFactor(5);
    sourceImage.setImageStatus(ImageStatus.AUTO_NEW);

    if (writePixelsToLog) {
      // Dump the whole image: one log line per pixel row, 'x' for black and 'o' otherwise.
      LOG.debug("i012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789");
      for (int y = 0; y < sourceImage.getHeight(); y++) {
        StringBuilder line = new StringBuilder().append(y);
        for (int x = 0; x < sourceImage.getWidth(); x++) {
          line.append(sourceImage.isPixelBlack(x, y, sourceImage.getBlackThreshold()) ? 'x' : 'o');
        }
        LOG.debug(line.toString());
      }
    }

    Segmenter segmenter = new Segmenter(sourceImage, jochreSession);
    segmenter.segment();
    if (segmenter.isDrawSegmentation()) {
      BufferedImage segmentedImage = segmenter.getSegmentedImage();
      File tempDir = new File(System.getProperty("java.io.tmpdir"));
      ImageIO.write(segmentedImage, "PNG", new File(tempDir, imageName + "_seg.png"));
    }

    // Pass 1: log every shape row by row and remember two pixel values of the very first shape
    // so that pass 3 can check they are unchanged.
    JochreImage jochreImage = sourceImage;
    int i = 0;
    boolean firstShape = true;
    int midPixelFirstShape = 0;
    int midPixelFirstShapeRaw = 0;
    for (Paragraph paragraph : jochreImage.getParagraphs()) {
      for (RowOfShapes row : paragraph.getRows()) {
        int j = 0;
        LOG.debug("============= Row " + i + " ================");
        for (Shape shape : row.getShapes()) {
          LOG.debug("Shape (" + i + "," + j + "). Left = " + shape.getLeft() + ". Top = " + shape.getTop() + ". Right = " + shape.getRight() + ". Bottom = " + shape.getBottom() + ". Group: " + shape.getGroup().getIndex());
          if (firstShape) {
            midPixelFirstShape = shape.getPixel(3, 3);
            midPixelFirstShapeRaw = shape.getRawPixel(3, 3);
            firstShape = false;
          }
          if (writePixelsToLog) {
            logShapePixels(shape, sourceImage);
          }
          j++;
        } // next shape
        i++;
      } // next row
    } // next paragraph

    // Pass 2: walk the shapes group by group and assign each one the next letter of the
    // expected text; a following combining mark listed below is attached to the same shape.
    i = 0;
    for (Paragraph paragraph : jochreImage.getParagraphs()) {
      for (RowOfShapes row : paragraph.getRows()) {
        for (GroupOfShapes group : row.getGroups()) {
          for (Shape shape : group.getShapes()) {
            if (i < text.length()) {
              String letter = text.substring(i, i + 1);
              String nextLetter = "";
              if (i + 1 < text.length())
                nextLetter = text.substring(i + 1, i + 2);
              if (nextLetter.equals("ָֹ") || nextLetter.equals("ַ")) {
                letter += nextLetter;
                i++;
              }
              LOG.debug("Letter: " + letter);
              shape.setLetter(letter);
            }
            i++;
          }
          // to skip the space
          i++;
          LOG.debug("Space");
        } // next group
      } // next row
    } // next paragraph

    List<ShapeFeature<?>> features = new ArrayList<ShapeFeature<?>>();
    features.add(new VerticalElongationFeature());
    features.add(new VerticalSizeFeature());
    features.add(new TouchesBaseLineFeature());
    features.add(new TouchesMeanLineFeature());
    features.add(new EmptyCentreFeature());

    // Pass 3: log details and feature outcomes per shape, then assert the expected counts.
    i = 0;
    DecimalFormat df = new DecimalFormat("0.00");
    firstShape = true;
    int totalRowCount = 0;
    for (Paragraph paragraph : jochreImage.getParagraphs()) {
      for (RowOfShapes row : paragraph.getRows()) {
        totalRowCount++;
        LOG.debug("============= Row " + i + " ================");
        int j = 0;
        for (GroupOfShapes group : row.getGroups()) {
          for (Shape shape : group.getShapes()) {
            LOG.debug("============= Shape (" + i + "," + j + ") ================");
            LOG.debug("Left = " + shape.getLeft() + ". Top = " + shape.getTop() + ". Right = " + shape.getRight() + ". Bottom = " + shape.getBottom());
            LOG.debug("Letter " + shape.getLetter());
            if (firstShape) {
              LOG.debug("mid pixel: " + midPixelFirstShape);
              assertEquals(midPixelFirstShape, shape.getPixel(3, 3));
              LOG.debug("mid pixel raw: " + midPixelFirstShapeRaw);
              assertEquals(midPixelFirstShapeRaw, shape.getRawPixel(3, 3));
              firstShape = false;
            }
            if (writePixelsToLog) {
              logShapePixels(shape, sourceImage);
            }

            double[][] totals = shape.getBrightnessBySection(5, 5, 1, SectionBrightnessMeasurementMethod.RAW);
            LOG.debug("Brightness counts");
            for (int y = 0; y < totals[0].length; y++) {
              StringBuilder line = new StringBuilder();
              for (int x = 0; x < totals.length; x++) {
                line.append(df.format(totals[x][y])).append("\t");
              }
              LOG.debug(line.toString());
            }

            for (ShapeFeature<?> feature : features) {
              RuntimeEnvironment env = new RuntimeEnvironment();
              FeatureResult<?> outcome = feature.check(shape, env);
              LOG.debug(outcome.toString());
            }
          }

          // Only the first group of the first two rows has an expected shape count.
          if (i == 0) {
            if (j == 0)
              assertEquals(shapeCountRow1Group1, group.getShapes().size());
          } else if (i == 1) {
            if (j == 0)
              assertEquals(shapeCountRow2Group1, group.getShapes().size());
          }
          j++;
        }

        if (i == 0)
          assertEquals(groupCountRow1, row.getGroups().size());
        else if (i == 1)
          assertEquals(groupCountRow2, row.getGroups().size());
        else if (i == 2)
          assertEquals(groupCountRow3, row.getGroups().size());

        if (i == 0)
          assertEquals(shapeCountRow1, row.getShapes().size());
        else if (i == 1)
          assertEquals(shapeCountRow2, row.getShapes().size());
        i++;
      } // next row
    } // next paragraph
    assertEquals(rowCount, totalRowCount);
  } // next test image
  LOG.debug("************** Finished ***********");
}

/**
 * Writes one debug line per pixel row of the shape: the line starts with "M" on the mean line,
 * "B" on the base line and the row index otherwise, followed by 'x' for each black pixel and
 * 'o' for every other pixel. Blackness is tested against the source image's black threshold.
 */
private void logShapePixels(Shape shape, SourceImage sourceImage) {
  for (int y = 0; y < shape.getHeight(); y++) {
    StringBuilder line = new StringBuilder();
    if (y == shape.getMeanLine())
      line.append("M");
    else if (y == shape.getBaseLine())
      line.append("B");
    else
      line.append(y);
    for (int x = 0; x < shape.getWidth(); x++) {
      line.append(shape.isPixelBlack(x, y, sourceImage.getBlackThreshold()) ? 'x' : 'o');
    }
    LOG.debug(line.toString());
  }
}
Use of com.joliciel.jochre.doc.JochrePage in project jochre by urieli.
Class TextGetterImplTest, method testGetTextFontSizes.
/**
 * Feeds one mocked row made of four single-letter groups (letters A, B, C, D with x-heights 10,
 * 20, 10 and 5) through an XHTML {@code TextGetter} and checks the exact markup written, in
 * which B is wrapped in {@code <big>} and D in {@code <small>}.
 */
@Test
public void testGetTextFontSizes() {
  final JochreDocument document = mock(JochreDocument.class);
  final JochrePage jochrePage = mock(JochrePage.class);
  final JochreImage image = mock(JochreImage.class);
  final Paragraph para = mock(Paragraph.class);
  final RowOfShapes rowOfShapes = mock(RowOfShapes.class);

  // image -> page -> document navigation
  when(image.getPage()).thenReturn(jochrePage);
  when(jochrePage.getDocument()).thenReturn(document);
  when(document.isLeftToRight()).thenReturn(true);

  // one paragraph holding one row
  final List<Paragraph> paragraphList = new ArrayList<>();
  paragraphList.add(para);
  when(image.getParagraphs()).thenReturn(paragraphList);

  final List<RowOfShapes> rowList = new ArrayList<>();
  rowList.add(rowOfShapes);
  when(para.getRows()).thenReturn(rowList);

  // each group contains exactly one shape; group and shape share the same x-height
  final String[] letters = { "A", "B", "C", "D" };
  final int[] xHeights = { 10, 20, 10, 5 };
  final List<GroupOfShapes> groupList = new ArrayList<>();
  for (int k = 0; k < letters.length; k++) {
    final Shape letterShape = mock(Shape.class);
    when(letterShape.getLetter()).thenReturn(letters[k]);
    when(letterShape.getXHeight()).thenReturn(xHeights[k]);

    final List<Shape> groupShapes = new ArrayList<>();
    groupShapes.add(letterShape);

    final GroupOfShapes wordGroup = mock(GroupOfShapes.class);
    when(wordGroup.getShapes()).thenReturn(groupShapes);
    when(wordGroup.getXHeight()).thenReturn(xHeights[k]);
    groupList.add(wordGroup);
  }
  when(rowOfShapes.getGroups()).thenReturn(groupList);

  final StringWriter out = new StringWriter();
  final TextGetter getter = new TextGetter(out, TextFormat.XHTML);
  getter.onImageComplete(image);

  final String written = out.toString();
  LOG.debug(written);
  assertEquals("<p dir=\"rtl\">A <big>B </big>C <small>D </small></p>", written);
}
Use of com.joliciel.jochre.doc.JochrePage in project jochre by urieli.
Class TextGetterImplTest, method testGetText.
/**
 * Feeds one mocked group of nine shapes through a plain-text {@code TextGetter} and checks the
 * exact text written. The shape letters include two consecutive commas and letters carrying a
 * leading or trailing "|" marker.
 */
@Test
public void testGetText() {
  final JochreDocument document = mock(JochreDocument.class);
  final JochrePage jochrePage = mock(JochrePage.class);
  final JochreImage image = mock(JochreImage.class);
  final Paragraph para = mock(Paragraph.class);
  final RowOfShapes rowOfShapes = mock(RowOfShapes.class);
  final GroupOfShapes wordGroup = mock(GroupOfShapes.class);

  // letters returned by the nine shapes, in shape order
  final String[] letters = { ",", ",", "|אַ", "אַ|", "|m", "m|", "|ש", "ע|", "," };
  final List<Shape> shapeList = new ArrayList<>();
  for (String letter : letters) {
    final Shape letterShape = mock(Shape.class);
    when(letterShape.getLetter()).thenReturn(letter);
    shapeList.add(letterShape);
  }

  // one paragraph -> one row -> one group
  final List<GroupOfShapes> groupList = new ArrayList<>();
  groupList.add(wordGroup);
  final List<RowOfShapes> rowList = new ArrayList<>();
  rowList.add(rowOfShapes);
  final List<Paragraph> paragraphList = new ArrayList<>();
  paragraphList.add(para);

  when(image.getPage()).thenReturn(jochrePage);
  when(jochrePage.getDocument()).thenReturn(document);
  when(document.isLeftToRight()).thenReturn(false);
  when(image.getParagraphs()).thenReturn(paragraphList);
  when(para.getRows()).thenReturn(rowList);
  when(rowOfShapes.getGroups()).thenReturn(groupList);
  when(wordGroup.getShapes()).thenReturn(shapeList);
  when(wordGroup.getXHeight()).thenReturn(10);

  final StringWriter out = new StringWriter();
  final TextGetter getter = new TextGetter(out, TextFormat.PLAIN);
  getter.onImageComplete(image);

  final String written = out.toString();
  LOG.debug(written);
  assertEquals("„אַm|שע|, \n", written);
}
Use of com.joliciel.jochre.doc.JochrePage in project jochre by urieli.
Class PdfDocumentProcessor, method process.
/**
 * Drives a full processing run: starts a document on the document processor, copies this
 * processor's page count and fields onto it, visits the images, then signals completion of the
 * last page (if any), of the document and of the analysis.
 *
 * <p>When a monitor is attached it is told about the current action at the start, marked as
 * finished on success, and handed the exception on failure.
 *
 * @return the document obtained from the document processor at the start of the run
 * @throws RuntimeException wrapping any exception raised during processing
 */
public JochreDocument process() {
  try {
    LOG.debug("PdfImageVisitorImpl.visitImages");
    if (this.currentMonitor != null) {
      this.currentMonitor.setCurrentAction("imageMonitor.extractingNextImage");
    }

    final JochreDocument document = this.documentProcessor.onDocumentStart();
    document.setTotalPageCount(this.getPageCount());
    for (Entry<String, String> entry : this.getFields().entrySet()) {
      document.getFields().put(entry.getKey(), entry.getValue());
    }

    this.visitImages();

    // The page that is still current after the visit has not been completed yet.
    final JochrePage lastPage = document.getCurrentPage();
    if (lastPage != null) {
      this.documentProcessor.onPageComplete(lastPage);
    }

    this.documentProcessor.onDocumentComplete(document);
    this.documentProcessor.onAnalysisComplete();

    if (this.currentMonitor != null) {
      this.currentMonitor.setFinished(true);
    }
    return document;
  } catch (Exception failure) {
    if (this.currentMonitor != null) {
      this.currentMonitor.setException(failure);
    }
    LOG.error("Failed processing in " + this.getClass().getSimpleName(), failure);
    throw new RuntimeException(failure);
  }
}
Use of com.joliciel.jochre.doc.JochrePage in project jochre by urieli.
Class TrainingCorpusShapeSplitterTest, method testSplit.
/**
 * Splits a mocked shape spanning x = 100..200 that carries a six-code-point letter string and
 * three splits (positions 35, 59 and 82), then checks that the first resulting sequence has four
 * shapes and that each of the four piece shapes was given the expected letter.
 *
 * @throws Exception if building the session or splitting fails
 */
@Test
public void testSplit() throws Exception {
  System.setProperty("config.file", "src/test/resources/testDualCharacters.conf");
  ConfigFactory.invalidateCaches();
  final JochreSession session = new JochreSession(ConfigFactory.load());

  final Shape original = mock(Shape.class);
  final GroupOfShapes group = mock(GroupOfShapes.class);
  final RowOfShapes row = mock(RowOfShapes.class);
  final Paragraph paragraph = mock(Paragraph.class);
  final JochreImage image = mock(JochreImage.class);
  final JochrePage page = mock(JochrePage.class);
  final JochreDocument document = mock(JochreDocument.class);
  final Iterator<Split> splitIterator = (Iterator<Split>) mock(Iterator.class);
  final List<Split> splitList = (List<Split>) mock(List.class);

  // the shape to split: its letters and bounding box
  when(original.getLetter()).thenReturn("אָבּער");
  when(original.getLeft()).thenReturn(100);
  when(original.getRight()).thenReturn(200);
  when(original.getTop()).thenReturn(100);
  when(original.getBottom()).thenReturn(200);

  // navigation from the shape up to the document locale
  when(original.getGroup()).thenReturn(group);
  when(original.getJochreImage()).thenReturn(image);
  when(group.getRow()).thenReturn(row);
  when(row.getParagraph()).thenReturn(paragraph);
  when(paragraph.getImage()).thenReturn(image);
  when(image.getPage()).thenReturn(page);
  when(page.getDocument()).thenReturn(document);
  when(document.getLocale()).thenReturn(session.getLocale());

  // three splits, served through a mocked list/iterator pair
  final int[] splitPositions = { 35, 59, 82 };
  final Split[] splitMocks = new Split[splitPositions.length];
  for (int k = 0; k < splitPositions.length; k++) {
    splitMocks[k] = mock(Split.class);
    when(splitMocks[k].getPosition()).thenReturn(splitPositions[k]);
  }
  when(original.getSplits()).thenReturn(splitList);
  when(splitList.iterator()).thenReturn(splitIterator);
  when(splitIterator.hasNext()).thenReturn(true, true, true, false);
  when(splitIterator.next()).thenReturn(splitMocks[0], splitMocks[1], splitMocks[2]);

  // the four piece shapes the image hands back, keyed by their left/right bounds
  final int[][] pieceBounds = { { 100, 135 }, { 136, 159 }, { 160, 182 }, { 183, 200 } };
  final String[] expectedLetters = { "אָ", "בּ", "ע", "ר" };
  final Shape[] pieces = new Shape[pieceBounds.length];
  for (int k = 0; k < pieceBounds.length; k++) {
    pieces[k] = mock(Shape.class);
    when(image.getShape(pieceBounds[k][0], 100, pieceBounds[k][1], 200)).thenReturn(pieces[k]);
  }

  LOG.debug(original.toString());
  LOG.debug(original.getLetter());

  final TrainingCorpusShapeSplitter shapeSplitter = new TrainingCorpusShapeSplitter(session);
  final List<ShapeSequence> sequences = shapeSplitter.split(original);
  final ShapeSequence firstSequence = sequences.get(0);
  assertEquals(4, firstSequence.size());
  LOG.debug("Split into: " + firstSequence.toString());

  for (int k = 0; k < pieces.length; k++) {
    verify(pieces[k]).setLetter(expectedLetters[k]);
  }
}
Aggregations