use of com.joliciel.jochre.graphics.Shape in project jochre by urieli.
the class Jochre method execute.
/**
 * Runs a single Jochre command, as described by a map of named arguments.
 * <p>
 * Usage (* indicates optional):<br/>
 * Jochre load [filename] [isoLanguageCode] [firstPage]* [lastPage]*<br/>
 * Loads a file (pdf or image) and segments it into letters. The analysed
 * version is stored in the persistent store. Writes [filename].xml to the same
 * location, to enable the user to indicate the text to associate with this
 * file.<br/>
 * Jochre extract [filename] [outputDirectory] [firstPage]* [lastPage]*<br/>
 * Extracts images from a pdf file.<br/>
 * <p>
 * Recognised argument names: <code>command</code>, <code>file</code>,
 * <code>name</code>, <code>first</code>, <code>last</code>, <code>pages</code>,
 * <code>inDir</code>, <code>outDir</code>, <code>outputFile</code>,
 * <code>save</code>, <code>shapeId</code>, <code>imageId</code>,
 * <code>docId</code>, <code>userId</code>, <code>imageCount</code>,
 * <code>multiplier</code>, <code>imageStatus</code>,
 * <code>reconstructLetters</code>, <code>excludeImageId</code>,
 * <code>crossValidationSize</code>, <code>includeIndex</code>,
 * <code>excludeIndex</code>, <code>docSet</code>, <code>docSelection</code>,
 * <code>docGroupFile</code>, <code>suffix</code>, <code>includeBeam</code>,
 * <code>outputFormat</code>, <code>features</code>, <code>includeDate</code>
 * and <code>logConfigFile</code>. Any other name is rejected.
 *
 * @param argMap
 *          argument names mapped to their raw string values; if it contains
 *          <code>logConfigFile</code>, that entry is removed from the map
 *          before the remaining arguments are parsed. An empty map only prints
 *          a usage hint.
 * @throws Exception
 *           any exception raised while parsing the arguments or running the
 *           command; exceptions raised while running the command are logged
 *           before being rethrown.
 */
public void execute(Map<String, String> argMap) throws Exception {
    if (argMap.size() == 0) {
        System.out.println("See jochre wiki for usage");
        return;
    }

    String logConfigPath = argMap.get("logConfigFile");
    if (logConfigPath != null) {
        argMap.remove("logConfigFile");
        JochreLogUtils.configureLogging(logConfigPath);
    }

    String command = "";
    String inFilePath = "";
    String inDirPath = null;
    String userFriendlyName = "";
    String outputDirPath = null;
    String outputFilePath = null;
    int firstPage = -1;
    int lastPage = -1;
    Set<Integer> pages = Collections.emptySet();
    int shapeId = -1;
    int docId = -1;
    int imageId = 0;
    int userId = -1;
    int imageCount = 0;
    int multiplier = 0;
    boolean save = false;
    ImageStatus[] imageSet = null;
    boolean reconstructLetters = false;
    int excludeImageId = 0;
    int crossValidationSize = -1;
    int includeIndex = -1;
    int excludeIndex = -1;
    Set<Integer> documentSet = null;
    String suffix = "";
    String docGroupPath = null;
    boolean includeBeam = false;
    List<OutputFormat> outputFormats = new ArrayList<>();
    String docSelectionPath = null;
    List<String> featureDescriptors = null;
    boolean includeDate = false;

    for (Entry<String, String> argMapEntry : argMap.entrySet()) {
        String argName = argMapEntry.getKey();
        String argValue = argMapEntry.getValue();
        if (argName.equals("command")) {
            command = argValue;
        } else if (argName.equals("file")) {
            inFilePath = argValue;
        } else if (argName.equals("name")) {
            userFriendlyName = argValue;
        } else if (argName.equals("first")) {
            firstPage = Integer.parseInt(argValue);
        } else if (argName.equals("last")) {
            lastPage = Integer.parseInt(argValue);
        } else if (argName.equals("pages")) {
            // Page list such as "1,3-5,8": split on '-' and ',' while keeping the
            // delimiters as separate tokens, then walk the tokens as a small
            // state machine.
            final String WITH_DELIMITER = "((?<=%1$s)|(?=%1$s))";
            final Pattern numberPattern = Pattern.compile("\\d+");
            final String[] parts = argValue.split(String.format(WITH_DELIMITER, "[\\-,]"));
            // last number read and not yet added to the set, or -1 if none
            int number = -1;
            // true once a '-' has been read and its upper bound is still pending
            boolean inRange = false;
            final Set<Integer> myPages = new HashSet<>();
            for (String part : parts) {
                if (numberPattern.matcher(part).matches()) {
                    int lowerBound = number;
                    number = Integer.parseInt(part);
                    if (inRange) {
                        if (lowerBound > number) {
                            throw new IllegalArgumentException("Lower bound (" + lowerBound + ") greater than upper bound (" + number + "): " + argValue);
                        }
                        IntStream.rangeClosed(lowerBound, number).forEach(i -> myPages.add(i));
                        number = -1;
                        inRange = false;
                    }
                } else if (part.equals(",")) {
                    if (number >= 0) {
                        myPages.add(number);
                    }
                    number = -1;
                } else if (part.equals("-")) {
                    if (inRange) {
                        throw new IllegalArgumentException("Unable to parse pages (unclosed range): " + argValue);
                    }
                    if (number < 0) {
                        throw new IllegalArgumentException("Range without lower bound: " + argValue);
                    }
                    inRange = true;
                } else {
                    throw new IllegalArgumentException("Unable to parse pages - unexpected character '" + part + "': " + argValue);
                }
            }
            if (inRange) {
                throw new IllegalArgumentException("Unable to parse pages (unclosed range): " + argValue);
            }
            if (number >= 0) {
                myPages.add(number);
            }
            pages = myPages;
        } else if (argName.equals("inDir")) {
            inDirPath = argValue;
        } else if (argName.equals("outDir")) {
            outputDirPath = argValue;
        } else if (argName.equals("outputFile")) {
            outputFilePath = argValue;
        } else if (argName.equals("save")) {
            save = (argValue.equals("true"));
        } else if (argName.equals("shapeId")) {
            shapeId = Integer.parseInt(argValue);
        } else if (argName.equals("imageId")) {
            imageId = Integer.parseInt(argValue);
        } else if (argName.equals("docId")) {
            docId = Integer.parseInt(argValue);
        } else if (argName.equals("userId")) {
            userId = Integer.parseInt(argValue);
        } else if (argName.equals("imageCount")) {
            imageCount = Integer.parseInt(argValue);
        } else if (argName.equals("multiplier")) {
            multiplier = Integer.parseInt(argValue);
        } else if (argName.equals("imageStatus")) {
            String[] statusCodes = argValue.split(",");
            Set<ImageStatus> imageStatuses = new HashSet<>();
            for (String statusCode : statusCodes) {
                if (statusCode.equals("heldOut")) {
                    imageStatuses.add(ImageStatus.TRAINING_HELD_OUT);
                } else if (statusCode.equals("test")) {
                    imageStatuses.add(ImageStatus.TRAINING_TEST);
                } else if (statusCode.equals("training")) {
                    imageStatuses.add(ImageStatus.TRAINING_VALIDATED);
                } else if (statusCode.equals("all")) {
                    imageStatuses.add(ImageStatus.TRAINING_VALIDATED);
                    imageStatuses.add(ImageStatus.TRAINING_HELD_OUT);
                    imageStatuses.add(ImageStatus.TRAINING_TEST);
                } else {
                    throw new RuntimeException("Unknown imageSet: " + statusCode);
                }
            }
            imageSet = imageStatuses.toArray(new ImageStatus[0]);
        } else if (argName.equals("reconstructLetters")) {
            reconstructLetters = (argValue.equals("true"));
        } else if (argName.equals("excludeImageId")) {
            excludeImageId = Integer.parseInt(argValue);
        } else if (argName.equals("crossValidationSize")) {
            crossValidationSize = Integer.parseInt(argValue);
        } else if (argName.equals("includeIndex")) {
            includeIndex = Integer.parseInt(argValue);
        } else if (argName.equals("excludeIndex")) {
            excludeIndex = Integer.parseInt(argValue);
        } else if (argName.equals("docSet")) {
            String[] docIdArray = argValue.split(",");
            documentSet = new HashSet<>();
            for (String docIdString : docIdArray) {
                documentSet.add(Integer.parseInt(docIdString));
            }
        } else if (argName.equals("docSelection")) {
            docSelectionPath = argValue;
        } else if (argName.equals("docGroupFile")) {
            docGroupPath = argValue;
        } else if (argName.equals("suffix")) {
            suffix = argValue;
        } else if (argName.equals("includeBeam")) {
            includeBeam = argValue.equalsIgnoreCase("true");
        } else if (argName.equals("outputFormat")) {
            outputFormats = new ArrayList<>();
            String[] outputFormatStrings = argValue.split(",");
            for (String outputFormatString : outputFormatStrings) {
                outputFormats.add(OutputFormat.valueOf(outputFormatString));
            }
            if (outputFormats.size() == 0) {
                throw new JochreException("At least one outputFormat required.");
            }
        } else if (argName.equals("features")) {
            // one feature descriptor per line of the given file
            featureDescriptors = new ArrayList<>();
            try (Scanner scanner = new Scanner(
                    new BufferedReader(new InputStreamReader(new FileInputStream(new File(argValue)), "UTF-8")))) {
                while (scanner.hasNextLine()) {
                    String descriptor = scanner.nextLine();
                    featureDescriptors.add(descriptor);
                    LOG.debug(descriptor);
                }
            }
        } else if (argName.equals("includeDate")) {
            includeDate = argValue.equalsIgnoreCase("true");
        } else {
            throw new RuntimeException("Unknown argument: " + argName);
        }
    }

    // An explicit "pages" argument wins; otherwise derive the page set from
    // first/last, filling in whichever bound was not supplied.
    if (pages.isEmpty() && (firstPage >= 0 || lastPage >= 0)) {
        if (firstPage < 0) {
            firstPage = 0;
        }
        if (lastPage < 0) {
            lastPage = config.getInt("jochre.pdf.max-page");
        }
        pages = IntStream.rangeClosed(firstPage, lastPage).boxed().collect(Collectors.toSet());
    }

    long startTime = System.currentTimeMillis();
    try {
        this.setUserId(userId);

        CorpusSelectionCriteria criteria = new CorpusSelectionCriteria();
        if (docSelectionPath != null) {
            File docSelectionFile = new File(docSelectionPath);
            // try-with-resources: the file is closed even if loadSelection fails
            try (Scanner scanner = new Scanner(
                    new BufferedReader(new InputStreamReader(new FileInputStream(docSelectionFile), jochreSession.getEncoding())))) {
                criteria.loadSelection(scanner);
            }
        } else {
            criteria.setImageId(imageId);
            criteria.setImageCount(imageCount);
            if (imageSet != null) {
                criteria.setImageStatusesToInclude(imageSet);
            }
            criteria.setExcludeImageId(excludeImageId);
            criteria.setCrossValidationSize(crossValidationSize);
            criteria.setIncludeIndex(includeIndex);
            criteria.setExcludeIndex(excludeIndex);
            criteria.setDocumentId(docId);
            criteria.setDocumentIds(documentSet);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(criteria.getAttributes().toString());
        }

        if (docGroupPath != null) {
            // each line has the form groupName=id1,id2,...
            File docGroupFile = new File(docGroupPath);
            // try-with-resources: the file is closed even if a line fails to parse
            try (Scanner scanner = new Scanner(
                    new BufferedReader(new InputStreamReader(new FileInputStream(docGroupFile), jochreSession.getEncoding())))) {
                while (scanner.hasNextLine()) {
                    String line = scanner.nextLine();
                    int equalsPos = line.indexOf('=');
                    String groupName = line.substring(0, equalsPos);
                    String[] ids = line.substring(equalsPos + 1).split(",");
                    Set<Integer> idSet = new HashSet<>();
                    for (String idString : ids) {
                        idSet.add(Integer.parseInt(idString));
                    }
                    documentGroups.put(groupName, idSet);
                }
            }
        }

        MostLikelyWordChooser wordChooser = new MostLikelyWordChooser(jochreSession);

        // outDir takes precedence; otherwise the output directory is the parent
        // of outputFile.
        File outputDir = null;
        File outputFile = null;
        if (outputDirPath != null) {
            outputDir = new File(outputDirPath);
        } else if (outputFilePath != null) {
            outputFile = new File(outputFilePath);
            outputDir = outputFile.getParentFile();
        }
        if (outputDir != null) {
            outputDir.mkdirs();
        }

        // analyseFolder builds its own observers per pdf file, further down
        List<DocumentObserver> observers = null;
        List<PdfImageObserver> imageObservers = null;
        if (outputFormats.size() > 0 && !command.equals("analyseFolder")) {
            if (outputDir == null) {
                throw new JochreException("Either outputDir our outputFile are required with outputFormats");
            }
            String baseName = null;
            if (userFriendlyName != null && userFriendlyName.length() > 0) {
                baseName = userFriendlyName;
            } else if (inFilePath != null && inFilePath.length() > 0) {
                File inFile = new File(inFilePath);
                baseName = this.getBaseName(inFile);
            }
            observers = this.getObservers(outputFormats, baseName, outputDir, includeDate);
            imageObservers = this.getImageObservers(outputFormats, baseName, outputDir);
        }

        if (userFriendlyName.length() == 0) {
            userFriendlyName = inFilePath;
        }

        if (command.equals("segment")) {
            this.doCommandSegment(inFilePath, userFriendlyName, outputDir, save, pages);
        } else if (command.equals("extract")) {
            this.doCommandExtractImages(inFilePath, outputDir, pages);
        } else if (command.equals("updateImages")) {
            this.doCommandUpdateImages(inFilePath, docId, pages);
        } else if (command.equals("applyFeatures")) {
            this.doCommandApplyFeatures(imageId, shapeId, featureDescriptors);
        } else if (command.equals("train")) {
            this.doCommandTrain(featureDescriptors, criteria, reconstructLetters);
        } else if (command.equals("evaluate") || command.equals("evaluateComplex")) {
            this.doCommandEvaluate(criteria, outputDir, wordChooser, reconstructLetters, save, suffix, includeBeam, observers);
        } else if (command.equals("evaluateFull")) {
            this.doCommandEvaluateFull(criteria, save, outputDir, wordChooser, suffix, observers);
        } else if (command.equals("analyse")) {
            this.doCommandAnalyse(criteria, wordChooser, observers);
        } else if (command.equals("transform")) {
            this.doCommandTransform(criteria, observers, imageObservers);
        } else if (command.equals("trainSplits")) {
            this.doCommandTrainSplits(featureDescriptors, criteria);
        } else if (command.equals("evaluateSplits")) {
            this.doCommandEvaluateSplits(criteria);
        } else if (command.equals("trainMerge")) {
            this.doCommandTrainMerge(featureDescriptors, multiplier, criteria);
        } else if (command.equals("evaluateMerge")) {
            this.doCommandEvaluateMerge(criteria);
        } else if (command.equals("logImage")) {
            this.doCommandLogImage(shapeId);
        } else if (command.equals("testFeature")) {
            this.doCommandTestFeature(shapeId);
        } else if (command.equals("serializeLexicon")) {
            if (outputDir == null) {
                throw new JochreException("Either outputDir our outputFile are required for " + command);
            }
            File inputFile = new File(inFilePath);
            if (inputFile.isDirectory()) {
                // serialize every file in the directory to <baseName>.obj
                File[] lexiconFiles = inputFile.listFiles();
                for (File oneLexFile : lexiconFiles) {
                    LOG.debug(oneLexFile.getName() + ": " + ", size: " + oneLexFile.length());
                    TextFileLexicon lexicon = new TextFileLexicon(oneLexFile, jochreSession.getEncoding());
                    String baseName = oneLexFile.getName().substring(0, oneLexFile.getName().indexOf("."));
                    if (baseName.lastIndexOf("/") > 0) {
                        baseName = baseName.substring(baseName.lastIndexOf("/") + 1);
                    }
                    File lexiconFile = new File(outputDir, baseName + ".obj");
                    lexicon.serialize(lexiconFile);
                }
            } else {
                LOG.debug(inFilePath + ": " + inputFile.exists() + ", size: " + inputFile.length());
                TextFileLexicon lexicon = new TextFileLexicon(inputFile, jochreSession.getEncoding());
                String baseName = inFilePath.substring(0, inFilePath.indexOf("."));
                if (baseName.lastIndexOf("/") > 0) {
                    baseName = baseName.substring(baseName.lastIndexOf("/") + 1);
                }
                // an explicit outputFile wins over the derived <baseName>.obj
                File lexiconFile = outputFile;
                if (lexiconFile == null) {
                    lexiconFile = new File(outputDir, baseName + ".obj");
                }
                lexicon.serialize(lexiconFile);
            }
        } else if (command.equals("analyseFolder")) {
            // Fail with an explicit message rather than a bare NullPointerException
            // when inDir is missing or cannot be listed.
            if (inDirPath == null) {
                throw new JochreException("inDir is required for " + command);
            }
            File inDir = new File(inDirPath);
            File[] pdfFiles = inDir.listFiles((dir, name) -> name.toLowerCase().endsWith(".pdf"));
            if (pdfFiles == null) {
                throw new JochreException("inDir is not a readable directory: " + inDirPath);
            }
            Arrays.sort(pdfFiles);
            for (File pdfFile : pdfFiles) {
                LOG.info("Analysing file: " + pdfFile.getAbsolutePath());
                try {
                    // Analyse into a working directory next to the pdf, then move
                    // the pdf and all analysis results into the output directory.
                    String baseName = this.getBaseName(pdfFile);
                    File analysisDir = new File(inDir, baseName);
                    analysisDir.mkdirs();
                    List<DocumentObserver> pdfObservers = this.getObservers(outputFormats, baseName, analysisDir, includeDate);
                    List<PdfImageObserver> pdfImageObservers = this.getImageObservers(outputFormats, baseName, analysisDir);
                    this.doCommandAnalyse(pdfFile, wordChooser, pages, pdfObservers, pdfImageObservers);

                    File pdfOutputDir = new File(outputDir, baseName);
                    pdfOutputDir.mkdirs();
                    File targetFile = new File(pdfOutputDir, pdfFile.getName());
                    Files.move(pdfFile.toPath(), targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
                    File[] analysisFiles = analysisDir.listFiles();
                    for (File analysisFile : analysisFiles) {
                        targetFile = new File(pdfOutputDir, analysisFile.getName());
                        Files.move(analysisFile.toPath(), targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
                    }
                    Files.delete(analysisDir.toPath());
                } catch (Exception e) {
                    // log errors, but continue processing
                    LOG.error("Error processing file: " + pdfFile.getAbsolutePath(), e);
                }
            }
        } else if (command.equals("analyseFile")) {
            File pdfFile = new File(inFilePath);
            this.doCommandAnalyse(pdfFile, wordChooser, pages, observers, imageObservers);
        } else if (command.equals("findSplits")) {
            GraphicsDao graphicsDao = GraphicsDao.getInstance(jochreSession);
            List<Shape> shapesToSplit = graphicsDao.findShapesToSplit(jochreSession.getLocale());
            for (Shape shape : shapesToSplit) {
                LOG.info(shape.toString());
            }
        } else {
            throw new RuntimeException("Unknown command: " + command);
        }
    } catch (Exception e) {
        LOG.error("An error occurred while running Jochre", e);
        throw e;
    } finally {
        long duration = System.currentTimeMillis() - startTime;
        LOG.info("Duration (ms):" + duration);
    }
    LOG.info("#### finished #####");
}
use of com.joliciel.jochre.graphics.Shape in project jochre by urieli.
the class BadGuessCollector method onFinish.
/**
 * For every shape id collected in {@code shapeIdsToAnalyse}, loads the shape,
 * logs the expected letter next to the outcome stored for that id, and writes
 * the shape's image to the log.
 */
@Override
public void onFinish() {
    shapeIdsToAnalyse.forEach((id, guessedOutcome) -> {
        Shape analysedShape = this.graphicsDao.loadShape(id);
        LOG.debug("### Shape " + analysedShape);
        LOG.debug("Expected: " + analysedShape.getLetter() + " Guessed: " + guessedOutcome);
        analysedShape.writeImageToLog();
    });
}
use of com.joliciel.jochre.graphics.Shape in project jochre by urieli.
the class DeterministicBoundaryDetector method findBoundaries.
/**
 * Builds a single shape sequence for a group by walking its shapes in order,
 * optionally splitting each shape and optionally merging the first resulting
 * shape with the last shape already in the sequence.
 * <p>
 * A shape is only submitted to the splitter when a splitter is configured and
 * both its width/x-height and height/x-height ratios reach the configured
 * minimums; the best-scoring split is kept only if its score reaches
 * {@code minProbabilityForDecision}. In all other cases the shape is kept
 * whole. A merge is only evaluated when a merger is configured, a previous
 * shape exists, and the candidate pair's width and inner-distance ratios are
 * within the configured maximums.
 *
 * @param group
 *          the group whose shapes are to be sequenced
 * @return a list containing exactly one sequence: the one built here
 */
@Override
public List<ShapeSequence> findBoundaries(GroupOfShapes group) {
    // find the possible shape sequences that make up this group
    ShapeSequence bestSequence = new ShapeSequence();
    for (Shape shape : group.getShapes()) {
        // check if shape is wide enough to bother with
        double widthRatio = (double) shape.getWidth() / (double) shape.getXHeight();
        double heightRatio = (double) shape.getHeight() / (double) shape.getXHeight();

        // Splitting/merging shapes as required
        ShapeSequence bestSplitSequence = null;
        if (this.shapeSplitter != null && widthRatio >= minWidthRatioForSplit && heightRatio >= minHeightRatioForSplit) {
            double bestProb = 0;
            for (ShapeSequence splitSequence : shapeSplitter.split(shape)) {
                if (splitSequence.getScore() > bestProb) {
                    bestSplitSequence = splitSequence;
                    bestProb = splitSequence.getScore();
                }
            }
            if (bestProb < minProbabilityForDecision) {
                // not confident enough in any split: discard it
                bestSplitSequence = null;
            }
        }
        if (bestSplitSequence == null) {
            // No split attempted, none confident enough, or the splitter returned
            // no sequence with a positive score: keep the shape whole. This
            // fallback also prevents a NullPointerException further down.
            bestSplitSequence = new ShapeSequence();
            bestSplitSequence.addShape(shape);
        }

        ShapeInSequence previousShapeInSequence = null;
        Shape previousShape = null;
        if (bestSequence.size() > 0) {
            previousShapeInSequence = bestSequence.get(bestSequence.size() - 1);
            previousShape = previousShapeInSequence.getShape();
        }

        ShapeInSequence firstShapeInSequence = bestSplitSequence.get(0);
        Shape firstShape = firstShapeInSequence.getShape();

        double mergeProb = 0;
        if (this.shapeMerger != null && previousShape != null) {
            // The size pre-check uses the whole (unsplit) current shape, while the
            // merge itself is evaluated against the first shape after splitting.
            ShapePair mergeCandidate = new ShapePair(previousShape, shape);
            double mergeCandidateWidthRatio = (double) mergeCandidate.getWidth() / (double) mergeCandidate.getXHeight();
            double mergeCandidateDistanceRatio = (double) mergeCandidate.getInnerDistance() / (double) mergeCandidate.getXHeight();
            if (mergeCandidateWidthRatio <= maxWidthRatioForMerge && mergeCandidateDistanceRatio <= maxDistanceRatioForMerge) {
                mergeProb = shapeMerger.checkMerge(previousShape, firstShape);
            }
        }

        if (mergeProb > minProbabilityForDecision) {
            // Replace the last shape of the sequence by the merged shape, keeping
            // track of the original shapes from both sides.
            Shape mergedShape = shapeMerger.merge(previousShape, firstShape);
            bestSequence.remove(bestSequence.size() - 1);
            List<Shape> originalShapesForMerge = new ArrayList<Shape>();
            originalShapesForMerge.addAll(previousShapeInSequence.getOriginalShapes());
            originalShapesForMerge.addAll(firstShapeInSequence.getOriginalShapes());
            bestSequence.addShape(mergedShape, originalShapesForMerge);

            // the first split shape is now part of the merged shape: skip it
            boolean isFirstShape = true;
            for (ShapeInSequence splitShape : bestSplitSequence) {
                if (!isFirstShape) {
                    bestSequence.add(splitShape);
                }
                isFirstShape = false;
            }

            Decision mergeDecision = new Decision(MergeOutcome.DO_MERGE.name(), mergeProb);
            bestSequence.addDecision(mergeDecision);
            for (Decision splitDecision : bestSplitSequence.getDecisions()) {
                bestSequence.addDecision(splitDecision);
            }
        } else {
            if (mergeProb > 0) {
                // a merge was evaluated and rejected: record the decision
                Decision mergeDecision = new Decision(MergeOutcome.DO_NOT_MERGE.name(), 1 - mergeProb);
                bestSequence.addDecision(mergeDecision);
            }
            for (Decision splitDecision : bestSplitSequence.getDecisions()) {
                bestSequence.addDecision(splitDecision);
            }
            for (ShapeInSequence splitShape : bestSplitSequence) {
                bestSequence.add(splitShape);
            }
        }
    } // next shape in group

    List<ShapeSequence> result = new ArrayList<ShapeSequence>();
    result.add(bestSequence);
    return result;
}
use of com.joliciel.jochre.graphics.Shape in project jochre by urieli.
the class MergeEvaluator method evaluate.
/**
 * Evaluates a shape merger against every pair of consecutive shapes in every
 * group supplied by the reader.
 * <p>
 * The expected outcome for a pair is derived from the "|" markers in the two
 * shapes' letters. Pairs whose width or inner-distance ratio exceeds the
 * configured maximum are never submitted to the merger: they are counted as
 * expected "YES" / guessed "WIDE" when a merge was expected, and "NO" / "NO"
 * otherwise.
 *
 * @param groupReader
 *          source of the groups to evaluate
 * @param shapeMerger
 *          the merger being evaluated
 * @return the f-score calculator holding the expected/guessed counts
 */
public FScoreCalculator<String> evaluate(JochreCorpusGroupReader groupReader, ShapeMerger shapeMerger) {
    LOG.debug("evaluate");
    FScoreCalculator<String> calculator = new FScoreCalculator<String>();

    while (groupReader.hasNext()) {
        Shape lastShape = null;
        for (Shape currentShape : groupReader.next().getShapes()) {
            if (lastShape == null) {
                // first shape in the group: nothing to pair it with yet
                lastShape = currentShape;
                continue;
            }

            ShapePair pair = new ShapePair(lastShape, currentShape);

            // ratios stay at zero when the pair has no usable x-height
            double xHeight = pair.getXHeight();
            boolean hasXHeight = xHeight > 0;
            double widthRatio = hasXHeight ? (double) pair.getWidth() / xHeight : 0;
            double distanceRatio = hasXHeight ? (double) pair.getInnerDistance() / xHeight : 0;

            String firstLetter = pair.getFirstShape().getLetter();
            String secondLetter = pair.getSecondShape().getLetter();
            boolean shouldMerge;
            if (firstLetter.startsWith("|")) {
                shouldMerge = secondLetter.length() == 0 || secondLetter.endsWith("|");
            } else {
                shouldMerge = secondLetter.endsWith("|") && firstLetter.length() == 0;
            }

            if (LOG.isTraceEnabled()) {
                LOG.trace(pair.toString());
                LOG.trace("widthRatio: " + widthRatio);
                LOG.trace("distanceRatio: " + distanceRatio);
                LOG.trace("shouldMerge: " + shouldMerge);
            }

            String expected = shouldMerge ? "YES" : "NO";
            boolean withinLimits = widthRatio <= maxWidthRatio && distanceRatio <= maxDistanceRatio;
            if (withinLimits) {
                double mergeProb = shapeMerger.checkMerge(lastShape, currentShape);
                String guessed = mergeProb >= minProbabilityForDecision ? "YES" : "NO";
                calculator.increment(expected, guessed);
            } else {
                LOG.trace("too wide");
                calculator.increment(expected, shouldMerge ? "WIDE" : "NO");
            }

            lastShape = currentShape;
        } // next shape
    } // next group

    return calculator;
}
use of com.joliciel.jochre.graphics.Shape in project jochre by urieli.
the class RecursiveShapeSplitter method split.
/**
 * Recursively splits a shape into candidate shape sequences.
 * <p>
 * If the shape's width/x-height ratio is below minWidthRatio, or depth has
 * reached maxDepth, the only sequence returned contains the shape itself.
 * Otherwise every split candidate, plus a "no split" candidate, is weighted;
 * each real split is applied, both halves are split recursively, and every
 * left/right combination becomes a candidate sequence.
 *
 * @param shape
 *          the shape to split at this level of recursion
 * @param depth
 *          current recursion depth; recursive calls pass depth + 1
 * @param originalShape
 *          recorded alongside each unsplit shape added to a sequence, and
 *          passed unchanged to recursive calls
 * @param leftToRight
 *          if true, combined sequences put the left half's shapes first;
 *          otherwise the right half's
 * @return the retained sequences: every sequence containing a single shape,
 *         plus those among the first beamWidth sequences in iteration order
 */
List<ShapeSequence> split(Shape shape, int depth, Shape originalShape, boolean leftToRight) {
// prefix for trace messages: one extra "-" per recursion level
String padding = "-";
for (int i = 0; i < depth; i++) padding += "-";
padding += " ";
if (LOG.isTraceEnabled()) {
LOG.trace(padding + "Splitting shape: " + shape.getLeft() + " , " + shape.getRight());
LOG.trace(padding + "depth: " + depth);
}
List<ShapeSequence> shapeSequences = new ArrayList<ShapeSequence>();
// check if shape is wide enough to bother with
double widthRatio = (double) shape.getWidth() / (double) shape.getXHeight();
if (LOG.isTraceEnabled())
LOG.trace(padding + "widthRatio: " + widthRatio);
if (widthRatio < minWidthRatio || depth >= maxDepth) {
// base case: return the shape unsplit, with no decision attached
if (LOG.isTraceEnabled())
LOG.trace(padding + "too narrow or too deep");
ShapeSequence shapeSequence = new ShapeSequence();
shapeSequence.addShape(shape, originalShape);
shapeSequences.add(shapeSequence);
} else {
List<Split> splitCandidates = this.splitCandidateFinder.findSplitCandidates(shape);
// NOTE(review): both TreeSets rely on the natural ordering of their elements,
// presumably highest weight/score first - confirm against WeightedOutcome
// and ShapeSequence.
TreeSet<ShapeSequence> myShapeSequences = new TreeSet<ShapeSequence>();
TreeSet<WeightedOutcome<Split>> weightedSplits = new TreeSet<WeightedOutcome<Split>>();
for (Split splitCandidate : splitCandidates) {
double splitProb = this.shouldSplit(splitCandidate);
WeightedOutcome<Split> weightedSplit = new WeightedOutcome<Split>(splitCandidate, splitProb);
weightedSplits.add(weightedSplit);
}
// maxSplitProb starts as the weight of the first split in the set (0 if
// there are none), and ends up as the larger of that weight and the
// no-split probability; it is the normaliser for all probabilities below
double maxSplitProb = 0.0;
if (weightedSplits.size() > 0)
maxSplitProb = weightedSplits.first().getWeight();
double noSplitProb = 1 - maxSplitProb;
if (noSplitProb > maxSplitProb)
maxSplitProb = noSplitProb;
// the no-split option is represented as a split at position -1
Split noSplit = new Split(shape, jochreSession);
noSplit.setPosition(-1);
WeightedOutcome<Split> weightedNoSplit = new WeightedOutcome<Split>(noSplit, noSplitProb);
weightedSplits.add(weightedNoSplit);
// topCandidate is true only for the first candidate iterated;
// topCandidateWeight is set by that candidate and scales all later ones
boolean topCandidate = true;
double topCandidateWeight = 1.0;
for (WeightedOutcome<Split> weightedSplit : weightedSplits) {
Split splitCandidate = weightedSplit.getOutcome();
double splitProb = weightedSplit.getWeight();
if (LOG.isTraceEnabled())
LOG.trace(padding + "splitCandidate: left=" + splitCandidate.getShape().getLeft() + ", pos=" + splitCandidate.getPosition() + ", initial prob: " + splitProb);
if (LOG.isTraceEnabled()) {
if (topCandidate) {
LOG.trace(padding + "topCandidate");
}
}
if (splitCandidate.getPosition() < 0) {
// This is the no-split candidate
if (topCandidate)
topCandidateWeight = 1.0;
ShapeSequence shapeSequence = new ShapeSequence();
shapeSequence.addShape(shape, originalShape);
double prob = (splitProb / maxSplitProb) * topCandidateWeight;
if (LOG.isTraceEnabled())
LOG.trace(padding + "noSplit prob=(" + splitProb + " / " + maxSplitProb + ") * " + topCandidateWeight + " = " + prob);
Decision decision = new Decision(SplitOutcome.DO_NOT_SPLIT.name(), prob);
shapeSequence.addDecision(decision);
myShapeSequences.add(shapeSequence);
} else {
// a proper split
// left half ends at the split position; right half starts one past it
Shape leftShape = shape.getJochreImage().getShape(shape.getLeft(), shape.getTop(), shape.getLeft() + splitCandidate.getPosition(), shape.getBottom());
Shape rightShape = shape.getJochreImage().getShape(shape.getLeft() + splitCandidate.getPosition() + 1, shape.getTop(), shape.getRight(), shape.getBottom());
// for each split recursively try to split it again up to depth of m
// Note: m=2 is probably enough, since we're not expecting more than 4
// letters per shape (3 splits)
List<ShapeSequence> leftShapeSequences = this.split(leftShape, depth + 1, originalShape, leftToRight);
List<ShapeSequence> rightShapeSequences = this.split(rightShape, depth + 1, originalShape, leftToRight);
if (topCandidate) {
// find the no-split sequence in each sub-sequence
ShapeSequence noSplitLeft = null;
for (ShapeSequence leftShapeSequence : leftShapeSequences) {
if (leftShapeSequence.size() == 1) {
noSplitLeft = leftShapeSequence;
break;
}
}
ShapeSequence noSplitRight = null;
for (ShapeSequence rightShapeSequence : rightShapeSequences) {
if (rightShapeSequence.size() == 1) {
noSplitRight = rightShapeSequence;
break;
}
}
// we should be guaranteed to find a noSplitLeft and noSplitRight
// since a no-split candidate is always returned
// (there is no null check here: a missing one would raise a
// NullPointerException on the next line)
topCandidateWeight = noSplitLeft.getScore() * noSplitRight.getScore();
if (LOG.isTraceEnabled())
LOG.trace(padding + "topCandidateWeight=" + noSplitLeft.getScore() + " *" + noSplitRight.getScore() + " = " + topCandidateWeight);
}
// combine every left sequence with every right sequence
for (ShapeSequence leftShapeSequence : leftShapeSequences) {
for (ShapeSequence rightShapeSequence : rightShapeSequences) {
ShapeSequence newSequence = null;
if (leftToRight)
newSequence = new ShapeSequence(leftShapeSequence, rightShapeSequence);
else
newSequence = new ShapeSequence(rightShapeSequence, leftShapeSequence);
if (LOG.isTraceEnabled()) {
StringBuilder sb = new StringBuilder();
for (ShapeInSequence splitShape : newSequence) {
sb.append("(" + splitShape.getShape().getLeft() + "," + splitShape.getShape().getRight() + ") ");
}
LOG.trace(padding + sb.toString());
}
// collapse the combined sequence's decisions into their product, then
// replace them with a single DO_SPLIT decision carrying the final
// probability
double totalProb = 1.0;
for (Decision decision : newSequence.getDecisions()) {
totalProb = totalProb * decision.getProbability();
}
newSequence.getDecisions().clear();
double prob = 0.0;
if (topCandidate) {
// the top candidate is not scaled by topCandidateWeight
prob = totalProb * (splitProb / maxSplitProb);
if (LOG.isTraceEnabled())
LOG.trace(padding + "prob=" + totalProb + " * (" + splitProb + " / " + maxSplitProb + ") = " + prob);
} else {
prob = totalProb * (splitProb / maxSplitProb) * topCandidateWeight;
if (LOG.isTraceEnabled())
LOG.trace(padding + "prob=" + totalProb + " * (" + splitProb + " / " + maxSplitProb + ") * " + topCandidateWeight + " = " + prob);
}
Decision decision = new Decision(SplitOutcome.DO_SPLIT.name(), prob);
newSequence.addDecision(decision);
myShapeSequences.add(newSequence);
}
}
}
topCandidate = false;
}
// keep the first beamWidth sequences in set order, and always keep any
// sequence made of a single shape, whatever its position
int i = 0;
for (ShapeSequence shapeSequence : myShapeSequences) {
// probability
if (shapeSequence.size() == 1 || i < beamWidth) {
shapeSequences.add(shapeSequence);
}
i++;
}
}
return shapeSequences;
}
Aggregations