use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.
the class PatternEventStream method getTaggedTokens.
/**
 * Tags every token of the sequence (as returned by {@code listWithWhiteSpace()})
 * with a tokeniser outcome: {@code SEPARATE} when the token's start index is
 * listed in {@code tokenSplits}, {@code JOIN} otherwise.
 *
 * @param tokenSequence the sequence whose tokens are to be tagged
 * @param tokenSplits start indexes at which a token is marked as separate
 * @return one tagged token per token, in iteration order
 */
public List<TaggedToken<TokeniserOutcome>> getTaggedTokens(TokenSequence tokenSequence, List<Integer> tokenSplits) {
    List<TaggedToken<TokeniserOutcome>> result = new ArrayList<TaggedToken<TokeniserOutcome>>();
    for (Token current : tokenSequence.listWithWhiteSpace()) {
        boolean startsAtSplit = tokenSplits.contains(current.getStartIndex());
        TokeniserOutcome tag = startsAtSplit ? TokeniserOutcome.SEPARATE : TokeniserOutcome.JOIN;

        // The decision carries the outcome name; the tag is read back from it.
        Decision tagDecision = new Decision(tag.name());
        TokeniserOutcome decidedTag = TokeniserOutcome.valueOf(tagDecision.getOutcome());
        TaggedToken<TokeniserOutcome> tagged = new TaggedToken<>(current, tagDecision, decidedTag);
        result.add(tagged);
    }
    return result;
}
use of com.joliciel.talismane.machineLearning.Decision in project jochre by urieli.
the class RecursiveShapeSplitterTest method testSplitShape.
/**
 * With a 50/50 split probability returned for every decision, the splitter
 * is expected to yield 5 shape sequences, each with the same score (1.0).
 */
@SuppressWarnings("unchecked")
@Test
public void testSplitShape() throws Exception {
    // Load the test configuration from scratch.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    Config testConfig = ConfigFactory.load();
    JochreSession session = new JochreSession(testConfig);

    BufferedImage blankImage = new BufferedImage(256, 256, BufferedImage.TYPE_INT_RGB);
    final JochreImage image = new JochreImage(blankImage, session);

    // The shape to split, plus the two shapes stubbed as further split targets.
    final Shape wholeShape = new Shape(image, 0, 0, 63, 15, session);
    wholeShape.setBaseLine(12);
    wholeShape.setMeanLine(4);

    final Shape firstPart = new Shape(image, 0, 0, 31, 15, session);
    firstPart.setBaseLine(12);
    firstPart.setMeanLine(4);

    final Shape secondPart = new Shape(image, 32, 0, 63, 15, session);
    secondPart.setBaseLine(12);
    secondPart.setMeanLine(4);

    final SplitCandidateFinder candidateFinder = mock(SplitCandidateFinder.class);
    final DecisionMaker mockDecisionMaker = mock(DecisionMaker.class);

    // One split candidate for the whole shape.
    Split wholeSplit = new Split(wholeShape, session);
    wholeSplit.setPosition(31);
    List<Split> wholeSplits = new ArrayList<>();
    wholeSplits.add(wholeSplit);
    when(candidateFinder.findSplitCandidates(wholeShape)).thenReturn(wholeSplits);

    // Every decision request answers split / don't split with equal probability.
    Decision splitDecision = new Decision(SplitOutcome.DO_SPLIT.name(), 0.5);
    Decision keepDecision = new Decision(SplitOutcome.DO_NOT_SPLIT.name(), 0.5);
    List<Decision> evenDecisions = new ArrayList<>();
    evenDecisions.add(splitDecision);
    evenDecisions.add(keepDecision);
    when(mockDecisionMaker.decide(anyList())).thenReturn(evenDecisions);

    // One split candidate for each of the two other shapes.
    Split firstPartSplit = new Split(firstPart, session);
    firstPartSplit.setPosition(15);
    List<Split> firstPartSplits = new ArrayList<>();
    firstPartSplits.add(firstPartSplit);
    when(candidateFinder.findSplitCandidates(firstPart)).thenReturn(firstPartSplits);

    Split secondPartSplit = new Split(secondPart, session);
    secondPartSplit.setPosition(15);
    List<Split> secondPartSplits = new ArrayList<>();
    secondPartSplits.add(secondPartSplit);
    when(candidateFinder.findSplitCandidates(secondPart)).thenReturn(secondPartSplits);

    Set<SplitFeature<?>> noFeatures = new TreeSet<>();
    RecursiveShapeSplitter shapeSplitter = new RecursiveShapeSplitter(candidateFinder, noFeatures, mockDecisionMaker, session);
    shapeSplitter.setBeamWidth(10);
    shapeSplitter.setMaxDepth(2);
    shapeSplitter.setMinWidthRatio(1.0);

    List<ShapeSequence> sequences = shapeSplitter.split(wholeShape);

    assertEquals(5, sequences.size());
    for (ShapeSequence sequence : sequences) {
        assertEquals(1.0, sequence.getScore(), 0.0001);
    }
}
use of com.joliciel.talismane.machineLearning.Decision in project jochre by urieli.
the class LetterByLetterBoundaryDetector method findBoundaries.
/**
 * Finds candidate shape sequences for a group using a beam search over its shapes.
 * <p>
 * For each shape, the possible split sequences are computed (or a single-shape
 * sequence when no splitter applies). Each of these is appended to up to
 * {@code beamWidth} histories polled from the previous heap, optionally merging
 * the first shape of the split sequence with the last shape of the history.
 *
 * @param group the group of shapes to analyse
 * @return at most {@code beamWidth} sequences, in the order they are polled from the final heap
 */
@Override
public List<ShapeSequence> findBoundaries(GroupOfShapes group) {
    // find the possible shape sequences that make up this group
    ShapeSequence emptySequence = new ShapeSequence();
    PriorityQueue<ShapeSequence> heap = new PriorityQueue<ShapeSequence>();
    heap.add(emptySequence);

    for (Shape shape : group.getShapes()) {
        PriorityQueue<ShapeSequence> previousHeap = heap;
        heap = new PriorityQueue<ShapeSequence>();

        // check if shape is wide enough to bother with
        double widthRatio = (double) shape.getWidth() / (double) shape.getXHeight();
        double heightRatio = (double) shape.getHeight() / (double) shape.getXHeight();

        // Splitting/merging shapes as required
        List<ShapeSequence> splitSequences = null;
        if (this.shapeSplitter != null && widthRatio >= minWidthRatioForSplit && heightRatio >= minHeightRatioForSplit) {
            splitSequences = shapeSplitter.split(shape);
        } else {
            // create a sequence containing only this shape
            ShapeSequence singleShapeSequence = new ShapeSequence();
            singleShapeSequence.addShape(shape);
            splitSequences = new ArrayList<ShapeSequence>();
            splitSequences.add(singleShapeSequence);
        }

        // limit the breadth to K
        int maxSequences = Math.min(previousHeap.size(), this.beamWidth);
        for (int j = 0; j < maxSequences; j++) {
            ShapeSequence history = previousHeap.poll();
            for (ShapeSequence splitSequence : splitSequences) {
                ShapeInSequence previousShapeInSequence = null;
                Shape previousShape = null;
                if (history.size() > 0) {
                    previousShapeInSequence = history.get(history.size() - 1);
                    previousShape = previousShapeInSequence.getShape();
                }
                ShapeInSequence firstShapeInSequence = splitSequence.get(0);
                Shape firstShape = firstShapeInSequence.getShape();

                double mergeProb = 0;
                if (this.shapeMerger != null && previousShape != null) {
                    // NOTE(review): the size/distance pre-check pairs the previous shape with the
                    // whole unsplit shape, while the merge itself uses the first split shape.
                    // Kept as in the original; confirm this is intended.
                    ShapePair mergeCandidate = new ShapePair(previousShape, shape);
                    double mergeCandidateWidthRatio = (double) mergeCandidate.getWidth() / (double) mergeCandidate.getXHeight();
                    double mergeCandidateDistanceRatio = (double) mergeCandidate.getInnerDistance() / (double) mergeCandidate.getXHeight();
                    if (mergeCandidateWidthRatio <= maxWidthRatioForMerge && mergeCandidateDistanceRatio <= maxDistanceRatioForMerge) {
                        mergeProb = shapeMerger.checkMerge(previousShape, firstShape);
                    }
                }

                if (mergeProb > 0) {
                    // Merge branch: replace the last shape of the history by the merged shape,
                    // then append the remaining shapes of the split sequence.
                    Shape mergedShape = shapeMerger.merge(previousShape, firstShape);
                    ShapeSequence mergedSequence = new ShapeSequence(history);
                    mergedSequence.remove(mergedSequence.size() - 1);

                    List<Shape> originalShapesForMerge = new ArrayList<Shape>();
                    originalShapesForMerge.addAll(previousShapeInSequence.getOriginalShapes());
                    originalShapesForMerge.addAll(firstShapeInSequence.getOriginalShapes());
                    mergedSequence.addShape(mergedShape, originalShapesForMerge);

                    boolean isFirstShape = true;
                    for (ShapeInSequence splitShape : splitSequence) {
                        if (!isFirstShape)
                            mergedSequence.add(splitShape);
                        isFirstShape = false;
                    }

                    Decision mergeDecision = new Decision(MergeOutcome.DO_MERGE.name(), mergeProb);
                    mergedSequence.addDecision(mergeDecision);
                    for (Decision splitDecision : splitSequence.getDecisions())
                        mergedSequence.addDecision(splitDecision);

                    // Add to the heap only once every decision is attached, as the non-merge
                    // branch does: a PriorityQueue positions an element when it is inserted,
                    // so the sequence must not be modified afterwards.
                    heap.add(mergedSequence);
                }

                if (mergeProb < 1) {
                    // No-merge branch: append the split sequence to the history unchanged.
                    ShapeSequence totalSequence = new ShapeSequence(history);
                    if (mergeProb > 0) {
                        Decision mergeDecision = new Decision(MergeOutcome.DO_NOT_MERGE.name(), 1 - mergeProb);
                        totalSequence.addDecision(mergeDecision);
                    }
                    for (Decision splitDecision : splitSequence.getDecisions())
                        totalSequence.addDecision(splitDecision);
                    for (ShapeInSequence splitShape : splitSequence) {
                        totalSequence.add(splitShape);
                    }
                    heap.add(totalSequence);
                }
            } // next split sequence for this shape
        } // next history from previous heap
    } // next shape in group

    // Keep at most beamWidth sequences from the final heap.
    List<ShapeSequence> result = new ArrayList<ShapeSequence>();
    while (result.size() < this.beamWidth && !heap.isEmpty()) {
        result.add(heap.poll());
    }
    return result;
}
use of com.joliciel.talismane.machineLearning.Decision in project jochre by urieli.
the class ShapeMerger method checkMerge.
/**
 * Given two sequential shapes, returns the probability of a merge.
 * <p>
 * Every merge feature is checked against the pair, the non-null results are
 * passed to the decision maker, and the probability of the
 * {@code DO_MERGE} decision is returned.
 *
 * @param shape1 the first shape of the candidate pair
 * @param shape2 the second shape of the candidate pair
 * @return the probability attached to the {@code DO_MERGE} decision, or 0.0
 *         if the decision maker returned no such decision
 */
public double checkMerge(Shape shape1, Shape shape2) {
    ShapePair mergeCandidate = new ShapePair(shape1, shape2);
    if (LOG.isTraceEnabled())
        LOG.trace("mergeCandidate: " + mergeCandidate);

    List<FeatureResult<?>> featureResults = new ArrayList<FeatureResult<?>>();
    // analyse features
    for (MergeFeature<?> feature : mergeFeatures) {
        RuntimeEnvironment env = new RuntimeEnvironment();
        FeatureResult<?> featureResult = feature.check(mergeCandidate, env);
        if (featureResult != null) {
            featureResults.add(featureResult);
            if (LOG.isTraceEnabled()) {
                LOG.trace(featureResult.toString());
            }
        }
    }

    List<Decision> decisions = decisionMaker.decide(featureResults);
    // Decision outcomes are Strings (the enum constant's name), so compare against
    // name(): comparing the String to the enum constant itself is never equal.
    String mergeOutcomeName = MergeOutcome.DO_MERGE.name();
    double yesProb = 0.0;
    for (Decision decision : decisions) {
        if (mergeOutcomeName.equals(decision.getOutcome())) {
            yesProb = decision.getProbability();
            break;
        }
    }

    if (LOG.isTraceEnabled()) {
        LOG.trace("yesProb: " + yesProb);
    }
    return yesProb;
}
use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.
the class PosTagRegexBasedCorpusReader method convertToPosTaggedToken.
/**
 * Builds a pos-tagged token for the token at {@code index}, using the pos-tag
 * read from the corpus line, and appends it to the pos-tag sequence.
 *
 * @param corpusLine the corpus line providing the pos-tag, optional comment and lexical entry
 * @param posTagSequence the sequence holding the token and receiving the new pos-tagged token
 * @param index position of the token in the sequence's token sequence
 * @param currentFile the file being read, used only in the error message; may be null
 * @return the pos-tagged token that was added to the sequence
 * @throws TalismaneException if the pos-tag on the corpus line is unknown to the pos-tag set
 */
protected PosTaggedToken convertToPosTaggedToken(CorpusLine corpusLine, PosTagSequence posTagSequence, int index, File currentFile) throws TalismaneException {
    Token token = posTagSequence.getTokenSequence().get(index);
    PosTagSet posTagSet = TalismaneSession.get(sessionId).getPosTagSet();

    PosTag posTag = null;
    try {
        posTag = posTagSet.getPosTag(corpusLine.getElement(CorpusElement.POSTAG));
    } catch (UnknownPosTagException upte) {
        String fileName = "";
        if (currentFile != null)
            fileName = currentFile.getPath();
        TalismaneException unknownPosTag = new TalismaneException(
                "Unknown posTag, " + fileName + ", on line " + corpusLine.getLineNumber() + ": " + corpusLine.getElement(CorpusElement.POSTAG));
        // Keep the original exception as the cause so its stack trace is not lost.
        unknownPosTag.initCause(upte);
        throw unknownPosTag;
    }

    Decision posTagDecision = new Decision(posTag.getCode());
    PosTaggedToken posTaggedToken = new PosTaggedToken(token, posTagDecision, sessionId);
    if (LOG.isTraceEnabled()) {
        LOG.trace(posTaggedToken.toString());
    }

    if (corpusLine.hasElement(CorpusElement.POSTAG_COMMENT))
        posTaggedToken.setComment(corpusLine.getElement(CorpusElement.POSTAG_COMMENT));

    // set the lexical entry if we have one
    if (corpusLine.getLexicalEntry() != null) {
        List<LexicalEntry> lexicalEntrySet = new ArrayList<>(1);
        lexicalEntrySet.add(corpusLine.getLexicalEntry());
        posTaggedToken.setLexicalEntries(lexicalEntrySet);
    }

    posTagSequence.addPosTaggedToken(posTaggedToken);
    return posTaggedToken;
}
Aggregations