use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class TokenComparator method compare.
/**
 * Evaluates the evaluation corpus against the reference corpus, one sentence
 * at a time.
 * <p>
 * For each reference sentence, both the reference and the guessed sentence
 * are split into default (atomic) tokens. Each reference atomic token is then
 * tagged with the outcome found in the guessed sequence, and the result is
 * passed to every observer via {@code onNextTokenSequence}. Once the
 * reference corpus is exhausted, {@code onEvaluationComplete} is called on
 * every observer.
 * <p>
 * A reference token whose text differs from the current guessed token (or
 * for which the guessed atomic sequence has already run out) is counted as a
 * mismatch and tagged {@code SEPARATE}; the guessed position is not advanced
 * in that case.
 *
 * @throws TalismaneException
 *             if the evaluation corpus has fewer sentences than the
 *             reference corpus, or if a sentence contains more than 6
 *             mismatching tokens
 * @throws IOException
 *             declared by this method; presumably propagated from the
 *             corpus readers or observers (not visible from here)
 */
public void compare() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        TokenSequence realSequence = referenceCorpusReader.nextTokenSequence();
        if (!evaluationCorpusReader.hasNextSentence()) {
            throw new TalismaneException("Wrong number of sentences in eval corpus: " + realSequence.getSentence().getText());
        }
        TokenSequence guessedSequence = evaluationCorpusReader.nextTokenSequence();
        Sentence sentence = realSequence.getSentence();

        // Initially, separate the sentence into tokens using the separators
        // provided
        TokenSequence realAtomicSequence = new TokenSequence(sentence, sessionId);
        realAtomicSequence.findDefaultTokens();
        TokenSequence guessedAtomicSequence = new TokenSequence(guessedSequence.getSentence(), sessionId);
        guessedAtomicSequence.findDefaultTokens();

        // Index the pattern matches by the first non-whitespace token to
        // check (or the last token to check if they are all whitespace).
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
        for (TokenPattern parsedPattern : tokeniserPatternManager.getParsedTestPatterns()) {
            for (TokenPatternMatchSequence matchSequence : parsedPattern.match(realAtomicSequence)) {
                Token token = null;
                for (Token aToken : matchSequence.getTokensToCheck()) {
                    token = aToken;
                    if (!aToken.isWhiteSpace()) {
                        break;
                    }
                }
                Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
                if (matchSequences == null) {
                    matchSequences = new TreeSet<TokenPatternMatchSequence>();
                    tokenMatchSequenceMap.put(token, matchSequences);
                }
                matchSequences.add(matchSequence);
            }
        }

        TokenisedAtomicTokenSequence guess = new TokenisedAtomicTokenSequence(sentence, 0, sessionId);
        int guessedSize = guessedAtomicSequence.size();
        int i = 0;
        int mismatches = 0;
        for (Token token : realAtomicSequence) {
            // When the guessed sequence is exhausted there is nothing left to
            // compare with: treat it as a mismatch instead of indexing past
            // the end of the sequence.
            String guessedText = i < guessedSize ? guessedAtomicSequence.get(i).getToken().getText() : null;
            Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);

            if (!token.getText().equals(guessedText)) {
                // skipped stuff at start of sentence on guess, if it's been
                // through the parser
                TokeniserOutcome outcome = TokeniserOutcome.SEPARATE;
                guess.addTaggedToken(token, this.buildDecision(outcome, matchSequences), outcome);
                mismatches++;
                LOG.debug("Mismatch: '" + token.getText() + "', '" + guessedText + "'");
                if (mismatches > 6) {
                    LOG.info("Real sequence: " + realSequence.getSentence().getText());
                    LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                    throw new TalismaneException("Too many mismatches for sentence: " + realSequence.getSentence().getText());
                }
                // do not advance in the guessed sequence on a mismatch
                continue;
            }

            // The guess separated here if it recorded a split at the start
            // index of the current guessed atomic token; otherwise it joined.
            TokeniserOutcome outcome = TokeniserOutcome.JOIN;
            if (guessedSequence.getTokenSplits().contains(guessedAtomicSequence.get(i).getToken().getStartIndex())) {
                outcome = TokeniserOutcome.SEPARATE;
            }
            guess.addTaggedToken(token, this.buildDecision(outcome, matchSequences), outcome);
            i++;
        }

        List<TokenisedAtomicTokenSequence> guessedAtomicSequences = new ArrayList<TokenisedAtomicTokenSequence>();
        guessedAtomicSequences.add(guess);
        for (TokenEvaluationObserver observer : observers) {
            observer.onNextTokenSequence(realSequence, guessedAtomicSequences);
        }
    }

    for (TokenEvaluationObserver observer : observers) {
        observer.onEvaluationComplete();
    }
}

/**
 * Builds the decision for an outcome, naming this class as an authority
 * followed by "_Patterns" and each matching pattern name when the token has
 * pattern matches.
 */
private Decision buildDecision(TokeniserOutcome outcome, Set<TokenPatternMatchSequence> matchSequences) {
    Decision decision = new Decision(outcome.name());
    decision.addAuthority("_" + this.getClass().getSimpleName());
    if (matchSequences != null) {
        decision.addAuthority("_Patterns");
        for (TokenPatternMatchSequence matchSequence : matchSequences) {
            decision.addAuthority(matchSequence.getTokenPattern().getName());
        }
    }
    return decision;
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class BackwardSearchFeature method checkInternal.
/**
 * Searches backwards through the token sequence for the first token that
 * satisfies the criterion.
 * <p>
 * The search starts at the start index (by default the token just before the
 * current one) and walks down to the end index inclusive (by default 0). It
 * is abandoned as soon as a non-matching token satisfies the optional stop
 * criterion.
 *
 * @return the result generated for the matching token, or {@code null} if
 *         the inner token or either index feature yields no result, if the
 *         index range is invalid, or if no token matched
 */
@Override
public FeatureResult<TokenWrapper> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper wrapper = this.getToken(tokenWrapper, env);
    if (wrapper == null) {
        return null;
    }
    Token current = wrapper.getToken();

    // Resolve the search bounds, falling back to the defaults when the
    // corresponding feature is not configured.
    int from = current.getIndex() - 1;
    if (startIndexFeature != null) {
        FeatureResult<Integer> fromResult = startIndexFeature.check(wrapper, env);
        if (fromResult == null) {
            return null;
        }
        from = fromResult.getOutcome();
    }
    int to = 0;
    if (endIndexFeature != null) {
        FeatureResult<Integer> toResult = endIndexFeature.check(wrapper, env);
        if (toResult == null) {
            return null;
        }
        to = toResult.getOutcome();
    }

    if (from < 0) {
        return null;
    }
    int sequenceSize = current.getTokenSequence().size();
    if (to >= sequenceSize || to > from) {
        return null;
    }
    // A start index beyond the sequence is pulled back to the last token.
    from = Math.min(from, sequenceSize - 1);

    // Never walk below index 0, even if the end index is negative.
    int lowest = Math.max(to, 0);
    for (int position = from; position >= lowest; position--) {
        Token candidate = current.getTokenSequence().get(position);
        FeatureResult<Boolean> matches = this.criterion.check(candidate, env);
        if (matches != null && matches.getOutcome()) {
            return this.generateResult(candidate);
        }
        if (stopCriterion != null) {
            FeatureResult<Boolean> stop = this.stopCriterion.check(candidate, env);
            if (stop != null && stop.getOutcome()) {
                break;
            }
        }
    }
    return null;
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class CountIfFeature method checkInternal.
/**
 * Counts the tokens in the sequence that satisfy the criterion.
 * <p>
 * Tokens are examined from the start index (by default 0) to the end index
 * inclusive (by default the sequence size). A negative start index is
 * treated as 0, and the scan never goes past the last token of the sequence.
 *
 * @return the result generated for the count, or {@code null} if the inner
 *         token or a configured index feature yields no result, or if the
 *         end index is before the start index
 */
@Override
public FeatureResult<Integer> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper innerWrapper = this.getToken(tokenWrapper, env);
    if (innerWrapper == null)
        return null;
    Token token = innerWrapper.getToken();

    int startIndex = 0;
    int endIndex = token.getTokenSequence().size();

    // The start index feature is optional, like the end index feature:
    // without it the default start index of 0 applies.
    if (startIndexFeature != null) {
        FeatureResult<Integer> startIndexResult = startIndexFeature.check(innerWrapper, env);
        if (startIndexResult == null)
            return null;
        startIndex = startIndexResult.getOutcome();
    }
    if (endIndexFeature != null) {
        FeatureResult<Integer> endIndexResult = endIndexFeature.check(innerWrapper, env);
        if (endIndexResult == null)
            return null;
        endIndex = endIndexResult.getOutcome();
    }

    if (endIndex < startIndex)
        return null;
    if (startIndex < 0)
        startIndex = 0;

    int count = 0;
    for (int i = startIndex; i < token.getTokenSequence().size() && i <= endIndex; i++) {
        Token oneToken = token.getTokenSequence().get(i);
        FeatureResult<Boolean> criterionResult = this.criterion.check(oneToken, env);
        if (criterionResult != null && criterionResult.getOutcome()) {
            count++;
        }
    }
    return this.generateResult(count);
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class ForwardSearchFeature method checkInternal.
/**
 * Searches forwards through the token sequence for the first token that
 * satisfies the criterion.
 * <p>
 * The search starts at the start index (by default the token just after the
 * current one) and walks up to the end index inclusive (by default the
 * sequence size), never going past the last token. It is abandoned as soon
 * as a non-matching token satisfies the optional stop criterion.
 *
 * @return the result generated for the matching token, or {@code null} if
 *         the inner token or either index feature yields no result, if the
 *         index range is invalid, or if no token matched
 */
@Override
public FeatureResult<TokenWrapper> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper wrapper = this.getToken(tokenWrapper, env);
    if (wrapper == null) {
        return null;
    }
    Token current = wrapper.getToken();

    // Defaults, used when the corresponding index feature is not configured.
    int from = current.getIndex() + 1;
    int to = current.getTokenSequence().size();

    if (startIndexFeature != null) {
        FeatureResult<Integer> fromResult = startIndexFeature.check(wrapper, env);
        if (fromResult == null) {
            return null;
        }
        from = fromResult.getOutcome();
    }
    if (endIndexFeature != null) {
        FeatureResult<Integer> toResult = endIndexFeature.check(wrapper, env);
        if (toResult == null) {
            return null;
        }
        to = toResult.getOutcome();
    }

    int sequenceSize = current.getTokenSequence().size();
    if (from >= sequenceSize || to < 0 || to < from) {
        return null;
    }
    // A negative start index is pulled up to the first token.
    from = Math.max(from, 0);

    // Never walk past the last token, even if the end index is larger.
    int highest = Math.min(to, sequenceSize - 1);
    for (int position = from; position <= highest; position++) {
        Token candidate = current.getTokenSequence().get(position);
        FeatureResult<Boolean> matches = this.criterion.check(candidate, env);
        if (matches != null && matches.getOutcome()) {
            return this.generateResult(candidate);
        }
        if (stopCriterion != null) {
            FeatureResult<Boolean> stop = this.stopCriterion.check(candidate, env);
            if (stop != null && stop.getOutcome()) {
                break;
            }
        }
    }
    return null;
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class LastWordInCompoundFeature method checkInternal.
/**
 * Returns the last space-separated word of the token's trimmed analysis
 * text, when that text contains at least one space.
 *
 * @return the result generated for the text following the last space, or
 *         {@code null} if there is no inner token or the trimmed text
 *         contains no space
 */
@Override
public FeatureResult<String> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper wrapper = this.getToken(tokenWrapper, env);
    if (wrapper == null) {
        return null;
    }

    String text = wrapper.getToken().getAnalyisText().trim();
    int lastSpace = text.lastIndexOf(' ');
    if (lastSpace < 0) {
        // no space at all: not a compound
        return null;
    }
    return this.generateResult(text.substring(lastSpace + 1));
}
Aggregations