use of com.joliciel.talismane.tokeniser.TokeniserOutcome in project talismane by joliciel-informatique.
the class PatternEventStream method next.
/**
 * Builds the classification event for the pattern match at the current
 * position and moves on to the next position.
 *
 * <p>
 * Each configured pattern-match feature is checked against the current match
 * in a fresh runtime environment; non-null results become the features of the
 * event, and the name of the outcome at the same position becomes its
 * classification. When the last match of the current list has been consumed,
 * the list is reset to {@code null}.
 *
 * @return the next event, or {@code null} when {@code hasNext()} returns false
 * @throws TalismaneException
 *           propagated from the calls made by this method
 * @throws IOException
 *           propagated from the calls made by this method
 */
@Override
public ClassificationEvent next() throws TalismaneException, IOException {
    if (!this.hasNext()) {
        return null;
    }

    final TokenPatternMatch match = currentPatternMatches.get(currentIndex);
    final String classification = currentOutcomes.get(currentIndex).name();
    LOG.debug("next event, pattern match: " + match.toString() + ", outcome:" + classification);

    // Collect the result of every feature that applies to this match.
    final List<FeatureResult<?>> results = new ArrayList<FeatureResult<?>>();
    for (TokenPatternMatchFeature<?> feature : tokenPatternMatchFeatures) {
        FeatureResult<?> result = feature.check(match, new RuntimeEnvironment());
        if (result == null) {
            continue;
        }
        results.add(result);
    }

    if (LOG.isTraceEnabled()) {
        // Sort the textual form of the results so the trace output has a stable order.
        SortedSet<String> sorted = new TreeSet<String>();
        for (FeatureResult<?> result : results) {
            sorted.add(result.toString());
        }
        for (String line : sorted) {
            LOG.trace(line);
        }
    }

    final ClassificationEvent event = new ClassificationEvent(results, classification);

    // Advance; drop the current list once it has been fully consumed.
    currentIndex++;
    if (currentIndex == currentPatternMatches.size()) {
        currentPatternMatches = null;
    }
    return event;
}
use of com.joliciel.talismane.tokeniser.TokeniserOutcome in project talismane by joliciel-informatique.
the class PatternTokeniser method applyDecision.
/**
 * Applies a decision to a token and returns a new sequence, constructed from
 * the given history, with the tagged token appended.
 *
 * <p>
 * If the decision is statistical it is also registered on the new sequence.
 * When a match sequence is supplied, every other token it lists for checking
 * (i.e. every token not equal to {@code token}) is tagged with the same
 * decision and outcome and appended as well.
 *
 * @param token
 *          the token being decided
 * @param decision
 *          the decision to apply; its outcome must name a {@link TokeniserOutcome}
 * @param history
 *          the sequence the new sequence is constructed from
 * @param matchSequence
 *          optional match sequence whose remaining tokens receive the same
 *          decision; may be {@code null}
 * @param defaultDecision
 *          not used by this method
 * @return the new sequence
 */
TokenisedAtomicTokenSequence applyDecision(Token token, Decision decision, TokenisedAtomicTokenSequence history, TokenPatternMatchSequence matchSequence, Decision defaultDecision) {
    final TaggedToken<TokeniserOutcome> decided = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
    final TokenisedAtomicTokenSequence extended = new TokenisedAtomicTokenSequence(history);
    extended.add(decided);

    if (decision.isStatistical()) {
        extended.addDecision(decision);
    }

    if (matchSequence == null) {
        return extended;
    }

    // Propagate the same decision to the other tokens covered by the match.
    for (Token candidate : matchSequence.getTokensToCheck()) {
        if (!candidate.equals(token)) {
            TaggedToken<TokeniserOutcome> propagated = new TaggedToken<>(candidate, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
            extended.add(propagated);
        }
    }
    return extended;
}
use of com.joliciel.talismane.tokeniser.TokeniserOutcome in project talismane by joliciel-informatique.
the class TokenEvaluationCorpusWriter method onNextTokenSequence.
/**
 * Writes one evaluated sentence to the corpus writer.
 *
 * <p>
 * Output layout: the sentence text on its own line, then one line per real
 * token containing the token's original text, a tab, whether every guessed
 * decision falling inside that token matched the real decision, and a
 * tab-separated list of the authorities behind those decisions (authorities
 * whose name starts with "_" are omitted). A blank line closes the sentence.
 * Only the first guessed sequence is considered.
 *
 * @param realSequence
 *          the reference token sequence
 * @param guessedAtomicSequences
 *          the guessed sequences; only element 0 is read
 * @throws IOException
 *           if writing to the corpus writer fails
 */
@Override
public void onNextTokenSequence(TokenSequence realSequence, List<TokenisedAtomicTokenSequence> guessedAtomicSequences) throws IOException {
    final List<Integer> expectedSplits = realSequence.getTokenSplits();
    final TokenisedAtomicTokenSequence bestGuess = guessedAtomicSequences.get(0);

    final Map<Integer, TokeniserOutcome> expectedByStart = new HashMap<>();
    final Map<Integer, TokeniserOutcome> guessedByStart = new HashMap<>();
    final Map<Integer, List<String>> authoritiesByStart = new HashMap<>();
    final List<Integer> starts = new ArrayList<>();

    corpusWriter.write(realSequence.getSentence().getText() + "\n");

    // First pass: index the real and guessed decision of every atomic token by its start index.
    for (TaggedToken<TokeniserOutcome> tagged : bestGuess) {
        final TokeniserOutcome guessed = tagged.getTag();
        final int start = tagged.getToken().getStartIndex();

        final TokeniserOutcome expected;
        if (expectedSplits.contains(start)) {
            expected = TokeniserOutcome.SEPARATE;
        } else {
            expected = TokeniserOutcome.JOIN;
        }

        starts.add(start);
        expectedByStart.put(start, expected);
        guessedByStart.put(start, guessed);
        authoritiesByStart.put(start, tagged.getDecision().getAuthorities());
    }

    // Second pass: one output line per real token, covering the decisions
    // located between the previous token's end and this token's end.
    int previousEnd = 0;
    for (Token token : realSequence) {
        corpusWriter.write(token.getOriginalText());

        final Set<String> authorities = new TreeSet<>();
        boolean allCorrect = true;
        for (int start : starts) {
            if (start < previousEnd || start >= token.getEndIndex()) {
                continue;
            }
            if (allCorrect && expectedByStart.get(start) != guessedByStart.get(start)) {
                allCorrect = false;
            }
            authorities.addAll(authoritiesByStart.get(start));
        }

        corpusWriter.write("\t" + allCorrect);
        for (String authority : authorities) {
            if (authority.startsWith("_")) {
                continue;
            }
            corpusWriter.write("\t" + authority);
        }
        corpusWriter.write("\n");
        corpusWriter.flush();

        previousEnd = token.getEndIndex();
    }
    corpusWriter.write("\n");
}
use of com.joliciel.talismane.tokeniser.TokeniserOutcome in project talismane by joliciel-informatique.
the class TokenFScoreCalculator method onNextTokenSequence.
/**
 * Scores the first guessed sequence against the reference sequence.
 *
 * <p>
 * For every atomic token of the guess, the real decision is SEPARATE when the
 * token's start index is one of the reference token splits, and JOIN
 * otherwise. Each (real, guessed) pair increments the overall f-score
 * calculator and the calculator of every authority behind the guessed
 * decision. When the two decisions differ, an error record holding the
 * surrounding text is stored for each of those authorities.
 *
 * @param realSequence
 *          the reference token sequence
 * @param guessedAtomicSequences
 *          the guessed sequences; only element 0 is read
 */
@Override
public void onNextTokenSequence(TokenSequence realSequence, List<TokenisedAtomicTokenSequence> guessedAtomicSequences) {
    List<Integer> realSplits = realSequence.getTokenSplits();
    String sentence = realSequence.getSentence().getText().toString();
    TokenisedAtomicTokenSequence tokeniserAtomicTokenSequence = guessedAtomicSequences.get(0);
    TokenSequence guessedSequence = tokeniserAtomicTokenSequence.inferTokenSequence();
    List<Integer> guessedSplits = guessedSequence.getTokenSplits();

    if (LOG.isDebugEnabled()) {
        // Render both segmentations before logging either of them.
        String realRendering = describeSplits(sentence, realSplits);
        String guessedRendering = describeSplits(sentence, guessedSplits);
        LOG.debug("Real: " + realRendering);
        LOG.debug("Guessed: " + guessedRendering);
    }

    for (TaggedToken<TokeniserOutcome> guessTag : tokeniserAtomicTokenSequence) {
        TokeniserOutcome guessDecision = guessTag.getTag();
        int startIndex = guessTag.getToken().getStartIndex();
        TokeniserOutcome realDecision = realSplits.contains(startIndex) ? TokeniserOutcome.SEPARATE : TokeniserOutcome.JOIN;

        if (!realDecision.equals(guessDecision)) {
            // Context window: up to NUM_CHARS characters on each side of the decision point.
            int contextStart = Math.max(startIndex - NUM_CHARS, 0);
            // substring's end index is exclusive, so the window is clamped to
            // sentence.length() (not length - 1); otherwise the last character
            // of the sentence would be dropped from the context.
            int contextEnd = Math.min(startIndex + NUM_CHARS, sentence.length());

            String startString = sentence.substring(contextStart, startIndex);
            startString = StringUtils.padLeft(startString, NUM_CHARS);

            String symbol = "+";
            if (realDecision == TokeniserOutcome.SEPARATE) {
                symbol = "-";
            }

            TokeniserErrorRecord errorRecord = new TokeniserErrorRecord();
            errorRecord.realDecision = realDecision;
            errorRecord.guessDecision = guessDecision;
            errorRecord.context = startString + "[" + symbol + "]" + sentence.substring(startIndex, contextEnd);
            LOG.debug("guess " + guessDecision + ", real " + realDecision + ", context: " + errorRecord.context);

            // The same record is filed under every authority behind the wrong decision.
            for (String authority : guessTag.getDecision().getAuthorities()) {
                List<TokeniserErrorRecord> errors = errorMap.get(authority);
                if (errors == null) {
                    errors = new ArrayList<TokeniserErrorRecord>();
                    errorMap.put(authority, errors);
                }
                errors.add(errorRecord);
            }
        }

        fScoreCalculator.increment(realDecision, guessDecision);
        for (String authority : guessTag.getDecision().getAuthorities()) {
            FScoreCalculator<TokeniserOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(authority);
            if (taggerFScoreCalculator == null) {
                taggerFScoreCalculator = new FScoreCalculator<TokeniserOutcome>();
                taggerFScoreCalculators.put(authority, taggerFScoreCalculator);
            }
            taggerFScoreCalculator.increment(realDecision, guessDecision);
        }
    }
}

/**
 * Renders a segmentation for debug output: each segment ending at one of the
 * given split positions is emitted preceded by a '|' character.
 */
private static String describeSplits(String sentence, List<Integer> splits) {
    StringBuilder sb = new StringBuilder();
    int pos = 0;
    for (int split : splits) {
        sb.append('|');
        sb.append(sentence.substring(pos, split));
        pos = split;
    }
    return sb.toString();
}
use of com.joliciel.talismane.tokeniser.TokeniserOutcome in project talismane by joliciel-informatique.
the class TokenComparator method compare.
/**
 * Evaluate the evaluation corpus against the reference corpus.
 *
 * <p>
 * For every reference sentence, the matching evaluation sentence is read and
 * both are split into default (atomic) tokens. The atomic tokens are walked in
 * parallel: when the texts agree, the guessed outcome is SEPARATE if the
 * evaluation sentence has a token split at that position and JOIN otherwise;
 * when they disagree, the reference token is recorded as SEPARATE, counted as
 * a mismatch, and the evaluation position is not advanced. The resulting
 * guess is passed to every observer, and observers are notified once all
 * sentences have been processed.
 *
 * @throws TalismaneException
 *           if the evaluation corpus has fewer sentences than the reference
 *           corpus, if an evaluation sentence runs out of atomic tokens
 *           before the reference sentence does, or if more than 6 mismatches
 *           occur within one sentence
 * @throws IOException
 *           propagated from the calls made by this method
 */
public void compare() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        TokenSequence realSequence = referenceCorpusReader.nextTokenSequence();
        if (!evaluationCorpusReader.hasNextSentence()) {
            throw new TalismaneException("Wrong number of sentences in eval corpus: " + realSequence.getSentence().getText());
        }
        TokenSequence guessedSequence = evaluationCorpusReader.nextTokenSequence();

        Sentence sentence = realSequence.getSentence();

        // Initially, separate the sentences into tokens using the separators provided
        TokenSequence realAtomicSequence = new TokenSequence(sentence, sessionId);
        realAtomicSequence.findDefaultTokens();

        TokenSequence guessedAtomicSequence = new TokenSequence(guessedSequence.getSentence(), sessionId);
        guessedAtomicSequence.findDefaultTokens();

        // Index the pattern matches by the first non-whitespace token they
        // check (or the last token checked if all are whitespace).
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
        for (TokenPattern parsedPattern : tokeniserPatternManager.getParsedTestPatterns()) {
            List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(realAtomicSequence);
            for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
                Token token = null;
                for (Token aToken : matchSequence.getTokensToCheck()) {
                    token = aToken;
                    if (!aToken.isWhiteSpace()) {
                        break;
                    }
                }

                Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
                if (matchSequences == null) {
                    matchSequences = new TreeSet<TokenPatternMatchSequence>();
                    tokenMatchSequenceMap.put(token, matchSequences);
                }
                matchSequences.add(matchSequence);
            }
        }

        TokenisedAtomicTokenSequence guess = new TokenisedAtomicTokenSequence(realSequence.getSentence(), 0, sessionId);

        int i = 0;
        int mismatches = 0;
        for (Token token : realAtomicSequence) {
            // Fail with a meaningful error instead of an index error when the
            // evaluation sentence has fewer atomic tokens than the reference.
            if (i >= guessedAtomicSequence.size()) {
                throw new TalismaneException("Eval corpus sentence has too few tokens for sentence: " + realSequence.getSentence().getText());
            }

            if (!token.getText().equals(guessedAtomicSequence.get(i).getToken().getText())) {
                // skipped stuff at start of sentence on guess, if it's been
                // through the parser
                TokeniserOutcome outcome = TokeniserOutcome.SEPARATE;
                Decision decision = this.buildDecision(outcome, token, tokenMatchSequenceMap);
                guess.addTaggedToken(token, decision, outcome);

                mismatches++;
                LOG.debug("Mismatch: '" + token.getText() + "', '" + guessedAtomicSequence.get(i).getToken().getText() + "'");
                if (mismatches > 6) {
                    LOG.info("Real sequence: " + realSequence.getSentence().getText());
                    LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                    throw new TalismaneException("Too many mismatches for sentence: " + realSequence.getSentence().getText());
                }
                // The evaluation position is deliberately not advanced on a mismatch.
                continue;
            }

            TokeniserOutcome outcome = TokeniserOutcome.JOIN;
            if (guessedSequence.getTokenSplits().contains(guessedAtomicSequence.get(i).getToken().getStartIndex())) {
                outcome = TokeniserOutcome.SEPARATE;
            }
            Decision decision = this.buildDecision(outcome, token, tokenMatchSequenceMap);
            guess.addTaggedToken(token, decision, outcome);
            i++;
        }

        List<TokenisedAtomicTokenSequence> guessedAtomicSequences = new ArrayList<TokenisedAtomicTokenSequence>();
        guessedAtomicSequences.add(guess);

        for (TokenEvaluationObserver observer : observers) {
            observer.onNextTokenSequence(realSequence, guessedAtomicSequences);
        }
    }

    for (TokenEvaluationObserver observer : observers) {
        observer.onEvaluationComplete();
    }
}

/**
 * Builds the decision for a token: the outcome's name, this class as an
 * authority, plus "_Patterns" and the name of each pattern whose match is
 * indexed under the token, when there are any.
 */
private Decision buildDecision(TokeniserOutcome outcome, Token token, Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap) {
    Decision decision = new Decision(outcome.name());
    decision.addAuthority("_" + this.getClass().getSimpleName());

    Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
    if (matchSequences != null) {
        decision.addAuthority("_Patterns");
        for (TokenPatternMatchSequence matchSequence : matchSequences) {
            decision.addAuthority(matchSequence.getTokenPattern().getName());
        }
    }
    return decision;
}
Aggregations