use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class PosTagComparator method evaluate.
/**
 * Walks the reference corpus sentence by sentence, pairing each reference
 * pos-tag sequence with the next sequence read from the evaluation corpus,
 * and hands every pair to all registered observers. Once the reference
 * corpus is exhausted, every observer is told the evaluation is complete.
 *
 * @throws TalismaneException
 *           if raised by one of the calls made while evaluating
 * @throws IOException
 *           if raised by one of the calls made while evaluating
 */
public void evaluate() throws TalismaneException, IOException {
  while (referenceCorpusReader.hasNextSentence()) {
    final PosTagSequence expected = referenceCorpusReader.nextPosTagSequence();

    // Observers receive a list of guesses; here it always holds exactly one.
    final List<PosTagSequence> candidates = new ArrayList<PosTagSequence>();
    candidates.add(evaluationCorpusReader.nextPosTagSequence());

    for (PosTagEvaluationObserver listener : this.observers) {
      listener.onNextPosTagSequence(expected, candidates);
    }
  }

  for (PosTagEvaluationObserver listener : this.observers) {
    listener.onEvaluationComplete();
  }
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class PosTagFScoreCalculator method onNextPosTagSequence.
/**
 * Compares a reference sequence with the first guessed sequence and records
 * one (real tag, guessed result) pair per scored reference token.
 * <p>
 * Only {@code guessedSequences.get(0)} is examined. The two sequences are
 * walked with separate cursors ({@code i} for the reference, {@code j} for
 * the guess) because their tokenisation may differ. When the character
 * boundaries of the two current tokens do not coincide, the result is
 * recorded as {@code "TOKEN_ERROR"} rather than as the guessed tag code.
 * <p>
 * Each result is added to the overall f-score calculator and to either the
 * known-in-lexicon or unknown-in-lexicon calculator, depending on whether
 * the guessed token has any possible pos-tags.
 *
 * @param realSequence
 *          the reference sequence
 * @param guessedSequences
 *          the guessed sequences; only the first element is used
 * @throws TalismaneException
 *           declared by the overridden method signature
 */
@Override
public void onNextPosTagSequence(PosTagSequence realSequence, List<PosTagSequence> guessedSequences) throws TalismaneException {
PosTagSequence guessedSequence = guessedSequences.get(0);
// j is the cursor into the guessed sequence; it advances independently of i
int j = 0;
for (int i = 0; i < realSequence.size(); i++) {
TaggedToken<PosTag> realToken = realSequence.get(i);
TaggedToken<PosTag> testToken = guessedSequence.get(j);
// special handling for null tags & empty tokens
if (realToken.getTag().equals(PosTag.NULL_POS_TAG)) {
// A reference token carrying the null tag is never scored.
// If the guessed token is empty, step past it as well.
// We assume the previous non-empty token took care of any
// required comparisons.
if (testToken.getToken().isEmpty()) {
j++;
}
continue;
} else if (testToken.getToken().isEmpty() && !realToken.getToken().isEmpty()) {
// If the test token is empty, but the real token isn't, we skip
// this as well
// Again, we assume the previous non-empty token took care of
// any required comparisons.
// NOTE(review): j is not range-checked before the get(j) below -
// confirm an empty guessed token can never be the last one.
j++;
testToken = guessedSequence.get(j);
}
boolean tokenError = false;
if (realToken.getToken().getStartIndex() == testToken.getToken().getStartIndex() && realToken.getToken().getEndIndex() == testToken.getToken().getEndIndex()) {
// no token error
// advance the guess cursor, but never past the last guessed token
j++;
if (j == guessedSequence.size()) {
j--;
}
} else {
// Token boundaries differ: move the guess cursor forward until the
// guessed token ends after the real token, or until the last guessed
// token is reached.
tokenError = true;
while (realToken.getToken().getEndIndex() >= testToken.getToken().getEndIndex()) {
j++;
if (j == guessedSequence.size()) {
j--;
break;
}
testToken = guessedSequence.get(j);
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("Token " + testToken.getToken().getAnalyisText() + ", guessed: " + testToken.getTag().getCode() + " (" + testToken.getDecision().getProbability() + "), actual: " + realToken.getTag().getCode());
}
// a boundary mismatch overrides the guessed tag code in the recorded result
String result = testToken.getTag().getCode();
if (tokenError)
result = "TOKEN_ERROR";
fScoreCalculator.increment(realToken.getTag().getCode(), result);
// the guessed token counts as unknown when its possible pos-tags are null or empty
if (testToken.getToken().getPossiblePosTags() == null || testToken.getToken().getPossiblePosTags().size() == 0)
fscoreUnknownInLexicon.increment(realToken.getTag().getCode(), result);
else
fscoreKnownInLexicon.increment(realToken.getTag().getCode(), result);
}
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class PosTagLexicalCoverageTester method onNextPosTagSequence.
/**
 * Gathers lexical-coverage statistics for one sentence.
 * <p>
 * For every token of the reference sequence:
 * <ul>
 * <li>if its set of possible pos-tags is non-null and empty, the token is
 * counted as unknown: the (real tag, guessed tag) pair is recorded, the
 * unknown-word counter is incremented and the occurrence count of
 * {@code tag|text} is updated;</li>
 * <li>otherwise the token is counted as known and its text is stored;</li>
 * <li>if the real tag is a closed-class tag which is absent from the token's
 * possible pos-tags, {@code tag|text} is recorded as a closed-category
 * mismatch.</li>
 * </ul>
 * The closed-category check is skipped when the possible pos-tags are
 * {@code null}, since there is then nothing to compare against; previously
 * this case raised a {@link NullPointerException}.
 * <p>
 * The guessed token is read at the same index as the reference token, so
 * both sequences are expected to share the same tokenisation.
 *
 * @param realSequence
 *          the reference sequence
 * @param guessedSequences
 *          the guessed sequences; only the first element is used
 * @throws TalismaneException
 *           declared by the overridden method signature
 */
@Override
public void onNextPosTagSequence(PosTagSequence realSequence, List<PosTagSequence> guessedSequences) throws TalismaneException {
  PosTagSequence guessedSequence = guessedSequences.get(0);
  for (int i = 0; i < realSequence.size(); i++) {
    TaggedToken<PosTag> realToken = realSequence.get(i);
    TaggedToken<PosTag> testToken = guessedSequence.get(i);
    PosTag realTag = realToken.getTag();

    boolean hasPossiblePosTags = realToken.getToken().getPossiblePosTags() != null;
    boolean tokenUnknown = hasPossiblePosTags && realToken.getToken().getPossiblePosTags().size() == 0;

    if (tokenUnknown) {
      fscoreUnknownInLexicon.increment(realTag.getCode(), testToken.getTag().getCode());
      unknownWordCount++;
      // build the map key once and reuse it for both the lookup and the update
      String unknownWordKey = realTag + "|" + realToken.getToken().getAnalyisText();
      Integer countObj = unknownWords.get(unknownWordKey);
      int count = countObj == null ? 0 : countObj.intValue();
      unknownWords.put(unknownWordKey, count + 1);
    } else {
      knownWordCount++;
      knownWords.add(realToken.getToken().getAnalyisText());
    }

    // Guard against null possible pos-tags: the unknown-token test above
    // already allows for null, so this check must tolerate it as well.
    if (realTag.getOpenClassIndicator().isClosed() && hasPossiblePosTags && !realToken.getToken().getPossiblePosTags().contains(realTag)) {
      closedCategoryMismatches.add(realTag + "|" + realToken.getToken().getAnalyisText());
    }
  }
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class PosTaggerEvaluator method evaluate.
/**
 * Evaluate a given pos tagger.
 * <p>
 * For each sentence of the corpus, the reference pos-tag sequence is read,
 * the sentence is optionally re-tokenised (when a tokeniser is configured,
 * its first token sequence is the one tagged by a deterministic tagger),
 * the pos-tagger is applied, and the reference together with the list of
 * guessed sequences is passed to every observer. When the corpus is
 * exhausted, every observer is notified that evaluation is complete.
 * <p>
 * Observers always receive a non-null list of guesses: a non-deterministic
 * tagger supplies its own list, while the single result of a deterministic
 * tagger is wrapped in a one-element list.
 *
 * @throws TalismaneException
 *           if raised by one of the calls made while evaluating
 * @throws IOException
 *           if raised by one of the calls made while evaluating
 */
public void evaluate() throws TalismaneException, IOException {
  while (corpusReader.hasNextSentence()) {
    PosTagSequence realPosTagSequence = corpusReader.nextPosTagSequence();

    List<TokenSequence> tokenSequences = null;
    List<PosTagSequence> guessedSequences = null;

    TokenSequence tokenSequence = realPosTagSequence.getTokenSequence();
    PosTagSequence guessedSequence = null;

    if (this.tokeniser != null) {
      Sentence sentence = tokenSequence.getSentence();
      tokenSequences = tokeniser.tokenise(sentence);
      tokenSequence = tokenSequences.get(0);
    } else {
      tokenSequences = new ArrayList<TokenSequence>();
      tokenSequences.add(tokenSequence);
    }

    if (posTagger instanceof NonDeterministicPosTagger) {
      NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
      guessedSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
      guessedSequence = guessedSequences.get(0);
    } else {
      guessedSequence = posTagger.tagSentence(tokenSequence);
      // Observers read guessedSequences.get(0), so the single guess must be
      // wrapped in a list rather than leaving the list null.
      guessedSequences = new ArrayList<PosTagSequence>();
      guessedSequences.add(guessedSequence);
    }

    if (LOG.isDebugEnabled()) {
      // Builds "text[tag|lemma/morphology|...] " for each guessed token.
      StringBuilder stringBuilder = new StringBuilder();
      for (PosTaggedToken posTaggedToken : guessedSequence) {
        Set<String> lemmas = new TreeSet<String>();
        stringBuilder.append(posTaggedToken.getToken().getOriginalText());
        stringBuilder.append("[" + posTaggedToken.getTag());

        List<LexicalEntry> entries = posTaggedToken.getLexicalEntries();
        // With several entries, the first unseen one whose lemma equals the
        // token text is left out of the output.
        boolean dropCurrentWord = entries.size() > 1;
        for (LexicalEntry entry : entries) {
          if (!lemmas.contains(entry.getLemma())) {
            if (dropCurrentWord && posTaggedToken.getToken().getText().equals(entry.getLemma())) {
              dropCurrentWord = false;
              continue;
            }
            stringBuilder.append("|" + entry.getLemma());
            stringBuilder.append("/" + entry.getMorphology());
            lemmas.add(entry.getLemma());
          }
        }
        stringBuilder.append("] ");
      }
      LOG.debug(stringBuilder.toString());
    }

    for (PosTagEvaluationObserver observer : this.observers) {
      observer.onNextPosTagSequence(realPosTagSequence, guessedSequences);
    }
  }

  for (PosTagEvaluationObserver observer : this.observers) {
    observer.onEvaluationComplete();
  }
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class TokenSearchFeature method check.
/**
 * Scans the pos-tag sequence of the current parse configuration between a
 * start index and an end index (both inclusive, both clamped to the
 * sequence bounds) looking for a token that satisfies the criterion.
 * <p>
 * The scan runs forwards when the end index is greater than the start
 * index, and backwards otherwise. Depending on the find-first setting
 * (true unless a find-first feature says otherwise), either the first or
 * the last matching token encountered is returned. Skip criteria come in
 * (start, end) pairs: once a start criterion matches, tokens are neither
 * tested against the criterion nor against the stop criterion until the
 * paired end criterion matches. Outside a skipped span, a matching stop
 * criterion ends the scan.
 *
 * @return the result generated for the matching token, or {@code null} if
 *         no token matched or if any of the optional start-index, end-index
 *         or find-first features produced no result
 */
@Override
public FeatureResult<PosTaggedTokenWrapper> check(ParseConfigurationWrapper context, RuntimeEnvironment env) throws TalismaneException {
  PosTagSequence posTagSequence = context.getParseConfiguration().getPosTagSequence();
  final int lastIndex = posTagSequence.size() - 1;

  // Resolve the scan bounds; a bound feature without a result aborts the check.
  int startIndex = 0;
  if (startIndexFeature != null) {
    FeatureResult<Integer> startResult = startIndexFeature.check(context, env);
    if (startResult == null) {
      return null;
    }
    startIndex = startResult.getOutcome();
  }

  int endIndex = lastIndex;
  if (endIndexFeature != null) {
    FeatureResult<Integer> endResult = endIndexFeature.check(context, env);
    if (endResult == null) {
      return null;
    }
    endIndex = endResult.getOutcome();
  }

  // Clamp both bounds: first up to zero, then down to the last index.
  startIndex = Math.min(Math.max(startIndex, 0), lastIndex);
  endIndex = Math.min(Math.max(endIndex, 0), lastIndex);

  final boolean forward = endIndex > startIndex;
  final int step = forward ? 1 : -1;

  boolean findFirst = true;
  if (findFirstFeature != null) {
    FeatureResult<Boolean> findFirstResult = this.findFirstFeature.check(context, env);
    if (findFirstResult == null) {
      return null;
    }
    findFirst = findFirstResult.getOutcome();
  }

  ParseConfigurationAddress address = new ParseConfigurationAddress(env);
  address.setParseConfiguration(context.getParseConfiguration());

  final boolean hasSkipCriteria = skipCriteria != null && skipCriteria.length > 0;

  PosTaggedToken matchingToken = null;
  // Index of the skip-start criterion currently in force, or -1 when not skipping.
  int activeSkip = -1;

  for (int i = startIndex; forward ? (i < posTagSequence.size() && i <= endIndex) : (i >= 0 && i >= endIndex); i += step) {
    PosTaggedToken candidate = posTagSequence.get(i);
    address.setPosTaggedToken(candidate);

    final boolean skippingOnEntry = activeSkip >= 0;

    if (!skippingOnEntry) {
      FeatureResult<Boolean> criterionResult = this.criterion.check(address, env);
      if (criterionResult != null && criterionResult.getOutcome()) {
        matchingToken = candidate;
        if (findFirst) {
          break;
        }
      }
    }

    boolean skipEndsHere = false;
    if (hasSkipCriteria) {
      if (skippingOnEntry) {
        // The end criterion sits immediately after its start criterion.
        FeatureResult<Boolean> endSkipResult = skipCriteria[activeSkip + 1].check(address, env);
        skipEndsHere = endSkipResult != null && endSkipResult.getOutcome();
      } else {
        // Start criteria occupy the even positions.
        for (int k = 0; k < skipCriteria.length; k += 2) {
          FeatureResult<Boolean> startSkipResult = skipCriteria[k].check(address, env);
          if (startSkipResult != null && startSkipResult.getOutcome()) {
            activeSkip = k;
            break;
          }
        }
      }
    }

    // The stop criterion is only consulted while no skip is in force.
    if (activeSkip < 0 && stopCriterion != null) {
      FeatureResult<Boolean> stopResult = this.stopCriterion.check(address, env);
      if (stopResult != null && stopResult.getOutcome()) {
        break;
      }
    }

    if (skipEndsHere) {
      activeSkip = -1;
    }
  }

  FeatureResult<PosTaggedTokenWrapper> featureResult = null;
  if (matchingToken != null) {
    featureResult = this.generateResult(matchingToken);
  }
  return featureResult;
}
Aggregations