use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class ParserEvaluator method evaluate.
/**
 * Walks through every sentence offered by the corpus reader, runs the
 * configured analysis chain on it and reports the outcome to the registered
 * observers.
 * <p>
 * For each sentence: token sequences come from the tokeniser when one is
 * configured, otherwise from a root-less clone of the reference pos-tag
 * sequence; pos-tag sequences come from the pos-tagger when one is
 * configured, otherwise the reference pos-tag sequence is used as is; the
 * parser is then applied, and observers are notified before and after
 * parsing. Once the corpus is exhausted, observers are told that the
 * evaluation is complete.
 *
 * @throws TalismaneException
 *             if an attempt is made to evaluate with a tokeniser but no
 *             pos-tagger
 * @throws IOException
 *             declared because calls made by this method may raise it
 */
public void evaluate() throws TalismaneException, IOException {
    while (corpusReader.hasNextSentence()) {
        final ParseConfiguration goldConfiguration = corpusReader.nextConfiguration();

        // --- step 1: candidate token sequences ---
        final List<TokenSequence> candidateTokenSequences;
        if (tokeniser == null) {
            // no tokeniser: reuse the reference tokens, minus the artificial root
            candidateTokenSequences = new ArrayList<>();
            PosTagSequence rootlessCopy = goldConfiguration.getPosTagSequence().clonePosTagSequence();
            rootlessCopy.removeRoot();
            candidateTokenSequences.add(rootlessCopy.getTokenSequence());
        } else {
            if (posTagger == null) {
                throw new TalismaneException("Cannot evaluate with tokeniser but no pos-tagger");
            }
            final Sentence goldSentence = goldConfiguration.getPosTagSequence().getTokenSequence().getSentence();
            // annotate the sentence for pre token filters
            for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
                sentenceAnnotator.annotate(goldSentence);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("TokenFilter: " + sentenceAnnotator);
                    LOG.trace("annotations: " + goldSentence.getAnnotations());
                }
            }
            candidateTokenSequences = tokeniser.tokenise(goldSentence);
        }

        // --- step 2: candidate pos-tag sequences ---
        final List<PosTagSequence> candidatePosTagSequences;
        if (posTagger == null) {
            // no pos-tagger: parse directly from the reference pos-tags
            PosTagSequence goldPosTags = goldConfiguration.getPosTagSequence();
            candidatePosTagSequences = new ArrayList<>();
            candidatePosTagSequences.add(goldPosTags);
        } else if (posTagger instanceof NonDeterministicPosTagger) {
            candidatePosTagSequences = ((NonDeterministicPosTagger) posTagger).tagSentence(candidateTokenSequences);
        } else {
            // a deterministic tagger only looks at the first token sequence
            candidatePosTagSequences = new ArrayList<>();
            PosTagSequence tagged = posTagger.tagSentence(candidateTokenSequences.get(0));
            candidatePosTagSequences.add(tagged);
        }

        for (ParseEvaluationObserver listener : this.observers) {
            listener.onParseStart(goldConfiguration, candidatePosTagSequences);
        }

        // --- step 3: parse ---
        final List<ParseConfiguration> parses;
        if (parser instanceof NonDeterministicParser) {
            parses = ((NonDeterministicParser) parser).parseSentence(candidatePosTagSequences);
        } else {
            // a deterministic parser only looks at the first pos-tag sequence
            ParseConfiguration singleParse = parser.parseSentence(candidatePosTagSequences.get(0));
            parses = new ArrayList<>();
            parses.add(singleParse);
        }

        for (ParseEvaluationObserver listener : this.observers) {
            listener.onParseEnd(goldConfiguration, parses);
        }
    }

    for (ParseEvaluationObserver listener : this.observers) {
        listener.onEvaluationComplete();
    }
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class ParserFScoreCalculator method onParseEnd.
/**
 * Scores the first guessed configuration against the reference
 * configuration, one governed token at a time.
 * <p>
 * For every token of the reference pos-tag sequence whose tag is not
 * {@code PosTag.ROOT_POS_TAG}, the guessed token with the same start index
 * and the same emptiness is looked up, the governing arcs of both tokens are
 * fetched, and a (real label, guessed label) pair is recorded in the f-score
 * calculator. When both arcs exist but point to heads with different start
 * indexes, the guess is recorded as {@code "noHead"}, {@code "wrongHead"} or
 * {@code "wrongHeadWrongLabel"}.
 *
 * @param realConfiguration
 *            the reference configuration
 * @param guessedConfigurations
 *            the guessed configurations; only the first one is scored
 * @throws TalismaneException
 *             if the number of reference tokens with no counterpart in the
 *             guess exceeds half the size of the reference pos-tag sequence
 */
@Override
public void onParseEnd(ParseConfiguration realConfiguration, List<ParseConfiguration> guessedConfigurations) throws TalismaneException {
    PosTagSequence posTagSequence = realConfiguration.getPosTagSequence();
    ParseConfiguration bestGuess = guessedConfigurations.get(0);
    int mismatchedTokens = 0;

    for (PosTaggedToken posTaggedToken : posTagSequence) {
        // the artificial root governs nothing, so it is never scored
        if (posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG)) {
            continue;
        }

        DependencyArc realArc = realConfiguration.getGoverningDependency(posTaggedToken, projective);
        DependencyArc guessedArc = null;
        boolean foundToken = false;
        for (PosTaggedToken guessedToken : bestGuess.getPosTagSequence()) {
            if (guessedToken.getToken().getStartIndex() != posTaggedToken.getToken().getStartIndex()) {
                continue;
            }
            // an empty token and a non-empty token may share a start index:
            // they only match when both are empty or both are non-empty
            if (guessedToken.getToken().isEmpty() != posTaggedToken.getToken().isEmpty()) {
                continue;
            }
            foundToken = true;
            guessedArc = bestGuess.getGoverningDependency(guessedToken, projective);
            break;
        }

        if (!foundToken) {
            LOG.info("Mismatched token :" + posTaggedToken.getToken().getOriginalText() + ", index " + posTaggedToken.getToken().getIndex());
            mismatchedTokens += 1;
        }

        String realLabel = realArc == null ? "noHead" : (labeledEvaluation ? realArc.getLabel() : "head");
        String guessedLabel = guessedArc == null ? "noHead" : (labeledEvaluation ? guessedArc.getLabel() : "head");
        if (realLabel == null || realLabel.length() == 0) {
            realLabel = "noLabel";
        }
        if (guessedLabel == null || guessedLabel.length() == 0) {
            guessedLabel = "noLabel";
        }

        // an unlabeled arc to the root should be considered a "no head"
        // rather than a "no label"
        if (realArc != null && realArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && realLabel.equals("noLabel")) {
            realLabel = "noHead";
        }
        if (guessedArc != null && guessedArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && guessedLabel.equals("noLabel")) {
            guessedLabel = "noHead";
        }

        if (realArc == null || guessedArc == null) {
            // at most one of the arcs exists
            fscoreCalculator.increment(realLabel, guessedLabel);
        } else {
            boolean sameHead = realArc.getHead().getToken().getStartIndex() == guessedArc.getHead().getToken().getStartIndex();
            if (sameHead) {
                fscoreCalculator.increment(realLabel, guessedLabel);
            } else if (guessedLabel.equals("noHead")) {
                fscoreCalculator.increment(realLabel, "noHead");
            } else if (java.util.Objects.equals(realArc.getLabel(), guessedArc.getLabel())) {
                // null-safe comparison: arc labels may be null (see the
                // null checks on realLabel/guessedLabel above), and calling
                // equals directly on a null real label would throw
                fscoreCalculator.increment(realLabel, "wrongHead");
            } else {
                fscoreCalculator.increment(realLabel, "wrongHeadWrongLabel");
            }
        }
    }

    // more than half of the tokens mismatched?
    if ((double) mismatchedTokens / (double) posTagSequence.size() > 0.5) {
        throw new TalismaneException("Too many mismatched tokens in sentence: " + posTagSequence.getTokenSequence().getSentence().getText());
    }
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class PosTaggers method getPosTagger.
/**
 * Returns a clone of the pos-tagger associated with the given session,
 * building and caching the prototype instance on first request.
 * <p>
 * The implementation class name is read from the configuration key
 * {@code talismane.core.<sessionId>.pos-tagger.pos-tagger}. The class must
 * implement {@link PosTagger} and expose a public constructor taking a
 * single {@code String} (the session id).
 *
 * @param sessionId
 *            the session whose pos-tagger is requested
 * @return a clone of the cached pos-tagger for this session
 * @throws ReflectiveOperationException
 *             if the configured class cannot be loaded or instantiated
 */
public static PosTagger getPosTagger(String sessionId) throws ReflectiveOperationException {
    PosTagger posTagger = posTaggerMap.get(sessionId);
    if (posTagger == null) {
        Config config = ConfigFactory.load();
        Config posTaggerConfig = config.getConfig("talismane.core." + sessionId + ".pos-tagger");
        String className = posTaggerConfig.getString("pos-tagger");

        Class<?> loadedClass = Class.forName(className);
        if (!PosTagger.class.isAssignableFrom(loadedClass)) {
            throw new TalismaneException("Class " + className + " does not implement interface " + PosTagger.class.getSimpleName());
        }
        // cannot fail: assignability was verified just above
        Class<? extends PosTagger> clazz = loadedClass.asSubclass(PosTagger.class);

        Constructor<? extends PosTagger> cons;
        try {
            cons = clazz.getConstructor(String.class);
        } catch (NoSuchMethodException e) {
            // reported with the same exception type and message callers
            // already receive when the expected constructor is missing
            throw new TalismaneException("No constructor found with correct signature for: " + className);
        }

        posTagger = cons.newInstance(sessionId);
        posTaggerMap.put(sessionId, posTagger);
    }
    return posTagger.clonePosTagger();
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class ForwardStatisticalPosTagger method tagSentence.
/**
 * Tags a sentence using a beam search over partial pos-tag sequences.
 * <p>
 * Partial sequences ("histories") are stored in heaps held in a map sorted
 * by key. The key of a heap is derived from the end index of the last token
 * added to the sequences it contains. The heap with the smallest key is
 * expanded first, and at most {@code beamWidth} histories are expanded per
 * heap. The search stops when the heap whose key equals the length of the
 * sentence text is reached.
 * <p>
 * For each token, candidate decisions come from, in order of priority: a
 * pos-tag attribute already present on the token, the first positive rule
 * that applies, or the decision maker applied to the feature results
 * (filtered by negative rules and by {@code MIN_PROB_TO_STORE}).
 *
 * @param input
 *            the candidate token sequences; when
 *            {@code propagateTokeniserBeam} is false only the first one is
 *            used
 * @return at most {@code getBeamWidth()} cloned pos-tag sequences, in the
 *         order in which the final heap yields them
 * @throws TalismaneException
 *             declared because calls made by this method may raise it
 * @throws IOException
 *             declared because calls made by this method may raise it
 */
@Override
public List<PosTagSequence> tagSentence(List<TokenSequence> input) throws TalismaneException, IOException {
// either keep the whole tokeniser beam, or only its first token sequence
List<TokenSequence> tokenSequences = null;
if (this.propagateTokeniserBeam) {
tokenSequences = input;
} else {
tokenSequences = new ArrayList<>(1);
tokenSequences.add(input.get(0));
}
// length of the sentence text: used as the key of the final heap
int sentenceLength = tokenSequences.get(0).getSentence().getText().length();
// heaps sorted by key, so that pollFirstEntry() always yields the smallest key
TreeMap<Double, PriorityQueue<PosTagSequence>> heaps = new TreeMap<Double, PriorityQueue<PosTagSequence>>();
PriorityQueue<PosTagSequence> heap0 = new PriorityQueue<PosTagSequence>();
for (TokenSequence tokenSequence : tokenSequences) {
// add an empty PosTagSequence for each token sequence
PosTagSequence emptySequence = new PosTagSequence(tokenSequence);
emptySequence.setScoringStrategy(decisionMaker.getDefaultScoringStrategy());
heap0.add(emptySequence);
}
heaps.put(0.0, heap0);
// NOTE(review): finalHeap is only assigned when a heap keyed by
// sentenceLength is polled; if the loop below drains all heaps without
// reaching that key, finalHeap stays null and finalHeap.isEmpty() further
// down would throw a NullPointerException - confirm this cannot happen.
PriorityQueue<PosTagSequence> finalHeap = null;
while (heaps.size() > 0) {
// removes and returns the heap with the smallest key
Entry<Double, PriorityQueue<PosTagSequence>> heapEntry = heaps.pollFirstEntry();
if (LOG.isTraceEnabled()) {
LOG.trace("heap key: " + heapEntry.getKey() + ", sentence length: " + sentenceLength);
}
// numeric comparison (the Double key is unboxed): the heap keyed by the
// sentence length holds the finished sequences
if (heapEntry.getKey() == sentenceLength) {
finalHeap = heapEntry.getValue();
break;
}
PriorityQueue<PosTagSequence> previousHeap = heapEntry.getValue();
// limit the breadth to K
int maxSequences = previousHeap.size() > this.beamWidth ? this.beamWidth : previousHeap.size();
for (int j = 0; j < maxSequences; j++) {
// take the next history from the head of the heap (ordering is defined by
// PosTagSequence - presumably best score first; confirm)
PosTagSequence history = previousHeap.poll();
Token token = history.getNextToken();
if (LOG.isTraceEnabled()) {
LOG.trace("#### Next history ( " + heapEntry.getKey() + "): " + history.toString());
LOG.trace("Prob: " + df.format(history.getScore()));
LOG.trace("Token: " + token.getText());
// trace only: print the sentence with the current token in square brackets
StringBuilder sb = new StringBuilder();
for (Token oneToken : history.getTokenSequence().listWithWhiteSpace()) {
if (oneToken.equals(token))
sb.append("[" + oneToken + "]");
else
sb.append(oneToken);
}
LOG.trace(sb.toString());
}
PosTaggerContext context = new PosTaggerContextImpl(token, history);
List<Decision> decisions = new ArrayList<Decision>();
boolean ruleApplied = false;
// does the token already carry a pos-tag attribute? if so, use it as the
// sole decision
if (token.getAttributes().containsKey(PosTagger.POS_TAG_ATTRIBUTE)) {
StringAttribute posTagCodeAttribute = (StringAttribute) token.getAttributes().get(PosTagger.POS_TAG_ATTRIBUTE);
String posTagCode = posTagCodeAttribute.getValue();
Decision positiveRuleDecision = new Decision(posTagCode);
decisions.add(positiveRuleDecision);
positiveRuleDecision.addAuthority("tokenAttribute");
ruleApplied = true;
if (LOG.isTraceEnabled()) {
LOG.trace("Token has attribute \"" + PosTagger.POS_TAG_ATTRIBUTE + "\". Setting posTag to: " + posTagCode);
}
}
// test the positive rules on the current token
if (!ruleApplied) {
if (posTaggerPositiveRules != null) {
for (PosTaggerRule rule : posTaggerPositiveRules) {
if (LOG.isTraceEnabled()) {
LOG.trace("Checking rule: " + rule.getCondition().getName());
}
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<Boolean> ruleResult = rule.getCondition().check(context, env);
if (ruleResult != null && ruleResult.getOutcome()) {
Decision positiveRuleDecision = new Decision(rule.getTag().getCode());
decisions.add(positiveRuleDecision);
positiveRuleDecision.addAuthority(rule.getCondition().getName());
ruleApplied = true;
if (LOG.isTraceEnabled()) {
LOG.trace("Rule applies. Setting posTag to: " + rule.getTag().getCode());
}
// the first positive rule that applies wins
break;
}
}
}
}
// no attribute and no positive rule: fall back on the decision maker
if (!ruleApplied) {
// test the features on the current token
List<FeatureResult<?>> featureResults = new ArrayList<FeatureResult<?>>();
for (PosTaggerFeature<?> posTaggerFeature : posTaggerFeatures) {
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<?> featureResult = posTaggerFeature.check(context, env);
if (featureResult != null)
featureResults.add(featureResult);
}
if (LOG.isTraceEnabled()) {
// trace only: print the feature results in sorted order
SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
for (String featureResultString : featureResultSet) {
LOG.trace(featureResultString);
}
}
// evaluate the feature results using the decision maker
decisions = this.decisionMaker.decide(featureResults);
for (ClassificationObserver observer : this.observers) {
observer.onAnalyse(token, featureResults, decisions);
}
// apply the negative rules
Set<String> eliminatedPosTags = new TreeSet<String>();
if (posTaggerNegativeRules != null) {
for (PosTaggerRule rule : posTaggerNegativeRules) {
if (LOG.isTraceEnabled()) {
LOG.trace("Checking negative rule: " + rule.getCondition().getName());
}
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<Boolean> ruleResult = rule.getCondition().check(context, env);
if (ruleResult != null && ruleResult.getOutcome()) {
eliminatedPosTags.add(rule.getTag().getCode());
if (LOG.isTraceEnabled()) {
LOG.trace("Rule applies. Eliminating posTag: " + rule.getTag().getCode());
}
}
}
if (eliminatedPosTags.size() > 0) {
// keep only the decisions whose outcome was not eliminated
List<Decision> decisionShortList = new ArrayList<Decision>();
for (Decision decision : decisions) {
if (!eliminatedPosTags.contains(decision.getOutcome())) {
decisionShortList.add(decision);
} else {
LOG.trace("Eliminating decision: " + decision.toString());
}
}
// never leave the token without any decision: if every decision was
// eliminated, the unfiltered list is kept
if (decisionShortList.size() > 0) {
decisions = decisionShortList;
} else {
LOG.debug("All decisions eliminated! Restoring original decisions.");
}
}
}
// trace only: list the possible pos-tags of the token
if (LOG.isTraceEnabled()) {
String posTags = "";
for (PosTag onePosTag : token.getPossiblePosTags()) {
posTags += onePosTag.getCode() + ",";
}
LOG.trace("Token: " + token.getText() + ". PosTags: " + posTags);
}
// drop decisions below the minimum probability, unless that would leave
// no decision at all
List<Decision> decisionShortList = new ArrayList<Decision>();
for (Decision decision : decisions) {
if (decision.getProbability() >= MIN_PROB_TO_STORE) {
decisionShortList.add(decision);
}
}
if (decisionShortList.size() > 0) {
decisions = decisionShortList;
}
}
// extend the history with each retained decision and store the resulting
// sequence on the heap matching the token's end
for (Decision decision : decisions) {
if (LOG.isTraceEnabled())
LOG.trace("Outcome: " + decision.getOutcome() + ", " + decision.getProbability());
PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, this.sessionId);
PosTagSequence sequence = new PosTagSequence(history);
sequence.addPosTaggedToken(posTaggedToken);
// only statistical decisions are recorded on the sequence
if (decision.isStatistical())
sequence.addDecision(decision);
double heapIndex = token.getEndIndex();
// a token whose start and end indexes coincide gets an extra half, so that
// its heap key differs from that of regular tokens ending at the same index
if (token.getStartIndex() == token.getEndIndex())
heapIndex += 0.5;
// if it's the last token, make sure we end
if (token.getIndex() == sequence.getTokenSequence().size() - 1)
heapIndex = sentenceLength;
if (LOG.isTraceEnabled())
LOG.trace("Heap index: " + heapIndex);
// fetch the heap for this key, creating it on first use
PriorityQueue<PosTagSequence> heap = heaps.get(heapIndex);
if (heap == null) {
heap = new PriorityQueue<PosTagSequence>();
heaps.put(heapIndex, heap);
}
heap.add(sequence);
}
// next outcome for this token
}
// next history
}
// next atomic index
// return the best sequences on the final heap, up to the beam width
List<PosTagSequence> sequences = new ArrayList<PosTagSequence>();
int i = 0;
while (!finalHeap.isEmpty()) {
// clone the pos tag sequences to ensure they don't share any underlying
// data (e.g. token sequences)
sequences.add(finalHeap.poll().clonePosTagSequence());
i++;
if (i >= this.getBeamWidth())
break;
}
// debug only: log the final sequences with their scores
if (LOG.isDebugEnabled()) {
LOG.debug("####Final postag sequences:");
int j = 1;
for (PosTagSequence sequence : sequences) {
if (LOG.isDebugEnabled()) {
LOG.debug("Sequence " + (j++) + ", score=" + df.format(sequence.getScore()));
LOG.debug("Sequence: " + sequence);
}
}
}
return sequences;
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class PosTagSequence method removeRoot.
/**
 * Strips the root entry from the front of this sequence, if there is one.
 * <p>
 * Nothing happens when the sequence is empty or when its first element is
 * not tagged with {@code PosTag.ROOT_POS_TAG}. Otherwise the root's token is
 * removed from the underlying token sequence, the first element of this
 * sequence is dropped, and the token sequence is flagged as root-less and
 * re-indexed.
 */
public void removeRoot() {
    if (this.size() <= 0) {
        return;
    }

    final PosTaggedToken firstEntry = this.get(0);
    final boolean startsWithRoot = firstEntry.getTag().equals(PosTag.ROOT_POS_TAG);
    if (!startsWithRoot) {
        return;
    }

    final Token rootPlaceholder = firstEntry.getToken();
    try {
        tokenSequence.removeEmptyToken(rootPlaceholder);
    } catch (TalismaneException unexpected) {
        // should never happen
        LOG.error(unexpected.getMessage(), unexpected);
        throw new RuntimeException(unexpected);
    }

    this.remove(0);
    tokenSequence.setWithRoot(false);
    tokenSequence.reindex();
}
Aggregations