use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class TransitionBasedParser method parseSentence.
/**
 * Parses a sentence with a beam search over parse configurations, starting
 * from one or more pos-tag sequences.
 * <p>
 * If {@code propagatePosTaggerBeam} is false, only {@code input.get(0)} is
 * used as a starting point; otherwise every sequence in {@code input} seeds
 * the search. Configurations are stored in heaps keyed by
 * {@code parseComparisonStrategy.getComparisonIndex(configuration) * 1000};
 * terminal configurations all go to a single heap keyed by
 * {@code Integer.MAX_VALUE}. Heaps are processed in increasing key order.
 * <p>
 * The search stops when any of the following holds for the heap just polled:
 * its top configuration is terminal; {@code earlyStop} is set and the
 * terminal heap already holds {@code beamWidth} entries; the configured
 * maximum analysis time has been exceeded; free memory is below the
 * configured minimum; or there are no heaps left.
 *
 * @param input
 *          the candidate pos-tag sequences; must contain at least one
 *          element, since {@code get(0)} is called unconditionally
 * @return at most {@code getBeamWidth()} configurations, polled in priority
 *         order from the final heap, or from the backup heap if the final
 *         heap is empty
 * @throws TalismaneException
 *           declared by this method; which of the calls below raises it is
 *           not visible from this body
 * @throws IOException
 *           declared by this method; which of the calls below raises it is
 *           not visible from this body
 */
@Override
public List<ParseConfiguration> parseSentence(List<PosTagSequence> input) throws TalismaneException, IOException {
// either keep the whole pos-tagger beam, or only its first sequence
List<PosTagSequence> posTagSequences = null;
if (this.propagatePosTaggerBeam) {
posTagSequences = input;
} else {
posTagSequences = new ArrayList<>(1);
posTagSequences.add(input.get(0));
}
long startTime = System.currentTimeMillis();
// convert the configured limits: seconds to milliseconds, and the memory
// setting multiplied by KILOBYTE.
// NOTE(review): both products are computed in int arithmetic, so very
// large settings could overflow - confirm the expected ranges.
int maxAnalysisTimeMilliseconds = maxAnalysisTimePerSentence * 1000;
int minFreeMemoryBytes = minFreeMemory * KILOBYTE;
// only used below to report the sentence text in log messages
TokenSequence tokenSequence = posTagSequences.get(0).getTokenSequence();
// heaps keyed by comparison index; a TreeMap so that pollFirstEntry()
// always hands back the heap with the lowest index
TreeMap<Integer, PriorityQueue<ParseConfiguration>> heaps = new TreeMap<>();
PriorityQueue<ParseConfiguration> heap0 = new PriorityQueue<>();
for (PosTagSequence posTagSequence : posTagSequences) {
// add an initial ParseConfiguration for each postag sequence
ParseConfiguration initialConfiguration = new ParseConfiguration(posTagSequence);
initialConfiguration.setScoringStrategy(decisionMaker.getDefaultScoringStrategy());
heap0.add(initialConfiguration);
if (LOG.isDebugEnabled()) {
LOG.debug("Adding initial posTagSequence: " + posTagSequence);
}
}
heaps.put(0, heap0);
// backupHeap: histories for which some decision could not be applied,
// rebuilt for every polled heap and used only if the final heap is empty
PriorityQueue<ParseConfiguration> backupHeap = null;
// finalHeap: the heap from which the results are read at the end
PriorityQueue<ParseConfiguration> finalHeap = null;
// terminalHeap: shared by all terminal configurations (key Integer.MAX_VALUE)
PriorityQueue<ParseConfiguration> terminalHeap = new PriorityQueue<>();
while (heaps.size() > 0) {
Entry<Integer, PriorityQueue<ParseConfiguration>> heapEntry = heaps.pollFirstEntry();
PriorityQueue<ParseConfiguration> currentHeap = heapEntry.getValue();
int currentHeapIndex = heapEntry.getKey();
if (LOG.isTraceEnabled()) {
LOG.trace("##### Polling next heap: " + heapEntry.getKey() + ", size: " + heapEntry.getValue().size());
}
boolean finished = false;
// systematically set the final heap here, just in case we exit
// "naturally" with no more heaps
finalHeap = heapEntry.getValue();
backupHeap = new PriorityQueue<>();
// we jump out when either (a) all tokens have been attached or
// (b) we go over the max allotted time
// peek() is safe here: a heap is only ever put in the map together with
// at least one configuration (heap0 above, nextHeap below)
ParseConfiguration topConf = currentHeap.peek();
if (topConf.isTerminal()) {
LOG.trace("Exiting with terminal heap: " + heapEntry.getKey() + ", size: " + heapEntry.getValue().size());
finished = true;
}
// early stop: enough terminal configurations collected, return those
// instead of the heap that was just polled
if (earlyStop && terminalHeap.size() >= beamWidth) {
LOG.debug("Early stop activated and terminal heap contains " + beamWidth + " entries. Exiting.");
finalHeap = terminalHeap;
finished = true;
}
// time limit, only enforced when a positive maximum is configured
long analysisTime = System.currentTimeMillis() - startTime;
if (maxAnalysisTimePerSentence > 0 && analysisTime > maxAnalysisTimeMilliseconds) {
LOG.info("Parse tree analysis took too long for sentence: " + tokenSequence.getSentence().getText());
LOG.info("Breaking out after " + maxAnalysisTimePerSentence + " seconds.");
finished = true;
}
// memory limit, only enforced when a positive minimum is configured
if (minFreeMemory > 0) {
long freeMemory = Runtime.getRuntime().freeMemory();
if (freeMemory < minFreeMemoryBytes) {
LOG.info("Not enough memory left to parse sentence: " + tokenSequence.getSentence().getText());
LOG.info("Min free memory (bytes):" + minFreeMemoryBytes);
LOG.info("Current free memory (bytes): " + freeMemory);
finished = true;
}
}
if (finished) {
break;
}
// limit the breadth to K
int maxSequences = currentHeap.size() > this.beamWidth ? this.beamWidth : currentHeap.size();
// j counts the histories for which at least one transition was applied
int j = 0;
while (currentHeap.size() > 0) {
ParseConfiguration history = currentHeap.poll();
if (LOG.isTraceEnabled()) {
LOG.trace("### Next configuration on heap " + heapEntry.getKey() + ":");
LOG.trace(history.toString());
LOG.trace("Score: " + df.format(history.getScore()));
LOG.trace(history.getPosTagSequence().toString());
}
List<Decision> decisions = new ArrayList<>();
// test the positive rules on the current configuration;
// the first rule whose condition holds fixes the single decision
boolean ruleApplied = false;
if (parserPositiveRules != null) {
for (ParserRule rule : parserPositiveRules) {
if (LOG.isTraceEnabled()) {
LOG.trace("Checking rule: " + rule.toString());
}
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<Boolean> ruleResult = rule.getCondition().check(history, env);
if (ruleResult != null && ruleResult.getOutcome()) {
Decision positiveRuleDecision = new Decision(rule.getTransition().getCode());
decisions.add(positiveRuleDecision);
positiveRuleDecision.addAuthority(rule.getCondition().getName());
ruleApplied = true;
if (LOG.isTraceEnabled()) {
LOG.trace("Rule applies. Setting transition to: " + rule.getTransition().getCode());
}
break;
}
}
}
if (!ruleApplied) {
// test the features on the current configuration
List<FeatureResult<?>> parseFeatureResults = new ArrayList<>();
for (ParseConfigurationFeature<?> feature : this.parseFeatures) {
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<?> featureResult = feature.check(history, env);
if (featureResult != null)
parseFeatureResults.add(featureResult);
}
if (LOG_FEATURES.isTraceEnabled()) {
// sorted so that the trace output has a stable order
SortedSet<String> featureResultSet = parseFeatureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<>()));
for (String featureResultString : featureResultSet) {
LOG_FEATURES.trace(featureResultString);
}
}
// evaluate the feature results using the decision maker
decisions = this.decisionMaker.decide(parseFeatureResults);
// observers see the full decision list, before any filtering
for (ClassificationObserver observer : this.observers) {
observer.onAnalyse(history, parseFeatureResults, decisions);
}
// drop decisions whose probability is not above MIN_PROB_TO_STORE
List<Decision> decisionShortList = new ArrayList<>(decisions.size());
for (Decision decision : decisions) {
if (decision.getProbability() > MIN_PROB_TO_STORE)
decisionShortList.add(decision);
}
decisions = decisionShortList;
// apply the negative rules
Set<String> eliminatedTransitions = new HashSet<>();
if (parserNegativeRules != null) {
for (ParserRule rule : parserNegativeRules) {
if (LOG.isTraceEnabled()) {
LOG.trace("Checking negative rule: " + rule.toString());
}
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<Boolean> ruleResult = rule.getCondition().check(history, env);
if (ruleResult != null && ruleResult.getOutcome()) {
for (Transition transition : rule.getTransitions()) {
eliminatedTransitions.add(transition.getCode());
if (LOG.isTraceEnabled())
LOG.trace("Rule applies. Eliminating transition: " + transition.getCode());
}
}
}
if (eliminatedTransitions.size() > 0) {
decisionShortList = new ArrayList<>();
for (Decision decision : decisions) {
if (!eliminatedTransitions.contains(decision.getOutcome())) {
decisionShortList.add(decision);
} else {
LOG.trace("Eliminating decision: " + decision.toString());
}
}
// only adopt the filtered list if something survives; otherwise
// the unfiltered decisions are kept as they are
if (decisionShortList.size() > 0) {
decisions = decisionShortList;
} else {
LOG.debug("All decisions eliminated! Restoring original decisions.");
}
}
}
}
// tracks whether at least one decision could be applied to this history
boolean transitionApplied = false;
TransitionSystem transitionSystem = TalismaneSession.get(sessionId).getTransitionSystem();
// try every remaining decision against the current history
for (Decision decision : decisions) {
Transition transition = transitionSystem.getTransitionForCode(decision.getOutcome());
if (LOG.isTraceEnabled())
LOG.trace("Outcome: " + transition.getCode() + ", " + decision.getProbability());
if (transition.checkPreconditions(history)) {
transitionApplied = true;
// work on a copy so that the history can be reused for other decisions
ParseConfiguration configuration = new ParseConfiguration(history);
// only statistical decisions are recorded on the configuration
if (decision.isStatistical())
configuration.addDecision(decision);
transition.apply(configuration);
// target heap: terminal configurations all share Integer.MAX_VALUE;
// any other index is bumped until it is strictly greater than the
// current heap's index, so a configuration never lands on the heap
// being processed or on an earlier one
int nextHeapIndex = parseComparisonStrategy.getComparisonIndex(configuration) * 1000;
if (configuration.isTerminal()) {
nextHeapIndex = Integer.MAX_VALUE;
} else {
while (nextHeapIndex <= currentHeapIndex) nextHeapIndex++;
}
PriorityQueue<ParseConfiguration> nextHeap = heaps.get(nextHeapIndex);
if (nextHeap == null) {
// first configuration for this index: register the heap, reusing
// the single terminalHeap instance for terminal configurations
if (configuration.isTerminal())
nextHeap = terminalHeap;
else
nextHeap = new PriorityQueue<>();
heaps.put(nextHeapIndex, nextHeap);
if (LOG.isTraceEnabled())
LOG.trace("Created heap with index: " + nextHeapIndex);
}
nextHeap.add(configuration);
if (LOG.isTraceEnabled()) {
LOG.trace("Added configuration with score " + configuration.getScore() + " to heap: " + nextHeapIndex + ", total size: " + nextHeap.size());
}
configuration.clearMemory();
} else {
if (LOG.isTraceEnabled())
LOG.trace("Cannot apply transition: doesn't meet pre-conditions");
// just in case we run out of both heaps and
// analyses, we build this backup heap
// NOTE(review): the history is added once per rejected decision, so
// the same history may be present several times in backupHeap
backupHeap.add(history);
}
// end of pre-condition check for this decision
}
if (transitionApplied) {
j++;
} else {
LOG.trace("No transitions could be applied: not counting this history as part of the beam");
}
// beam width test
if (j == maxSequences)
break;
}
// next history
}
// next atomic index
// return the best sequences on the heap
List<ParseConfiguration> bestConfigurations = new ArrayList<>();
int i = 0;
// fall back on the histories collected for the last polled heap
if (finalHeap.isEmpty())
finalHeap = backupHeap;
while (!finalHeap.isEmpty()) {
bestConfigurations.add(finalHeap.poll());
i++;
if (i >= this.getBeamWidth())
break;
}
if (LOG.isDebugEnabled()) {
for (ParseConfiguration finalConfiguration : bestConfigurations) {
LOG.debug(df.format(finalConfiguration.getScore()) + ": " + finalConfiguration.toString());
LOG.debug("Pos tag sequence: " + finalConfiguration.getPosTagSequence());
LOG.debug("Transitions: " + finalConfiguration.getTransitions());
LOG.debug("Decisions: " + finalConfiguration.getDecisions());
if (LOG.isTraceEnabled()) {
// trace line 1: the probability of every parse decision
StringBuilder sb = new StringBuilder();
for (Decision decision : finalConfiguration.getDecisions()) {
sb.append(" * ");
sb.append(df.format(decision.getProbability()));
}
sb.append(" root ");
sb.append(finalConfiguration.getTransitions().size());
LOG.trace(sb.toString());
// trace line 2: the pos-tag sequence score and its per-token probabilities
sb = new StringBuilder();
sb.append(" * PosTag sequence score ");
sb.append(df.format(finalConfiguration.getPosTagSequence().getScore()));
sb.append(" = ");
for (PosTaggedToken posTaggedToken : finalConfiguration.getPosTagSequence()) {
sb.append(" * ");
sb.append(df.format(posTaggedToken.getDecision().getProbability()));
}
sb.append(" root ");
sb.append(finalConfiguration.getPosTagSequence().size());
LOG.trace(sb.toString());
// trace line 3: the token sequence score
sb = new StringBuilder();
sb.append(" * Token sequence score = ");
sb.append(df.format(finalConfiguration.getPosTagSequence().getTokenSequence().getScore()));
LOG.trace(sb.toString());
}
}
}
return bestConfigurations;
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class PosTagFeatureTester method onNextPosTagSequence.
/**
 * Records, for every token of the sequence whose lower-cased analysis text
 * is one of {@code testWords}, the feature results obtained for that token.
 * <p>
 * Each feature result is stored in {@code featureResultMap} under its string
 * form, then under the code of the token's pos-tag, together with a
 * one-line rendering of the sentence in which the token is bracketed, the
 * tokens before it carry their tags and the tokens after it do not.
 * Features are checked against a history containing only the tokens that
 * precede the token under test.
 *
 * @param posTagSequence
 *          the tagged sentence to inspect
 * @throws TalismaneException
 *           declared by this method; the raising call is not identifiable
 *           from this body alone
 */
@Override
public void onNextPosTagSequence(PosTagSequence posTagSequence) throws TalismaneException {
  // grows by one token per iteration, so it never contains the token under test
  PosTagSequence history = new PosTagSequence(posTagSequence.getTokenSequence());

  for (PosTaggedToken target : posTagSequence) {
    String loweredText = target.getToken().getAnalyisText().toLowerCase();
    if (testWords.contains(loweredText)) {
      // render the sentence: tagged up to and including the target, untagged after
      StringBuilder rendering = new StringBuilder();
      boolean targetSeen = false;
      for (PosTaggedToken current : posTagSequence) {
        if (current.equals(target)) {
          rendering.append(" [").append(current.getToken().getOriginalText().replace(' ', '_')).append("/").append(current.getTag().toString())
              .append("]");
          targetSeen = true;
        } else if (targetSeen) {
          rendering.append(" ").append(current.getToken().getOriginalText().replace(' ', '_'));
        } else {
          rendering.append(" ").append(current.getToken().getOriginalText().replace(' ', '_')).append("/").append(current.getTag().toString());
        }
      }
      String renderedSentence = rendering.toString();
      LOG.debug(renderedSentence);

      String tagCode = target.getTag().getCode();
      PosTaggerContext context = new PosTaggerContextImpl(target.getToken(), history);

      // keep only the features that actually produced a result
      List<FeatureResult<?>> results = new ArrayList<>();
      for (PosTaggerFeature<?> feature : posTaggerFeatures) {
        FeatureResult<?> result = feature.check(context, new RuntimeEnvironment());
        if (result != null) {
          results.add(result);
        }
      }

      if (LOG.isTraceEnabled()) {
        LOG.trace("Token: " + target.getToken().getAnalyisText());
        for (FeatureResult<?> result : results) {
          LOG.trace(result.toString());
        }
      }

      // file the rendered sentence under feature result -> tag code
      for (FeatureResult<?> result : results) {
        String resultKey = result.toString();
        Map<String, List<String>> byTagCode = featureResultMap.get(resultKey);
        if (byTagCode == null) {
          byTagCode = new TreeMap<>();
          featureResultMap.put(resultKey, byTagCode);
        }
        List<String> examples = byTagCode.get(tagCode);
        if (examples == null) {
          examples = new ArrayList<>();
          byTagCode.put(tagCode, examples);
        }
        examples.add(renderedSentence);
      }
    }
    history.addPosTaggedToken(target);
  }
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class ParserEvaluator method evaluate.
/**
 * Evaluates the parser on every sentence supplied by the corpus reader.
 * <p>
 * For each sentence the gold configuration is read, the input pos-tag
 * sequences are produced (re-tokenising and/or re-tagging when a tokeniser
 * or pos-tagger is set, otherwise reusing the gold ones), the sentence is
 * parsed, and the observers are notified before and after parsing. Once the
 * corpus is exhausted, all observers are told the evaluation is complete.
 *
 * @throws TalismaneException
 *           if an attempt is made to evaluate with a tokeniser but no
 *           pos-tagger
 * @throws IOException
 *           declared by this method; the raising call is not identifiable
 *           from this body alone
 */
public void evaluate() throws TalismaneException, IOException {
  while (corpusReader.hasNextSentence()) {
    final ParseConfiguration gold = corpusReader.nextConfiguration();

    // stage 1: token sequences, either freshly tokenised or taken from the gold data
    final List<TokenSequence> tokenCandidates;
    if (tokeniser == null) {
      // work on a clone so that removing the root leaves the gold sequence intact
      PosTagSequence rootless = gold.getPosTagSequence().clonePosTagSequence();
      rootless.removeRoot();
      tokenCandidates = new ArrayList<>();
      tokenCandidates.add(rootless.getTokenSequence());
    } else {
      if (posTagger == null) {
        throw new TalismaneException("Cannot evaluate with tokeniser but no pos-tagger");
      }
      Sentence sentence = gold.getPosTagSequence().getTokenSequence().getSentence();
      // annotate the sentence for pre token filters
      for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
        annotator.annotate(sentence);
        if (LOG.isTraceEnabled()) {
          LOG.trace("TokenFilter: " + annotator);
          LOG.trace("annotations: " + sentence.getAnnotations());
        }
      }
      tokenCandidates = tokeniser.tokenise(sentence);
    }

    // stage 2: pos-tag sequences, either freshly tagged or the gold one
    final List<PosTagSequence> tagCandidates;
    if (posTagger == null) {
      tagCandidates = new ArrayList<>();
      tagCandidates.add(gold.getPosTagSequence());
    } else if (posTagger instanceof NonDeterministicPosTagger) {
      tagCandidates = ((NonDeterministicPosTagger) posTagger).tagSentence(tokenCandidates);
    } else {
      // a deterministic tagger only sees the first token sequence
      tagCandidates = new ArrayList<>();
      tagCandidates.add(posTagger.tagSentence(tokenCandidates.get(0)));
    }

    for (ParseEvaluationObserver observer : this.observers) {
      observer.onParseStart(gold, tagCandidates);
    }

    // stage 3: parse, keeping the full beam when the parser provides one
    final List<ParseConfiguration> guesses;
    if (parser instanceof NonDeterministicParser) {
      guesses = ((NonDeterministicParser) parser).parseSentence(tagCandidates);
    } else {
      ParseConfiguration single = parser.parseSentence(tagCandidates.get(0));
      guesses = new ArrayList<>();
      guesses.add(single);
    }

    for (ParseEvaluationObserver observer : this.observers) {
      observer.onParseEnd(gold, guesses);
    }
  }

  for (ParseEvaluationObserver observer : this.observers) {
    observer.onEvaluationComplete();
  }
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class ParserFScoreCalculator method onParseEnd.
/**
 * Compares the best guessed configuration with the real one, token by
 * token, and increments the f-score calculator with the (real, guessed)
 * outcome of each non-root token.
 * <p>
 * Guessed tokens are matched to real tokens by start index; an empty token
 * only matches an empty token. A real token with no matching guessed token
 * is logged, counted as mismatched and scored with a guessed label of
 * "noHead". When both arcs exist but their heads differ, the guess is
 * scored as "noHead", "wrongHead" (same arc label) or "wrongHeadWrongLabel".
 *
 * @param realConfiguration
 *          the reference configuration
 * @param guessedConfigurations
 *          the guessed configurations; only the first one is evaluated
 * @throws TalismaneException
 *           if more than half of the tokens of the real sequence could not
 *           be matched to a guessed token
 */
@Override
public void onParseEnd(ParseConfiguration realConfiguration, List<ParseConfiguration> guessedConfigurations) throws TalismaneException {
  PosTagSequence posTagSequence = realConfiguration.getPosTagSequence();
  ParseConfiguration bestGuess = guessedConfigurations.get(0);
  int mismatchedTokens = 0;
  for (PosTaggedToken posTaggedToken : posTagSequence) {
    // the artificial root has no governor to evaluate
    if (posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG)) {
      continue;
    }

    DependencyArc realArc = realConfiguration.getGoverningDependency(posTaggedToken, projective);

    // find the guessed token starting at the same position, with the same emptiness
    DependencyArc guessedArc = null;
    boolean foundToken = false;
    for (PosTaggedToken guessedToken : bestGuess.getPosTagSequence()) {
      if (guessedToken.getToken().getStartIndex() == posTaggedToken.getToken().getStartIndex()) {
        if (guessedToken.getToken().isEmpty() && !posTaggedToken.getToken().isEmpty())
          continue;
        if (!guessedToken.getToken().isEmpty() && posTaggedToken.getToken().isEmpty())
          continue;
        foundToken = true;
        guessedArc = bestGuess.getGoverningDependency(guessedToken, projective);
        break;
      }
    }
    if (!foundToken) {
      LOG.info("Mismatched token :" + posTaggedToken.getToken().getOriginalText() + ", index " + posTaggedToken.getToken().getIndex());
      mismatchedTokens += 1;
    }

    String realLabel = realArc == null ? "noHead" : labeledEvaluation ? realArc.getLabel() : "head";
    String guessedLabel = guessedArc == null ? "noHead" : labeledEvaluation ? guessedArc.getLabel() : "head";
    if (realLabel == null || realLabel.length() == 0)
      realLabel = "noLabel";
    if (guessedLabel == null || guessedLabel.length() == 0)
      guessedLabel = "noLabel";

    // an unlabeled attachment to the root
    // should be considered a "no head" rather than "no label"
    if (realArc != null && realArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && realLabel.equals("noLabel"))
      realLabel = "noHead";
    if (guessedArc != null && guessedArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && guessedLabel.equals("noLabel"))
      guessedLabel = "noHead";

    if (realArc == null || guessedArc == null) {
      // at most one of the arcs exists: the normalised labels say it all
      fscoreCalculator.increment(realLabel, guessedLabel);
    } else {
      boolean sameHead = realArc.getHead().getToken().getStartIndex() == guessedArc.getHead().getToken().getStartIndex();
      if (sameHead) {
        fscoreCalculator.increment(realLabel, guessedLabel);
      } else if (guessedLabel.equals("noHead")) {
        fscoreCalculator.increment(realLabel, "noHead");
      } else {
        // arc labels may be null (see the normalisation above), so the raw
        // labels are compared null-safely instead of calling equals() on
        // a possibly null reference
        String realArcLabel = realArc.getLabel();
        String guessedArcLabel = guessedArc.getLabel();
        boolean sameArcLabel = realArcLabel == null ? guessedArcLabel == null : realArcLabel.equals(guessedArcLabel);
        if (sameArcLabel) {
          fscoreCalculator.increment(realLabel, "wrongHead");
        } else {
          fscoreCalculator.increment(realLabel, "wrongHeadWrongLabel");
        }
      }
    }
  }

  if ((double) mismatchedTokens / (double) posTagSequence.size() > 0.5) {
    // more than half of the tokens mismatched?
    throw new TalismaneException("Too many mismatched tokens in sentence: " + posTagSequence.getTokenSequence().getSentence().getText());
  }
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class ParserFScoreCalculatorByDistance method onParseEnd.
/**
 * Compares the best guessed configuration with the real one, token by
 * token, and increments an f-score calculator chosen by the distance (in
 * token indexes) between each real dependent and its real head.
 * <p>
 * A calculator is created on demand for every distance encountered. Tokens
 * whose real arc is missing are counted in the calculator for distance 0,
 * a value no actual head/dependent pair produces in the distance formula
 * below unless both tokens share an index. Tokens whose real label equals
 * {@code skipLabel} are not scored, but the remaining tokens of the
 * sentence still are.
 * <p>
 * When neither a tokeniser nor a pos-tagger was used, guessed arcs are
 * looked up with the real token itself and heads are compared with
 * {@code equals}; otherwise tokens and heads are matched by start index.
 *
 * @param realConfiguration
 *          the reference configuration
 * @param guessedConfigurations
 *          the guessed configurations; only the first one is evaluated
 */
@Override
public void onParseEnd(ParseConfiguration realConfiguration, List<ParseConfiguration> guessedConfigurations) {
  PosTagSequence posTagSequence = realConfiguration.getPosTagSequence();
  ParseConfiguration bestGuess = guessedConfigurations.get(0);
  for (PosTaggedToken posTaggedToken : posTagSequence) {
    // the artificial root has no governor to evaluate
    if (posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG))
      continue;

    DependencyArc realArc = realConfiguration.getGoverningDependency(posTaggedToken);

    // absolute distance between head and dependent; a token without a real
    // arc has no distance, so it is filed under 0 rather than dereferencing
    // a null arc
    int depDistance = 0;
    if (realArc != null) {
      depDistance = Math.abs(realArc.getHead().getToken().getIndex() - realArc.getDependent().getToken().getIndex());
    }

    FScoreCalculator<String> fscoreCalculator = fscoreByDistanceMap.get(depDistance);
    if (fscoreCalculator == null) {
      fscoreCalculator = new FScoreCalculator<String>(depDistance);
      fscoreByDistanceMap.put(depDistance, fscoreCalculator);
    }

    DependencyArc guessedArc = null;
    if (!hasTokeniser && !hasPosTagger) {
      // same tokens on both sides: the real token can be used directly
      guessedArc = bestGuess.getGoverningDependency(posTaggedToken);
    } else {
      // tokens may differ: match on the start index
      for (PosTaggedToken guessedToken : bestGuess.getPosTagSequence()) {
        if (guessedToken.getToken().getStartIndex() == posTaggedToken.getToken().getStartIndex()) {
          guessedArc = bestGuess.getGoverningDependency(guessedToken);
          break;
        }
      }
    }

    String realLabel = realArc == null ? "noHead" : labeledEvaluation ? realArc.getLabel() : "head";
    String guessedLabel = guessedArc == null ? "noHead" : labeledEvaluation ? guessedArc.getLabel() : "head";
    if (realLabel == null || realLabel.length() == 0)
      realLabel = "noLabel";
    if (guessedLabel == null || guessedLabel.length() == 0)
      guessedLabel = "noLabel";

    // an unlabeled attachment to the root
    // should be considered a "no head" rather than "no label"
    if (realArc != null && realArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && realLabel.equals("noLabel"))
      realLabel = "noHead";
    if (guessedArc != null && guessedArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && guessedLabel.equals("noLabel"))
      guessedLabel = "noHead";

    // skip this token only: the rest of the sentence must still be scored
    if (realLabel.equals(skipLabel))
      continue;

    if (realArc == null || guessedArc == null) {
      // at most one of the arcs exists: the normalised labels say it all
      fscoreCalculator.increment(realLabel, guessedLabel);
    } else {
      boolean sameHead = false;
      if (hasTokeniser || hasPosTagger)
        sameHead = realArc.getHead().getToken().getStartIndex() == guessedArc.getHead().getToken().getStartIndex();
      else
        sameHead = realArc.getHead().equals(guessedArc.getHead());

      if (sameHead) {
        fscoreCalculator.increment(realLabel, guessedLabel);
      } else if (guessedLabel.equals("noHead")) {
        fscoreCalculator.increment(realLabel, "noHead");
      } else {
        // arc labels may be null (see the normalisation above), so the raw
        // labels are compared null-safely instead of calling equals() on
        // a possibly null reference
        String realArcLabel = realArc.getLabel();
        String guessedArcLabel = guessedArc.getLabel();
        boolean sameArcLabel = realArcLabel == null ? guessedArcLabel == null : realArcLabel.equals(guessedArcLabel);
        if (sameArcLabel) {
          fscoreCalculator.increment(realLabel, "wrongHead");
        } else {
          fscoreCalculator.increment(realLabel, "wrongHeadWrongLabel");
        }
      }
    }
  }
}
Aggregations