use of com.joliciel.talismane.parser.DependencyArc in project talismane by joliciel-informatique.
the class ParseOutputRewriter method getCorpusLines.
/**
 * Converts a parse configuration into a list of corpus lines (one per non-root token) and then applies
 * every {@link SplitAction} whose rewrite rule matches a line, replacing that line by several lines and
 * renumbering the token and governor indexes accordingly.
 *
 * @param parseConfiguration
 *          the parse configuration to convert
 * @return the corpus lines, after splitting if at least one split rule matched
 * @throws TalismaneException
 *           if a rewrite rule has a condition on an element other than POSTAG, TOKEN, LEMMA or LABEL
 */
List<CorpusLine> getCorpusLines(ParseConfiguration parseConfiguration) throws TalismaneException {
  // first convert the parse configuration to a list of corpus lines
  List<CorpusLine> corpusLines = new ArrayList<>();
  for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
    if (!posTaggedToken.isRoot()) {
      DependencyArc arc = parseConfiguration.getGoverningDependency(posTaggedToken);
      DependencyArc nonProjArc = parseConfiguration.getGoverningDependency(posTaggedToken, false);
      // Tab-separated textual form of the line: the tag code is written twice, and a token without a
      // governing arc is written with governor 0 and label "_".
      String line = posTaggedToken.getIndex() + "\t" + posTaggedToken.getToken().getOriginalText() + "\t" + posTaggedToken.getLemmaForCoNLL() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getMorphologyForCoNLL() + "\t" + (arc != null ? arc.getHead().getIndex() : 0) + "\t" + (arc != null ? arc.getLabel() : "_");
      CorpusLine corpusLine = new CorpusLine(line, posTaggedToken.getToken().getLineNumber());
      corpusLine.setIndex(posTaggedToken.getIndex());
      corpusLine.setToken(posTaggedToken.getToken().getOriginalText());
      corpusLine.setLemma(posTaggedToken.getLemmaForCoNLL());
      corpusLine.setPosTag(posTaggedToken.getTag().getCode());
      String morphology = posTaggedToken.getMorphologyForCoNLL();
      corpusLine.setMorphology(morphology.length() == 0 ? "_" : morphology);
      corpusLine.setGovernorIndex(arc != null ? arc.getHead().getIndex() : 0);
      corpusLine.setLabel(arc != null ? arc.getLabel() : "_");
      corpusLine.setNonProjGovernorIndex(nonProjArc != null ? nonProjArc.getHead().getIndex() : 0);
      corpusLine.setNonProjLabel(nonProjArc != null ? nonProjArc.getLabel() : "_");
      if (posTaggedToken.getToken().getPrecedingRawOutput() != null)
        corpusLine.setElement(CorpusElement.PRECEDING_RAW_OUTPUT, posTaggedToken.getToken().getPrecedingRawOutput());
      if (posTaggedToken.getToken().getTrailingRawOutput() != null)
        corpusLine.setElement(CorpusElement.TRAILING_RAW_OUTPUT, posTaggedToken.getToken().getTrailingRawOutput());
      corpusLine.setTokenProbability(posTaggedToken.getToken().getProbability());
      corpusLine.setPosTagProbability(posTaggedToken.getProbability());
      if (arc != null)
        corpusLine.setParseProbability(arc.getProbability());
      corpusLines.add(corpusLine);
    }
  }

  // Find, for each line, the split action to apply. If several rules with a split action match the
  // same line, the last one in rewriteRules wins (later puts overwrite earlier ones).
  Map<CorpusLine, SplitAction> splitActions = new HashMap<>();
  for (CorpusLine corpusLine : corpusLines) {
    if (LOG.isDebugEnabled())
      LOG.debug(corpusLine.toString());
    for (RewriteRule rewriteRule : rewriteRules) {
      // a rule matches only if all of its conditions match
      boolean matches = true;
      conditionLoop: for (CorpusElement corpusElement : rewriteRule.conditions.keySet()) {
        Pattern pattern = rewriteRule.conditions.get(corpusElement);
        if (LOG.isTraceEnabled())
          LOG.trace("For " + corpusElement.name() + ", matching " + pattern.pattern());
        switch (corpusElement) {
        case POSTAG:
          if (!pattern.matcher(corpusLine.getPosTag()).matches()) {
            if (LOG.isTraceEnabled())
              LOG.trace("Match failed for " + corpusLine.getPosTag());
            matches = false;
            break conditionLoop;
          }
          break;
        case TOKEN:
          if (!pattern.matcher(corpusLine.getToken()).matches()) {
            matches = false;
            break conditionLoop;
          }
          break;
        case LEMMA:
          if (!pattern.matcher(corpusLine.getLemma()).matches()) {
            matches = false;
            break conditionLoop;
          }
          break;
        case LABEL:
          if (!pattern.matcher(corpusLine.getLabel()).matches()) {
            matches = false;
            break conditionLoop;
          }
          break;
        default:
          throw new TalismaneException(ParseOutputRewriter.class.getSimpleName() + " cannot match on " + corpusElement.name());
        }
      }
      if (matches) {
        if (rewriteRule.action instanceof SplitAction) {
          SplitAction splitAction = (SplitAction) rewriteRule.action;
          splitActions.put(corpusLine, splitAction);
        }
      }
    }
  }

  if (!splitActions.isEmpty()) {
    List<CorpusLine> newCorpusLines = new ArrayList<>();

    // Map each old 1-based index to its index after splitting; 0 (no governor) maps to itself.
    // The key i + 1 relies on the i-th corpus line carrying index i + 1.
    Map<Integer, Integer> oldToNewIndexMap = new HashMap<>();
    oldToNewIndexMap.put(0, 0);
    int currentIndex = 1;
    for (int i = 0; i < corpusLines.size(); i++) {
      CorpusLine corpusLine = corpusLines.get(i);
      oldToNewIndexMap.put(i + 1, currentIndex);
      SplitAction splitAction = splitActions.get(corpusLine);
      if (splitAction != null) {
        // a split line occupies one new index per split element
        currentIndex += splitAction.elementValues.size();
      } else {
        currentIndex++;
      }
    }

    for (int i = 0; i < corpusLines.size(); i++) {
      CorpusLine corpusLine = corpusLines.get(i);
      // renumbered copy of the original line: emitted as-is when the line is not split, and handed
      // to setElementValues alongside each split line otherwise
      CorpusLine newCorpusLine = corpusLine.cloneCorpusLine();
      newCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()));
      newCorpusLine.setGovernorIndex(oldToNewIndexMap.get(corpusLine.getGovernorIndex()));
      newCorpusLine.setNonProjGovernorIndex(oldToNewIndexMap.get(corpusLine.getNonProjGovernorIndex()));

      SplitAction splitAction = splitActions.get(corpusLine);
      if (splitAction != null) {
        for (int j = 0; j < splitAction.elementValues.size(); j++) {
          CorpusLine splitCorpusLine = new CorpusLine(corpusLine.getLine(), corpusLine.getLineNumber());
          splitCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()) + j);
          Map<CorpusElement, String> elementValues = splitAction.elementValues.get(j);
          this.setElementValues(elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);

          // Conditional actions are organised in groups, each group ending with a default action
          // which is applied only if no other action in the group matched.
          // NOTE(review): every matching action in a group is applied in order (not only the
          // first one) - confirm this is the intended behaviour.
          List<ConditionalAction> conditionalActions = splitAction.conditionalValues.get(j);
          boolean groupHasMatch = false;
          for (ConditionalAction conditionalAction : conditionalActions) {
            if (conditionalAction.isDefault) {
              if (!groupHasMatch) {
                Map<CorpusElement, String> conditionalElementValues = conditionalAction.elementValues;
                this.setElementValues(conditionalElementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
              }
              // The default action marks the end of each matching group.
              groupHasMatch = false;
            } else {
              // The conditions are tested against the line at a relative offset from the
              // current one. An offset pointing outside the sentence cannot match (it used to
              // throw an IndexOutOfBoundsException).
              int baseIndex = i + conditionalAction.relativeIndex;
              CorpusLine baseLine = baseIndex >= 0 && baseIndex < corpusLines.size() ? corpusLines.get(baseIndex) : null;
              boolean match = baseLine != null;
              if (match) {
                for (CorpusElement corpusElement : conditionalAction.conditions.keySet()) {
                  String origValue = baseLine.getElement(corpusElement);
                  Pattern pattern = conditionalAction.conditions.get(corpusElement);
                  // an element with no value cannot match (Pattern.matcher(null) would throw)
                  if (origValue == null || !pattern.matcher(origValue).matches()) {
                    match = false;
                    break;
                  }
                }
              }
              if (match) {
                Map<CorpusElement, String> conditionalElementValues = conditionalAction.elementValues;
                this.setElementValues(conditionalElementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                groupHasMatch = true;
              }
              // did this action match?
            }
            // default action?
          }
          // next conditional action
          newCorpusLines.add(splitCorpusLine);
        }
        // next split
      } else {
        newCorpusLines.add(newCorpusLine);
      }
      // should line be split?
    }
    // next corpus line
    corpusLines = newCorpusLines;
  }
  return corpusLines;
}
use of com.joliciel.talismane.parser.DependencyArc in project talismane by joliciel-informatique.
the class TransitionLogWriter method onParseEnd.
/**
 * Called at the end of a parse with the reference configuration and the guessed configurations.
 * <p>
 * When no error labels are configured, the best guess is always passed on to
 * {@link #onNextParseConfiguration}. Otherwise the best guess is only passed on if, for at least one
 * token whose reference arc carries one of the error labels, the guessed arc is missing or differs in
 * label or head. In that case two annotated lines (reference, then guess) are written first, with the
 * erroneous tokens and their governors surrounded by '#'.
 *
 * @param refConfiguration
 *          the reference parse
 * @param guessedConfigurations
 *          the guessed parses; only the first one is examined
 * @throws TalismaneException
 *           declared by the overridden method, and by onNextParseConfiguration - TODO confirm
 * @throws IOException
 *           if writing to the underlying writer fails
 */
@Override
public void onParseEnd(ParseConfiguration refConfiguration, List<ParseConfiguration> guessedConfigurations) throws TalismaneException, IOException {
  boolean includeMe = true;
  if (errorLabels != null && errorLabels.size() > 0) {
    includeMe = false;
    ParseConfiguration guessConfiguration = guessedConfigurations.get(0);
    Set<PosTaggedToken> refTokensToExplain = new HashSet<PosTaggedToken>();
    Set<PosTaggedToken> guessTokensToExplain = new HashSet<PosTaggedToken>();
    Set<PosTaggedToken> refTokensToHighlight = new HashSet<PosTaggedToken>();
    Set<PosTaggedToken> guessTokensToHighlight = new HashSet<PosTaggedToken>();

    int i = 0;
    for (PosTaggedToken refToken : refConfiguration.getPosTagSequence()) {
      // position 0 is skipped, as in the original loop (presumably the artificial root - TODO confirm)
      if (i != 0) {
        DependencyArc refArc = refConfiguration.getGoverningDependency(refToken);
        if (refArc != null) {
          // reference and guess are assumed to be aligned position by position
          PosTaggedToken guessToken = guessConfiguration.getPosTagSequence().get(i);
          if (errorLabels.contains(refArc.getLabel())) {
            DependencyArc guessArc = guessConfiguration.getGoverningDependency(guessToken);
            if (this.arcsDiffer(refArc, guessArc)) {
              refTokensToExplain.add(refToken);
              if (refArc.getHead() != null)
                refTokensToHighlight.add(refArc.getHead());
              guessTokensToExplain.add(guessToken);
              if (guessArc != null && guessArc.getHead() != null)
                guessTokensToHighlight.add(guessArc.getHead());
              includeMe = true;
            }
          }
        }
        // have refArc
      }
      i++;
    }

    if (includeMe) {
      // only build the annotated sentences when they are actually written
      String refLine = this.describeTokens(refConfiguration, refTokensToExplain, refTokensToHighlight);
      String guessLine = this.describeTokens(guessConfiguration, guessTokensToExplain, guessTokensToHighlight);
      writer.write("\n");
      writer.write(refLine + "\n");
      writer.write(guessLine + "\n");
    }
  }
  if (includeMe)
    this.onNextParseConfiguration(guessedConfigurations.get(0));
}

/**
 * Returns true if the guessed arc is missing, or differs from the reference arc in its label or in
 * the index of its head. Two arcs which both have no head are considered to have the same head
 * (this case previously caused a NullPointerException).
 */
private boolean arcsDiffer(DependencyArc refArc, DependencyArc guessArc) {
  if (guessArc == null)
    return true;
  if (!refArc.getLabel().equals(guessArc.getLabel()))
    return true;
  PosTaggedToken refHead = refArc.getHead();
  PosTaggedToken guessHead = guessArc.getHead();
  if (refHead == null || guessHead == null) {
    // differ only if exactly one of the two heads is missing
    return refHead != guessHead;
  }
  return refHead.getIndex() != guessHead.getIndex();
}

/**
 * Renders all tokens of a configuration on one line as text|tag|index, separated by spaces. Tokens to
 * explain are written as #text|tag|index|Gov[head index]|label#, tokens to highlight as
 * #text|tag|index#. Spaces inside the token text are replaced by underscores.
 */
private String describeTokens(ParseConfiguration configuration, Set<PosTaggedToken> tokensToExplain, Set<PosTaggedToken> tokensToHighlight) {
  StringBuilder builder = new StringBuilder();
  for (PosTaggedToken token : configuration.getPosTagSequence()) {
    String base = token.getToken().getOriginalText().replace(' ', '_') + "|" + token.getTag().getCode() + "|" + token.getIndex();
    if (tokensToExplain.contains(token)) {
      DependencyArc arc = configuration.getGoverningDependency(token);
      // a missing arc or missing head is written as governor 0; a missing arc has label "null"
      int governorIndex = arc == null || arc.getHead() == null ? 0 : arc.getHead().getIndex();
      String label = arc == null ? "null" : arc.getLabel();
      builder.append("#" + base + "|Gov" + governorIndex + "|" + label + "# ");
    } else if (tokensToHighlight.contains(token)) {
      builder.append("#" + base + "# ");
    } else {
      builder.append(base + " ");
    }
  }
  return builder.toString();
}
use of com.joliciel.talismane.parser.DependencyArc in project talismane by joliciel-informatique.
the class ParserFScoreCalculator method onParseEnd.
/**
 * Compares the governing arc of every non-root token in the reference parse with the governing arc of
 * the corresponding token in the best guess, and increments the f-score calculator with the pair
 * (reference outcome, guessed outcome).
 * <p>
 * Tokens are aligned by start index; a guessed token only corresponds to a reference token if both are
 * empty or both are non-empty. Outcomes are the arc label (or "head" for unlabeled evaluation),
 * "noHead", "noLabel", "wrongHead" or "wrongHeadWrongLabel".
 *
 * @param realConfiguration
 *          the reference parse
 * @param guessedConfigurations
 *          the guessed parses; only the first one is evaluated
 * @throws TalismaneException
 *           if more than half of the tokens in the reference sequence have no corresponding guessed token
 */
@Override
public void onParseEnd(ParseConfiguration realConfiguration, List<ParseConfiguration> guessedConfigurations) throws TalismaneException {
  PosTagSequence posTagSequence = realConfiguration.getPosTagSequence();
  ParseConfiguration bestGuess = guessedConfigurations.get(0);
  int mismatchedTokens = 0;
  for (PosTaggedToken posTaggedToken : posTagSequence) {
    if (!posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG)) {
      DependencyArc realArc = realConfiguration.getGoverningDependency(posTaggedToken, projective);
      DependencyArc guessedArc = null;
      boolean foundToken = false;
      for (PosTaggedToken guessedToken : bestGuess.getPosTagSequence()) {
        if (guessedToken.getToken().getStartIndex() == posTaggedToken.getToken().getStartIndex()) {
          // an empty token never corresponds to a non-empty token at the same position
          if (guessedToken.getToken().isEmpty() != posTaggedToken.getToken().isEmpty())
            continue;
          foundToken = true;
          guessedArc = bestGuess.getGoverningDependency(guessedToken, projective);
          break;
        }
      }
      if (!foundToken) {
        LOG.info("Mismatched token :" + posTaggedToken.getToken().getOriginalText() + ", index " + posTaggedToken.getToken().getIndex());
        mismatchedTokens += 1;
      }

      String realLabel = realArc == null ? "noHead" : labeledEvaluation ? realArc.getLabel() : "head";
      String guessedLabel = guessedArc == null ? "noHead" : labeledEvaluation ? guessedArc.getLabel() : "head";
      if (realLabel == null || realLabel.length() == 0)
        realLabel = "noLabel";
      if (guessedLabel == null || guessedLabel.length() == 0)
        guessedLabel = "noLabel";

      // an unlabeled attachment to the root
      // should be considered a "no head" rather than "no label"
      if (realArc != null && realArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && realLabel.equals("noLabel"))
        realLabel = "noHead";
      if (guessedArc != null && guessedArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && guessedLabel.equals("noLabel"))
        guessedLabel = "noHead";

      if (realArc == null || guessedArc == null) {
        fscoreCalculator.increment(realLabel, guessedLabel);
      } else {
        boolean sameHead = realArc.getHead().getToken().getStartIndex() == guessedArc.getHead().getToken().getStartIndex();
        if (sameHead) {
          fscoreCalculator.increment(realLabel, guessedLabel);
        } else if (guessedLabel.equals("noHead")) {
          fscoreCalculator.increment(realLabel, "noHead");
        } else if (java.util.Objects.equals(realArc.getLabel(), guessedArc.getLabel())) {
          // The raw arc labels are compared, even for unlabeled evaluation. The comparison is
          // null-safe because arc labels may be null (see the "noLabel" handling above).
          fscoreCalculator.increment(realLabel, "wrongHead");
        } else {
          fscoreCalculator.increment(realLabel, "wrongHeadWrongLabel");
        }
      }
      // have one of the arcs
    }
    // is root tag?
  }

  if ((double) mismatchedTokens / (double) posTagSequence.size() > 0.5) {
    // more than half of the tokens mismatched?
    throw new TalismaneException("Too many mismatched tokens in sentence: " + posTagSequence.getTokenSequence().getSentence().getText());
  }
}
use of com.joliciel.talismane.parser.DependencyArc in project talismane by joliciel-informatique.
the class ParserFScoreCalculatorByDistance method onParseEnd.
/**
 * Compares the governing arc of every non-root token in the reference parse with the arc guessed for
 * the same token, and increments the f-score calculator associated with the distance (in token
 * indexes) between the reference dependent and its reference head.
 * <p>
 * Tokens without a reference arc have no dependency distance and are skipped, as are tokens whose
 * reference outcome equals the skip label. When a tokeniser or pos-tagger was applied, guessed tokens
 * and heads are aligned with the reference by start index; otherwise the reference tokens themselves
 * are used.
 *
 * @param realConfiguration
 *          the reference parse
 * @param guessedConfigurations
 *          the guessed parses; only the first one is evaluated
 */
@Override
public void onParseEnd(ParseConfiguration realConfiguration, List<ParseConfiguration> guessedConfigurations) {
  PosTagSequence posTagSequence = realConfiguration.getPosTagSequence();
  ParseConfiguration bestGuess = guessedConfigurations.get(0);
  for (PosTaggedToken posTaggedToken : posTagSequence) {
    if (posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG))
      continue;

    DependencyArc realArc = realConfiguration.getGoverningDependency(posTaggedToken);
    if (realArc == null) {
      // no reference arc means no distance to file this token under
      // (dereferencing the arc here previously caused a NullPointerException)
      continue;
    }

    int depDistance = Math.abs(realArc.getHead().getToken().getIndex() - realArc.getDependent().getToken().getIndex());

    // one calculator per distance, created on first use
    FScoreCalculator<String> fscoreCalculator = fscoreByDistanceMap.get(depDistance);
    if (fscoreCalculator == null) {
      fscoreCalculator = new FScoreCalculator<String>(depDistance);
      fscoreByDistanceMap.put(depDistance, fscoreCalculator);
    }

    DependencyArc guessedArc = null;
    if (!hasTokeniser && !hasPosTagger) {
      guessedArc = bestGuess.getGoverningDependency(posTaggedToken);
    } else {
      for (PosTaggedToken guessedToken : bestGuess.getPosTagSequence()) {
        if (guessedToken.getToken().getStartIndex() == posTaggedToken.getToken().getStartIndex()) {
          guessedArc = bestGuess.getGoverningDependency(guessedToken);
          break;
        }
      }
    }

    String realLabel = labeledEvaluation ? realArc.getLabel() : "head";
    String guessedLabel = guessedArc == null ? "noHead" : labeledEvaluation ? guessedArc.getLabel() : "head";
    if (realLabel == null || realLabel.length() == 0)
      realLabel = "noLabel";
    if (guessedLabel == null || guessedLabel.length() == 0)
      guessedLabel = "noLabel";

    // an unlabeled attachment to the root
    // should be considered a "no head" rather than "no label"
    if (realArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && realLabel.equals("noLabel"))
      realLabel = "noHead";
    if (guessedArc != null && guessedArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && guessedLabel.equals("noLabel"))
      guessedLabel = "noHead";

    // skip this token only: returning here would silently drop the rest of the sentence
    if (realLabel.equals(skipLabel))
      continue;

    if (guessedArc == null) {
      fscoreCalculator.increment(realLabel, guessedLabel);
    } else {
      boolean sameHead = false;
      if (hasTokeniser || hasPosTagger)
        sameHead = realArc.getHead().getToken().getStartIndex() == guessedArc.getHead().getToken().getStartIndex();
      else
        sameHead = realArc.getHead().equals(guessedArc.getHead());

      if (sameHead) {
        fscoreCalculator.increment(realLabel, guessedLabel);
      } else if (guessedLabel.equals("noHead")) {
        fscoreCalculator.increment(realLabel, "noHead");
      } else if (java.util.Objects.equals(realArc.getLabel(), guessedArc.getLabel())) {
        // raw arc labels are compared null-safely, since arc labels may be null
        fscoreCalculator.increment(realLabel, "wrongHead");
      } else {
        fscoreCalculator.increment(realLabel, "wrongHeadWrongLabel");
      }
    }
  }
}
use of com.joliciel.talismane.parser.DependencyArc in project talismane by joliciel-informatique.
the class CorpusStatisticsWriter method onNextParseConfiguration.
/**
 * Updates the corpus statistics with one parsed sentence: sentence, token and word counts, unknown
 * word counts with respect to the reference statistics (when available), pos-tag and dependency label
 * counts, syntax depth and distance statistics, and the number of crossing arc pairs.
 *
 * @param parseConfiguration
 *          the parsed sentence to add to the statistics
 */
@Override
public void onNextParseConfiguration(ParseConfiguration parseConfiguration) {
  stats.sentenceCount++;
  stats.sentenceLengthStats.addValue(parseConfiguration.getPosTagSequence().size());

  // token-level statistics
  for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
    if (posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG))
      continue;

    Token token = posTaggedToken.getToken();
    String word = token.getOriginalText();
    stats.words.add(word);
    if (referenceStats != null) {
      if (!referenceStats.words.contains(word))
        stats.unknownTokenCount++;
    }
    if (alphanumeric.matcher(word).find()) {
      String lowercase = word.toLowerCase(TalismaneSession.get(sessionId).getLocale());
      stats.lowerCaseWords.add(lowercase);
      stats.alphanumericCount++;
      if (referenceStats != null) {
        if (!referenceStats.lowerCaseWords.contains(lowercase))
          stats.unknownAlphanumericCount++;
      }
    }

    stats.tokenCount++;

    Integer countObj = stats.posTagCounts.get(posTaggedToken.getTag().getCode());
    int count = countObj == null ? 0 : countObj.intValue();
    count++;
    stats.posTagCounts.put(posTaggedToken.getTag().getCode(), count);
  }

  // arc-level statistics: label counts, depth and distance
  int maxDepth = 0;
  DescriptiveStatistics avgSyntaxDepthForSentenceStats = new DescriptiveStatistics();
  for (DependencyArc arc : parseConfiguration.getNonProjectiveDependencies()) {
    Integer countObj = stats.depLabelCounts.get(arc.getLabel());
    int count = countObj == null ? 0 : countObj.intValue();
    count++;
    stats.depLabelCounts.put(arc.getLabel(), count);
    stats.totalDepCount++;

    // unattached stuff (e.g. punctuation): unlabeled arcs governed by the root
    boolean unattached = arc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && (arc.getLabel() == null || arc.getLabel().length() == 0);
    // constant-first equals: the label may be null on an arc which is not governed by the root
    boolean punctuation = "ponct".equals(arc.getLabel());
    if (!unattached && !punctuation) {
      // depth = number of arcs to climb from this arc's head before reaching an arc governed by
      // the root (or a head without governor)
      int depth = 0;
      DependencyArc theArc = arc;
      while (theArc != null && !theArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG)) {
        theArc = parseConfiguration.getGoverningDependency(theArc.getHead());
        depth++;
      }
      if (depth > maxDepth)
        maxDepth = depth;

      stats.syntaxDepthStats.addValue(depth);
      avgSyntaxDepthForSentenceStats.addValue(depth);

      int distance = Math.abs(arc.getHead().getToken().getIndex() - arc.getDependent().getToken().getIndex());
      stats.syntaxDistanceStats.addValue(distance);
    }
  }

  stats.maxSyntaxDepthStats.addValue(maxDepth);
  if (avgSyntaxDepthForSentenceStats.getN() > 0)
    stats.avgSyntaxDepthStats.addValue(avgSyntaxDepthForSentenceStats.getMean());
  if (maxDepth > stats.maxDepthCorpus)
    stats.maxDepthCorpus = maxDepth;

  // we cheat a little bit by only allowing each arc to count once
  // there could be a situation where there are two independent
  // non-projective arcs
  // crossing the same mother arc, but we prefer here to underestimate,
  // as this phenomenon is quite rare.
  Set<DependencyArc> nonProjectiveArcs = new HashSet<DependencyArc>();
  int i = 0;
  for (DependencyArc arc : parseConfiguration.getNonProjectiveDependencies()) {
    i++;
    if (arc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && (arc.getLabel() == null || arc.getLabel().length() == 0))
      continue;
    if (nonProjectiveArcs.contains(arc))
      continue;

    int headIndex = arc.getHead().getToken().getIndex();
    int depIndex = arc.getDependent().getToken().getIndex();
    int startIndex = Math.min(headIndex, depIndex);
    int endIndex = Math.max(headIndex, depIndex);

    int j = 0;
    for (DependencyArc otherArc : parseConfiguration.getNonProjectiveDependencies()) {
      j++;
      // only compare with arcs located after the current one, so each pair is examined once
      if (j <= i)
        continue;
      if (otherArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && (otherArc.getLabel() == null || otherArc.getLabel().length() == 0))
        continue;
      if (nonProjectiveArcs.contains(otherArc))
        continue;

      int headIndex2 = otherArc.getHead().getToken().getIndex();
      int depIndex2 = otherArc.getDependent().getToken().getIndex();
      int startIndex2 = Math.min(headIndex2, depIndex2);
      int endIndex2 = Math.max(headIndex2, depIndex2);

      // two arcs cross when exactly one end of the other arc lies strictly inside this arc's span
      boolean crossesFromLeft = startIndex2 < startIndex && endIndex2 > startIndex && endIndex2 < endIndex;
      boolean crossesFromRight = startIndex2 > startIndex && startIndex2 < endIndex && endIndex2 > endIndex;
      if (crossesFromLeft || crossesFromRight) {
        nonProjectiveArcs.add(arc);
        nonProjectiveArcs.add(otherArc);
        stats.nonProjectiveCount++;
        if (LOG.isDebugEnabled()) {
          LOG.debug("Non-projective arcs in sentence: " + parseConfiguration.getSentence().getText());
          LOG.debug(arc.toString());
          LOG.debug(otherArc.toString());
        }
        break;
      }
    }
  }
}
Aggregations