Use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
Class CorpusStatisticsWriter, method onNextParseConfiguration.
/**
 * Updates the running corpus statistics with one parsed sentence.
 * <p>
 * Three passes are made over the configuration:
 * <ol>
 * <li>token pass: word sets, unknown-word counts (relative to
 * {@code referenceStats}, when present), alphanumeric counts and pos-tag
 * counts, skipping the artificial root token;</li>
 * <li>dependency pass: label counts, syntax depth and head-dependent
 * distance for every arc that is neither unattached nor labelled
 * {@code "ponct"};</li>
 * <li>non-projectivity pass: counts pairs of crossing arcs.</li>
 * </ol>
 *
 * @param parseConfiguration
 *            the parse configuration of the sentence to add to the statistics
 */
@Override
public void onNextParseConfiguration(ParseConfiguration parseConfiguration) {
    stats.sentenceCount++;
    stats.sentenceLengthStats.addValue(parseConfiguration.getPosTagSequence().size());

    // ---- token-level statistics ----
    for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
        if (posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG))
            continue;

        Token token = posTaggedToken.getToken();
        String word = token.getOriginalText();
        stats.words.add(word);
        if (referenceStats != null && !referenceStats.words.contains(word))
            stats.unknownTokenCount++;

        if (alphanumeric.matcher(word).find()) {
            String lowercase = word.toLowerCase(TalismaneSession.get(sessionId).getLocale());
            stats.lowerCaseWords.add(lowercase);
            stats.alphanumericCount++;
            if (referenceStats != null && !referenceStats.lowerCaseWords.contains(lowercase))
                stats.unknownAlphanumericCount++;
        }

        stats.tokenCount++;

        String posTagCode = posTaggedToken.getTag().getCode();
        Integer posTagCountObj = stats.posTagCounts.get(posTagCode);
        int posTagCount = posTagCountObj == null ? 0 : posTagCountObj.intValue();
        stats.posTagCounts.put(posTagCode, posTagCount + 1);
    }

    // ---- dependency-level statistics ----
    int maxDepth = 0;
    DescriptiveStatistics avgSyntaxDepthForSentenceStats = new DescriptiveStatistics();
    for (DependencyArc arc : parseConfiguration.getNonProjectiveDependencies()) {
        String label = arc.getLabel();
        Integer labelCountObj = stats.depLabelCounts.get(label);
        int labelCount = labelCountObj == null ? 0 : labelCountObj.intValue();
        stats.depLabelCounts.put(label, labelCount + 1);
        stats.totalDepCount++;

        // Unattached arcs and punctuation contribute to the label counts
        // above but not to depth/distance. The comparison is written
        // constant-first so that an arc with a null label and a non-root
        // head does not throw a NullPointerException.
        if (isUnattached(arc) || "ponct".equals(label))
            continue;

        // Depth = number of governing arcs followed until reaching an arc
        // whose head is the root (or until no governing arc is found).
        int depth = 0;
        DependencyArc theArc = arc;
        while (theArc != null && !theArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG)) {
            theArc = parseConfiguration.getGoverningDependency(theArc.getHead());
            depth++;
        }
        if (depth > maxDepth)
            maxDepth = depth;
        stats.syntaxDepthStats.addValue(depth);
        avgSyntaxDepthForSentenceStats.addValue(depth);

        int distance = Math.abs(arc.getHead().getToken().getIndex() - arc.getDependent().getToken().getIndex());
        stats.syntaxDistanceStats.addValue(distance);
    }
    stats.maxSyntaxDepthStats.addValue(maxDepth);
    if (avgSyntaxDepthForSentenceStats.getN() > 0)
        stats.avgSyntaxDepthStats.addValue(avgSyntaxDepthForSentenceStats.getMean());
    if (maxDepth > stats.maxDepthCorpus)
        stats.maxDepthCorpus = maxDepth;

    // ---- non-projectivity ----
    // we cheat a little bit by only allowing each arc to count once
    // there could be a situation where there are two independent
    // non-projective arcs
    // crossing the same mother arc, but we prefer here to underestimate,
    // as this phenomenon is quite rare.
    Set<DependencyArc> nonProjectiveArcs = new HashSet<DependencyArc>();
    int i = 0;
    for (DependencyArc arc : parseConfiguration.getNonProjectiveDependencies()) {
        i++;
        if (isUnattached(arc))
            continue;
        if (nonProjectiveArcs.contains(arc))
            continue;

        int headIndex = arc.getHead().getToken().getIndex();
        int depIndex = arc.getDependent().getToken().getIndex();
        int startIndex = Math.min(headIndex, depIndex);
        int endIndex = Math.max(headIndex, depIndex);

        int j = 0;
        for (DependencyArc otherArc : parseConfiguration.getNonProjectiveDependencies()) {
            j++;
            // only compare with arcs that come later in iteration order,
            // so that each pair is examined once
            if (j <= i)
                continue;
            if (isUnattached(otherArc))
                continue;
            if (nonProjectiveArcs.contains(otherArc))
                continue;

            int headIndex2 = otherArc.getHead().getToken().getIndex();
            int depIndex2 = otherArc.getDependent().getToken().getIndex();
            int startIndex2 = Math.min(headIndex2, depIndex2);
            int endIndex2 = Math.max(headIndex2, depIndex2);

            // the two arcs cross when exactly one end of the other arc
            // lies strictly inside the span of this arc
            boolean crossesFromLeft = startIndex2 < startIndex && endIndex2 > startIndex && endIndex2 < endIndex;
            boolean crossesFromRight = startIndex2 > startIndex && startIndex2 < endIndex && endIndex2 > endIndex;

            if (crossesFromLeft || crossesFromRight) {
                nonProjectiveArcs.add(arc);
                nonProjectiveArcs.add(otherArc);
                stats.nonProjectiveCount++;
                // avoid building the messages when debug logging is off
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Non-projective arcs in sentence: " + parseConfiguration.getSentence().getText());
                    LOG.debug(arc.toString());
                    LOG.debug(otherArc.toString());
                }
                break;
            }
        }
    }
}

/**
 * Tells whether an arc is "unattached": its head carries the root pos-tag
 * and its label is null or empty (e.g. punctuation hanging off the root).
 */
private boolean isUnattached(DependencyArc arc) {
    String label = arc.getLabel();
    return arc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && (label == null || label.length() == 0);
}
Use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
Class CombinedLexicalAttributesTest, method testCheckInternalMultipleEntries.
/**
 * Checks that {@code CombinedLexicalAttributesFeature}, asked for the Person
 * attribute of the token "demande" tagged "V", produces the combined outcome
 * "1;3".
 */
@Test
public void testCheckInternalMultipleEntries() throws Exception {
    // select the test configuration file and reload the configuration
    System.setProperty("config.file", "src/test/resources/testWithLex.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";

    // build a single pos-tagged token "demande" at index 1 of "je demande"
    Sentence text = new Sentence("je demande", sessionId);
    TokenSequence tokens = new TokenSequence(text, sessionId);
    int tokenStart = "je ".length();
    int tokenEnd = "je demande".length();
    Token verbToken = new Token("demande", tokens, 1, tokenStart, tokenEnd, sessionId);
    Decision verbDecision = new Decision("V", 1.0);
    final PosTaggedToken taggedVerb = new PosTaggedToken(verbToken, verbDecision, sessionId);

    // address function which always points at the token built above
    PosTaggedTokenAddressFunction<PosTaggerContext> fixedAddress = new AbstractPosTaggedTokenAddressFunction() {
        @Override
        protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
            return this.generateResult(taggedVerb);
        }
    };

    StringLiteralFeature<PosTaggedTokenWrapper> personAttribute = new StringLiteralFeature<>(LexicalAttribute.Person.name());
    CombinedLexicalAttributesFeature<PosTaggerContext> combinedFeature = new CombinedLexicalAttributesFeature<>(fixedAddress, personAttribute);

    PosTagSequence emptyHistory = new PosTagSequence(tokens);
    PosTaggerContext taggerContext = new PosTaggerContextImpl(verbToken, emptyHistory);
    RuntimeEnvironment runtime = new RuntimeEnvironment();

    String combined = combinedFeature.checkInternal(taggerContext, runtime).getOutcome();
    System.out.println(combined);
    assertEquals("1;3", combined);
}
Use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
Class CombinedLexicalAttributesTest, method testCheckInternalMultipleAttributes.
/**
 * Checks that {@code CombinedLexicalAttributesFeature}, asked for both the
 * Person and Number attributes of the token "blah" tagged "V", produces the
 * combined outcome "1;3|p;s".
 */
@Test
public void testCheckInternalMultipleAttributes() throws Exception {
    // select the test configuration file and reload the configuration
    System.setProperty("config.file", "src/test/resources/testWithLex.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";

    // build a single pos-tagged token "blah" covering the whole sentence
    Sentence text = new Sentence("blah", sessionId);
    TokenSequence tokens = new TokenSequence(text, sessionId);
    int tokenStart = "".length();
    int tokenEnd = "blah".length();
    Token blahToken = new Token("blah", tokens, 1, tokenStart, tokenEnd, sessionId);
    Decision verbDecision = new Decision("V", 1.0);
    final PosTaggedToken taggedBlah = new PosTaggedToken(blahToken, verbDecision, sessionId);

    // address function which always points at the token built above
    PosTaggedTokenAddressFunction<PosTaggerContext> fixedAddress = new AbstractPosTaggedTokenAddressFunction() {
        @Override
        protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
            return this.generateResult(taggedBlah);
        }
    };

    StringLiteralFeature<PosTaggedTokenWrapper> personAttribute = new StringLiteralFeature<>(LexicalAttribute.Person.name());
    StringLiteralFeature<PosTaggedTokenWrapper> numberAttribute = new StringLiteralFeature<>(LexicalAttribute.Number.name());
    CombinedLexicalAttributesFeature<PosTaggerContext> combinedFeature = new CombinedLexicalAttributesFeature<>(fixedAddress, personAttribute,
            numberAttribute);

    PosTagSequence emptyHistory = new PosTagSequence(tokens);
    PosTaggerContext taggerContext = new PosTaggerContextImpl(blahToken, emptyHistory);
    RuntimeEnvironment runtime = new RuntimeEnvironment();

    String combined = combinedFeature.checkInternal(taggerContext, runtime).getOutcome();
    System.out.println(combined);
    assertEquals("1;3|p;s", combined);
}
Use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
Class LexicalAttributeFeatureTest, method testCheckInternalMultipleAttributes.
/**
 * Checks that {@code LexicalAttributeFeature}, asked for both the Person and
 * Number attributes of the token "blah" tagged "V", returns exactly three
 * outcomes, each being one of "3|p", "1|s" or "3|s".
 */
@Test
public void testCheckInternalMultipleAttributes() throws Exception {
    // select the test configuration file and reload the configuration
    System.setProperty("config.file", "src/test/resources/testWithLex.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";

    // build a single pos-tagged token "blah" covering the whole sentence
    Sentence text = new Sentence("blah", sessionId);
    TokenSequence tokens = new TokenSequence(text, sessionId);
    int tokenStart = "".length();
    int tokenEnd = "blah".length();
    Token blahToken = new Token("blah", tokens, 1, tokenStart, tokenEnd, sessionId);
    Decision verbDecision = new Decision("V", 1.0);
    final PosTaggedToken taggedBlah = new PosTaggedToken(blahToken, verbDecision, sessionId);

    // address function which always points at the token built above
    PosTaggedTokenAddressFunction<PosTaggerContext> fixedAddress = new AbstractPosTaggedTokenAddressFunction() {
        @Override
        protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
            return this.generateResult(taggedBlah);
        }
    };

    StringLiteralFeature<PosTaggedTokenWrapper> personAttribute = new StringLiteralFeature<>(LexicalAttribute.Person.name());
    StringLiteralFeature<PosTaggedTokenWrapper> numberAttribute = new StringLiteralFeature<>(LexicalAttribute.Number.name());
    LexicalAttributeFeature<PosTaggerContext> attributeFeature = new LexicalAttributeFeature<>(fixedAddress, personAttribute, numberAttribute);

    PosTagSequence emptyHistory = new PosTagSequence(tokens);
    PosTaggerContext taggerContext = new PosTaggerContextImpl(blahToken, emptyHistory);
    RuntimeEnvironment runtime = new RuntimeEnvironment();

    List<WeightedOutcome<String>> weightedOutcomes = attributeFeature.checkInternal(taggerContext, runtime).getOutcome();
    System.out.println(weightedOutcomes);

    // every returned outcome must be one of the three accepted combinations
    for (WeightedOutcome<String> weighted : weightedOutcomes) {
        String value = weighted.getOutcome();
        boolean accepted = "3|p".equals(value) || "1|s".equals(value) || "3|s".equals(value);
        assertTrue(accepted);
    }
    assertEquals(3, weightedOutcomes.size());
}
Use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
Class LexicalAttributeFeatureTest, method testCheckInternal.
/**
 * Checks that {@code LexicalAttributeFeature}, asked for the Gender
 * attribute of the token "dame" tagged "NC", returns a single outcome "f".
 */
@Test
public void testCheckInternal() throws Exception {
    // select the test configuration file and reload the configuration
    System.setProperty("config.file", "src/test/resources/testWithLex.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";

    // build a single pos-tagged token "dame" at index 1 of "une dame"
    Sentence text = new Sentence("une dame", sessionId);
    TokenSequence tokens = new TokenSequence(text, sessionId);
    int tokenStart = "une ".length();
    int tokenEnd = "une dame".length();
    Token nounToken = new Token("dame", tokens, 1, tokenStart, tokenEnd, sessionId);
    Decision nounDecision = new Decision("NC", 1.0);
    final PosTaggedToken taggedNoun = new PosTaggedToken(nounToken, nounDecision, sessionId);

    // address function which always points at the token built above
    PosTaggedTokenAddressFunction<PosTaggerContext> fixedAddress = new AbstractPosTaggedTokenAddressFunction() {
        @Override
        protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
            return this.generateResult(taggedNoun);
        }
    };

    StringLiteralFeature<PosTaggedTokenWrapper> genderAttribute = new StringLiteralFeature<>(LexicalAttribute.Gender.name());
    LexicalAttributeFeature<PosTaggerContext> attributeFeature = new LexicalAttributeFeature<>(fixedAddress, genderAttribute);

    PosTagSequence emptyHistory = new PosTagSequence(tokens);
    PosTaggerContext taggerContext = new PosTaggerContextImpl(nounToken, emptyHistory);
    RuntimeEnvironment runtime = new RuntimeEnvironment();

    List<WeightedOutcome<String>> weightedOutcomes = attributeFeature.checkInternal(taggerContext, runtime).getOutcome();
    System.out.println(weightedOutcomes);

    WeightedOutcome<String> first = weightedOutcomes.get(0);
    assertEquals("f", first.getOutcome());
    assertEquals(1, weightedOutcomes.size());
}
Aggregations