Use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
From the class PosTaggerStatisticsWriter, method onNextPosTagSequence.
/**
 * Folds one pos-tagged sentence into the running corpus statistics.
 * <p>
 * The sentence counter and sentence-length statistics are updated once per
 * call. Every token whose tag is not {@code PosTag.ROOT_POS_TAG} then
 * contributes to the word sets, the known/unknown counters (relative to the
 * lexicon and, when available, to {@code referenceStats}), the open/closed
 * class counters, the alphanumeric counters and the per-tag-code counts.
 *
 * @param posTagSequence
 *          the pos-tagged sentence to account for
 * @throws TalismaneException
 *           declared by the overridden method
 */
@Override
public void onNextPosTagSequence(PosTagSequence posTagSequence) throws TalismaneException {
  stats.sentenceCount++;
  stats.sentenceLengthStats.addValue(posTagSequence.size());

  for (PosTaggedToken taggedToken : posTagSequence) {
    PosTag tag = taggedToken.getTag();
    // the artificial root token is not part of the statistics
    if (tag.equals(PosTag.ROOT_POS_TAG)) {
      continue;
    }

    Token token = taggedToken.getToken();

    // a token counts as known in the lexicon when it has at least one possible pos-tag
    boolean inLexicon = token.getPossiblePosTags().size() > 0;

    String text = token.getOriginalText();
    stats.words.add(text);

    // without reference statistics every token counts as unknown in the reference corpus
    boolean inRefCorpus = referenceStats != null && referenceStats.words.contains(text);

    if (!inLexicon) {
      stats.unknownInLexiconCount++;
    }

    // tags that are neither CLOSED nor OPEN touch none of the class counters
    PosTagOpenClassIndicator openClass = tag.getOpenClassIndicator();
    if (openClass == PosTagOpenClassIndicator.CLOSED) {
      stats.closedClassCount++;
      if (!inRefCorpus) {
        stats.closedClassUnknownInRefCorpus++;
      }
      if (!inLexicon) {
        stats.closedClassUnknownInLexicon++;
      }
    } else if (openClass == PosTagOpenClassIndicator.OPEN) {
      stats.openClassCount++;
      if (!inRefCorpus) {
        stats.openClassUnknownInRefCorpus++;
      }
      if (!inLexicon) {
        stats.openClassUnknownInLexicon++;
      }
    }

    if (!inRefCorpus) {
      stats.unknownTokenCount++;
    }

    // alphanumeric counters only apply when the pattern finds a match in the token text
    if (alphanumeric.matcher(text).find()) {
      String lowered = text.toLowerCase(TalismaneSession.get(sessionId).getLocale());
      stats.lowerCaseWords.add(lowered);
      stats.alphanumericCount++;
      if (!inRefCorpus) {
        stats.unknownAlphanumericCount++;
      }
      if (!inLexicon) {
        stats.unknownAlphaInLexiconCount++;
      }
    }

    stats.tokenCount++;

    // increment the occurrence count for this tag code, starting from zero when absent
    String code = tag.getCode();
    Integer previous = stats.posTagCounts.get(code);
    int updated = (previous == null) ? 1 : previous.intValue() + 1;
    stats.posTagCounts.put(code, updated);
  }
}
Use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
From the class UppercaseSeriesFilter, method apply.
/**
 * Scans the token sequence for runs of upper-case tokens and hands every run
 * of more than one token to {@code checkSequence}.
 * <p>
 * A token joins the current run when it contains at least one upper-case
 * character and no lower-case character. Tokens with neither (for instance
 * punctuation or numbers) are skipped without ending the run, and so are
 * empty tokens. Any token containing a lower-case character ends the run.
 *
 * @param tokenSequence
 *          the token sequence to examine
 */
@Override
public void apply(TokenSequence tokenSequence) {
  List<Token> run = new ArrayList<Token>();

  for (Token token : tokenSequence) {
    String text = token.getText();
    if (text.isEmpty()) {
      continue;
    }

    // scan until the first lower-case character, noting any upper-case seen on the way
    boolean sawUpper = false;
    boolean sawLower = false;
    int pos = 0;
    while (pos < text.length() && !sawLower) {
      char ch = text.charAt(pos);
      pos++;
      if (Character.isUpperCase(ch)) {
        sawUpper = true;
      }
      sawLower = Character.isLowerCase(ch);
    }

    if (sawLower) {
      // a lower-case character closes the current run
      if (run.size() > 1) {
        this.checkSequence(run);
      }
      run.clear();
    } else if (sawUpper) {
      run.add(token);
    }
    // otherwise: neither case present, might be punctuation or a number in
    // the middle of an upper-case run, so leave the run untouched
  }

  // handle a run that extends to the end of the sequence
  if (run.size() > 1) {
    this.checkSequence(run);
  }
}
Use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
From the class PatternEventStream, method hasNext.
/**
 * Indicates whether another pattern match is available.
 * <p>
 * When the matches collected for the current sentence have all been consumed
 * ({@code currentIndex} has reached the size of {@code currentPatternMatches}),
 * further sentences are read from the corpus reader until one yields at least
 * one pattern match or the reader has no more sentences. For each sentence
 * read, {@code currentPatternMatches}, {@code currentOutcomes} and
 * {@code currentIndex} are rebuilt.
 *
 * @return {@code true} if {@code currentPatternMatches} is non-null on exit
 * @throws TalismaneException
 *           propagated from the calls made while reading and matching
 * @throws IOException
 *           propagated from the calls made while reading and matching
 */
@Override
public boolean hasNext() throws TalismaneException, IOException {
  // discard the current batch once every match in it has been consumed
  if (currentPatternMatches != null && currentIndex == currentPatternMatches.size()) {
    currentPatternMatches = null;
  }

  while (currentPatternMatches == null) {
    if (!this.corpusReader.hasNextSentence()) {
      break;
    }

    currentPatternMatches = new ArrayList<TokenPatternMatch>();
    currentOutcomes = new ArrayList<TokeniserOutcome>();
    currentIndex = 0;

    TokenSequence corpusSequence = corpusReader.nextTokenSequence();
    List<Integer> corpusSplits = corpusSequence.getTokenSplits();
    String sentenceText = corpusSequence.getSentence().getText().toString();
    LOG.debug("Sentence: " + sentenceText);

    // rebuild the sentence with default tokens, then tag it using the corpus splits
    Sentence sentence = new Sentence(sentenceText, sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    tokenSequence.findDefaultTokens();
    List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(tokenSequence);
    List<TaggedToken<TokeniserOutcome>> taggedTokens = this.getTaggedTokens(tokenSequence, corpusSplits);

    // check if anything matches each pattern
    for (TokenPattern pattern : this.tokeniserPatternManager.getParsedTestPatterns()) {
      for (TokenPatternMatchSequence matchSequence : pattern.match(tokenSequence)) {
        if (LOG.isTraceEnabled()) {
          LOG.trace("Matched pattern: " + pattern + ": " + matchSequence.getTokenSequence());
        }

        // check if the entire pattern is separated or joined
        TokeniserOutcome sharedOutcome = null;
        TokeniserOutcome fallbackOutcome = null;
        boolean mismatch = false;
        TokenPatternMatch representative = null;

        for (Token token : matchSequence.getTokensToCheck()) {
          // until one is found, look for the pattern match attached to this token
          if (representative == null) {
            for (TokenPatternMatch candidate : matchSequence.getTokenPatternMatches()) {
              if (candidate.getToken().equals(token)) {
                representative = candidate;
                break;
              }
            }
          }

          int index = token.getIndexWithWhiteSpace();
          TaggedToken<TokeniserOutcome> tagged = taggedTokens.get(index);
          if (sharedOutcome == null) {
            sharedOutcome = tagged.getTag();
            fallbackOutcome = defaultOutcomes.get(index);
          } else if (tagged.getTag() != sharedOutcome) {
            // per the original author's note, this should only happen when
            // two patterns overlap, e.g. "aussi bien que" and "bien que",
            // or "plutot que" and "plutot que de", AND the outer pattern
            // is separated while the inner pattern is joined
            LOG.debug("Mismatch in pattern: " + representative + ", " + tagged);
            mismatch = true;
          }
        }

        currentPatternMatches.add(representative);
        // on a mismatch, fall back to the default outcome at the first checked token
        currentOutcomes.add(mismatch ? fallbackOutcome : sharedOutcome);
      }
    }

    // nothing matched in this sentence: reset so that the loop reads the next one
    if (currentPatternMatches.isEmpty()) {
      currentPatternMatches = null;
      currentOutcomes = null;
    }
  }

  return currentPatternMatches != null;
}
Use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
From the class LowercaseKnownFirstWordFilterTest, method testReplace.
/**
 * Applies {@code LowercaseKnownFirstWordFilter} to a hand-tokenised sentence
 * and expects "J'" and "Il" to come out in lower case while "Malade" keeps
 * its capital letter.
 * <p>
 * The diacriticizer is mocked to return "j'" for "J'" and "il" for "Il".
 */
@Test
public void testReplace() throws Exception {
  Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);
  when(mockDiacriticizer.diacriticize("J'")).thenReturn(new HashSet<>(Arrays.asList("j'")));
  when(mockDiacriticizer.diacriticize("Il")).thenReturn(new HashSet<>(Arrays.asList("il")));

  // point the configuration at the test resources and force a reload;
  // the loaded Config object itself is not used further in this method
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();

  final String sessionId = "test";
  TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);
  LowercaseKnownFirstWordFilter lowercaseFilter = new LowercaseKnownFirstWordFilter(sessionId);

  String input = "J'avais oublié : Il est Malade.";
  Sentence sentence = new Sentence(input, sessionId);
  TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);

  // start/end offsets of each token, written as prefix lengths for readability
  int[][] spans = {
      { "".length(), "J'".length() },
      { "J'".length(), "J'avais".length() },
      { "J'avais ".length(), "J'avais oublié".length() },
      { "J'avais oublié ".length(), "J'avais oublié :".length() },
      { "J'avais oublié : ".length(), "J'avais oublié : Il".length() },
      { "J'avais oublié : Il ".length(), "J'avais oublié : Il est".length() },
      { "J'avais oublié : Il est ".length(), "J'avais oublié : Il est Malade".length() },
      { "J'avais oublié : Il est Malade".length(), "J'avais oublié : Il est Malade.".length() } };
  for (int[] span : spans) {
    tokenSequence.addToken(span[0], span[1]);
  }

  lowercaseFilter.apply(tokenSequence);
  System.out.println(tokenSequence);

  // join the resulting token texts with a trailing '|' after each one
  StringBuilder joined = new StringBuilder();
  for (Token token : tokenSequence) {
    joined.append(token.getText()).append('|');
  }
  assertEquals("j'|avais|oublié|:|il|est|Malade|.|", joined.toString());
}
Use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
From the class LowercaseKnownFirstWordFilterTest, method testReplaceLongWord.
/**
 * Applies {@code LowercaseKnownFirstWordFilter} to a hand-tokenised text in
 * which "Aujourd'hui" and the two-word token "Parce que" start with a
 * capital, and expects both to come out in lower case.
 * <p>
 * The diacriticizer is mocked to return "aujourd'hui" for "Aujourd'hui" and
 * "parce que" for "Parce que".
 */
@Test
public void testReplaceLongWord() throws Exception {
  Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);
  when(mockDiacriticizer.diacriticize("Aujourd'hui")).thenReturn(new HashSet<>(Arrays.asList("aujourd'hui")));
  when(mockDiacriticizer.diacriticize("Parce que")).thenReturn(new HashSet<>(Arrays.asList("parce que")));

  // point the configuration at the test resources and force a reload;
  // the loaded Config object itself is not used further in this method
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();

  final String sessionId = "test";
  TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);
  LowercaseKnownFirstWordFilter lowercaseFilter = new LowercaseKnownFirstWordFilter(sessionId);

  String input = "Aujourd'hui il vient. Parce que...";
  Sentence sentence = new Sentence(input, sessionId);
  TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);

  // start/end offsets of each token, written as prefix lengths for readability
  int[][] spans = {
      { "".length(), "Aujourd'hui".length() },
      { "Aujourd'hui ".length(), "Aujourd'hui il".length() },
      { "Aujourd'hui il ".length(), "Aujourd'hui il vient".length() },
      { "Aujourd'hui il vient".length(), "Aujourd'hui il vient.".length() },
      { "Aujourd'hui il vient. ".length(), "Aujourd'hui il vient. Parce que".length() },
      { "Aujourd'hui il vient. Parce que".length(), "Aujourd'hui il vient. Parce que...".length() } };
  for (int[] span : spans) {
    tokenSequence.addToken(span[0], span[1]);
  }

  lowercaseFilter.apply(tokenSequence);
  System.out.println(tokenSequence);

  // join the resulting token texts with a trailing '|' after each one
  StringBuilder joined = new StringBuilder();
  for (Token token : tokenSequence) {
    joined.append(token.getText()).append('|');
  }
  assertEquals("aujourd'hui|il|vient|.|parce que|...|", joined.toString());
}
Aggregations