Use of com.joliciel.talismane.lexicon.Diacriticizer in project talismane by joliciel-informatique.
From the class TalismaneSession, method getDiacriticizer.
/**
 * Returns the {@link Diacriticizer} held by this object, creating it on first
 * request.
 * <p>
 * If none is held yet, a new one is built from the merged lexicon and then
 * given this object's locale and lowercase preferences. Later calls return
 * the stored instance without rebuilding it.
 *
 * @return the stored diacriticizer, never {@code null} on normal return
 */
public Diacriticizer getDiacriticizer() {
    // Fast path: already built (or explicitly set elsewhere).
    if (diacriticizer != null) {
        return diacriticizer;
    }

    // Store first, then configure, so the field holds the instance while
    // the setters run.
    diacriticizer = new Diacriticizer(this.getMergedLexicon());
    diacriticizer.setLocale(this.getLocale());
    diacriticizer.setLowercasePreferences(lowercasePreferences);
    return diacriticizer;
}
Use of com.joliciel.talismane.lexicon.Diacriticizer in project talismane by joliciel-informatique.
From the class UppercaseSeriesFilter, method getKnownWord.
/**
 * Maps a word to a "known" form using the diacriticizer of the given session.
 * <p>
 * If the diacriticizer returns at least one form for the word, the first form
 * in the returned set's iteration order is used. Otherwise the first character
 * of the word is kept as-is and the remainder is lowercased with the session's
 * locale. An empty word with no known form is returned unchanged.
 *
 * @param sessionId id used to look up the {@code TalismaneSession}
 * @param word the word to look up
 * @return the known form, or the fallback described above
 */
public static String getKnownWord(String sessionId, String word) {
    Set<String> candidates = TalismaneSession.get(sessionId).getDiacriticizer().diacriticize(word);

    // A form reported by the diacriticizer wins over any fallback.
    if (!candidates.isEmpty()) {
        return candidates.iterator().next();
    }

    // Nothing to re-case in an empty word.
    if (word.isEmpty()) {
        return word;
    }

    // Fallback: keep the leading character, lowercase everything after it.
    String head = word.substring(0, 1);
    String tail = word.substring(1).toLowerCase(TalismaneSession.get(sessionId).getLocale());
    return head + tail;
}
Use of com.joliciel.talismane.lexicon.Diacriticizer in project talismane by joliciel-informatique.
From the class LowercaseKnownFirstWordFilterTest, method testReplace.
/**
 * Applies {@code LowercaseKnownFirstWordFilter} to
 * "J'avais oublié : Il est Malade." using a mocked diacriticizer that only
 * knows "J'" and "Il". Expects those two tokens to come out lowercased and
 * every other token, including "Malade", to keep its original text.
 */
@Test
public void testReplace() throws Exception {
    // Mocked lexicon lookups: only these two capitalised forms are stubbed.
    Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);
    when(mockDiacriticizer.diacriticize("J'")).thenReturn(new HashSet<>(Arrays.asList("j'")));
    when(mockDiacriticizer.diacriticize("Il")).thenReturn(new HashSet<>(Arrays.asList("il")));

    // Point the configuration at the test file and force a reload.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded value is not read anywhere below in this test.
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);
    LowercaseKnownFirstWordFilter lowercaseFilter = new LowercaseKnownFirstWordFilter(sessionId);

    String input = "J'avais oublié : Il est Malade.";
    Sentence sentence = new Sentence(input, sessionId);
    TokenSequence tokens = new TokenSequence(sentence, sessionId);

    // {start, end} offsets of each token, spelled out as prefix lengths of
    // the input so the boundaries stay readable.
    int[][] boundaries = {
        { "".length(), "J'".length() },
        { "J'".length(), "J'avais".length() },
        { "J'avais ".length(), "J'avais oublié".length() },
        { "J'avais oublié ".length(), "J'avais oublié :".length() },
        { "J'avais oublié : ".length(), "J'avais oublié : Il".length() },
        { "J'avais oublié : Il ".length(), "J'avais oublié : Il est".length() },
        { "J'avais oublié : Il est ".length(), "J'avais oublié : Il est Malade".length() },
        { "J'avais oublié : Il est Malade".length(), "J'avais oublié : Il est Malade.".length() },
    };
    for (int[] boundary : boundaries) {
        tokens.addToken(boundary[0], boundary[1]);
    }

    lowercaseFilter.apply(tokens);
    System.out.println(tokens);

    // Join the token texts with a trailing '|' after each for comparison.
    StringBuilder joined = new StringBuilder();
    for (Token token : tokens) {
        joined.append(token.getText());
        joined.append('|');
    }
    assertEquals("j'|avais|oublié|:|il|est|Malade|.|", joined.toString());
}
Use of com.joliciel.talismane.lexicon.Diacriticizer in project talismane by joliciel-informatique.
From the class LowercaseKnownFirstWordFilterTest, method testReplaceLongWord.
/**
 * Applies {@code LowercaseKnownFirstWordFilter} to
 * "Aujourd'hui il vient. Parce que..." using a mocked diacriticizer that only
 * knows "Aujourd'hui" and "Parce que". Expects both of those tokens to come
 * out lowercased and the remaining tokens to keep their original text.
 */
@Test
public void testReplaceLongWord() throws Exception {
    // Mocked lexicon lookups: only these two capitalised forms are stubbed.
    Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);
    when(mockDiacriticizer.diacriticize("Aujourd'hui")).thenReturn(new HashSet<>(Arrays.asList("aujourd'hui")));
    when(mockDiacriticizer.diacriticize("Parce que")).thenReturn(new HashSet<>(Arrays.asList("parce que")));

    // Point the configuration at the test file and force a reload.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded value is not read anywhere below in this test.
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);
    LowercaseKnownFirstWordFilter lowercaseFilter = new LowercaseKnownFirstWordFilter(sessionId);

    String input = "Aujourd'hui il vient. Parce que...";
    Sentence sentence = new Sentence(input, sessionId);
    TokenSequence tokens = new TokenSequence(sentence, sessionId);

    // {start, end} offsets of each token, spelled out as prefix lengths of
    // the input. Note that "Parce que" is added as a single token.
    int[][] boundaries = {
        { "".length(), "Aujourd'hui".length() },
        { "Aujourd'hui ".length(), "Aujourd'hui il".length() },
        { "Aujourd'hui il ".length(), "Aujourd'hui il vient".length() },
        { "Aujourd'hui il vient".length(), "Aujourd'hui il vient.".length() },
        { "Aujourd'hui il vient. ".length(), "Aujourd'hui il vient. Parce que".length() },
        { "Aujourd'hui il vient. Parce que".length(), "Aujourd'hui il vient. Parce que...".length() },
    };
    for (int[] boundary : boundaries) {
        tokens.addToken(boundary[0], boundary[1]);
    }

    lowercaseFilter.apply(tokens);
    System.out.println(tokens);

    // Join the token texts with a trailing '|' after each for comparison.
    StringBuilder joined = new StringBuilder();
    for (Token token : tokens) {
        joined.append(token.getText());
        joined.append('|');
    }
    assertEquals("aujourd'hui|il|vient|.|parce que|...|", joined.toString());
}
Use of com.joliciel.talismane.lexicon.Diacriticizer in project talismane by joliciel-informatique.
From the class LowercaseKnownFirstWordFilterTest, method testReplace3.
/**
 * Applies {@code LowercaseKnownFirstWordFilter} to "Georges est là." using a
 * mocked diacriticizer on which no lookups are stubbed. Expects every token,
 * including the leading "Georges", to keep its original text.
 */
@Test
public void testReplace3() throws Exception {
    // No stubbing here: the mock answers every lookup with its default.
    Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);

    // Point the configuration at the test file and force a reload.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded value is not read anywhere below in this test.
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);
    LowercaseKnownFirstWordFilter lowercaseFilter = new LowercaseKnownFirstWordFilter(sessionId);

    String input = "Georges est là.";
    Sentence sentence = new Sentence(input, sessionId);
    TokenSequence tokens = new TokenSequence(sentence, sessionId);

    // {start, end} offsets of each token, spelled out as prefix lengths of
    // the input so the boundaries stay readable.
    int[][] boundaries = {
        { "".length(), "Georges".length() },
        { "Georges ".length(), "Georges est".length() },
        { "Georges est ".length(), "Georges est là".length() },
        { "Georges est là".length(), "Georges est là.".length() },
    };
    for (int[] boundary : boundaries) {
        tokens.addToken(boundary[0], boundary[1]);
    }

    lowercaseFilter.apply(tokens);
    System.out.println(tokens);

    // Join the token texts with a trailing '|' after each for comparison.
    StringBuilder joined = new StringBuilder();
    for (Token token : tokens) {
        joined.append(token.getText());
        joined.append('|');
    }
    assertEquals("Georges|est|là|.|", joined.toString());
}
Aggregations