Use of com.joliciel.talismane.tokeniser.Token in the talismane project by joliciel-informatique.
Class LowercaseKnownFirstWordFilterTest, method testReplace3.
/**
 * Applies a {@code LowercaseKnownFirstWordFilter} to the sentence
 * "Georges est là." with a mocked (un-stubbed) diacriticizer and checks
 * that every token text, including the capitalised first word, comes out
 * unchanged.
 */
@Test
public void testReplace3() throws Exception {
    // Load the test configuration from scratch.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded Config is not used directly; the call is kept in case the
    // session relies on the freshly loaded configuration (not visible here).
    ConfigFactory.load();

    final String sessionId = "test";
    Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);
    TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);

    LowercaseKnownFirstWordFilter lowercaseFilter = new LowercaseKnownFirstWordFilter(sessionId);

    Sentence sentence = new Sentence("Georges est là.", sessionId);
    TokenSequence tokens = new TokenSequence(sentence, sessionId);

    // Each pair is {text before the token, text up to the end of the token};
    // the lengths of the two strings give the token's start and end offsets.
    String[][] tokenBounds = {
        { "", "Georges" },
        { "Georges ", "Georges est" },
        { "Georges est ", "Georges est là" },
        { "Georges est là", "Georges est là." },
    };
    for (String[] bounds : tokenBounds) {
        tokens.addToken(bounds[0].length(), bounds[1].length());
    }

    lowercaseFilter.apply(tokens);
    System.out.println(tokens);

    // Render the token texts as a pipe-terminated list for a single comparison.
    StringBuilder rendered = new StringBuilder();
    for (Token token : tokens) {
        rendered.append(token.getText()).append('|');
    }
    assertEquals("Georges|est|là|.|", rendered.toString());
}
Use of com.joliciel.talismane.tokeniser.Token in the talismane project by joliciel-informatique.
Class UppercaseSeriesFilterTest, method testReplace.
/**
 * Applies an {@code UppercaseSeriesFilter} to a sentence containing several
 * runs of upper-case words and checks the resulting token texts. The
 * diacriticizer is mocked so that only "VEUX", "SAVOIR", "L'" and
 * "AMERIQUE" have stubbed replacements.
 */
@Test
public void testReplace() throws Exception {
    // Stub the replacements the mocked diacriticizer knows about.
    Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);
    when(mockDiacriticizer.diacriticize("VEUX")).thenReturn(new HashSet<>(Arrays.asList("veux")));
    when(mockDiacriticizer.diacriticize("SAVOIR")).thenReturn(new HashSet<>(Arrays.asList("savoir")));
    when(mockDiacriticizer.diacriticize("L'")).thenReturn(new HashSet<>(Arrays.asList("l'")));
    when(mockDiacriticizer.diacriticize("AMERIQUE")).thenReturn(new HashSet<>(Arrays.asList("Amérique")));

    // Load the test configuration from scratch.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded Config is not used directly; the call is kept in case the
    // session relies on the freshly loaded configuration (not visible here).
    ConfigFactory.load();

    final String sessionId = "test";
    TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);

    UppercaseSeriesFilter uppercaseFilter = new UppercaseSeriesFilter(sessionId);

    Sentence sentence = new Sentence("Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'AMERIQUE!", sessionId);
    TokenSequence tokens = new TokenSequence(sentence, sessionId);

    // Each pair is {text before the token, text up to the end of the token};
    // the lengths of the two strings give the token's start and end offsets.
    String[][] tokenBounds = {
        { "", "Je" },
        { "Je ", "Je VEUX" },
        { "Je VEUX ", "Je VEUX SAVOIR" },
        { "Je VEUX SAVOIR ", "Je VEUX SAVOIR la" },
        { "Je VEUX SAVOIR la ", "Je VEUX SAVOIR la VERITE" },
        { "Je VEUX SAVOIR la VERITE", "Je VEUX SAVOIR la VERITE," },
        { "Je VEUX SAVOIR la VERITE, ", "Je VEUX SAVOIR la VERITE, je" },
        { "Je VEUX SAVOIR la VERITE, je ", "Je VEUX SAVOIR la VERITE, je VEUX" },
        { "Je VEUX SAVOIR la VERITE, je VEUX ", "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR" },
        { "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR ", "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA" },
        { "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA ", "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE" },
        { "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE ", "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur" },
        { "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur ", "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'" },
        { "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'", "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'AMERIQUE" },
        { "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'AMERIQUE", "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'AMERIQUE!" },
    };
    for (String[] bounds : tokenBounds) {
        tokens.addToken(bounds[0].length(), bounds[1].length());
    }

    uppercaseFilter.apply(tokens);
    System.out.println(tokens);

    // Render the token texts as a pipe-terminated list for a single comparison.
    StringBuilder rendered = new StringBuilder();
    for (Token token : tokens) {
        rendered.append(token.getText()).append('|');
    }
    assertEquals("Je|veux|savoir|la|VERITE|,|je|veux|savoir|La|Verite|sur|l'|Amérique|!|", rendered.toString());
}
Use of com.joliciel.talismane.tokeniser.Token in the talismane project by joliciel-informatique.
Class TokenPatternTest, method testMatch.
/**
 * Matches the pattern {@code .+'.+} against a mocked token sequence spelling
 * "Moi, j'aim'rais" and checks that two overlapping three-token matches are
 * produced: (j, ', aim) and (aim, ', rais). The token "aim" belongs to both
 * and is therefore expected to carry two match entries.
 */
@Test
public void testMatch() throws TalismaneException {
    String separatorRegex = "[\\s\\p{Punct}]";

    // Backing lists returned by getMatches() on tokens 3 to 7; the assertions
    // below read them back through the mocks.
    List<TokenPatternMatch> matchesOfJ = new ArrayList<>();
    List<TokenPatternMatch> matchesOfFirstApostrophe = new ArrayList<>();
    List<TokenPatternMatch> matchesOfAim = new ArrayList<>();
    List<TokenPatternMatch> matchesOfSecondApostrophe = new ArrayList<>();
    List<TokenPatternMatch> matchesOfRais = new ArrayList<>();

    TokenSequence tokenSequence = mock(TokenSequence.class);
    Token moi = mock(Token.class);
    Token comma = mock(Token.class);
    Token space = mock(Token.class);
    Token j = mock(Token.class);
    Token firstApostrophe = mock(Token.class);
    Token aim = mock(Token.class);
    Token secondApostrophe = mock(Token.class);
    Token rais = mock(Token.class);

    List<Token> allTokens = Arrays.asList(moi, comma, space, j, firstApostrophe, aim, secondApostrophe, rais);
    when(tokenSequence.listWithWhiteSpace()).thenReturn(allTokens);

    // Token 0: "Moi"
    when(moi.getAnalyisText()).thenReturn("Moi");
    when(moi.isSeparator()).thenReturn(false);
    when(moi.getIndex()).thenReturn(0);

    // Token 1: ","
    when(comma.getAnalyisText()).thenReturn(",");
    when(comma.isSeparator()).thenReturn(true);
    when(comma.getIndex()).thenReturn(1);

    // Token 2: " "
    when(space.getAnalyisText()).thenReturn(" ");
    when(space.isSeparator()).thenReturn(true);
    when(space.getIndex()).thenReturn(2);

    // Token 3: "j"
    when(j.getAnalyisText()).thenReturn("j");
    when(j.isSeparator()).thenReturn(false);
    when(j.getIndex()).thenReturn(3);
    when(j.getMatches()).thenReturn(matchesOfJ);

    // Token 4: "'"
    when(firstApostrophe.getAnalyisText()).thenReturn("'");
    when(firstApostrophe.isSeparator()).thenReturn(true);
    when(firstApostrophe.getIndex()).thenReturn(4);
    when(firstApostrophe.getMatches()).thenReturn(matchesOfFirstApostrophe);

    // Token 5: "aim"
    when(aim.getAnalyisText()).thenReturn("aim");
    when(aim.isSeparator()).thenReturn(false);
    when(aim.getIndex()).thenReturn(5);
    when(aim.getMatches()).thenReturn(matchesOfAim);

    // Token 6: "'"
    when(secondApostrophe.getAnalyisText()).thenReturn("'");
    when(secondApostrophe.isSeparator()).thenReturn(true);
    when(secondApostrophe.getIndex()).thenReturn(6);
    when(secondApostrophe.getMatches()).thenReturn(matchesOfSecondApostrophe);

    // Token 7: "rais"
    when(rais.getAnalyisText()).thenReturn("rais");
    when(rais.isSeparator()).thenReturn(false);
    when(rais.getIndex()).thenReturn(7);
    when(rais.getMatches()).thenReturn(matchesOfRais);

    Pattern separatorPattern = Pattern.compile(separatorRegex, Pattern.UNICODE_CHARACTER_CLASS);
    TokenPattern pattern = new TokenPattern(".+'.+", separatorPattern);
    List<TokenPatternMatchSequence> found = pattern.match(tokenSequence);
    assertEquals(2, found.size());

    // First match: j ' aim
    List<Token> matched = found.get(0).getTokenSequence();
    assertEquals(3, matched.size());

    Token current = matched.get(0);
    assertEquals(3, current.getIndex());
    assertEquals("j", current.getAnalyisText());
    assertEquals(1, current.getMatches().size());
    assertEquals(0, current.getMatches().get(0).getIndex());

    current = matched.get(1);
    assertEquals(4, current.getIndex());
    assertEquals("'", current.getAnalyisText());
    assertEquals(1, current.getMatches().size());
    assertEquals(1, current.getMatches().get(0).getIndex());

    current = matched.get(2);
    assertEquals(5, current.getIndex());
    assertEquals("aim", current.getAnalyisText());
    // "aim" is shared by both matches, hence two entries.
    assertEquals(2, current.getMatches().size());
    assertEquals(2, current.getMatches().get(0).getIndex());

    // Second match: aim ' rais
    matched = found.get(1).getTokenSequence();
    assertEquals(3, matched.size());

    current = matched.get(0);
    assertEquals(5, current.getIndex());
    assertEquals("aim", current.getAnalyisText());
    assertEquals(2, current.getMatches().size());
    // The second entry on "aim" is the one recorded for this match.
    assertEquals(0, current.getMatches().get(1).getIndex());

    current = matched.get(1);
    assertEquals(6, current.getIndex());
    assertEquals("'", current.getAnalyisText());
    assertEquals(1, current.getMatches().size());
    assertEquals(1, current.getMatches().get(0).getIndex());

    current = matched.get(2);
    assertEquals(7, current.getIndex());
    assertEquals("rais", current.getAnalyisText());
    assertEquals(1, current.getMatches().size());
    assertEquals(2, current.getMatches().get(0).getIndex());
}
Use of com.joliciel.talismane.tokeniser.Token in the talismane project by joliciel-informatique.
Class PatternIndexInSentenceFeature, method checkInternal.
/**
 * Evaluates the wrapped token-pattern feature for the given token and, when
 * the token carries a match of that pattern at the pattern's first
 * index-to-test, returns the index of the first token of the pattern.
 *
 * @param tokenWrapper wrapper around the token being examined
 * @param env the runtime environment, passed through to the wrapped feature
 * @return a result holding the index of the pattern's first token, or
 *         {@code null} when the wrapped feature yields no result or no
 *         suitable match is found on the token
 * @throws TalismaneException declared by this method's signature;
 *         presumably propagated from the wrapped feature check
 */
@Override
public FeatureResult<Integer> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    Token token = tokenWrapper.getToken();

    FeatureResult<String> tokenPatternResult = tokenPatternFeature.check(tokenWrapper, env);
    if (tokenPatternResult == null) {
        return null;
    }

    // The outcome of the wrapped feature is the key of the pattern to look up.
    TokenPattern tokenPattern = this.patternMap.get(tokenPatternResult.getOutcome());

    // Find the match for this pattern whose index is the pattern's first
    // index-to-test.
    TokenPatternMatch matchAtFirstTestIndex = null;
    for (TokenPatternMatch candidate : token.getMatches(tokenPattern)) {
        boolean samePattern = candidate.getPattern().equals(tokenPattern);
        if (samePattern && candidate.getIndex() == tokenPattern.getIndexesToTest().get(0)) {
            matchAtFirstTestIndex = candidate;
            break;
        }
    }
    if (matchAtFirstTestIndex == null) {
        return null;
    }

    // The match index is the token's offset within the pattern, so stepping
    // back by it in the whitespace-inclusive list reaches the pattern's
    // first token.
    int firstTokenPosition = token.getIndexWithWhiteSpace() - matchAtFirstTestIndex.getIndex();
    Token firstToken = token.getTokenSequence().listWithWhiteSpace().get(firstTokenPosition);
    return this.generateResult(firstToken.getIndex());
}
Use of com.joliciel.talismane.tokeniser.Token in the talismane project by joliciel-informatique.
Class PatternMatchIndexInSentenceFeature, method checkInternal.
/**
 * Returns the index of the first token of the pattern to which the given
 * match belongs.
 *
 * @param tokenPatternMatch the match whose pattern start is wanted
 * @param env the runtime environment (not used by this method)
 * @return a result holding the index of the pattern's first token
 */
@Override
public FeatureResult<Integer> checkInternal(TokenPatternMatch tokenPatternMatch, RuntimeEnvironment env) {
    Token matchedToken = tokenPatternMatch.getToken();

    // The match index is the token's offset within the pattern, so stepping
    // back by it in the whitespace-inclusive list reaches the pattern's
    // first token.
    int firstTokenPosition = matchedToken.getIndexWithWhiteSpace() - tokenPatternMatch.getIndex();
    Token firstToken = matchedToken.getTokenSequence().listWithWhiteSpace().get(firstTokenPosition);

    return this.generateResult(firstToken.getIndex());
}
Aggregations