Search in sources :

Example 46 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class LowercaseKnownFirstWordFilterTest method testReplace3.

/**
 * Runs {@link LowercaseKnownFirstWordFilter} over the sentence "Georges est là." using a
 * mocked {@link Diacriticizer} on which nothing has been stubbed, and expects the text of
 * all four tokens to come out unchanged, including the capitalised first word.
 */
@Test
public void testReplace3() throws Exception {
    Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);

    // Select the test configuration file, drop any cached configuration, then load it.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);
    LowercaseKnownFirstWordFilter firstWordFilter = new LowercaseKnownFirstWordFilter(sessionId);

    String text = "Georges est là.";
    Sentence sentence = new Sentence(text, sessionId);
    TokenSequence tokens = new TokenSequence(sentence, sessionId);
    // Offsets are written as prefix lengths so each token's position can be read off directly.
    tokens.addToken("".length(), "Georges".length());
    tokens.addToken("Georges ".length(), "Georges est".length());
    tokens.addToken("Georges est ".length(), "Georges est là".length());
    tokens.addToken("Georges est là".length(), "Georges est là.".length());

    firstWordFilter.apply(tokens);
    System.out.println(tokens);

    // Join the token texts, each followed by a pipe, to compare them in one assertion.
    StringBuilder joined = new StringBuilder();
    for (Token token : tokens) {
        joined.append(token.getText()).append('|');
    }
    assertEquals("Georges|est|là|.|", joined.toString());
}
Also used : Config(com.typesafe.config.Config) Token(com.joliciel.talismane.tokeniser.Token) Diacriticizer(com.joliciel.talismane.lexicon.Diacriticizer) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 47 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class UppercaseSeriesFilterTest method testReplace.

/**
 * Runs {@link UppercaseSeriesFilter} over a sentence containing several runs of upper-case
 * words, using a mocked {@link Diacriticizer} stubbed for "VEUX", "SAVOIR", "L'" and
 * "AMERIQUE" only, and checks the resulting token texts against the expected sequence.
 */
@Test
public void testReplace() throws Exception {
    Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);
    // Only these four upper-case forms are stubbed; "VERITE" and "LA" are deliberately left out.
    when(mockDiacriticizer.diacriticize("VEUX")).thenReturn(new HashSet<>(Arrays.asList("veux")));
    when(mockDiacriticizer.diacriticize("SAVOIR")).thenReturn(new HashSet<>(Arrays.asList("savoir")));
    when(mockDiacriticizer.diacriticize("L'")).thenReturn(new HashSet<>(Arrays.asList("l'")));
    when(mockDiacriticizer.diacriticize("AMERIQUE")).thenReturn(new HashSet<>(Arrays.asList("Amérique")));

    // Select the test configuration file, drop any cached configuration, then load it.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);
    UppercaseSeriesFilter seriesFilter = new UppercaseSeriesFilter(sessionId);

    String text = "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'AMERIQUE!";
    Sentence sentence = new Sentence(text, sessionId);
    TokenSequence tokens = new TokenSequence(sentence, sessionId);
    // Offsets are written as prefix lengths so each token's position can be read off directly.
    tokens.addToken("".length(), "Je".length());
    tokens.addToken("Je ".length(), "Je VEUX".length());
    tokens.addToken("Je VEUX ".length(), "Je VEUX SAVOIR".length());
    tokens.addToken("Je VEUX SAVOIR ".length(), "Je VEUX SAVOIR la".length());
    tokens.addToken("Je VEUX SAVOIR la ".length(), "Je VEUX SAVOIR la VERITE".length());
    tokens.addToken("Je VEUX SAVOIR la VERITE".length(), "Je VEUX SAVOIR la VERITE,".length());
    tokens.addToken("Je VEUX SAVOIR la VERITE, ".length(), "Je VEUX SAVOIR la VERITE, je".length());
    tokens.addToken("Je VEUX SAVOIR la VERITE, je ".length(), "Je VEUX SAVOIR la VERITE, je VEUX".length());
    tokens.addToken("Je VEUX SAVOIR la VERITE, je VEUX ".length(), "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR".length());
    tokens.addToken("Je VEUX SAVOIR la VERITE, je VEUX SAVOIR ".length(), "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA".length());
    tokens.addToken("Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA ".length(), "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE".length());
    tokens.addToken("Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE ".length(), "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur".length());
    tokens.addToken("Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur ".length(), "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'".length());
    tokens.addToken("Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'".length(), "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'AMERIQUE".length());
    tokens.addToken("Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'AMERIQUE".length(), "Je VEUX SAVOIR la VERITE, je VEUX SAVOIR LA VERITE sur L'AMERIQUE!".length());

    seriesFilter.apply(tokens);
    System.out.println(tokens);

    // Join the token texts, each followed by a pipe, to compare them in one assertion.
    StringBuilder joined = new StringBuilder();
    for (Token token : tokens) {
        joined.append(token.getText()).append('|');
    }
    assertEquals("Je|veux|savoir|la|VERITE|,|je|veux|savoir|La|Verite|sur|l'|Amérique|!|", joined.toString());
}
Also used : Config(com.typesafe.config.Config) Token(com.joliciel.talismane.tokeniser.Token) Diacriticizer(com.joliciel.talismane.lexicon.Diacriticizer) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 48 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class TokenPatternTest method testMatch.

/**
 * Matches the pattern {@code .+'.+} against a mocked token sequence whose tokens are
 * "Moi", ",", " ", "j", "'", "aim", "'", "rais", and expects two match sequences of three
 * tokens each: (j, ', aim) and (aim, ', rais). The token "aim" takes part in both, so it is
 * expected to carry two matches.
 */
@Test
public void testMatch() throws TalismaneException {
    final String separators = "[\\s\\p{Punct}]";

    // Backing lists returned by getMatches() on tokens 3 to 7; the assertions below read them back.
    final List<TokenPatternMatch> matches3 = new ArrayList<>();
    final List<TokenPatternMatch> matches4 = new ArrayList<>();
    final List<TokenPatternMatch> matches5 = new ArrayList<>();
    final List<TokenPatternMatch> matches6 = new ArrayList<>();
    final List<TokenPatternMatch> matches7 = new ArrayList<>();

    final TokenSequence tokenSequence = mock(TokenSequence.class);
    final Token token0 = mock(Token.class);
    final Token token1 = mock(Token.class);
    final Token token2 = mock(Token.class);
    final Token token3 = mock(Token.class);
    final Token token4 = mock(Token.class);
    final Token token5 = mock(Token.class);
    final Token token6 = mock(Token.class);
    final Token token7 = mock(Token.class);
    final List<Token> allTokens = Arrays.asList(token0, token1, token2, token3, token4, token5, token6, token7);
    when(tokenSequence.listWithWhiteSpace()).thenReturn(allTokens);

    // "Moi"
    when(token0.getAnalyisText()).thenReturn("Moi");
    when(token0.isSeparator()).thenReturn(false);
    when(token0.getIndex()).thenReturn(0);

    // ","
    when(token1.getAnalyisText()).thenReturn(",");
    when(token1.isSeparator()).thenReturn(true);
    when(token1.getIndex()).thenReturn(1);

    // " "
    when(token2.getAnalyisText()).thenReturn(" ");
    when(token2.isSeparator()).thenReturn(true);
    when(token2.getIndex()).thenReturn(2);

    // "j"
    when(token3.getAnalyisText()).thenReturn("j");
    when(token3.isSeparator()).thenReturn(false);
    when(token3.getIndex()).thenReturn(3);
    when(token3.getMatches()).thenReturn(matches3);

    // "'"
    when(token4.getAnalyisText()).thenReturn("'");
    when(token4.isSeparator()).thenReturn(true);
    when(token4.getIndex()).thenReturn(4);
    when(token4.getMatches()).thenReturn(matches4);

    // "aim"
    when(token5.getAnalyisText()).thenReturn("aim");
    when(token5.isSeparator()).thenReturn(false);
    when(token5.getIndex()).thenReturn(5);
    when(token5.getMatches()).thenReturn(matches5);

    // "'"
    when(token6.getAnalyisText()).thenReturn("'");
    when(token6.isSeparator()).thenReturn(true);
    when(token6.getIndex()).thenReturn(6);
    when(token6.getMatches()).thenReturn(matches6);

    // "rais"
    when(token7.getAnalyisText()).thenReturn("rais");
    when(token7.isSeparator()).thenReturn(false);
    when(token7.getIndex()).thenReturn(7);
    when(token7.getMatches()).thenReturn(matches7);

    Pattern separatorPattern = Pattern.compile(separators, Pattern.UNICODE_CHARACTER_CLASS);
    TokenPattern tokenPattern = new TokenPattern(".+'.+", separatorPattern);
    List<TokenPatternMatchSequence> matchSequences = tokenPattern.match(tokenSequence);
    assertEquals(2, matchSequences.size());

    // First match sequence: j ' aim
    List<Token> matchedTokens = matchSequences.get(0).getTokenSequence();
    assertEquals(3, matchedTokens.size());

    Token matched = matchedTokens.get(0);
    assertEquals(3, matched.getIndex());
    assertEquals("j", matched.getAnalyisText());
    assertEquals(1, matched.getMatches().size());
    assertEquals(0, matched.getMatches().get(0).getIndex());

    matched = matchedTokens.get(1);
    assertEquals(4, matched.getIndex());
    assertEquals("'", matched.getAnalyisText());
    assertEquals(1, matched.getMatches().size());
    assertEquals(1, matched.getMatches().get(0).getIndex());

    // "aim" closes the first match sequence and opens the second, hence two matches.
    matched = matchedTokens.get(2);
    assertEquals(5, matched.getIndex());
    assertEquals("aim", matched.getAnalyisText());
    assertEquals(2, matched.getMatches().size());
    assertEquals(2, matched.getMatches().get(0).getIndex());

    // Second match sequence: aim ' rais
    matchedTokens = matchSequences.get(1).getTokenSequence();
    assertEquals(3, matchedTokens.size());

    matched = matchedTokens.get(0);
    assertEquals(5, matched.getIndex());
    assertEquals("aim", matched.getAnalyisText());
    assertEquals(2, matched.getMatches().size());
    assertEquals(0, matched.getMatches().get(1).getIndex());

    matched = matchedTokens.get(1);
    assertEquals(6, matched.getIndex());
    assertEquals("'", matched.getAnalyisText());
    assertEquals(1, matched.getMatches().size());
    assertEquals(1, matched.getMatches().get(0).getIndex());

    matched = matchedTokens.get(2);
    assertEquals(7, matched.getIndex());
    assertEquals("rais", matched.getAnalyisText());
    assertEquals(1, matched.getMatches().size());
    assertEquals(2, matched.getMatches().get(0).getIndex());
}
Also used : Pattern(java.util.regex.Pattern) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 49 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class PatternIndexInSentenceFeature method checkInternal.

/**
 * Looks up the token pattern named by the wrapped token-pattern feature and, if the current
 * token carries a match for that pattern at the pattern's first index to test, returns the
 * index of the first token of that pattern occurrence.
 *
 * @param tokenWrapper wrapper around the token being examined
 * @param env the runtime environment passed through to the wrapped feature
 * @return the index of the first token of the pattern occurrence, or {@code null} when the
 *         wrapped feature yields no result or the token has no match at the first test index
 * @throws TalismaneException declared by this method; presumably propagated from the wrapped
 *         feature check
 */
@Override
public FeatureResult<Integer> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    Token token = tokenWrapper.getToken();
    FeatureResult<String> patternNameResult = tokenPatternFeature.check(tokenWrapper, env);
    if (patternNameResult == null) {
        // No token pattern for this token: nothing to report.
        return null;
    }

    // If we have a token pattern, then this is the first token to be tested in that pattern.
    TokenPattern tokenPattern = this.patternMap.get(patternNameResult.getOutcome());
    for (TokenPatternMatch candidate : token.getMatches(tokenPattern)) {
        // Keep only a match where the current token sits at the pattern's first test index.
        if (candidate.getPattern().equals(tokenPattern) && candidate.getIndex() == tokenPattern.getIndexesToTest().get(0)) {
            // The matched token is not necessarily the first token of the pattern occurrence
            // (the original note says it is actually the second), so step back by the match
            // index to reach the occurrence's first token.
            int firstTokenPosition = token.getIndexWithWhiteSpace() - candidate.getIndex();
            Token firstToken = token.getTokenSequence().listWithWhiteSpace().get(firstTokenPosition);
            return this.generateResult(firstToken.getIndex());
        }
    }
    return null;
}
Also used : Token(com.joliciel.talismane.tokeniser.Token) TokenPatternMatch(com.joliciel.talismane.tokeniser.patterns.TokenPatternMatch) TokenPattern(com.joliciel.talismane.tokeniser.patterns.TokenPattern)

Example 50 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class PatternMatchIndexInSentenceFeature method checkInternal.

/**
 * Returns the index of the first token of the pattern occurrence that the given match
 * belongs to.
 *
 * @param tokenPatternMatch the match whose token and match index are used
 * @param env the runtime environment (not read by this method)
 * @return a result wrapping the index of the occurrence's first token
 */
@Override
public FeatureResult<Integer> checkInternal(TokenPatternMatch tokenPatternMatch, RuntimeEnvironment env) {
    Token matchedToken = tokenPatternMatch.getToken();
    // The matched token is not necessarily the first token of the pattern occurrence (the
    // original note says it is actually the second), so step back by the match index within
    // the whitespace-inclusive token list to reach the occurrence's first token.
    int firstTokenPosition = matchedToken.getIndexWithWhiteSpace() - tokenPatternMatch.getIndex();
    Token firstToken = matchedToken.getTokenSequence().listWithWhiteSpace().get(firstTokenPosition);
    return this.generateResult(firstToken.getIndex());
}
Also used : Token(com.joliciel.talismane.tokeniser.Token)

Aggregations

Token (com.joliciel.talismane.tokeniser.Token)69 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)16 ArrayList (java.util.ArrayList)15 Sentence (com.joliciel.talismane.rawText.Sentence)14 Decision (com.joliciel.talismane.machineLearning.Decision)12 Config (com.typesafe.config.Config)12 TalismaneTest (com.joliciel.talismane.TalismaneTest)11 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)11 Test (org.junit.Test)11 TalismaneException (com.joliciel.talismane.TalismaneException)7 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)7 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)7 TokeniserOutcome (com.joliciel.talismane.tokeniser.TokeniserOutcome)7 List (java.util.List)7 WeightedOutcome (com.joliciel.talismane.utils.WeightedOutcome)6 HashMap (java.util.HashMap)6 StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature)5 PosTag (com.joliciel.talismane.posTagger.PosTag)5 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)5 PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl)5