Search in sources :

Example 16 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class TokenisedAtomicTokenSequenceTest method testGetTokenSequence.

/**
 * Tags every atomic token of a sentence (white space included) with a
 * SEPARATE or JOIN outcome, then checks the analysis text of each token in
 * the sequence inferred from those tagged atomic tokens.
 */
@Test
public void testGetTokenSequence() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // NOTE(review): the returned Config is never read in this method; the
    // load call itself is kept as-is.
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    final Sentence sentence = new Sentence("Je n'ai pas encore l'ourang-outan.", sessionId);

    // One outcome per atomic token, in sentence order. The trailing comment
    // on each line names the atomic token that outcome is applied to
    // ("_" marks a white-space token).
    final TokeniserOutcome[] outcomes = {
        TokeniserOutcome.SEPARATE, // Je
        TokeniserOutcome.SEPARATE, // _
        TokeniserOutcome.SEPARATE, // n
        TokeniserOutcome.JOIN, // '
        TokeniserOutcome.SEPARATE, // ai
        TokeniserOutcome.SEPARATE, // _
        TokeniserOutcome.SEPARATE, // pas
        TokeniserOutcome.JOIN, // _
        TokeniserOutcome.JOIN, // encore
        TokeniserOutcome.SEPARATE, // _
        TokeniserOutcome.SEPARATE, // l
        TokeniserOutcome.JOIN, // '
        TokeniserOutcome.SEPARATE, // ourang
        TokeniserOutcome.JOIN, // -
        TokeniserOutcome.JOIN, // outan
        TokeniserOutcome.SEPARATE // .
    };

    TokenisedAtomicTokenSequence taggedAtomicTokens = new TokenisedAtomicTokenSequence(sentence, sessionId);
    TokenSequence defaultTokens = new TokenSequence(sentence, sessionId);
    defaultTokens.findDefaultTokens();

    // Pair each default token with the outcome at the same position.
    int position = 0;
    for (Token atomicToken : defaultTokens.listWithWhiteSpace()) {
        final TokeniserOutcome outcome = outcomes[position];
        position++;
        Decision decision = new Decision(outcome.name());
        TaggedToken<TokeniserOutcome> tagged = new TaggedToken<>(atomicToken, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
        taggedAtomicTokens.add(tagged);
    }

    TokenSequence inferredTokens = taggedAtomicTokens.inferTokenSequence();
    LOG.debug(inferredTokens.toString());

    final String[] expectedTexts = { "Je", "n'", "ai", "pas encore", "l'", "ourang-outan", "." };
    int index = 0;
    for (Token token : inferredTokens) {
        // Tokens beyond the expected ones are not inspected here; the size
        // assertion below is what rejects a sequence of the wrong length.
        if (index < expectedTexts.length) {
            assertEquals(expectedTexts[index], token.getAnalyisText());
        }
        index++;
    }
    assertEquals(7, inferredTokens.size());
}
Also used : Config(com.typesafe.config.Config) Sentence(com.joliciel.talismane.rawText.Sentence) Decision(com.joliciel.talismane.machineLearning.Decision) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 17 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class PatternTokeniserTest method testTokenise.

/**
 * Tokenises a sentence that carries a "URL" token-placeholder annotation,
 * using two tokeniser patterns, and checks the size and the analysis text
 * of each token in the first token sequence returned.
 */
@Test
public void testTokenise() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // NOTE(review): the returned Config is never read in this method; the
    // load call itself is kept as-is.
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    String[] labels = new String[0];
    final Sentence sentence = new Sentence("Je n'ai pas l'ourang-outan sur www.google.com.", sessionId);

    // The placeholder spans "www.google.com": it starts right after the
    // text preceding the URL and ends just before the final full stop.
    final int urlStart = "Je n'ai pas l'ourang-outan sur ".length();
    final int urlEnd = "Je n'ai pas l'ourang-outan sur www.google.com".length();
    Annotation<TokenPlaceholder> urlPlaceholder = new Annotation<>(urlStart, urlEnd, new TokenPlaceholder("URL", ""), labels);
    List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
    placeholders.add(urlPlaceholder);
    sentence.addAnnotations(placeholders);

    List<String> patternDescriptors = new ArrayList<>();
    patternDescriptors.add("IS_NOT_SEPARATOR -_");
    patternDescriptors.add("IS_SEPARATOR_AFTER '");
    TokeniserPatternManager patternManager = new TokeniserPatternManager(patternDescriptors, sessionId);
    PatternTokeniser tokeniser = new PatternTokeniser(null, patternManager, null, 1, sessionId);

    List<TokenSequence> candidates = tokeniser.tokenise(sentence);
    TokenSequence firstCandidate = candidates.get(0);
    LOG.debug(firstCandidate.toString());
    assertEquals(9, firstCandidate.size());

    final String[] expectedTexts = { "Je", "n'", "ai", "pas", "l'", "ourang-outan", "sur", "URL", "." };
    int index = 0;
    for (Token token : firstCandidate) {
        // Only the positions listed above are checked.
        if (index < expectedTexts.length) {
            assertEquals(expectedTexts[index], token.getAnalyisText());
        }
        index++;
    }
}
Also used : Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) Annotation(com.joliciel.talismane.Annotation) TokenPlaceholder(com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 18 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class TokenPatternTest method testMatch3.

/**
 * Matches a token pattern against the default tokens of
 * "Z'ensuite il aille..." and checks that there is exactly one match,
 * covering three tokens, whose first token has the original text "Z".
 */
@Test
public void testMatch3() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // NOTE(review): the returned Config is never read in this method; the
    // load call itself is kept as-is.
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    final Sentence sentence = new Sentence("Z'ensuite il aille...", sessionId);

    TokenSequence defaultTokens = new TokenSequence(sentence, sessionId);
    defaultTokens.findDefaultTokens();

    // The negative lookahead rules out a listed set of prefixes
    // (c, d, j, l, m, n, s, t, qu, jusqu, ...) before the apostrophe.
    final String patternText = "{(?![cdjlmnstCDJLMNST]\\z|qu\\z|jusqu\\z|puisqu\\z|lorsqu\\z|aujourd\\z|prud\\z|quelqu\\z|quoiqu\\z).+'}.+";
    TokenPattern pattern = new TokenPattern(patternText, Tokeniser.getTokenSeparators(sessionId));

    List<TokenPatternMatchSequence> matches = pattern.match(defaultTokens);
    assertEquals(1, matches.size());

    TokenPatternMatchSequence onlyMatch = matches.get(0);
    assertEquals(3, onlyMatch.getTokenSequence().size());
    assertEquals("Z", onlyMatch.getTokenSequence().get(0).getOriginalText());
}
Also used : Config(com.typesafe.config.Config) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 19 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class RegexTokenAnnotatorTest method testApplyWithDollars.

/**
 * Annotates a sentence containing an e-mail address with a regex annotator
 * whose replacement mixes an escaped dollar sign with the group references
 * $1 and $2, then checks the span and replacement text of the single
 * placeholder produced.
 */
@Test
public void testApplyWithDollars() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // NOTE(review): the returned Config is never read in this method; the
    // load call itself is kept as-is.
    final Config config = ConfigFactory.load();
    final String sessionId = "test";

    // Group 1 is the local part of the address, group 2 the "@domain" part.
    String emailRegex = "\\b([\\w.%-]+)(@[-.\\w]+\\.[A-Za-z]{2,4})\\b";
    String replacementTemplate = "\\$Email$2:$1";
    RegexTokenAnnotator annotator = new RegexTokenAnnotator(emailRegex, replacementTemplate, null, sessionId);

    Sentence sentence = new Sentence("My address is joe.schmoe@test.com.", sessionId);
    annotator.annotate(sentence);

    List<Annotation<TokenPlaceholder>> found = sentence.getAnnotations(TokenPlaceholder.class);
    LOG.debug(found.toString());
    assertEquals(1, found.size());

    Annotation<TokenPlaceholder> emailPlaceholder = found.get(0);
    // 14 is the length of "My address is "; 33 is the offset just after
    // "joe.schmoe@test.com", i.e. before the final full stop.
    assertEquals(14, emailPlaceholder.getStart());
    assertEquals(33, emailPlaceholder.getEnd());
    assertEquals("$Email@test.com:joe.schmoe", emailPlaceholder.getData().getReplacement());
}
Also used : Config(com.typesafe.config.Config) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 20 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class RegexTokenAnnotatorTest method testPuctuation.

/**
 * Annotates a sentence with a punctuation regex (no replacement) carrying a
 * "featureType" attribute, then checks that two attribute annotations are
 * produced and that the first one covers the full stop after "Bonjour" with
 * the key "featureType" and the value "punctuation".
 */
@Test
public void testPuctuation() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // NOTE(review): the returned Config is never read in this method; the
    // load call itself is kept as-is.
    final Config config = ConfigFactory.load();
    final String sessionId = "test";

    String punctuationRegex = "[\\p{IsPunctuation}&&[^%$#@§¶‰‱]]+";
    // Kept as a typed variable so the constructor receives a String-typed null.
    String noReplacement = null;
    RegexTokenAnnotator annotator = new RegexTokenAnnotator(punctuationRegex, noReplacement, null, sessionId);
    annotator.addAttribute("featureType", new StringAttribute("featureType", "punctuation"));

    Sentence sentence = new Sentence("Bonjour. Comment ça va?", sessionId);
    annotator.annotate(sentence);

    @SuppressWarnings("rawtypes")
    List<Annotation<TokenAttribute>> attributeAnnotations = sentence.getAnnotations(TokenAttribute.class);
    LOG.debug(attributeAnnotations.toString());
    assertEquals(2, attributeAnnotations.size());

    @SuppressWarnings("rawtypes")
    Annotation<TokenAttribute> firstAnnotation = attributeAnnotations.get(0);
    assertEquals("Bonjour".length(), firstAnnotation.getStart());
    assertEquals("Bonjour.".length(), firstAnnotation.getEnd());
    assertEquals("featureType", firstAnnotation.getData().getKey());
    assertEquals("punctuation", firstAnnotation.getData().getValue());
}
Also used : Config(com.typesafe.config.Config) StringAttribute(com.joliciel.talismane.tokeniser.StringAttribute) TokenAttribute(com.joliciel.talismane.tokeniser.TokenAttribute) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Aggregations

Usage counts per class: Sentence (com.joliciel.talismane.rawText.Sentence): 43; Config (com.typesafe.config.Config): 31; TalismaneTest (com.joliciel.talismane.TalismaneTest): 28; Test (org.junit.Test): 28; TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence): 25; Token (com.joliciel.talismane.tokeniser.Token): 14; PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence): 13; Annotation (com.joliciel.talismane.Annotation): 12; Decision (com.joliciel.talismane.machineLearning.Decision): 11; ArrayList (java.util.ArrayList): 9; PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken): 8; RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment): 7; TalismaneException (com.joliciel.talismane.TalismaneException): 6; HashMap (java.util.HashMap): 6; List (java.util.List): 6; StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature): 5; ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration): 5; PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext): 5; PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl): 5; SentenceAnnotator (com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator): 5