Search in sources :

Example 11 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

Class RollingTextBlockTest, method testNoSentenceAnnotationLocation:

/**
 * Feeds the text {@code "I see Mr. Jones and <skip/>Mrs. Smith."} through a
 * {@code RollingTextBlock} in five fragments, attaching no-sentence-break and
 * skip markers to the raw text block between rolls, and verifies that the
 * no-sentence-break annotations are reported at the expected offsets of the
 * processed text.
 */
@Test
public void testNoSentenceAnnotationLocation() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The returned Config is not used directly by this test; the call is
    // kept so the configuration is loaded right after the cache invalidation.
    ConfigFactory.load();

    final String sessionId = "test";
    final String[] noLabels = new String[0];

    RollingTextBlock block = new RollingTextBlock(true, null, sessionId);
    block = block.roll("I see ");
    block = block.roll("Mr. Jones ");
    block = block.roll("and <sk");

    AnnotatedText rawBlock = block.getRawTextBlock();
    System.out.println("rawText text: " + rawBlock.getText());

    // First marker: covers "Mr." (offsets are expressed against the raw text
    // block obtained above).
    final List<Annotation<RawTextNoSentenceBreakMarker>> mrMarkers = new ArrayList<>();
    System.out.println("we add no sentence break annotations (as if they were added by a filter)");
    final Annotation<RawTextNoSentenceBreakMarker> mrMarker = new Annotation<>("".length(), "Mr.".length(),
            new RawTextNoSentenceBreakMarker("me"), noLabels);
    mrMarkers.add(mrMarker);
    rawBlock.addAnnotations(mrMarkers);
    System.out.println("textBlock text: " + block.getText());
    System.out.println("textBlock annotations: " + block.getAnnotations().toString());

    // Rolling again completes the "<skip/>" tag that was split across fragments.
    block = block.roll("ip/>Mrs.");
    rawBlock = block.getRawTextBlock();
    System.out.println("rawText text: " + rawBlock.getText());

    final List<Annotation<RawTextSkipMarker>> skipMarkers = new ArrayList<>();
    final Annotation<RawTextSkipMarker> skipMarker = new Annotation<>("and ".length(), "and <skip/>".length(),
            new RawTextSkipMarker("me"), noLabels);
    skipMarkers.add(skipMarker);
    rawBlock.addAnnotations(skipMarkers);

    AnnotatedText processed = block.getProcessedText();
    assertEquals("I see Mr. Jones and ", processed.getText());

    // The "Mr." marker must land on "Mr." within the processed text.
    final List<Annotation<RawTextNoSentenceBreakMarker>> firstProcessedMarkers = processed
            .getAnnotations(RawTextNoSentenceBreakMarker.class);
    System.out.println("Processed annotations: " + firstProcessedMarkers);
    assertEquals(1, firstProcessedMarkers.size());
    assertEquals("I see ".length(), firstProcessedMarkers.get(0).getStart());
    assertEquals("I see Mr.".length(), firstProcessedMarkers.get(0).getEnd());

    block = block.roll(" Smith.");
    rawBlock = block.getRawTextBlock();
    System.out.println("rawText text: " + rawBlock.getText());

    // Second marker: covers "Mrs.", which follows the tail of the skip tag.
    final List<Annotation<RawTextNoSentenceBreakMarker>> mrsMarkers = new ArrayList<>();
    final Annotation<RawTextNoSentenceBreakMarker> mrsMarker = new Annotation<>("ip/>".length(), "ip/>Mrs.".length(),
            new RawTextNoSentenceBreakMarker("me"), noLabels);
    mrsMarkers.add(mrsMarker);
    rawBlock.addAnnotations(mrsMarkers);
    System.out.println("textBlock text: " + block.getText());
    System.out.println("textBlock annotations: " + block.getAnnotations().toString());

    // A final empty roll, after which the remaining text is checked.
    block = block.roll("");
    System.out.println("textBlock text: " + block.getText());
    System.out.println("textBlock annotations: " + block.getAnnotations().toString());

    processed = block.getProcessedText();
    assertEquals("and Mrs. Smith.", processed.getText());

    // The "Mrs." marker must land on "Mrs." once the skipped tag is removed.
    final List<Annotation<RawTextNoSentenceBreakMarker>> secondProcessedMarkers = processed
            .getAnnotations(RawTextNoSentenceBreakMarker.class);
    System.out.println("Processed annotations: " + secondProcessedMarkers);
    assertEquals(1, secondProcessedMarkers.size());
    assertEquals("and ".length(), secondProcessedMarkers.get(0).getStart());
    assertEquals("and Mrs.".length(), secondProcessedMarkers.get(0).getEnd());
}
Also used : AnnotatedText(com.joliciel.talismane.AnnotatedText) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 12 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

Class SimpleTokeniserTest, method testTokenise:

/**
 * Tokenises a sentence containing a URL that has been annotated with a
 * {@code TokenPlaceholder}, and verifies that the {@code SimpleTokeniser}
 * yields three tokens ("Click", "URL", "now") and that the sentence ends up
 * with three matching {@code TokenBoundary} annotations.
 */
@Test
public void testTokenise() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The returned Config is not used directly by this test; the call is
    // kept so the configuration is loaded right after the cache invalidation.
    ConfigFactory.load();

    final String sessionId = "test";
    final String[] noLabels = new String[0];

    final Sentence sentence = new Sentence("Click http://www.blah-di-blah.com now", sessionId);

    // Mark the URL span as a single placeholder token with analysis text "URL".
    final int urlStart = "Click ".length();
    final int urlEnd = "Click http://www.blah-di-blah.com".length();
    final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
    final Annotation<TokenPlaceholder> urlPlaceholder = new Annotation<>(urlStart, urlEnd,
            new TokenPlaceholder("URL", ""), noLabels);
    placeholders.add(urlPlaceholder);
    sentence.addAnnotations(placeholders);

    final SimpleTokeniser tokeniser = new SimpleTokeniser(sessionId);
    final TokenSequence tokens = tokeniser.tokeniseSentence(sentence);
    System.out.println(tokens.toString());

    assertEquals(3, tokens.size());
    assertEquals("Click", tokens.get(0).getAnalyisText());
    assertEquals("URL", tokens.get(1).getAnalyisText());
    assertEquals("now", tokens.get(2).getAnalyisText());

    final List<Annotation<TokenBoundary>> boundaries = sentence.getAnnotations(TokenBoundary.class);
    assertEquals(3, boundaries.size());

    // Boundary 0: "Click"
    final Annotation<TokenBoundary> clickBoundary = boundaries.get(0);
    assertEquals("".length(), clickBoundary.getStart());
    assertEquals("Click".length(), clickBoundary.getEnd());
    assertEquals("Click", clickBoundary.getData().getAnalysisText());

    // Boundary 1: the URL placeholder
    final Annotation<TokenBoundary> urlBoundary = boundaries.get(1);
    assertEquals("Click ".length(), urlBoundary.getStart());
    assertEquals("URL", urlBoundary.getData().getAnalysisText());
    assertEquals("Click http://www.blah-di-blah.com".length(), urlBoundary.getEnd());

    // Boundary 2: "now"
    final Annotation<TokenBoundary> nowBoundary = boundaries.get(2);
    assertEquals("Click http://www.blah-di-blah.com ".length(), nowBoundary.getStart());
    assertEquals("Click http://www.blah-di-blah.com now".length(), nowBoundary.getEnd());
    assertEquals("now", nowBoundary.getData().getAnalysisText());
}
Also used : TokenPlaceholder(com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 13 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

Class TokenSequenceTest, method testTokeniseSentenceWithPlaceholders:

/**
 * Builds a {@code TokenSequence} over a sentence in which an e-mail address
 * and a URL have been annotated as {@code TokenPlaceholder}s, runs
 * {@code findDefaultTokens()}, and verifies the analysis text and separator
 * flag of every token, both with white space (19 tokens) and without
 * (11 tokens).
 */
@Test
public void testTokeniseSentenceWithPlaceholders() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The returned Config is not used directly by this test; the call is
    // kept so the configuration is loaded right after the cache invalidation.
    ConfigFactory.load();

    final String sessionId = "test";
    final String[] noLabels = new String[0];

    final Sentence sentence = new Sentence("Write to me at joe.schome@test.com, otherwise go to http://test.com.", sessionId);

    final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();

    // Placeholder over the e-mail address.
    final int emailStart = "Write to me at ".length();
    final int emailEnd = "Write to me at joe.schome@test.com".length();
    final Annotation<TokenPlaceholder> emailPlaceholder = new Annotation<>(emailStart, emailEnd,
            new TokenPlaceholder("Email", "blah"), noLabels);
    placeholders.add(emailPlaceholder);

    // Placeholder over the URL (the sentence-final period is excluded).
    final int urlStart = "Write to me at joe.schome@test.com, otherwise go to ".length();
    final int urlEnd = "Write to me at joe.schome@test.com, otherwise go to http://test.com".length();
    final Annotation<TokenPlaceholder> urlPlaceholder = new Annotation<>(urlStart, urlEnd,
            new TokenPlaceholder("URL", "blah"), noLabels);
    placeholders.add(urlPlaceholder);

    sentence.addAnnotations(placeholders);

    final TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    tokenSequence.findDefaultTokens();

    assertEquals(19, tokenSequence.listWithWhiteSpace().size());
    assertEquals(11, tokenSequence.size());

    // Expected tokens when white space is included, in order.
    final String[] textsWithWhiteSpace = {
            "Write", " ", "to", " ", "me", " ", "at", " ", "Email", ",",
            " ", "otherwise", " ", "go", " ", "to", " ", "URL", "." };
    final boolean[] separatorsWithWhiteSpace = {
            false, true, false, true, false, true, false, true, false, true,
            true, false, true, false, true, false, true, false, true };

    int position = 0;
    for (Token token : tokenSequence.listWithWhiteSpace()) {
        // Positions past the end of the table are deliberately left unchecked.
        if (position < textsWithWhiteSpace.length) {
            assertEquals(textsWithWhiteSpace[position], token.getAnalyisText());
            assertEquals(separatorsWithWhiteSpace[position], token.isSeparator());
        }
        position++;
    }

    // Expected tokens when iterating the sequence itself (no white space).
    final String[] texts = {
            "Write", "to", "me", "at", "Email", ",", "otherwise", "go", "to", "URL", "." };
    final boolean[] separators = {
            false, false, false, false, false, true, false, false, false, false, true };

    position = 0;
    for (Token token : tokenSequence) {
        // Positions past the end of the table are deliberately left unchecked.
        if (position < texts.length) {
            assertEquals(texts[position], token.getAnalyisText());
            assertEquals(separators[position], token.isSeparator());
        }
        position++;
    }
}
Also used : TokenPlaceholder(com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 14 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

Class PatternTokeniserTest, method testTokenise:

/**
 * Tokenises a French sentence ending in a URL (annotated as a
 * {@code TokenPlaceholder}) with a {@code PatternTokeniser} configured with
 * two tokeniser patterns, and verifies the nine tokens of the first token
 * sequence returned.
 */
@Test
public void testTokenise() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The returned Config is not used directly by this test; the call is
    // kept so the configuration is loaded right after the cache invalidation.
    ConfigFactory.load();

    final String sessionId = "test";
    final String[] noLabels = new String[0];

    final Sentence sentence = new Sentence("Je n'ai pas l'ourang-outan sur www.google.com.", sessionId);

    // Mark the URL span (without the final period) as a placeholder token.
    final int urlStart = "Je n'ai pas l'ourang-outan sur ".length();
    final int urlEnd = "Je n'ai pas l'ourang-outan sur www.google.com".length();
    final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
    final Annotation<TokenPlaceholder> urlPlaceholder = new Annotation<>(urlStart, urlEnd,
            new TokenPlaceholder("URL", ""), noLabels);
    placeholders.add(urlPlaceholder);
    sentence.addAnnotations(placeholders);

    final List<String> patternDescriptors = new ArrayList<String>();
    patternDescriptors.add("IS_NOT_SEPARATOR -_");
    patternDescriptors.add("IS_SEPARATOR_AFTER '");

    final TokeniserPatternManager patternManager = new TokeniserPatternManager(patternDescriptors, sessionId);
    final PatternTokeniser patternTokeniser = new PatternTokeniser(null, patternManager, null, 1, sessionId);

    final List<TokenSequence> candidates = patternTokeniser.tokenise(sentence);
    final TokenSequence best = candidates.get(0);
    LOG.debug(best.toString());

    assertEquals(9, best.size());

    // Expected analysis text of each token, in order.
    final String[] expectedTexts = { "Je", "n'", "ai", "pas", "l'", "ourang-outan", "sur", "URL", "." };

    int position = 0;
    for (Token token : best) {
        // Positions past the end of the table are deliberately left unchecked.
        if (position < expectedTexts.length) {
            assertEquals(expectedTexts[position], token.getAnalyisText());
        }
        position++;
    }
}
Also used : Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) Annotation(com.joliciel.talismane.Annotation) TokenPlaceholder(com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 15 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

Class RegexTokenAnnotatorTest, method testApplyWithDollars:

/**
 * Annotates a sentence containing an e-mail address with a
 * {@code RegexTokenAnnotator} whose replacement string mixes an escaped
 * dollar sign with group references, and verifies the span and replacement
 * text of the single resulting {@code TokenPlaceholder}.
 */
@Test
public void testApplyWithDollars() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The returned Config is not used directly by this test; the call is
    // kept so the configuration is loaded right after the cache invalidation.
    ConfigFactory.load();

    final String sessionId = "test";

    // Group 1 is the local part of the address, group 2 is "@domain".
    final String emailRegex = "\\b([\\w.%-]+)(@[-.\\w]+\\.[A-Za-z]{2,4})\\b";
    // "\\$" is an escaped dollar sign; "$2" and "$1" refer to the groups above.
    final String replacementTemplate = "\\$Email$2:$1";

    final RegexTokenAnnotator annotator = new RegexTokenAnnotator(emailRegex, replacementTemplate, null, sessionId);
    final Sentence sentence = new Sentence("My address is joe.schmoe@test.com.", sessionId);
    annotator.annotate(sentence);

    final List<Annotation<TokenPlaceholder>> found = sentence.getAnnotations(TokenPlaceholder.class);
    LOG.debug(found.toString());
    assertEquals(1, found.size());

    // The placeholder must cover exactly the e-mail address (offsets 14 to 33).
    final Annotation<TokenPlaceholder> emailPlaceholder = found.get(0);
    assertEquals("My address is ".length(), emailPlaceholder.getStart());
    assertEquals("My address is joe.schmoe@test.com".length(), emailPlaceholder.getEnd());
    assertEquals("$Email@test.com:joe.schmoe", emailPlaceholder.getData().getReplacement());
}
Also used : Config(com.typesafe.config.Config) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Aggregations

Annotation (com.joliciel.talismane.Annotation): 36; TalismaneTest (com.joliciel.talismane.TalismaneTest): 28; Test (org.junit.Test): 28; ArrayList (java.util.ArrayList): 23; Config (com.typesafe.config.Config): 22; AnnotatedText (com.joliciel.talismane.AnnotatedText): 20; Sentence (com.joliciel.talismane.rawText.Sentence): 12; RawTextSkipMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker): 11; List (java.util.List): 7; RawTextNoSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker): 6; RawTextSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker): 6; RawTextReplaceMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker): 4; TokenPlaceholder (com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder): 4; SentenceBoundary (com.joliciel.talismane.sentenceDetector.SentenceBoundary): 4; TokenAttribute (com.joliciel.talismane.tokeniser.TokenAttribute): 4; Matcher (java.util.regex.Matcher): 4; AnnotationObserver (com.joliciel.talismane.AnnotationObserver): 3; Decision (com.joliciel.talismane.machineLearning.Decision): 3; DecisionMaker (com.joliciel.talismane.machineLearning.DecisionMaker): 3; SentenceDetectorFeature (com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature): 3