Search in sources:

Example 31 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class RegexTokenAnnotatorTest method testApply.

/**
 * Verifies that a {@link RegexTokenAnnotator} built from an e-mail regular expression adds
 * exactly one {@link TokenPlaceholder} annotation to the sentence, spanning offsets 14 to 33
 * and carrying the replacement text "Email".
 */
@Test
public void testApply() throws Exception {
    // Point the configuration library at the test configuration and drop any cached copy.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded configuration is not referenced again in this method.
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    final String emailPattern = "\\b[\\w.%-]+@[-.\\w]+\\.[A-Za-z]{2,4}\\b";
    final String emailReplacement = "Email";

    final RegexTokenAnnotator annotator = new RegexTokenAnnotator(emailPattern, emailReplacement, null, sessionId);
    final Sentence sentence = new Sentence("My address is joe.schmoe@test.com.", sessionId);
    annotator.annotate(sentence);
    LOG.debug(sentence.getAnnotations().toString());

    final List<Annotation<TokenPlaceholder>> found = sentence.getAnnotations(TokenPlaceholder.class);
    assertEquals(1, found.size());

    // The single placeholder is expected to span offsets 14 to 33 of the sentence text.
    final Annotation<TokenPlaceholder> emailPlaceholder = found.get(0);
    assertEquals(14, emailPlaceholder.getStart());
    assertEquals(33, emailPlaceholder.getEnd());
    assertEquals("Email", emailPlaceholder.getData().getReplacement());
}
Also used : Config(com.typesafe.config.Config) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 32 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class SentenceDetectorTest method testDetectSentences2.

/**
 * Runs the {@link SentenceDetector} over a text that already carries a sentence boundary,
 * two no-sentence-break markers (over each "Mr.") and one sentence-break marker (over the
 * newline), using a stub {@link DecisionMaker} that always returns a single IS_BOUNDARY
 * decision. Expects two guessed boundaries and four {@link SentenceBoundary} annotations.
 */
@Test
public void testDetectSentences2() throws Exception {
    // Point the configuration library at the test configuration and drop any cached copy.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded configuration is not referenced again in this method.
    final Config config = ConfigFactory.load();
    final String sessionId = "test";

    // Stub decision maker: every call yields one IS_BOUNDARY decision built with 1.0.
    DecisionMaker alwaysBoundary = new DecisionMaker() {

        @Override
        public ScoringStrategy<ClassificationSolution> getDefaultScoringStrategy() {
            return new GeometricMeanScoringStrategy();
        }

        @Override
        public List<Decision> decide(List<FeatureResult<?>> featureResults) {
            List<Decision> result = new ArrayList<>();
            result.add(new Decision(SentenceDetectorOutcome.IS_BOUNDARY.name(), 1.0));
            return result;
        }
    };

    String[] labels = new String[0];
    Set<SentenceDetectorFeature<?>> features = new HashSet<>();
    SentenceDetector sentenceDetector = new SentenceDetector(alwaysBoundary, features, sessionId);

    String text = "Before analysis. Hello Mr. Jones\nHow are you, Mr. Jones? After";

    // Character offsets into the text, each expressed as the length of the prefix ending there.
    final int textStart = "".length();
    final int existingBoundaryEnd = "Before analysis.".length();
    final int analysisStart = "Before analysis. ".length();
    final int firstMrStart = "Before analysis. Hello ".length();
    final int firstMrEnd = "Before analysis. Hello Mr.".length();
    final int newlineStart = "Before analysis. Hello Mr. Jones".length();
    final int newlineEnd = "Before analysis. Hello Mr. Jones\n".length();
    final int secondMrStart = "Before analysis. Hello Mr. Jones\nHow are you, ".length();
    final int secondMrEnd = "Before analysis. Hello Mr. Jones\nHow are you, Mr.".length();
    final int questionEnd = "Before analysis. Hello Mr. Jones\nHow are you, Mr. Jones?".length();
    final int textEnd = "Before analysis. Hello Mr. Jones\nHow are you, Mr. Jones? After".length();

    AnnotatedText annotatedText = new AnnotatedText(text, analysisStart, text.length());

    // No-sentence-break markers over both occurrences of "Mr.".
    List<Annotation<RawTextNoSentenceBreakMarker>> noBreakMarkers = new ArrayList<>();
    noBreakMarkers.add(new Annotation<>(firstMrStart, firstMrEnd, new RawTextNoSentenceBreakMarker("me"), labels));
    noBreakMarkers.add(new Annotation<>(secondMrStart, secondMrEnd, new RawTextNoSentenceBreakMarker("me"), labels));
    annotatedText.addAnnotations(noBreakMarkers);

    // A sentence boundary that exists before detection runs.
    List<Annotation<SentenceBoundary>> priorBoundaries = new ArrayList<>();
    priorBoundaries.add(new Annotation<>(textStart, existingBoundaryEnd, new SentenceBoundary(), labels));
    annotatedText.addAnnotations(priorBoundaries);

    // A sentence-break marker over the newline character.
    List<Annotation<RawTextSentenceBreakMarker>> breakMarkers = new ArrayList<>();
    breakMarkers.add(new Annotation<>(newlineStart, newlineEnd, new RawTextSentenceBreakMarker("me"), labels));
    annotatedText.addAnnotations(breakMarkers);

    List<Integer> guessedBoundaries = sentenceDetector.detectSentences(annotatedText);
    assertEquals(2, guessedBoundaries.size());
    assertEquals(newlineEnd, guessedBoundaries.get(0).intValue());
    assertEquals(questionEnd, guessedBoundaries.get(1).intValue());

    List<Annotation<SentenceBoundary>> sentenceBoundaries = annotatedText.getAnnotations(SentenceBoundary.class);
    System.out.println(sentenceBoundaries.toString());
    assertEquals(4, sentenceBoundaries.size());

    Annotation<SentenceBoundary> boundary0 = sentenceBoundaries.get(0);
    assertEquals(textStart, boundary0.getStart());
    assertEquals(existingBoundaryEnd, boundary0.getEnd());

    Annotation<SentenceBoundary> boundary1 = sentenceBoundaries.get(1);
    assertEquals(analysisStart, boundary1.getStart());
    assertEquals(newlineEnd, boundary1.getEnd());

    Annotation<SentenceBoundary> boundary2 = sentenceBoundaries.get(2);
    assertEquals(newlineEnd, boundary2.getStart());
    assertEquals(questionEnd, boundary2.getEnd());

    Annotation<SentenceBoundary> boundary3 = sentenceBoundaries.get(3);
    assertEquals(questionEnd, boundary3.getStart());
    assertEquals(textEnd, boundary3.getEnd());
}
Also used : SentenceDetectorFeature(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) GeometricMeanScoringStrategy(com.joliciel.talismane.machineLearning.GeometricMeanScoringStrategy) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) AnnotatedText(com.joliciel.talismane.AnnotatedText) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) ClassificationSolution(com.joliciel.talismane.machineLearning.ClassificationSolution) Decision(com.joliciel.talismane.machineLearning.Decision) Annotation(com.joliciel.talismane.Annotation) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 33 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class SentenceDetectorTest method testDetectSentences.

/**
 * Runs the {@link SentenceDetector} over an analysis window inside a longer text, with
 * no-sentence-break markers over each "Mr." and a stub {@link DecisionMaker} that always
 * returns a single IS_BOUNDARY decision. Expects two sentence breaks and two
 * {@link SentenceBoundary} annotations.
 */
@Test
public void testDetectSentences() throws Exception {
    // Point the configuration library at the test configuration and drop any cached copy.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded configuration is not referenced again in this method.
    final Config config = ConfigFactory.load();
    final String sessionId = "test";

    // Stub decision maker: every call yields one IS_BOUNDARY decision built with 1.0.
    DecisionMaker alwaysBoundary = new DecisionMaker() {

        @Override
        public ScoringStrategy<ClassificationSolution> getDefaultScoringStrategy() {
            return new GeometricMeanScoringStrategy();
        }

        @Override
        public List<Decision> decide(List<FeatureResult<?>> featureResults) {
            List<Decision> result = new ArrayList<>();
            result.add(new Decision(SentenceDetectorOutcome.IS_BOUNDARY.name(), 1.0));
            return result;
        }
    };

    String[] labels = new String[0];
    Set<SentenceDetectorFeature<?>> features = new HashSet<>();
    SentenceDetector sentenceDetector = new SentenceDetector(alwaysBoundary, features, sessionId);

    String text = "Before analysis. Hello Mr. Jones. How are you, Mr. Jones? After analysis.";

    // Character offsets into the text, each expressed as the length of the prefix ending there.
    final int textStart = "".length();
    final int analysisStart = "Before analysis. ".length();
    final int firstMrStart = "Before analysis. Hello ".length();
    final int firstMrEnd = "Before analysis. Hello Mr.".length();
    final int firstSentenceEnd = "Before analysis. Hello Mr. Jones.".length();
    final int secondMrStart = "Before analysis. Hello Mr. Jones. How are you, ".length();
    final int secondMrEnd = "Before analysis. Hello Mr. Jones. How are you, Mr.".length();
    final int analysisEnd = "Before analysis. Hello Mr. Jones. How are you, Mr. Jones?".length();

    AnnotatedText annotatedText = new AnnotatedText(text, analysisStart, analysisEnd);

    // No-sentence-break markers over both occurrences of "Mr.".
    List<Annotation<RawTextNoSentenceBreakMarker>> noBreakMarkers = new ArrayList<>();
    noBreakMarkers.add(new Annotation<>(firstMrStart, firstMrEnd, new RawTextNoSentenceBreakMarker("me"), labels));
    noBreakMarkers.add(new Annotation<>(secondMrStart, secondMrEnd, new RawTextNoSentenceBreakMarker("me"), labels));
    annotatedText.addAnnotations(noBreakMarkers);

    List<Integer> sentenceBreaks = sentenceDetector.detectSentences(annotatedText);
    assertEquals(2, sentenceBreaks.size());
    assertEquals(firstSentenceEnd, sentenceBreaks.get(0).intValue());
    assertEquals(analysisEnd, sentenceBreaks.get(1).intValue());

    List<Annotation<SentenceBoundary>> sentenceBoundaries = annotatedText.getAnnotations(SentenceBoundary.class);
    assertEquals(2, sentenceBoundaries.size());

    Annotation<SentenceBoundary> boundary0 = sentenceBoundaries.get(0);
    assertEquals(textStart, boundary0.getStart());
    assertEquals(firstSentenceEnd, boundary0.getEnd());

    Annotation<SentenceBoundary> boundary1 = sentenceBoundaries.get(1);
    assertEquals(firstSentenceEnd, boundary1.getStart());
    assertEquals(analysisEnd, boundary1.getEnd());
}
Also used : SentenceDetectorFeature(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature) AnnotatedText(com.joliciel.talismane.AnnotatedText) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) ClassificationSolution(com.joliciel.talismane.machineLearning.ClassificationSolution) GeometricMeanScoringStrategy(com.joliciel.talismane.machineLearning.GeometricMeanScoringStrategy) Decision(com.joliciel.talismane.machineLearning.Decision) Annotation(com.joliciel.talismane.Annotation) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 34 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class TokenSequenceTest method testTokeniseSentenceWithPlaceholdersNoSeparators.

/**
 * Tokenises "Il t’aime." after attaching three {@link StringAttribute} annotations and one
 * {@link TokenPlaceholder} (replacing the typographic apostrophe with "'"), then checks the
 * text, separator flag and attributes of every token, first in the list that includes
 * white space (six tokens expected) and then in the sequence itself (five tokens expected).
 */
@Test
public void testTokeniseSentenceWithPlaceholdersNoSeparators() throws Exception {
    // Point the configuration library at the test configuration and drop any cached copy.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded configuration is not referenced again in this method.
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    String[] labels = new String[0];
    final Sentence sentence = new Sentence("Il t’aime.", sessionId);

    // Character offsets into the sentence, each expressed as the length of the prefix ending there.
    final int verbStart = "Il ".length();
    final int apostropheStart = "Il t".length();
    final int apostropheEnd = "Il t’".length();
    final int verbEnd = "Il t’aime".length();

    final List<Annotation<StringAttribute>> attributeAnnotations = new ArrayList<>();
    Annotation<StringAttribute> phraseAnnotation = new Annotation<>(verbStart, verbEnd, new StringAttribute("phrase", "verbal"), labels);
    attributeAnnotations.add(phraseAnnotation);
    Annotation<StringAttribute> personAnnotation = new Annotation<>(verbStart, verbEnd, new StringAttribute("person", "3rd"), labels);
    attributeAnnotations.add(personAnnotation);
    Annotation<StringAttribute> typeAnnotation = new Annotation<>(verbStart, apostropheEnd, new StringAttribute("type", "object"), labels);
    attributeAnnotations.add(typeAnnotation);

    final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
    Annotation<TokenPlaceholder> apostrophePlaceholder = new Annotation<>(apostropheStart, apostropheEnd, new TokenPlaceholder("'", "blah"), labels);
    placeholders.add(apostrophePlaceholder);

    sentence.addAnnotations(attributeAnnotations);
    sentence.addAnnotations(placeholders);

    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    tokenSequence.findDefaultTokens();
    LOG.debug(tokenSequence.listWithWhiteSpace().toString());
    LOG.debug(tokenSequence.toString());
    assertEquals(6, tokenSequence.listWithWhiteSpace().size());
    assertEquals(5, tokenSequence.size());

    // First pass: every token, white space included.
    int position = 0;
    for (Token current : tokenSequence.listWithWhiteSpace()) {
        switch (position) {
        case 0:
            assertEquals("Il", current.getAnalyisText());
            assertEquals(false, current.isSeparator());
            assertEquals(0, current.getAttributes().size());
            break;
        case 1:
            assertEquals(" ", current.getAnalyisText());
            assertEquals(true, current.isSeparator());
            assertEquals(0, current.getAttributes().size());
            break;
        case 2:
            assertEquals("t", current.getAnalyisText());
            assertEquals(false, current.isSeparator());
            assertEquals(3, current.getAttributes().size());
            assertEquals("verbal", current.getAttributes().get("phrase").getValue());
            assertEquals("3rd", current.getAttributes().get("person").getValue());
            assertEquals("object", current.getAttributes().get("type").getValue());
            break;
        case 3:
            assertEquals("'", current.getAnalyisText());
            assertEquals(true, current.isSeparator());
            assertEquals(3, current.getAttributes().size());
            assertEquals("verbal", current.getAttributes().get("phrase").getValue());
            assertEquals("3rd", current.getAttributes().get("person").getValue());
            assertEquals("object", current.getAttributes().get("type").getValue());
            break;
        case 4:
            assertEquals("aime", current.getAnalyisText());
            assertEquals(false, current.isSeparator());
            assertEquals(2, current.getAttributes().size());
            assertEquals("verbal", current.getAttributes().get("phrase").getValue());
            assertEquals("3rd", current.getAttributes().get("person").getValue());
            break;
        case 5:
            assertEquals(".", current.getAnalyisText());
            assertEquals(true, current.isSeparator());
            assertEquals(0, current.getAttributes().size());
            break;
        default:
            // Positions beyond the expected six tokens are not checked here.
            break;
        }
        position++;
    }

    // Second pass: the token sequence itself, which is expected to hold five tokens.
    position = 0;
    for (Token current : tokenSequence) {
        switch (position) {
        case 0:
            assertEquals("Il", current.getAnalyisText());
            assertEquals(false, current.isSeparator());
            break;
        case 1:
            assertEquals("t", current.getAnalyisText());
            assertEquals(false, current.isSeparator());
            break;
        case 2:
            assertEquals("'", current.getAnalyisText());
            assertEquals(true, current.isSeparator());
            break;
        case 3:
            assertEquals("aime", current.getAnalyisText());
            assertEquals(false, current.isSeparator());
            break;
        case 4:
            assertEquals(".", current.getAnalyisText());
            assertEquals(true, current.isSeparator());
            break;
        default:
            // Positions beyond the expected five tokens are not checked here.
            break;
        }
        position++;
    }
}
Also used : Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) TokenPlaceholder(com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder) Sentence(com.joliciel.talismane.rawText.Sentence) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 35 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class TokenSequenceTest method testOverlappingPlaceholders.

/**
 * Tokenises "Pakistan International Airlines Company" after attaching several overlapping
 * {@link StringAttribute} annotations, some of which reuse the same attribute key with
 * different spans or values, then checks which attributes each of the four resulting
 * tokens ends up with.
 */
@Test
public void testOverlappingPlaceholders() throws Exception {
    // Point the configuration library at the test configuration and drop any cached copy.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded configuration is not referenced again in this method.
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    String[] labels = new String[0];
    final Sentence sentence = new Sentence("Pakistan International Airlines Company", sessionId);

    // Character offsets into the sentence, each expressed as the length of the prefix ending there.
    final int sentenceStart = "".length();
    final int pakistanEnd = "Pakistan".length();
    final int internationalStart = "Pakistan ".length();
    final int airlinesEnd = "Pakistan International Airlines".length();
    final int companyStart = "Pakistan International Airlines ".length();
    final int sentenceEnd = "Pakistan International Airlines Company".length();

    final List<Annotation<StringAttribute>> attributeAnnotations = new ArrayList<>();

    // Two attributes covering "Pakistan" only.
    Annotation<StringAttribute> placeEntity = new Annotation<>(sentenceStart, pakistanEnd, new StringAttribute("namedEntity", "place"), labels);
    Annotation<StringAttribute> startsWithP = new Annotation<>(sentenceStart, pakistanEnd, new StringAttribute("startsWithP", "true"), labels);
    attributeAnnotations.add(placeEntity);
    attributeAnnotations.add(startsWithP);

    // Two attributes covering "Pakistan International Airlines".
    Annotation<StringAttribute> leadingCompanyEntity = new Annotation<>(sentenceStart, airlinesEnd, new StringAttribute("namedEntity", "company"), labels);
    Annotation<StringAttribute> leadingAsianCompany = new Annotation<>(sentenceStart, airlinesEnd, new StringAttribute("asianCompany", "true"), labels);
    attributeAnnotations.add(leadingCompanyEntity);
    attributeAnnotations.add(leadingAsianCompany);

    // Two attributes covering "International Airlines Company".
    Annotation<StringAttribute> trailingCompanyEntity = new Annotation<>(internationalStart, sentenceEnd, new StringAttribute("namedEntity", "company"), labels);
    Annotation<StringAttribute> trailingAsianCompany = new Annotation<>(internationalStart, sentenceEnd, new StringAttribute("asianCompany", "false"), labels);
    attributeAnnotations.add(trailingCompanyEntity);
    attributeAnnotations.add(trailingAsianCompany);

    // One attribute covering "Company" only.
    Annotation<StringAttribute> startsWithC = new Annotation<>(companyStart, sentenceEnd, new StringAttribute("startsWithC", "true"), labels);
    attributeAnnotations.add(startsWithC);

    sentence.addAnnotations(attributeAnnotations);

    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    tokenSequence.findDefaultTokens();
    LOG.debug(tokenSequence.listWithWhiteSpace().toString());
    LOG.debug(tokenSequence.toString());
    assertEquals(4, tokenSequence.size());

    int position = 0;
    for (Token current : tokenSequence) {
        LOG.debug(current.getAttributes().toString());
        switch (position) {
        case 0:
            assertEquals("Pakistan", current.getAnalyisText());
            assertEquals(3, current.getAttributes().size());
            assertEquals("company", current.getAttributes().get("namedEntity").getValue());
            assertEquals("true", current.getAttributes().get("startsWithP").getValue());
            assertEquals("true", current.getAttributes().get("asianCompany").getValue());
            break;
        case 1:
            assertEquals("International", current.getAnalyisText());
            assertEquals(2, current.getAttributes().size());
            assertEquals("company", current.getAttributes().get("namedEntity").getValue());
            assertEquals("true", current.getAttributes().get("asianCompany").getValue());
            break;
        case 2:
            assertEquals("Airlines", current.getAnalyisText());
            assertEquals(2, current.getAttributes().size());
            assertEquals("company", current.getAttributes().get("namedEntity").getValue());
            assertEquals("true", current.getAttributes().get("asianCompany").getValue());
            break;
        case 3:
            assertEquals("Company", current.getAnalyisText());
            assertEquals(1, current.getAttributes().size());
            assertEquals("true", current.getAttributes().get("startsWithC").getValue());
            break;
        default:
            // Positions beyond the expected four tokens are not checked here.
            break;
        }
        position++;
    }
}
Also used : Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Aggregations

Annotation (com.joliciel.talismane.Annotation) 36, TalismaneTest (com.joliciel.talismane.TalismaneTest) 28, Test (org.junit.Test) 28, ArrayList (java.util.ArrayList) 23, Config (com.typesafe.config.Config) 22, AnnotatedText (com.joliciel.talismane.AnnotatedText) 20, Sentence (com.joliciel.talismane.rawText.Sentence) 12, RawTextSkipMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) 11, List (java.util.List) 7, RawTextNoSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) 6, RawTextSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) 6, RawTextReplaceMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker) 4, TokenPlaceholder (com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder) 4, SentenceBoundary (com.joliciel.talismane.sentenceDetector.SentenceBoundary) 4, TokenAttribute (com.joliciel.talismane.tokeniser.TokenAttribute) 4, Matcher (java.util.regex.Matcher) 4, AnnotationObserver (com.joliciel.talismane.AnnotationObserver) 3, Decision (com.joliciel.talismane.machineLearning.Decision) 3, DecisionMaker (com.joliciel.talismane.machineLearning.DecisionMaker) 3, SentenceDetectorFeature (com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature) 3