Search in sources :

Example 26 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class TokenPatternTest method testMatch2.

/**
 * Verifies that the apostrophe pattern below produces no match sequences for
 * the sentence "Qu'ensuite il aille...".
 */
@Test
public void testMatch2() throws Exception {
    // Select the test configuration file and force it to be re-read.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config testConfig = ConfigFactory.load();
    final String sessionId = "test";

    // Pattern under test: a token ending in an apostrophe, guarded by a
    // negative lookahead listing the prefixes that must not match.
    final String regex = "{(?![cdjlmnstCDJLMNST]\\z|qu\\z|jusqu\\z|puisqu\\z|lorsqu\\z|aujourd\\z|prud\\z|quelqu\\z|quoiqu\\z).+'}.+";

    // Build the default tokenisation of the sentence.
    final Sentence input = new Sentence("Qu'ensuite il aille...", sessionId);
    TokenSequence defaultTokens = new TokenSequence(input, sessionId);
    defaultTokens.findDefaultTokens();

    TokenPattern pattern = new TokenPattern(regex, Tokeniser.getTokenSeparators(sessionId));

    // No match sequence is expected for this sentence.
    assertEquals(0, pattern.match(defaultTokens).size());
}
Also used : Config(com.typesafe.config.Config) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 27 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class TalismaneAPIExamples method example2.

/**
 * Runs the same kind of analysis as example1, except that it starts from raw
 * text: the text is first passed through the session's raw-text annotators
 * and split into sentences, and only then is each sentence tokenised,
 * pos-tagged and parsed. Each pos-tag sequence, parse configuration and
 * parse tree is printed to standard output.
 *
 * @param sessionId identifier of the Talismane session whose annotators,
 *                  tokeniser, pos-tagger and parser are used
 * @throws Exception if any stage of the analysis fails
 */
public static void example2(String sessionId) throws Exception {
    String text = "Les gens qui voient de travers pensent que les bancs verts qu'on voit sur les trottoirs "
        + "sont faits pour les impotents ou les ventripotents. "
        + "Mais c'est une absurdité, car, à la vérité, ils sont là, c'est notoire, "
        + "pour accueillir quelque temps les amours débutants.";
    RawText rawText = new RawText(text, true, sessionId);

    // Run every raw-text annotator configured for the session over the text;
    // these act as filters that can fix issues in the input
    // (e.g. replace &quot; with ").
    for (RawTextAnnotator textAnnotator : TalismaneSession.get(sessionId).getTextAnnotators()) {
        textAnnotator.annotate(rawText);
    }

    // The processed text is what remains once the filters have been applied.
    AnnotatedText filteredText = rawText.getProcessedText();

    // Sentence detection works on the processed text.
    SentenceDetector detector = SentenceDetector.getInstance(sessionId);
    detector.detectSentences(filteredText);

    // The detected sentences are read back from the raw text itself, so that
    // annotations made on the sentences get reflected in the raw text.
    List<Sentence> detectedSentences = rawText.getDetectedSentences();
    for (Sentence current : detectedSentences) {
        // Apply the session's sentence-level annotators first
        // (e.g. an assignment for a given word).
        for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
            sentenceAnnotator.annotate(current);
        }

        // Tokenise the sentence.
        Tokeniser tokeniser = Tokeniser.getInstance(sessionId);
        TokenSequence tokens = tokeniser.tokeniseSentence(current);

        // Pos-tag the token sequence and print the result.
        PosTagger tagger = PosTaggers.getPosTagger(sessionId);
        PosTagSequence taggedTokens = tagger.tagSentence(tokens);
        System.out.println(taggedTokens);

        // Parse the pos-tag sequence and print the resulting configuration.
        Parser parser = Parsers.getParser(sessionId);
        ParseConfiguration parse = parser.parseSentence(taggedTokens);
        System.out.println(parse);

        // Print the tree view built from the parse configuration.
        ParseTree tree = new ParseTree(parse, true);
        System.out.println(tree);
    }
}
Also used : AnnotatedText(com.joliciel.talismane.AnnotatedText) RawTextAnnotator(com.joliciel.talismane.rawText.RawTextAnnotator) RawText(com.joliciel.talismane.rawText.RawText) OptionParser(joptsimple.OptionParser) SentenceDetector(com.joliciel.talismane.sentenceDetector.SentenceDetector) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) Tokeniser(com.joliciel.talismane.tokeniser.Tokeniser) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) PosTagger(com.joliciel.talismane.posTagger.PosTagger)

Example 28 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class StandoffReader method hasNextSentence.

/**
 * Reports whether a parse configuration is available, building the next one
 * from the stand-off tokens when none is pending.
 *
 * Nothing is built when a positive maximum sentence count has been reached,
 * when a configuration is already pending, or when all sentences have been
 * used. Otherwise the next sentence's tokens are joined into text (spacing
 * decided by the session's linguistic rules), annotated, turned into a
 * pos-tagged sequence, and the dependencies found in the relation map are
 * added. Punctuation tokens without a relation are attached to the nearest
 * preceding non-punctuation token when a punctuation label is set.
 *
 * @return true if a parse configuration is pending after this call
 * @throws TalismaneException if a relation refers to an unknown head or
 *                            dependent token id
 * @throws IOException declared by the overridden method
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    final boolean limitReached = this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount();
    if (limitReached || configuration != null || sentenceIndex >= sentences.size()) {
        // Either we've reached the end, a configuration is already waiting,
        // or there is nothing left to read: do nothing.
        return configuration != null;
    }

    List<StandoffToken> standoffTokens = sentences.get(sentenceIndex++);
    LinguisticRules linguisticRules = TalismaneSession.get(sessionId).getLinguisticRules();
    if (linguisticRules == null) {
        throw new RuntimeException("Linguistic rules have not been set.");
    }

    // Rebuild the sentence text, letting the linguistic rules decide where
    // a space goes between consecutive tokens.
    String sentenceText = "";
    for (StandoffToken current : standoffTokens) {
        String surface = current.text;
        if (linguisticRules.shouldAddSpace(sentenceText, surface)) {
            sentenceText += " ";
        }
        sentenceText += surface;
    }

    Sentence rebuilt = new Sentence(sentenceText, sessionId);
    for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
        sentenceAnnotator.annotate(rebuilt);
    }

    // Create one pos-tagged token per stand-off token, indexed by stand-off id.
    PretokenisedSequence pretokenised = new PretokenisedSequence(rebuilt, sessionId);
    PosTagSequence tagged = new PosTagSequence(pretokenised);
    Map<String, PosTaggedToken> tokensById = new HashMap<String, PosTaggedToken>();
    for (StandoffToken current : standoffTokens) {
        Token added = pretokenised.addToken(current.text);
        Decision tagDecision = new Decision(current.posTag.getCode());
        PosTaggedToken taggedToken = new PosTaggedToken(added, tagDecision, sessionId);
        if (LOG.isTraceEnabled()) {
            LOG.trace(taggedToken.toString());
        }
        taggedToken.setComment(current.comment);
        tagged.addPosTaggedToken(taggedToken);
        tokensById.put(current.id, taggedToken);
        LOG.debug("Found token " + current.id + ", " + taggedToken);
    }
    pretokenised.setWithRoot(true);

    configuration = new ParseConfiguration(tagged);
    for (StandoffToken current : standoffTokens) {
        StandoffRelation relation = relationMap.get(current.id);
        if (relation != null) {
            // Explicit relation: both ends must be known tokens.
            PosTaggedToken head = tokensById.get(relation.fromToken);
            PosTaggedToken dependent = tokensById.get(relation.toToken);
            if (head == null) {
                throw new TalismaneException("No token found for head id: " + relation.fromToken);
            }
            if (dependent == null) {
                throw new TalismaneException("No token found for dependent id: " + relation.toToken);
            }
            DependencyArc arc = configuration.addDependency(head, dependent, relation.label, null);
            arc.setComment(relation.comment);
        } else if (current.posTag.getOpenClassIndicator() == PosTagOpenClassIndicator.PUNCTUATION
                && punctuationDepLabel != null) {
            // Unattached punctuation: walk backwards to the closest token
            // that is not itself punctuation and attach to it.
            PosTaggedToken dependent = tokensById.get(current.id);
            int candidateIndex = dependent.getIndex() - 1;
            while (candidateIndex >= 0) {
                PosTaggedToken candidate = tagged.get(candidateIndex);
                if (candidate.getTag().getOpenClassIndicator() != PosTagOpenClassIndicator.PUNCTUATION) {
                    configuration.addDependency(candidate, dependent, punctuationDepLabel, null);
                    break;
                }
                candidateIndex--;
            }
        }
    }
    return (configuration != null);
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) TalismaneException(com.joliciel.talismane.TalismaneException) Token(com.joliciel.talismane.tokeniser.Token) Decision(com.joliciel.talismane.machineLearning.Decision) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) PretokenisedSequence(com.joliciel.talismane.tokeniser.PretokenisedSequence) LinguisticRules(com.joliciel.talismane.LinguisticRules) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) DependencyArc(com.joliciel.talismane.parser.DependencyArc) Sentence(com.joliciel.talismane.rawText.Sentence)

Example 29 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class PatternEventStream method hasNext.

/**
 * Reports whether pattern matches are available, reading further sentences
 * from the corpus reader until one yields at least one match or the reader
 * has no more sentences.
 *
 * For each sentence read, the sentence text is re-tokenised with the default
 * tokens, every test pattern is matched against it, and one match/outcome
 * pair is recorded per match sequence. The recorded outcome is the tag of the
 * first checked token, unless a later checked token carries a different tag,
 * in which case the default outcome of the first checked token is used.
 *
 * @return true if the current list of pattern matches is non-null after this call
 * @throws TalismaneException propagated from the calls made in this method
 * @throws IOException propagated from the calls made in this method
 */
@Override
public boolean hasNext() throws TalismaneException, IOException {
    // Discard the current batch once currentIndex has reached its end.
    if (currentPatternMatches != null && currentIndex == currentPatternMatches.size()) {
        currentPatternMatches = null;
    }

    while (currentPatternMatches == null && this.corpusReader.hasNextSentence()) {
        currentPatternMatches = new ArrayList<TokenPatternMatch>();
        currentOutcomes = new ArrayList<TokeniserOutcome>();
        currentIndex = 0;

        // Token splits come from the sequence supplied by the corpus reader.
        TokenSequence corpusSequence = corpusReader.nextTokenSequence();
        List<Integer> corpusSplits = corpusSequence.getTokenSplits();
        String sentenceText = corpusSequence.getSentence().getText().toString();
        LOG.debug("Sentence: " + sentenceText);

        // Re-tokenise the same text using the default tokens only.
        Sentence rebuilt = new Sentence(sentenceText, sessionId);
        TokenSequence defaultSequence = new TokenSequence(rebuilt, sessionId);
        defaultSequence.findDefaultTokens();
        List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(defaultSequence);
        List<TaggedToken<TokeniserOutcome>> taggedTokens = this.getTaggedTokens(defaultSequence, corpusSplits);

        // Check whether anything matches each pattern.
        for (TokenPattern pattern : this.tokeniserPatternManager.getParsedTestPatterns()) {
            for (TokenPatternMatchSequence matchSequence : pattern.match(defaultSequence)) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Matched pattern: " + pattern + ": " + matchSequence.getTokenSequence());
                }

                // Check whether the entire pattern is separated or joined.
                TokeniserOutcome sharedOutcome = null;
                TokeniserOutcome fallbackOutcome = null;
                boolean mismatchFound = false;
                TokenPatternMatch representative = null;
                for (Token token : matchSequence.getTokensToCheck()) {
                    if (representative == null) {
                        // Look up the pattern match attached to this token.
                        for (TokenPatternMatch candidate : matchSequence.getTokenPatternMatches()) {
                            if (candidate.getToken().equals(token)) {
                                representative = candidate;
                                break;
                            }
                        }
                    }

                    TaggedToken<TokeniserOutcome> tagged = taggedTokens.get(token.getIndexWithWhiteSpace());
                    if (sharedOutcome == null) {
                        sharedOutcome = tagged.getTag();
                        fallbackOutcome = defaultOutcomes.get(token.getIndexWithWhiteSpace());
                    } else if (tagged.getTag() != sharedOutcome) {
                        // This should only happen when two patterns overlap,
                        // e.g. "aussi bien que" and "bien que", or
                        // "plutot que" and "plutot que de", AND the outer
                        // pattern is separated while the inner one is joined.
                        LOG.debug("Mismatch in pattern: " + representative + ", " + tagged);
                        mismatchFound = true;
                    }
                }

                currentPatternMatches.add(representative);
                currentOutcomes.add(mismatchFound ? fallbackOutcome : sharedOutcome);
            }
        }

        // A sentence without any match is skipped: reset and keep reading.
        if (currentPatternMatches.isEmpty()) {
            currentPatternMatches = null;
            currentOutcomes = null;
        }
    }
    return currentPatternMatches != null;
}
Also used : TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence)

Example 30 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class RegexTokenAnnotatorTest method testApplyWithUnmatchingGroups.

/**
 * Verifies that a replacement referring to two capture groups still works
 * when the second, optional group does not take part in the match: "42"
 * must yield "Number42" and "7" must yield "Number7", each at the position
 * of the number in the sentence.
 */
@Test
public void testApplyWithUnmatchingGroups() throws Exception {
    // Select the test configuration file and force it to be re-read.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config testConfig = ConfigFactory.load();
    final String sessionId = "test";

    // One mandatory digit followed by an optional second digit.
    RegexTokenAnnotator annotator = new RegexTokenAnnotator("\\b(\\d)(\\d)?\\b", "Number$1$2", null, sessionId);
    Sentence sentence = new Sentence("Two-digit number: 42. One-digit number: 7.", sessionId);
    annotator.annotate(sentence);

    List<Annotation<TokenPlaceholder>> found = sentence.getAnnotations(TokenPlaceholder.class);
    LOG.debug(found.toString());
    assertEquals(2, found.size());

    // First placeholder covers "42": both groups matched.
    Annotation<TokenPlaceholder> twoDigits = found.get(0);
    assertEquals("Two-digit number: ".length(), twoDigits.getStart());
    assertEquals("Two-digit number: 42".length(), twoDigits.getEnd());
    assertEquals("Number42", twoDigits.getData().getReplacement());

    // Second placeholder covers "7": the optional group is unmatched.
    Annotation<TokenPlaceholder> oneDigit = found.get(1);
    assertEquals("Two-digit number: 42. One-digit number: ".length(), oneDigit.getStart());
    assertEquals("Two-digit number: 42. One-digit number: 7".length(), oneDigit.getEnd());
    assertEquals("Number7", oneDigit.getData().getReplacement());
}
Also used : Config(com.typesafe.config.Config) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Aggregations

Sentence (com.joliciel.talismane.rawText.Sentence)43 Config (com.typesafe.config.Config)31 TalismaneTest (com.joliciel.talismane.TalismaneTest)28 Test (org.junit.Test)28 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)25 Token (com.joliciel.talismane.tokeniser.Token)14 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)13 Annotation (com.joliciel.talismane.Annotation)12 Decision (com.joliciel.talismane.machineLearning.Decision)11 ArrayList (java.util.ArrayList)9 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)8 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)7 TalismaneException (com.joliciel.talismane.TalismaneException)6 HashMap (java.util.HashMap)6 List (java.util.List)6 StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature)5 ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration)5 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)5 PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl)5 SentenceAnnotator (com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)5