Search in sources :

Example 41 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class PosTaggerStatisticsWriter method onNextPosTagSequence.

/**
 * Folds one pos-tagged sentence into the running corpus statistics.
 *
 * <p>
 * The sentence counter and sentence-length statistics are updated once per
 * call. Every token whose tag is not {@code PosTag.ROOT_POS_TAG} then
 * contributes to the word sets, the open/closed class counters, the
 * "unknown" counters (relative to the lexicon and, when
 * {@code referenceStats} is set, to the reference corpus), the alphanumeric
 * counters and the per-tag-code counts.
 *
 * @param posTagSequence
 *            the tagged sentence to account for
 * @throws TalismaneException
 *             propagated from the calls made while collecting statistics
 */
@Override
public void onNextPosTagSequence(PosTagSequence posTagSequence) throws TalismaneException {
    stats.sentenceCount++;
    stats.sentenceLengthStats.addValue(posTagSequence.size());

    for (PosTaggedToken taggedToken : posTagSequence) {
        PosTag tag = taggedToken.getTag();
        if (tag.equals(PosTag.ROOT_POS_TAG)) {
            // tokens carrying the root tag are left out of every count
            continue;
        }

        Token token = taggedToken.getToken();
        // a token is "known in the lexicon" when it has at least one possible pos-tag
        boolean inLexicon = token.getPossiblePosTags().size() > 0;
        String text = token.getOriginalText();
        stats.words.add(text);
        // without reference statistics every token counts as unknown in the reference corpus
        boolean inRefCorpus = referenceStats != null && referenceStats.words.contains(text);

        if (!inLexicon) {
            stats.unknownInLexiconCount++;
        }

        // tags that are neither CLOSED nor OPEN touch neither group of counters
        PosTagOpenClassIndicator indicator = tag.getOpenClassIndicator();
        if (indicator == PosTagOpenClassIndicator.CLOSED) {
            stats.closedClassCount++;
            if (!inRefCorpus) {
                stats.closedClassUnknownInRefCorpus++;
            }
            if (!inLexicon) {
                stats.closedClassUnknownInLexicon++;
            }
        } else if (indicator == PosTagOpenClassIndicator.OPEN) {
            stats.openClassCount++;
            if (!inRefCorpus) {
                stats.openClassUnknownInRefCorpus++;
            }
            if (!inLexicon) {
                stats.openClassUnknownInLexicon++;
            }
        }

        if (!inRefCorpus) {
            stats.unknownTokenCount++;
        }

        if (alphanumeric.matcher(text).find()) {
            // lower-casing uses the locale of the current session
            stats.lowerCaseWords.add(text.toLowerCase(TalismaneSession.get(sessionId).getLocale()));
            stats.alphanumericCount++;
            if (!inRefCorpus) {
                stats.unknownAlphanumericCount++;
            }
            if (!inLexicon) {
                stats.unknownAlphaInLexiconCount++;
            }
        }

        stats.tokenCount++;

        // increment the number of occurrences recorded for this tag code
        String code = tag.getCode();
        Integer previous = stats.posTagCounts.get(code);
        stats.posTagCounts.put(code, previous == null ? 1 : previous.intValue() + 1);
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) Token(com.joliciel.talismane.tokeniser.Token)

Example 42 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class UppercaseSeriesFilter method apply.

/**
 * Collects runs of consecutive upper-case tokens and hands every run of at
 * least two tokens to {@code checkSequence}.
 *
 * <p>
 * A token containing a lower-case character ends the current run. A token
 * with neither upper-case nor lower-case characters is skipped without
 * ending the run. Empty tokens are ignored.
 *
 * @param tokenSequence
 *            the token sequence to scan
 */
@Override
public void apply(TokenSequence tokenSequence) {
    List<Token> currentRun = new ArrayList<Token>();
    for (Token token : tokenSequence) {
        String text = token.getText();
        if (text.isEmpty()) {
            continue;
        }

        // classify the token; scanning stops at the first lower-case character
        boolean sawLower = false;
        boolean sawUpper = false;
        for (char ch : text.toCharArray()) {
            if (Character.isLowerCase(ch)) {
                sawLower = true;
                break;
            }
            if (Character.isUpperCase(ch)) {
                sawUpper = true;
            }
        }

        if (sawLower) {
            // a lower-case token closes the run: process it if long enough, then start afresh
            if (currentRun.size() > 1) {
                this.checkSequence(currentRun);
            }
            currentRun.clear();
        } else if (sawUpper) {
            currentRun.add(token);
        }
        // otherwise: might be punctuation or a number in the middle of an
        // upper-case sequence, so the run is left untouched
    }

    // a run that reaches the end of the sequence still has to be processed
    if (currentRun.size() > 1) {
        this.checkSequence(currentRun);
    }
}
Also used : ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token)

Example 43 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class PatternEventStream method hasNext.

/**
 * Indicates whether another pattern match is available, reading further
 * sentences from the corpus reader when the matches of the current sentence
 * have all been consumed.
 *
 * <p>
 * Sentences are read until one produces at least one pattern match or the
 * corpus reader has no more sentences. For each sentence,
 * {@code currentPatternMatches} and {@code currentOutcomes} are filled in
 * parallel (one outcome per match) and {@code currentIndex} is reset to 0.
 *
 * @return {@code true} if {@code currentPatternMatches} is non-null when the
 *         method finishes, {@code false} if the corpus is exhausted
 * @throws TalismaneException
 *             propagated from the calls made here
 * @throws IOException
 *             propagated from the calls made here
 */
@Override
public boolean hasNext() throws TalismaneException, IOException {
    // all matches of the current sentence have been consumed: drop them so
    // that the loop below moves on to the next sentence
    if (currentPatternMatches != null) {
        if (currentIndex == currentPatternMatches.size()) {
            currentPatternMatches = null;
        }
    }
    while (currentPatternMatches == null) {
        if (this.corpusReader.hasNextSentence()) {
            currentPatternMatches = new ArrayList<TokenPatternMatch>();
            currentOutcomes = new ArrayList<TokeniserOutcome>();
            currentIndex = 0;
            // the sequence as read from the corpus supplies the token splits
            TokenSequence realSequence = corpusReader.nextTokenSequence();
            List<Integer> tokenSplits = realSequence.getTokenSplits();
            String text = realSequence.getSentence().getText().toString();
            LOG.debug("Sentence: " + text);
            // rebuild a sequence from the raw text using default tokens only
            Sentence sentence = new Sentence(text, sessionId);
            TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
            tokenSequence.findDefaultTokens();
            List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(tokenSequence);
            // presumably tags each default token with the outcome implied by
            // the corpus token splits - confirm against getTaggedTokens
            List<TaggedToken<TokeniserOutcome>> currentSentence = this.getTaggedTokens(tokenSequence, tokenSplits);
            // check if anything matches each pattern
            for (TokenPattern parsedPattern : this.tokeniserPatternManager.getParsedTestPatterns()) {
                List<TokenPatternMatchSequence> tokenPatternMatches = parsedPattern.match(tokenSequence);
                for (TokenPatternMatchSequence tokenPatternMatchSequence : tokenPatternMatches) {
                    if (LOG.isTraceEnabled())
                        LOG.trace("Matched pattern: " + parsedPattern + ": " + tokenPatternMatchSequence.getTokenSequence());
                    // check if entire pattern is separated or joined
                    TokeniserOutcome outcome = null;
                    TokeniserOutcome defaultOutcome = null;
                    boolean haveMismatch = false;
                    TokenPatternMatch tokenPatternMatch = null;
                    for (Token token : tokenPatternMatchSequence.getTokensToCheck()) {
                        // until one is found, look for the pattern match
                        // attached to the current token; once found it is
                        // kept for the rest of this match sequence
                        if (tokenPatternMatch == null) {
                            for (TokenPatternMatch patternMatch : tokenPatternMatchSequence.getTokenPatternMatches()) {
                                if (patternMatch.getToken().equals(token)) {
                                    tokenPatternMatch = patternMatch;
                                    break;
                                }
                            }
                        }
                        TaggedToken<TokeniserOutcome> taggedToken = currentSentence.get(token.getIndexWithWhiteSpace());
                        if (outcome == null) {
                            // the first checked token fixes the expected
                            // outcome and the fallback default outcome
                            outcome = taggedToken.getTag();
                            defaultOutcome = defaultOutcomes.get(token.getIndexWithWhiteSpace());
                        } else if (taggedToken.getTag() != outcome) {
                            // this should only happen when two patterns
                            // overlap:
                            // e.g. "aussi bien que" and "bien que", or
                            // "plutot que" and "plutot que de"
                            // AND the outer pattern is separated, while
                            // the inner pattern is joined
                            LOG.debug("Mismatch in pattern: " + tokenPatternMatch + ", " + taggedToken);
                            haveMismatch = true;
                        }
                    }
                    // NOTE(review): if there are no tokens to check, or none of
                    // them has a corresponding pattern match, a null match
                    // is added here - confirm this cannot happen
                    currentPatternMatches.add(tokenPatternMatch);
                    // when the checked tokens disagree, fall back to the
                    // default outcome of the first checked token
                    if (haveMismatch) {
                        currentOutcomes.add(defaultOutcome);
                    } else {
                        currentOutcomes.add(outcome);
                    }
                }
            }
            // no pattern matched in this sentence: reset so that the loop
            // tries the next sentence
            if (currentPatternMatches.size() == 0) {
                currentPatternMatches = null;
                currentOutcomes = null;
            }
        } else {
            // corpus exhausted
            break;
        }
    }
    return currentPatternMatches != null;
}
Also used : TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence)

Example 44 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class LowercaseKnownFirstWordFilterTest method testReplace.

/**
 * Applies {@code LowercaseKnownFirstWordFilter} to a hand-tokenised sentence
 * and expects "J'" and "Il" to come out in lower case while "Malade" keeps
 * its capital.
 */
@Test
public void testReplace() throws Exception {
    // the diacriticizer is stubbed to return lower-case variants for "J'" and "Il"
    Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);
    when(mockDiacriticizer.diacriticize("J'")).thenReturn(new HashSet<>(Arrays.asList("j'")));
    when(mockDiacriticizer.diacriticize("Il")).thenReturn(new HashSet<>(Arrays.asList("il")));

    // point the configuration at the test file and reload it
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);
    LowercaseKnownFirstWordFilter lowercaseFilter = new LowercaseKnownFirstWordFilter(sessionId);

    String text = "J'avais oublié : Il est Malade.";
    Sentence sentence = new Sentence(text, sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);

    // each token is added by its start and end offsets, spelled out as prefix lengths
    tokenSequence.addToken("".length(), "J'".length());
    tokenSequence.addToken("J'".length(), "J'avais".length());
    tokenSequence.addToken("J'avais ".length(), "J'avais oublié".length());
    tokenSequence.addToken("J'avais oublié ".length(), "J'avais oublié :".length());
    tokenSequence.addToken("J'avais oublié : ".length(), "J'avais oublié : Il".length());
    tokenSequence.addToken("J'avais oublié : Il ".length(), "J'avais oublié : Il est".length());
    tokenSequence.addToken("J'avais oublié : Il est ".length(), "J'avais oublié : Il est Malade".length());
    tokenSequence.addToken("J'avais oublié : Il est Malade".length(), "J'avais oublié : Il est Malade.".length());

    lowercaseFilter.apply(tokenSequence);
    System.out.println(tokenSequence);

    // join the resulting token texts, each followed by a pipe
    StringBuilder joined = new StringBuilder();
    for (Token token : tokenSequence) {
        joined.append(token.getText()).append('|');
    }
    assertEquals("j'|avais|oublié|:|il|est|Malade|.|", joined.toString());
}
Also used : Config(com.typesafe.config.Config) Token(com.joliciel.talismane.tokeniser.Token) Diacriticizer(com.joliciel.talismane.lexicon.Diacriticizer) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 45 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class LowercaseKnownFirstWordFilterTest method testReplaceLongWord.

/**
 * Applies {@code LowercaseKnownFirstWordFilter} to a hand-tokenised text
 * whose capitalised tokens are "Aujourd'hui" and the two-word token
 * "Parce que", and expects both to come out in lower case.
 */
@Test
public void testReplaceLongWord() throws Exception {
    // the diacriticizer is stubbed to return lower-case variants for the two capitalised tokens
    Diacriticizer mockDiacriticizer = mock(Diacriticizer.class);
    when(mockDiacriticizer.diacriticize("Aujourd'hui")).thenReturn(new HashSet<>(Arrays.asList("aujourd'hui")));
    when(mockDiacriticizer.diacriticize("Parce que")).thenReturn(new HashSet<>(Arrays.asList("parce que")));

    // point the configuration at the test file and reload it
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    TalismaneSession.get(sessionId).setDiacriticizer(mockDiacriticizer);
    LowercaseKnownFirstWordFilter lowercaseFilter = new LowercaseKnownFirstWordFilter(sessionId);

    String text = "Aujourd'hui il vient. Parce que...";
    Sentence sentence = new Sentence(text, sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);

    // each token is added by its start and end offsets, spelled out as prefix lengths
    tokenSequence.addToken("".length(), "Aujourd'hui".length());
    tokenSequence.addToken("Aujourd'hui ".length(), "Aujourd'hui il".length());
    tokenSequence.addToken("Aujourd'hui il ".length(), "Aujourd'hui il vient".length());
    tokenSequence.addToken("Aujourd'hui il vient".length(), "Aujourd'hui il vient.".length());
    tokenSequence.addToken("Aujourd'hui il vient. ".length(), "Aujourd'hui il vient. Parce que".length());
    tokenSequence.addToken("Aujourd'hui il vient. Parce que".length(), "Aujourd'hui il vient. Parce que...".length());

    lowercaseFilter.apply(tokenSequence);
    System.out.println(tokenSequence);

    // join the resulting token texts, each followed by a pipe
    StringBuilder joined = new StringBuilder();
    for (Token token : tokenSequence) {
        joined.append(token.getText()).append('|');
    }
    assertEquals("aujourd'hui|il|vient|.|parce que|...|", joined.toString());
}
Also used : Config(com.typesafe.config.Config) Token(com.joliciel.talismane.tokeniser.Token) Diacriticizer(com.joliciel.talismane.lexicon.Diacriticizer) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Aggregations

Token (com.joliciel.talismane.tokeniser.Token)69 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)16 ArrayList (java.util.ArrayList)15 Sentence (com.joliciel.talismane.rawText.Sentence)14 Decision (com.joliciel.talismane.machineLearning.Decision)12 Config (com.typesafe.config.Config)12 TalismaneTest (com.joliciel.talismane.TalismaneTest)11 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)11 Test (org.junit.Test)11 TalismaneException (com.joliciel.talismane.TalismaneException)7 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)7 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)7 TokeniserOutcome (com.joliciel.talismane.tokeniser.TokeniserOutcome)7 List (java.util.List)7 WeightedOutcome (com.joliciel.talismane.utils.WeightedOutcome)6 HashMap (java.util.HashMap)6 StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature)5 PosTag (com.joliciel.talismane.posTagger.PosTag)5 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)5 PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl)5