Search in sources :

Example 56 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class LowercaseKnownFirstWordFilter method apply.

/**
 * Walks the token sequence and, for the first non-empty token and for every
 * non-empty token following a separator that is not matched by
 * {@code noUppercasePunctuation}, replaces an upper-case-initial text by the
 * first candidate returned by the session's diacriticizer (if there is one).
 *
 * @param tokenSequence the sequence whose tokens may have their text replaced in place
 */
@Override
public void apply(TokenSequence tokenSequence) {
    // When the sequence carries a root token, it sits at position 0 and is skipped.
    final int firstCandidate = tokenSequence.isWithRoot() ? 1 : 0;
    boolean candidateExpected = true;
    int position = 0;
    for (Token token : tokenSequence) {
        final int current = position++;
        if (current < firstCandidate || token.getText().length() == 0) {
            continue;
        }

        if (candidateExpected) {
            // Only the first non-empty token after a trigger is considered.
            candidateExpected = false;
            String text = token.getText();
            if (Character.isUpperCase(text.charAt(0))) {
                Set<String> replacements = TalismaneSession.get(sessionId).getDiacriticizer().diacriticize(text);
                if (!replacements.isEmpty()) {
                    token.setText(replacements.iterator().next());
                }
            }
        }

        // Re-read the text: it may just have been replaced above.
        String currentText = token.getText();
        boolean isSeparator = Tokeniser.getTokenSeparators(sessionId).matcher(currentText).matches();
        if (isSeparator && !noUppercasePunctuation.matcher(currentText).matches()) {
            candidateExpected = true;
        }
    }
}
Also used : Token(com.joliciel.talismane.tokeniser.Token)

Example 57 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class TokenEvaluationCorpusWriter method onNextTokenSequence.

/**
 * Writes one evaluation record per real token to {@code corpusWriter}.
 * <p>
 * Output layout: the sentence text on its own line, then for each token of
 * {@code realSequence} a line made of the token's original text, a tab and a
 * boolean telling whether every guessed decision falling inside the token agrees
 * with the real one, followed by the tab-separated authorities of those decisions
 * (authorities starting with "_" are omitted). A blank line closes the record.
 * Only the first guessed sequence is evaluated.
 *
 * @param realSequence the reference token sequence
 * @param guessedAtomicSequences the guessed sequences; only element 0 is read
 * @throws IOException if writing to the corpus writer fails
 */
@Override
public void onNextTokenSequence(TokenSequence realSequence, List<TokenisedAtomicTokenSequence> guessedAtomicSequences) throws IOException {
    List<Integer> realSplits = realSequence.getTokenSplits();
    TokenisedAtomicTokenSequence bestGuess = guessedAtomicSequences.get(0);

    // Per-position bookkeeping, keyed by the start index of each atomic token.
    List<Integer> positions = new ArrayList<Integer>();
    Map<Integer, TokeniserOutcome> expectedByPosition = new HashMap<Integer, TokeniserOutcome>();
    Map<Integer, TokeniserOutcome> guessedByPosition = new HashMap<Integer, TokeniserOutcome>();
    Map<Integer, List<String>> authoritiesByPosition = new HashMap<Integer, List<String>>();

    corpusWriter.write(realSequence.getSentence().getText() + "\n");

    for (TaggedToken<TokeniserOutcome> tagged : bestGuess) {
        TokeniserOutcome guessed = tagged.getTag();
        int position = tagged.getToken().getStartIndex();
        TokeniserOutcome expected;
        if (realSplits.contains(position)) {
            expected = TokeniserOutcome.SEPARATE;
        } else {
            expected = TokeniserOutcome.JOIN;
        }
        positions.add(position);
        expectedByPosition.put(position, expected);
        guessedByPosition.put(position, guessed);
        authoritiesByPosition.put(position, tagged.getDecision().getAuthorities());
    }

    int windowStart = 0;
    for (Token token : realSequence) {
        corpusWriter.write(token.getOriginalText());

        // Collect every decision located in [windowStart, end of this token).
        int windowEnd = token.getEndIndex();
        boolean allCorrect = true;
        Set<String> seenAuthorities = new TreeSet<String>();
        for (int position : positions) {
            if (position < windowStart || position >= windowEnd) {
                continue;
            }
            if (expectedByPosition.get(position) != guessedByPosition.get(position)) {
                allCorrect = false;
            }
            seenAuthorities.addAll(authoritiesByPosition.get(position));
        }

        corpusWriter.write("\t" + allCorrect);
        for (String authority : seenAuthorities) {
            if (authority.startsWith("_")) {
                continue;
            }
            corpusWriter.write("\t" + authority);
        }
        corpusWriter.write("\n");
        corpusWriter.flush();

        windowStart = windowEnd;
    }
    corpusWriter.write("\n");
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Example 58 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class PossibleSentenceBoundary method getTokenIndexWithWhitespace.

/**
 * Index of this boundary's token, including whitespace.
 * <p>
 * The value is computed on first use and cached in {@code tokenIndex}: the
 * whitespace-inclusive token list is binary-searched for the token whose
 * {@code [startIndex, endIndex)} range contains {@code index}.
 *
 * @return the whitespace-inclusive index of the token containing this boundary
 * @throws RuntimeException if no token contains {@code index} (including when
 *         the token list is empty)
 */
public int getTokenIndexWithWhitespace() {
    if (tokenIndex < 0) {
        // NOTE(review): like the previous implementation, this assumes the list is
        // ordered by start index with non-overlapping ranges - confirm against
        // TokenSequence.listWithWhiteSpace().
        List<Token> tokens = this.getTokenSequence().listWithWhiteSpace();
        int low = 0;
        int high = tokens.size() - 1;
        int current = -1;
        Token match = null;
        // Bounded search: the [low, high] window shrinks on every pass, so the loop
        // terminates even when index falls in a gap between tokens or the list is empty.
        while (low <= high && match == null) {
            current = (low + high) >>> 1;
            Token token = tokens.get(current);
            if (token.getStartIndex() > index) {
                high = current - 1;
            } else if (index < token.getEndIndex()) {
                match = token;
            } else {
                // token starts at or before index but ends before it: look further right
                low = current + 1;
            }
        }
        if (match == null) {
            throw new RuntimeException("Binary search failed. Current = " + current + ", Size = " + tokens.size());
        }
        tokenIndex = match.getIndexWithWhiteSpace();
    }
    return tokenIndex;
}
Also used : Token(com.joliciel.talismane.tokeniser.Token)

Example 59 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class LexiconPosTagFeature method checkInternal.

/**
 * Tests whether at least one pos-tag code produced by {@code posTagFeatures}
 * names a pos-tag contained in the token's possible pos-tags.
 *
 * @param tokenWrapper the wrapper from which the token to test is resolved
 * @param env the runtime environment handed to the nested features
 * @return {@code null} when no token can be resolved, otherwise the result
 *         generated for the boolean match
 * @throws TalismaneException propagated from the nested feature checks
 */
@Override
public FeatureResult<Boolean> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolved = this.getToken(tokenWrapper, env);
    if (resolved == null) {
        return null;
    }
    Token target = resolved.getToken();

    boolean found = false;
    for (StringFeature<TokenWrapper> codeFeature : posTagFeatures) {
        FeatureResult<String> codeResult = codeFeature.check(resolved, env);
        if (codeResult == null) {
            // this feature produced no code: try the next one
            continue;
        }
        PosTag candidate = TalismaneSession.get(sessionId).getPosTagSet().getPosTag(codeResult.getOutcome());
        if (target.getPossiblePosTags().contains(candidate)) {
            // one hit is enough
            found = true;
            break;
        }
    }
    return this.generateResult(found);
}
Also used : PosTag(com.joliciel.talismane.posTagger.PosTag) Token(com.joliciel.talismane.tokeniser.Token)

Example 60 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class LexiconPosTagsFeature method checkInternal.

/**
 * Lists the codes of the token's possible pos-tags, each with a weight of 1.0.
 *
 * @param tokenWrapper the wrapper from which the token to inspect is resolved
 * @param env the runtime environment used to resolve the token
 * @return {@code null} when no token can be resolved or the token has no
 *         possible pos-tags, otherwise the result generated for the list
 * @throws TalismaneException declared by the feature contract
 */
@Override
public FeatureResult<List<WeightedOutcome<String>>> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolved = this.getToken(tokenWrapper, env);
    if (resolved == null) {
        return null;
    }

    Token target = resolved.getToken();
    List<WeightedOutcome<String>> outcomes = new ArrayList<WeightedOutcome<String>>();
    for (PosTag candidate : target.getPossiblePosTags()) {
        String code = candidate.getCode();
        outcomes.add(new WeightedOutcome<String>(code, 1.0));
    }

    // An empty list yields no result at all rather than an empty result.
    if (outcomes.isEmpty()) {
        return null;
    }
    return this.generateResult(outcomes);
}
Also used : PosTag(com.joliciel.talismane.posTagger.PosTag) ArrayList(java.util.ArrayList) WeightedOutcome(com.joliciel.talismane.utils.WeightedOutcome) Token(com.joliciel.talismane.tokeniser.Token) List(java.util.List) ArrayList(java.util.ArrayList)

Aggregations

Token (com.joliciel.talismane.tokeniser.Token): 69
TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence): 16
ArrayList (java.util.ArrayList): 15
Sentence (com.joliciel.talismane.rawText.Sentence): 14
Decision (com.joliciel.talismane.machineLearning.Decision): 12
Config (com.typesafe.config.Config): 12
TalismaneTest (com.joliciel.talismane.TalismaneTest): 11
PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken): 11
Test (org.junit.Test): 11
TalismaneException (com.joliciel.talismane.TalismaneException): 7
RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment): 7
PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence): 7
TokeniserOutcome (com.joliciel.talismane.tokeniser.TokeniserOutcome): 7
List (java.util.List): 7
WeightedOutcome (com.joliciel.talismane.utils.WeightedOutcome): 6
HashMap (java.util.HashMap): 6
StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature): 5
PosTag (com.joliciel.talismane.posTagger.PosTag): 5
PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext): 5
PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl): 5