Search in sources :

Example 1 with SeparatorDecision

use of com.joliciel.talismane.tokeniser.SeparatorDecision in project talismane by joliciel-informatique.

From the class TokeniserPatternManager, method getDefaultOutcomes.

/**
 * Takes a sequence of atomic tokens and applies default decisions for each
 * separator.
 * <p>
 * Exactly one outcome is produced per token returned by
 * {@code tokenSequence.listWithWhiteSpace()}, in iteration order. A token
 * whose analysis text matches the session's token-separator pattern is a
 * separator: the first entry of {@link #getSeparatorDefaultPatterns()} whose
 * pattern matches it decides both the outcome for the separator itself and
 * the outcome carried over to the non-separator tokens that follow. A
 * separator with no matching default is separated on both sides. A
 * non-separator token simply receives the outcome carried over from the most
 * recent separator (initially {@code SEPARATE}).
 *
 * @param tokenSequence the sequence of atomic tokens to assign outcomes to
 * @return the default outcome for each token, in the same order as
 *         {@code tokenSequence.listWithWhiteSpace()}
 */
public List<TokeniserOutcome> getDefaultOutcomes(TokenSequence tokenSequence) {
    List<TokeniserOutcome> defaultOutcomes = new ArrayList<TokeniserOutcome>();
    // Outcome handed to non-separator tokens; updated by each separator encountered
    TokeniserOutcome nextOutcome = TokeniserOutcome.SEPARATE;
    Pattern tokenSeparators = Tokeniser.getTokenSeparators(sessionId);
    for (Token token : tokenSequence.listWithWhiteSpace()) {
        CharSequence analysisText = token.getAnalyisText();
        if (!tokenSeparators.matcher(analysisText).matches()) {
            // Not a separator: inherit whatever the previous separator decided
            defaultOutcomes.add(nextOutcome);
            continue;
        }

        TokeniserOutcome outcome = null;
        boolean defaultValueFound = false;
        for (Entry<SeparatorDecision, Pattern> entry : this.getSeparatorDefaultPatterns().entrySet()) {
            if (!entry.getValue().matcher(analysisText).matches()) {
                continue;
            }
            defaultValueFound = true;
            SeparatorDecision defaultSeparatorDecision = entry.getKey();
            switch (defaultSeparatorDecision) {
                case IS_SEPARATOR:
                    outcome = TokeniserOutcome.SEPARATE;
                    nextOutcome = TokeniserOutcome.SEPARATE;
                    break;
                case IS_NOT_SEPARATOR:
                    outcome = TokeniserOutcome.JOIN;
                    nextOutcome = TokeniserOutcome.JOIN;
                    break;
                case IS_SEPARATOR_BEFORE:
                    outcome = TokeniserOutcome.SEPARATE;
                    nextOutcome = TokeniserOutcome.JOIN;
                    // Explicit break: without it this case fell through to
                    // IS_SEPARATOR_AFTER, which overwrote both values.
                    break;
                case IS_SEPARATOR_AFTER:
                    outcome = TokeniserOutcome.JOIN;
                    nextOutcome = TokeniserOutcome.SEPARATE;
                    break;
                case NOT_APPLICABLE:
                    // NOTE(review): outcome stays null here, so a null entry is
                    // added to the result and nextOutcome is left untouched.
                    // Behaviour preserved as-is; confirm this is intended.
                    break;
                default:
                    break;
            }
            // Only the first matching default pattern is applied
            break;
        }
        if (!defaultValueFound) {
            // Separator without a configured default: separate on both sides
            outcome = TokeniserOutcome.SEPARATE;
            nextOutcome = TokeniserOutcome.SEPARATE;
        }
        defaultOutcomes.add(outcome);
    }
    return defaultOutcomes;
}
Also used : Pattern(java.util.regex.Pattern) SeparatorDecision(com.joliciel.talismane.tokeniser.SeparatorDecision) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome)

Example 2 with SeparatorDecision

use of com.joliciel.talismane.tokeniser.SeparatorDecision in project talismane by joliciel-informatique.

From the class TokeniserPatternManager, method getSeparatorDefaultPatterns.

/**
 * Returns the map from each separator decision to a pattern matching any
 * single character listed for that decision in
 * {@code getSeparatorDefaults()}.
 * <p>
 * The map is built on first call and stored in
 * {@code separatorDefaultPatterns}; later calls return the stored map. Each
 * listed character is backslash-escaped and placed inside a character class,
 * compiled with {@link Pattern#UNICODE_CHARACTER_CLASS}.
 *
 * @return the stored map of separator decision to character-class pattern
 */
protected Map<SeparatorDecision, Pattern> getSeparatorDefaultPatterns() {
    if (this.separatorDefaultPatterns != null) {
        return this.separatorDefaultPatterns;
    }

    this.separatorDefaultPatterns = new HashMap<SeparatorDecision, Pattern>();
    for (Entry<SeparatorDecision, String> defaultEntry : this.getSeparatorDefaults().entrySet()) {
        StringBuilder characterClass = new StringBuilder("[");
        for (char separator : defaultEntry.getValue().toCharArray()) {
            // Escape every character so it is taken literally inside the class
            characterClass.append('\\').append(separator);
        }
        characterClass.append("]");

        Pattern compiled = Pattern.compile(characterClass.toString(), Pattern.UNICODE_CHARACTER_CLASS);
        this.separatorDefaultPatterns.put(defaultEntry.getKey(), compiled);
    }
    return this.separatorDefaultPatterns;
}
Also used : Pattern(java.util.regex.Pattern) SeparatorDecision(com.joliciel.talismane.tokeniser.SeparatorDecision)

Aggregations

SeparatorDecision (com.joliciel.talismane.tokeniser.SeparatorDecision): 2; Pattern (java.util.regex.Pattern): 2; Token (com.joliciel.talismane.tokeniser.Token): 1; TokeniserOutcome (com.joliciel.talismane.tokeniser.TokeniserOutcome): 1; ArrayList (java.util.ArrayList): 1