Search in sources:

Example 1 with LexicalEntry

use of com.joliciel.talismane.lexicon.LexicalEntry in project talismane by joliciel-informatique.

the class LemmaFeature method checkInternal.

/**
 * Produces the lemma of the first lexical entry attached to the pos-tagged
 * token that this feature addresses.
 *
 * @param context the context from which the token wrapper is resolved
 * @param env the runtime environment handed to {@code getToken}
 * @return a result wrapping the first entry's lemma, or {@code null} when
 *         no wrapper is found, the wrapper has no pos-tagged token, or the
 *         token has no lexical entries
 * @throws TalismaneException declared by the feature contract
 */
@Override
protected FeatureResult<String> checkInternal(T context, RuntimeEnvironment env) throws TalismaneException {
    PosTaggedTokenWrapper wrapper = this.getToken(context, env);
    PosTaggedToken taggedToken = (wrapper == null) ? null : wrapper.getPosTaggedToken();
    if (taggedToken == null) {
        return null;
    }

    List<LexicalEntry> entries = taggedToken.getLexicalEntries();
    if (entries.isEmpty()) {
        return null;
    }

    // Only the first entry is consulted; any further entries are ignored.
    return this.generateResult(entries.get(0).getLemma());
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry)

Example 2 with LexicalEntry

use of com.joliciel.talismane.lexicon.LexicalEntry in project talismane by joliciel-informatique.

the class PosTaggerEvaluator method evaluate.

/**
 * Evaluate a given pos tagger.
 * <p>
 * For every sentence supplied by the corpus reader, the reference sequence
 * is read, the sentence is optionally re-tokenised (when a tokeniser is
 * set), and the pos tagger is applied. Each observer is then notified with
 * the reference sequence and the list of guessed sequences. Once the corpus
 * is exhausted, each observer is told that the evaluation is complete.
 * <p>
 * For a deterministic pos tagger the single guess is wrapped in a
 * one-element list, so observers never receive a {@code null} list.
 *
 * @throws TalismaneException
 * @throws IOException
 */
public void evaluate() throws TalismaneException, IOException {
    while (corpusReader.hasNextSentence()) {
        PosTagSequence realPosTagSequence = corpusReader.nextPosTagSequence();
        TokenSequence tokenSequence = realPosTagSequence.getTokenSequence();

        List<TokenSequence> tokenSequences;
        if (this.tokeniser != null) {
            // Re-tokenise the sentence and continue with the tokeniser's first sequence.
            Sentence sentence = tokenSequence.getSentence();
            tokenSequences = tokeniser.tokenise(sentence);
            tokenSequence = tokenSequences.get(0);
        } else {
            tokenSequences = new ArrayList<TokenSequence>();
            tokenSequences.add(tokenSequence);
        }

        List<PosTagSequence> guessedSequences;
        PosTagSequence guessedSequence;
        if (posTagger instanceof NonDeterministicPosTagger) {
            NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
            guessedSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
            guessedSequence = guessedSequences.get(0);
        } else {
            guessedSequence = posTagger.tagSentence(tokenSequence);
            // Previously the list stayed null on this branch and the null
            // was handed to every observer below.
            guessedSequences = new ArrayList<PosTagSequence>();
            guessedSequences.add(guessedSequence);
        }

        if (LOG.isDebugEnabled()) {
            // One "text[tag|lemma/morphology...] " chunk per token.
            StringBuilder stringBuilder = new StringBuilder();
            for (PosTaggedToken posTaggedToken : guessedSequence) {
                Set<String> lemmas = new TreeSet<String>();
                stringBuilder.append(posTaggedToken.getToken().getOriginalText());
                stringBuilder.append("[").append(posTaggedToken.getTag());
                List<LexicalEntry> entries = posTaggedToken.getLexicalEntries();
                // With several entries, the first one whose lemma equals the
                // token text is left out of the debug output.
                boolean dropCurrentWord = entries.size() > 1;
                for (LexicalEntry entry : entries) {
                    String lemma = entry.getLemma();
                    if (lemmas.contains(lemma)) {
                        continue;
                    }
                    if (dropCurrentWord && posTaggedToken.getToken().getText().equals(lemma)) {
                        dropCurrentWord = false;
                        continue;
                    }
                    stringBuilder.append("|").append(lemma);
                    stringBuilder.append("/").append(entry.getMorphology());
                    lemmas.add(lemma);
                }
                stringBuilder.append("] ");
            }
            LOG.debug(stringBuilder.toString());
        }

        for (PosTagEvaluationObserver observer : this.observers) {
            observer.onNextPosTagSequence(realPosTagSequence, guessedSequences);
        }
    }
    for (PosTagEvaluationObserver observer : this.observers) {
        observer.onEvaluationComplete();
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) NonDeterministicPosTagger(com.joliciel.talismane.posTagger.NonDeterministicPosTagger) TreeSet(java.util.TreeSet) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence)

Example 3 with LexicalEntry

use of com.joliciel.talismane.lexicon.LexicalEntry in project talismane by joliciel-informatique.

the class PosTaggedToken method getMorphologyForCoNLL.

/**
 * A string representation of all of the morpho-syntactic information combined
 * in CoNLL-X format.
 * <p>
 * The values of each attribute are gathered across all lexical entries of
 * this token, de-duplicated and sorted (via a {@link TreeSet}), and emitted
 * as {@code key=v1,v2}. Attributes with no values are omitted; the remaining
 * ones are joined with {@code |} in this fixed order: {@code s}
 * (sub-category), {@code c} (case), {@code n} (number), {@code g} (gender),
 * {@code t} (tense), {@code m} (mood), {@code a} (aspect), {@code p}
 * (person), {@code poss} (possessor number).
 * <p>
 * The string is built on the first call and cached in
 * {@code morphologyForCoNLL} for later calls.
 *
 * @return the combined morphology string, empty when no attribute has a value
 */
public String getMorphologyForCoNLL() {
    if (morphologyForCoNLL == null) {
        StringBuilder sb = new StringBuilder();

        Set<String> items = new TreeSet<>();
        for (LexicalEntry lexicalEntry : this.getLexicalEntries()) {
            // Sub-category is a single string; empty values are skipped.
            if (lexicalEntry.hasAttribute(LexicalAttribute.SubCategory) && lexicalEntry.getSubCategory().length() > 0)
                items.add(lexicalEntry.getSubCategory());
        }
        appendCoNLLAttribute(sb, "s", items);

        items = new TreeSet<>();
        for (LexicalEntry lexicalEntry : this.getLexicalEntries()) {
            if (lexicalEntry.hasAttribute(LexicalAttribute.Case))
                items.addAll(lexicalEntry.getCase());
        }
        appendCoNLLAttribute(sb, "c", items);

        items = new TreeSet<>();
        for (LexicalEntry lexicalEntry : this.getLexicalEntries()) {
            if (lexicalEntry.hasAttribute(LexicalAttribute.Number))
                items.addAll(lexicalEntry.getNumber());
        }
        appendCoNLLAttribute(sb, "n", items);

        items = new TreeSet<>();
        for (LexicalEntry lexicalEntry : this.getLexicalEntries()) {
            if (lexicalEntry.hasAttribute(LexicalAttribute.Gender))
                items.addAll(lexicalEntry.getGender());
        }
        appendCoNLLAttribute(sb, "g", items);

        items = new TreeSet<>();
        for (LexicalEntry lexicalEntry : this.getLexicalEntries()) {
            if (lexicalEntry.hasAttribute(LexicalAttribute.Tense))
                items.addAll(lexicalEntry.getTense());
        }
        appendCoNLLAttribute(sb, "t", items);

        items = new TreeSet<>();
        for (LexicalEntry lexicalEntry : this.getLexicalEntries()) {
            if (lexicalEntry.hasAttribute(LexicalAttribute.Mood))
                items.addAll(lexicalEntry.getMood());
        }
        appendCoNLLAttribute(sb, "m", items);

        items = new TreeSet<>();
        for (LexicalEntry lexicalEntry : this.getLexicalEntries()) {
            if (lexicalEntry.hasAttribute(LexicalAttribute.Aspect))
                items.addAll(lexicalEntry.getAspect());
        }
        appendCoNLLAttribute(sb, "a", items);

        items = new TreeSet<>();
        for (LexicalEntry lexicalEntry : this.getLexicalEntries()) {
            if (lexicalEntry.hasAttribute(LexicalAttribute.Person))
                items.addAll(lexicalEntry.getPerson());
        }
        appendCoNLLAttribute(sb, "p", items);

        items = new TreeSet<>();
        for (LexicalEntry lexicalEntry : this.getLexicalEntries()) {
            if (lexicalEntry.hasAttribute(LexicalAttribute.PossessorNumber))
                items.addAll(lexicalEntry.getPossessorNumber());
        }
        appendCoNLLAttribute(sb, "poss", items);

        morphologyForCoNLL = sb.toString();
    }
    return morphologyForCoNLL;
}

/**
 * Appends {@code key=v1,v2,...} to {@code sb}, preceded by {@code |} when
 * {@code sb} already holds content. Does nothing when {@code values} is empty.
 */
private static void appendCoNLLAttribute(StringBuilder sb, String key, Set<String> values) {
    if (values.isEmpty())
        return;
    if (sb.length() > 0)
        sb.append("|");
    sb.append(key).append("=").append(String.join(",", values));
}
Also used : TreeSet(java.util.TreeSet) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry)

Example 4 with LexicalEntry

use of com.joliciel.talismane.lexicon.LexicalEntry in project talismane by joliciel-informatique.

the class AbstractLexicalAttributesFeature method checkInternal.

/**
 * Builds a single string describing the requested lexical attributes of the
 * addressed pos-tagged token.
 * <p>
 * For each attribute, the values found across all of the token's lexical
 * entries are de-duplicated, sorted and joined with {@code ;}. The
 * per-attribute segments are then joined with {@code |}, keeping an empty
 * segment for an attribute that has no values.
 *
 * @param context the context from which the token wrapper is resolved
 * @param env the runtime environment handed to {@code getToken} and
 *        {@code getAttributes}
 * @return a result wrapping the combined string, or {@code null} when no
 *         wrapper or pos-tagged token is available, or when no attribute
 *         yields any value
 * @throws TalismaneException declared by the feature contract
 */
@Override
public FeatureResult<String> checkInternal(T context, RuntimeEnvironment env) throws TalismaneException {
    PosTaggedTokenWrapper wrapper = this.getToken(context, env);
    if (wrapper == null) {
        return null;
    }
    PosTaggedToken taggedToken = wrapper.getPosTaggedToken();
    if (taggedToken == null) {
        return null;
    }

    List<String> attributeNames = this.getAttributes(wrapper, env);
    StringBuilder combined = new StringBuilder();
    boolean anyValue = false;
    boolean needsSeparator = false;
    for (String attributeName : attributeNames) {
        // Sorted, duplicate-free values of this attribute over all entries.
        Set<String> attributeValues = new TreeSet<>();
        for (LexicalEntry entry : taggedToken.getLexicalEntries()) {
            attributeValues.addAll(entry.getAttributeAsList(attributeName));
        }

        // The separator is emitted even when this attribute has no values,
        // so each segment keeps its position in the output.
        if (needsSeparator) {
            combined.append("|");
        }
        needsSeparator = true;

        if (!attributeValues.isEmpty()) {
            anyValue = true;
            combined.append(String.join(";", attributeValues));
        }
    }

    return anyValue ? this.generateResult(combined.toString()) : null;
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) Set(java.util.Set) TreeSet(java.util.TreeSet) HashMap(java.util.HashMap) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry)

Example 5 with LexicalEntry

use of com.joliciel.talismane.lexicon.LexicalEntry in project talismane by joliciel-informatique.

the class LemmaForPosTagFeature method checkInternal.

/**
 * Returns the lemma of the addressed token for the first candidate pos-tag
 * code that has a lexical entry.
 * <p>
 * All pos-tag code features are evaluated first; their non-null outcomes
 * are then tried in order, and the first one for which the token has a
 * lexical entry supplies the lemma.
 *
 * @param tokenWrapper the wrapper from which the inner token wrapper is resolved
 * @param env the runtime environment handed to the nested features
 * @return a result wrapping the lemma, or {@code null} when no wrapper is
 *         found or no candidate pos-tag has a lexical entry for the token
 * @throws TalismaneException declared by the feature contract
 */
@Override
public FeatureResult<String> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper wrapper = this.getToken(tokenWrapper, env);
    if (wrapper == null) {
        return null;
    }
    Token token = wrapper.getToken();

    // Phase 1: evaluate every code feature before any lookup takes place.
    List<String> candidateCodes = new ArrayList<String>();
    for (StringFeature<TokenWrapper> codeFeature : posTagCodeFeatures) {
        FeatureResult<String> codeResult = codeFeature.check(wrapper, env);
        if (codeResult == null) {
            continue;
        }
        candidateCodes.add(codeResult.getOutcome());
    }

    // Phase 2: the first code with a lexical entry wins.
    for (String candidateCode : candidateCodes) {
        PosTag posTag = TalismaneSession.get(sessionId).getPosTagSet().getPosTag(candidateCode);
        LexicalEntry entry = token.getLexicalEntry(posTag);
        if (entry != null) {
            return this.generateResult(entry.getLemma());
        }
    }
    return null;
}
Also used : PosTag(com.joliciel.talismane.posTagger.PosTag) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry)

Aggregations

LexicalEntry (com.joliciel.talismane.lexicon.LexicalEntry)8 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)4 ArrayList (java.util.ArrayList)4 TreeSet (java.util.TreeSet)3 PosTag (com.joliciel.talismane.posTagger.PosTag)2 Token (com.joliciel.talismane.tokeniser.Token)2 List (java.util.List)2 TalismaneException (com.joliciel.talismane.TalismaneException)1 Decision (com.joliciel.talismane.machineLearning.Decision)1 NonDeterministicPosTagger (com.joliciel.talismane.posTagger.NonDeterministicPosTagger)1 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)1 Sentence (com.joliciel.talismane.rawText.Sentence)1 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)1 WeightedOutcome (com.joliciel.talismane.utils.WeightedOutcome)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 Set (java.util.Set)1