Search in sources :

Example 36 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class PatternEventStream method getTaggedTokens.

/**
 * Tags every token of the sequence, as returned by {@code listWithWhiteSpace()},
 * with a tokeniser outcome.
 * <p>
 * A token whose start index is contained in {@code tokenSplits} is tagged
 * {@code SEPARATE}; every other token is tagged {@code JOIN}.
 *
 * @param tokenSequence the sequence whose tokens are tagged
 * @param tokenSplits the start indexes at which a separation occurs
 * @return one tagged token per token, in iteration order
 */
public List<TaggedToken<TokeniserOutcome>> getTaggedTokens(TokenSequence tokenSequence, List<Integer> tokenSplits) {
    List<TaggedToken<TokeniserOutcome>> result = new ArrayList<TaggedToken<TokeniserOutcome>>();
    for (Token current : tokenSequence.listWithWhiteSpace()) {
        boolean startsAtSplit = tokenSplits.contains(current.getStartIndex());
        TokeniserOutcome chosen = startsAtSplit ? TokeniserOutcome.SEPARATE : TokeniserOutcome.JOIN;
        // The outcome is carried through a Decision by name and read back from it.
        Decision decision = new Decision(chosen.name());
        TaggedToken<TokeniserOutcome> tagged = new TaggedToken<>(current, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
        result.add(tagged);
    }
    return result;
}
Also used : TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) ArrayList(java.util.ArrayList) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) Decision(com.joliciel.talismane.machineLearning.Decision)

Example 37 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class PosTagRegexBasedCorpusReader method convertToPosTaggedToken.

/**
 * Builds a pos-tagged token for the token at position {@code index} of the
 * sequence's token sequence, using the pos-tag code read from the corpus line,
 * and appends it to {@code posTagSequence}.
 * <p>
 * The pos-tag comment and the lexical entry of the corpus line are copied onto
 * the new token when the line provides them.
 *
 * @param corpusLine the corpus line supplying the pos-tag and optional comment / lexical entry
 * @param posTagSequence the sequence to which the new token is added
 * @param index position of the underlying token in the token sequence
 * @param currentFile file being read, used only in the error message; may be {@code null}
 * @return the pos-tagged token that was added to the sequence
 * @throws TalismaneException if the pos-tag code is unknown to the session's pos-tag set;
 *         the original {@code UnknownPosTagException} is attached as the cause
 */
protected PosTaggedToken convertToPosTaggedToken(CorpusLine corpusLine, PosTagSequence posTagSequence, int index, File currentFile) throws TalismaneException {
    Token token = posTagSequence.getTokenSequence().get(index);
    PosTagSet posTagSet = TalismaneSession.get(sessionId).getPosTagSet();
    PosTag posTag = null;
    try {
        posTag = posTagSet.getPosTag(corpusLine.getElement(CorpusElement.POSTAG));
    } catch (UnknownPosTagException upte) {
        String fileName = "";
        if (currentFile != null)
            fileName = currentFile.getPath();
        TalismaneException failure = new TalismaneException(
                "Unknown posTag, " + fileName + ", on line " + corpusLine.getLineNumber() + ": " + corpusLine.getElement(CorpusElement.POSTAG));
        // Keep the original exception so its stack trace is not lost.
        failure.initCause(upte);
        throw failure;
    }
    Decision posTagDecision = new Decision(posTag.getCode());
    PosTaggedToken posTaggedToken = new PosTaggedToken(token, posTagDecision, sessionId);
    if (LOG.isTraceEnabled()) {
        LOG.trace(posTaggedToken.toString());
    }
    if (corpusLine.hasElement(CorpusElement.POSTAG_COMMENT))
        posTaggedToken.setComment(corpusLine.getElement(CorpusElement.POSTAG_COMMENT));
    // set the lexical entry if we have one
    if (corpusLine.getLexicalEntry() != null) {
        List<LexicalEntry> lexicalEntrySet = new ArrayList<>(1);
        lexicalEntrySet.add(corpusLine.getLexicalEntry());
        posTaggedToken.setLexicalEntries(lexicalEntrySet);
    }
    posTagSequence.addPosTaggedToken(posTaggedToken);
    return posTaggedToken;
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry) Decision(com.joliciel.talismane.machineLearning.Decision)

Example 38 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class PosTaggedTokenIndexFeature method checkInternal.

/**
 * Produces a feature result holding the index of the token that underlies the
 * pos-tagged token resolved from the given context.
 *
 * @return the result generated from {@code token.getIndex()}, or {@code null}
 *         when no wrapper or no pos-tagged token is available
 */
@Override
public FeatureResult<Integer> checkInternal(T context, RuntimeEnvironment env) throws TalismaneException {
    PosTaggedTokenWrapper wrapper = this.getToken(context, env);
    PosTaggedToken tagged = (wrapper == null) ? null : wrapper.getPosTaggedToken();
    if (tagged == null) {
        return null;
    }
    Token underlying = tagged.getToken();
    int tokenIndex = underlying.getIndex();
    return this.generateResult(tokenIndex);
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) Token(com.joliciel.talismane.tokeniser.Token)

Example 39 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class ParseEvaluationSentenceWriter method onParseEnd.

/**
 * Writes the evaluation rows for one sentence to {@code csvFileWriter} and
 * flushes it.
 * <p>
 * Each distinct token start index gets one column, in ascending order. Start
 * indexes come from the real configuration and, when {@code hasTokeniser} or
 * {@code hasPosTagger} is set, from up to {@code guessCount} guessed
 * configurations as well. A column with no token in a given configuration is
 * written as a lone separator.
 * <p>
 * Rows written, in order: real text, real pos-tag, real dependency label, real
 * governor; then for each of the {@code guessCount} slots: guessed text (only
 * if {@code hasTokeniser}), guessed pos-tag (only if {@code hasPosTagger}),
 * guessed label, guessed governor and transition probability. A slot with no
 * guessed configuration is written as two empty lines.
 *
 * @param realConfiguration the reference parse
 * @param guessedConfigurations the guessed parses, in the order they should be written
 * @throws IOException if writing fails
 */
@Override
public void onParseEnd(ParseConfiguration realConfiguration, List<ParseConfiguration> guessedConfigurations) throws IOException {
    // Gather every start index that needs a column.
    TreeSet<Integer> startIndexes = new TreeSet<Integer>();
    collectStartIndexes(realConfiguration.getPosTagSequence(), startIndexes);
    if (hasTokeniser || hasPosTagger) {
        int i = 0;
        for (ParseConfiguration guessedConfiguration : guessedConfigurations) {
            collectStartIndexes(guessedConfiguration.getPosTagSequence(), startIndexes);
            i++;
            if (i == guessCount)
                break;
        }
    }

    // Map each start index to its column position.
    Map<Integer, Integer> startIndexMap = new HashMap<Integer, Integer>();
    int j = 0;
    for (int startIndex : startIndexes) {
        startIndexMap.put(startIndex, j++);
    }

    PosTaggedToken[] realTokens = alignToColumns(realConfiguration.getPosTagSequence(), startIndexMap, startIndexes.size());
    writeOriginalTextRow(realTokens);
    writePosTagRow(realTokens);
    // A real token without a governing dependency now yields an empty label
    // instead of a NullPointerException, as the guessed rows already did.
    writeLabelRow(realTokens, realConfiguration);
    writeGovernorRow(realTokens, realConfiguration, startIndexMap);

    for (int i = 0; i < guessCount; i++) {
        if (i < guessedConfigurations.size()) {
            ParseConfiguration guessedConfiguration = guessedConfigurations.get(i);
            PosTaggedToken[] guessedTokens = alignToColumns(guessedConfiguration.getPosTagSequence(), startIndexMap, startIndexes.size());
            if (hasTokeniser) {
                writeOriginalTextRow(guessedTokens);
            }
            if (hasPosTagger) {
                writePosTagRow(guessedTokens);
            }
            writeLabelRow(guessedTokens, guessedConfiguration);
            writeGovernorRow(guessedTokens, guessedConfiguration, startIndexMap);
            writeProbabilityRow(guessedTokens, guessedConfiguration);
        } else {
            // no guessed configuration for this slot
            csvFileWriter.write("\n");
            csvFileWriter.write("\n");
        }
    }
    csvFileWriter.flush();
}

/** Adds the start index of every non-root token of the sequence to {@code startIndexes}. */
private void collectStartIndexes(PosTagSequence posTagSequence, TreeSet<Integer> startIndexes) {
    for (PosTaggedToken posTaggedToken : posTagSequence) {
        if (!posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG)) {
            Token token = posTaggedToken.getToken();
            startIndexes.add(token.getStartIndex());
        }
    }
}

/** Places each non-root token in the column mapped to its start index; unused columns stay {@code null}. */
private PosTaggedToken[] alignToColumns(PosTagSequence posTagSequence, Map<Integer, Integer> startIndexMap, int columnCount) {
    PosTaggedToken[] columns = new PosTaggedToken[columnCount];
    for (PosTaggedToken posTaggedToken : posTagSequence) {
        if (!posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG)) {
            columns[startIndexMap.get(posTaggedToken.getToken().getStartIndex())] = posTaggedToken;
        }
    }
    return columns;
}

/** Writes one line holding the original text of each token. */
private void writeOriginalTextRow(PosTaggedToken[] tokens) throws IOException {
    for (PosTaggedToken posTaggedToken : tokens) {
        if (posTaggedToken != null) {
            csvFileWriter.write(CSV.format(posTaggedToken.getToken().getOriginalText()));
        } else {
            csvFileWriter.write(CSV.getCsvSeparator());
        }
    }
    csvFileWriter.write("\n");
}

/** Writes one line holding the pos-tag code of each token. */
private void writePosTagRow(PosTaggedToken[] tokens) throws IOException {
    for (PosTaggedToken posTaggedToken : tokens) {
        if (posTaggedToken != null) {
            csvFileWriter.write(CSV.format(posTaggedToken.getTag().getCode()));
        } else {
            csvFileWriter.write(CSV.getCsvSeparator());
        }
    }
    csvFileWriter.write("\n");
}

/** Writes one line holding the governing dependency label of each token: empty if there is no arc, "null" if the arc has no label. */
private void writeLabelRow(PosTaggedToken[] tokens, ParseConfiguration configuration) throws IOException {
    for (PosTaggedToken posTaggedToken : tokens) {
        if (posTaggedToken != null) {
            DependencyArc arc = configuration.getGoverningDependency(posTaggedToken);
            String label = "";
            if (arc != null) {
                label = arc.getLabel() == null ? "null" : arc.getLabel();
            }
            csvFileWriter.write(CSV.format(label));
        } else {
            csvFileWriter.write(CSV.getCsvSeparator());
        }
    }
    csvFileWriter.write("\n");
}

/** Writes one line holding, for each token, the column label of its governor, or "ROOT" when there is no arc or the head is the root. */
private void writeGovernorRow(PosTaggedToken[] tokens, ParseConfiguration configuration, Map<Integer, Integer> startIndexMap) throws IOException {
    for (PosTaggedToken posTaggedToken : tokens) {
        if (posTaggedToken != null) {
            DependencyArc arc = configuration.getGoverningDependency(posTaggedToken);
            int startIndex = -1;
            if (arc != null) {
                PosTaggedToken head = arc.getHead();
                if (!head.getTag().equals(PosTag.ROOT_POS_TAG)) {
                    startIndex = head.getToken().getStartIndex();
                }
            }
            if (startIndex < 0)
                csvFileWriter.write(CSV.format("ROOT"));
            else
                csvFileWriter.write(CSV.getColumnLabel(startIndexMap.get(startIndex)) + CSV.getCsvSeparator());
        } else {
            csvFileWriter.write(CSV.getCsvSeparator());
        }
    }
    csvFileWriter.write("\n");
}

/** Writes one line holding, for each token, the probability of the transition that produced its governing arc (1.0 when unavailable). */
private void writeProbabilityRow(PosTaggedToken[] tokens, ParseConfiguration configuration) throws IOException {
    for (PosTaggedToken posTaggedToken : tokens) {
        if (posTaggedToken != null) {
            DependencyArc arc = configuration.getGoverningDependency(posTaggedToken);
            double prob = 1.0;
            if (arc != null) {
                Transition transition = configuration.getTransition(arc);
                if (transition != null)
                    prob = transition.getDecision().getProbability();
            }
            csvFileWriter.write(CSV.format(prob));
        } else {
            csvFileWriter.write(CSV.getCsvSeparator());
        }
    }
    csvFileWriter.write("\n");
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) HashMap(java.util.HashMap) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) Token(com.joliciel.talismane.tokeniser.Token) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) TreeSet(java.util.TreeSet) Transition(com.joliciel.talismane.parser.Transition) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) DependencyArc(com.joliciel.talismane.parser.DependencyArc)

Example 40 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

the class StandoffReader method hasNextSentence.

/**
 * Reports whether a parse configuration is available, building the next one
 * from the standoff sentences when none is pending.
 * <p>
 * Nothing is built when a positive maximum sentence count has been reached,
 * when a configuration is already pending, or when all sentences have been
 * consumed. Otherwise the next sentence's tokens are joined into a text
 * (spacing decided by the session's linguistic rules), annotated, converted to
 * a pos-tag sequence, and linked by the relations found in {@code relationMap}.
 * Punctuation tokens without a relation are attached to the nearest preceding
 * non-punctuation token when {@code punctuationDepLabel} is set.
 *
 * @return {@code true} if {@code configuration} is non-null on exit
 * @throws TalismaneException if a relation refers to a token id that is not in the sentence
 * @throws IOException declared by the overridden method
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    boolean limitReached = this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount();
    if (!limitReached && configuration == null && sentenceIndex < sentences.size()) {
        List<StandoffToken> standoffTokens = sentences.get(sentenceIndex++);
        LinguisticRules linguisticRules = TalismaneSession.get(sessionId).getLinguisticRules();
        if (linguisticRules == null) {
            throw new RuntimeException("Linguistic rules have not been set.");
        }

        // Rebuild the sentence text from the individual token texts.
        String sentenceText = "";
        for (StandoffToken current : standoffTokens) {
            String word = current.text;
            if (linguisticRules.shouldAddSpace(sentenceText, word)) {
                sentenceText += " ";
            }
            sentenceText += word;
        }
        Sentence sentence = new Sentence(sentenceText, sessionId);
        for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
            annotator.annotate(sentence);
        }

        // Create one pos-tagged token per standoff token, remembered by id.
        PretokenisedSequence tokenSequence = new PretokenisedSequence(sentence, sessionId);
        PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);
        Map<String, PosTaggedToken> tokensById = new HashMap<String, PosTaggedToken>();
        for (StandoffToken current : standoffTokens) {
            Token token = tokenSequence.addToken(current.text);
            Decision posTagDecision = new Decision(current.posTag.getCode());
            PosTaggedToken tagged = new PosTaggedToken(token, posTagDecision, sessionId);
            if (LOG.isTraceEnabled()) {
                LOG.trace(tagged.toString());
            }
            tagged.setComment(current.comment);
            posTagSequence.addPosTaggedToken(tagged);
            tokensById.put(current.id, tagged);
            LOG.debug("Found token " + current.id + ", " + tagged);
        }
        tokenSequence.setWithRoot(true);
        configuration = new ParseConfiguration(posTagSequence);

        // Add the dependencies.
        for (StandoffToken current : standoffTokens) {
            StandoffRelation relation = relationMap.get(current.id);
            if (relation != null) {
                PosTaggedToken head = tokensById.get(relation.fromToken);
                PosTaggedToken dependent = tokensById.get(relation.toToken);
                if (head == null) {
                    throw new TalismaneException("No token found for head id: " + relation.fromToken);
                }
                if (dependent == null) {
                    throw new TalismaneException("No token found for dependent id: " + relation.toToken);
                }
                DependencyArc arc = configuration.addDependency(head, dependent, relation.label, null);
                arc.setComment(relation.comment);
                continue;
            }

            boolean isPunctuation = current.posTag.getOpenClassIndicator() == PosTagOpenClassIndicator.PUNCTUATION;
            if (!isPunctuation || punctuationDepLabel == null) {
                continue;
            }
            // Walk backwards to the closest preceding non-punctuation token.
            PosTaggedToken dependent = tokensById.get(current.id);
            int candidate = dependent.getIndex() - 1;
            while (candidate >= 0) {
                PosTaggedToken head = posTagSequence.get(candidate);
                if (head.getTag().getOpenClassIndicator() != PosTagOpenClassIndicator.PUNCTUATION) {
                    configuration.addDependency(head, dependent, punctuationDepLabel, null);
                    break;
                }
                candidate--;
            }
        }
    }
    return configuration != null;
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) TalismaneException(com.joliciel.talismane.TalismaneException) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) Token(com.joliciel.talismane.tokeniser.Token) Decision(com.joliciel.talismane.machineLearning.Decision) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) PretokenisedSequence(com.joliciel.talismane.tokeniser.PretokenisedSequence) LinguisticRules(com.joliciel.talismane.LinguisticRules) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) DependencyArc(com.joliciel.talismane.parser.DependencyArc) Sentence(com.joliciel.talismane.rawText.Sentence)

Aggregations

Token (com.joliciel.talismane.tokeniser.Token)69 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)16 ArrayList (java.util.ArrayList)15 Sentence (com.joliciel.talismane.rawText.Sentence)14 Decision (com.joliciel.talismane.machineLearning.Decision)12 Config (com.typesafe.config.Config)12 TalismaneTest (com.joliciel.talismane.TalismaneTest)11 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)11 Test (org.junit.Test)11 TalismaneException (com.joliciel.talismane.TalismaneException)7 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)7 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)7 TokeniserOutcome (com.joliciel.talismane.tokeniser.TokeniserOutcome)7 List (java.util.List)7 WeightedOutcome (com.joliciel.talismane.utils.WeightedOutcome)6 HashMap (java.util.HashMap)6 StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature)5 PosTag (com.joliciel.talismane.posTagger.PosTag)5 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)5 PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl)5