Search in sources:

Example 6 with PosTag

use of com.joliciel.talismane.posTagger.PosTag in project talismane by joliciel-informatique.

The method checkInternal of the class LemmaForPosTagFeature.

/**
 * Produces the lemma of the resolved token for the first pos-tag code, among
 * those supplied by the configured pos-tag code features, for which the
 * token has a lexical entry.
 *
 * @param tokenWrapper the wrapper from which the token to analyse is resolved
 * @param env the runtime environment handed on to the nested features
 * @return a result built from the lemma, or null when no token could be
 *         resolved or none of the pos-tag codes yields a lexical entry
 * @throws TalismaneException as declared; presumably propagated from the
 *         nested calls made here
 */
@Override
public FeatureResult<String> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolved = this.getToken(tokenWrapper, env);
    if (resolved == null) {
        return null;
    }
    Token token = resolved.getToken();

    // Phase 1: evaluate every pos-tag code feature up front, keeping the
    // outcome of each non-null result in feature order.
    List<String> candidateCodes = new ArrayList<String>();
    for (StringFeature<TokenWrapper> codeFeature : posTagCodeFeatures) {
        FeatureResult<String> codeResult = codeFeature.check(resolved, env);
        if (codeResult == null) {
            continue;
        }
        candidateCodes.add(codeResult.getOutcome());
    }

    // Phase 2: the first code for which the token has a lexical entry wins.
    for (String code : candidateCodes) {
        PosTag posTag = TalismaneSession.get(sessionId).getPosTagSet().getPosTag(code);
        LexicalEntry entry = token.getLexicalEntry(posTag);
        if (entry != null) {
            return this.generateResult(entry.getLemma());
        }
    }
    return null;
}
Also used : PosTag(com.joliciel.talismane.posTagger.PosTag) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry)

Example 7 with PosTag

use of com.joliciel.talismane.posTagger.PosTag in project talismane by joliciel-informatique.

The method checkInternal of the class LexiconAllPosTagsFeature.

/**
 * Produces a comma-separated list of the codes of all pos-tags that are
 * possible for the resolved token, in the iteration order of
 * {@code getPossiblePosTags()}.
 *
 * @param tokenWrapper the wrapper from which the token to analyse is resolved
 * @param env the runtime environment used to resolve the token
 * @return a result built from the joined codes, or null when no token could
 *         be resolved or the token has no possible pos-tags
 * @throws TalismaneException as declared; presumably propagated from the
 *         token resolution
 */
@Override
public FeatureResult<String> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolved = this.getToken(tokenWrapper, env);
    if (resolved == null) {
        return null;
    }
    Token token = resolved.getToken();
    if (!(token.getPossiblePosTags().size() > 0)) {
        return null;
    }

    StringBuilder joinedCodes = new StringBuilder();
    // Empty before the first code, a comma before every later one.
    String separator = "";
    for (PosTag candidate : token.getPossiblePosTags()) {
        joinedCodes.append(separator);
        joinedCodes.append(candidate.getCode());
        separator = ",";
    }
    return this.generateResult(joinedCodes.toString());
}
Also used : PosTag(com.joliciel.talismane.posTagger.PosTag) Token(com.joliciel.talismane.tokeniser.Token)

Example 8 with PosTag

use of com.joliciel.talismane.posTagger.PosTag in project talismane by joliciel-informatique.

The method onCompleteAnalysis of the class PosTaggerStatisticsWriter.

/**
 * Emits the statistics gathered during the analysis.
 * <p>
 * When a writer is configured, the statistics are written to it as CSV rows
 * (one label followed by its value or values per line) and the writer is
 * then closed. When a serialization file is configured, the statistics
 * object is additionally serialized into a zip file containing a single
 * entry named {@code Contents.obj}.
 * <p>
 * Both the writer and the zip stream are closed even when writing fails, so
 * that no file handle is leaked on error.
 * <p>
 * All ratios are computed in double arithmetic, so a zero denominator yields
 * NaN or Infinity in the output rather than an exception.
 *
 * @throws IOException if writing the CSV rows or the serialized statistics
 *         fails
 */
@Override
public void onCompleteAnalysis() throws IOException {
    if (writer != null) {
        try {
            // Make sure every tag of the tag set gets a row, even if it never occurred.
            PosTagSet posTagSet = TalismaneSession.get(sessionId).getPosTagSet();
            for (PosTag posTag : posTagSet.getTags()) {
                if (!stats.posTagCounts.containsKey(posTag.getCode())) {
                    stats.posTagCounts.put(posTag.getCode(), 0);
                }
            }

            // Without reference statistics, everything counts as unknown (100%).
            double unknownLexiconPercent = 1;
            double unknownLowercaseLexiconPercent = 1;
            if (referenceStats != null) {
                int unknownLexiconCount = 0;
                for (String word : stats.words) {
                    if (!referenceStats.words.contains(word))
                        unknownLexiconCount++;
                }
                unknownLexiconPercent = (double) unknownLexiconCount / (double) stats.words.size();

                int unknownLowercaseLexiconCount = 0;
                for (String lowercase : stats.lowerCaseWords) {
                    if (!referenceStats.lowerCaseWords.contains(lowercase))
                        unknownLowercaseLexiconCount++;
                }
                unknownLowercaseLexiconPercent = (double) unknownLowercaseLexiconCount / (double) stats.lowerCaseWords.size();
            }

            writer.write(CSV.format("sentenceCount") + CSV.format(stats.sentenceCount) + "\n");
            writer.write(CSV.format("sentenceLengthMean") + CSV.format(stats.sentenceLengthStats.getMean()) + "\n");
            writer.write(CSV.format("sentenceLengthStdDev") + CSV.format(stats.sentenceLengthStats.getStandardDeviation()) + "\n");
            writer.write(CSV.format("lexiconSize") + CSV.format(stats.words.size()) + "\n");
            writer.write(CSV.format("lexiconUnknownInRefCorpus") + CSV.format(unknownLexiconPercent * 100.0) + "\n");
            writer.write(CSV.format("tokenCount") + CSV.format(stats.tokenCount) + "\n");
            double unknownTokenPercent = ((double) stats.unknownTokenCount / (double) stats.tokenCount) * 100.0;
            writer.write(CSV.format("tokenUnknownInRefCorpus") + CSV.format(unknownTokenPercent) + "\n");
            double unknownInLexiconPercent = ((double) stats.unknownInLexiconCount / (double) stats.tokenCount) * 100.0;
            writer.write(CSV.format("tokenUnknownInRefLexicon") + CSV.format(unknownInLexiconPercent) + "\n");
            writer.write(CSV.format("lowercaseLexiconSize") + CSV.format(stats.lowerCaseWords.size()) + "\n");
            writer.write(CSV.format("lowercaseLexiconUnknownInRefCorpus") + CSV.format(unknownLowercaseLexiconPercent * 100.0) + "\n");
            writer.write(CSV.format("alphanumericCount") + CSV.format(stats.alphanumericCount) + "\n");
            double unknownAlphanumericPercent = ((double) stats.unknownAlphanumericCount / (double) stats.alphanumericCount) * 100.0;
            writer.write(CSV.format("alphaUnknownInRefCorpus") + CSV.format(unknownAlphanumericPercent) + "\n");
            double unknownAlphaInLexiconPercent = ((double) stats.unknownAlphaInLexiconCount / (double) stats.alphanumericCount) * 100.0;
            writer.write(CSV.format("alphaUnknownInRefLexicon") + CSV.format(unknownAlphaInLexiconPercent) + "\n");
            writer.write(CSV.format("openClassCount") + CSV.format(stats.openClassCount) + "\n");
            double openClassUnknownPercent = ((double) stats.openClassUnknownInRefCorpus / (double) stats.openClassCount) * 100.0;
            writer.write(CSV.format("openClassUnknownInRefCorpus") + CSV.format(openClassUnknownPercent) + "\n");
            double openClassUnknownInLexiconPercent = ((double) stats.openClassUnknownInLexicon / (double) stats.openClassCount) * 100.0;
            writer.write(CSV.format("openClassUnknownInRefLexicon") + CSV.format(openClassUnknownInLexiconPercent) + "\n");
            writer.write(CSV.format("closedClassCount") + CSV.format(stats.closedClassCount) + "\n");
            double closedClassUnknownPercent = ((double) stats.closedClassUnknownInRefCorpus / (double) stats.closedClassCount) * 100.0;
            writer.write(CSV.format("closedClassUnknownInRefCorpus") + CSV.format(closedClassUnknownPercent) + "\n");
            double closedClassUnknownInLexiconPercent = ((double) stats.closedClassUnknownInLexicon / (double) stats.closedClassCount) * 100.0;
            writer.write(CSV.format("closedClassUnknownInRefLexicon") + CSV.format(closedClassUnknownInLexiconPercent) + "\n");

            // One row per pos-tag code: absolute count and share of all tokens.
            for (String posTag : stats.posTagCounts.keySet()) {
                int count = stats.posTagCounts.get(posTag);
                writer.write(CSV.format(posTag) + CSV.format(count) + CSV.format(((double) count / (double) stats.tokenCount) * 100.0) + "\n");
            }
            writer.flush();
        } finally {
            // Release the writer even if one of the writes above failed.
            writer.close();
        }
    }
    if (this.serializationFile != null) {
        ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(serializationFile, false));
        try {
            zos.putNextEntry(new ZipEntry("Contents.obj"));
            ObjectOutputStream oos = new ObjectOutputStream(zos);
            oos.writeObject(stats);
            // Push the object stream's buffered bytes into the zip entry before closing.
            oos.flush();
            zos.flush();
        } finally {
            // Closing the zip stream also closes the underlying file stream.
            zos.close();
        }
    }
}
Also used : PosTagSet(com.joliciel.talismane.posTagger.PosTagSet) PosTag(com.joliciel.talismane.posTagger.PosTag) ZipOutputStream(java.util.zip.ZipOutputStream) FileOutputStream(java.io.FileOutputStream) ZipEntry(java.util.zip.ZipEntry) ObjectOutputStream(java.io.ObjectOutputStream)

Example 9 with PosTag

use of com.joliciel.talismane.posTagger.PosTag in project talismane by joliciel-informatique.

The method checkInternal of the class HasClosedClassesOnlyFeature.

/**
 * Tells whether the resolved token has at least one possible pos-tag and
 * every one of its possible pos-tags is flagged as closed by its open-class
 * indicator.
 *
 * @param tokenWrapper the wrapper from which the token to analyse is resolved
 * @param env the runtime environment used to resolve the token
 * @return a result built from the boolean answer (false when the token has
 *         no possible pos-tags), or null when no token could be resolved
 * @throws TalismaneException as declared; presumably propagated from the
 *         token resolution
 */
@Override
public FeatureResult<Boolean> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolved = this.getToken(tokenWrapper, env);
    if (resolved == null) {
        return null;
    }
    Token token = resolved.getToken();

    // A token without any possible pos-tag does not qualify.
    boolean closedOnly = token.getPossiblePosTags().size() > 0;
    for (PosTag candidate : token.getPossiblePosTags()) {
        if (candidate.getOpenClassIndicator().isClosed()) {
            continue;
        }
        // A single non-closed tag settles the answer.
        closedOnly = false;
        break;
    }
    return this.generateResult(closedOnly);
}
Also used : PosTag(com.joliciel.talismane.posTagger.PosTag) Token(com.joliciel.talismane.tokeniser.Token)

Example 10 with PosTag

use of com.joliciel.talismane.posTagger.PosTag in project talismane by joliciel-informatique.

The method getLexicalEntry of the class Token.

/**
 * Returns the "best" lexical entry for this token combined with the given
 * pos-tag, i.e. the first entry the merged lexicon reports for the token's
 * text and that pos-tag, or null if there is none.
 * <p>
 * Lookup results are cached per pos-tag in {@code lexicalEntryMap}, so the
 * lexicon is queried at most once per pos-tag for which a non-null list was
 * obtained.
 *
 * @param posTag the pos-tag for which an entry is requested
 * @return the first matching lexical entry, or null when the list of
 *         entries is empty
 */
public LexicalEntry getLexicalEntry(PosTag posTag) {
    // The cache is created lazily on first use.
    if (this.lexicalEntryMap == null) {
        this.lexicalEntryMap = new HashMap<PosTag, List<LexicalEntry>>();
    }

    List<LexicalEntry> entriesForTag = this.lexicalEntryMap.get(posTag);
    if (entriesForTag == null) {
        entriesForTag = TalismaneSession.get(sessionId).getMergedLexicon().findLexicalEntries(this.getText(), posTag);
        this.lexicalEntryMap.put(posTag, entriesForTag);
    }

    return entriesForTag.size() > 0 ? entriesForTag.get(0) : null;
}
Also used : PosTag(com.joliciel.talismane.posTagger.PosTag) ArrayList(java.util.ArrayList) List(java.util.List) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry)

Aggregations

PosTag (com.joliciel.talismane.posTagger.PosTag)17 ArrayList (java.util.ArrayList)6 Token (com.joliciel.talismane.tokeniser.Token)5 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)3 PosTagSet (com.joliciel.talismane.posTagger.PosTagSet)3 WeightedOutcome (com.joliciel.talismane.utils.WeightedOutcome)3 List (java.util.List)3 LexicalEntry (com.joliciel.talismane.lexicon.LexicalEntry)2 PosTaggerLexicon (com.joliciel.talismane.lexicon.PosTaggerLexicon)2 TalismaneException (com.joliciel.talismane.TalismaneException)1 TalismaneTest (com.joliciel.talismane.TalismaneTest)1 BooleanFeature (com.joliciel.talismane.machineLearning.features.BooleanFeature)1 FunctionDescriptor (com.joliciel.talismane.machineLearning.features.FunctionDescriptor)1 FunctionDescriptorParser (com.joliciel.talismane.machineLearning.features.FunctionDescriptorParser)1 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)1 UnknownPosTagException (com.joliciel.talismane.posTagger.UnknownPosTagException)1 Config (com.typesafe.config.Config)1 FileOutputStream (java.io.FileOutputStream)1 ObjectOutputStream (java.io.ObjectOutputStream)1 ZipEntry (java.util.zip.ZipEntry)1