use of com.joliciel.talismane.posTagger.PosTag in project talismane by joliciel-informatique.
the class LemmaForPosTagFeature method checkInternal.
/**
 * Produces the lemma of the first lexical entry found for the token, trying
 * the pos-tag codes yielded by {@code posTagCodeFeatures} in order.
 *
 * @param tokenWrapper the wrapper from which the token to analyse is resolved
 * @param env the runtime environment passed on to the nested features
 * @return a result wrapping the lemma, or {@code null} when {@code getToken}
 *         returns {@code null} or no pos-tag code leads to a lexical entry
 */
@Override
public FeatureResult<String> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolvedWrapper = this.getToken(tokenWrapper, env);
    if (resolvedWrapper == null) {
        return null;
    }
    Token token = resolvedWrapper.getToken();

    // Evaluate every code feature up front, before any lexicon lookup happens.
    List<String> candidateCodes = new ArrayList<String>();
    for (StringFeature<TokenWrapper> codeFeature : posTagCodeFeatures) {
        FeatureResult<String> codeResult = codeFeature.check(resolvedWrapper, env);
        if (codeResult == null) {
            continue;
        }
        candidateCodes.add(codeResult.getOutcome());
    }

    // The first code with a lexical entry wins; later codes are not consulted.
    for (String candidateCode : candidateCodes) {
        PosTag posTag = TalismaneSession.get(sessionId).getPosTagSet().getPosTag(candidateCode);
        LexicalEntry entry = token.getLexicalEntry(posTag);
        if (entry != null) {
            return this.generateResult(entry.getLemma());
        }
    }
    return null;
}
use of com.joliciel.talismane.posTagger.PosTag in project talismane by joliciel-informatique.
the class LexiconAllPosTagsFeature method checkInternal.
/**
 * Joins the codes of all possible pos-tags of the token into a single
 * comma-separated string, in the iteration order of
 * {@code getPossiblePosTags()}.
 *
 * @param tokenWrapper the wrapper from which the token to analyse is resolved
 * @param env the runtime environment used to resolve the token
 * @return a result wrapping the joined codes, or {@code null} when
 *         {@code getToken} returns {@code null} or the token has no possible
 *         pos-tags
 */
@Override
public FeatureResult<String> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolvedWrapper = this.getToken(tokenWrapper, env);
    if (resolvedWrapper == null) {
        return null;
    }
    Token token = resolvedWrapper.getToken();
    if (token.getPossiblePosTags().size() <= 0) {
        return null;
    }

    StringBuilder codes = new StringBuilder();
    // Empty before the first code, a comma before every following one.
    String separator = "";
    for (PosTag posTag : token.getPossiblePosTags()) {
        codes.append(separator).append(posTag.getCode());
        separator = ",";
    }
    return this.generateResult(codes.toString());
}
use of com.joliciel.talismane.posTagger.PosTag in project talismane by joliciel-informatique.
the class PosTaggerStatisticsWriter method onCompleteAnalysis.
/**
 * Emits the collected statistics once analysis is complete.
 * <p>
 * When {@code writer} is set, one CSV row per statistic is written to it,
 * followed by one row per pos-tag code (count and percentage of tokens), and
 * the writer is then closed. When {@code serializationFile} is set,
 * {@code stats} is serialised into a zip archive at that location under the
 * entry name {@code Contents.obj}.
 * <p>
 * Both the writer and the zip stream are closed in {@code finally} blocks so
 * that they are released even when a write fails part-way through.
 * <p>
 * Percentages are computed with floating-point division, so a zero
 * denominator produces {@code NaN} or {@code Infinity} in the output rather
 * than an exception.
 *
 * @throws IOException if writing the CSV rows or the serialised archive fails
 */
@Override
public void onCompleteAnalysis() throws IOException {
    if (writer != null) {
        try {
            // Make sure every pos-tag of the tag set gets a row, even with a zero count.
            PosTagSet posTagSet = TalismaneSession.get(sessionId).getPosTagSet();
            for (PosTag posTag : posTagSet.getTags()) {
                if (!stats.posTagCounts.containsKey(posTag.getCode())) {
                    stats.posTagCounts.put(posTag.getCode(), 0);
                }
            }

            // Without reference statistics every word counts as unknown (ratio 1).
            double unknownLexiconPercent = 1;
            double unknownLowercaseLexiconPercent = 1;
            if (referenceStats != null) {
                int unknownLexiconCount = 0;
                for (String word : stats.words) {
                    if (!referenceStats.words.contains(word)) {
                        unknownLexiconCount++;
                    }
                }
                unknownLexiconPercent = (double) unknownLexiconCount / (double) stats.words.size();

                int unknownLowercaseLexiconCount = 0;
                for (String lowercase : stats.lowerCaseWords) {
                    if (!referenceStats.lowerCaseWords.contains(lowercase)) {
                        unknownLowercaseLexiconCount++;
                    }
                }
                unknownLowercaseLexiconPercent = (double) unknownLowercaseLexiconCount / (double) stats.lowerCaseWords.size();
            }

            writer.write(CSV.format("sentenceCount") + CSV.format(stats.sentenceCount) + "\n");
            writer.write(CSV.format("sentenceLengthMean") + CSV.format(stats.sentenceLengthStats.getMean()) + "\n");
            writer.write(CSV.format("sentenceLengthStdDev") + CSV.format(stats.sentenceLengthStats.getStandardDeviation()) + "\n");
            writer.write(CSV.format("lexiconSize") + CSV.format(stats.words.size()) + "\n");
            writer.write(CSV.format("lexiconUnknownInRefCorpus") + CSV.format(unknownLexiconPercent * 100.0) + "\n");

            writer.write(CSV.format("tokenCount") + CSV.format(stats.tokenCount) + "\n");
            double unknownTokenPercent = ((double) stats.unknownTokenCount / (double) stats.tokenCount) * 100.0;
            writer.write(CSV.format("tokenUnknownInRefCorpus") + CSV.format(unknownTokenPercent) + "\n");
            double unknownInLexiconPercent = ((double) stats.unknownInLexiconCount / (double) stats.tokenCount) * 100.0;
            writer.write(CSV.format("tokenUnknownInRefLexicon") + CSV.format(unknownInLexiconPercent) + "\n");

            writer.write(CSV.format("lowercaseLexiconSize") + CSV.format(stats.lowerCaseWords.size()) + "\n");
            writer.write(CSV.format("lowercaseLexiconUnknownInRefCorpus") + CSV.format(unknownLowercaseLexiconPercent * 100.0) + "\n");

            writer.write(CSV.format("alphanumericCount") + CSV.format(stats.alphanumericCount) + "\n");
            double unknownAlphanumericPercent = ((double) stats.unknownAlphanumericCount / (double) stats.alphanumericCount) * 100.0;
            writer.write(CSV.format("alphaUnknownInRefCorpus") + CSV.format(unknownAlphanumericPercent) + "\n");
            double unknownAlphaInLexiconPercent = ((double) stats.unknownAlphaInLexiconCount / (double) stats.alphanumericCount) * 100.0;
            writer.write(CSV.format("alphaUnknownInRefLexicon") + CSV.format(unknownAlphaInLexiconPercent) + "\n");

            writer.write(CSV.format("openClassCount") + CSV.format(stats.openClassCount) + "\n");
            double openClassUnknownPercent = ((double) stats.openClassUnknownInRefCorpus / (double) stats.openClassCount) * 100.0;
            writer.write(CSV.format("openClassUnknownInRefCorpus") + CSV.format(openClassUnknownPercent) + "\n");
            double openClassUnknownInLexiconPercent = ((double) stats.openClassUnknownInLexicon / (double) stats.openClassCount) * 100.0;
            writer.write(CSV.format("openClassUnknownInRefLexicon") + CSV.format(openClassUnknownInLexiconPercent) + "\n");

            writer.write(CSV.format("closedClassCount") + CSV.format(stats.closedClassCount) + "\n");
            double closedClassUnknownPercent = ((double) stats.closedClassUnknownInRefCorpus / (double) stats.closedClassCount) * 100.0;
            writer.write(CSV.format("closedClassUnknownInRefCorpus") + CSV.format(closedClassUnknownPercent) + "\n");
            double closedClassUnknownInLexiconPercent = ((double) stats.closedClassUnknownInLexicon / (double) stats.closedClassCount) * 100.0;
            writer.write(CSV.format("closedClassUnknownInRefLexicon") + CSV.format(closedClassUnknownInLexiconPercent) + "\n");

            // One row per pos-tag code: raw count and share of all tokens.
            for (String posTag : stats.posTagCounts.keySet()) {
                int count = stats.posTagCounts.get(posTag);
                writer.write(CSV.format(posTag) + CSV.format(count) + CSV.format(((double) count / (double) stats.tokenCount) * 100.0) + "\n");
            }
            writer.flush();
        } finally {
            // Release the writer even when one of the writes above fails.
            writer.close();
        }
    }

    if (this.serializationFile != null) {
        ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(serializationFile, false));
        try {
            zos.putNextEntry(new ZipEntry("Contents.obj"));
            ObjectOutputStream oos = new ObjectOutputStream(zos);
            oos.writeObject(stats);
            // Push the object stream's buffered bytes into the zip entry before closing.
            oos.flush();
            zos.flush();
        } finally {
            // Closing the zip stream also closes the underlying file stream,
            // so the file handle is released on failure as well.
            zos.close();
        }
    }
}
use of com.joliciel.talismane.posTagger.PosTag in project talismane by joliciel-informatique.
the class HasClosedClassesOnlyFeature method checkInternal.
/**
 * Tells whether the token has at least one possible pos-tag and every one of
 * its possible pos-tags has an open-class indicator reporting
 * {@code isClosed()}.
 *
 * @param tokenWrapper the wrapper from which the token to analyse is resolved
 * @param env the runtime environment used to resolve the token
 * @return a result wrapping the boolean outcome, or {@code null} when
 *         {@code getToken} returns {@code null}
 */
@Override
public FeatureResult<Boolean> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolvedWrapper = this.getToken(tokenWrapper, env);
    if (resolvedWrapper == null) {
        return null;
    }
    Token token = resolvedWrapper.getToken();

    // A token without any possible pos-tag never qualifies.
    boolean closedOnly = token.getPossiblePosTags().size() > 0;
    for (PosTag candidate : token.getPossiblePosTags()) {
        if (candidate.getOpenClassIndicator().isClosed()) {
            continue;
        }
        // A single non-closed tag is enough to settle the answer.
        closedOnly = false;
        break;
    }
    return this.generateResult(closedOnly);
}
use of com.joliciel.talismane.posTagger.PosTag in project talismane by joliciel-informatique.
the class Token method getLexicalEntry.
/**
 * Returns the preferred lexical entry of this token for the given pos-tag.
 * <p>
 * The entries for a pos-tag are fetched from the session's merged lexicon the
 * first time that pos-tag is requested, and are then kept in
 * {@code lexicalEntryMap} for later calls.
 *
 * @param posTag the pos-tag for which an entry is wanted
 * @return the first entry of the list found for this token's text and the
 *         pos-tag, or {@code null} when that list is empty
 */
public LexicalEntry getLexicalEntry(PosTag posTag) {
    if (this.lexicalEntryMap == null) {
        this.lexicalEntryMap = new HashMap<PosTag, List<LexicalEntry>>();
    }

    List<LexicalEntry> cachedEntries = this.lexicalEntryMap.get(posTag);
    if (cachedEntries == null) {
        // Not looked up yet for this pos-tag: query the lexicon and remember the answer.
        cachedEntries = TalismaneSession.get(sessionId).getMergedLexicon().findLexicalEntries(this.getText(), posTag);
        this.lexicalEntryMap.put(posTag, cachedEntries);
    }

    return cachedEntries.isEmpty() ? null : cachedEntries.get(0);
}
Aggregations