use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class RomanianWordRepeatBeginningRule method isAdverb.
/**
 * Decides whether the given token counts as an adverb, based on the POS tags
 * of its readings.
 *
 * <p>Readings without a POS tag are ignored. A reading whose tag starts with
 * "G" marks the token as an adverb. Any other tagged reading makes the method
 * return {@code false} immediately, unless {@code allowAmbiguousAdverbs()}
 * permits mixed readings.
 *
 * @param token the token whose readings are inspected
 * @return {@code true} if at least one reading has a tag starting with "G"
 *         and no other tagged reading vetoed the result
 */
@Override
protected boolean isAdverb(AnalyzedTokenReadings token) {
  boolean adverbSeen = false;
  for (AnalyzedToken reading : token.getReadings()) {
    final String tag = reading.getPOSTag();
    if (tag == null) {
      // Untagged readings neither confirm nor veto.
      continue;
    }
    // see file /resource/ro/coduri.html for POS tag descriptions
    if (tag.startsWith("G")) {
      adverbSeen = true;
    } else if (!allowAmbiguousAdverbs()) {
      return false;
    }
  }
  return adverbSeen;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class AbstractRomanianTaggerTest method assertHasLemmaAndPos.
/**
 * Asserts that tagging <code>inflected</code> yields at least one reading
 * with the given lemma and POS tag. Passing <code>null</code> for
 * <code>lemma</code> or <code>posTag</code> skips the check on that part.
 *
 * @param inflected input word, inflected form
 * @param lemma expected lemma, or <code>null</code> to accept any lemma
 * @param posTag expected tag for lemma, or <code>null</code> to accept any tag
 * @throws IOException declared because the call to the tagger may throw it
 */
protected void assertHasLemmaAndPos(String inflected, String lemma, String posTag) throws IOException {
  final List<AnalyzedTokenReadings> tagged = tagger.tag(Arrays.asList(inflected));
  // Collects every lemma/tag pair inspected, for the failure message.
  final StringBuilder inspected = new StringBuilder();
  boolean matched = false;
  search:
  for (AnalyzedTokenReadings readings : tagged) {
    for (AnalyzedToken reading : readings) {
      final String currentLemma = reading.getLemma();
      final String currentPosTag = reading.getPOSTag();
      inspected.append(String.format("[%s/%s]", currentLemma, currentPosTag));
      final boolean lemmaOk = lemma == null || lemma.equals(currentLemma);
      final boolean posOk = posTag == null || posTag.equals(currentPosTag);
      if (lemmaOk && posOk) {
        matched = true;
        break search;
      }
    }
  }
  final String failureMessage = String.format("Lemma and POS not found for word [%s]! " + "Expected [%s/%s]. Actual: %s", inflected, lemma, posTag, inspected.toString());
  assertTrue(failureMessage, matched);
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class BretonTagger method tag.
// Nearly identical to the 'tag' method of the BaseTagger class. The one
// difference: when the dictionary lookup finds nothing, the lookup is
// retried without the suffixes -mañ, -se, -hont.
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
  List<AnalyzedTokenReadings> result = new ArrayList<>();
  int offset = 0;
  for (String word : sentenceTokens) {
    String candidate = word;
    List<AnalyzedToken> readings = null;
    // Repeats only when a suffix (-mañ, -se, etc.) was stripped and the
    // shortened candidate has to be probed again.
    while (readings == null) {
      List<AnalyzedToken> found = new ArrayList<>();
      String loweredCandidate = candidate.toLowerCase(conversionLocale);
      List<AnalyzedToken> exactTokens = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(candidate));
      List<AnalyzedToken> lowerTokens = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(loweredCandidate));
      boolean alreadyLowercase = candidate.equals(loweredCandidate);
      // Readings for the candidate as written.
      addTokens(exactTokens, found);
      // Readings for the lowercased candidate, when that is a different string.
      if (!alreadyLowercase) {
        addTokens(lowerTokens, found);
      }
      if (lowerTokens.isEmpty() && exactTokens.isEmpty()) {
        // Neither lookup produced anything: try with the first character uppercased.
        if (alreadyLowercase) {
          List<AnalyzedToken> upperTokens = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(StringTools.uppercaseFirstChar(candidate)));
          if (!upperTokens.isEmpty()) {
            addTokens(upperTokens, found);
          }
        }
        if (found.isEmpty()) {
          Matcher suffixMatcher = patternSuffix.matcher(candidate);
          if (suffixMatcher.find()) {
            // Strip the suffix and probe the dictionary again: a word such
            // as "xxx-mañ" is looked up as "xxx" on the next pass.
            candidate = suffixMatcher.group(1);
            continue;
          }
          // Nothing matched at all: emit a reading without lemma and POS tag.
          found.add(new AnalyzedToken(word, null, null));
        }
      }
      readings = found;
    }
    result.add(new AnalyzedTokenReadings(readings, offset));
    offset += word.length();
  }
  return result;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class CatalanTagger method tag.
/**
 * Tags each token of the sentence. The word is looked up as written, then
 * lowercased (unless it is already lowercase or mixed case), then through
 * {@code additionalTags} if nothing was found and it is not mixed case. A
 * word with no result at all gets a single reading without lemma and POS tag.
 *
 * <p>Words longer than one character have the typographic apostrophe
 * replaced by the typewriter apostrophe before lookup. Those that contained
 * a typewriter apostrophe to begin with receive the chunk tag
 * "containsTypewriterApostrophe".
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return one {@code AnalyzedTokenReadings} per input token, positioned by
 *         the running sum of word lengths
 */
@Override
public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {
  final List<AnalyzedTokenReadings> result = new ArrayList<>();
  int offset = 0;
  final IStemmer stemmer = new DictionaryLookup(getDictionary());
  for (String rawWord : sentenceTokens) {
    // This hack allows all rules and dictionary entries to work with
    // typewriter apostrophe
    final boolean longerThanOneChar = rawWord.length() > 1;
    final boolean hadTypewriterApostrophe = longerThanOneChar && rawWord.contains("'");
    final String word = longerThanOneChar ? rawWord.replace("’", "'") : rawWord;

    final List<AnalyzedToken> readings = new ArrayList<>();
    final String lowered = word.toLowerCase(conversionLocale);
    final boolean alreadyLowercase = word.equals(lowered);
    final boolean mixedCase = StringTools.isMixedCase(word);

    // Lookup of the word as written.
    addTokens(asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word)), readings);

    // Lookup of the lowercased form, reported under the original word.
    if (!(alreadyLowercase || mixedCase)) {
      addTokens(asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowered)), readings);
    }

    if (readings.isEmpty()) {
      // Additional tagging with prefixes.
      if (!mixedCase) {
        addTokens(additionalTags(word, stemmer), readings);
      }
      // Still nothing: emit a reading without lemma and POS tag.
      if (readings.isEmpty()) {
        readings.add(new AnalyzedToken(word, null, null));
      }
    }

    final AnalyzedTokenReadings tokenReadings = new AnalyzedTokenReadings(readings, offset);
    if (hadTypewriterApostrophe) {
      final List<ChunkTag> chunkTags = new ArrayList<>();
      chunkTags.add(new ChunkTag("containsTypewriterApostrophe"));
      tokenReadings.setChunkTags(chunkTags);
    }
    result.add(tokenReadings);
    offset += word.length();
  }
  return result;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class ReflexiveVerbsRule method matchPostagRegexp.
/**
 * Checks whether any reading of the token has a POS tag that fully matches
 * the given regular expression. A reading without a POS tag is matched as if
 * its tag were the literal "UNKNOWN".
 *
 * @param aToken the token whose readings are examined
 * @param pattern the compiled expression the whole tag must match
 * @return true as soon as one reading matches, false if none does
 */
private boolean matchPostagRegexp(AnalyzedTokenReadings aToken, Pattern pattern) {
  for (AnalyzedToken reading : aToken) {
    final String rawTag = reading.getPOSTag();
    final String effectiveTag = rawTag != null ? rawTag : "UNKNOWN";
    if (pattern.matcher(effectiveTag).matches()) {
      return true;
    }
  }
  return false;
}
Aggregations