use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class EnglishChunkerTest method createReadingsList.
/**
 * Builds a list of token readings for a test sentence by splitting it on single spaces.
 * The space delimiters are kept as tokens of their own.
 *
 * <p>Whitespace-only tokens get a reading with {@code null} POS tag and lemma; every other
 * token gets the placeholder value {@code "fake"} for both. Each reading's start position
 * is the sum of the lengths of all tokens before it.
 *
 * @param sentence the sentence to split
 * @return one {@code AnalyzedTokenReadings} per token, in sentence order
 */
private List<AnalyzedTokenReadings> createReadingsList(String sentence) {
  List<AnalyzedTokenReadings> readings = new ArrayList<>();
  // 'true' makes the tokenizer return the space delimiters as tokens, too.
  StringTokenizer parts = new StringTokenizer(sentence, " ", true);
  int offset = 0;
  while (parts.hasMoreTokens()) {
    String part = parts.nextToken();
    // Whitespace tokens carry no tag/lemma; real words get a dummy value for both.
    String tagAndLemma = part.trim().isEmpty() ? null : "fake";
    readings.add(new AnalyzedTokenReadings(new AnalyzedToken(part, tagAndLemma, tagAndLemma), offset));
    offset += part.length();
  }
  return readings;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class PolishTagger method tag.
/**
 * Tags every token of the sentence, trying several casings of each word.
 *
 * <p>For each word the readings are collected in this order:
 * <ol>
 *   <li>readings for the word exactly as written;</li>
 *   <li>if the word is not already lowercase, readings for its lowercase form;</li>
 *   <li>if neither lookup found anything and the word is lowercase, readings for the word
 *       with its first character uppercased;</li>
 *   <li>if nothing was found at all, a single reading with {@code null} POS tag and lemma.</li>
 * </ol>
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return one {@code AnalyzedTokenReadings} per input token; start positions are the
 *         accumulated lengths of the preceding tokens
 */
@Override
public final List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) {
  final List<AnalyzedTokenReadings> readingsPerToken = new ArrayList<>();
  int offset = 0;
  for (final String word : sentenceTokens) {
    final String lowercased = word.toLowerCase(plLocale);
    final List<AnalyzedToken> exactMatches =
        asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word));
    final List<AnalyzedToken> lowercaseMatches =
        asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowercased));
    final boolean alreadyLowercase = word.equals(lowercased);

    final List<AnalyzedToken> readings = new ArrayList<>();
    // Word as written.
    addTokens(exactMatches, readings);
    // Lowercase variant, only when it differs from the word as written.
    if (!alreadyLowercase) {
      addTokens(lowercaseMatches, readings);
    }

    if (exactMatches.isEmpty() && lowercaseMatches.isEmpty()) {
      boolean tagged = false;
      // Last attempt for lowercase words: look up the capitalized form.
      if (alreadyLowercase) {
        final List<AnalyzedToken> capitalizedMatches = asAnalyzedTokenListForTaggedWords(
            word, getWordTagger().tag(StringTools.uppercaseFirstChar(word)));
        if (!capitalizedMatches.isEmpty()) {
          addTokens(capitalizedMatches, readings);
          tagged = true;
        }
      }
      // Unknown word: keep the token but without tag and lemma.
      if (!tagged) {
        readings.add(new AnalyzedToken(word, null, null));
      }
    }

    readingsPerToken.add(new AnalyzedTokenReadings(readings, offset));
    offset += word.length();
  }
  return readingsPerToken;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class EsperantoTagger method tag.
/**
 * Tags every token of an Esperanto sentence.
 *
 * <p>Tokens of a single character get one reading with {@code null} POS tag and lemma.
 * For longer tokens the lowercased word is looked up in the manual tagger first; if that
 * yields readings, they are used as-is. Otherwise the word is classified by the first
 * matching rule: tabelvorto pattern, noun endings, adjective endings, adverb endings, verb
 * pattern, and finally an untagged reading. Independently of which of those rules matched,
 * a participle reading is added when the participle pattern matches and the matched word
 * is not listed in {@code setNonParticiple}.
 *
 * <p>Lowercasing uses {@link java.util.Locale#ROOT} so that matching does not depend on the
 * JVM default locale. The start position of each returned reading is the accumulated length
 * of the preceding tokens.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return one {@code AnalyzedTokenReadings} per input token
 * @throws IOException declared by the signature; presumably raised by {@code lazyInit()}
 *         when resources cannot be loaded (not visible here)
 */
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
  lazyInit();
  Matcher matcher;
  List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
  // Character offset of the current token, accumulated from the token lengths.
  int pos = 0;
  for (String word : sentenceTokens) {
    List<AnalyzedToken> l = new ArrayList<>();
    // Words of one character are never tagged: this avoids spurious tagging
    // of single letters such as "A", "O", "E", etc.
    if (word.length() > 1) {
      // Locale-independent lowercasing: with e.g. a Turkish default locale, "I" would
      // otherwise become a dotless i and no longer match the suffixes/patterns below.
      String lWord = word.toLowerCase(java.util.Locale.ROOT);
      List<TaggedWord> manualTags = manualTagger.tag(lWord);
      if (!manualTags.isEmpty()) {
        // This is a closed word for which we know its lemmas and tags.
        for (TaggedWord manualTag : manualTags) {
          l.add(new AnalyzedToken(word, manualTag.getPosTag(), manualTag.getLemma()));
        }
      } else {
        // The order of the checks below matters: the first matching rule wins.
        // Tiu, kiu (tabelvortoj).
        if ((matcher = patternTabelvorto.matcher(lWord)).find()) {
          String type1Group = matcher.group(1).substring(0, 1).toLowerCase(java.util.Locale.ROOT);
          String type2Group = matcher.group(2);
          String plGroup = matcher.group(3);
          String accGroup = matcher.group(4);
          String type3Group = matcher.group(5);
          String type;
          String plural;
          String accusative;
          if (accGroup == null) {
            accusative = "xxx";
          } else {
            accusative = accGroup.equalsIgnoreCase("n") ? "akz" : "nak";
          }
          if (plGroup == null) {
            plural = " pn ";
          } else {
            plural = plGroup.equalsIgnoreCase("j") ? " pl " : " np ";
          }
          type = ((type2Group == null) ? type3Group : type2Group).toLowerCase(java.util.Locale.ROOT);
          l.add(new AnalyzedToken(word, "T " + accusative + plural + type1Group + " " + type, null));
          // A tabelvorto may additionally be an adverb.
          if ((matcher = patternTabelvortoAdverb.matcher(lWord)).find()) {
            l.add(new AnalyzedToken(word, "E nak", lWord));
          }
        // Words ending in .*oj?n? are nouns.
        } else if (lWord.endsWith("o")) {
          l.add(new AnalyzedToken(word, "O nak np", lWord));
        } else if (lWord.length() >= 2 && lWord.endsWith("'")) {
          // Trailing apostrophe: the lemma is built by replacing it with "o".
          l.add(new AnalyzedToken(word, "O nak np", lWord.substring(0, lWord.length() - 1) + "o"));
        } else if (lWord.endsWith("oj")) {
          l.add(new AnalyzedToken(word, "O nak pl", lWord.substring(0, lWord.length() - 1)));
        } else if (lWord.endsWith("on")) {
          l.add(new AnalyzedToken(word, "O akz np", lWord.substring(0, lWord.length() - 1)));
        } else if (lWord.endsWith("ojn")) {
          l.add(new AnalyzedToken(word, "O akz pl", lWord.substring(0, lWord.length() - 2)));
        // Words ending in .*aj?n? are adjectives.
        } else if (lWord.endsWith("a")) {
          l.add(new AnalyzedToken(word, "A nak np", lWord));
        } else if (lWord.endsWith("aj")) {
          l.add(new AnalyzedToken(word, "A nak pl", lWord.substring(0, lWord.length() - 1)));
        } else if (lWord.endsWith("an")) {
          l.add(new AnalyzedToken(word, "A akz np", lWord.substring(0, lWord.length() - 1)));
        } else if (lWord.endsWith("ajn")) {
          l.add(new AnalyzedToken(word, "A akz pl", lWord.substring(0, lWord.length() - 2)));
        // Words ending in .*en? are adverbs.
        } else if (lWord.endsWith("e")) {
          l.add(new AnalyzedToken(word, "E nak", lWord));
        } else if (lWord.endsWith("en")) {
          l.add(new AnalyzedToken(word, "E akz", lWord.substring(0, lWord.length() - 1)));
        // Verbs.
        } else if ((matcher = patternVerb.matcher(lWord)).find()) {
          String verb = matcher.group(1) + "i";
          String tense = matcher.group(2);
          String transitive = findTransitivity(verb);
          l.add(new AnalyzedToken(word, "V " + transitive + " " + tense, verb));
        // Irregular word (no tag).
        } else {
          l.add(new AnalyzedToken(word, null, null));
        }
        // Participle (can be combined with other tags).
        if ((matcher = patternParticiple.matcher(lWord)).find()) {
          if (!setNonParticiple.contains(matcher.group(1))) {
            String verb = matcher.group(2) + "i";
            String aio = matcher.group(3);
            String antAt = matcher.group(4).equals("n") ? "n" : "-";
            String aoe = matcher.group(5);
            String plural = matcher.group(6).equals("j") ? "pl" : "np";
            String accusative = matcher.group(7).equals("n") ? "akz" : "nak";
            String transitive = findTransitivity(verb);
            l.add(new AnalyzedToken(word, "C " + accusative + " " + plural + " " + transitive + " " + aio + " " + antAt + " " + aoe, verb));
          }
        }
      }
    } else {
      // Single letter word (no tag).
      l.add(new AnalyzedToken(word, null, null));
    }
    tokenReadings.add(new AnalyzedTokenReadings(l, pos));
    pos += word.length();
  }
  return tokenReadings;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class JapaneseTagger method tag.
/**
 * Tags every token of the sentence with the single reading produced by
 * {@code asAnalyzedToken}.
 *
 * <p>The start position advances by the length of the analyzed token's text
 * ({@code getToken()}), not by the length of the input word.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return one {@code AnalyzedTokenReadings} per input token
 * @throws IOException declared by the signature; the visible code does not show where
 *         it would be raised
 */
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
  final int tokenCount = sentenceTokens.size();
  // Presized: exactly one readings object is produced per input token.
  List<AnalyzedTokenReadings> readings = new ArrayList<>(tokenCount);
  int offset = 0;
  for (String surface : sentenceTokens) {
    AnalyzedToken analyzed = asAnalyzedToken(surface);
    AnalyzedTokenReadings reading = new AnalyzedTokenReadings(analyzed, offset);
    readings.add(reading);
    offset += analyzed.getToken().length();
  }
  return readings;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class BaseTagger method getAnalyzedTokens.
/**
 * Collects all readings for a single word, trying several casings.
 *
 * <p>Readings are gathered in this order:
 * <ol>
 *   <li>readings for the word exactly as written;</li>
 *   <li>readings for the lowercase form, unless the word is already lowercase or is
 *       mixed-case;</li>
 *   <li>if {@code tagLowercaseWithUppercase} is set, the word is lowercase and neither
 *       lookup above found anything: readings for the word with its first character
 *       uppercased;</li>
 *   <li>if still empty, the language-dependent readings from {@code additionalTags};</li>
 *   <li>if still empty, a single reading with {@code null} POS tag and lemma.</li>
 * </ol>
 *
 * @param word the token to analyze
 * @return the collected readings, never empty
 */
protected List<AnalyzedToken> getAnalyzedTokens(String word) {
  String lowercased = word.toLowerCase(conversionLocale);
  boolean alreadyLowercase = word.equals(lowercased);
  boolean mixedCase = StringTools.isMixedCase(word);
  List<AnalyzedToken> exactMatches =
      asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word));
  List<AnalyzedToken> lowercaseMatches =
      asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowercased));

  List<AnalyzedToken> readings = new ArrayList<>();
  // Word as written.
  addTokens(exactMatches, readings);
  // All-uppercase or start-uppercase words also get the lowercase word's tags;
  // mixed-case words do not.
  if (!(alreadyLowercase || mixedCase)) {
    addTokens(lowercaseMatches, readings);
  }

  // Optionally tag an otherwise unknown lowercase word with the tags of its capitalized form.
  if (tagLowercaseWithUppercase
      && lowercaseMatches.isEmpty()
      && exactMatches.isEmpty()
      && alreadyLowercase) {
    List<AnalyzedToken> capitalizedMatches = asAnalyzedTokenListForTaggedWords(
        word, getWordTagger().tag(StringTools.uppercaseFirstChar(word)));
    if (!capitalizedMatches.isEmpty()) {
      addTokens(capitalizedMatches, readings);
    }
  }

  // Additional language-dependent tagging, only consulted when nothing was found so far.
  if (readings.isEmpty()) {
    addTokens(additionalTags(word, getWordTagger()), readings);
  }
  // Guarantee at least one (untagged) reading.
  if (readings.isEmpty()) {
    readings.add(new AnalyzedToken(word, null, null));
  }
  return readings;
}
Aggregations