Use of org.languagetool.tagging.TaggedWord in project languagetool by languagetool-org.
Class GermanTagger, method tag.
/**
 * Tags every token of a sentence with its readings.
 * <p>
 * Each token is looked up with the word tagger. If {@code ignoreCase} is set, a lowercase
 * lookup replaces an empty result while the "first word" flag is still set, and otherwise
 * is appended to the result for the token at position 0. Tokens that are still unknown are
 * split by the compound tokenizer and tagged via their last part; if that yields nothing,
 * the reading produced by {@code getNoInfoToken} is used.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @param ignoreCase whether lowercase variants of tokens may be looked up as described above
 * @return one {@code AnalyzedTokenReadings} per input token; its start position is the
 *         summed length of all preceding tokens
 * @throws IOException declared by the signature (presumably from initialization or lookup)
 */
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens, boolean ignoreCase) throws IOException {
  initializeIfRequired();
  List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
  boolean firstWord = true;
  int startPos = 0;
  for (String token : sentenceTokens) {
    List<TaggedWord> tagged = getWordTagger().tag(token);
    if (ignoreCase) {
      if (firstWord && tagged.isEmpty()) {
        // e.g. "Das" -> "das" at start of sentence
        tagged = getWordTagger().tag(token.toLowerCase());
        // the flag stays set only if this token is empty or a single non-word character
        firstWord = token.matches("^\\W?$");
      } else if (startPos == 0) {
        // "Haben", "Sollen", "Können", "Gerade" etc. at start of sentence
        tagged.addAll(getWordTagger().tag(token.toLowerCase()));
      }
    }

    List<AnalyzedToken> readings = new ArrayList<>();
    if (!tagged.isEmpty()) {
      readings.addAll(getAnalyzedTokens(tagged, token));
    } else if (StringTools.isEmpty(token.trim())) {
      readings.add(getNoInfoToken(token));
    } else {
      // word not known, try to decompose it and use the last part for POS tagging
      boolean taggedAsCompound = false;
      List<String> parts = compoundTokenizer.tokenize(token);
      if (parts.size() > 1) {
        // last part governs a word's POS
        String governingPart = parts.get(parts.size() - 1);
        if (StringTools.startsWithUppercase(token)) {
          governingPart = StringTools.uppercaseFirstChar(governingPart);
        }
        List<TaggedWord> partTagged = getWordTagger().tag(governingPart);
        if (!partTagged.isEmpty()) {
          readings.addAll(getAnalyzedTokens(partTagged, token, parts));
          taggedAsCompound = true;
        }
      }
      if (!taggedAsCompound) {
        readings.add(getNoInfoToken(token));
      }
    }

    AnalyzedToken[] readingArray = readings.toArray(new AnalyzedToken[0]);
    tokenReadings.add(new AnalyzedTokenReadings(readingArray, startPos));
    startPos += token.length();
  }
  return tokenReadings;
}
Use of org.languagetool.tagging.TaggedWord in project languagetool by languagetool-org.
Class CompoundTagger, method doGuessCompoundTag.
/**
 * Tries to guess readings for a word made of exactly two parts joined by a single dash.
 * <p>
 * The word is split at the dash and a sequence of rules is tried in order: right parts that
 * require a specific left tag, number + ending / number + adjective, "по-" adverbs,
 * "пів-" + capitalized noun, known dash prefixes, city/avenue names, generic tag matching
 * of both halves, and finally "-о" adjective prefixes. The first rule that applies decides
 * the result.
 *
 * @param word the dashed word to analyze
 * @return the guessed readings, or {@code null} if the word has no dash, a leading or
 *         trailing dash, more than one dash, or no rule applies; the "пів-" + capitalized
 *         branch returns its list even when it is empty
 */
@Nullable
private List<AnalyzedToken> doGuessCompoundTag(String word) {
  int dashIdx = word.lastIndexOf('-');
  // no dash at all (-1), or a leading/trailing dash: there are no two parts to analyze
  if (dashIdx <= 0 || dashIdx == word.length() - 1) {
    return null;
  }
  // only words with exactly one dash are handled
  int firstDashIdx = word.indexOf('-');
  if (dashIdx != firstDashIdx) {
    return null;
  }
  String leftWord = word.substring(0, dashIdx);
  String rightWord = word.substring(dashIdx + 1);
  // авіа..., авто... are written together, without a dash
  if (LEFT_INVALID.contains(leftWord.toLowerCase())) {
    return null;
  }
  // wrong: пів-качана
  if (leftWord.equalsIgnoreCase("пів") && Character.isLowerCase(rightWord.charAt(0))) {
    return null;
  }
  List<TaggedWord> leftWdList = tagAsIsAndWithLowerCase(leftWord);

  if (rightPartsWithLeftTagMap.containsKey(rightWord)) {
    if (leftWdList.isEmpty()) {
      return null;
    }
    Pattern leftTagRegex = rightPartsWithLeftTagMap.get(rightWord);
    List<AnalyzedToken> leftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(leftWord, leftWdList);
    List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(leftAnalyzedTokens.size());
    // ignore хто-то
    if (rightWord.equals("то") && LemmaHelper.hasLemma(leftAnalyzedTokens, Arrays.asList("хто", "що", "чи"))) {
      return null;
    }
    for (AnalyzedToken analyzedToken : leftAnalyzedTokens) {
      String posTag = analyzedToken.getPOSTag();
      // The null check must guard BOTH alternatives: && binds tighter than ||, so without
      // the extra parentheses a null tag would reach leftTagRegex.matcher(null) and throw.
      if (posTag != null
          && ((leftWord.equals("дуже") && posTag.contains("adv")) || leftTagRegex.matcher(posTag).matches())) {
        newAnalyzedTokens.add(new AnalyzedToken(word, posTag, analyzedToken.getLemma()));
      }
    }
    return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
  }

  if (UkrainianTagger.NUMBER.matcher(leftWord).matches()) {
    List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>();
    // e.g. 101-го
    if (NUMR_ENDING_MAP.containsKey(rightWord)) {
      List<String> tags = NUMR_ENDING_MAP.get(rightWord);
      for (String tag : tags) {
        // TODO: shall it be numr or adj?
        newAnalyzedTokens.add(new AnalyzedToken(word, IPOSTag.adj.getText() + tag + ":&numr", leftWord + "-" + "й"));
      }
    } else {
      List<TaggedWord> rightWdList = wordTagger.tag(rightWord);
      if (rightWdList.isEmpty()) {
        return null;
      }
      List<AnalyzedToken> rightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList);
      // e.g. 100-річному
      for (AnalyzedToken analyzedToken : rightAnalyzedTokens) {
        String rightPosTag = analyzedToken.getPOSTag();
        // readings without a POS tag cannot be adjectives; skip them instead of failing
        if (rightPosTag != null && rightPosTag.startsWith(IPOSTag.adj.getText())) {
          newAnalyzedTokens.add(new AnalyzedToken(word, rightPosTag, leftWord + "-" + analyzedToken.getLemma()));
        }
      }
    }
    return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
  }

  // "по-...ськи" adverbs are looked up via the corresponding "...ський" adjective
  if (leftWord.equalsIgnoreCase("по") && rightWord.endsWith("ськи")) {
    rightWord += "й";
  }
  List<TaggedWord> rightWdList = wordTagger.tag(rightWord);
  if (rightWdList.isEmpty()) {
    return null;
  }
  List<AnalyzedToken> rightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList);
  if (leftWord.equalsIgnoreCase("по")) {
    if (rightWord.endsWith("ому")) {
      return poAdvMatch(word, rightAnalyzedTokens, ADJ_TAG_FOR_PO_ADV_MIS);
    } else if (rightWord.endsWith("ський")) {
      return poAdvMatch(word, rightAnalyzedTokens, ADJ_TAG_FOR_PO_ADV_NAZ);
    }
    return null;
  }

  // exclude: Малишко-це, відносини-коли
  List<AnalyzedToken> leftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(leftWord, leftWdList);
  if (!leftWord.equalsIgnoreCase(rightWord)
      && PosTagHelper.hasPosTag(rightAnalyzedTokens, "(part|conj).*|.*:&pron.*")
      && !(PosTagHelper.hasPosTag(leftAnalyzedTokens, "numr.*") && PosTagHelper.hasPosTag(rightAnalyzedTokens, "numr.*"))) {
    return null;
  }

  if (Character.isUpperCase(rightWord.charAt(0))) {
    if (word.startsWith("пів-")) {
      List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(rightAnalyzedTokens.size());
      for (AnalyzedToken rightAnalyzedToken : rightAnalyzedTokens) {
        String rightPosTag = rightAnalyzedToken.getPOSTag();
        if (rightPosTag == null) {
          continue;
        }
        if (NOUN_SING_V_ROD_REGEX.matcher(rightPosTag).matches()) {
          // one reading per case listed in VIDMINKY_MAP, except "v_kly"
          for (String vid : PosTagHelper.VIDMINKY_MAP.keySet()) {
            if (vid.equals("v_kly")) {
              continue;
            }
            String posTag = rightPosTag.replace("v_rod", vid);
            newAnalyzedTokens.add(new AnalyzedToken(word, posTag, word));
          }
        }
      }
      // NOTE: unlike the other branches, this one returns the list even if it is empty
      return newAnalyzedTokens;
    } else {
      // we don't want Нью-Париж
      return null;
    }
  }

  if (dashPrefixes.contains(leftWord) || dashPrefixes.contains(leftWord.toLowerCase()) || DASH_PREFIX_LAT_PATTERN.matcher(leftWord).matches()) {
    return getNvPrefixNounMatch(word, rightAnalyzedTokens, leftWord);
  }

  if (Character.isUpperCase(leftWord.charAt(0)) && cityAvenue.contains(rightWord)) {
    if (leftWdList.isEmpty()) {
      return null;
    }
    return cityAvenueMatch(word, leftAnalyzedTokens);
  }

  // unless the left part is an interjection, give up if the word without the dash is known
  if (!PosTagHelper.hasPosTag(leftAnalyzedTokens, "intj.*")) {
    String noDashWord = word.replace("-", "");
    List<TaggedWord> noDashWordList = tagAsIsAndWithLowerCase(noDashWord);
    List<AnalyzedToken> noDashAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(noDashWord, noDashWordList);
    if (!noDashAnalyzedTokens.isEmpty()) {
      return null;
    }
  }

  if (!leftWdList.isEmpty()) {
    List<AnalyzedToken> tagMatch = tagMatch(word, leftAnalyzedTokens, rightAnalyzedTokens);
    if (tagMatch != null) {
      return tagMatch;
    }
  }

  if (LEFT_O_ADJ_INVALID.contains(leftWord.toLowerCase())) {
    return null;
  }
  if (O_ADJ_PATTERN.matcher(leftWord).matches()) {
    return oAdjMatch(word, rightAnalyzedTokens, leftWord);
  }

  debug_compound_unknown_write(word);
  return null;
}
Use of org.languagetool.tagging.TaggedWord in project languagetool by languagetool-org.
Class CompoundTagger, method oAdjMatch.
/**
 * Builds adjective readings for a dashed compound whose left part is an "-о" style prefix.
 * <p>
 * Unless the left part is listed in {@code LEFT_O_ADJ}, it must be taggable in one of
 * several forms (as is, via {@code oToYj}, without its last letter, or with the last letter
 * replaced by "а"); otherwise no readings are produced. If any tag of the left part
 * contains ":bad", that marker is appended to every resulting tag.
 *
 * @param word the full dashed word
 * @param analyzedTokens readings of the right part; only those whose POS tag starts with
 *        the adjective tag are used, readings without a POS tag are skipped
 * @param leftWord the part before the dash (must not be empty)
 * @return adjective readings with lemma {@code lowercased-left + "-" + right lemma},
 *         or {@code null} if none could be built
 */
@Nullable
private List<AnalyzedToken> oAdjMatch(String word, List<AnalyzedToken> analyzedTokens, String leftWord) {
  List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(analyzedTokens.size());
  String leftBase = leftWord.substring(0, leftWord.length() - 1);
  // lowercase once, with the same locale for both the lookup and the lemma
  String leftLower = leftWord.toLowerCase(conversionLocale);
  String extraTag = "";

  if (!LEFT_O_ADJ.contains(leftLower)) {
    // яскраво for яскраво-барвистий
    List<TaggedWord> taggedWords = tagBothCases(leftWord);
    if (taggedWords.isEmpty()) {
      // кричущий for кричуще-яскравий
      taggedWords = tagBothCases(oToYj(leftWord));
    }
    if (taggedWords.isEmpty()) {
      // паталог for паталого-анатомічний
      taggedWords = tagBothCases(leftBase);
    }
    if (taggedWords.isEmpty()) {
      // два for дво-триметровий
      taggedWords = tagBothCases(leftBase + "а");
    }
    if (taggedWords.isEmpty()) {
      return null;
    }
    // a left part marked as bad makes the whole compound bad
    for (TaggedWord taggedWord : taggedWords) {
      if (taggedWord.getPosTag().contains(":bad")) {
        extraTag = ":bad";
        break;
      }
    }
  }

  for (AnalyzedToken analyzedToken : analyzedTokens) {
    String posTag = analyzedToken.getPOSTag();
    // readings without a POS tag cannot be adjectives; skip them instead of failing
    if (posTag != null && posTag.startsWith(IPOSTag.adj.getText())) {
      newAnalyzedTokens.add(new AnalyzedToken(word, posTag + extraTag, leftLower + "-" + analyzedToken.getLemma()));
    }
  }
  return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
}
Use of org.languagetool.tagging.TaggedWord in project languagetool by languagetool-org.
Class EsperantoTagger, method tag.
/**
 * Tags each token of a sentence using a closed-word lookup and suffix/regex rules.
 * <p>
 * Single-character tokens get one untagged reading. Longer tokens are lowercased and looked
 * up in the manual tagger; if found, those readings are used exclusively. Otherwise the
 * first matching rule (tabelvorto pattern, noun/adjective/adverb endings, verb pattern)
 * supplies a reading, or an untagged reading is added if none matches. A participle
 * reading may then be added on top of that.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return one {@code AnalyzedTokenReadings} per input token, each created with start
 *         position 0
 * @throws IOException declared by the signature (presumably from lazy initialization)
 */
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
  lazyInit();
  List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
  for (String word : sentenceTokens) {
    List<AnalyzedToken> readings = new ArrayList<>();
    if (word.length() <= 1) {
      // Single letter word (no tag): avoids spurious tagging of "A", "O", "E", etc.
      readings.add(new AnalyzedToken(word, null, null));
    } else {
      String lower = word.toLowerCase();
      List<TaggedWord> closedWordTags = manualTagger.tag(lower);
      if (!closedWordTags.isEmpty()) {
        // This is a closed word for which we know its lemmas and tags.
        for (TaggedWord closedWordTag : closedWordTags) {
          readings.add(new AnalyzedToken(word, closedWordTag.getPosTag(), closedWordTag.getLemma()));
        }
      } else {
        int lastIdx = lower.length() - 1;
        Matcher m = patternTabelvorto.matcher(lower);
        if (m.find()) {
          // Tiu, kiu (tabelvortoj).
          String type1Group = m.group(1).substring(0, 1).toLowerCase();
          String type2Group = m.group(2);
          String plGroup = m.group(3);
          String accGroup = m.group(4);
          String type3Group = m.group(5);
          String accusative = accGroup == null
              ? "xxx"
              : (accGroup.equalsIgnoreCase("n") ? "akz" : "nak");
          String plural = plGroup == null
              ? " pn "
              : (plGroup.equalsIgnoreCase("j") ? " pl " : " np ");
          String type = (type2Group != null ? type2Group : type3Group).toLowerCase();
          readings.add(new AnalyzedToken(word, "T " + accusative + plural + type1Group + " " + type, null));
          m = patternTabelvortoAdverb.matcher(lower);
          if (m.find()) {
            readings.add(new AnalyzedToken(word, "E nak", lower));
          }
        } else if (lower.endsWith("o")) {
          // Words ending in .*oj?n? are nouns.
          readings.add(new AnalyzedToken(word, "O nak np", lower));
        } else if (lower.length() >= 2 && lower.endsWith("'")) {
          // Elided noun ending: the apostrophe stands for the final "o".
          readings.add(new AnalyzedToken(word, "O nak np", lower.substring(0, lastIdx) + "o"));
        } else if (lower.endsWith("oj")) {
          readings.add(new AnalyzedToken(word, "O nak pl", lower.substring(0, lastIdx)));
        } else if (lower.endsWith("on")) {
          readings.add(new AnalyzedToken(word, "O akz np", lower.substring(0, lastIdx)));
        } else if (lower.endsWith("ojn")) {
          readings.add(new AnalyzedToken(word, "O akz pl", lower.substring(0, lastIdx - 1)));
        } else if (lower.endsWith("a")) {
          // Words ending in .*aj?n? are adjectives.
          readings.add(new AnalyzedToken(word, "A nak np", lower));
        } else if (lower.endsWith("aj")) {
          readings.add(new AnalyzedToken(word, "A nak pl", lower.substring(0, lastIdx)));
        } else if (lower.endsWith("an")) {
          readings.add(new AnalyzedToken(word, "A akz np", lower.substring(0, lastIdx)));
        } else if (lower.endsWith("ajn")) {
          readings.add(new AnalyzedToken(word, "A akz pl", lower.substring(0, lastIdx - 1)));
        } else if (lower.endsWith("e")) {
          // Words ending in .*en? are adverbs.
          readings.add(new AnalyzedToken(word, "E nak", lower));
        } else if (lower.endsWith("en")) {
          readings.add(new AnalyzedToken(word, "E akz", lower.substring(0, lastIdx)));
        } else if ((m = patternVerb.matcher(lower)).find()) {
          // Verbs.
          String infinitive = m.group(1) + "i";
          String tense = m.group(2);
          String transitivity = findTransitivity(infinitive);
          readings.add(new AnalyzedToken(word, "V " + transitivity + " " + tense, infinitive));
        } else {
          // Irregular word (no tag).
          readings.add(new AnalyzedToken(word, null, null));
        }

        // Participle (can be combined with other tags).
        m = patternParticiple.matcher(lower);
        if (m.find() && !setNonParticiple.contains(m.group(1))) {
          String infinitive = m.group(2) + "i";
          String aio = m.group(3);
          String antAt = m.group(4).equals("n") ? "n" : "-";
          String aoe = m.group(5);
          String plural = m.group(6).equals("j") ? "pl" : "np";
          String accusative = m.group(7).equals("n") ? "akz" : "nak";
          String transitivity = findTransitivity(infinitive);
          String participleTag = "C " + accusative + " " + plural + " " + transitivity + " " + aio + " " + antAt + " " + aoe;
          readings.add(new AnalyzedToken(word, participleTag, infinitive));
        }
      }
    }
    tokenReadings.add(new AnalyzedTokenReadings(readings, 0));
  }
  return tokenReadings;
}
Use of org.languagetool.tagging.TaggedWord in project languagetool by languagetool-org.
Class GermanTagger, method getAnalyzedTokens.
/**
 * Converts the tagger results for the last part of a compound into readings for the whole
 * compound.
 * <p>
 * The lemma of each reading is all compound parts except the last, concatenated without a
 * separator, followed by the tagged word's lemma with its first character lowercased.
 *
 * @param taggedWords tagger results for the last compound part
 * @param word the complete compound as it appears in the text
 * @param compoundParts the parts the compound was split into (last part included)
 * @return one reading per tagged word; empty if {@code taggedWords} is empty
 */
private List<AnalyzedToken> getAnalyzedTokens(List<TaggedWord> taggedWords, String word, List<String> compoundParts) {
  List<AnalyzedToken> result = new ArrayList<>();
  if (taggedWords.isEmpty()) {
    // nothing to convert; also keeps compoundParts untouched, as the loop would have
    return result;
  }
  // the lemma prefix is the same for every reading, so build it once
  List<String> allButLastPart = compoundParts.subList(0, compoundParts.size() - 1);
  String lemmaPrefix = String.join("", allButLastPart);
  for (TaggedWord taggedWord : taggedWords) {
    String lemma = lemmaPrefix + StringTools.lowercaseFirstChar(taggedWord.getLemma());
    result.add(new AnalyzedToken(word, taggedWord.getPosTag(), lemma));
  }
  return result;
}
Aggregations