Search in sources :

Example 1 with TaggedWord

use of org.languagetool.tagging.TaggedWord in project languagetool by languagetool-org.

the class GermanTagger method tag.

/**
 * Produces one {@link AnalyzedTokenReadings} per input token.
 *
 * <p>Each token is looked up with the word tagger. When {@code ignoreCase} is set, the
 * lower-cased form is additionally consulted for sentence-initial tokens. Tokens that
 * remain unknown are split by the compound tokenizer and tagged via their last part;
 * if that fails as well, a "no info" token is emitted.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @param ignoreCase whether lower-cased lookups are attempted at the start of the sentence
 * @return the readings for every token; the start position of each entry is the summed
 *         length of all preceding tokens
 * @throws IOException declared by the signature; presumably raised by initialization or
 *         tagger lookups (not visible in this excerpt)
 */
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens, boolean ignoreCase) throws IOException {
    initializeIfRequired();
    List<AnalyzedTokenReadings> result = new ArrayList<>();
    boolean atSentenceStart = true;
    int offset = 0;
    for (String token : sentenceTokens) {
        List<TaggedWord> tagged = getWordTagger().tag(token);
        if (ignoreCase) {
            if (atSentenceStart && tagged.isEmpty()) {
                // e.g. "Das" -> "das" at start of sentence
                tagged = getWordTagger().tag(token.toLowerCase());
                // stay in "sentence start" mode only while the token is empty or a single non-word char
                atSentenceStart = token.matches("^\\W?$");
            } else if (offset == 0) {
                // "Haben", "Sollen", "Können", "Gerade" etc. at start of sentence
                tagged.addAll(getWordTagger().tag(token.toLowerCase()));
            }
        }

        List<AnalyzedToken> readings = new ArrayList<>();
        if (!tagged.isEmpty()) {
            readings.addAll(getAnalyzedTokens(tagged, token));
        } else if (StringTools.isEmpty(token.trim())) {
            // whitespace-only token: nothing to decompose
            readings.add(getNoInfoToken(token));
        } else {
            // word not known, try to decompose it and use the last part for POS tagging:
            List<String> parts = compoundTokenizer.tokenize(token);
            if (parts.size() <= 1) {
                readings.add(getNoInfoToken(token));
            } else {
                // last part governs a word's POS:
                String head = parts.get(parts.size() - 1);
                if (StringTools.startsWithUppercase(token)) {
                    head = StringTools.uppercaseFirstChar(head);
                }
                List<TaggedWord> headTagged = getWordTagger().tag(head);
                if (headTagged.size() > 0) {
                    readings.addAll(getAnalyzedTokens(headTagged, token, parts));
                } else {
                    readings.add(getNoInfoToken(token));
                }
            }
        }

        AnalyzedToken[] readingArray = readings.toArray(new AnalyzedToken[readings.size()]);
        result.add(new AnalyzedTokenReadings(readingArray, offset));
        offset += token.length();
    }
    return result;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) TaggedWord(org.languagetool.tagging.TaggedWord) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 2 with TaggedWord

use of org.languagetool.tagging.TaggedWord in project languagetool by languagetool-org.

the class CompoundTagger method doGuessCompoundTag.

/**
 * Tries to derive readings for a word that contains exactly one dash by analysing
 * the parts to the left and right of that dash.
 *
 * <p>A sequence of special cases is checked in order (known right parts with a required
 * left tag, number + ending, "по-" adverbs, "пів-" + capitalised noun, dash prefixes,
 * city/avenue names, generic tag matching, "-о" adjectives); the first one that applies
 * decides the result.
 *
 * @param word the candidate compound
 * @return the guessed readings, or {@code null} when the word has no dash, a leading or
 *         trailing dash, more than one dash, or none of the heuristics produced a
 *         reading (note: the "пів-" branch may return an empty list)
 */
@Nullable
private List<AnalyzedToken> doGuessCompoundTag(String word) {
    int dashIdx = word.lastIndexOf('-');
    // No dash at all (-1), a leading dash, or a trailing dash: nothing to split.
    // The "-1" case previously fell through to substring(0, -1) and crashed.
    if (dashIdx <= 0 || dashIdx == word.length() - 1) {
        return null;
    }
    int firstDashIdx = word.indexOf('-');
    // only words with exactly one dash are handled here
    if (dashIdx != firstDashIdx) {
        return null;
    }
    String leftWord = word.substring(0, dashIdx);
    String rightWord = word.substring(dashIdx + 1);
    // авіа..., авто... are written as one word (without a dash)
    if (LEFT_INVALID.contains(leftWord.toLowerCase())) {
        return null;
    }
    // wrong: пів-качана
    if (leftWord.equalsIgnoreCase("пів") && Character.isLowerCase(rightWord.charAt(0))) {
        return null;
    }
    List<TaggedWord> leftWdList = tagAsIsAndWithLowerCase(leftWord);
    if (rightPartsWithLeftTagMap.containsKey(rightWord)) {
        if (leftWdList.isEmpty()) {
            return null;
        }
        Pattern leftTagRegex = rightPartsWithLeftTagMap.get(rightWord);
        List<AnalyzedToken> leftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(leftWord, leftWdList);
        List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(leftAnalyzedTokens.size());
        // ignore хто-то
        if (rightWord.equals("то") && LemmaHelper.hasLemma(leftAnalyzedTokens, Arrays.asList("хто", "що", "чи"))) {
            return null;
        }
        for (AnalyzedToken analyzedToken : leftAnalyzedTokens) {
            String posTag = analyzedToken.getPOSTag();
            // The null check must guard BOTH alternatives: '&&' binds tighter than '||',
            // so without the extra parentheses a null tag reached matcher(null) and threw.
            if (posTag != null
                    && ((leftWord.equals("дуже") && posTag.contains("adv")) || leftTagRegex.matcher(posTag).matches())) {
                newAnalyzedTokens.add(new AnalyzedToken(word, posTag, analyzedToken.getLemma()));
            }
        }
        return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
    }
    if (UkrainianTagger.NUMBER.matcher(leftWord).matches()) {
        List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>();
        // e.g. 101-го
        if (NUMR_ENDING_MAP.containsKey(rightWord)) {
            List<String> tags = NUMR_ENDING_MAP.get(rightWord);
            for (String tag : tags) {
                // TODO: shall it be numr or adj?
                newAnalyzedTokens.add(new AnalyzedToken(word, IPOSTag.adj.getText() + tag + ":&numr", leftWord + "-" + "й"));
            }
        } else {
            List<TaggedWord> rightWdList = wordTagger.tag(rightWord);
            if (rightWdList.isEmpty()) {
                return null;
            }
            List<AnalyzedToken> rightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList);
            // e.g. 100-річному
            for (AnalyzedToken analyzedToken : rightAnalyzedTokens) {
                String rightPosTag = analyzedToken.getPOSTag();
                // untagged readings carry a null POS tag and can never be adjectives
                if (rightPosTag != null && rightPosTag.startsWith(IPOSTag.adj.getText())) {
                    newAnalyzedTokens.add(new AnalyzedToken(word, rightPosTag, leftWord + "-" + analyzedToken.getLemma()));
                }
            }
        }
        return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
    }
    if (leftWord.equalsIgnoreCase("по") && rightWord.endsWith("ськи")) {
        // look the right part up as the adjective form ending in "ський"
        rightWord += "й";
    }
    List<TaggedWord> rightWdList = wordTagger.tag(rightWord);
    if (rightWdList.isEmpty()) {
        return null;
    }
    List<AnalyzedToken> rightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList);
    if (leftWord.equalsIgnoreCase("по")) {
        if (rightWord.endsWith("ому")) {
            return poAdvMatch(word, rightAnalyzedTokens, ADJ_TAG_FOR_PO_ADV_MIS);
        } else if (rightWord.endsWith("ський")) {
            return poAdvMatch(word, rightAnalyzedTokens, ADJ_TAG_FOR_PO_ADV_NAZ);
        }
        return null;
    }
    // exclude: Малишко-це, відносини-коли
    List<AnalyzedToken> leftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(leftWord, leftWdList);
    if (!leftWord.equalsIgnoreCase(rightWord)
            && PosTagHelper.hasPosTag(rightAnalyzedTokens, "(part|conj).*|.*:&pron.*")
            && !(PosTagHelper.hasPosTag(leftAnalyzedTokens, "numr.*") && PosTagHelper.hasPosTag(rightAnalyzedTokens, "numr.*"))) {
        return null;
    }
    if (Character.isUpperCase(rightWord.charAt(0))) {
        if (word.startsWith("пів-")) {
            List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(rightAnalyzedTokens.size());
            for (AnalyzedToken rightAnalyzedToken : rightAnalyzedTokens) {
                String rightPosTag = rightAnalyzedToken.getPOSTag();
                if (rightPosTag == null) {
                    continue;
                }
                if (NOUN_SING_V_ROD_REGEX.matcher(rightPosTag).matches()) {
                    // emit one reading per key of VIDMINKY_MAP except "v_kly",
                    // substituting it for "v_rod" in the tag
                    for (String vid : PosTagHelper.VIDMINKY_MAP.keySet()) {
                        if (vid.equals("v_kly")) {
                            continue;
                        }
                        String posTag = rightPosTag.replace("v_rod", vid);
                        newAnalyzedTokens.add(new AnalyzedToken(word, posTag, word));
                    }
                }
            }
            return newAnalyzedTokens;
        } else {
            // we don't want Нью-Париж
            return null;
        }
    }
    if (dashPrefixes.contains(leftWord) || dashPrefixes.contains(leftWord.toLowerCase()) || DASH_PREFIX_LAT_PATTERN.matcher(leftWord).matches()) {
        return getNvPrefixNounMatch(word, rightAnalyzedTokens, leftWord);
    }
    if (Character.isUpperCase(leftWord.charAt(0)) && cityAvenue.contains(rightWord)) {
        if (leftWdList.isEmpty()) {
            return null;
        }
        return cityAvenueMatch(word, leftAnalyzedTokens);
    }
    if (!PosTagHelper.hasPosTag(leftAnalyzedTokens, "intj.*")) {
        // if the word is known when written without the dash, do not guess a compound reading
        String noDashWord = word.replace("-", "");
        List<TaggedWord> noDashWordList = tagAsIsAndWithLowerCase(noDashWord);
        List<AnalyzedToken> noDashAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(noDashWord, noDashWordList);
        if (!noDashAnalyzedTokens.isEmpty()) {
            return null;
        }
    }
    if (!leftWdList.isEmpty()) {
        List<AnalyzedToken> tagMatch = tagMatch(word, leftAnalyzedTokens, rightAnalyzedTokens);
        if (tagMatch != null) {
            return tagMatch;
        }
    }
    if (LEFT_O_ADJ_INVALID.contains(leftWord.toLowerCase())) {
        return null;
    }
    if (O_ADJ_PATTERN.matcher(leftWord).matches()) {
        return oAdjMatch(word, rightAnalyzedTokens, leftWord);
    }
    debug_compound_unknown_write(word);
    return null;
}
Also used : Pattern(java.util.regex.Pattern) TaggedWord(org.languagetool.tagging.TaggedWord) AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) Nullable(org.jetbrains.annotations.Nullable)

Example 3 with TaggedWord

use of org.languagetool.tagging.TaggedWord in project languagetool by languagetool-org.

the class CompoundTagger method oAdjMatch.

/**
 * Builds adjective readings for a compound whose left part is passed as {@code leftWord}
 * (the examples below are all "-о"/"-е" forms) and whose right part has already been analysed.
 *
 * <p>Unless the lower-cased left part is listed in {@code LEFT_O_ADJ}, it must be
 * recognisable by the tagger in one of several derived forms; otherwise {@code null}
 * is returned. If any of the left-part readings is tagged {@code :bad}, that marker is
 * appended to every resulting tag.
 *
 * @param word the full compound
 * @param analyzedTokens readings of the right part
 * @param leftWord the part before the dash; must be non-empty
 * @return one reading per adjective reading of the right part, or {@code null} if there are none
 */
@Nullable
private List<AnalyzedToken> oAdjMatch(String word, List<AnalyzedToken> analyzedTokens, String leftWord) {
    List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(analyzedTokens.size());
    String leftBase = leftWord.substring(0, leftWord.length() - 1);
    // Lower-case once with the tagger's locale so the lookup and the generated lemma
    // agree and neither depends on the JVM default locale.
    String leftWordLower = leftWord.toLowerCase(conversionLocale);
    String extraTag = "";
    if (!LEFT_O_ADJ.contains(leftWordLower)) {
        // яскраво for яскраво-барвистий
        List<TaggedWord> taggedWords = tagBothCases(leftWord);
        if (taggedWords.isEmpty()) {
            // кричущий for кричуще-яскравий
            taggedWords = tagBothCases(oToYj(leftWord));
        }
        if (taggedWords.isEmpty()) {
            // паталог for паталого-анатомічний
            taggedWords = tagBothCases(leftBase);
        }
        if (taggedWords.isEmpty()) {
            // два for дво-триметровий
            taggedWords = tagBothCases(leftBase + "а");
        }
        if (taggedWords.isEmpty()) {
            return null;
        }
        for (TaggedWord taggedWord : taggedWords) {
            if (taggedWord.getPosTag().contains(":bad")) {
                extraTag = ":bad";
                break;
            }
        }
    }
    for (AnalyzedToken analyzedToken : analyzedTokens) {
        String posTag = analyzedToken.getPOSTag();
        // readings without a POS tag (null) cannot be adjectives; skip instead of throwing
        if (posTag != null && posTag.startsWith(IPOSTag.adj.getText())) {
            newAnalyzedTokens.add(new AnalyzedToken(word, posTag + extraTag, leftWordLower + "-" + analyzedToken.getLemma()));
        }
    }
    return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) TaggedWord(org.languagetool.tagging.TaggedWord) ArrayList(java.util.ArrayList) Nullable(org.jetbrains.annotations.Nullable)

Example 4 with TaggedWord

use of org.languagetool.tagging.TaggedWord in project languagetool by languagetool-org.

the class EsperantoTagger method tag.

/**
 * Tags every token of a sentence using a manual dictionary first and suffix/regex
 * heuristics as a fallback.
 *
 * <p>Tokens of length one (or zero) receive a single untagged reading. Longer tokens
 * are lower-cased and looked up in the manual tagger; if nothing is found, the word
 * is classified by pattern (tabelvorto, noun, adjective, adverb, verb, or untagged)
 * and may additionally receive a participle reading.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return one {@link AnalyzedTokenReadings} per token, each created with start position 0
 * @throws IOException declared by the signature; presumably raised by lazy initialization
 *         (not visible in this excerpt)
 */
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
    lazyInit();
    List<AnalyzedTokenReadings> result = new ArrayList<>();
    for (String word : sentenceTokens) {
        List<AnalyzedToken> readings = new ArrayList<>();
        if (word.length() <= 1) {
            // Single letter word (no tag): avoids spurious tagging of "A", "O", "E", etc.
            readings.add(new AnalyzedToken(word, null, null));
        } else {
            String lower = word.toLowerCase();
            List<TaggedWord> dictionaryHits = manualTagger.tag(lower);
            if (dictionaryHits.isEmpty()) {
                Matcher m;
                if ((m = patternTabelvorto.matcher(lower)).find()) {
                    // Tiu, kiu (tabelvortoj).
                    String firstLetter = m.group(1).substring(0, 1).toLowerCase();
                    String typeA = m.group(2);
                    String pluralGroup = m.group(3);
                    String accusativeGroup = m.group(4);
                    String typeB = m.group(5);

                    String caseTag;
                    if (accusativeGroup == null) {
                        caseTag = "xxx";
                    } else if (accusativeGroup.equalsIgnoreCase("n")) {
                        caseTag = "akz";
                    } else {
                        caseTag = "nak";
                    }

                    String numberTag;
                    if (pluralGroup == null) {
                        numberTag = " pn ";
                    } else if (pluralGroup.equalsIgnoreCase("j")) {
                        numberTag = " pl ";
                    } else {
                        numberTag = " np ";
                    }

                    String chosenType = (typeA != null) ? typeA : typeB;
                    String kind = chosenType.toLowerCase();
                    readings.add(new AnalyzedToken(word, "T " + caseTag + numberTag + firstLetter + " " + kind, null));
                    if (patternTabelvortoAdverb.matcher(lower).find()) {
                        readings.add(new AnalyzedToken(word, "E nak", lower));
                    }
                } else if (lower.endsWith("o")) {
                    // Words ending in .*oj?n? are nouns.
                    readings.add(new AnalyzedToken(word, "O nak np", lower));
                } else if (lower.length() >= 2 && lower.endsWith("'")) {
                    // the trailing apostrophe is replaced by "o" to form the lemma
                    String stem = lower.substring(0, lower.length() - 1);
                    readings.add(new AnalyzedToken(word, "O nak np", stem + "o"));
                } else if (lower.endsWith("oj")) {
                    readings.add(new AnalyzedToken(word, "O nak pl", lower.substring(0, lower.length() - 1)));
                } else if (lower.endsWith("on")) {
                    readings.add(new AnalyzedToken(word, "O akz np", lower.substring(0, lower.length() - 1)));
                } else if (lower.endsWith("ojn")) {
                    readings.add(new AnalyzedToken(word, "O akz pl", lower.substring(0, lower.length() - 2)));
                } else if (lower.endsWith("a")) {
                    // Words ending in .*aj?n? are adjectives.
                    readings.add(new AnalyzedToken(word, "A nak np", lower));
                } else if (lower.endsWith("aj")) {
                    readings.add(new AnalyzedToken(word, "A nak pl", lower.substring(0, lower.length() - 1)));
                } else if (lower.endsWith("an")) {
                    readings.add(new AnalyzedToken(word, "A akz np", lower.substring(0, lower.length() - 1)));
                } else if (lower.endsWith("ajn")) {
                    readings.add(new AnalyzedToken(word, "A akz pl", lower.substring(0, lower.length() - 2)));
                } else if (lower.endsWith("e")) {
                    // Words ending in .*en? are adverbs.
                    readings.add(new AnalyzedToken(word, "E nak", lower));
                } else if (lower.endsWith("en")) {
                    readings.add(new AnalyzedToken(word, "E akz", lower.substring(0, lower.length() - 1)));
                } else if ((m = patternVerb.matcher(lower)).find()) {
                    // Verbs.
                    String infinitive = m.group(1) + "i";
                    String tense = m.group(2);
                    String transitivity = findTransitivity(infinitive);
                    readings.add(new AnalyzedToken(word, "V " + transitivity + " " + tense, infinitive));
                } else {
                    // Irregular word (no tag).
                    readings.add(new AnalyzedToken(word, null, null));
                }

                // Participle (can be combined with other tags).
                m = patternParticiple.matcher(lower);
                if (m.find() && !setNonParticiple.contains(m.group(1))) {
                    String infinitive = m.group(2) + "i";
                    String aio = m.group(3);
                    String antAt;
                    if (m.group(4).equals("n")) {
                        antAt = "n";
                    } else {
                        antAt = "-";
                    }
                    String aoe = m.group(5);
                    String numberTag = m.group(6).equals("j") ? "pl" : "np";
                    String caseTag = m.group(7).equals("n") ? "akz" : "nak";
                    String transitivity = findTransitivity(infinitive);
                    String participleTag = "C " + caseTag + " " + numberTag + " " + transitivity + " " + aio + " " + antAt + " " + aoe;
                    readings.add(new AnalyzedToken(word, participleTag, infinitive));
                }
            } else {
                // This is a closed word for which we know its lemmas and tags.
                for (TaggedWord hit : dictionaryHits) {
                    readings.add(new AnalyzedToken(word, hit.getPosTag(), hit.getLemma()));
                }
            }
        }
        result.add(new AnalyzedTokenReadings(readings, 0));
    }
    return result;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) TaggedWord(org.languagetool.tagging.TaggedWord) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 5 with TaggedWord

use of org.languagetool.tagging.TaggedWord in project languagetool by languagetool-org.

the class GermanTagger method getAnalyzedTokens.

/**
 * Converts the tagger results for the last part of a compound into readings for the
 * whole compound.
 *
 * <p>The lemma of each reading is all compound parts except the last one, concatenated
 * without a separator, followed by the tagged lemma with its first character lower-cased.
 *
 * @param taggedWords tagger results for the last compound part
 * @param word the full compound as it appeared in the text
 * @param compoundParts the parts of the compound, in order; must be non-empty whenever
 *        {@code taggedWords} is non-empty
 * @return one reading per entry of {@code taggedWords}, in the same order
 */
private List<AnalyzedToken> getAnalyzedTokens(List<TaggedWord> taggedWords, String word, List<String> compoundParts) {
    List<AnalyzedToken> result = new ArrayList<>(taggedWords.size());
    if (taggedWords.isEmpty()) {
        // nothing to convert; also keeps compoundParts untouched in this case, as before
        return result;
    }
    // The prefix is identical for every reading, so build it once instead of per iteration.
    String lemmaPrefix = String.join("", compoundParts.subList(0, compoundParts.size() - 1));
    for (TaggedWord taggedWord : taggedWords) {
        String lemma = lemmaPrefix + StringTools.lowercaseFirstChar(taggedWord.getLemma());
        result.add(new AnalyzedToken(word, taggedWord.getPosTag(), lemma));
    }
    return result;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) TaggedWord(org.languagetool.tagging.TaggedWord) ArrayList(java.util.ArrayList)

Aggregations

ArrayList (java.util.ArrayList): 6, AnalyzedToken (org.languagetool.AnalyzedToken): 6, TaggedWord (org.languagetool.tagging.TaggedWord): 6, AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings): 3, Nullable (org.jetbrains.annotations.Nullable): 2, Matcher (java.util.regex.Matcher): 1, Pattern (java.util.regex.Pattern): 1