Search in sources :

Example 46 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class UkrainianTagger method getAnalyzedTokens.

@Override
protected List<AnalyzedToken> getAnalyzedTokens(String word) {
    List<AnalyzedToken> tokens = super.getAnalyzedTokens(word);
    if (tokens.get(0).getPOSTag() == null) {
        char otherHyphen = getOtherHyphen(word);
        if (otherHyphen != '' && ALT_DASHES_IN_WORD.matcher(word).find()) {
            String newWord = word.replace(otherHyphen, '-');
            List<AnalyzedToken> newTokens = super.getAnalyzedTokens(newWord);
            for (int i = 0; i < newTokens.size(); i++) {
                AnalyzedToken analyzedToken = newTokens.get(i);
                if (newWord.equals(analyzedToken.getToken())) {
                    String lemma = analyzedToken.getLemma();
                    if (lemma != null) {
                        lemma = lemma.replace('-', otherHyphen);
                    }
                    AnalyzedToken newToken = new AnalyzedToken(word, analyzedToken.getPOSTag(), lemma);
                    newTokens.set(i, newToken);
                }
            }
            tokens = newTokens;
        }
    }
    return tokens;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken)

Example 47 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class TokenInflectionAgreementRule method match.

/**
 * Looks for an adjective token directly followed by a noun token whose inflections do not
 * agree and reports each such pair as a rule match.
 *
 * <p>The scan keeps the adjective readings of the previous token in {@code adjTokenReadings}.
 * A token becomes the pending adjective only if every tagged reading starts with
 * {@code "adj"} and the following token has a {@code "noun:"} tag part. On the next
 * iteration the noun readings are collected; if the adjective and noun inflection lists are
 * disjoint and {@code TokenInflectionExceptionHelper} does not excuse the pair, a
 * {@link RuleMatch} spanning both tokens is created with synthesized suggestions.
 *
 * @param text the analyzed sentence to check
 * @return the matches found, converted with {@code toRuleMatchArray}
 * @throws RuntimeException wrapping an {@link IOException} raised by the synthesizer
 */
@Override
public final RuleMatch[] match(AnalyzedSentence text) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
    List<AnalyzedToken> adjTokenReadings = new ArrayList<>();
    AnalyzedTokenReadings adjAnalyzedTokenReadings = null;
    // index 0 is skipped, so scanning starts at the second element of the token array
    for (int i = 1; i < tokens.length; i++) {
        AnalyzedTokenReadings tokenReadings = tokens[i];
        String posTag0 = tokenReadings.getAnalyzedToken(0).getPOSTag();
        if (posTag0 == null) {
            // (a check for JLanguageTool.SENTENCE_START_TAGNAME is disabled here)
            adjTokenReadings.clear();
            continue;
        }
        if (adjTokenReadings.isEmpty()) {
            // no need to start checking on last token or if no noun
            if (i == tokens.length - 1) {
                continue;
            }
            //TODO: nv still can be wrong if :np/:ns is present to it's not much gain for lots of work
            if (PosTagHelper.hasPosTagPart(tokens[i], ":nv")
                    //TODO: turn back on when we can handle pron
                    || PosTagHelper.hasPosTagPart(tokens[i], "&pron")
                    || PosTagHelper.hasPosTagPart(tokens[i], "<")) {
                continue;
            }
            if (!PosTagHelper.hasPosTagPart(tokens[i + 1], "noun:")
                    || PosTagHelper.hasPosTagPart(tokens[i + 1], ":nv")
                    || PosTagHelper.hasPosTagPart(tokens[i + 1], "&pron")
                    || PosTagHelper.hasPosTagPart(tokens[i + 1], "<")) {
                continue;
            }
            if (LemmaHelper.hasLemma(tokens[i], Arrays.asList("червоний", "правий", "місцевий", "найсильніший", "найкращі"), ":p:")
                    || LemmaHelper.hasLemma(tokens[i], Arrays.asList("новенький", "головний", "вибраний", "більший", "побачений", "подібний"), ":n:")
                    || LemmaHelper.hasLemma(tokens[i], Arrays.asList("державний"), ":f:")) {
                adjTokenReadings.clear();
                // NOTE(review): this break ends the scan of the whole sentence, not just of
                // this token — confirm that is intended rather than "continue".
                break;
            }
            for (AnalyzedToken token : tokenReadings) {
                String adjPosTag = token.getPOSTag();
                if (adjPosTag == null) {
                    // can happen for words with a combining acute accent or a soft hyphen
                    continue;
                }
                if (adjPosTag.startsWith("adj")) {
                    adjTokenReadings.add(token);
                    adjAnalyzedTokenReadings = tokenReadings;
                } else {
                    // a single non-adjective reading disqualifies the token
                    adjTokenReadings.clear();
                    break;
                }
            }
            continue;
        }
        // an adjective is pending: collect the noun readings of the current token
        List<AnalyzedToken> slaveTokenReadings = new ArrayList<>();
        for (AnalyzedToken token : tokenReadings) {
            String nounPosTag = token.getPOSTag();
            if (nounPosTag == null) {
                // can happen for words with a combining acute accent or a soft hyphen
                continue;
            }
            if (nounPosTag.startsWith("noun") && !nounPosTag.contains(NO_VIDMINOK_SUBSTR)) {
                slaveTokenReadings.add(token);
            } else if (nounPosTag.equals(JLanguageTool.SENTENCE_END_TAGNAME) || nounPosTag.equals(JLanguageTool.PARAGRAPH_END_TAGNAME)) {
                continue;
            } else {
                // a single other reading disqualifies the token as a noun
                slaveTokenReadings.clear();
                break;
            }
        }
        if (slaveTokenReadings.isEmpty()) {
            adjTokenReadings.clear();
            continue;
        }
        if (DEBUG) {
            System.err.println(MessageFormat.format("=== Checking:\n\t{0}\n\t{1}", adjTokenReadings, slaveTokenReadings));
        }
        // perform the check
        List<InflectionHelper.Inflection> masterInflections = InflectionHelper.getAdjInflections(adjTokenReadings);
        List<InflectionHelper.Inflection> slaveInflections = InflectionHelper.getNounInflections(slaveTokenReadings);
        if (Collections.disjoint(masterInflections, slaveInflections)) {
            if (TokenInflectionExceptionHelper.isException(tokens, i, masterInflections, slaveInflections, adjTokenReadings, slaveTokenReadings)) {
                adjTokenReadings.clear();
                continue;
            }
            if (DEBUG) {
                // the pattern needs {1}, otherwise the noun-side argument is silently dropped
                System.err.println(MessageFormat.format("=== Found:\n\t{0}\n\t{1}", adjAnalyzedTokenReadings.getToken() + ": " + masterInflections + " // " + adjAnalyzedTokenReadings, slaveTokenReadings.get(0).getToken() + ": " + slaveInflections + " // " + slaveTokenReadings));
            }
            String msg = String.format("Потенційна помилка: прикметник не узгоджений з іменником: \"%s\": [%s] і \"%s\": [%s]", adjTokenReadings.get(0).getToken(), formatInflections(masterInflections, true), slaveTokenReadings.get(0).getToken(), formatInflections(slaveInflections, false));
            if (PosTagHelper.hasPosTagPart(adjTokenReadings, ":m:v_rod") && tokens[i].getToken().matches(".*[ую]") && PosTagHelper.hasPosTag(slaveTokenReadings, "noun.*:m:v_dav.*")) {
                msg += ". Можливо вжито невнормований родовий відмінок ч.р. з закінченням -у/-ю замість -а/-я (така тенденція є в сучасній мові)?";
            }
            RuleMatch potentialRuleMatch = new RuleMatch(this, adjAnalyzedTokenReadings.getStartPos(), tokenReadings.getEndPos(), msg, getShort());
            Synthesizer ukrainianSynthesizer = ukrainian.getSynthesizer();
            List<String> suggestions = new ArrayList<>();
            try {
                // first: keep the adjective as written and re-inflect the noun to match it
                for (Inflection adjInflection : masterInflections) {
                    String genderTag = ":" + adjInflection.gender + ":";
                    String vidmTag = adjInflection._case;
                    if (!adjInflection._case.equals("v_kly") && (adjInflection.gender.equals("p") || PosTagHelper.hasPosTagPart(slaveTokenReadings, genderTag))) {
                        for (AnalyzedToken nounToken : slaveTokenReadings) {
                            if (adjInflection.animMatters()) {
                                if (!nounToken.getPOSTag().contains(":" + adjInflection.animTag)) {
                                    continue;
                                }
                            }
                            String newNounPosTag = nounToken.getPOSTag().replaceFirst(":.:v_...", genderTag + vidmTag);
                            String[] synthesized = ukrainianSynthesizer.synthesize(nounToken, newNounPosTag, false);
                            for (String s : synthesized) {
                                String suggestion = adjAnalyzedTokenReadings.getToken() + " " + s;
                                if (!suggestions.contains(suggestion)) {
                                    suggestions.add(suggestion);
                                }
                            }
                        }
                    }
                }
                // then: keep the noun as written and re-inflect the adjective to match it
                for (Inflection nounInflection : slaveInflections) {
                    String genderTag = ":" + nounInflection.gender + ":";
                    String vidmTag = nounInflection._case;
                    if (nounInflection.animMatters()) {
                        vidmTag += ":r" + nounInflection.animTag;
                    }
                    for (AnalyzedToken adjToken : adjTokenReadings) {
                        String newAdjTag = adjToken.getPOSTag().replaceFirst(":.:v_...(:r(in)?anim)?", genderTag + vidmTag);
                        String[] synthesized = ukrainianSynthesizer.synthesize(adjToken, newAdjTag, false);
                        for (String s : synthesized) {
                            String suggestion = s + " " + tokenReadings.getToken();
                            if (!suggestions.contains(suggestion)) {
                                suggestions.add(suggestion);
                            }
                        }
                    }
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            if (!suggestions.isEmpty()) {
                potentialRuleMatch.setSuggestedReplacements(suggestions);
            }
            ruleMatches.add(potentialRuleMatch);
        }
        // the pending adjective is consumed whether or not a match was reported
        adjTokenReadings.clear();
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : ArrayList(java.util.ArrayList) Inflection(org.languagetool.rules.uk.InflectionHelper.Inflection) IOException(java.io.IOException) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) RuleMatch(org.languagetool.rules.RuleMatch) AnalyzedToken(org.languagetool.AnalyzedToken) Synthesizer(org.languagetool.synthesis.Synthesizer)

Example 48 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class UkrainianHybridDisambiguator method removeIanimVKly.

/**
 * Removes readings whose POS tag matches {@code INANIM_VKLY} from every token that also
 * has at least one other tagged reading.
 *
 * <p>A token is left untouched when it is followed by {@code ","}, {@code "!"} or
 * {@code "»"} and the preceding token has a POS tag matching {@code "adj.*v_kly.*"}.
 * Collecting readings for a token stops at the first reading without a POS tag; readings
 * tagged as sentence end are ignored.
 *
 * @param input the sentence whose token readings are pruned in place
 */
private void removeIanimVKly(AnalyzedSentence input) {
    AnalyzedTokenReadings[] tokens = input.getTokensWithoutWhitespace();
    for (int i = 1; i < tokens.length; i++) {
        List<AnalyzedToken> readings = tokens[i].getReadings();
        boolean hasFollowingToken = i < tokens.length - 1;
        if (hasFollowingToken
                && Arrays.asList(",", "!", "»").contains(tokens[i + 1].getToken())
                && PosTagHelper.hasPosTag(tokens[i - 1], "adj.*v_kly.*")) {
            continue;
        }

        List<AnalyzedToken> toRemove = new ArrayList<>();
        boolean sawOtherReading = false;
        for (AnalyzedToken reading : readings) {
            String tag = reading.getPOSTag();
            if (tag == null) {
                // stop collecting at the first untagged reading
                break;
            }
            if (tag.equals(JLanguageTool.SENTENCE_END_TAGNAME)) {
                continue;
            }
            if (INANIM_VKLY.matcher(tag).matches()) {
                toRemove.add(reading);
            } else {
                sawOtherReading = true;
            }
        }

        // only prune when something else remains on the token
        if (!sawOtherReading || toRemove.isEmpty()) {
            continue;
        }
        for (AnalyzedToken reading : toRemove) {
            tokens[i].removeReading(reading);
        }
    }
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 49 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class CompoundTagger method oAdjMatch.

/**
 * Builds adjective readings for a hyphenated compound whose left part ends in a linking
 * vowel, taking the POS tags from the readings of the right part.
 *
 * <p>Unless the lower-cased left part is listed in {@code LEFT_O_ADJ}, it must be taggable
 * in one of several derived forms (as is, via {@code oToYj}, without its last character,
 * or without its last character plus "а"); otherwise {@code null} is returned. If any of
 * the tags found contains {@code ":bad"}, that marker is appended to every produced tag.
 *
 * @param word the full compound token
 * @param analyzedTokens readings of the right part of the compound
 * @param leftWord the left part of the compound; must not be empty
 * @return readings for {@code word} built from the adjective readings in
 *         {@code analyzedTokens}, or {@code null} if there are none or the left part
 *         could not be tagged
 */
@Nullable
private List<AnalyzedToken> oAdjMatch(String word, List<AnalyzedToken> analyzedTokens, String leftWord) {
    List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(analyzedTokens.size());
    String leftBase = leftWord.substring(0, leftWord.length() - 1);
    // lower-case once with the tagger's locale so the lookup and the lemma agree
    String leftWordLower = leftWord.toLowerCase(conversionLocale);
    String extraTag = "";
    if (!LEFT_O_ADJ.contains(leftWordLower)) {
        // e.g. яскраво for яскраво-барвистий
        List<TaggedWord> taggedWords = tagBothCases(leftWord);
        if (taggedWords.isEmpty()) {
            // e.g. кричущий for кричуще-яскравий
            taggedWords = tagBothCases(oToYj(leftWord));
        }
        if (taggedWords.isEmpty()) {
            // e.g. паталог for паталого-анатомічний
            taggedWords = tagBothCases(leftBase);
        }
        if (taggedWords.isEmpty()) {
            // e.g. два for дво-триметровий
            taggedWords = tagBothCases(leftBase + "а");
        }
        if (taggedWords.isEmpty()) {
            return null;
        }
        for (TaggedWord taggedWord : taggedWords) {
            if (taggedWord.getPosTag().contains(":bad")) {
                extraTag = ":bad";
                break;
            }
        }
    }
    for (AnalyzedToken analyzedToken : analyzedTokens) {
        String posTag = analyzedToken.getPOSTag();
        // untagged readings cannot be adjectives; skip them instead of failing
        if (posTag != null && posTag.startsWith(IPOSTag.adj.getText())) {
            newAnalyzedTokens.add(new AnalyzedToken(word, posTag + extraTag, leftWordLower + "-" + analyzedToken.getLemma()));
        }
    }
    return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) TaggedWord(org.languagetool.tagging.TaggedWord) ArrayList(java.util.ArrayList) Nullable(org.jetbrains.annotations.Nullable)

Example 50 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class TokenAgreementRule method createRuleMatch.

/**
 * Creates the rule match for a token whose case does not fit what the governing token
 * requires, including a message and synthesized replacement suggestions.
 *
 * <p>Suggestions are synthesized from every tagged reading of {@code tokenReadings} by
 * replacing its {@code :v_...} case part with a regex built from {@code posTagsToFind}.
 * Two special cases extend the message and the suggestions: the token "їх" (forms of
 * "їхній" are offered) and the governing token "о" before an animate noun in the
 * nominative (the vocative form is offered).
 *
 * @param tokenReadings the token that is reported and replaced
 * @param reqTokenReadings the governing token named in the message
 * @param posTagsToFind the case tags accepted by the governing token
 * @return a match covering {@code tokenReadings} with the collected suggestions
 * @throws RuntimeException wrapping an {@link IOException} raised by the synthesizer
 */
private RuleMatch createRuleMatch(AnalyzedTokenReadings tokenReadings, AnalyzedTokenReadings reqTokenReadings, List<String> posTagsToFind) {
    String tokenString = tokenReadings.getToken();
    Synthesizer ukrainianSynthesizer = ukrainian.getSynthesizer();
    List<String> suggestions = new ArrayList<>();
    String requiredPostTagsRegEx = ":(" + String.join("|", posTagsToFind) + ")";
    for (AnalyzedToken analyzedToken : tokenReadings.getReadings()) {
        String oldPosTag = analyzedToken.getPOSTag();
        if (oldPosTag == null) {
            continue;
        }
        String requiredPostTagsRegExToApply = requiredPostTagsRegEx;
        Matcher matcher = REQ_ANIM_INANIM_PATTERN.matcher(oldPosTag);
        if (matcher.find()) {
            // keep the animacy requirement found on the reading
            requiredPostTagsRegExToApply += matcher.group(0);
        } else {
            requiredPostTagsRegExToApply += "(?:" + reqAnimInanimRegex + ")?";
        }
        String posTag = oldPosTag.replaceFirst(":v_[a-z]+", requiredPostTagsRegExToApply);
        try {
            String[] synthesized = ukrainianSynthesizer.synthesize(analyzedToken, posTag, true);
            suggestions.addAll(Arrays.asList(synthesized));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
    if (!suggestions.isEmpty()) {
        // remove duplicates, keeping first-seen order
        suggestions = new ArrayList<>(new LinkedHashSet<>(suggestions));
    }
    List<String> reqVidminkyNames = new ArrayList<>();
    for (String vidm : posTagsToFind) {
        reqVidminkyNames.add(PosTagHelper.VIDMINKY_MAP.get(vidm));
    }
    List<String> foundVidminkyNames = new ArrayList<>();
    for (AnalyzedToken token : tokenReadings) {
        String posTag2 = token.getPOSTag();
        if (posTag2 != null && posTag2.contains(VIDMINOK_SUBSTR)) {
            String vidmName = PosTagHelper.VIDMINKY_MAP.get(posTag2.replaceFirst("^.*" + VIDMINOK_REGEX + ".*$", "$1"));
            if (foundVidminkyNames.contains(vidmName)) {
                // a repeated case name is listed again only for a plural reading
                if (posTag2.contains(":p:")) {
                    vidmName = vidmName + " (мн.)";
                    foundVidminkyNames.add(vidmName);
                }
            } else {
                foundVidminkyNames.add(vidmName);
            }
        }
    }
    String msg = MessageFormat.format("Прийменник «{0}» вимагає іншого відмінка: {1}, а знайдено: {2}", reqTokenReadings.getToken(), String.join(", ", reqVidminkyNames), String.join(", ", foundVidminkyNames));
    if (tokenString.equals("їх")) {
        msg += ". Можливо тут потрібно присвійний займенник «їхній»?";
        try {
            String newYihPostag = "adj:p" + requiredPostTagsRegEx + ".*";
            String[] synthesized = ukrainianSynthesizer.synthesize(new AnalyzedToken("їхній", "adj:m:v_naz:&pron:pos", "їхній"), newYihPostag, true);
            suggestions.addAll(Arrays.asList(synthesized));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    } else if (reqTokenReadings.getToken().equalsIgnoreCase("о")) {
        for (AnalyzedToken token : tokenReadings.getReadings()) {
            String posTag2 = token.getPOSTag();
            // readings may be untagged (see the null checks above); skip them instead of failing
            if (posTag2 != null && posTag2.matches("noun:anim.*:v_naz.*")) {
                msg += ". Можливо тут «о» — це вигук і потрібно кличний відмінок?";
                try {
                    String newPostag = posTag2.replace("v_naz", "v_kly");
                    String[] synthesized = ukrainianSynthesizer.synthesize(token, newPostag, false);
                    for (String string : synthesized) {
                        if (!string.equals(token.getToken()) && !suggestions.contains(string)) {
                            suggestions.add(string);
                        }
                    }
                    // only the first matching reading is used
                    break;
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        }
    }
    RuleMatch potentialRuleMatch = new RuleMatch(this, tokenReadings.getStartPos(), tokenReadings.getEndPos(), msg, getShort());
    potentialRuleMatch.setSuggestedReplacements(suggestions);
    return potentialRuleMatch;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) AnalyzedToken(org.languagetool.AnalyzedToken) RuleMatch(org.languagetool.rules.RuleMatch) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Synthesizer(org.languagetool.synthesis.Synthesizer)

Aggregations

AnalyzedToken (org.languagetool.AnalyzedToken)89 AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)48 ArrayList (java.util.ArrayList)43 Matcher (java.util.regex.Matcher)16 Test (org.junit.Test)16 IOException (java.io.IOException)9 Pattern (java.util.regex.Pattern)7 Nullable (org.jetbrains.annotations.Nullable)6 TaggedWord (org.languagetool.tagging.TaggedWord)6 RuleMatch (org.languagetool.rules.RuleMatch)4 Synthesizer (org.languagetool.synthesis.Synthesizer)4 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 LinkedHashSet (java.util.LinkedHashSet)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 DictionaryLookup (morfologik.stemming.DictionaryLookup)2 IStemmer (morfologik.stemming.IStemmer)2 AnalyzedSentence (org.languagetool.AnalyzedSentence)2 ChunkTag (org.languagetool.chunking.ChunkTag)2