Search in sources:

Example 26 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class KhmerWordRepeatRule method match.

/**
 * Reports every word that directly repeats the preceding non-whitespace token
 * (compared case-insensitively) and attaches three candidate replacements.
 *
 * @param sentence the analyzed sentence to scan
 * @return the matches found, converted via {@code toRuleMatchArray}
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> found = new ArrayList<>();
    AnalyzedTokenReadings[] words = sentence.getTokensWithoutWhitespace();
    AnalyzedTokenReadings[] allTokens = sentence.getTokens();
    String previous = "";
    // Index 0 is the SENT_START marker, so scanning begins at index 1.
    for (int idx = 1; idx < words.length; idx++) {
        String current = words[idx].getToken();
        boolean repeated = isWord(current)
                && previous.equalsIgnoreCase(current)
                && !ignore(sentence, allTokens, idx);
        if (!repeated) {
            previous = current;
            continue;
        }
        // The match spans from the start of the first occurrence to the end of the second.
        int from = words[idx - 1].getStartPos();
        int to = words[idx].getStartPos() + previous.length();
        RuleMatch hit = new RuleMatch(this, from, to, messages.getString("repetition"), messages.getString("desc_repetition_short"));
        List<String> suggestions = new ArrayList<>();
        // Suggestion 1: keep both words, separated by a regular space.
        suggestions.add(previous + " " + current);
        // Suggestion 2: keep a single occurrence of the word.
        suggestions.add(previous);
        // Suggestion 3: a single occurrence followed by the "repetition character".
        suggestions.add(previous + "ៗ");
        hit.setSuggestedReplacements(suggestions);
        found.add(hit);
        previous = current;
    }
    return toRuleMatchArray(found);
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 27 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class MorfologikRussianSpellerRule method ignoreToken.

/**
 * Decides whether the token at {@code idx} is exempt from spell checking.
 *
 * @param tokens the tokens of the sentence being checked
 * @param idx index of the token under consideration
 * @return {@code true} if the token does not match {@code RUSSIAN_LETTERS};
 *         otherwise whatever {@code ignoreWord} returns for the token texts
 * @throws IOException declared by this method's signature; presumably propagated
 *         from {@code ignoreWord} (not visible here)
 */
@Override
protected boolean ignoreToken(AnalyzedTokenReadings[] tokens, int idx) throws IOException {
    String candidate = tokens[idx].getToken();
    if (RUSSIAN_LETTERS.matcher(candidate).matches()) {
        // Hand the plain token texts of the whole sentence to the word-level check.
        List<String> surfaceForms = new ArrayList<>();
        for (int i = 0; i < tokens.length; i++) {
            surfaceForms.add(tokens[i].getToken());
        }
        return ignoreWord(surfaceForms, idx);
    }
    // Tokens that do not match the letter pattern are never checked.
    return true;
}
Also used : ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 28 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class RussianPartialPosTagFilter method tag.

/**
 * Tags a single token and runs the result through the disambiguator.
 *
 * @param token the token text to analyze
 * @return the tokens of the disambiguated one-token sentence
 * @throws RuntimeException wrapping any {@link IOException} raised while
 *         tagging or disambiguating
 */
@Override
protected List<AnalyzedTokenReadings> tag(String token) {
    try {
        List<String> single = Collections.singletonList(token);
        List<AnalyzedTokenReadings> readings = tagger.tag(single);
        // Copy the readings into an exactly sized array for the sentence constructor.
        AnalyzedTokenReadings[] asArray = new AnalyzedTokenReadings[readings.size()];
        readings.toArray(asArray);
        AnalyzedSentence input = new AnalyzedSentence(asArray);
        AnalyzedSentence resolved = disambiguator.disambiguate(input);
        return Arrays.asList(resolved.getTokens());
    } catch (IOException e) {
        throw new RuntimeException("Could not tag and disambiguate '" + token + "'", e);
    }
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) IOException(java.io.IOException) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 29 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class RussianTagger method tag.

/**
 * Tags each token of a sentence after normalizing accented vowels.
 *
 * <p>For tokens longer than one character, vowels carrying an acute or grave
 * accent are replaced by the plain vowel and {@code "ʼ"} is replaced by
 * {@code "ъ"} before the token is analyzed.
 *
 * <p>Start positions are computed from the length of each token as it was
 * passed in, not from the normalized form. Normalization can shorten a token
 * (a base letter plus a combining accent becomes one character), and advancing
 * by the shortened length would shift the start position of every later token.
 *
 * @param sentenceTokens the token texts of one sentence, in order
 * @return one {@code AnalyzedTokenReadings} per input token, in the same order
 * @throws IOException declared by this method's signature; presumably propagated
 *         from {@code getAnalyzedTokens} (not visible here)
 */
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
    List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    for (String word : sentenceTokens) {
        // Length of the token as it occurs in the text, captured before normalization.
        final int originalLength = word.length();
        if (word.length() > 1) {
            // Acute-accented vowels -> plain vowels.
            word = word.replace("о́", "о");
            word = word.replace("а́", "а");
            word = word.replace("е́", "е");
            word = word.replace("у́", "у");
            word = word.replace("и́", "и");
            word = word.replace("ы́", "ы");
            word = word.replace("э́", "э");
            word = word.replace("ю́", "ю");
            word = word.replace("я́", "я");
            // Grave-accented vowels -> plain vowels.
            word = word.replace("о̀", "о");
            word = word.replace("а̀", "а");
            word = word.replace("ѐ", "е");
            word = word.replace("у̀", "у");
            word = word.replace("ѝ", "и");
            word = word.replace("ы̀", "ы");
            word = word.replace("э̀", "э");
            word = word.replace("ю̀", "ю");
            word = word.replace("я̀", "я");
            // Apostrophe-like modifier letter -> hard sign.
            word = word.replace("ʼ", "ъ");
        }
        List<AnalyzedToken> l = getAnalyzedTokens(word);
        tokenReadings.add(new AnalyzedTokenReadings(l, pos));
        // Advance by the original length so later offsets stay aligned with the text.
        pos += originalLength;
    }
    return tokenReadings;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 30 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class MultiWordChunkerTest method testDisambiguate.

/**
 * Checks that the multi-word chunker marks the ellipsis in "A test... More."
 * with an opening tag on token 4 and a closing tag on token 6.
 */
@Test
public void testDisambiguate() throws Exception {
    Disambiguator multiWordChunker = new MultiWordChunker("/pl/multiwords.txt");
    JLanguageTool languageTool = new JLanguageTool(new English());
    AnalyzedSentence input = languageTool.getAnalyzedSentence("A test... More.");
    AnalyzedTokenReadings[] result = multiWordChunker.disambiguate(input).getTokens();
    // The readings of the ellipsis tokens must carry the chunk markers.
    String openingReadings = result[4].getReadings().toString();
    assertTrue(openingReadings.contains("<ELLIPSIS>"));
    String closingReadings = result[6].getReadings().toString();
    assertTrue(closingReadings.contains("</ELLIPSIS>"));
}
Also used : English(org.languagetool.language.English) AnalyzedSentence(org.languagetool.AnalyzedSentence) JLanguageTool(org.languagetool.JLanguageTool) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Test(org.junit.Test)

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings): 116, AnalyzedToken (org.languagetool.AnalyzedToken): 48, ArrayList (java.util.ArrayList): 47, AnalyzedSentence (org.languagetool.AnalyzedSentence): 21, Test (org.junit.Test): 16, RuleMatch (org.languagetool.rules.RuleMatch): 14, Matcher (java.util.regex.Matcher): 13, IOException (java.io.IOException): 7, Nullable (org.jetbrains.annotations.Nullable): 6, JLanguageTool (org.languagetool.JLanguageTool): 6, Pattern (java.util.regex.Pattern): 5, ChunkTag (org.languagetool.chunking.ChunkTag): 5, English (org.languagetool.language.English): 3, TaggedWord (org.languagetool.tagging.TaggedWord): 3, InputStream (java.io.InputStream): 2, HashMap (java.util.HashMap): 2, List (java.util.List): 2, Scanner (java.util.Scanner): 2, TreeSet (java.util.TreeSet): 2, ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2