Search in sources :

Example 91 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class QuestionWhitespaceRule method match.

/**
 * Walks the sentence's tokens (starting at index 1, always looking back at the
 * previous token) and reports spacing problems around the punctuation marks
 * {@code ? ! ; : »} and after {@code «}.
 *
 * <p>At most one {@link RuleMatch} is added per token position. Each match starts at
 * the previous token's start position and spans that token's length plus
 * {@code fixLen}.
 *
 * @param sentence the analyzed sentence whose tokens are inspected
 * @return the collected matches, converted via {@code toRuleMatchArray}
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokens();
    String prevToken = "";
    for (int i = 1; i < tokens.length; i++) {
        String token = tokens[i].getToken();
        // True when the token reports whitespace before it and the previous token is not
        // one of the two accepted space literals.
        // NOTE(review): the two literals look identical here; presumably they are two
        // different non-breaking space characters (see the U+00a0 remark in the else
        // branch below) - confirm against the raw bytes of the file.
        boolean isWhiteBefore = tokens[i].isWhitespaceBefore() && !" ".equals(prevToken) && !" ".equals(prevToken);
        String msg = null;
        int fixLen = 0;
        String suggestionText = null;
        if (isWhiteBefore) {
            // Whitespace precedes the punctuation mark, but not one of the accepted spaces:
            // suggest the accepted space followed by the mark.
            switch(token) {
                case "?":
                    msg = "Point d'interrogation est précédé d'une espace fine insécable.";
                    // non-breaking space
                    suggestionText = " ?";
                    fixLen = 1;
                    break;
                case "!":
                    msg = "Point d'exclamation est précédé d'une espace fine insécable.";
                    // non-breaking space
                    suggestionText = " !";
                    fixLen = 1;
                    break;
                case "»":
                    msg = "Le guillemet fermant est précédé d'une espace fine insécable.";
                    // non-breaking space
                    suggestionText = " »";
                    fixLen = 1;
                    break;
                case ";":
                    msg = "Point-virgule est précédé d'une espace fine insécable.";
                    // non-breaking space
                    suggestionText = " ;";
                    fixLen = 1;
                    break;
                case ":":
                    msg = "Deux-points sont précédé d'une espace fine insécable.";
                    // non-breaking space
                    suggestionText = " :";
                    fixLen = 1;
                    break;
            }
        } else {
            // "espace insécable" (U+00a0) is also often used. Let's accept both.
            // In this branch the suggestion keeps the previous token and inserts the space
            // between it and the punctuation mark.
            // "?" directly after "!" (and "!" directly after "?") is deliberately not flagged.
            if (token.equals("?") && !prevToken.equals("!") && !prevToken.equals(" ") && !prevToken.equals(" ")) {
                msg = "Point d'interrogation est précédé d'une espace fine insécable.";
                // non-breaking space
                suggestionText = prevToken + " ?";
                fixLen = 1;
            } else if (token.equals("!") && !prevToken.equals("?") && !prevToken.equals(" ") && !prevToken.equals(" ")) {
                msg = "Point d'exclamation est précédé d'une espace fine insécable.";
                // non-breaking space
                suggestionText = prevToken + " !";
                fixLen = 1;
            } else if (token.equals(";") && !prevToken.equals(" ") && !prevToken.equals(" ")) {
                msg = "Point-virgule est précédé d'une espace fine insécable.";
                // non-breaking space
                suggestionText = prevToken + " ;";
                fixLen = 1;
            } else if (token.equals(":") && !prevToken.equals(" ") && !prevToken.equals(" ")) {
                // Avoid false positive for URL like http://www.languagetool.org.
                Matcher matcherUrl = urlPattern.matcher(prevToken);
                if (!matcherUrl.find()) {
                    msg = "Deux-points précédés d'une espace fine insécable.";
                    // non-breaking space
                    suggestionText = prevToken + " :";
                    fixLen = 1;
                }
            } else if (token.equals("»") && !prevToken.equals(" ") && !prevToken.equals(" ")) {
                msg = "Le guillemet fermant est précédé d'une espace fine insécable.";
                // non-breaking space
                suggestionText = prevToken + " »";
                fixLen = 1;
            }
        }
        // Opening guillemet check. It runs after the checks above and overwrites any
        // msg/suggestionText/fixLen they set for this token.
        // NOTE(review): StringTools.isEmpty(token) presumably also treats whitespace-only
        // tokens as empty - confirm against StringTools.
        if (StringTools.isEmpty(token) && prevToken.equals("«")) {
            msg = "Le guillemet ouvrant est suivi d'une espace fine insécable.";
            // non-breaking space
            suggestionText = "« ";
            fixLen = 1;
        } else if (!StringTools.isEmpty(token) && prevToken.equals("«") && !token.equals(" ") && !token.equals(" ")) {
            msg = "Le guillemet ouvrant est suivi d'une espace fine insécable.";
            // non-breaking space
            suggestionText = "« ";
            // Nothing beyond the guillemet itself is covered in this case.
            fixLen = 0;
        }
        if (msg != null) {
            // The match always starts at the previous token and covers its length plus fixLen.
            int fromPos = tokens[i - 1].getStartPos();
            int toPos = tokens[i - 1].getStartPos() + fixLen + tokens[i - 1].getToken().length();
            RuleMatch ruleMatch = new RuleMatch(this, fromPos, toPos, msg, "Insérer un espace insécable");
            if (suggestionText != null) {
                ruleMatch.setSuggestedReplacement(suggestionText);
            }
            ruleMatches.add(ruleMatch);
        }
        prevToken = token;
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 92 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class EnglishChunkFilterTest method testPluralByPluralNoun.

/**
 * Replaces the token at index 3 ("books") with one that carries both an NNS and a
 * VBZ reading, then asserts the expected chunk tags, in which "ten" and "books"
 * carry the {@code -plural} suffix.
 */
@Test
public void testPluralByPluralNoun() throws IOException {
    final int booksIndex = 3;
    List<ChunkTaggedToken> taggedTokens = makeTokens("I/X have/N-VP ten/B-NP books/I-NP ./.");
    // Drop the generated 'books' token so it can be replaced by one with explicit readings.
    taggedTokens.remove(booksIndex);
    AnalyzedToken nounReading = new AnalyzedToken("books", "NNS", "book");
    AnalyzedToken verbReading = new AnalyzedToken("books", "VBZ", "book");
    AnalyzedTokenReadings booksReadings = new AnalyzedTokenReadings(Arrays.asList(nounReading, verbReading), 0);
    List<ChunkTag> booksChunkTags = Collections.singletonList(new ChunkTag("I-NP"));
    taggedTokens.add(booksIndex, new ChunkTaggedToken("books", booksChunkTags, booksReadings));
    assertChunks(taggedTokens, "I/X have/N-VP ten/B-NP-plural books/E-NP-plural ./.");
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Test(org.junit.Test)

Example 93 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class EnglishChunkerTest method createReadingsList.

/**
 * Splits {@code sentence} on single spaces, keeping the spaces themselves as tokens,
 * and wraps every token in an {@link AnalyzedTokenReadings}.
 *
 * <p>Whitespace-only tokens get a {@code null} POS tag and lemma; every other token
 * gets the placeholder value {@code "fake"} for both. Each reading's start position
 * is the summed length of all preceding tokens.
 *
 * @param sentence the text to split
 * @return one readings object per token, delimiters included, in input order
 */
private List<AnalyzedTokenReadings> createReadingsList(String sentence) {
    List<AnalyzedTokenReadings> readingsList = new ArrayList<>();
    // 'true' makes the tokenizer return the space delimiters as tokens as well.
    StringTokenizer parts = new StringTokenizer(sentence, " ", true);
    int offset = 0;
    while (parts.hasMoreTokens()) {
        String part = parts.nextToken();
        String tagAndLemma = part.trim().isEmpty() ? null : "fake";
        readingsList.add(new AnalyzedTokenReadings(new AnalyzedToken(part, tagAndLemma, tagAndLemma), offset));
        offset += part.length();
    }
    return readingsList;
}
Also used : StringTokenizer(java.util.StringTokenizer) AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 94 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class PolishTagger method tag.

/**
 * Tags every word of the sentence and returns one {@link AnalyzedTokenReadings} per word.
 *
 * <p>For each word the word tagger is queried with the word as written and with its
 * lowercase form (using {@code plLocale}). Readings for the word as written are always
 * added; readings for the lowercase form are added only when the word is not already
 * lowercase. If both lookups are empty and the word is lowercase, a lookup with the
 * first character uppercased is tried. If nothing is found, a single reading with
 * {@code null} POS tag and lemma is added.
 *
 * @param sentenceTokens the words to tag, in sentence order
 * @return the readings, each positioned at the summed length of the preceding words
 */
@Override
public final List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) {
    final List<AnalyzedTokenReadings> result = new ArrayList<>();
    int offset = 0;
    for (String word : sentenceTokens) {
        final List<AnalyzedToken> readings = new ArrayList<>();
        final String lowerWord = word.toLowerCase(plLocale);
        final List<AnalyzedToken> exactMatches = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word));
        final List<AnalyzedToken> lowerMatches = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowerWord));
        final boolean alreadyLowercase = word.equals(lowerWord);

        // Readings for the word exactly as written.
        addTokens(exactMatches, readings);
        // Readings for the lowercase form, unless that is the same lookup.
        if (!alreadyLowercase) {
            addTokens(lowerMatches, readings);
        }

        if (lowerMatches.isEmpty() && exactMatches.isEmpty()) {
            boolean tagged = false;
            if (alreadyLowercase) {
                // Last resort for lowercase words: try the form with an uppercase first letter.
                final List<AnalyzedToken> upperMatches =
                        asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(StringTools.uppercaseFirstChar(word)));
                if (!upperMatches.isEmpty()) {
                    addTokens(upperMatches, readings);
                    tagged = true;
                }
            }
            if (!tagged) {
                // Unknown word: keep it with no POS tag and no lemma.
                readings.add(new AnalyzedToken(word, null, null));
            }
        }

        result.add(new AnalyzedTokenReadings(readings, offset));
        offset += word.length();
    }
    return result;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 95 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class EsperantoTagger method tag.

/**
 * Tags every word of the sentence and returns one {@link AnalyzedTokenReadings} per word.
 *
 * <p>Words of length 0 or 1 are left untagged. Longer words are lowercased and looked
 * up in {@code manualTagger}; if that yields tags, only those are used. Otherwise the
 * word is classified by the first matching rule, in this order: tabelvorto pattern,
 * noun endings, adjective endings, adverb endings, verb pattern, and finally an
 * untagged reading. A participle reading may be added on top of that classification.
 *
 * <p>Every readings object is created with start position 0.
 *
 * @param sentenceTokens the words to tag, in sentence order
 * @return one readings object per input word, in input order
 * @throws IOException declared by the signature; presumably raised by {@code lazyInit()}
 *         or {@code findTransitivity} - confirm against their declarations
 */
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
    lazyInit();
    List<AnalyzedTokenReadings> result = new ArrayList<>();
    for (String word : sentenceTokens) {
        List<AnalyzedToken> readings = new ArrayList<>();
        if (word.length() <= 1) {
            // Single letter word (no tag): avoids spurious tagging of "A", "O", "E", etc.
            readings.add(new AnalyzedToken(word, null, null));
        } else {
            String lowerWord = word.toLowerCase();
            List<TaggedWord> closedWordTags = manualTagger.tag(lowerWord);
            if (!closedWordTags.isEmpty()) {
                // This is a closed word for which we know its lemmas and tags.
                for (TaggedWord closedWordTag : closedWordTags) {
                    readings.add(new AnalyzedToken(word, closedWordTag.getPosTag(), closedWordTag.getLemma()));
                }
            } else {
                int len = lowerWord.length();
                Matcher tabelvorto = patternTabelvorto.matcher(lowerWord);
                Matcher verbMatcher;
                if (tabelvorto.find()) {
                    // Tiu, kiu (tabelvortoj).
                    String type1Group = tabelvorto.group(1).substring(0, 1).toLowerCase();
                    String type2Group = tabelvorto.group(2);
                    String plGroup = tabelvorto.group(3);
                    String accGroup = tabelvorto.group(4);
                    String type3Group = tabelvorto.group(5);
                    String accusative = accGroup == null
                            ? "xxx"
                            : (accGroup.equalsIgnoreCase("n") ? "akz" : "nak");
                    String plural = plGroup == null
                            ? " pn "
                            : (plGroup.equalsIgnoreCase("j") ? " pl " : " np ");
                    String type = (type2Group != null ? type2Group : type3Group).toLowerCase();
                    readings.add(new AnalyzedToken(word, "T " + accusative + plural + type1Group + " " + type, null));
                    if (patternTabelvortoAdverb.matcher(lowerWord).find()) {
                        readings.add(new AnalyzedToken(word, "E nak", lowerWord));
                    }
                } else if (lowerWord.endsWith("o")) {
                    // Words ending in .*oj?n? are nouns.
                    readings.add(new AnalyzedToken(word, "O nak np", lowerWord));
                } else if (len >= 2 && lowerWord.endsWith("'")) {
                    // Final apostrophe: the lemma is the word with the apostrophe replaced by "o".
                    readings.add(new AnalyzedToken(word, "O nak np", lowerWord.substring(0, len - 1) + "o"));
                } else if (lowerWord.endsWith("oj")) {
                    readings.add(new AnalyzedToken(word, "O nak pl", lowerWord.substring(0, len - 1)));
                } else if (lowerWord.endsWith("on")) {
                    readings.add(new AnalyzedToken(word, "O akz np", lowerWord.substring(0, len - 1)));
                } else if (lowerWord.endsWith("ojn")) {
                    readings.add(new AnalyzedToken(word, "O akz pl", lowerWord.substring(0, len - 2)));
                } else if (lowerWord.endsWith("a")) {
                    // Words ending in .*aj?n? are adjectives.
                    readings.add(new AnalyzedToken(word, "A nak np", lowerWord));
                } else if (lowerWord.endsWith("aj")) {
                    readings.add(new AnalyzedToken(word, "A nak pl", lowerWord.substring(0, len - 1)));
                } else if (lowerWord.endsWith("an")) {
                    readings.add(new AnalyzedToken(word, "A akz np", lowerWord.substring(0, len - 1)));
                } else if (lowerWord.endsWith("ajn")) {
                    readings.add(new AnalyzedToken(word, "A akz pl", lowerWord.substring(0, len - 2)));
                } else if (lowerWord.endsWith("e")) {
                    // Words ending in .*en? are adverbs.
                    readings.add(new AnalyzedToken(word, "E nak", lowerWord));
                } else if (lowerWord.endsWith("en")) {
                    readings.add(new AnalyzedToken(word, "E akz", lowerWord.substring(0, len - 1)));
                } else if ((verbMatcher = patternVerb.matcher(lowerWord)).find()) {
                    // Verbs.
                    String verb = verbMatcher.group(1) + "i";
                    String tense = verbMatcher.group(2);
                    String transitive = findTransitivity(verb);
                    readings.add(new AnalyzedToken(word, "V " + transitive + " " + tense, verb));
                } else {
                    // Irregular word (no tag).
                    readings.add(new AnalyzedToken(word, null, null));
                }

                // Participle (can be combined with other tags).
                Matcher participle = patternParticiple.matcher(lowerWord);
                if (participle.find() && !setNonParticiple.contains(participle.group(1))) {
                    String verb = participle.group(2) + "i";
                    String aio = participle.group(3);
                    String antAt = participle.group(4).equals("n") ? "n" : "-";
                    String aoe = participle.group(5);
                    String plural = participle.group(6).equals("j") ? "pl" : "np";
                    String accusative = participle.group(7).equals("n") ? "akz" : "nak";
                    String transitive = findTransitivity(verb);
                    readings.add(new AnalyzedToken(word, "C " + accusative + " " + plural + " " + transitive + " " + aio + " " + antAt + " " + aoe, verb));
                }
            }
        }
        result.add(new AnalyzedTokenReadings(readings, 0));
    }
    return result;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) TaggedWord(org.languagetool.tagging.TaggedWord) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings): 116, AnalyzedToken (org.languagetool.AnalyzedToken): 48, ArrayList (java.util.ArrayList): 47, AnalyzedSentence (org.languagetool.AnalyzedSentence): 21, Test (org.junit.Test): 16, RuleMatch (org.languagetool.rules.RuleMatch): 14, Matcher (java.util.regex.Matcher): 13, IOException (java.io.IOException): 7, Nullable (org.jetbrains.annotations.Nullable): 6, JLanguageTool (org.languagetool.JLanguageTool): 6, Pattern (java.util.regex.Pattern): 5, ChunkTag (org.languagetool.chunking.ChunkTag): 5, English (org.languagetool.language.English): 3, TaggedWord (org.languagetool.tagging.TaggedWord): 3, InputStream (java.io.InputStream): 2, HashMap (java.util.HashMap): 2, List (java.util.List): 2, Scanner (java.util.Scanner): 2, TreeSet (java.util.TreeSet): 2, ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2