Search in sources :

Example 71 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class EnglishChunkerTest method createReadingsList.

/**
 * Builds a list of readings from a sentence for use as chunker test input.
 *
 * <p>The sentence is split on the space character, and the spaces themselves are
 * kept as tokens. Every token becomes one {@link AnalyzedTokenReadings} whose start
 * position is the sum of the lengths of all preceding tokens. Tokens that are empty
 * after {@code trim()} get neither a POS tag nor a lemma; every other token gets the
 * placeholder {@code "fake"} for both.
 *
 * @param sentence the text to split
 * @return one readings object per token (whitespace tokens included), in input order
 */
private List<AnalyzedTokenReadings> createReadingsList(String sentence) {
    List<AnalyzedTokenReadings> readings = new ArrayList<>();
    // 'true' makes the tokenizer hand back the delimiters as tokens as well
    StringTokenizer parts = new StringTokenizer(sentence, " ", true);
    int offset = 0;
    while (parts.hasMoreTokens()) {
        String part = parts.nextToken();
        // POS tag and lemma share one value: null for whitespace, "fake" otherwise
        String placeholder = part.trim().isEmpty() ? null : "fake";
        readings.add(new AnalyzedTokenReadings(new AnalyzedToken(part, placeholder, placeholder), offset));
        offset += part.length();
    }
    return readings;
}
Also used : StringTokenizer(java.util.StringTokenizer) AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 72 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class PolishTagger method tag.

/**
 * Produces one {@link AnalyzedTokenReadings} per input token.
 *
 * <p>For every token the word tagger is queried with the token as written and with
 * its lowercase form (using {@code plLocale}). The as-written results are passed to
 * {@code addTokens}, followed by the lowercase results when the token is not already
 * lowercase. When both lookups are empty and the token is lowercase, a third lookup
 * with the first character uppercased is tried. If nothing was found at all, a
 * single reading without POS tag and lemma is added.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return the readings, with start positions accumulated from the token lengths
 */
@Override
public final List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) {
    final List<AnalyzedTokenReadings> readingsPerToken = new ArrayList<>();
    int offset = 0;
    for (final String word : sentenceTokens) {
        final String lowered = word.toLowerCase(plLocale);
        final List<AnalyzedToken> exactMatches =
                asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word));
        final List<AnalyzedToken> loweredMatches =
                asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowered));
        final boolean alreadyLower = word.equals(lowered);

        final List<AnalyzedToken> readings = new ArrayList<>();
        // readings for the token as written
        addTokens(exactMatches, readings);
        // readings for the lowercase form, only when it differs from the token
        if (!alreadyLower) {
            addTokens(loweredMatches, readings);
        }

        if (exactMatches.isEmpty() && loweredMatches.isEmpty()) {
            boolean tagged = false;
            if (alreadyLower) {
                // last attempt: look the token up with its first character uppercased
                final List<AnalyzedToken> capitalizedMatches = asAnalyzedTokenListForTaggedWords(
                        word, getWordTagger().tag(StringTools.uppercaseFirstChar(word)));
                if (!capitalizedMatches.isEmpty()) {
                    addTokens(capitalizedMatches, readings);
                    tagged = true;
                }
            }
            if (!tagged) {
                // unknown word: keep the token but leave POS tag and lemma unset
                readings.add(new AnalyzedToken(word, null, null));
            }
        }

        readingsPerToken.add(new AnalyzedTokenReadings(readings, offset));
        offset += word.length();
    }
    return readingsPerToken;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 73 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class EsperantoTagger method tag.

/**
 * Tags each token of an Esperanto sentence, producing one
 * {@link AnalyzedTokenReadings} per input token.
 *
 * <p>Tokens longer than one character are lowercased and handled as follows:
 * <ol>
 *   <li>If {@code manualTagger} knows the lowercase form, its tags and lemmas are used.</li>
 *   <li>Otherwise the first matching rule of this chain applies: correlative
 *       ({@code patternTabelvorto}), noun suffixes, adjective suffixes, adverb suffixes,
 *       verb ({@code patternVerb}), or an untagged reading as the fallback.</li>
 *   <li>In addition to the result of step 2, a participle reading is added when
 *       {@code patternParticiple} matches and its first group is not in
 *       {@code setNonParticiple}.</li>
 * </ol>
 * Tokens of length 0 or 1 get a single untagged reading.
 *
 * <p>Start positions are accumulated from the token lengths, as the other taggers do.
 * Lowercasing uses {@link java.util.Locale#ROOT} so that the result does not depend on
 * the JVM default locale (for example the Turkish dotless i).
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return the readings for every token, in input order
 * @throws IOException declared by the signature; presumably raised by {@code lazyInit()}
 *         or the manual tagger - confirm against their declarations
 */
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
    lazyInit();
    Matcher matcher;
    List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    for (String word : sentenceTokens) {
        List<AnalyzedToken> l = new ArrayList<>();
        // Avoid spurious tagging of single letter words such as "A", "O", "E", etc.
        if (word.length() > 1) {
            // Locale.ROOT keeps the lowercase form independent of the default locale.
            String lWord = word.toLowerCase(java.util.Locale.ROOT);
            List<TaggedWord> manualTags = manualTagger.tag(lWord);
            if (!manualTags.isEmpty()) {
                // This is a closed word for which we know its lemmas and tags.
                for (TaggedWord manualTag : manualTags) {
                    l.add(new AnalyzedToken(word, manualTag.getPosTag(), manualTag.getLemma()));
                }
            } else {
                // Tiu, kiu (tabelvortoj).
                if ((matcher = patternTabelvorto.matcher(lWord)).find()) {
                    String type1Group = matcher.group(1).substring(0, 1).toLowerCase(java.util.Locale.ROOT);
                    String type2Group = matcher.group(2);
                    String plGroup = matcher.group(3);
                    String accGroup = matcher.group(4);
                    String type3Group = matcher.group(5);
                    String type;
                    String plural;
                    String accusative;
                    // A group that did not participate in the match is null and gets its own marker.
                    if (accGroup == null) {
                        accusative = "xxx";
                    } else {
                        accusative = accGroup.equalsIgnoreCase("n") ? "akz" : "nak";
                    }
                    if (plGroup == null) {
                        plural = " pn ";
                    } else {
                        plural = plGroup.equalsIgnoreCase("j") ? " pl " : " np ";
                    }
                    type = ((type2Group == null) ? type3Group : type2Group).toLowerCase(java.util.Locale.ROOT);
                    l.add(new AnalyzedToken(word, "T " + accusative + plural + type1Group + " " + type, null));
                    // Some correlatives additionally get an adverb reading.
                    if ((matcher = patternTabelvortoAdverb.matcher(lWord)).find()) {
                        l.add(new AnalyzedToken(word, "E nak", lWord));
                    }
                // Words ending in .*oj?n? are nouns.
                } else if (lWord.endsWith("o")) {
                    l.add(new AnalyzedToken(word, "O nak np", lWord));
                } else if (lWord.length() >= 2 && lWord.endsWith("'")) {
                    // Elided noun: the lemma replaces the trailing apostrophe with "o".
                    l.add(new AnalyzedToken(word, "O nak np", lWord.substring(0, lWord.length() - 1) + "o"));
                } else if (lWord.endsWith("oj")) {
                    l.add(new AnalyzedToken(word, "O nak pl", lWord.substring(0, lWord.length() - 1)));
                } else if (lWord.endsWith("on")) {
                    l.add(new AnalyzedToken(word, "O akz np", lWord.substring(0, lWord.length() - 1)));
                } else if (lWord.endsWith("ojn")) {
                    l.add(new AnalyzedToken(word, "O akz pl", lWord.substring(0, lWord.length() - 2)));
                // Words ending in .*aj?n? are adjectives.
                } else if (lWord.endsWith("a")) {
                    l.add(new AnalyzedToken(word, "A nak np", lWord));
                } else if (lWord.endsWith("aj")) {
                    l.add(new AnalyzedToken(word, "A nak pl", lWord.substring(0, lWord.length() - 1)));
                } else if (lWord.endsWith("an")) {
                    l.add(new AnalyzedToken(word, "A akz np", lWord.substring(0, lWord.length() - 1)));
                } else if (lWord.endsWith("ajn")) {
                    l.add(new AnalyzedToken(word, "A akz pl", lWord.substring(0, lWord.length() - 2)));
                // Words ending in .*en? are adverbs.
                } else if (lWord.endsWith("e")) {
                    l.add(new AnalyzedToken(word, "E nak", lWord));
                } else if (lWord.endsWith("en")) {
                    l.add(new AnalyzedToken(word, "E akz", lWord.substring(0, lWord.length() - 1)));
                // Verbs.
                } else if ((matcher = patternVerb.matcher(lWord)).find()) {
                    String verb = matcher.group(1) + "i";
                    String tense = matcher.group(2);
                    String transitive = findTransitivity(verb);
                    l.add(new AnalyzedToken(word, "V " + transitive + " " + tense, verb));
                // Irregular word (no tag).
                } else {
                    l.add(new AnalyzedToken(word, null, null));
                }
                // Participle (can be combined with other tags).
                // NOTE(review): groups 4, 6 and 7 are dereferenced without a null check, so
                // patternParticiple must make them always participate - confirm against the pattern.
                if ((matcher = patternParticiple.matcher(lWord)).find()) {
                    if (!setNonParticiple.contains(matcher.group(1))) {
                        String verb = matcher.group(2) + "i";
                        String aio = matcher.group(3);
                        String antAt = matcher.group(4).equals("n") ? "n" : "-";
                        String aoe = matcher.group(5);
                        String plural = matcher.group(6).equals("j") ? "pl" : "np";
                        String accusative = matcher.group(7).equals("n") ? "akz" : "nak";
                        String transitive = findTransitivity(verb);
                        l.add(new AnalyzedToken(word, "C " + accusative + " " + plural + " " + transitive + " " + aio + " " + antAt + " " + aoe, verb));
                    }
                }
            }
        } else {
            // Single letter word (no tag).
            l.add(new AnalyzedToken(word, null, null));
        }
        // Record the real character offset of the token instead of a constant 0.
        tokenReadings.add(new AnalyzedTokenReadings(l, pos));
        pos += word.length();
    }
    return tokenReadings;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) TaggedWord(org.languagetool.tagging.TaggedWord) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 74 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class JapaneseTagger method tag.

/**
 * Wraps every token of the sentence in an {@link AnalyzedTokenReadings}.
 *
 * <p>Each token is converted with {@code asAnalyzedToken}. The start position of a
 * reading is the sum of the lengths of the analyzed tokens that precede it, measured
 * on {@code AnalyzedToken.getToken()} rather than on the input string.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return one readings object per input token, in input order
 * @throws IOException declared by the signature; nothing visible in this method
 *         throws it directly
 */
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
    final int expectedSize = sentenceTokens.size();
    List<AnalyzedTokenReadings> readings = new ArrayList<>(expectedSize);
    int offset = 0;
    for (String surface : sentenceTokens) {
        AnalyzedToken analyzed = asAnalyzedToken(surface);
        AnalyzedTokenReadings wrapped = new AnalyzedTokenReadings(analyzed, offset);
        readings.add(wrapped);
        // advance by the length of the analyzed token text
        offset = offset + analyzed.getToken().length();
    }
    return readings;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 75 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class BaseTagger method getAnalyzedTokens.

/**
 * Collects the readings for a single word.
 *
 * <p>Lookups are tried in this order:
 * <ol>
 *   <li>the word as written;</li>
 *   <li>its lowercase form (using {@code conversionLocale}), used only when the word
 *       is neither already lowercase nor mixed case;</li>
 *   <li>the word with its first character uppercased, only when
 *       {@code tagLowercaseWithUppercase} is set, the word is lowercase and both
 *       previous lookups returned nothing;</li>
 *   <li>{@code additionalTags}, only when no reading has been collected so far.</li>
 * </ol>
 * If the result is still empty, a single reading without POS tag and lemma is
 * returned, so the list is never empty.
 *
 * @param word the token to analyze
 * @return a non-empty list of readings for {@code word}
 */
protected List<AnalyzedToken> getAnalyzedTokens(String word) {
    final String lowered = word.toLowerCase(conversionLocale);
    final boolean alreadyLower = word.equals(lowered);
    final boolean mixedCase = StringTools.isMixedCase(word);
    final List<AnalyzedToken> exactMatches =
            asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word));
    final List<AnalyzedToken> loweredMatches =
            asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowered));

    final List<AnalyzedToken> readings = new ArrayList<>();
    // readings for the word exactly as written
    addTokens(exactMatches, readings);
    // all-uppercase or start-uppercase words (but not mixed-case ones) also get the lowercase readings
    if (!(alreadyLower || mixedCase)) {
        addTokens(loweredMatches, readings);
    }

    // lowercase word that is unknown so far: optionally try the start-uppercase form
    if (tagLowercaseWithUppercase && loweredMatches.isEmpty() && exactMatches.isEmpty() && alreadyLower) {
        final List<AnalyzedToken> capitalizedMatches = asAnalyzedTokenListForTaggedWords(
                word, getWordTagger().tag(StringTools.uppercaseFirstChar(word)));
        if (!capitalizedMatches.isEmpty()) {
            addTokens(capitalizedMatches, readings);
        }
    }

    if (readings.isEmpty()) {
        // language-dependent tagging as a further fallback
        addTokens(additionalTags(word, getWordTagger()), readings);
        if (readings.isEmpty()) {
            // still unknown: return the bare token without POS tag and lemma
            readings.add(new AnalyzedToken(word, null, null));
        }
    }
    return readings;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList)

Aggregations

AnalyzedToken (org.languagetool.AnalyzedToken)89 AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)48 ArrayList (java.util.ArrayList)43 Matcher (java.util.regex.Matcher)16 Test (org.junit.Test)16 IOException (java.io.IOException)9 Pattern (java.util.regex.Pattern)7 Nullable (org.jetbrains.annotations.Nullable)6 TaggedWord (org.languagetool.tagging.TaggedWord)6 RuleMatch (org.languagetool.rules.RuleMatch)4 Synthesizer (org.languagetool.synthesis.Synthesizer)4 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 LinkedHashSet (java.util.LinkedHashSet)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 DictionaryLookup (morfologik.stemming.DictionaryLookup)2 IStemmer (morfologik.stemming.IStemmer)2 AnalyzedSentence (org.languagetool.AnalyzedSentence)2 ChunkTag (org.languagetool.chunking.ChunkTag)2