Search in sources:

Example 36 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

Class GermanTagger, method tag.

/**
 * Tags every token of a sentence and returns one {@link AnalyzedTokenReadings} per input token,
 * in input order.
 *
 * <p>Each token is first looked up as written. When {@code ignoreCase} is set, a lower-cased
 * lookup is used as well: as a replacement when the token is unknown and the sentence-start flag
 * is still set, or as an addition for the token at position 0. Tokens that remain unknown are
 * split with the compound tokenizer and the last part is looked up instead. If that also fails,
 * a "no info" token is produced.
 *
 * @param sentenceTokens the tokens of the sentence, in order
 * @param ignoreCase whether lower-cased lookups may be used as described above
 * @return one readings object per token; its start position is the sum of the lengths of all
 *         preceding tokens
 * @throws IOException declared by this method; which of the helpers raises it is not visible here
 */
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens, boolean ignoreCase) throws IOException {
    initializeIfRequired();
    final List<AnalyzedTokenReadings> result = new ArrayList<>();
    // Stays true until an unknown token has been retried in lower case; after that it remains
    // true only if that token was empty or a single non-word character.
    boolean firstWord = true;
    int startPos = 0;
    for (String word : sentenceTokens) {
        List<TaggedWord> tagged = getWordTagger().tag(word);
        if (ignoreCase) {
            if (firstWord && tagged.isEmpty()) {
                // e.g. "Das" -> "das" at start of sentence
                tagged = getWordTagger().tag(word.toLowerCase());
                firstWord = word.matches("^\\W?$");
            } else if (startPos == 0) {
                // "Haben", "Sollen", "Können", "Gerade" etc. at start of sentence:
                // keep the readings found as written and add the lower-case ones
                tagged.addAll(getWordTagger().tag(word.toLowerCase()));
            }
        }

        final List<AnalyzedToken> readings = new ArrayList<>();
        if (!tagged.isEmpty()) {
            readings.addAll(getAnalyzedTokens(tagged, word));
        } else if (StringTools.isEmpty(word.trim())) {
            // whitespace-only token: nothing to decompose
            readings.add(getNoInfoToken(word));
        } else {
            // word not known, try to decompose it and use the last part for POS tagging
            final List<String> parts = compoundTokenizer.tokenize(word);
            if (parts.size() <= 1) {
                readings.add(getNoInfoToken(word));
            } else {
                // last part governs a word's POS; mirror the capitalization of the whole word
                final String rawTail = parts.get(parts.size() - 1);
                final String tail = StringTools.startsWithUppercase(word)
                        ? StringTools.uppercaseFirstChar(rawTail)
                        : rawTail;
                final List<TaggedWord> tailTagged = getWordTagger().tag(tail);
                if (tailTagged.isEmpty()) {
                    readings.add(getNoInfoToken(word));
                } else {
                    readings.addAll(getAnalyzedTokens(tailTagged, word, parts));
                }
            }
        }

        result.add(new AnalyzedTokenReadings(readings.toArray(new AnalyzedToken[0]), startPos));
        startPos += word.length();
    }
    return result;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) TaggedWord(org.languagetool.tagging.TaggedWord) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 37 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

Class AccentuationDataLoader, method loadWords.

/**
 * Reads a semicolon-separated word list from the rules directory into a map.
 *
 * <p>Each line is trimmed. Blank lines and lines starting with {@code #} are skipped. Every other
 * line must consist of exactly three semicolon-separated parts: the first becomes the map key,
 * and the second and third are passed as the first two arguments of the {@link AnalyzedToken}
 * constructor (the third argument is {@code null}). The token is wrapped in an
 * {@link AnalyzedTokenReadings} with start position 0. A later line with the same key replaces
 * an earlier one.
 *
 * @param path location of the data file, resolved through the data broker's rules directory
 * @return the map built from the file
 * @throws RuntimeException if a non-comment line does not have exactly three parts
 */
Map<String, AnalyzedTokenReadings> loadWords(String path) {
    final Map<String, AnalyzedTokenReadings> wordsByKey = new HashMap<>();
    final InputStream stream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
    try (Scanner lines = new Scanner(stream, FILE_ENCODING)) {
        while (lines.hasNextLine()) {
            final String line = lines.nextLine().trim();
            // skip blank lines and comments
            final boolean isData = !line.isEmpty() && !line.startsWith("#");
            if (isData) {
                final String[] fields = line.split(";");
                if (fields.length != 3) {
                    throw new RuntimeException("Format error in file " + path + ", line: " + line + ", " + "expected 3 semicolon-separated parts, got " + fields.length);
                }
                final AnalyzedToken token = new AnalyzedToken(fields[1], fields[2], null);
                wordsByKey.put(fields[0], new AnalyzedTokenReadings(token, 0));
            }
        }
    }
    return wordsByKey;
}
Also used : Scanner(java.util.Scanner) AnalyzedToken(org.languagetool.AnalyzedToken) HashMap(java.util.HashMap) InputStream(java.io.InputStream) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 38 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

Class MorfologikCatalanSpellerRule, method matchPostagRegexp.

/**
 * Checks whether any reading of a token has a POS tag that fully matches a regular expression.
 * A reading without a POS tag is matched as if its tag were the literal string "UNKNOWN".
 *
 * @param aToken the token whose readings are inspected
 * @param pattern the expression that must match a whole POS tag
 * @return {@code true} as soon as one reading matches, {@code false} if none does
 */
private boolean matchPostagRegexp(AnalyzedTokenReadings aToken, Pattern pattern) {
    for (AnalyzedToken reading : aToken) {
        final String tag = reading.getPOSTag();
        if (pattern.matcher(tag == null ? "UNKNOWN" : tag).matches()) {
            return true;
        }
    }
    return false;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) Matcher(java.util.regex.Matcher)

Example 39 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

Class ReplaceOperationNamesRule, method matchPostagRegexp.

/**
 * Tells whether at least one reading of the given token carries a POS tag that the pattern
 * matches in full. Readings whose POS tag is {@code null} are tested against the placeholder
 * tag "UNKNOWN".
 *
 * @param aToken the readings to scan
 * @param pattern the regular expression applied to each POS tag
 * @return {@code true} if a matching reading exists, otherwise {@code false}
 */
private boolean matchPostagRegexp(AnalyzedTokenReadings aToken, Pattern pattern) {
    for (AnalyzedToken candidate : aToken) {
        String effectiveTag = candidate.getPOSTag();
        if (effectiveTag == null) {
            effectiveTag = "UNKNOWN";
        }
        final Matcher tagMatcher = pattern.matcher(effectiveTag);
        if (tagMatcher.matches()) {
            // first hit decides; no need to look at the remaining readings
            return true;
        }
    }
    return false;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) Matcher(java.util.regex.Matcher)

Example 40 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

Class CatalanTagger, method additionalTags.

/**
 * Derives extra readings for a word from related dictionary entries.
 *
 * <p>Three cases are tried in order:
 * <ol>
 *   <li>The word ends in "ment": the lower-cased word without that ending is looked up, and if
 *       any reading's POS tag matches {@code ADJ_PART_FS}, a single reading with tag "RG" and
 *       the lower-cased word as lemma is returned.</li>
 *   <li>The word matches {@code PREFIXES_FOR_VERBS}: group 2 is looked up, and for each reading
 *       whose POS tag matches {@code VERB}, a reading with the same tag is added whose lemma is
 *       group 1 followed by the looked-up lemma. The list is returned even if it is empty.</li>
 *   <li>The word contains U+0140 or U+013F (L with middle dot): the lower-cased word with
 *       "ŀ" replaced by "l·" is looked up, and its readings are returned for the original
 *       word.</li>
 * </ol>
 *
 * <p>All lower-casing uses {@code conversionLocale}, so the result does not depend on the JVM
 * default locale.
 *
 * @param word the word to analyze
 * @param stemmer not used by this implementation; lookups go through a
 *        {@link DictionaryLookup} created from {@code getDictionary()}
 * @return the additional readings, or {@code null} if none of the three cases applies
 */
@Nullable
protected List<AnalyzedToken> additionalTags(String word, IStemmer stemmer) {
    final IStemmer dictLookup = new DictionaryLookup(getDictionary());
    List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
    // Feminine singular adjective or feminine singular participle + -ment
    if (word.endsWith("ment")) {
        final String lowerWord = word.toLowerCase(conversionLocale);
        final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
        List<AnalyzedToken> taggerTokens;
        taggerTokens = asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
        for (AnalyzedToken taggerToken : taggerTokens) {
            final String posTag = taggerToken.getPOSTag();
            if (posTag != null) {
                final Matcher m = ADJ_PART_FS.matcher(posTag);
                if (m.matches()) {
                    additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
                    return additionalTaggedTokens;
                }
            }
        }
        // no matching reading found: fall through to the remaining cases
    }
    // Any well-formed verb with prefixes is tagged as a verb copying the original tags
    Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
    if (matcher.matches()) {
        // Lower-case with conversionLocale (not the JVM default) so the dictionary lookup and
        // the lemma are the same under every default locale, e.g. Turkish.
        final String prefix = matcher.group(1).toLowerCase(conversionLocale);
        final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
        List<AnalyzedToken> taggerTokens;
        taggerTokens = asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
        for (AnalyzedToken taggerToken : taggerTokens) {
            final String posTag = taggerToken.getPOSTag();
            if (posTag != null) {
                final Matcher m = VERB.matcher(posTag);
                if (m.matches()) {
                    String lemma = prefix.concat(taggerToken.getLemma());
                    additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
                }
            }
        }
        return additionalTaggedTokens;
    }
    // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
    if (word.contains("ŀ") || word.contains("Ŀ")) {
        final String lowerWord = word.toLowerCase(conversionLocale);
        // literal replacement; the upper-case form has already been lower-cased above
        final String possibleWord = lowerWord.replace("ŀ", "l·");
        return asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
    }
    return null;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) Matcher(java.util.regex.Matcher) IStemmer(morfologik.stemming.IStemmer) ArrayList(java.util.ArrayList) DictionaryLookup(morfologik.stemming.DictionaryLookup) Nullable(org.jetbrains.annotations.Nullable)

Aggregations

AnalyzedToken (org.languagetool.AnalyzedToken): 89, AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings): 48, ArrayList (java.util.ArrayList): 43, Matcher (java.util.regex.Matcher): 16, Test (org.junit.Test): 16, IOException (java.io.IOException): 9, Pattern (java.util.regex.Pattern): 7, Nullable (org.jetbrains.annotations.Nullable): 6, TaggedWord (org.languagetool.tagging.TaggedWord): 6, RuleMatch (org.languagetool.rules.RuleMatch): 4, Synthesizer (org.languagetool.synthesis.Synthesizer): 4, InputStream (java.io.InputStream): 2, HashMap (java.util.HashMap): 2, LinkedHashSet (java.util.LinkedHashSet): 2, Scanner (java.util.Scanner): 2, TreeSet (java.util.TreeSet): 2, DictionaryLookup (morfologik.stemming.DictionaryLookup): 2, IStemmer (morfologik.stemming.IStemmer): 2, AnalyzedSentence (org.languagetool.AnalyzedSentence): 2, ChunkTag (org.languagetool.chunking.ChunkTag): 2