Search in sources :

Example 1 with IStemmer

use of morfologik.stemming.IStemmer in project languagetool by languagetool-org.

The method synthesize of the class EnglishSynthesizer.

/**
 * Get the forms of a given AnalyzedToken, where the form is defined by a
 * part-of-speech tag.
 * <p>
 * Two pseudo-tags are handled without a dictionary lookup:
 * {@code ADD_DETERMINER} yields the result of {@code aVsAnRule.suggestAorAn}
 * for the token text followed by {@code "the " + token text};
 * {@code ADD_IND_DETERMINER} yields only the {@code suggestAorAn} result.
 * For any other tag the key {@code lemma|posTag} is looked up in the stemmer
 * returned by {@code createStemmer()}.
 *
 * @param token AnalyzedToken to be inflected.
 * @param posTag A desired part-of-speech tag.
 * @return the inflected forms, one array entry per lookup hit (or the
 *         determiner variants described above).
 * @throws IOException declared by the overridden method.
 */
@Override
public String[] synthesize(AnalyzedToken token, String posTag) throws IOException {
    final boolean addDeterminer = ADD_DETERMINER.equals(posTag);
    if (addDeterminer || ADD_IND_DETERMINER.equals(posTag)) {
        // The a/an suggestion is only needed for the two determiner pseudo-tags,
        // so it is not computed for ordinary dictionary lookups.
        final String aOrAn = aVsAnRule.suggestAorAn(token.getToken());
        if (addDeterminer) {
            return new String[] { aOrAn, "the " + token.getToken() };
        }
        return new String[] { aOrAn };
    }
    final IStemmer synthesizer = createStemmer();
    final List<WordData> wordData = synthesizer.lookup(token.getLemma() + "|" + posTag);
    final List<String> wordForms = new ArrayList<>(wordData.size());
    for (WordData wd : wordData) {
        // Convert to String right away; the form is exposed through getStem().
        wordForms.add(wd.getStem().toString());
    }
    return wordForms.toArray(new String[0]);
}
Also used : IStemmer(morfologik.stemming.IStemmer) WordData(morfologik.stemming.WordData) ArrayList(java.util.ArrayList)

Example 2 with IStemmer

use of morfologik.stemming.IStemmer in project languagetool by languagetool-org.

The method synthesize of the class CatalanSynthesizer.

/**
 * Collects the forms of the token's lemma for every entry of
 * {@code possibleTags} that fully matches the requested tag.
 * <p>
 * When {@code posTag} matches {@code pPrep}, a definite article is added
 * before the token: candidate tags are restricted to
 * {@code N.*|A.*|V.P.*|PX.} and the forms are produced by
 * {@code lookupWithEl}, which also receives the preposition taken from
 * group 2 if the pattern declares more than one group. Otherwise
 * {@code posTag} itself is compiled as the regular expression and the forms
 * come from {@code lookup}.
 * <p>
 * If nothing was found and the tag starts with {@code "V"}, the last
 * character of the tag is replaced first by {@code "0"} and then by
 * {@code "."} (the latter through the regexp-enabled overload).
 *
 * @param token the token whose lemma is inflected
 * @param posTag the requested tag, interpreted as a regular expression
 * @return the forms found, possibly an empty array
 * @throws IOException declared by the overridden method
 */
@Override
public String[] synthesize(final AnalyzedToken token, final String posTag) throws IOException {
    initPossibleTags();
    final Matcher prepMatcher = pPrep.matcher(posTag);
    // add definite article before token
    final boolean withArticle = prepMatcher.matches();
    String preposition = "";
    if (withArticle && prepMatcher.groupCount() > 1) {
        // add preposition before article
        preposition = prepMatcher.group(2);
    }
    final Pattern tagPattern = Pattern.compile(withArticle ? "N.*|A.*|V.P.*|PX." : posTag);
    final List<String> forms = new ArrayList<>();
    final IStemmer stemmer = createStemmer();
    for (final String candidate : possibleTags) {
        if (!tagPattern.matcher(candidate).matches()) {
            continue;
        }
        if (withArticle) {
            lookupWithEl(token.getLemma(), candidate, preposition, forms, stemmer);
        } else {
            lookup(token.getLemma(), candidate, forms);
        }
    }
    // if not found, try verbs from any regional variant
    if (forms.isEmpty() && posTag.startsWith("V")) {
        final String tagPrefix = posTag.substring(0, posTag.length() - 1);
        if (!posTag.endsWith("0")) {
            lookup(token.getLemma(), tagPrefix.concat("0"), forms);
        }
        if (forms.isEmpty()) {
            // another try
            return synthesize(token, tagPrefix.concat("."), true);
        }
    }
    return forms.toArray(new String[forms.size()]);
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) IStemmer(morfologik.stemming.IStemmer) ArrayList(java.util.ArrayList)

Example 3 with IStemmer

use of morfologik.stemming.IStemmer in project languagetool by languagetool-org.

The method additionalTags of the class CatalanTagger.

/**
 * Derives extra readings for a word by checking three cases in order:
 * <ol>
 *   <li>the word ends in {@code "ment"} and the remainder has a dictionary
 *       reading whose tag matches {@code ADJ_PART_FS}: a single {@code "RG"}
 *       reading with the lower-cased word as lemma is returned;</li>
 *   <li>the word matches {@code PREFIXES_FOR_VERBS}: every dictionary reading
 *       of group 2 whose tag matches {@code VERB} is copied, with the
 *       lower-cased group 1 prepended to the lemma (the returned list may be
 *       empty);</li>
 *   <li>the word contains U+0140 / U+013F: the dictionary is queried with
 *       {@code "l·"} substituted for the lower-cased character.</li>
 * </ol>
 * All case conversions use {@code conversionLocale} so that the result does
 * not depend on the JVM default locale.
 *
 * @param word the surface form to analyze
 * @param stemmer not used by this implementation; lookups go through a
 *        {@code DictionaryLookup} built from {@code getDictionary()}
 * @return the derived readings, or {@code null} if none of the cases applies
 */
@Nullable
protected List<AnalyzedToken> additionalTags(String word, IStemmer stemmer) {
    final IStemmer dictLookup = new DictionaryLookup(getDictionary());
    List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
    // Feminine singular adjective or feminine singular participle + "-ment"
    if (word.endsWith("ment")) {
        final String lowerWord = word.toLowerCase(conversionLocale);
        final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
        List<AnalyzedToken> taggerTokens = asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
        for (AnalyzedToken taggerToken : taggerTokens) {
            final String posTag = taggerToken.getPOSTag();
            if (posTag != null && ADJ_PART_FS.matcher(posTag).matches()) {
                additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
                return additionalTaggedTokens;
            }
        }
        // No matching reading: fall through to the remaining cases.
    }
    // Any well-formed verb with prefixes is tagged as a verb copying the original tags
    Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
    if (matcher.matches()) {
        // Use the same locale as the other branches instead of the JVM default.
        final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
        List<AnalyzedToken> taggerTokens = asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
        for (AnalyzedToken taggerToken : taggerTokens) {
            final String posTag = taggerToken.getPOSTag();
            if (posTag != null && VERB.matcher(posTag).matches()) {
                String lemma = matcher.group(1).toLowerCase(conversionLocale).concat(taggerToken.getLemma());
                additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
            }
        }
        return additionalTaggedTokens;
    }
    // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
    if (word.contains("ŀ") || word.contains("Ŀ")) {
        final String lowerWord = word.toLowerCase(conversionLocale);
        // Literal replacement; no regular expression is needed here.
        final String possibleWord = lowerWord.replace("ŀ", "l·");
        return asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
    }
    return null;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) Matcher(java.util.regex.Matcher) IStemmer(morfologik.stemming.IStemmer) ArrayList(java.util.ArrayList) DictionaryLookup(morfologik.stemming.DictionaryLookup) Nullable(org.jetbrains.annotations.Nullable)

Example 4 with IStemmer

use of morfologik.stemming.IStemmer in project languagetool by languagetool-org.

The method tag of the class MorfologikTagger.

/**
 * Looks the word up in a {@code DictionaryLookup} built from
 * {@code getDictionary()} and converts every hit into a {@code TaggedWord}
 * made of the hit's stem and tag (either may be {@code null}).
 *
 * @param word the word to look up
 * @return one {@code TaggedWord} per dictionary hit, in lookup order
 * @throws RuntimeException wrapping the {@code IOException} if one occurs
 */
@Override
public List<TaggedWord> tag(String word) {
    List<TaggedWord> taggedWords = new ArrayList<>();
    try {
        IStemmer stemmer = new DictionaryLookup(getDictionary());
        for (WordData entry : stemmer.lookup(word)) {
            String tag = null;
            if (entry.getTag() != null) {
                tag = entry.getTag().toString();
            }
            // The frequency data is in the last byte (without a separator)
            if (dictionary.metadata.isFrequencyIncluded() && tag != null && tag.length() > 1) {
                tag = tag.substring(0, tag.length() - 1);
            }
            String stem = null;
            if (entry.getStem() != null) {
                stem = entry.getStem().toString();
            }
            taggedWords.add(new TaggedWord(stem, tag));
        }
    } catch (IOException e) {
        throw new RuntimeException("Could not tag word '" + word + "'", e);
    }
    return taggedWords;
}
Also used : IStemmer(morfologik.stemming.IStemmer) WordData(morfologik.stemming.WordData) ArrayList(java.util.ArrayList) IOException(java.io.IOException) DictionaryLookup(morfologik.stemming.DictionaryLookup)

Example 5 with IStemmer

use of morfologik.stemming.IStemmer in project languagetool by languagetool-org.

The method synthesize of the class PolishSynthesizer.

/**
 * Synthesizes forms for a token, optionally treating the tag as a regular
 * expression.
 * <p>
 * Without {@code posTagRegExp} the call is delegated to
 * {@code synthesize(token, pos)}. With it, {@code possibleTags} is loaded
 * from {@code TAGS_FILE_NAME} on first use, every {@code '+'} in the tag is
 * turned into the regex alternation {@code '|'}, and the word forms of all
 * fully matching tags are collected and de-duplicated.
 * <p>
 * A request counts as negated (only evaluated when the token has a POS tag)
 * if the requested tag contains {@code NEGATION_TAG} past index 0, or if the
 * token's tag does and the requested tag contains neither {@code COMP_TAG}
 * nor {@code SUP_TAG} past index 0. For negated requests
 * {@code NEGATION_TAG} in the pattern is replaced by an optional
 * {@code POTENTIAL_NEGATION_TAG}.
 *
 * @param token the token to inflect
 * @param pos the requested tag, or {@code null}
 * @param posTagRegExp whether {@code pos} is a regular expression
 * @return the forms found (order is that of a {@code HashSet} iteration),
 *         or {@code null} if {@code pos} is {@code null}
 * @throws IOException if loading the tag list fails
 */
@Override
public final String[] synthesize(final AnalyzedToken token, final String pos, final boolean posTagRegExp) throws IOException {
    if (pos == null) {
        return null;
    }
    if (!posTagRegExp) {
        return synthesize(token, pos);
    }
    if (possibleTags == null) {
        try (InputStream stream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(TAGS_FILE_NAME)) {
            possibleTags = SynthesizerTools.loadWords(stream);
        }
    }
    final IStemmer synthesizer = new DictionaryLookup(getDictionary());
    final List<String> collected = new ArrayList<>();

    final String tokenTag = token.getPOSTag();
    boolean isNegated = false;
    if (tokenTag != null) {
        final boolean requestedNegation = pos.indexOf(NEGATION_TAG) > 0;
        isNegated = requestedNegation
            || (tokenTag.indexOf(NEGATION_TAG) > 0
                && pos.indexOf(COMP_TAG) <= 0
                && pos.indexOf(SUP_TAG) <= 0);
    }
    final String tagRegex = isNegated
        ? pos.replaceAll(NEGATION_TAG, POTENTIAL_NEGATION_TAG + "?")
        : pos;
    final Pattern tagPattern = Pattern.compile(tagRegex.replace('+', '|'));
    for (final String candidate : possibleTags) {
        if (!tagPattern.matcher(candidate).matches()) {
            continue;
        }
        final List<String> wordForms = getWordForms(token, candidate, isNegated, synthesizer);
        if (wordForms != null) {
            collected.addAll(wordForms);
        }
    }
    //remove duplicates
    // Default-constructed set filled via addAll: keeps the table sizing, and
    // therefore the iteration order, of the original implementation.
    final Set<String> unique = new HashSet<>();
    unique.addAll(collected);
    return unique.toArray(new String[unique.size()]);
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) InputStream(java.io.InputStream) IStemmer(morfologik.stemming.IStemmer) ArrayList(java.util.ArrayList) DictionaryLookup(morfologik.stemming.DictionaryLookup) HashSet(java.util.HashSet)

Aggregations

IStemmer (morfologik.stemming.IStemmer): 7, ArrayList (java.util.ArrayList): 6, DictionaryLookup (morfologik.stemming.DictionaryLookup): 5, Matcher (java.util.regex.Matcher): 3, Pattern (java.util.regex.Pattern): 2, WordData (morfologik.stemming.WordData): 2, AnalyzedToken (org.languagetool.AnalyzedToken): 2, IOException (java.io.IOException): 1, InputStream (java.io.InputStream): 1, HashSet (java.util.HashSet): 1, Nullable (org.jetbrains.annotations.Nullable): 1, AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings): 1, ChunkTag (org.languagetool.chunking.ChunkTag): 1