Search in sources :

Example 1 with WordData

use of morfologik.stemming.WordData in project languagetool by languagetool-org.

the class GermanTaggerEnhancer method run.

/**
 * Walks over every entry of the German dictionary resource and prints extra
 * readings to standard output for selected words.
 * <p>
 * A word is selected when it ends in "er", starts with an uppercase letter,
 * has no adjective reading according to {@code hasAdjReading}, its form
 * without the trailing "er" is accepted by {@code isEigenname}, and it is not
 * the same word that was printed last. For each selected word and each entry
 * of {@code ADJ_READINGS}, three tab-separated lines (word, word, tags) are
 * printed, with the suffixes {@code :DEF}, {@code :IND} and {@code :SOL}.
 *
 * @throws IOException declared by this method; presumably raised when the
 *         dictionary cannot be read or a tagger lookup fails (verify against
 *         the helpers)
 */
private void run() throws IOException {
    final DictionaryLookup lookup = new DictionaryLookup(
            Dictionary.read(JLanguageTool.getDataBroker().getFromResourceDirAsUrl("/de/german.dict")));
    final Tagger germanTagger = new German().getTagger();
    // Remembers the most recently printed word so an immediately repeated entry is not printed twice.
    String lastPrinted = null;
    for (WordData entry : lookup) {
        final String word = entry.getWord().toString();
        if (!word.endsWith("er") || !StringTools.startsWithUppercase(word)) {
            continue;
        }
        if (hasAdjReading(germanTagger, word)) {
            continue;
        }
        // Safe: the word is known to end in "er", so it has at least two characters.
        final String withoutSuffix = word.substring(0, word.length() - 2);
        if (!isEigenname(germanTagger, withoutSuffix) || word.equals(lastPrinted)) {
            continue;
        }
        for (String newTags : ADJ_READINGS) {
            final String linePrefix = word + "\t" + word + "\t" + newTags;
            System.out.println(linePrefix + ":DEF");
            System.out.println(linePrefix + ":IND");
            System.out.println(linePrefix + ":SOL");
        }
        lastPrinted = word;
    }
}
Also used : Dictionary(morfologik.stemming.Dictionary) Tagger(org.languagetool.tagging.Tagger) WordData(morfologik.stemming.WordData) German(org.languagetool.language.German) DictionaryLookup(morfologik.stemming.DictionaryLookup)

Example 2 with WordData

use of morfologik.stemming.WordData in project languagetool by languagetool-org.

the class EnglishSynthesizer method synthesize.

/**
   * Get the forms of a given AnalyzedToken, where the form is defined by a
   * part-of-speech tag.
   * <p>
   * Two special tags are handled without a dictionary lookup: for
   * {@code ADD_DETERMINER} the result holds the value returned by
   * {@code aVsAnRule.suggestAorAn} for the token text, followed by "the " plus
   * the token text; for {@code ADD_IND_DETERMINER} it holds only the former.
   * For any other tag the stemmer is queried with the key
   * {@code lemma + "|" + posTag}.
   *
   * @param token AnalyzedToken to be inflected.
   * @param posTag A desired part-of-speech tag.
   * @return the inflected forms, one array element per lookup result; an
   *         empty array when the lookup yields nothing.
   * @throws IOException declared by this method; presumably raised when the
   *         stemmer cannot be created (verify against createStemmer).
   */
@Override
public String[] synthesize(AnalyzedToken token, String posTag) throws IOException {
    final String determinerForm = aVsAnRule.suggestAorAn(token.getToken());
    if (ADD_DETERMINER.equals(posTag)) {
        return new String[] { determinerForm, "the " + token.getToken() };
    }
    if (ADD_IND_DETERMINER.equals(posTag)) {
        return new String[] { determinerForm };
    }
    final List<WordData> matches = createStemmer().lookup(token.getLemma() + "|" + posTag);
    // Fill the result array directly; its length is known up front.
    final String[] forms = new String[matches.size()];
    int next = 0;
    for (WordData match : matches) {
        forms[next++] = match.getStem().toString();
    }
    return forms;
}
Also used : IStemmer(morfologik.stemming.IStemmer) WordData(morfologik.stemming.WordData) ArrayList(java.util.ArrayList)

Example 3 with WordData

use of morfologik.stemming.WordData in project languagetool by languagetool-org.

the class GermanTaggerTest method testDictionary.

/**
 * Scans every entry of the German dictionary resource and reports, on
 * standard error, each entry whose POS tag is null or empty. The check only
 * prints warnings; it does not fail the test.
 */
@Test
public void testDictionary() throws IOException {
    final DictionaryLookup lookup = new DictionaryLookup(
            Dictionary.read(JLanguageTool.getDataBroker().getFromResourceDirAsUrl("/de/german.dict")));
    for (WordData entry : lookup) {
        final CharSequence posTag = entry.getTag();
        final boolean tagged = posTag != null && posTag.length() != 0;
        if (tagged) {
            continue;
        }
        System.err.println("**** Warning: the word " + entry.getWord() + "/" + entry.getStem() + " lacks a POS tag in the dictionary.");
    }
}
Also used : Dictionary(morfologik.stemming.Dictionary) WordData(morfologik.stemming.WordData) DictionaryLookup(morfologik.stemming.DictionaryLookup) Test(org.junit.Test)

Example 4 with WordData

use of morfologik.stemming.WordData in project languagetool by languagetool-org.

the class CatalanSynthesizer method lookupWithEl.

/**
   * Lookup the inflected forms of a lemma defined by a part-of-speech tag.
   * Adds determiner "el" properly inflected and preposition
   * (prep. +) det. + noun. / adj.
   * <p>
   * The stemmer is queried with the key {@code lemma + "|" + posTag}. Which
   * article is used depends on which of the patterns {@code pMS}, {@code pFS},
   * {@code pMP} and {@code pFP} match {@code posTag}; the checks are
   * independent, so one form may produce several results. For singular forms
   * the elided article "l'" is used when the word matches
   * {@code pMascYes}/{@code pFemYes} and does not match
   * {@code pMascNo}/{@code pFemNo}.
   * @param lemma the lemma to be inflected.
   * @param posTag the desired part-of-speech tag.
   * @param prep the preposition to place before the determiner. An empty
   *        string means no preposition; "per" yields "pel"/"pels" for
   *        masculine forms; any other value is fused with the masculine
   *        article as {@code prep + "l"} / {@code prep + "ls"}. Must not be null.
   * @param results the list to collect the inflected forms.
   * @param synthesizer the stemmer to use.
   */
private void lookupWithEl(String lemma, String posTag, String prep, List<String> results, IStemmer synthesizer) {
    final List<WordData> wordForms = synthesizer.lookup(lemma + "|" + posTag);
    // posTag is the same for every word form, so classify it once up front
    // instead of re-running the four regex matches on each iteration.
    final boolean mascSingular = pMS.matcher(posTag).matches();
    final boolean femSingular = pFS.matcher(posTag).matches();
    final boolean mascPlural = pMP.matcher(posTag).matches();
    final boolean femPlural = pFP.matcher(posTag).matches();
    for (WordData wd : wordForms) {
        final String word = wd.getStem().toString();
        if (mascSingular) {
            // Decide once per word whether the article is elided to "l'".
            final boolean elide = pMascYes.matcher(word).matches() && !pMascNo.matcher(word).matches();
            if (prep.equals("per")) {
                results.add(elide ? "per l'" + word : "pel " + word);
            } else if (prep.isEmpty()) {
                results.add(elide ? "l'" + word : "el " + word);
            } else {
                results.add(elide ? prep + " l'" + word : prep + "l " + word);
            }
        }
        if (femSingular) {
            final boolean elide = pFemYes.matcher(word).matches() && !pFemNo.matcher(word).matches();
            if (prep.equals("per")) {
                results.add(elide ? "per l'" + word : "per la " + word);
            } else if (prep.isEmpty()) {
                results.add(elide ? "l'" + word : "la " + word);
            } else {
                results.add(elide ? prep + " l'" + word : prep + " la " + word);
            }
        }
        if (mascPlural) {
            if (prep.equals("per")) {
                results.add("pels " + word);
            } else if (prep.isEmpty()) {
                results.add("els " + word);
            } else {
                results.add(prep + "ls " + word);
            }
        }
        if (femPlural) {
            // "per" needs no special case here: it goes through the generic prep + " les " form.
            if (prep.isEmpty()) {
                results.add("les " + word);
            } else {
                results.add(prep + " les " + word);
            }
        }
    }
}
Also used : Matcher(java.util.regex.Matcher) WordData(morfologik.stemming.WordData)

Example 5 with WordData

use of morfologik.stemming.WordData in project languagetool by languagetool-org.

the class MorfologikTagger method tag.

/**
 * Looks the given word up in the dictionary and returns one
 * {@code TaggedWord} (stem plus tag) per lookup result.
 * <p>
 * When the dictionary metadata reports that frequency data is included and
 * the tag is longer than one character, the last character of the tag is
 * dropped before it is stored. A missing stem or tag is passed on as
 * {@code null}.
 *
 * @param word the word to look up
 * @return the tagged readings in lookup order; empty when nothing was found
 * @throws RuntimeException wrapping the {@code IOException} when one is
 *         raised while tagging
 */
@Override
public List<TaggedWord> tag(String word) {
    final List<TaggedWord> taggedWords = new ArrayList<>();
    try {
        final IStemmer stemmer = new DictionaryLookup(getDictionary());
        for (WordData entry : stemmer.lookup(word)) {
            final CharSequence rawTag = entry.getTag();
            String posTag = null;
            if (rawTag != null) {
                posTag = rawTag.toString();
            }
            // The frequency data is in the last byte (without a separator)
            final boolean stripFrequency = dictionary.metadata.isFrequencyIncluded() && posTag != null && posTag.length() > 1;
            if (stripFrequency) {
                posTag = posTag.substring(0, posTag.length() - 1);
            }
            final CharSequence rawStem = entry.getStem();
            String lemma = null;
            if (rawStem != null) {
                lemma = rawStem.toString();
            }
            taggedWords.add(new TaggedWord(lemma, posTag));
        }
    } catch (IOException e) {
        throw new RuntimeException("Could not tag word '" + word + "'", e);
    }
    return taggedWords;
}
Also used : IStemmer(morfologik.stemming.IStemmer) WordData(morfologik.stemming.WordData) ArrayList(java.util.ArrayList) IOException(java.io.IOException) DictionaryLookup(morfologik.stemming.DictionaryLookup)

Aggregations

WordData (morfologik.stemming.WordData): 8, DictionaryLookup (morfologik.stemming.DictionaryLookup): 4, ArrayList (java.util.ArrayList): 3, Dictionary (morfologik.stemming.Dictionary): 3, IStemmer (morfologik.stemming.IStemmer): 2, IOException (java.io.IOException): 1, Matcher (java.util.regex.Matcher): 1, Test (org.junit.Test): 1, German (org.languagetool.language.German): 1, Tagger (org.languagetool.tagging.Tagger): 1