use of morfologik.stemming.WordData in project languagetool by languagetool-org.
the class GermanTaggerEnhancer method run.
/**
 * Walks every entry of the German binary dictionary and prints additional
 * tagger lines to standard output.
 *
 * A word qualifies when it ends in "er", starts with an uppercase letter,
 * {@code hasAdjReading} is false for it, {@code isEigenname} is true for the
 * word without its trailing "er", and it differs from the word printed last.
 * For every entry of {@code ADJ_READINGS} three tab-separated lines are
 * printed, with the suffixes ":DEF", ":IND" and ":SOL".
 *
 * @throws IOException declared for the dictionary loading and the tagger helpers
 */
private void run() throws IOException {
  final Dictionary dictionary = Dictionary.read(JLanguageTool.getDataBroker().getFromResourceDirAsUrl("/de/german.dict"));
  final DictionaryLookup entries = new DictionaryLookup(dictionary);
  Tagger tagger = new German().getTagger();
  // Remembers the last word that produced output so that directly repeated
  // dictionary entries for the same word are printed only once.
  String lastPrinted = null;
  for (WordData entry : entries) {
    String word = entry.getWord().toString();
    if (!word.endsWith("er") || !StringTools.startsWithUppercase(word)) {
      continue;
    }
    if (hasAdjReading(tagger, word)) {
      continue;
    }
    String withoutSuffix = word.substring(0, word.length() - 2);
    if (!isEigenname(tagger, withoutSuffix) || word.equals(lastPrinted)) {
      continue;
    }
    for (String newTags : ADJ_READINGS) {
      // Word form and lemma are both the word itself.
      String line = word + "\t" + word + "\t" + newTags;
      System.out.println(line + ":DEF");
      System.out.println(line + ":IND");
      System.out.println(line + ":SOL");
    }
    lastPrinted = word;
  }
}
use of morfologik.stemming.WordData in project languagetool by languagetool-org.
the class EnglishSynthesizer method synthesize.
/**
 * Produces the forms of a token for a requested part-of-speech tag.
 *
 * When {@code posTag} equals {@code ADD_DETERMINER}, the result holds the
 * suggestion of {@code aVsAnRule.suggestAorAn} for the token text followed by
 * the token text prefixed with "the ". When it equals
 * {@code ADD_IND_DETERMINER}, the result holds only that suggestion.
 * Otherwise the stemmer is queried with {@code lemma + "|" + posTag} and the
 * stem of every returned entry is collected, in lookup order.
 *
 * @param token AnalyzedToken to be inflected.
 * @param posTag A desired part-of-speech tag.
 * @return the generated forms; empty when the lookup yields no entries.
 */
@Override
public String[] synthesize(AnalyzedToken token, String posTag) throws IOException {
  String indefiniteForm = aVsAnRule.suggestAorAn(token.getToken());
  if (ADD_DETERMINER.equals(posTag)) {
    return new String[] { indefiniteForm, "the " + token.getToken() };
  }
  if (ADD_IND_DETERMINER.equals(posTag)) {
    return new String[] { indefiniteForm };
  }
  IStemmer synthesizer = createStemmer();
  List<WordData> entries = synthesizer.lookup(token.getLemma() + "|" + posTag);
  String[] forms = new String[entries.size()];
  int next = 0;
  for (WordData entry : entries) {
    forms[next++] = entry.getStem().toString();
  }
  return forms;
}
use of morfologik.stemming.WordData in project languagetool by languagetool-org.
the class GermanTaggerTest method testDictionary.
/**
 * Iterates over all entries of the German binary dictionary and writes a
 * warning to standard error for each entry whose POS tag is null or empty.
 * The test itself never fails on such entries; it only reports them.
 */
@Test
public void testDictionary() throws IOException {
  Dictionary dictionary = Dictionary.read(JLanguageTool.getDataBroker().getFromResourceDirAsUrl("/de/german.dict"));
  DictionaryLookup entries = new DictionaryLookup(dictionary);
  for (WordData entry : entries) {
    CharSequence posTag = entry.getTag();
    if (posTag != null && posTag.length() != 0) {
      continue;
    }
    System.err.println("**** Warning: the word " + entry.getWord() + "/" + entry.getStem() + " lacks a POS tag in the dictionary.");
  }
}
use of morfologik.stemming.WordData in project languagetool by languagetool-org.
the class CatalanSynthesizer method lookupWithEl.
/**
 * Lookup the inflected forms of a lemma defined by a part-of-speech tag.
 * Adds determiner "el" properly inflected and preposition
 * (prep. +) det. + noun. / adj.
 * <p>
 * The stemmer is queried with {@code lemma + "|" + posTag}. For every form
 * found, one result is appended for each of the patterns {@code pMS},
 * {@code pFS}, {@code pMP} and {@code pFP} that matches {@code posTag}, in
 * that order. For the two singular cases the form itself is tested against
 * {@code pMascYes}/{@code pMascNo} (resp. {@code pFemYes}/{@code pFemNo}) to
 * choose between the apostrophised determiner "l'" and the full one.
 *
 * @param lemma the lemma to be inflected.
 * @param posTag the desired part-of-speech tag.
 * @param prep the preposition to put before the determiner; the empty string
 *        means no preposition. Must not be null when a form is generated.
 * @param results the list to collect the inflected forms.
 * @param synthesizer the stemmer to use.
 */
private void lookupWithEl(String lemma, String posTag, String prep, List<String> results, IStemmer synthesizer) {
  final List<WordData> wordForms = synthesizer.lookup(lemma + "|" + posTag);
  // posTag is the same for every word form, so classify it once up front
  // instead of re-running the four matchers on each loop iteration.
  final boolean mascSingular = pMS.matcher(posTag).matches();
  final boolean femSingular = pFS.matcher(posTag).matches();
  final boolean mascPlural = pMP.matcher(posTag).matches();
  final boolean femPlural = pFP.matcher(posTag).matches();
  for (WordData wd : wordForms) {
    final String word = wd.getStem().toString();
    if (mascSingular) {
      final boolean apostrophe = pMascYes.matcher(word).matches() && !pMascNo.matcher(word).matches();
      results.add(apostrophe
          ? withDeterminer(prep, "per l'", "l'", " l'", word)
          : withDeterminer(prep, "pel ", "el ", "l ", word));
    }
    if (femSingular) {
      final boolean apostrophe = pFemYes.matcher(word).matches() && !pFemNo.matcher(word).matches();
      results.add(apostrophe
          ? withDeterminer(prep, "per l'", "l'", " l'", word)
          : withDeterminer(prep, "per la ", "la ", " la ", word));
    }
    if (mascPlural) {
      results.add(withDeterminer(prep, "pels ", "els ", "ls ", word));
    }
    if (femPlural) {
      results.add(withDeterminer(prep, "per les ", "les ", " les ", word));
    }
  }
}

/**
 * Puts the right preposition/determiner prefix in front of a word.
 *
 * @param prep the preposition, or the empty string for none.
 * @param perPrefix complete prefix to use when {@code prep} is "per".
 * @param barePrefix complete prefix to use when {@code prep} is empty.
 * @param joinedSuffix text appended directly to any other {@code prep}
 *        (e.g. "l " turns "de" into "del ").
 * @param word the inflected form to prefix.
 * @return the prefixed form.
 */
private static String withDeterminer(String prep, String perPrefix, String barePrefix, String joinedSuffix, String word) {
  if (prep.equals("per")) {
    return perPrefix + word;
  }
  if (prep.isEmpty()) {
    return barePrefix + word;
  }
  return prep + joinedSuffix + word;
}
use of morfologik.stemming.WordData in project languagetool by languagetool-org.
the class MorfologikTagger method tag.
/**
 * Looks up a word in the dictionary and returns one {@code TaggedWord}
 * (stem plus POS tag) per dictionary entry found.
 *
 * When the dictionary metadata reports that frequency data is included and
 * the tag is longer than one character, the last character of the tag is
 * dropped. A missing stem or tag is passed on as {@code null}.
 *
 * @param word the word form to look up
 * @return the readings found, in lookup order; empty if there are none
 * @throws RuntimeException wrapping the IOException if the lookup setup fails
 */
@Override
public List<TaggedWord> tag(String word) {
  List<TaggedWord> readings = new ArrayList<>();
  try {
    IStemmer stemmer = new DictionaryLookup(getDictionary());
    for (WordData entry : stemmer.lookup(word)) {
      String posTag = null;
      if (entry.getTag() != null) {
        posTag = entry.getTag().toString();
      }
      // The frequency data is in the last byte (without a separator)
      if (dictionary.metadata.isFrequencyIncluded() && posTag != null && posTag.length() > 1) {
        posTag = posTag.substring(0, posTag.length() - 1);
      }
      String lemma = null;
      if (entry.getStem() != null) {
        lemma = entry.getStem().toString();
      }
      readings.add(new TaggedWord(lemma, posTag));
    }
  } catch (IOException e) {
    throw new RuntimeException("Could not tag word '" + word + "'", e);
  }
  return readings;
}
Aggregations