use of morfologik.stemming.DictionaryLookup in project languagetool by languagetool-org.
the class GermanTaggerEnhancer method run.
/**
 * Iterates over every entry of the dictionary at {@code /de/german.dict} and prints
 * tab-separated lines (word, word, tag) to standard output for selected words.
 *
 * <p>A word is selected when it ends in "er", starts with an uppercase letter,
 * {@code hasAdjReading} is false for it, {@code isEigenname} is true for the word
 * without its trailing "er", and it differs from the word printed most recently.
 * For each entry of {@code ADJ_READINGS} three lines are printed, with the suffixes
 * {@code :DEF}, {@code :IND} and {@code :SOL} appended to the tag.
 *
 * @throws IOException declared for the dictionary loading performed here
 */
private void run() throws IOException {
  final Dictionary germanDict = Dictionary.read(JLanguageTool.getDataBroker().getFromResourceDirAsUrl("/de/german.dict"));
  final DictionaryLookup entries = new DictionaryLookup(germanDict);
  Tagger germanTagger = new German().getTagger();
  // Remembers the last word that produced output so the same word is not printed twice in a row.
  String lastPrinted = null;
  for (WordData entry : entries) {
    String form = entry.getWord().toString();
    if (!form.endsWith("er") || !StringTools.startsWithUppercase(form)) {
      continue;
    }
    if (hasAdjReading(germanTagger, form)) {
      continue;
    }
    String withoutSuffix = form.substring(0, form.length() - 2);
    if (!isEigenname(germanTagger, withoutSuffix)) {
      continue;
    }
    if (form.equals(lastPrinted)) {
      continue;
    }
    for (String reading : ADJ_READINGS) {
      String linePrefix = form + "\t" + form + "\t" + reading;
      System.out.println(linePrefix + ":DEF");
      System.out.println(linePrefix + ":IND");
      System.out.println(linePrefix + ":SOL");
    }
    lastPrinted = form;
  }
}
use of morfologik.stemming.DictionaryLookup in project languagetool by languagetool-org.
the class GermanTaggerTest method testDictionary.
/**
 * Scans every entry of the dictionary at {@code /de/german.dict} and writes a warning
 * to standard error for each entry whose tag is {@code null} or empty. The test does
 * not fail on such entries; it only reports them.
 *
 * @throws IOException declared for the dictionary loading performed here
 */
@Test
public void testDictionary() throws IOException {
  Dictionary germanDict = Dictionary.read(JLanguageTool.getDataBroker().getFromResourceDirAsUrl("/de/german.dict"));
  DictionaryLookup entries = new DictionaryLookup(germanDict);
  for (WordData entry : entries) {
    // Entries that carry a non-empty tag need no report.
    if (entry.getTag() != null && entry.getTag().length() > 0) {
      continue;
    }
    System.err.println("**** Warning: the word " + entry.getWord() + "/" + entry.getStem() + " lacks a POS tag in the dictionary.");
  }
}
use of morfologik.stemming.DictionaryLookup in project languagetool by languagetool-org.
the class CatalanTagger method additionalTags.
/**
 * Produces extra readings for a word by looking up a derived form in the dictionary.
 * Three cases are tried in order, and the first one that applies decides the result:
 * <ol>
 *   <li>The word ends in "ment": the lower-cased word without that ending is looked up.
 *       If any reading's POS tag matches {@code ADJ_PART_FS}, a single token with POS tag
 *       "RG" and the lower-cased word as lemma is returned.</li>
 *   <li>The word matches {@code PREFIXES_FOR_VERBS}: group 2 is looked up, and every reading
 *       whose POS tag matches {@code VERB} is copied with the lower-cased group 1 prepended
 *       to its lemma. The list is returned even when it is empty.</li>
 *   <li>The word contains "ŀ" or "Ŀ": the lower-cased word with each "ŀ" replaced by "l·"
 *       is looked up and its readings are returned for the original word.</li>
 * </ol>
 * All lower-casing uses {@code conversionLocale}, so results do not depend on the JVM's
 * default locale.
 *
 * @param word the surface form to analyze
 * @param stemmer not used by this implementation; a new {@link DictionaryLookup} over
 *        {@code getDictionary()} is created for the lookups instead
 * @return the additional readings, or {@code null} when none of the three cases applies
 */
@Nullable
protected List<AnalyzedToken> additionalTags(String word, IStemmer stemmer) {
  final IStemmer dictLookup = new DictionaryLookup(getDictionary());
  List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
  // Feminine singular adjective or feminine singular participle + -ment
  if (word.endsWith("ment")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
    List<AnalyzedToken> taggerTokens;
    taggerTokens = asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null) {
        final Matcher m = ADJ_PART_FS.matcher(posTag);
        if (m.matches()) {
          // One matching reading is enough; stop at the first hit.
          additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
          return additionalTaggedTokens;
        }
      }
    }
  }
  // Any well-formed verb with prefixes is tagged as a verb copying the original tags
  Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
  if (matcher.matches()) {
    // Lower-case with the tagger's locale, as the other branches do, rather than the JVM default.
    final String prefix = matcher.group(1).toLowerCase(conversionLocale);
    final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
    List<AnalyzedToken> taggerTokens;
    taggerTokens = asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null) {
        final Matcher m = VERB.matcher(posTag);
        if (m.matches()) {
          String lemma = prefix.concat(taggerToken.getLemma());
          additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
        }
      }
    }
    return additionalTaggedTokens;
  }
  // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
  if (word.contains("ŀ") || word.contains("Ŀ")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    // Literal substitution; no regular expression is needed here.
    final String possibleWord = lowerWord.replace("ŀ", "l·");
    List<AnalyzedToken> taggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
    return taggerTokens;
  }
  return null;
}
use of morfologik.stemming.DictionaryLookup in project languagetool by languagetool-org.
the class MorfologikTagger method tag.
/**
 * Looks the given word up in the dictionary and converts every reading into a
 * {@code TaggedWord} built from the reading's stem and tag (either may be {@code null}).
 * When the dictionary metadata reports that frequency data is included and the tag is
 * longer than one character, the last character of the tag is removed.
 *
 * @param word the word to look up
 * @return one {@code TaggedWord} per dictionary reading, in lookup order; empty if there are none
 * @throws RuntimeException wrapping the {@link IOException} if one is thrown during the lookup setup
 */
@Override
public List<TaggedWord> tag(String word) {
  List<TaggedWord> taggedWords = new ArrayList<>();
  try {
    IStemmer stemmer = new DictionaryLookup(getDictionary());
    for (WordData entry : stemmer.lookup(word)) {
      String posTag = null;
      if (entry.getTag() != null) {
        posTag = entry.getTag().toString();
      }
      // The frequency data is in the last byte (without a separator)
      if (dictionary.metadata.isFrequencyIncluded() && posTag != null && posTag.length() > 1) {
        posTag = posTag.substring(0, posTag.length() - 1);
      }
      String lemma = null;
      if (entry.getStem() != null) {
        lemma = entry.getStem().toString();
      }
      taggedWords.add(new TaggedWord(lemma, posTag));
    }
  } catch (IOException e) {
    throw new RuntimeException("Could not tag word '" + word + "'", e);
  }
  return taggedWords;
}
use of morfologik.stemming.DictionaryLookup in project languagetool by languagetool-org.
the class TestTools method testDictionary.
/**
 * Scans every entry of the tagger's dictionary and writes a warning to standard error
 * for each entry whose tag is {@code null} or empty. Nothing is thrown for such entries;
 * they are only reported.
 *
 * @param tagger supplies the resource path of the dictionary via {@code getDictionaryPath()}
 * @param language included in each warning message
 * @throws IOException declared for the dictionary loading performed here
 */
public static void testDictionary(BaseTagger tagger, Language language) throws IOException {
  Dictionary taggerDict = Dictionary.read(JLanguageTool.getDataBroker().getFromResourceDirAsUrl(tagger.getDictionaryPath()));
  DictionaryLookup entries = new DictionaryLookup(taggerDict);
  for (WordData entry : entries) {
    // Entries that carry a non-empty tag need no report.
    if (entry.getTag() != null && entry.getTag().length() > 0) {
      continue;
    }
    System.err.println("**** Warning: " + language + ": the word " + entry.getWord() + "/" + entry.getStem() + " lacks a POS tag in the dictionary.");
  }
}
Aggregations