use of morfologik.stemming.IStemmer in project languagetool by languagetool-org.
the class EnglishSynthesizer method synthesize.
/**
 * Gets the forms of a given AnalyzedToken, where the form is defined by a
 * part-of-speech tag.
 * <p>
 * Two pseudo-tags are answered without a dictionary lookup:
 * for {@code ADD_DETERMINER} the result holds the value returned by
 * {@code aVsAnRule.suggestAorAn} for the token text, followed by
 * {@code "the " + token text}; for {@code ADD_IND_DETERMINER} it holds only the
 * {@code suggestAorAn} value. Any other tag is looked up in the stemmer under the
 * key {@code lemma + "|" + posTag}.
 *
 * @param token AnalyzedToken to be inflected.
 * @param posTag A desired part-of-speech tag.
 * @return the forms found; an empty array when the lookup yields nothing.
 * @throws IOException declared by the signature; presumably raised by
 *         {@code createStemmer()} when the dictionary cannot be read (confirm
 *         against that method).
 */
@Override
public String[] synthesize(AnalyzedToken token, String posTag) throws IOException {
  // The a/an suggestion is only needed for the two determiner pseudo-tags, so it
  // is requested inside those branches rather than on every call.
  if (ADD_DETERMINER.equals(posTag)) {
    String surface = token.getToken();
    return new String[] { aVsAnRule.suggestAorAn(surface), "the " + surface };
  }
  if (ADD_IND_DETERMINER.equals(posTag)) {
    return new String[] { aVsAnRule.suggestAorAn(token.getToken()) };
  }
  IStemmer synthesizer = createStemmer();
  List<WordData> wordData = synthesizer.lookup(token.getLemma() + "|" + posTag);
  List<String> wordForms = new ArrayList<>(wordData.size());
  for (WordData wd : wordData) {
    // In the synthesizer dictionary the "stem" slot of an entry carries the form.
    wordForms.add(wd.getStem().toString());
  }
  return wordForms.toArray(new String[0]);
}
use of morfologik.stemming.IStemmer in project languagetool by languagetool-org.
the class CatalanSynthesizer method synthesize.
/**
 * Synthesizes forms of the token's lemma for every entry of {@code possibleTags}
 * that fully matches the requested tag pattern.
 * <p>
 * When {@code posTag} matches {@code pPrep}, a definite article (and, if the
 * pattern captured one, a preposition) is to be placed before the token; in that
 * mode the candidate tags are restricted to {@code N.*|A.*|V.P.*|PX.} and
 * {@code lookupWithEl} is used. Otherwise {@code posTag} itself is compiled as
 * the regular expression and plain {@code lookup} is used.
 * <p>
 * If nothing was found and {@code posTag} starts with {@code "V"}, two fallbacks
 * are tried: the tag with its last character replaced by {@code "0"}, then the
 * three-argument overload with the last character replaced by {@code "."}.
 *
 * @param token token whose lemma is inflected
 * @param posTag requested part-of-speech tag (treated as a regular expression)
 * @return the forms found, possibly an empty array
 * @throws IOException declared by the signature; propagated from the helpers called here
 */
@Override
public String[] synthesize(final AnalyzedToken token, final String posTag) throws IOException {
  initPossibleTags();

  final Matcher prepMatcher = pPrep.matcher(posTag);
  // A tag accepted by pPrep asks for a definite article before the token.
  final boolean withArticle = prepMatcher.matches();
  String preposition = "";
  if (withArticle && prepMatcher.groupCount() > 1) {
    // The second group holds the preposition to put before the article.
    preposition = prepMatcher.group(2);
  }

  final Pattern tagPattern = Pattern.compile(withArticle ? "N.*|A.*|V.P.*|PX." : posTag);
  final List<String> forms = new ArrayList<>();
  final IStemmer stemmer = createStemmer();

  for (final String candidateTag : possibleTags) {
    if (!tagPattern.matcher(candidateTag).matches()) {
      continue;
    }
    if (withArticle) {
      lookupWithEl(token.getLemma(), candidateTag, preposition, forms, stemmer);
    } else {
      lookup(token.getLemma(), candidateTag, forms);
    }
  }

  // Nothing found: try verbs from any regional variant.
  if (forms.isEmpty() && posTag.startsWith("V")) {
    final String tagWithoutLastChar = posTag.substring(0, posTag.length() - 1);
    if (!posTag.endsWith("0")) {
      lookup(token.getLemma(), tagWithoutLastChar.concat("0"), forms);
    }
    if (forms.isEmpty()) {
      // Last resort: let the final position match any character.
      return synthesize(token, tagWithoutLastChar.concat("."), true);
    }
  }
  return forms.toArray(new String[forms.size()]);
}
use of morfologik.stemming.IStemmer in project languagetool by languagetool-org.
the class CatalanTagger method additionalTags.
/**
 * Derives extra readings for a word using three heuristics, tried in order:
 * <ol>
 * <li>Words ending in {@code "ment"}: the part before the suffix is looked up; if
 *     any of its readings has a tag matching {@code ADJ_PART_FS}, a single
 *     reading tagged {@code "RG"} with the lower-cased word as lemma is returned.
 *     If none matches, the next heuristic is tried.</li>
 * <li>Words fully matching {@code PREFIXES_FOR_VERBS}: group 2 is looked up and
 *     every reading whose tag matches {@code VERB} is copied, with group 1
 *     prepended to its lemma. The (possibly empty) list is returned.</li>
 * <li>Words containing U+0140 / U+013F (l with middle dot): the lower-cased word
 *     with {@code "ŀ"} rewritten as {@code "l·"} is looked up and its readings
 *     are returned.</li>
 * </ol>
 *
 * @param word the word to analyse
 * @param stemmer not used by this implementation; a new lookup over
 *        {@code getDictionary()} is created instead
 * @return the additional readings, or {@code null} when no heuristic applies
 */
@Nullable
protected List<AnalyzedToken> additionalTags(String word, IStemmer stemmer) {
  final IStemmer dictLookup = new DictionaryLookup(getDictionary());
  List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
  // Feminine singular adjective or feminine singular participle + -ment
  if (word.endsWith("ment")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
    List<AnalyzedToken> taggerTokens;
    taggerTokens = asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null) {
        final Matcher m = ADJ_PART_FS.matcher(posTag);
        if (m.matches()) {
          // One matching reading is enough; stop at the first.
          additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
          return additionalTaggedTokens;
        }
      }
    }
  }
  // Any well-formed verb with prefixes is tagged as a verb copying the original tags
  Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
  if (matcher.matches()) {
    // Lower-case with conversionLocale, like the other branches of this method,
    // so the result does not depend on the JVM default locale.
    final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
    List<AnalyzedToken> taggerTokens;
    taggerTokens = asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null) {
        final Matcher m = VERB.matcher(posTag);
        if (m.matches()) {
          String lemma = matcher.group(1).toLowerCase(conversionLocale).concat(taggerToken.getLemma());
          additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
        }
      }
    }
    // Returned even when empty: a prefixed word never reaches the next heuristic.
    return additionalTaggedTokens;
  }
  // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
  if (word.contains("ŀ") || word.contains("Ŀ")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleWord = lowerWord.replaceAll("ŀ", "l·");
    List<AnalyzedToken> taggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
    return taggerTokens;
  }
  return null;
}
use of morfologik.stemming.IStemmer in project languagetool by languagetool-org.
the class MorfologikTagger method tag.
/**
 * Looks up {@code word} in the dictionary and converts every entry found into a
 * {@code TaggedWord} built from the entry's stem and tag (either may be
 * {@code null}). When the dictionary metadata reports that frequency data is
 * included, the last character of a tag longer than one character is dropped.
 *
 * @param word the word to look up
 * @return one {@code TaggedWord} per dictionary entry; empty when there is none
 * @throws RuntimeException wrapping the {@code IOException} if one is raised in
 *         the lookup block
 */
@Override
public List<TaggedWord> tag(String word) {
  final List<TaggedWord> taggedWords = new ArrayList<>();
  try {
    final IStemmer stemmer = new DictionaryLookup(getDictionary());
    for (WordData entry : stemmer.lookup(word)) {
      final CharSequence rawTag = entry.getTag();
      String posTag = null;
      if (rawTag != null) {
        posTag = rawTag.toString();
      }
      // The frequency data is in the last byte (without a separator)
      final boolean hasFrequencySuffix =
          dictionary.metadata.isFrequencyIncluded() && posTag != null && posTag.length() > 1;
      if (hasFrequencySuffix) {
        posTag = posTag.substring(0, posTag.length() - 1);
      }

      final CharSequence rawStem = entry.getStem();
      String lemma = null;
      if (rawStem != null) {
        lemma = rawStem.toString();
      }
      taggedWords.add(new TaggedWord(lemma, posTag));
    }
  } catch (IOException e) {
    throw new RuntimeException("Could not tag word '" + word + "'", e);
  }
  return taggedWords;
}
use of morfologik.stemming.IStemmer in project languagetool by languagetool-org.
the class PolishSynthesizer method synthesize.
/**
 * Synthesizes forms of a token for a part-of-speech tag that may be a regular
 * expression.
 * <p>
 * With {@code posTagRegExp == false} the call is delegated to the two-argument
 * overload. Otherwise {@code pos} (with {@code '+'} read as alternation) is
 * matched against every entry of {@code possibleTags}, which is loaded from
 * {@code TAGS_FILE_NAME} on first use, and the forms returned by
 * {@code getWordForms} for each matching tag are collected without duplicates.
 *
 * @param token token to inflect
 * @param pos requested tag or tag pattern; {@code null} yields {@code null}
 * @param posTagRegExp whether {@code pos} is to be treated as a regular expression
 * @return the distinct forms found (in {@code HashSet} iteration order), or
 *         {@code null} when {@code pos} is {@code null}
 * @throws IOException declared by the signature; propagated from resource and
 *         dictionary access
 */
@Override
public final String[] synthesize(final AnalyzedToken token, final String pos, final boolean posTagRegExp) throws IOException {
  if (pos == null) {
    return null;
  }
  if (!posTagRegExp) {
    // Plain tag: the two-argument overload handles it.
    return synthesize(token, pos);
  }

  if (possibleTags == null) {
    // Load the list of known tags on first use.
    try (InputStream tagStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(TAGS_FILE_NAME)) {
      possibleTags = SynthesizerTools.loadWords(tagStream);
    }
  }
  final IStemmer stemmer = new DictionaryLookup(getDictionary());

  final String tokenTag = token.getPOSTag();
  boolean negated = false;
  if (tokenTag != null) {
    // '&&' binds tighter than '||': a negation marker in the requested tag is
    // sufficient on its own; one in the token's tag only counts when the request
    // contains neither COMP_TAG nor SUP_TAG.
    // NOTE(review): grouping made explicit here; confirm it is the intended one.
    negated = pos.indexOf(NEGATION_TAG) > 0
        || (tokenTag.indexOf(NEGATION_TAG) > 0
            && pos.indexOf(COMP_TAG) <= 0
            && pos.indexOf(SUP_TAG) <= 0);
  }

  final String tagExpression = negated
      ? pos.replaceAll(NEGATION_TAG, POTENTIAL_NEGATION_TAG + "?")
      : pos;
  final Pattern tagPattern = Pattern.compile(tagExpression.replace('+', '|'));

  // A set drops duplicate forms as they arrive.
  final Set<String> uniqueForms = new HashSet<>();
  for (final String candidateTag : possibleTags) {
    if (!tagPattern.matcher(candidateTag).matches()) {
      continue;
    }
    final List<String> wordForms = getWordForms(token, candidateTag, negated, stemmer);
    if (wordForms != null) {
      uniqueForms.addAll(wordForms);
    }
  }
  return uniqueForms.toArray(new String[0]);
}
Aggregations