use of morfologik.stemming.DictionaryLookup in project languagetool by languagetool-org.
the class PolishSynthesizer method synthesize.
/**
 * Synthesizes word forms of {@code token} for a POS tag that may be given as a regular expression.
 *
 * <p>When {@code posTagRegExp} is {@code false} the call is forwarded unchanged to
 * {@link #synthesize(AnalyzedToken, String)}. Otherwise {@code pos} is compiled as a regular
 * expression (every {@code '+'} is first rewritten to {@code '|'}) and matched against each entry
 * of {@code possibleTags}, which is loaded from {@code TAGS_FILE_NAME} the first time it is needed.
 * The forms found for all matching tags are collected and de-duplicated.
 *
 * @param token the token whose forms are requested
 * @param pos the POS tag, or a regular expression over POS tags
 * @param posTagRegExp whether {@code pos} is to be interpreted as a regular expression
 * @return the distinct forms found (in no particular order), or {@code null} if {@code pos} is {@code null}
 * @throws IOException declared by this method; also propagated from closing the tag list stream
 */
@Override
public final String[] synthesize(final AnalyzedToken token, final String pos, final boolean posTagRegExp) throws IOException {
  if (pos == null) {
    return null;
  }
  if (!posTagRegExp) {
    return synthesize(token, pos);
  }

  // Lazily load the list of known tags on first regexp-based request.
  if (possibleTags == null) {
    try (InputStream tagStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(TAGS_FILE_NAME)) {
      possibleTags = SynthesizerTools.loadWords(tagStream);
    }
  }
  final IStemmer lookup = new DictionaryLookup(getDictionary());
  final List<String> collected = new ArrayList<>();

  // Negation applies when the requested tag carries the negation marker, or when the token's
  // own tag carries it and the request asks for neither the comparative nor the superlative.
  final String tokenTag = token.getPOSTag();
  boolean negated = false;
  if (tokenTag != null) {
    final boolean requestNegated = pos.indexOf(NEGATION_TAG) > 0;
    final boolean tokenNegated = tokenTag.indexOf(NEGATION_TAG) > 0;
    final boolean graded = pos.indexOf(COMP_TAG) > 0 || pos.indexOf(SUP_TAG) > 0;
    negated = requestNegated || (tokenNegated && !graded);
  }

  final String tagExpression = negated
      ? pos.replaceAll(NEGATION_TAG, POTENTIAL_NEGATION_TAG + "?")
      : pos;
  final Pattern tagPattern = Pattern.compile(tagExpression.replace('+', '|'));

  for (final String candidate : possibleTags) {
    if (!tagPattern.matcher(candidate).matches()) {
      continue;
    }
    final List<String> forms = getWordForms(token, candidate, negated, lookup);
    if (forms != null) {
      collected.addAll(forms);
    }
  }

  // Drop duplicates. The set is deliberately created empty and filled with addAll (rather than
  // via the copy constructor) so that its table sizing, and hence its iteration order, is unchanged.
  final Set<String> unique = new HashSet<>();
  unique.addAll(collected);
  return unique.toArray(new String[unique.size()]);
}
use of morfologik.stemming.DictionaryLookup in project languagetool by languagetool-org.
the class CatalanTagger method tag.
/**
 * Produces one {@link AnalyzedTokenReadings} per entry of {@code sentenceTokens}.
 *
 * <p>For each word the readings are gathered in this order: the word tagger's result for the word
 * itself; the word tagger's result for the lowercased word (only if the word is neither already
 * lowercase nor mixed case); the result of {@code additionalTags} (only if nothing was found so far
 * and the word is not mixed case); and finally, if still empty, a single {@link AnalyzedToken}
 * built from the word with {@code null} for its two remaining arguments.
 *
 * @param sentenceTokens the tokens of one sentence
 * @return the readings, in the same order as the input tokens
 * @throws IOException declared by this method
 */
@Override
public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {
  final List<AnalyzedTokenReadings> result = new ArrayList<>();
  final IStemmer dictLookup = new DictionaryLookup(getDictionary());
  int offset = 0;

  for (final String rawWord : sentenceTokens) {
    // Hack so that rules and dictionary entries work with the typewriter apostrophe:
    // words longer than one character get the typographic apostrophe replaced, and we remember
    // whether the token already contained a typewriter apostrophe before that replacement.
    final boolean longerThanOneChar = rawWord.length() > 1;
    final boolean hadTypewriterApostrophe = longerThanOneChar && rawWord.contains("'");
    final String word = longerThanOneChar ? rawWord.replace("’", "'") : rawWord;

    final String lowerWord = word.toLowerCase(conversionLocale);
    final boolean differsFromLowercase = !word.equals(lowerWord);
    final boolean mixedCase = StringTools.isMixedCase(word);

    final List<AnalyzedToken> readings = new ArrayList<>();

    // Regular lookup with the word as written.
    addTokens(asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word)), readings);

    // Lookup of the lowercased form, still reported under the original word.
    if (differsFromLowercase && !mixedCase) {
      addTokens(asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowerWord)), readings);
    }

    if (readings.isEmpty()) {
      // Fallback: additional tagging with prefixes.
      if (!mixedCase) {
        addTokens(additionalTags(word, dictLookup), readings);
      }
      // Last resort: keep the word with no analysis at all.
      if (readings.isEmpty()) {
        readings.add(new AnalyzedToken(word, null, null));
      }
    }

    final AnalyzedTokenReadings tokenReadings = new AnalyzedTokenReadings(readings, offset);
    if (hadTypewriterApostrophe) {
      final List<ChunkTag> chunkTags = new ArrayList<>();
      chunkTags.add(new ChunkTag("containsTypewriterApostrophe"));
      tokenReadings.setChunkTags(chunkTags);
    }
    result.add(tokenReadings);
    offset += word.length();
  }
  return result;
}
use of morfologik.stemming.DictionaryLookup in project languagetool by languagetool-org.
the class PolishSynthesizer method synthesize.
/**
 * Synthesizes word forms of {@code token} for the given POS tag.
 *
 * <p>If {@code posTag} contains a {@code '+'} after its first character, the call is delegated to
 * {@link #synthesize(AnalyzedToken, String, boolean)} with regular-expression matching enabled.
 * Otherwise the forms are obtained from {@code getWordForms} using a fresh
 * {@link DictionaryLookup} over {@code getDictionary()}.
 *
 * @param token the token whose forms are requested
 * @param posTag the POS tag of the desired forms
 * @return the forms found (an empty array if none), or {@code null} if {@code posTag} is {@code null}
 * @throws IOException declared by this method
 */
@Override
public final String[] synthesize(final AnalyzedToken token, final String posTag) throws IOException {
  if (posTag == null) {
    return null;
  }
  // Delegate first: the regexp overload builds its own lookup and negation flag, so creating
  // them here beforehand would only be thrown away.
  if (posTag.indexOf('+') > 0) {
    return synthesize(token, posTag, true);
  }
  final IStemmer synthesizer = new DictionaryLookup(getDictionary());
  // Negation applies when the requested tag carries the negation marker, or when the token's
  // own tag carries it and the request asks for neither the comparative nor the superlative.
  boolean isNegated = false;
  if (token.getPOSTag() != null) {
    isNegated = posTag.indexOf(NEGATION_TAG) > 0 || token.getPOSTag().indexOf(NEGATION_TAG) > 0 && !(posTag.indexOf(COMP_TAG) > 0) && !(posTag.indexOf(SUP_TAG) > 0);
  }
  final List<String> forms = getWordForms(token, posTag, isNegated, synthesizer);
  // Same null guard as in the regexp overload; an absent result is reported as "no forms".
  if (forms == null) {
    return new String[0];
  }
  return forms.toArray(new String[forms.size()]);
}
Aggregations