use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class KhmerWordRepeatRule method match.
/**
 * Looks for a word that is directly followed by the same word (compared
 * case-insensitively) among the sentence's non-whitespace tokens and reports
 * every such pair, unless {@code ignore(...)} rejects it.
 *
 * <p>Each reported match carries three suggested replacements, in this order:
 * the two words joined by a regular space, the word on its own, and the word
 * followed by {@code "ៗ"} (referred to as the "repetition character" by the
 * original author).
 *
 * @param sentence the analyzed sentence to inspect
 * @return the collected matches, converted with {@code toRuleMatchArray}
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
  List<RuleMatch> found = new ArrayList<>();
  AnalyzedTokenReadings[] words = sentence.getTokensWithoutWhitespace();
  AnalyzedTokenReadings[] allTokens = sentence.getTokens();
  String previous = "";
  // Index 0 holds SENT_START, so scanning begins at index 1.
  for (int idx = 1; idx < words.length; idx++) {
    String current = words[idx].getToken();
    // Evaluation order matters: ignore(...) is only consulted for real repeats.
    boolean repeated = isWord(current)
        && previous.equalsIgnoreCase(current)
        && !ignore(sentence, allTokens, idx);
    if (repeated) {
      int from = words[idx - 1].getStartPos();
      // The match ends after the second occurrence; both words have the same length.
      int to = words[idx].getStartPos() + previous.length();
      String message = messages.getString("repetition");
      String shortMessage = messages.getString("desc_repetition_short");
      RuleMatch hit = new RuleMatch(this, from, to, message, shortMessage);

      List<String> suggestions = new ArrayList<>();
      // Suggestion 1: keep both words, separated by a real space
      // (original note: replaces a zero-width space).
      suggestions.add(previous + " " + current);
      // Suggestion 2: drop the repeated word.
      suggestions.add(previous);
      // Suggestion 3: like suggestion 2, plus the "repetition character".
      suggestions.add(previous + "ៗ");
      hit.setSuggestedReplacements(suggestions);

      found.add(hit);
    }
    previous = current;
  }
  return toRuleMatchArray(found);
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class MorfologikRussianSpellerRule method ignoreToken.
/**
 * Decides whether the token at position {@code idx} is exempt from checking.
 *
 * <p>A token whose text is not fully matched by {@code RUSSIAN_LETTERS} is
 * always ignored (the original author's note: words that don't have letters
 * are not checked). Otherwise the decision is delegated to
 * {@code ignoreWord}, which receives the text of every token plus the index.
 *
 * @param tokens the tokens of the sentence being checked
 * @param idx index into {@code tokens} of the token in question
 * @return {@code true} if the token is to be ignored
 * @throws IOException declared by the overridden method; the visible code
 *         can only propagate it from {@code ignoreWord}
 */
@Override
protected boolean ignoreToken(AnalyzedTokenReadings[] tokens, int idx) throws IOException {
  String candidate = tokens[idx].getToken();
  boolean matchesRussianLetters = RUSSIAN_LETTERS.matcher(candidate).matches();
  if (matchesRussianLetters) {
    // Hand the plain token texts of the whole sentence to ignoreWord.
    List<String> surfaceForms = new ArrayList<>();
    for (int i = 0; i < tokens.length; i++) {
      surfaceForms.add(tokens[i].getToken());
    }
    return ignoreWord(surfaceForms, idx);
  }
  return true;
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class RussianPartialPosTagFilter method tag.
/**
 * Tags a single token and runs the result through the disambiguator.
 *
 * <p>The token is passed to {@code tagger} as a one-element list, the
 * resulting readings are wrapped in an {@code AnalyzedSentence}, and the
 * tokens of the disambiguated sentence are returned as a fixed-size list.
 *
 * @param token the token text to tag
 * @return the tokens of the disambiguated sentence
 * @throws RuntimeException wrapping any {@code IOException} raised while
 *         tagging or disambiguating
 */
@Override
protected List<AnalyzedTokenReadings> tag(String token) {
  try {
    List<AnalyzedTokenReadings> readings = tagger.tag(Collections.singletonList(token));
    AnalyzedSentence sentence =
        new AnalyzedSentence(readings.toArray(new AnalyzedTokenReadings[0]));
    AnalyzedTokenReadings[] resolved = disambiguator.disambiguate(sentence).getTokens();
    return Arrays.asList(resolved);
  } catch (IOException ioe) {
    throw new RuntimeException("Could not tag and disambiguate '" + token + "'", ioe);
  }
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class RussianTagger method tag.
/**
 * Produces one {@code AnalyzedTokenReadings} per input token.
 *
 * <p>Tokens longer than one character are normalized first: vowels written
 * with an acute or grave accent are replaced by the plain vowel, and
 * {@code "ʼ"} is replaced by {@code "ъ"}. The normalized text is then looked
 * up with {@code getAnalyzedTokens}.
 *
 * @param sentenceTokens the token texts of one sentence, in order
 * @return the readings for each token, in the same order
 * @throws IOException declared by the overridden method
 */
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
  // Ordered (search, replacement) pairs; they are applied top to bottom.
  final String[][] substitutions = {
    {"о́", "о"},
    {"а́", "а"},
    {"е́", "е"},
    {"у́", "у"},
    {"и́", "и"},
    {"ы́", "ы"},
    {"э́", "э"},
    {"ю́", "ю"},
    {"я́", "я"},
    {"о̀", "о"},
    {"а̀", "а"},
    {"ѐ", "е"},
    {"у̀", "у"},
    {"ѝ", "и"},
    {"ы̀", "ы"},
    {"э̀", "э"},
    {"ю̀", "ю"},
    {"я̀", "я"},
    {"ʼ", "ъ"},
  };

  List<AnalyzedTokenReadings> result = new ArrayList<>();
  int offset = 0;
  for (String rawToken : sentenceTokens) {
    String word = rawToken;
    // Single-character tokens are looked up unchanged.
    if (word.length() > 1) {
      for (String[] pair : substitutions) {
        word = word.replace(pair[0], pair[1]);
      }
    }
    List<AnalyzedToken> analyses = getAnalyzedTokens(word);
    result.add(new AnalyzedTokenReadings(analyses, offset));
    // NOTE(review): the offset advances by the length of the normalized word,
    // not of the raw token, so it can fall behind the raw text when accent
    // marks were removed - confirm whether callers depend on this.
    offset += word.length();
  }
  return result;
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class MultiWordChunkerTest method testDisambiguate.
/**
 * Checks that running the multi-word chunker over "A test... More." marks
 * token 4 with an opening {@code <ELLIPSIS>} reading and token 6 with a
 * closing {@code </ELLIPSIS>} reading.
 */
@Test
public void testDisambiguate() throws Exception {
  Disambiguator multiWordChunker = new MultiWordChunker("/pl/multiwords.txt");
  JLanguageTool languageTool = new JLanguageTool(new English());

  AnalyzedSentence input = languageTool.getAnalyzedSentence("A test... More.");
  AnalyzedTokenReadings[] chunked = multiWordChunker.disambiguate(input).getTokens();

  // Indices refer to the full token array returned by getTokens().
  String openingReadings = chunked[4].getReadings().toString();
  assertTrue(openingReadings.contains("<ELLIPSIS>"));
  String closingReadings = chunked[6].getReadings().toString();
  assertTrue(closingReadings.contains("</ELLIPSIS>"));
}
Aggregations