use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class QuestionWhitespaceRule method match.
/**
 * Reports places where French punctuation is not accompanied by the space that this
 * rule's messages ask for: before "?", "!", ";", ":" and "»", and after "«".
 *
 * @param sentence the analyzed sentence whose tokens are inspected
 * @return the matches found, converted with {@code toRuleMatchArray}; every match starts
 *         at the token preceding the one being examined
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
List<RuleMatch> ruleMatches = new ArrayList<>();
AnalyzedTokenReadings[] tokens = sentence.getTokens();
// Text of the token handled in the previous iteration; empty before the first iteration.
String prevToken = "";
// Starts at index 1, so tokens[0] is only ever used as "previous token" position.
// NOTE(review): presumably tokens[0] is a sentence-start marker - confirm.
for (int i = 1; i < tokens.length; i++) {
String token = tokens[i].getToken();
// True when the token reports whitespace before it and the previous token is not one of
// the two space literals compared here.
// NOTE(review): the two literals look identical in this view; presumably they are two
// different non-breaking space characters (see the U+00a0 remark below) - confirm
// against the raw bytes of the file.
boolean isWhiteBefore = tokens[i].isWhitespaceBefore() && !" ".equals(prevToken) && !" ".equals(prevToken);
// msg stays null unless a problem is found for this token.
String msg = null;
// Number of characters by which the match extends past the previous token.
int fixLen = 0;
String suggestionText = null;
if (isWhiteBefore) {
// Whitespace precedes the punctuation mark, but not one of the accepted space literals:
// suggest the mark preceded by the space contained in suggestionText.
// Tokens other than the five cases below leave msg null.
switch(token) {
case "?":
msg = "Point d'interrogation est précédé d'une espace fine insécable.";
// non-breaking space
suggestionText = " ?";
fixLen = 1;
break;
case "!":
msg = "Point d'exclamation est précédé d'une espace fine insécable.";
// non-breaking space
suggestionText = " !";
fixLen = 1;
break;
case "»":
msg = "Le guillemet fermant est précédé d'une espace fine insécable.";
// non-breaking space
suggestionText = " »";
fixLen = 1;
break;
case ";":
msg = "Point-virgule est précédé d'une espace fine insécable.";
// non-breaking space
suggestionText = " ;";
fixLen = 1;
break;
case ":":
msg = "Deux-points sont précédé d'une espace fine insécable.";
// non-breaking space
suggestionText = " :";
fixLen = 1;
break;
}
} else {
// Reached when no whitespace is reported before the token, or when the previous token
// equals one of the accepted space literals; the latter case is filtered out again by the
// prevToken comparisons in each condition below. The suggestion keeps the previous token
// and appends the space plus the punctuation mark.
// "espace insécable" (U+00a0) is also often used. Let's accept both.
// A "?" directly after "!" is not reported.
if (token.equals("?") && !prevToken.equals("!") && !prevToken.equals(" ") && !prevToken.equals(" ")) {
msg = "Point d'interrogation est précédé d'une espace fine insécable.";
// non-breaking space
suggestionText = prevToken + " ?";
fixLen = 1;
// A "!" directly after "?" is not reported.
} else if (token.equals("!") && !prevToken.equals("?") && !prevToken.equals(" ") && !prevToken.equals(" ")) {
msg = "Point d'exclamation est précédé d'une espace fine insécable.";
// non-breaking space
suggestionText = prevToken + " !";
fixLen = 1;
} else if (token.equals(";") && !prevToken.equals(" ") && !prevToken.equals(" ")) {
msg = "Point-virgule est précédé d'une espace fine insécable.";
// non-breaking space
suggestionText = prevToken + " ;";
fixLen = 1;
} else if (token.equals(":") && !prevToken.equals(" ") && !prevToken.equals(" ")) {
// Avoid false positive for URL like http://www.languagetool.org.
// urlPattern is declared outside this method; a ":" is only reported when the pattern
// is not found in the previous token.
Matcher matcherUrl = urlPattern.matcher(prevToken);
if (!matcherUrl.find()) {
msg = "Deux-points précédés d'une espace fine insécable.";
// non-breaking space
suggestionText = prevToken + " :";
fixLen = 1;
}
} else if (token.equals("»") && !prevToken.equals(" ") && !prevToken.equals(" ")) {
msg = "Le guillemet fermant est précédé d'une espace fine insécable.";
// non-breaking space
suggestionText = prevToken + " »";
fixLen = 1;
}
}
// Opening-guillemet checks run regardless of the branches above and overwrite any
// msg/suggestionText/fixLen set there when the previous token is "«".
// NOTE(review): StringTools.isEmpty is defined elsewhere; presumably it is true for
// null or empty token text - confirm.
if (StringTools.isEmpty(token) && prevToken.equals("«")) {
msg = "Le guillemet ouvrant est suivi d'une espace fine insécable.";
// non-breaking space
suggestionText = "« ";
fixLen = 1;
} else if (!StringTools.isEmpty(token) && prevToken.equals("«") && !token.equals(" ") && !token.equals(" ")) {
msg = "Le guillemet ouvrant est suivi d'une espace fine insécable.";
// non-breaking space
suggestionText = "« ";
// With fixLen 0 the match covers only the "«" itself.
fixLen = 0;
}
if (msg != null) {
// The match starts at the previous token and ends fixLen characters after its end.
int fromPos = tokens[i - 1].getStartPos();
int toPos = tokens[i - 1].getStartPos() + fixLen + tokens[i - 1].getToken().length();
RuleMatch ruleMatch = new RuleMatch(this, fromPos, toPos, msg, "Insérer un espace insécable");
if (suggestionText != null) {
ruleMatch.setSuggestedReplacement(suggestionText);
}
ruleMatches.add(ruleMatch);
}
// Remember this token's text for the next iteration.
prevToken = token;
}
return toRuleMatchArray(ruleMatches);
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class EnglishChunkFilterTest method testPluralByPluralNoun.
/**
 * Checks that a noun phrase is tagged as plural when its last token carries a plural-noun
 * reading ("NNS"), even though that token also has a verb reading ("VBZ").
 */
@Test
public void testPluralByPluralNoun() throws IOException {
  List<ChunkTaggedToken> chunkTokens = makeTokens("I/X have/N-VP ten/B-NP books/I-NP ./.");

  // Readings that will be attached to the replacement 'books' token.
  AnalyzedToken pluralNounReading = new AnalyzedToken("books", "NNS", "book");
  AnalyzedToken verbReading = new AnalyzedToken("books", "VBZ", "book");
  AnalyzedTokenReadings booksReadings =
      new AnalyzedTokenReadings(Arrays.asList(pluralNounReading, verbReading), 0);

  // Swap the generated 'books' token for one that carries the explicit readings.
  final int booksIndex = 3;
  chunkTokens.remove(booksIndex);
  chunkTokens.add(
      booksIndex,
      new ChunkTaggedToken("books", Collections.singletonList(new ChunkTag("I-NP")), booksReadings));

  assertChunks(chunkTokens, "I/X have/N-VP ten/B-NP-plural books/E-NP-plural ./.");
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class EnglishChunkerTest method createReadingsList.
/**
 * Splits the sentence at single spaces, keeping the spaces as tokens of their own, and
 * wraps every token in an {@code AnalyzedTokenReadings} positioned at its character offset.
 * Blank tokens get a reading without POS tag and lemma; all others get "fake" for both.
 *
 * @param sentence the text to split
 * @return one readings object per token, in sentence order
 */
private List<AnalyzedTokenReadings> createReadingsList(String sentence) {
  List<AnalyzedTokenReadings> readings = new ArrayList<>();
  int offset = 0;
  // The 'true' flag makes the tokenizer hand back the space delimiters as tokens, too.
  StringTokenizer parts = new StringTokenizer(sentence, " ", true);
  while (parts.hasMoreTokens()) {
    String part = parts.nextToken();
    // Whitespace-only tokens stay untagged; the same value serves as POS tag and lemma.
    String tagAndLemma = part.trim().isEmpty() ? null : "fake";
    readings.add(new AnalyzedTokenReadings(new AnalyzedToken(part, tagAndLemma, tagAndLemma), offset));
    offset += part.length();
  }
  return readings;
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class PolishTagger method tag.
/**
 * Tags each token of a sentence by looking it up in the word tagger as written and in
 * lowercase; a token that is already lowercase and still unknown is additionally tried
 * with its first character uppercased. Tokens that remain unknown get a single reading
 * without POS tag and lemma.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return one {@code AnalyzedTokenReadings} per token, positioned at the running sum of
 *         the preceding token lengths
 */
@Override
public final List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) {
  final List<AnalyzedTokenReadings> result = new ArrayList<>();
  int offset = 0;
  for (String word : sentenceTokens) {
    final String lowercased = word.toLowerCase(plLocale);
    final List<AnalyzedToken> exactMatches =
        asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word));
    final List<AnalyzedToken> lowercaseMatches =
        asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowercased));
    final boolean alreadyLowercase = word.equals(lowercased);

    final List<AnalyzedToken> readings = new ArrayList<>();
    // Readings for the word exactly as written.
    addTokens(exactMatches, readings);
    // Readings for the lowercase form, unless that is the same lookup as above.
    if (!alreadyLowercase) {
      addTokens(lowercaseMatches, readings);
    }

    if (exactMatches.isEmpty() && lowercaseMatches.isEmpty()) {
      boolean foundCapitalized = false;
      if (alreadyLowercase) {
        // Last resort for lowercase words: try the form with an uppercase first character.
        final List<AnalyzedToken> capitalizedMatches = asAnalyzedTokenListForTaggedWords(
            word, getWordTagger().tag(StringTools.uppercaseFirstChar(word)));
        if (!capitalizedMatches.isEmpty()) {
          addTokens(capitalizedMatches, readings);
          foundCapitalized = true;
        }
      }
      if (!foundCapitalized) {
        // Unknown word: keep it with an untagged reading.
        readings.add(new AnalyzedToken(word, null, null));
      }
    }

    result.add(new AnalyzedTokenReadings(readings, offset));
    offset += word.length();
  }
  return result;
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class EsperantoTagger method tag.
/**
 * Assigns readings to every token of an Esperanto sentence.
 *
 * <p>Tokens of two or more characters are lowercased and looked up in the manual tagger;
 * if it knows the word, its tags and lemmas are used as they are. Otherwise the word is
 * classified by pattern and ending (tabelvortoj, nouns, adjectives, adverbs, verbs), with
 * an untagged reading as fallback, and a participle reading is added on top when the
 * participle pattern matches. Shorter tokens always get a single untagged reading.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return one {@code AnalyzedTokenReadings} per input token, in the same order, positioned
 *         at the running sum of the preceding token lengths
 * @throws IOException propagated from the helpers called here (presumably from
 *         {@code lazyInit()} - confirm)
 */
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
  lazyInit();
  Matcher matcher;
  List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
  // Character offset of the current token: sum of the lengths of all preceding tokens.
  int pos = 0;
  for (String word : sentenceTokens) {
    List<AnalyzedToken> l = new ArrayList<>();
    // Single-letter words are left untagged to avoid spurious tagging
    // of "A", "O", "E", etc.
    if (word.length() > 1) {
      // Locale.ROOT keeps lowercasing independent of the JVM default locale; with e.g. a
      // Turkish default locale "I" would become a dotless i and no ending would match.
      String lWord = word.toLowerCase(java.util.Locale.ROOT);
      List<TaggedWord> manualTags = manualTagger.tag(lWord);
      if (!manualTags.isEmpty()) {
        // This is a closed word for which we know its lemmas and tags.
        for (TaggedWord manualTag : manualTags) {
          l.add(new AnalyzedToken(word, manualTag.getPosTag(), manualTag.getLemma()));
        }
      } else {
        // Tiu, kiu (tabelvortoj).
        if ((matcher = patternTabelvorto.matcher(lWord)).find()) {
          String type1Group = matcher.group(1).substring(0, 1).toLowerCase(java.util.Locale.ROOT);
          String type2Group = matcher.group(2);
          String plGroup = matcher.group(3);
          String accGroup = matcher.group(4);
          String type3Group = matcher.group(5);
          String type;
          String plural;
          String accusative;
          if (accGroup == null) {
            accusative = "xxx";
          } else {
            accusative = accGroup.equalsIgnoreCase("n") ? "akz" : "nak";
          }
          if (plGroup == null) {
            plural = " pn ";
          } else {
            plural = plGroup.equalsIgnoreCase("j") ? " pl " : " np ";
          }
          // The type comes from group 2 when it took part in the match, else from group 5.
          type = ((type2Group == null) ? type3Group : type2Group).toLowerCase(java.util.Locale.ROOT);
          l.add(new AnalyzedToken(word, "T " + accusative + plural + type1Group + " " + type, null));
          // A tabelvorto may additionally be read as an adverb.
          if ((matcher = patternTabelvortoAdverb.matcher(lWord)).find()) {
            l.add(new AnalyzedToken(word, "E nak", lWord));
          }
        // Words ending in .*oj?n? are nouns.
        } else if (lWord.endsWith("o")) {
          l.add(new AnalyzedToken(word, "O nak np", lWord));
        } else if (lWord.length() >= 2 && lWord.endsWith("'")) {
          // Trailing apostrophe: the lemma is formed by replacing it with "o".
          l.add(new AnalyzedToken(word, "O nak np", lWord.substring(0, lWord.length() - 1) + "o"));
        } else if (lWord.endsWith("oj")) {
          l.add(new AnalyzedToken(word, "O nak pl", lWord.substring(0, lWord.length() - 1)));
        } else if (lWord.endsWith("on")) {
          l.add(new AnalyzedToken(word, "O akz np", lWord.substring(0, lWord.length() - 1)));
        } else if (lWord.endsWith("ojn")) {
          l.add(new AnalyzedToken(word, "O akz pl", lWord.substring(0, lWord.length() - 2)));
        // Words ending in .*aj?n? are adjectives.
        } else if (lWord.endsWith("a")) {
          l.add(new AnalyzedToken(word, "A nak np", lWord));
        } else if (lWord.endsWith("aj")) {
          l.add(new AnalyzedToken(word, "A nak pl", lWord.substring(0, lWord.length() - 1)));
        } else if (lWord.endsWith("an")) {
          l.add(new AnalyzedToken(word, "A akz np", lWord.substring(0, lWord.length() - 1)));
        } else if (lWord.endsWith("ajn")) {
          l.add(new AnalyzedToken(word, "A akz pl", lWord.substring(0, lWord.length() - 2)));
        // Words ending in .*en? are adverbs.
        } else if (lWord.endsWith("e")) {
          l.add(new AnalyzedToken(word, "E nak", lWord));
        } else if (lWord.endsWith("en")) {
          l.add(new AnalyzedToken(word, "E akz", lWord.substring(0, lWord.length() - 1)));
        // Verbs.
        } else if ((matcher = patternVerb.matcher(lWord)).find()) {
          // The lemma is the matched stem plus "i".
          String verb = matcher.group(1) + "i";
          String tense = matcher.group(2);
          String transitive = findTransitivity(verb);
          l.add(new AnalyzedToken(word, "V " + transitive + " " + tense, verb));
        // Irregular word (no tag).
        } else {
          l.add(new AnalyzedToken(word, null, null));
        }
        // Participle (can be combined with other tags).
        if ((matcher = patternParticiple.matcher(lWord)).find()) {
          // Words listed in setNonParticiple only look like participles and are skipped.
          if (!setNonParticiple.contains(matcher.group(1))) {
            String verb = matcher.group(2) + "i";
            String aio = matcher.group(3);
            String antAt = matcher.group(4).equals("n") ? "n" : "-";
            String aoe = matcher.group(5);
            String plural = matcher.group(6).equals("j") ? "pl" : "np";
            String accusative = matcher.group(7).equals("n") ? "akz" : "nak";
            String transitive = findTransitivity(verb);
            l.add(new AnalyzedToken(word, "C " + accusative + " " + plural + " " + transitive + " " + aio + " " + antAt + " " + aoe, verb));
          }
        }
      }
    } else {
      // Single letter word (no tag).
      l.add(new AnalyzedToken(word, null, null));
    }
    tokenReadings.add(new AnalyzedTokenReadings(l, pos));
    pos += word.length();
  }
  return tokenReadings;
}
Aggregations