Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
Example: the match method of the LongSentenceRule class.
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  String msg = MessageFormat.format(messages.getString("long_sentence_rule_msg"), maxWords);
  int numWords = 0;
  int pos = 0;
  if (tokens.length < maxWords + 1) {
    // just a short-circuit
    return toRuleMatchArray(ruleMatches);
  } else {
    for (AnalyzedTokenReadings aToken : tokens) {
      String token = aToken.getToken();
      // won't match the whole offending sentence, but much of it
      pos += token.length();
      if (!aToken.isSentenceStart() && !aToken.isSentenceEnd() && !NON_WORD_REGEX.matcher(token).matches()) {
        numWords++;
      }
    }
  }
  if (numWords > maxWords) {
    RuleMatch ruleMatch = new RuleMatch(this, 0, pos, msg);
    ruleMatches.add(ruleMatch);
  }
  return toRuleMatchArray(ruleMatches);
}
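As a point of reference, a minimal usage sketch (not part of the project source): run a full check with JLanguageTool and keep only the matches produced by LongSentenceRule, whose match() method is shown above. The sample text, the AmericanEnglish language module, and the assumption that the rule is active in the default configuration are all illustrative choices.

import java.util.List;
import org.languagetool.JLanguageTool;
import org.languagetool.language.AmericanEnglish;
import org.languagetool.rules.LongSentenceRule;
import org.languagetool.rules.RuleMatch;

public class LongSentenceDemo {
  public static void main(String[] args) throws Exception {
    JLanguageTool lt = new JLanguageTool(new AmericanEnglish());
    // check() tokenizes, analyzes and runs the active rules,
    // including sentence-level rules such as the match() shown above
    List<RuleMatch> matches = lt.check("A deliberately long run-on sentence would go here ...");
    for (RuleMatch m : matches) {
      // keep only matches produced by LongSentenceRule (assumes the rule is enabled)
      if (m.getRule() instanceof LongSentenceRule) {
        System.out.println(m.getFromPos() + "-" + m.getToPos() + ": " + m.getMessage());
      }
    }
  }
}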
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
Example: the acceptRuleMatch method of the PartialPosTagFilter class.
@Override
public RuleMatch acceptRuleMatch(RuleMatch match, Map<String, String> args, AnalyzedTokenReadings[] patternTokens) {
  if (!(args.containsKey("no") && args.containsKey("regexp") && args.containsKey("postag_regexp"))) {
    throw new RuntimeException("Set 'no', 'regexp' and 'postag_regexp' for filter " + PartialPosTagFilter.class.getSimpleName());
  }
  int tokenPos = Integer.parseInt(args.get("no"));
  Pattern pattern = Pattern.compile(args.get("regexp"));
  String requiredTagRegexp = args.get("postag_regexp");
  boolean negatePos = args.containsKey("negate_pos");
  boolean two_groups_regexp = args.containsKey("two_groups_regexp");
  String token = patternTokens[tokenPos - 1].getToken();
  Matcher matcher = pattern.matcher(token);
  if ((matcher.groupCount() != 1) && !(two_groups_regexp)) {
    throw new RuntimeException("Got " + matcher.groupCount() + " groups for regex '" + pattern.pattern() + "', expected 1");
  }
  if ((matcher.groupCount() != 2) && (two_groups_regexp)) {
    throw new RuntimeException("Got " + matcher.groupCount() + " groups for regex '" + pattern.pattern() + "', expected 2");
  }
  if (matcher.matches()) {
    String partialToken = matcher.group(1);
    if (matcher.groupCount() == 2) {
      partialToken = partialToken + matcher.group(2);
    }
    List<AnalyzedTokenReadings> tags = tag(partialToken);
    if (tags != null && partialTagHasRequiredTag(tags, requiredTagRegexp, negatePos)) {
      return match;
    }
    return null;
  }
  return null;
}
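PartialPosTagFilter is abstract: tag() is supplied by a language-specific subclass, and the filter is normally attached to a pattern rule through its args map. The following is a minimal sketch of that contract, assuming the filter lives in org.languagetool.rules.patterns and using an anonymous stub tagger; the example token, the CommaWhitespaceRule-based dummy match, and JLanguageTool.getMessageBundle() are illustrative assumptions, not how the project itself drives the filter.

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.rules.CommaWhitespaceRule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.patterns.PartialPosTagFilter;

public class PartialPosTagFilterDemo {
  public static void main(String[] args) throws Exception {
    // stub subclass for the demo: pretend every partial token is tagged as a noun ("NN")
    PartialPosTagFilter filter = new PartialPosTagFilter() {
      @Override
      protected List<AnalyzedTokenReadings> tag(String token) {
        return Collections.singletonList(
            new AnalyzedTokenReadings(new AnalyzedToken(token, "NN", token), 0));
      }
    };
    Map<String, String> filterArgs = new HashMap<>();
    filterArgs.put("no", "1");                // 1-based index into the pattern tokens
    filterArgs.put("regexp", "anti-(.*)");    // group 1 ("virus") is re-tagged by tag()
    filterArgs.put("postag_regexp", "NN.*");  // tag required on the partial token
    AnalyzedTokenReadings[] patternTokens = {
        new AnalyzedTokenReadings(new AnalyzedToken("anti-virus", null, null), 0)
    };
    // dummy match built from an arbitrary rule; in a real pattern rule the engine supplies it
    RuleMatch dummy = new RuleMatch(new CommaWhitespaceRule(JLanguageTool.getMessageBundle()), 0, 10, "demo match");
    // the filter returns the match unchanged when the partial token carries the required tag,
    // and null when it does not
    RuleMatch result = filter.acceptRuleMatch(dummy, filterArgs, patternTokens);
    System.out.println(result != null ? "match kept" : "match discarded");
  }
}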
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
Example: the match method of the SentenceWhitespaceRule class.
@Override
public RuleMatch[] match(List<AnalyzedSentence> sentences) throws IOException {
  boolean isFirstSentence = true;
  boolean prevSentenceEndsWithWhitespace = false;
  boolean prevSentenceEndsWithNumber = false;
  List<RuleMatch> ruleMatches = new ArrayList<>();
  int pos = 0;
  for (AnalyzedSentence sentence : sentences) {
    AnalyzedTokenReadings[] tokens = sentence.getTokens();
    if (isFirstSentence) {
      isFirstSentence = false;
    } else {
      if (!prevSentenceEndsWithWhitespace && tokens.length > 1) {
        int startPos = 0;
        String firstToken = tokens[1].getToken();
        int endPos = firstToken.length();
        RuleMatch ruleMatch = new RuleMatch(this, pos + startPos, pos + endPos, getMessage(prevSentenceEndsWithNumber));
        ruleMatch.setSuggestedReplacement(" " + firstToken);
        ruleMatches.add(ruleMatch);
      }
    }
    if (tokens.length > 0) {
      String lastToken = tokens[tokens.length - 1].getToken();
      prevSentenceEndsWithWhitespace = lastToken.trim().isEmpty() && lastToken.length() == 1;
    }
    if (tokens.length > 1) {
      String prevLastToken = tokens[tokens.length - 2].getToken();
      prevSentenceEndsWithNumber = NUMBER_REGEX.matcher(prevLastToken).matches();
    }
    pos += sentence.getText().length();
  }
  return toRuleMatchArray(ruleMatches);
}
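A minimal sketch (not from the project source) of driving this text-level rule directly: analyzeText() produces the List<AnalyzedSentence> that match() expects. The sample text is illustrative, and the single-ResourceBundle constructor and JLanguageTool.getMessageBundle() are assumptions that may differ between versions.

import java.util.List;
import org.languagetool.AnalyzedSentence;
import org.languagetool.JLanguageTool;
import org.languagetool.language.AmericanEnglish;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.SentenceWhitespaceRule;

public class SentenceWhitespaceDemo {
  public static void main(String[] args) throws Exception {
    JLanguageTool lt = new JLanguageTool(new AmericanEnglish());
    // note the missing space after the first period
    List<AnalyzedSentence> sentences = lt.analyzeText("This is a test.And another one.");
    // assumed constructor: a ResourceBundle with the localized rule messages
    SentenceWhitespaceRule rule = new SentenceWhitespaceRule(JLanguageTool.getMessageBundle());
    for (RuleMatch m : rule.match(sentences)) {
      System.out.println(m.getFromPos() + "-" + m.getToPos() + ": " + m.getSuggestedReplacements());
    }
  }
}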
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
Example: the loadWords method of the PortugueseAccentuationDataLoader class.
Map<String, AnalyzedTokenReadings> loadWords(String path) {
  final Map<String, AnalyzedTokenReadings> map = new HashMap<>();
  final InputStream inputStream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
  try (Scanner scanner = new Scanner(inputStream, FILE_ENCODING)) {
    while (scanner.hasNextLine()) {
      final String line = scanner.nextLine().trim();
      if (line.isEmpty() || line.charAt(0) == '#') {
        // ignore empty lines and comments
        continue;
      }
      final String[] parts = line.split(";");
      if (parts.length != 3) {
        throw new RuntimeException("Format error in file " + path + ", line: " + line + ", expected 3 semicolon-separated parts, got " + parts.length);
      }
      final AnalyzedToken analyzedToken = new AnalyzedToken(parts[1], parts[2], null);
      map.put(parts[0], new AnalyzedTokenReadings(analyzedToken, 0));
    }
  }
  return map;
}
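Reading the parser above, each data line carries three semicolon-separated fields: a surface form used as the map key, an accented word form, and its POS tag, with '#' starting a comment line. A minimal lookup sketch follows, assumed to sit next to the loader since loadWords() has package-private visibility; the file path and the sample entry are invented for illustration.

// hedged sketch, assumed to live in the same package as the loader
void demoLookup() {
  // hypothetical data line:  e;é;VMIP3S0  (surface form; accented form; POS tag)
  Map<String, AnalyzedTokenReadings> words = loadWords("/pt/accentuation-example.txt"); // placeholder path
  AnalyzedTokenReadings readings = words.get("e");
  if (readings != null) {
    AnalyzedToken reading = readings.getAnalyzedToken(0);
    System.out.println(reading.getToken());   // accented form, stored from parts[1]
    System.out.println(reading.getPOSTag());  // POS tag, stored from parts[2]
  }
}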
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
Example: the tag method of the FrenchPartialPosTagFilter class.
@Override
protected List<AnalyzedTokenReadings> tag(String token) {
  try {
    List<AnalyzedTokenReadings> tags = tagger.tag(Collections.singletonList(token));
    AnalyzedTokenReadings[] atr = tags.toArray(new AnalyzedTokenReadings[tags.size()]);
    AnalyzedSentence disambiguated = disambiguator.disambiguate(new AnalyzedSentence(atr));
    return Arrays.asList(disambiguated.getTokens());
  } catch (IOException e) {
    throw new RuntimeException("Could not tag and disambiguate '" + token + "'", e);
  }
}
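For comparison, a minimal sketch (not from the project source) of the same tag-then-disambiguate step using the tagger and disambiguator obtained from a French Language instance; the getTagger() and getDisambiguator() accessors match the API of this code's vintage but may have been superseded in later releases, and the sample token is illustrative.

import java.util.Collections;
import java.util.List;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.language.French;

public class PartialTagDemo {
  public static void main(String[] args) throws Exception {
    French french = new French();
    // tag a single token, then run the disambiguator over it, mirroring tag() above
    List<AnalyzedTokenReadings> tags = french.getTagger().tag(Collections.singletonList("mangera"));
    AnalyzedTokenReadings[] atr = tags.toArray(new AnalyzedTokenReadings[tags.size()]);
    AnalyzedSentence disambiguated = french.getDisambiguator().disambiguate(new AnalyzedSentence(atr));
    for (AnalyzedTokenReadings readings : disambiguated.getTokens()) {
      System.out.println(readings);
    }
  }
}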