Search in sources :

Example 21 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class LongSentenceRule method match.

/**
 * Reports a single match when the sentence contains more than {@code maxWords} words.
 *
 * <p>A token counts as a word unless it is flagged as sentence start, flagged as
 * sentence end, or fully matched by {@code NON_WORD_REGEX}. The reported span starts
 * at offset 0 and ends at the summed length of all non-whitespace tokens, so it
 * covers much of the sentence but not necessarily all of it.
 *
 * @param sentence the analyzed sentence to check
 * @return an array holding one match if the sentence is too long, otherwise an empty array
 * @throws IOException declared by the overridden method
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
    List<RuleMatch> found = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    String msg = MessageFormat.format(messages.getString("long_sentence_rule_msg"), maxWords);
    // Short-circuit: with this few tokens the word count cannot exceed the limit.
    if (maxWords + 1 > tokens.length) {
        return toRuleMatchArray(found);
    }
    int wordCount = 0;
    int spanEnd = 0;
    for (int i = 0; i < tokens.length; i++) {
        AnalyzedTokenReadings reading = tokens[i];
        String text = reading.getToken();
        // Every token widens the span, whether or not it counts as a word.
        spanEnd += text.length();
        boolean isBoundary = reading.isSentenceStart() || reading.isSentenceEnd();
        if (isBoundary || NON_WORD_REGEX.matcher(text).matches()) {
            continue;
        }
        wordCount++;
    }
    if (wordCount > maxWords) {
        found.add(new RuleMatch(this, 0, spanEnd, msg));
    }
    return toRuleMatchArray(found);
}
Also used : ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 22 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class PartialPosTagFilter method acceptRuleMatch.

/**
 * Keeps or rejects a rule match depending on how a part of one pattern token is tagged.
 *
 * <p>Required arguments: {@code no} (1-based index into {@code patternTokens}),
 * {@code regexp} (applied to the whole token; must define exactly one capturing
 * group, or exactly two when {@code two_groups_regexp} is present) and
 * {@code postag_regexp}. The presence of {@code negate_pos} is forwarded to
 * {@code partialTagHasRequiredTag}. The captured group (or the concatenation of
 * both groups) is tagged via {@code tag} and then checked.
 *
 * @param match the candidate match
 * @param args filter arguments as described above
 * @param patternTokens the tokens matched by the pattern
 * @return {@code match} if the token matches the regexp, tagging yields a non-null
 *         result and {@code partialTagHasRequiredTag} accepts it; otherwise {@code null}
 * @throws RuntimeException if a required argument is missing or the regexp has an
 *         unexpected number of capturing groups
 */
@Override
public RuleMatch acceptRuleMatch(RuleMatch match, Map<String, String> args, AnalyzedTokenReadings[] patternTokens) {
    boolean hasRequiredArgs = args.containsKey("no") && args.containsKey("regexp") && args.containsKey("postag_regexp");
    if (!hasRequiredArgs) {
        throw new RuntimeException("Set 'no', 'regexp' and 'postag_regexp' for filter " + PartialPosTagFilter.class.getSimpleName());
    }
    int tokenNumber = Integer.parseInt(args.get("no"));
    Pattern tokenPattern = Pattern.compile(args.get("regexp"));
    String posTagRegexp = args.get("postag_regexp");
    boolean negatePos = args.containsKey("negate_pos");
    boolean expectTwoGroups = args.containsKey("two_groups_regexp");
    String tokenText = patternTokens[tokenNumber - 1].getToken();
    Matcher m = tokenPattern.matcher(tokenText);

    // The group count is a property of the pattern, so it is validated before matching.
    int expectedGroups = expectTwoGroups ? 2 : 1;
    int actualGroups = m.groupCount();
    if (actualGroups != expectedGroups) {
        throw new RuntimeException("Got " + actualGroups + " groups for regex '" + tokenPattern.pattern() + "', expected " + expectedGroups);
    }

    if (!m.matches()) {
        return null;
    }
    String partial = m.group(1);
    if (actualGroups == 2) {
        partial = partial + m.group(2);
    }
    List<AnalyzedTokenReadings> tagged = tag(partial);
    boolean accepted = tagged != null && partialTagHasRequiredTag(tagged, posTagRegexp, negatePos);
    return accepted ? match : null;
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 23 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class SentenceWhitespaceRule method match.

/**
 * Reports sentences that directly follow a previous sentence whose last token is
 * not a single whitespace character.
 *
 * <p>For every sentence after the first, if the previous sentence's last token was
 * not a one-character whitespace token and the current sentence has more than one
 * token, a match is created covering the token at index 1. The suggested replacement
 * is that token prefixed with a space. The message is chosen by {@code getMessage},
 * based on whether the previous sentence's second-to-last token matched
 * {@code NUMBER_REGEX}.
 *
 * @param sentences the analyzed sentences, in text order
 * @return all matches found, possibly empty
 * @throws IOException declared by the overridden method
 */
@Override
public RuleMatch[] match(List<AnalyzedSentence> sentences) throws IOException {
    List<RuleMatch> found = new ArrayList<>();
    boolean pastFirstSentence = false;
    boolean prevEndsWithWhitespace = false;
    boolean prevEndsWithNumber = false;
    // Running character offset of the current sentence, advanced by each sentence's text length.
    int offset = 0;
    for (AnalyzedSentence current : sentences) {
        AnalyzedTokenReadings[] toks = current.getTokens();
        int count = toks.length;
        if (pastFirstSentence && !prevEndsWithWhitespace && count > 1) {
            String leading = toks[1].getToken();
            RuleMatch hit = new RuleMatch(this, offset, offset + leading.length(), getMessage(prevEndsWithNumber));
            hit.setSuggestedReplacement(" " + leading);
            found.add(hit);
        }
        pastFirstSentence = true;
        // The flags below keep their previous value when the sentence has too few tokens.
        if (count > 0) {
            String last = toks[count - 1].getToken();
            prevEndsWithWhitespace = last.length() == 1 && last.trim().isEmpty();
        }
        if (count > 1) {
            String beforeLast = toks[count - 2].getToken();
            prevEndsWithNumber = NUMBER_REGEX.matcher(beforeLast).matches();
        }
        offset += current.getText().length();
    }
    return toRuleMatchArray(found);
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 24 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class PortugueseAccentuationDataLoader method loadWords.

/**
 * Reads a semicolon-separated word list from the rules directory into a map.
 *
 * <p>Lines are trimmed; empty lines and lines starting with {@code #} are skipped.
 * Every other line must consist of exactly three {@code ;}-separated fields. The
 * first field becomes the map key; the second and third are passed, together with
 * {@code null}, to the {@code AnalyzedToken} constructor, and the resulting token is
 * wrapped in an {@code AnalyzedTokenReadings} with position 0. A later line with
 * the same key replaces the earlier entry.
 *
 * @param path resource path handed to the data broker's {@code getFromRulesDirAsStream}
 * @return the loaded entries keyed by the first field of each line
 * @throws RuntimeException if a data line does not have exactly three fields
 */
Map<String, AnalyzedTokenReadings> loadWords(String path) {
    final InputStream stream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
    final Map<String, AnalyzedTokenReadings> wordsByKey = new HashMap<>();
    try (Scanner reader = new Scanner(stream, FILE_ENCODING)) {
        while (reader.hasNextLine()) {
            final String entry = reader.nextLine().trim();
            // Blank lines and '#' comment lines carry no data.
            final boolean isData = !(entry.isEmpty() || entry.startsWith("#"));
            if (isData) {
                final String[] fields = entry.split(";");
                if (fields.length != 3) {
                    throw new RuntimeException("Format error in file " + path + ", line: " + entry + ", " + "expected 3 semicolon-separated parts, got " + fields.length);
                }
                final AnalyzedToken token = new AnalyzedToken(fields[1], fields[2], null);
                wordsByKey.put(fields[0], new AnalyzedTokenReadings(token, 0));
            }
        }
    }
    return wordsByKey;
}
Also used : Scanner(java.util.Scanner) AnalyzedToken(org.languagetool.AnalyzedToken) HashMap(java.util.HashMap) InputStream(java.io.InputStream) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 25 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class FrenchPartialPosTagFilter method tag.

/**
 * Tags a single token string and runs the result through the disambiguator.
 *
 * <p>The token is tagged on its own, the resulting readings are wrapped in an
 * {@code AnalyzedSentence}, and the tokens of the disambiguated sentence are returned.
 *
 * @param token the token text to tag
 * @return the tokens of the disambiguated sentence as a fixed-size list
 * @throws RuntimeException wrapping any {@code IOException} raised while tagging
 *         or disambiguating
 */
@Override
protected List<AnalyzedTokenReadings> tag(String token) {
    try {
        List<String> single = Collections.singletonList(token);
        List<AnalyzedTokenReadings> readings = tagger.tag(single);
        AnalyzedTokenReadings[] readingArray = readings.toArray(new AnalyzedTokenReadings[0]);
        AnalyzedSentence raw = new AnalyzedSentence(readingArray);
        AnalyzedSentence resolved = disambiguator.disambiguate(raw);
        return Arrays.asList(resolved.getTokens());
    } catch (IOException e) {
        throw new RuntimeException("Could not tag and disambiguate '" + token + "'", e);
    }
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) IOException(java.io.IOException) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)116 AnalyzedToken (org.languagetool.AnalyzedToken)48 ArrayList (java.util.ArrayList)47 AnalyzedSentence (org.languagetool.AnalyzedSentence)21 Test (org.junit.Test)16 RuleMatch (org.languagetool.rules.RuleMatch)14 Matcher (java.util.regex.Matcher)13 IOException (java.io.IOException)7 Nullable (org.jetbrains.annotations.Nullable)6 JLanguageTool (org.languagetool.JLanguageTool)6 Pattern (java.util.regex.Pattern)5 ChunkTag (org.languagetool.chunking.ChunkTag)5 English (org.languagetool.language.English)3 TaggedWord (org.languagetool.tagging.TaggedWord)3 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2