Search in sources:

Example 1 with ConfusionSet

use of org.languagetool.rules.ConfusionSet in project languagetool by languagetool-org.

Class AllConfusionRulesEvaluator, method main.

/**
 * Command-line entry point. Loads the confusion sets from {@code /en/confusion_sets.txt}, runs a
 * {@link ConfusionRuleEvaluator} on every set that contains exactly two words (each pair only once),
 * prints one summary line per pair and finally the average weighted f-measure over all evaluated pairs.
 *
 * @param args {@code <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|dir>}, optionally followed
 *             by a second input file; any other argument count prints the usage text and exits with status 1
 * @throws IOException if an I/O error occurs while loading the data or running the evaluation
 */
public static void main(String[] args) throws IOException {
    if (args.length < 3 || args.length > 4) {
        // NOTE(review): the usage text names ConfusionRuleEvaluator, not the class this method is listed under - confirm this is intended.
        System.err.println("Usage: " + ConfusionRuleEvaluator.class.getSimpleName() + " <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|dir>...");
        System.err.println("   <languageModelTopDir> is a directory with sub-directories '1grams', '2grams', and '3grams' with Lucene indexes");
        System.err.println("   <wikipediaXml|tatoebaFile|dir> either a Wikipedia XML dump, or a Tatoeba file or");
        System.err.println("                      a directory with example sentences (where <word>.txt contains only the sentences for <word>).");
        System.err.println("                      You can specify both a Wikipedia file and a Tatoeba file.");
        System.exit(1);
    }
    Language lang;
    if ("en".equals(args[0])) {
        lang = new ConfusionRuleEvaluator.EnglishLight();
    } else {
        lang = Languages.getLanguageForShortCode(args[0]);
    }
    LanguageModel languageModel = new LuceneLanguageModel(new File(args[1]));
    List<String> inputsFiles = new ArrayList<>();
    inputsFiles.add(args[2]);
    if (args.length >= 4) {
        inputsFiles.add(args[3]);
    }
    ConfusionRuleEvaluator eval = new ConfusionRuleEvaluator(lang, languageModel, false);
    eval.setVerboseMode(false);
    ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
    Map<String, List<ConfusionSet>> confusionSetMap;
    // NOTE(review): the resource path is hard-coded to English even when another <langCode> is given - confirm.
    // try-with-resources guarantees the stream is closed, also when loading fails.
    try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/confusion_sets.txt")) {
        confusionSetMap = confusionSetLoader.loadConfusionSet(inputStream);
    }
    // Pairs ("word1 word2") that were already evaluated, so a pair reachable via several map entries runs once.
    Set<String> done = new HashSet<>();
    int fMeasureCount = 0;
    float fMeasureTotal = 0;
    for (List<ConfusionSet> entry : confusionSetMap.values()) {
        for (ConfusionSet confusionSet : entry) {
            Set<ConfusionString> set = confusionSet.getSet();
            if (set.size() != 2) {
                System.out.println("Skipping confusion set with size != 2: " + confusionSet);
                continue;
            }
            Iterator<ConfusionString> iterator = set.iterator();
            ConfusionString set1 = iterator.next();
            ConfusionString set2 = iterator.next();
            String word1 = set1.getString();
            String word2 = set2.getString();
            String key = word1 + " " + word2;
            // add() returns false when the pair was evaluated before
            if (!done.add(key)) {
                continue;
            }
            Map<Long, ConfusionRuleEvaluator.EvalResult> evalResults = eval.run(inputsFiles, word1, word2, MAX_SENTENCES, Arrays.asList(confusionSet.getFactor()));
            // Only the first result is reported; a single factor was passed to run() above.
            ConfusionRuleEvaluator.EvalResult evalResult = evalResults.values().iterator().next();
            String summary1 = set1.getDescription() != null ? word1 + "|" + set1.getDescription() : word1;
            String summary2 = set2.getDescription() != null ? word2 + "|" + set2.getDescription() : word2;
            // Print the two words in lexicographic order of their summaries.
            String start;
            if (summary1.compareTo(summary2) < 0) {
                start = summary1 + "; " + summary2 + "; " + confusionSet.getFactor();
            } else {
                start = summary2 + "; " + summary1 + "; " + confusionSet.getFactor();
            }
            // Pad so that the "# ..." result column starts at column 82.
            String spaces = StringUtils.repeat(" ", 82 - start.length());
            System.out.println(start + spaces + "# " + evalResult.getSummary());
            double fMeasure = FMeasure.getWeightedFMeasure(evalResult.getPrecision(), evalResult.getRecall());
            fMeasureCount++;
            fMeasureTotal += fMeasure;
        }
    }
    System.out.println("Average f-measure: " + (fMeasureTotal / fMeasureCount));
}
Also used : ConfusionString(org.languagetool.rules.ConfusionString) ConfusionSet(org.languagetool.rules.ConfusionSet) Language(org.languagetool.Language) LanguageModel(org.languagetool.languagemodel.LanguageModel) LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel) ConfusionString(org.languagetool.rules.ConfusionString) LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel) ConfusionSetLoader(org.languagetool.rules.ConfusionSetLoader) InputStream(java.io.InputStream) File(java.io.File)

Example 2 with ConfusionSet

use of org.languagetool.rules.ConfusionSet in project languagetool by languagetool-org.

Class AutomaticConfusionRuleEvaluator, method runOnPair.

/**
 * Evaluates the word pair {@code part1}/{@code part2} and prints the best results, unless a confusion set
 * registered in {@code knownSets} under {@code part1} already contains both words. In that case the pair is
 * reported as ignored, the {@code ignored} counter is incremented and nothing is evaluated.
 *
 * @param evaluator the evaluator used to run the pair on the generated example sentences
 * @param line      the input line the pair was taken from; only used for the progress message
 * @param part1     first word of the pair, also the key used to look up known confusion sets
 * @param part2     second word of the pair
 * @throws IOException if an I/O error occurs while writing the example sentences or running the evaluation
 */
private void runOnPair(ConfusionRuleEvaluator evaluator, String line, String part1, String part2) throws IOException {
    // Direct map lookup instead of scanning every entry for a key equal to part1.
    List<ConfusionSet> knownForPart1 = knownSets.get(part1);
    if (knownForPart1 != null) {
        List<String> pair = Arrays.asList(part1, part2);
        for (ConfusionSet set : knownForPart1) {
            Set<String> stringSet = set.getSet().stream().map(l -> l.getString()).collect(Collectors.toSet());
            if (stringSet.containsAll(pair)) {
                System.out.println("Ignoring: " + part1 + "/" + part2 + ", in active confusion sets already");
                ignored++;
                return;
            }
        }
    }
    System.out.println("Working on: " + line);
    File sentencesFile = writeExampleSentencesToTempFile(new String[] { part1, part2 });
    List<String> input = Arrays.asList(sentencesFile.getAbsolutePath());
    Map<Long, ConfusionRuleEvaluator.EvalResult> results = evaluator.run(input, part1, part2, MAX_EXAMPLES, EVAL_FACTORS);
    Map<Long, ConfusionRuleEvaluator.EvalResult> bestResults = findBestFactor(results);
    if (!bestResults.isEmpty()) {
        for (Map.Entry<Long, ConfusionRuleEvaluator.EvalResult> entry : bestResults.entrySet()) {
            System.out.println("=> " + entry.getValue().getSummary());
        }
    } else {
        System.out.println("No good result found for " + part1 + "/" + part2);
    }
}
Also used : java.util(java.util) DirectoryReader(org.apache.lucene.index.DirectoryReader) Term(org.apache.lucene.index.Term) ConfusionSetLoader(org.languagetool.rules.ConfusionSetLoader) JLanguageTool(org.languagetool.JLanguageTool) Collectors(java.util.stream.Collectors) IOUtils(org.apache.commons.io.IOUtils) ConfusionSet(org.languagetool.rules.ConfusionSet) java.io(java.io) LanguageModel(org.languagetool.languagemodel.LanguageModel) org.apache.lucene.search(org.apache.lucene.search) Language(org.languagetool.Language) Languages(org.languagetool.Languages) FSDirectory(org.apache.lucene.store.FSDirectory) LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel) ConfusionSet(org.languagetool.rules.ConfusionSet)

Example 3 with ConfusionSet

use of org.languagetool.rules.ConfusionSet in project languagetool by languagetool-org.

Class ConfusionRuleEvaluator, method evaluate.

/**
 * Runs the confusion rule on every sentence for each of the given factors and updates the per-factor
 * counters in {@code evalValues} (true/false positives and negatives).
 * <p>
 * When {@code isCorrect} is {@code true} the sentences are checked unchanged and a rule match counts as a
 * false positive. Otherwise the first occurrence of {@code homophoneToken} in each sentence is replaced by
 * {@code token} before checking, and a missing rule match counts as a false negative.
 *
 * @param sentences      the sentences to evaluate
 * @param isCorrect      whether the sentences are checked as they are ({@code true}) or after replacing the
 *                       homophone with {@code token} ({@code false})
 * @param token          the word under evaluation
 * @param homophoneToken the word {@code token} can be confused with
 * @param evalFactors    the factors to evaluate; each must have an entry in {@code evalValues}
 * @throws IOException if an I/O error occurs while analyzing or matching a sentence
 */
@SuppressWarnings("ConstantConditions")
private void evaluate(List<Sentence> sentences, boolean isCorrect, String token, String homophoneToken, List<Long> evalFactors) throws IOException {
    println("======================");
    printf("Starting evaluation on " + sentences.size() + " sentences with %s/%s:\n", token, homophoneToken);
    JLanguageTool lt = new JLanguageTool(language);
    // Disable all rules of the language; only the confusion rule held in the 'rule' field is run below.
    List<Rule> allActiveRules = lt.getAllActiveRules();
    for (Rule activeRule : allActiveRules) {
        lt.disableRule(activeRule.getId());
    }
    String textToken = isCorrect ? token : homophoneToken;
    // Quote the token so that regex metacharacters in it are matched literally; compiled once for all sentences.
    java.util.regex.Pattern tokenPattern =
        java.util.regex.Pattern.compile("(?i)\\b" + java.util.regex.Pattern.quote(textToken) + "\\b");
    for (Sentence sentence : sentences) {
        String plainText = sentence.getText();
        // Keep the sentence capitalized when the token is its very first word.
        String replacement = plainText.startsWith(textToken) ? StringTools.uppercaseFirstChar(token) : token;
        // quoteReplacement() keeps '$' and '\' in the replacement from being read as group references.
        String replacedTokenSentence = isCorrect
            ? plainText
            : tokenPattern.matcher(plainText).replaceFirst(java.util.regex.Matcher.quoteReplacement(replacement));
        AnalyzedSentence analyzedSentence = lt.getAnalyzedSentence(replacedTokenSentence);
        // Sentence with the evaluated word highlighted; does not depend on the factor, so built once per sentence.
        String displayStr = tokenPattern.matcher(plainText)
            .replaceFirst(java.util.regex.Matcher.quoteReplacement("**" + replacement + "**"));
        for (Long factor : evalFactors) {
            rule.setConfusionSet(new ConfusionSet(factor, homophoneToken, token));
            RuleMatch[] matches = rule.match(analyzedSentence);
            boolean consideredCorrect = matches.length == 0;
            if (consideredCorrect && isCorrect) {
                evalValues.get(factor).trueNegatives++;
            } else if (!consideredCorrect && isCorrect) {
                evalValues.get(factor).falsePositives++;
                println("false positive with factor " + factor + ": " + displayStr);
            } else if (consideredCorrect && !isCorrect) {
                evalValues.get(factor).falseNegatives++;
            } else {
                evalValues.get(factor).truePositives++;
            }
        }
    }
}
Also used : ConfusionSet(org.languagetool.rules.ConfusionSet) AnalyzedSentence(org.languagetool.AnalyzedSentence) RuleMatch(org.languagetool.rules.RuleMatch) JLanguageTool(org.languagetool.JLanguageTool) ConfusionProbabilityRule(org.languagetool.rules.ngrams.ConfusionProbabilityRule) Rule(org.languagetool.rules.Rule) AnalyzedSentence(org.languagetool.AnalyzedSentence)

Example 4 with ConfusionSet

use of org.languagetool.rules.ConfusionSet in project languagetool by languagetool-org.

Class RuleCreator, method run.

/**
 * Loads the confusion sets from the resource {@code homophonePath}, initializes the occurrence maps from
 * {@code homophoneOccurrences} and prints rules for every confusion set key that has occurrence infos.
 * Only the first confusion set of each key is used. In {@code XML_MODE} the rules are wrapped in
 * {@code <rules>}, {@code <category>} and one {@code <rulegroup>} per key; progress, warnings and the final
 * statistics go to {@code System.err}.
 *
 * @param homophoneOccurrences file passed to {@code initMaps} to fill the occurrence maps
 * @param homophonePath        resource path of the confusion set file
 * @throws IOException if an I/O error occurs while loading the data or writing the rules
 */
private void run(File homophoneOccurrences, String homophonePath) throws IOException {
    ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
    Map<String, List<ConfusionSet>> confusionSetMap;
    // try-with-resources guarantees the stream is closed, also when loading fails.
    try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(homophonePath)) {
        confusionSetMap = confusionSetLoader.loadConfusionSet(inputStream);
    }
    initMaps(homophoneOccurrences);
    int groupCount = 0;
    if (XML_MODE) {
        System.out.println("<rules lang='en'>\n");
        System.out.println("<category name='Auto-generated rules'>\n");
    }
    for (Map.Entry<String, List<ConfusionSet>> entry : confusionSetMap.entrySet()) {
        String key = entry.getKey();
        List<ConfusionSet> confusionSets = entry.getValue();
        System.err.println(" === " + entry + " === ");
        if (confusionSets.size() > 1) {
            System.err.println("WARN: will use only first pair of " + confusionSets.size() + ": " + confusionSets.get(0));
        }
        List<OccurrenceInfo> infos = occurrenceInfos.get(key);
        if (infos == null) {
            System.err.println("Could not find occurrence infos for '" + key + "', skipping");
            continue;
        }
        Set<ConfusionString> variants = confusionSets.get(0).getSet();
        // Collect the words of the set other than the key itself. The set holds ConfusionString objects,
        // so the words are compared via getString(); removing the String key from the set would not match.
        Set<String> cleanSet = new HashSet<>();
        for (ConfusionString variant : variants) {
            if (!variant.getString().equals(key)) {
                cleanSet.add(variant.getString());
            }
        }
        String name = StringUtils.join(cleanSet, "/") + " -> " + key;
        if (XML_MODE) {
            System.out.println("<rulegroup id='R" + groupCount + "' name=\"" + StringTools.escapeXML(name) + "\">\n");
        }
        groupCount++;
        for (OccurrenceInfo occurrenceInfo : infos) {
            String[] parts = occurrenceInfo.ngram.split(" ");
            for (ConfusionString variant : variants) {
                if (variant.getString().equals(key)) {
                    continue;
                }
                printRule(occurrenceInfo, parts, variant.getString());
            }
        }
        if (XML_MODE) {
            System.out.println("</rulegroup>\n");
        }
    }
    if (XML_MODE) {
        System.out.println("</category>");
        System.out.println("</rules>");
    }
    System.err.println("Done. Wrote " + ruleCount + " rules.");
    System.err.println("Rules ignored because of different tokenization: " + tokenFilteredRules);
    System.err.println("Rules ignored because of error probability limit (" + minErrorProb + "): " + probFilteredRules);
}
Also used : ConfusionSet(org.languagetool.rules.ConfusionSet) ConfusionString(org.languagetool.rules.ConfusionString) ConfusionString(org.languagetool.rules.ConfusionString) ConfusionSetLoader(org.languagetool.rules.ConfusionSetLoader)

Aggregations

ConfusionSet (org.languagetool.rules.ConfusionSet)4 ConfusionSetLoader (org.languagetool.rules.ConfusionSetLoader)3 JLanguageTool (org.languagetool.JLanguageTool)2 Language (org.languagetool.Language)2 LanguageModel (org.languagetool.languagemodel.LanguageModel)2 LuceneLanguageModel (org.languagetool.languagemodel.LuceneLanguageModel)2 ConfusionString (org.languagetool.rules.ConfusionString)2 java.io (java.io)1 File (java.io.File)1 InputStream (java.io.InputStream)1 java.util (java.util)1 Collectors (java.util.stream.Collectors)1 IOUtils (org.apache.commons.io.IOUtils)1 DirectoryReader (org.apache.lucene.index.DirectoryReader)1 Term (org.apache.lucene.index.Term)1 org.apache.lucene.search (org.apache.lucene.search)1 FSDirectory (org.apache.lucene.store.FSDirectory)1 AnalyzedSentence (org.languagetool.AnalyzedSentence)1 Languages (org.languagetool.Languages)1 Rule (org.languagetool.rules.Rule)1