Search in sources :

Example 6 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class AllConfusionRulesEvaluator method main.

/**
 * Runs the confusion rule evaluation for every two-word confusion set loaded from
 * {@code /en/confusion_sets.txt}. Prints one summary line per word pair and finally the
 * average weighted f-measure over all evaluated pairs.
 *
 * @param args {@code <langCode> <languageModelTopDir> <input> [<input2>]}; exactly three or
 *             four arguments are accepted, otherwise usage is printed and the JVM exits with 1
 * @throws IOException propagated from the language model, the confusion set loading, or the
 *                     evaluation run
 */
public static void main(String[] args) throws IOException {
    if (args.length < 3 || args.length > 4) {
        // Report the name of this class; the usage text previously named ConfusionRuleEvaluator.
        System.err.println("Usage: " + AllConfusionRulesEvaluator.class.getSimpleName() + " <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|dir>...");
        System.err.println("   <languageModelTopDir> is a directory with sub-directories '1grams', '2grams', and '3grams' with Lucene indexes");
        System.err.println("   <wikipediaXml|tatoebaFile|dir> either a Wikipedia XML dump, or a Tatoeba file or");
        System.err.println("                      a directory with example sentences (where <word>.txt contains only the sentences for <word>).");
        System.err.println("                      You can specify both a Wikipedia file and a Tatoeba file.");
        System.exit(1);
    }
    Language lang;
    if ("en".equals(args[0])) {
        lang = new ConfusionRuleEvaluator.EnglishLight();
    } else {
        lang = Languages.getLanguageForShortCode(args[0]);
    }
    LanguageModel languageModel = new LuceneLanguageModel(new File(args[1]));
    List<String> inputsFiles = new ArrayList<>();
    inputsFiles.add(args[2]);
    if (args.length >= 4) {
        inputsFiles.add(args[3]);
    }
    ConfusionRuleEvaluator eval = new ConfusionRuleEvaluator(lang, languageModel, false);
    eval.setVerboseMode(false);
    ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
    // NOTE(review): the confusion set path is hard-coded to English even though a language
    // code is accepted on the command line - confirm whether this is intended.
    Map<String, List<ConfusionSet>> confusionSetMap;
    // Close the resource stream once the sets are loaded; it was previously left open.
    try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/confusion_sets.txt")) {
        confusionSetMap = confusionSetLoader.loadConfusionSet(inputStream);
    }
    Set<String> done = new HashSet<>();
    int fMeasureCount = 0;
    float fMeasureTotal = 0;
    for (List<ConfusionSet> entry : confusionSetMap.values()) {
        for (ConfusionSet confusionSet : entry) {
            Set<ConfusionString> set = confusionSet.getSet();
            if (set.size() != 2) {
                System.out.println("Skipping confusion set with size != 2: " + confusionSet);
                continue;
            }
            Iterator<ConfusionString> iterator = set.iterator();
            ConfusionString set1 = iterator.next();
            ConfusionString set2 = iterator.next();
            String word1 = set1.getString();
            String word2 = set2.getString();
            // Set.add returns false when the pair was already recorded, so each pair runs once.
            if (!done.add(word1 + " " + word2)) {
                continue;
            }
            Map<Long, ConfusionRuleEvaluator.EvalResult> evalResults = eval.run(inputsFiles, word1, word2, MAX_SENTENCES, Arrays.asList(confusionSet.getFactor()));
            // Only one factor is passed in, so only the first result is looked at.
            ConfusionRuleEvaluator.EvalResult evalResult = evalResults.values().iterator().next();
            String summary1 = set1.getDescription() != null ? word1 + "|" + set1.getDescription() : word1;
            String summary2 = set2.getDescription() != null ? word2 + "|" + set2.getDescription() : word2;
            // Print the two summaries in lexicographic order, followed by the factor.
            String start;
            if (summary1.compareTo(summary2) < 0) {
                start = summary1 + "; " + summary2 + "; " + confusionSet.getFactor();
            } else {
                start = summary2 + "; " + summary1 + "; " + confusionSet.getFactor();
            }
            // Pad so that the '#' column starts at offset 82 when the prefix is short enough.
            String spaces = StringUtils.repeat(" ", 82 - start.length());
            System.out.println(start + spaces + "# " + evalResult.getSummary());
            double fMeasure = FMeasure.getWeightedFMeasure(evalResult.getPrecision(), evalResult.getRecall());
            fMeasureCount++;
            fMeasureTotal += fMeasure;
        }
    }
    System.out.println("Average f-measure: " + (fMeasureTotal / fMeasureCount));
}
Also used : ConfusionString(org.languagetool.rules.ConfusionString) ConfusionSet(org.languagetool.rules.ConfusionSet) Language(org.languagetool.Language) LanguageModel(org.languagetool.languagemodel.LanguageModel) LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel) ConfusionSetLoader(org.languagetool.rules.ConfusionSetLoader) InputStream(java.io.InputStream) File(java.io.File)

Example 7 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class AutomaticConfusionRuleEvaluator method run.

/**
 * Evaluates the word pair found on each input line against the language model in
 * {@code indexDir}. Lines containing {@code #} are skipped; every other line must consist of
 * exactly two semicolon-separated items.
 *
 * @param lines input lines, one candidate pair per line
 * @param indexDir directory handed to {@link LuceneLanguageModel}
 * @throws IOException if a line does not split into exactly two parts
 */
private void run(List<String> lines, File indexDir) throws IOException {
    ConfusionRuleEvaluator pairEvaluator = new ConfusionRuleEvaluator(
            Languages.getLanguageForShortCode(LANGUAGE), new LuceneLanguageModel(indexDir), CASE_SENSITIVE);
    for (String line : lines) {
        if (line.contains("#")) {
            System.out.println("Ignoring: " + line);
            continue;
        }
        String[] items = line.split(";\\s*");
        if (items.length != 2) {
            throw new IOException("Expected semicolon-separated input: " + line);
        }
        try {
            // compare each item with its direct successor - maybe every item should be
            // compared with every other item?
            for (int next = 1; next < items.length; next++) {
                runOnPair(pairEvaluator, line, removeComment(items[next - 1]), removeComment(items[next]));
            }
        } catch (RuntimeException e) {
            // a failing pair is reported but does not stop the remaining lines
            e.printStackTrace();
        }
    }
    System.out.println("Done. Ignored items because they are already known: " + ignored);
}
Also used : Language(org.languagetool.Language) LanguageModel(org.languagetool.languagemodel.LanguageModel) LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel)

Example 8 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class CommonCrawlToNgram method main.

/**
 * Command line entry point: builds a {@code CommonCrawlToNgram} from the four arguments and
 * indexes the input file.
 *
 * @param args {@code <langCode> <input.xz> <ngramIndexDir> <simpleEvalFile>}
 * @throws IOException propagated from creating, running, or closing the indexer
 */
public static void main(String[] args) throws IOException {
    boolean wrongArgumentCount = args.length != 4;
    if (wrongArgumentCount) {
        System.out.println("Usage: " + CommonCrawlToNgram.class + " <langCode> <input.xz> <ngramIndexDir> <simpleEvalFile>");
        System.out.println(" <simpleEvalFile> a plain text file with simple error markup");
        System.exit(1);
    }
    Language lang = Languages.getLanguageForShortCode(args[0]);
    // arguments 1..3: input file, n-gram output directory, evaluation file
    try (CommonCrawlToNgram indexer =
            new CommonCrawlToNgram(lang, new File(args[1]), new File(args[2]), new File(args[3]))) {
        indexer.indexInputFile();
    }
}
Also used : Language(org.languagetool.Language)

Example 9 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class ConfusionRuleEvaluator method main.

/**
 * Command line entry point: evaluates a single token/homophone pair against one or two input
 * files and prints the elapsed time in milliseconds.
 *
 * @param args {@code <token> <homophoneToken> <langCode> <languageModelTopDir> <input> [<input2>]}
 * @throws IOException propagated from the language model or the evaluation run
 */
public static void main(String[] args) throws IOException {
    boolean argCountOk = args.length >= 5 && args.length <= 6;
    if (!argCountOk) {
        System.err.println("Usage: " + ConfusionRuleEvaluator.class.getSimpleName() + " <token> <homophoneToken> <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|plainTextFile|dir>...");
        System.err.println("   <languageModelTopDir> is a directory with sub-directories like 'en' which then again contain '1grams',");
        System.err.println("                      '2grams', and '3grams' sub directories with Lucene indexes");
        System.err.println("                      See http://wiki.languagetool.org/finding-errors-using-n-gram-data");
        System.err.println("   <wikipediaXml|tatoebaFile|plainTextFile|dir> either a Wikipedia XML dump, or a Tatoeba file, or");
        System.err.println("                      a plain text file with one sentence per line, or a directory with");
        System.err.println("                      example sentences (where <word>.txt contains only the sentences for <word>).");
        System.err.println("                      You can specify both a Wikipedia file and a Tatoeba file.");
        System.exit(1);
    }
    long begin = System.currentTimeMillis();
    String langCode = args[2];
    // English uses the EnglishLight variant; every other code is resolved via Languages
    Language lang = "en".equals(langCode) ? new EnglishLight() : Languages.getLanguageForShortCode(langCode);
    File modelDir = new File(args[3], lang.getShortCode());
    LanguageModel languageModel = new LuceneLanguageModel(modelDir);
    //LanguageModel languageModel = new BerkeleyRawLanguageModel(new File("/media/Data/berkeleylm/google_books_binaries/ger.blm.gz"));
    //LanguageModel languageModel = new BerkeleyLanguageModel(new File("/media/Data/berkeleylm/google_books_binaries/ger.blm.gz"));
    // everything from the fifth argument on is an input file (one or two entries)
    List<String> inputsFiles = new ArrayList<>();
    for (int i = 4; i < args.length; i++) {
        inputsFiles.add(args[i]);
    }
    ConfusionRuleEvaluator evaluator = new ConfusionRuleEvaluator(lang, languageModel, CASE_SENSITIVE);
    evaluator.run(inputsFiles, args[0], args[1], MAX_SENTENCES, EVAL_FACTORS);
    long elapsed = System.currentTimeMillis() - begin;
    System.out.println("\nTime: " + elapsed + "ms");
}
Also used : Language(org.languagetool.Language) LanguageModel(org.languagetool.languagemodel.LanguageModel) LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel) File(java.io.File)

Example 10 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class SimpleRuleCounter method run.

/**
 * Calls {@code countForLanguage} with all active rules of every non-variant language,
 * processing the languages ordered by name.
 *
 * @param languages languages to process; the given list itself is not reordered
 */
private void run(List<Language> languages) {
    // sort a copy so the caller's list keeps its order
    List<Language> byName = new ArrayList<>(languages);
    byName.sort((a, b) -> a.getName().compareTo(b.getName()));
    for (Language candidate : byName) {
        if (!candidate.isVariant()) {
            JLanguageTool languageTool = new JLanguageTool(candidate);
            countForLanguage(languageTool.getAllActiveRules(), candidate);
        }
    }
}
Also used : Language(org.languagetool.Language) JLanguageTool(org.languagetool.JLanguageTool) ArrayList(java.util.ArrayList) Rule(org.languagetool.rules.Rule) PatternRule(org.languagetool.rules.patterns.PatternRule)

Aggregations

Language (org.languagetool.Language): 84; Test (org.junit.Test): 23; File (java.io.File): 15; ArrayList (java.util.ArrayList): 12; JLanguageTool (org.languagetool.JLanguageTool): 11; Rule (org.languagetool.rules.Rule): 11; RuleMatch (org.languagetool.rules.RuleMatch): 10; IOException (java.io.IOException): 7; Ignore (org.junit.Ignore): 6; StringTools.readerToString (org.languagetool.tools.StringTools.readerToString): 5; InputStream (java.io.InputStream): 4; English (org.languagetool.language.English): 4; BitextRule (org.languagetool.rules.bitext.BitextRule): 4; URL (java.net.URL): 3; HashSet (java.util.HashSet): 3; MultiThreadedJLanguageTool (org.languagetool.MultiThreadedJLanguageTool): 3; AmericanEnglish (org.languagetool.language.AmericanEnglish): 3; LanguageModel (org.languagetool.languagemodel.LanguageModel): 3; LuceneLanguageModel (org.languagetool.languagemodel.LuceneLanguageModel): 3; BufferedReader (java.io.BufferedReader): 2