Use of org.languagetool.Language in project languagetool by languagetool-org.
Class AllConfusionRulesEvaluator, method main.
/**
 * Evaluates every two-element confusion set loaded from {@code /en/confusion_sets.txt}
 * against the given input files. Prints one summary line per evaluated pair and, at the
 * end, the average of the weighted f-measures.
 *
 * <p>Expected arguments: {@code <langCode> <languageModelTopDir> <input> [<input>]}.
 * With any other argument count the usage text is printed and the JVM exits with status 1.
 *
 * @param args command-line arguments as described above
 * @throws IOException declared for the I/O done here (closing the resource stream) and by the called components
 */
public static void main(String[] args) throws IOException {
  if (args.length < 3 || args.length > 4) {
    System.err.println("Usage: " + ConfusionRuleEvaluator.class.getSimpleName() + " <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|dir>...");
    System.err.println(" <languageModelTopDir> is a directory with sub-directories '1grams', '2grams', and '3grams' with Lucene indexes");
    System.err.println(" <wikipediaXml|tatoebaFile|dir> either a Wikipedia XML dump, or a Tatoeba file or");
    System.err.println(" a directory with example sentences (where <word>.txt contains only the sentences for <word>).");
    System.err.println(" You can specify both a Wikipedia file and a Tatoeba file.");
    System.exit(1);
  }
  // English is special-cased: it uses the EnglishLight variant instead of the registered language.
  Language lang;
  if ("en".equals(args[0])) {
    lang = new ConfusionRuleEvaluator.EnglishLight();
  } else {
    lang = Languages.getLanguageForShortCode(args[0]);
  }
  LanguageModel languageModel = new LuceneLanguageModel(new File(args[1]));
  List<String> inputsFiles = new ArrayList<>();
  inputsFiles.add(args[2]);
  if (args.length >= 4) {
    inputsFiles.add(args[3]);
  }
  ConfusionRuleEvaluator eval = new ConfusionRuleEvaluator(lang, languageModel, false);
  eval.setVerboseMode(false);
  ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
  // NOTE(review): the confusion set path is hard-coded to "/en/" even when another
  // <langCode> is given - confirm that this is intended.
  Map<String, List<ConfusionSet>> confusionSetMap;
  // try-with-resources makes sure the resource stream is closed once the sets are loaded
  // (previously the stream was never closed by this method).
  try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/confusion_sets.txt")) {
    confusionSetMap = confusionSetLoader.loadConfusionSet(inputStream);
  }
  // Keys ("word1 word2") of the pairs that have already been evaluated, so that a pair
  // showing up more than once in the map values is only evaluated one time.
  Set<String> done = new HashSet<>();
  int fMeasureCount = 0;
  float fMeasureTotal = 0;
  for (List<ConfusionSet> entry : confusionSetMap.values()) {
    for (ConfusionSet confusionSet : entry) {
      Set<ConfusionString> set = confusionSet.getSet();
      if (set.size() != 2) {
        System.out.println("Skipping confusion set with size != 2: " + confusionSet);
      } else {
        Iterator<ConfusionString> iterator = set.iterator();
        ConfusionString set1 = iterator.next();
        ConfusionString set2 = iterator.next();
        String word1 = set1.getString();
        String word2 = set2.getString();
        String key = word1 + " " + word2;
        if (!done.contains(key)) {
          // Only the set's own factor is evaluated, so the first (and only expected) result is used.
          Map<Long, ConfusionRuleEvaluator.EvalResult> evalResults = eval.run(inputsFiles, word1, word2, MAX_SENTENCES, Arrays.asList(confusionSet.getFactor()));
          ConfusionRuleEvaluator.EvalResult evalResult = evalResults.values().iterator().next();
          String summary1 = set1.getDescription() != null ? word1 + "|" + set1.getDescription() : word1;
          String summary2 = set2.getDescription() != null ? word2 + "|" + set2.getDescription() : word2;
          // Print the two entries in lexicographic order, followed by the factor.
          String start;
          if (summary1.compareTo(summary2) < 0) {
            start = summary1 + "; " + summary2 + "; " + confusionSet.getFactor();
          } else {
            start = summary2 + "; " + summary1 + "; " + confusionSet.getFactor();
          }
          // Pad to column 82 so that the "# ..." summaries line up.
          String spaces = StringUtils.repeat(" ", 82 - start.length());
          System.out.println(start + spaces + "# " + evalResult.getSummary());
          double fMeasure = FMeasure.getWeightedFMeasure(evalResult.getPrecision(), evalResult.getRecall());
          //System.out.println("f-measure: " + fMeasure);
          fMeasureCount++;
          fMeasureTotal += fMeasure;
        }
        done.add(key);
      }
    }
  }
  System.out.println("Average f-measure: " + (fMeasureTotal / fMeasureCount));
}
Use of org.languagetool.Language in project languagetool by languagetool-org.
Class AutomaticConfusionRuleEvaluator, method run.
/**
 * Evaluates the word pair found on each of the given lines.
 *
 * <p>A line containing '#' is reported and skipped. Every other line must split on
 * semicolons into exactly two parts. A {@link RuntimeException} raised while evaluating
 * a line is printed and processing continues with the next line.
 *
 * @param lines input lines, each expected to look like {@code word1; word2}
 * @param indexDir directory handed to the {@code LuceneLanguageModel}
 * @throws IOException if a line does not consist of exactly two semicolon-separated parts
 */
private void run(List<String> lines, File indexDir) throws IOException {
  Language lang = Languages.getLanguageForShortCode(LANGUAGE);
  LanguageModel languageModel = new LuceneLanguageModel(indexDir);
  ConfusionRuleEvaluator ruleEvaluator = new ConfusionRuleEvaluator(lang, languageModel, CASE_SENSITIVE);
  for (String inputLine : lines) {
    if (inputLine.contains("#")) {
      System.out.println("Ignoring: " + inputLine);
    } else {
      String[] items = inputLine.split(";\\s*");
      if (items.length != 2) {
        throw new IOException("Expected semicolon-separated input: " + inputLine);
      }
      try {
        // compare each item with its right-hand neighbour - maybe every item should be
        // compared with every other item?
        for (int left = 0; left + 1 < items.length; left++) {
          runOnPair(ruleEvaluator, inputLine, removeComment(items[left]), removeComment(items[left + 1]));
        }
      } catch (RuntimeException e) {
        e.printStackTrace();
      }
    }
  }
  System.out.println("Done. Ignored items because they are already known: " + ignored);
}
Use of org.languagetool.Language in project languagetool by languagetool-org.
Class CommonCrawlToNgram, method main.
/**
 * Command-line entry point. Expects exactly four arguments:
 * {@code <langCode> <input.xz> <ngramIndexDir> <simpleEvalFile>}. With any other
 * argument count the usage text is printed and the JVM exits with status 1.
 *
 * @param args command-line arguments as described above
 * @throws IOException if creating, running or closing the {@code CommonCrawlToNgram} instance fails
 */
public static void main(String[] args) throws IOException {
  boolean wrongArgCount = args.length != 4;
  if (wrongArgCount) {
    System.out.println("Usage: " + CommonCrawlToNgram.class + " <langCode> <input.xz> <ngramIndexDir> <simpleEvalFile>");
    System.out.println(" <simpleEvalFile> a plain text file with simple error markup");
    System.exit(1);
  }
  Language lang = Languages.getLanguageForShortCode(args[0]);
  // args[1] = input file, args[2] = output directory, args[3] = evaluation file
  try (CommonCrawlToNgram indexer = new CommonCrawlToNgram(lang, new File(args[1]), new File(args[2]), new File(args[3]))) {
    indexer.indexInputFile();
  }
}
Use of org.languagetool.Language in project languagetool by languagetool-org.
Class ConfusionRuleEvaluator, method main.
/**
 * Command-line entry point: runs the evaluation for one token and its homophone
 * and prints the elapsed wall-clock time in milliseconds.
 *
 * <p>Expected arguments:
 * {@code <token> <homophoneToken> <langCode> <languageModelTopDir> <input> [<input>]}.
 * With any other argument count the usage text is printed and the JVM exits with status 1.
 *
 * @param args command-line arguments as described above
 * @throws IOException declared by the components called from here
 */
public static void main(String[] args) throws IOException {
  if (args.length < 5 || args.length > 6) {
    String programName = ConfusionRuleEvaluator.class.getSimpleName();
    System.err.println("Usage: " + programName + " <token> <homophoneToken> <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|plainTextFile|dir>...");
    System.err.println(" <languageModelTopDir> is a directory with sub-directories like 'en' which then again contain '1grams',");
    System.err.println(" '2grams', and '3grams' sub directories with Lucene indexes");
    System.err.println(" See http://wiki.languagetool.org/finding-errors-using-n-gram-data");
    System.err.println(" <wikipediaXml|tatoebaFile|plainTextFile|dir> either a Wikipedia XML dump, or a Tatoeba file, or");
    System.err.println(" a plain text file with one sentence per line, or a directory with");
    System.err.println(" example sentences (where <word>.txt contains only the sentences for <word>).");
    System.err.println(" You can specify both a Wikipedia file and a Tatoeba file.");
    System.exit(1);
  }
  long begin = System.currentTimeMillis();
  String word = args[0];
  String homophone = args[1];
  String code = args[2];
  // English is special-cased: it uses the EnglishLight variant instead of the registered language.
  Language lang = "en".equals(code) ? new EnglishLight() : Languages.getLanguageForShortCode(code);
  // The index is read from the language-specific sub-directory of the top directory.
  File modelDir = new File(args[3], lang.getShortCode());
  LanguageModel languageModel = new LuceneLanguageModel(modelDir);
  //LanguageModel languageModel = new BerkeleyRawLanguageModel(new File("/media/Data/berkeleylm/google_books_binaries/ger.blm.gz"));
  //LanguageModel languageModel = new BerkeleyLanguageModel(new File("/media/Data/berkeleylm/google_books_binaries/ger.blm.gz"));
  List<String> inputsFiles = new ArrayList<>();
  inputsFiles.add(args[4]);
  if (args.length > 5) {
    // optional second input file
    inputsFiles.add(args[5]);
  }
  ConfusionRuleEvaluator evaluator = new ConfusionRuleEvaluator(lang, languageModel, CASE_SENSITIVE);
  evaluator.run(inputsFiles, word, homophone, MAX_SENTENCES, EVAL_FACTORS);
  long elapsedMillis = System.currentTimeMillis() - begin;
  System.out.println("\nTime: " + elapsedMillis + "ms");
}
Use of org.languagetool.Language in project languagetool by languagetool-org.
Class SimpleRuleCounter, method run.
/**
 * Calls {@code countForLanguage} with the active rules of every non-variant language,
 * visiting the languages ordered by name. The given list itself is not modified;
 * a copy is sorted.
 *
 * @param languages the languages to process
 */
private void run(List<Language> languages) {
  List<Language> byName = new ArrayList<>(languages);
  byName.sort((first, second) -> first.getName().compareTo(second.getName()));
  for (Language lang : byName) {
    // variants are skipped
    if (!lang.isVariant()) {
      List<Rule> activeRules = new JLanguageTool(lang).getAllActiveRules();
      countForLanguage(activeRules, lang);
    }
  }
}
Aggregations