Use of org.languagetool.rules.ConfusionSetLoader in project languagetool by languagetool-org.
Class AllConfusionRulesEvaluator, method main.
/**
 * Evaluates all two-word confusion sets loaded from {@code /en/confusion_sets.txt}
 * against the given input files and prints one summary line per pair, followed by
 * the average weighted f-measure over all evaluated pairs.
 *
 * <p>Prints a usage message and exits with status 1 unless 3 or 4 arguments are given.
 *
 * @param args {@code <langCode> <languageModelTopDir> <input> [<input>]}
 * @throws IOException if one of the underlying I/O operations fails
 */
public static void main(String[] args) throws IOException {
  if (args.length < 3 || args.length > 4) {
    // NOTE(review): the usage text prints ConfusionRuleEvaluator's simple name although
    // this method is listed under AllConfusionRulesEvaluator - confirm that is intended.
    System.err.println("Usage: " + ConfusionRuleEvaluator.class.getSimpleName() + " <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|dir>...");
    System.err.println(" <languageModelTopDir> is a directory with sub-directories '1grams', '2grams', and '3grams' with Lucene indexes");
    System.err.println(" <wikipediaXml|tatoebaFile|dir> either a Wikipedia XML dump, or a Tatoeba file or");
    System.err.println(" a directory with example sentences (where <word>.txt contains only the sentences for <word>).");
    System.err.println(" You can specify both a Wikipedia file and a Tatoeba file.");
    System.exit(1);
  }
  Language lang;
  if ("en".equals(args[0])) {
    lang = new ConfusionRuleEvaluator.EnglishLight();
  } else {
    lang = Languages.getLanguageForShortCode(args[0]);
  }
  LanguageModel languageModel = new LuceneLanguageModel(new File(args[1]));
  List<String> inputsFiles = new ArrayList<>();
  inputsFiles.add(args[2]);
  if (args.length >= 4) {
    inputsFiles.add(args[3]);
  }
  ConfusionRuleEvaluator eval = new ConfusionRuleEvaluator(lang, languageModel, false);
  eval.setVerboseMode(false);
  ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
  // NOTE(review): the resource path is fixed to English regardless of <langCode> - confirm.
  Map<String, List<ConfusionSet>> confusionSetMap;
  // Close the resource stream as soon as the confusion sets have been loaded.
  try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/confusion_sets.txt")) {
    confusionSetMap = confusionSetLoader.loadConfusionSet(inputStream);
  }
  // Keys of the pairs already evaluated, so that a pair is not evaluated twice.
  Set<String> done = new HashSet<>();
  int fMeasureCount = 0;
  float fMeasureTotal = 0;
  for (List<ConfusionSet> entry : confusionSetMap.values()) {
    for (ConfusionSet confusionSet : entry) {
      Set<ConfusionString> set = confusionSet.getSet();
      if (set.size() != 2) {
        System.out.println("Skipping confusion set with size != 2: " + confusionSet);
      } else {
        Iterator<ConfusionString> iterator = set.iterator();
        ConfusionString set1 = iterator.next();
        ConfusionString set2 = iterator.next();
        String word1 = set1.getString();
        String word2 = set2.getString();
        String key = word1 + " " + word2;
        if (!done.contains(key)) {
          Map<Long, ConfusionRuleEvaluator.EvalResult> evalResults = eval.run(inputsFiles, word1, word2, MAX_SENTENCES, Arrays.asList(confusionSet.getFactor()));
          // Only one factor was passed in, so only the first result is used.
          ConfusionRuleEvaluator.EvalResult evalResult = evalResults.values().iterator().next();
          String summary1 = set1.getDescription() != null ? word1 + "|" + set1.getDescription() : word1;
          String summary2 = set2.getDescription() != null ? word2 + "|" + set2.getDescription() : word2;
          // Print the two summaries in lexicographic order, followed by the factor.
          String start;
          if (summary1.compareTo(summary2) < 0) {
            start = summary1 + "; " + summary2 + "; " + confusionSet.getFactor();
          } else {
            start = summary2 + "; " + summary1 + "; " + confusionSet.getFactor();
          }
          // Pad so that the '#' comment starts at column 82 for short lines.
          String spaces = StringUtils.repeat(" ", 82 - start.length());
          System.out.println(start + spaces + "# " + evalResult.getSummary());
          double fMeasure = FMeasure.getWeightedFMeasure(evalResult.getPrecision(), evalResult.getRecall());
          fMeasureCount++;
          fMeasureTotal += fMeasure;
        }
        done.add(key);
      }
    }
  }
  System.out.println("Average f-measure: " + (fMeasureTotal / fMeasureCount));
}
Use of org.languagetool.rules.ConfusionSetLoader in project languagetool by languagetool-org.
Class HomophoneOccurrenceDumper, method run.
/**
 * Loads the confusion sets from the given resource path and passes the set of
 * their map keys to {@code dumpOccurrences}.
 *
 * @param confusionSetPath resource path handed to the data broker
 * @throws IOException if one of the underlying I/O operations fails
 */
private void run(String confusionSetPath) throws IOException {
  System.err.println("Loading confusion sets from " + confusionSetPath + ", minimum occurrence: " + MIN_COUNT);
  ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
  Map<String, List<ConfusionSet>> map;
  // Close the resource stream as soon as the confusion sets have been loaded.
  try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(confusionSetPath)) {
    map = confusionSetLoader.loadConfusionSet(inputStream);
  }
  Set<String> confusionTerms = map.keySet();
  dumpOccurrences(confusionTerms);
}
Use of org.languagetool.rules.ConfusionSetLoader in project languagetool by languagetool-org.
Class RuleCreator, method run.
/**
 * Loads the confusion sets from {@code homophonePath}, initializes the occurrence
 * maps from {@code homophoneOccurrences} and prints rules for every map entry that
 * has occurrence infos. For each entry only the first confusion set is used. In
 * XML mode the rules are wrapped in rulegroup/category/rules elements on stdout;
 * progress and statistics go to stderr.
 *
 * @param homophoneOccurrences file passed on to {@code initMaps}
 * @param homophonePath resource path of the confusion set file
 * @throws IOException if one of the underlying I/O operations fails
 */
private void run(File homophoneOccurrences, String homophonePath) throws IOException {
  ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
  Map<String, List<ConfusionSet>> confusionSetMap;
  // Close the resource stream as soon as the confusion sets have been loaded.
  try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(homophonePath)) {
    confusionSetMap = confusionSetLoader.loadConfusionSet(inputStream);
  }
  initMaps(homophoneOccurrences);
  int groupCount = 0;
  if (XML_MODE) {
    System.out.println("<rules lang='en'>\n");
    System.out.println("<category name='Auto-generated rules'>\n");
  }
  for (Map.Entry<String, List<ConfusionSet>> entry : confusionSetMap.entrySet()) {
    String word = entry.getKey();
    List<ConfusionSet> confusionSets = entry.getValue();
    System.err.println(" === " + entry + " === ");
    if (confusionSets.size() > 1) {
      System.err.println("WARN: will use only first pair of " + confusionSets.size() + ": " + confusionSets.get(0));
    }
    List<OccurrenceInfo> infos = occurrenceInfos.get(word);
    if (infos == null) {
      System.err.println("Could not find occurrence infos for '" + word + "', skipping");
      continue;
    }
    Set<ConfusionString> variants = confusionSets.get(0).getSet();
    // The set holds ConfusionString objects, so the key word has to be filtered out
    // by its string value; removing the plain String key from the set is a no-op.
    Set<ConfusionString> cleanSet = new HashSet<>();
    for (ConfusionString variant : variants) {
      if (!variant.getString().equals(word)) {
        cleanSet.add(variant);
      }
    }
    String name = StringUtils.join(cleanSet, "/") + " -> " + word;
    if (XML_MODE) {
      System.out.println("<rulegroup id='R" + groupCount + "' name=\"" + StringTools.escapeXML(name) + "\">\n");
    }
    groupCount++;
    for (OccurrenceInfo occurrenceInfo : infos) {
      String[] parts = occurrenceInfo.ngram.split(" ");
      for (ConfusionString variant : variants) {
        // No rule is printed for the key word itself.
        if (variant.getString().equals(word)) {
          continue;
        }
        printRule(occurrenceInfo, parts, variant.getString());
      }
    }
    if (XML_MODE) {
      System.out.println("</rulegroup>\n");
    }
  }
  if (XML_MODE) {
    System.out.println("</category>");
    System.out.println("</rules>");
  }
  System.err.println("Done. Wrote " + ruleCount + " rules.");
  System.err.println("Rules ignored because of different tokenization: " + tokenFilteredRules);
  System.err.println("Rules ignored because of error probability limit (" + minErrorProb + "): " + probFilteredRules);
}
Use of org.languagetool.rules.ConfusionSetLoader in project languagetool by languagetool-org.
Class RuleOverview, method countConfusionPairs.
/**
 * Returns half the number of entries in the map loaded from the language's
 * {@code /<shortCode>/confusion_sets.txt} resource, or 0 if that resource
 * does not exist.
 *
 * @param lang language whose short code selects the resource directory
 * @return half the size of the loaded confusion set map, or 0
 * @throws RuntimeException wrapping any {@link IOException} raised while reading
 */
private int countConfusionPairs(Language lang) {
  final String resourcePath = "/" + lang.getShortCode() + "/confusion_sets.txt";
  final ResourceDataBroker broker = JLanguageTool.getDataBroker();
  // Nothing to count when the language ships no confusion set file.
  if (!broker.resourceExists(resourcePath)) {
    return 0;
  }
  try (InputStream stream = broker.getFromResourceDirAsStream(resourcePath)) {
    int entryCount = new ConfusionSetLoader().loadConfusionSet(stream).size();
    return entryCount / 2;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Use of org.languagetool.rules.ConfusionSetLoader in project languagetool by languagetool-org.
Class NGramUrlGenerator, method mainDownloadSome.
/**
 * Prints the 2-gram download URLs needed for the terms in {@code /en/homophones.txt}.
 * Each term contributes its first two characters in lower case (or its single
 * character followed by {@code _}) as the file suffix; the distinct suffixes are
 * printed in sorted order on stdout and their count is reported on stderr.
 *
 * @param args not used
 * @throws IOException if one of the underlying I/O operations fails
 */
public static void mainDownloadSome(String[] args) throws IOException {
  ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
  Map<String, List<ConfusionSet>> map;
  // Close the resource stream as soon as the confusion sets have been loaded.
  try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/homophones.txt")) {
    map = confusionSetLoader.loadConfusionSet(inputStream);
  }
  String url = "http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-2gram-20120701-<XX>.gz";
  Set<String> nameSet = new HashSet<>();
  for (String s : map.keySet()) {
    // Locale.ROOT keeps the URL suffix independent of the JVM's default locale.
    if (s.length() < 2) {
      nameSet.add(s.substring(0, 1).toLowerCase(java.util.Locale.ROOT) + "_");
    } else {
      nameSet.add(s.substring(0, 2).toLowerCase(java.util.Locale.ROOT));
    }
  }
  List<String> nameList = new ArrayList<>(nameSet);
  Collections.sort(nameList);
  for (String name : nameList) {
    System.out.println(url.replace("<XX>", name));
  }
  System.err.println("Number of files: " + nameList.size());
}
Aggregations