use of org.languagetool.languagemodel.LuceneLanguageModel in project languagetool by languagetool-org.
the class ConfusionSetOccurrenceLookup method main.
/**
 * Prints occurrence statistics for every confusion set listed in a file.
 *
 * <p>Each input line is split on {@code ;} (plus any following whitespace) into words.
 * For every word the count is looked up in the ngram index; one output line is printed
 * per input line containing the summed count, the original line, the {@code word:count}
 * pairs and the ratio of the largest to the smallest count.
 *
 * <p>The ratio is computed in floating point, so a smallest count of zero yields
 * {@code Infinity} (or {@code NaN} when all counts are zero) rather than an exception.
 *
 * @param args {@code args[0]} is the confusion set file (read as UTF-8),
 *             {@code args[1]} is the ngram data directory
 * @throws IOException if the confusion set file cannot be opened
 */
public static void main(String[] args) throws IOException {
  if (args.length != 2) {
    System.out.println("Usage: " + ConfusionSetOccurrenceLookup.class.getName() + " <confusion-file> <ngram-data-dir>");
    System.exit(1);
  }
  // Read with an explicit charset so the result does not depend on the platform default.
  try (Scanner sc = new Scanner(new File(args[0]), "UTF-8");
       LuceneLanguageModel lm = new LuceneLanguageModel(new File(args[1]))) {
    while (sc.hasNextLine()) {
      String line = sc.nextLine();
      String[] words = line.split(";\\s*");
      long total = 0;
      List<Long> counts = new ArrayList<>();
      StringBuilder sb = new StringBuilder();
      for (String word : words) {
        long count = lm.getCount(word);
        total += count;
        sb.append(word).append(":").append(count).append(" ");
        counts.add(count);
      }
      float factor = (float) Collections.max(counts) / Collections.min(counts);
      // File content is passed as arguments, never as part of the format string:
      // a '%' in the input would otherwise be interpreted as a format specifier.
      System.out.printf(Locale.ENGLISH, "%d %s %s factor:%.1f\n", total, line, sb.toString().trim(), factor);
    }
  }
}
use of org.languagetool.languagemodel.LuceneLanguageModel in project languagetool by languagetool-org.
the class GermanAuxVerbGuesser method main.
/**
 * Reads a tab-separated list of participle/lemma pairs and, for each pair, prints the
 * counts returned by {@code countHaben} and {@code countSein} for the given ngram index.
 * After all lines are processed, a summary is printed: how many lemmas had at least one
 * non-zero count, how many had none, and how many had exactly one positive count while
 * the other one was zero.
 *
 * @param args {@code args[0]} is the ngram index directory, {@code args[1]} is the lemma file
 * @throws IOException if the lemma file cannot be read
 */
public static void main(String[] args) throws IOException {
  if (args.length != 2) {
    System.out.println("Usage: " + GermanAuxVerbGuesser.class.getName() + " <ngramDataIndex> <lemmaFile>");
    System.out.println(" <lemmaFile> is a text file with 'participle2 \\t lemma' per line, e.g. 'getrunken \t trinken'");
    System.exit(1);
  }
  String indexPath = args[0];
  List<String> entries = Files.readAllLines(Paths.get(args[1]));
  int withHits = 0;
  int withoutHits = 0;
  int singleAux = 0;
  try (LuceneLanguageModel model = new LuceneLanguageModel(new File(indexPath))) {
    for (String entry : entries) {
      // column 0: participle, column 1: lemma
      String[] columns = entry.split("\t");
      String participle = columns[0];
      String lemma = columns[1];
      long habenCount = countHaben(model, participle, lemma);
      long seinCount = countSein(model, participle, lemma);
      System.out.println(lemma + ": haben: " + habenCount + ", sein: " + seinCount);
      if (habenCount != 0 || seinCount != 0) {
        withHits++;
        boolean onlySein = habenCount == 0 && seinCount > 0;
        boolean onlyHaben = habenCount > 0 && seinCount == 0;
        if (onlySein || onlyHaben) {
          singleAux++;
        }
      } else {
        withoutHits++;
      }
    }
  }
  System.out.println("match: " + withHits);
  System.out.println("noMatch: " + withoutHits);
  System.out.println("----");
  System.out.println("unambiguous: " + singleAux);
}
use of org.languagetool.languagemodel.LuceneLanguageModel in project languagetool by languagetool-org.
the class GermanAuxVerbGuesser2 method main.
/**
 * Reads a tab-separated list of participle/lemma pairs and prints, per lemma, a ratio of
 * the counts obtained via {@code count} for "mir" + "dir" to those for "mich" + "dich",
 * followed by the lemma and the raw "mir" and "mich" counts.
 *
 * <p>The ratio is a float division, so a zero denominator is printed as
 * {@code Infinity} or {@code NaN} instead of raising an exception.
 *
 * @param args {@code args[0]} is the ngram index directory, {@code args[1]} is the lemma file
 * @throws IOException if the lemma file cannot be read
 */
public static void main(String[] args) throws IOException {
  if (args.length != 2) {
    System.out.println("Usage: " + GermanAuxVerbGuesser2.class.getName() + " <ngramDataIndex> <lemmaFile>");
    System.out.println(" <lemmaFile> is a text file with 'participle2 \\t lemma' per line, e.g. 'getrunken \t trinken'");
    System.exit(1);
  }
  String indexPath = args[0];
  List<String> entries = Files.readAllLines(Paths.get(args[1]));
  System.out.println("# factor lemma Dativ/mir Akkusativ/mich");
  try (LuceneLanguageModel model = new LuceneLanguageModel(new File(indexPath))) {
    for (String entry : entries) {
      // column 0: participle, column 1: lemma
      String[] columns = entry.split("\t");
      String participle = columns[0];
      String lemma = columns[1];
      long mirCount = count(model, participle, lemma, "mir");
      long michCount = count(model, participle, lemma, "mich");
      long dirCount = count(model, participle, lemma, "dir");
      long dichCount = count(model, participle, lemma, "dich");
      float dativeSum = (float) mirCount + dirCount;
      float accusativeSum = (float) michCount + dichCount;
      float ratio = dativeSum / accusativeSum;
      System.out.println(ratio + " " + lemma + " " + mirCount + " " + michCount);
    }
  }
}
use of org.languagetool.languagemodel.LuceneLanguageModel in project languagetool by languagetool-org.
the class GermanReflexiveVerbGuesser method run.
/**
 * Prints one statistics line per lemma from the given lemma list.
 *
 * <p>For each lemma, the first and third person singular present forms are requested from
 * the synthesizer (the first result is used, or {@code null} if there is none). Two counts
 * are then derived: the result of {@code count1} minus the {@code counterExamples} results
 * for "für" and "vor", and the result of {@code count2}. Both are expressed as a percentage
 * of the lemma's own count in the index, and their average is printed first.
 *
 * <p>The percentages use float division, so a lemma count of zero produces
 * {@code Infinity} or {@code NaN} in the output rather than an exception.
 *
 * @param indexTopDir directory passed to the {@code LuceneLanguageModel} constructor
 * @param lemmaListFile text file with one lemma per line
 * @throws IOException if the lemma list cannot be read or synthesizing fails
 */
private void run(File indexTopDir, File lemmaListFile) throws IOException {
  List<String> lemmaList = Files.readAllLines(lemmaListFile.toPath());
  System.out.println("Durchschnitt Prozent | Anzahl Lemma | mich/uns/euch ... | ... mich/uns/euch | Lemma");
  try (LuceneLanguageModel model = new LuceneLanguageModel(indexTopDir)) {
    for (String lemma : lemmaList) {
      String[] firstPersonForms = synthesizer.synthesize(new AnalyzedToken(lemma, "VER:INF:NON", lemma), "VER:1:SIN:PRÄ.*", true);
      String[] thirdPersonForms = synthesizer.synthesize(new AnalyzedToken(lemma, "VER:INF:NON", lemma), "VER:3:SIN:PRÄ.*", true);
      String firstPerson = null;
      if (firstPersonForms.length > 0) {
        firstPerson = firstPersonForms[0];
      }
      String thirdPerson = null;
      if (thirdPersonForms.length > 0) {
        thirdPerson = thirdPersonForms[0];
      }
      // Lookups are kept in their original order: count1, "für", "vor", count2, lemma.
      long rawCount = count1(model, lemma, firstPerson, thirdPerson);
      long fuerCount = counterExamples("für", model, lemma, firstPerson, thirdPerson);
      long vorCount = counterExamples("vor", model, lemma, firstPerson, thirdPerson);
      long reflexiveBefore = rawCount - fuerCount - vorCount;
      long reflexiveAfter = count2(model, lemma, firstPerson, thirdPerson);
      long lemmaTotal = model.getCount(lemma);
      float percentBefore = ((float) reflexiveBefore / lemmaTotal) * 100.0f;
      float percentAfter = ((float) reflexiveAfter / lemmaTotal) * 100.0f;
      float percentAvg = (percentBefore + percentAfter) / 2;
      System.out.printf("%.2f %d %.2f%% %.2f%% %s\n", percentAvg, lemmaTotal, percentBefore, percentAfter, lemma);
    }
  }
}
use of org.languagetool.languagemodel.LuceneLanguageModel in project languagetool by languagetool-org.
the class NGramLookup method main.
/**
 * Looks up the count of a single ngram in an ngram index and prints it.
 *
 * <p>The ngram is split on single spaces into tokens; the token list is passed to the
 * language model and the result is printed as {@code [tokens] -> count}.
 *
 * @param args {@code args[0]} is the ngram (space-separated tokens),
 *             {@code args[1]} is the ngram index directory
 */
public static void main(String[] args) {
  if (args.length != 2) {
    System.out.println("Usage: " + NGramLookup.class.getName() + " <ngram> <ngramDataIndex>");
    System.out.println(" Example: " + NGramLookup.class.getName() + " \"my house\" /data/ngram-index");
    System.exit(1);
  }
  String[] tokens = args[0].split(" ");
  File indexDir = new File(args[1]);
  try (LuceneLanguageModel model = new LuceneLanguageModel(indexDir)) {
    long occurrences = model.getCount(Arrays.asList(tokens));
    System.out.println(Arrays.toString(tokens) + " -> " + occurrences);
  }
}
Aggregations