Search in sources :

Example 6 with LuceneLanguageModel

use of org.languagetool.languagemodel.LuceneLanguageModel in project languagetool by languagetool-org.

the class ConfusionSetOccurrenceLookup method main.

/**
 * Prints ngram occurrence counts for every confusion set listed in a file.
 *
 * <p>Each input line is split on {@code ;} (followed by optional whitespace) into words. For every
 * line, one output line is printed with: the sum of all word counts, the original line, a
 * {@code word:count} list, and the ratio between the largest and the smallest count.
 *
 * @param args {@code args[0]} is the confusion file, {@code args[1]} the ngram data directory
 * @throws IOException if the confusion file cannot be opened; I/O failures raised while the
 *     resources are used or closed propagate as well
 */
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        System.out.println("Usage: " + ConfusionSetOccurrenceLookup.class.getName() + " <confusion-file> <ngram-data-dir>");
        System.exit(1);
    }
    try (Scanner sc = new Scanner(new File(args[0]));
        LuceneLanguageModel lm = new LuceneLanguageModel(new File(args[1]))) {
        while (sc.hasNextLine()) {
            String line = sc.nextLine();
            String[] words = line.split(";\\s*");
            if (words.length == 0) {
                // split() drops trailing empty strings, so a line made only of separators
                // yields no words at all; there is nothing to look up or compare.
                continue;
            }
            long total = 0;
            long maxCount = Long.MIN_VALUE;
            long minCount = Long.MAX_VALUE;
            StringBuilder sb = new StringBuilder();
            for (String word : words) {
                long count = lm.getCount(word);
                total += count;
                maxCount = Math.max(maxCount, count);
                minCount = Math.min(minCount, count);
                sb.append(word).append(":").append(count).append(" ");
            }
            // Float division: a minimum count of 0 prints "Infinity" (or "NaN" for 0/0)
            // instead of throwing.
            float factor = (float) maxCount / minCount;
            // Input text is passed as arguments, never as part of the format string, so a '%'
            // in the confusion file cannot be misread as a format specifier.
            System.out.printf(Locale.ENGLISH, "%d %s    %s factor:%.1f\n", total, line, sb.toString().trim(), factor);
        }
    }
}
Also used : File(java.io.File) LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel)

Example 7 with LuceneLanguageModel

use of org.languagetool.languagemodel.LuceneLanguageModel in project languagetool by languagetool-org.

the class GermanAuxVerbGuesser method main.

/**
 * Reads tab-separated "participle lemma" lines and prints, for each lemma, the values returned
 * by {@code countHaben} and {@code countSein}, followed by summary statistics.
 *
 * <p>A line counts as "noMatch" when both values are zero and as "match" otherwise; a match is
 * additionally "unambiguous" when exactly one of the two values is zero and the other positive.
 *
 * @param args {@code args[0]} is the ngram index directory, {@code args[1]} the lemma file
 * @throws IOException if the lemma file cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        String toolName = GermanAuxVerbGuesser.class.getName();
        System.out.println("Usage: " + toolName + " <ngramDataIndex> <lemmaFile>");
        System.out.println("   <lemmaFile> is a text file with 'participle2 \\t lemma' per line, e.g. 'getrunken \t trinken'");
        System.exit(1);
    }
    String indexPath = args[0];
    List<String> entries = Files.readAllLines(Paths.get(args[1]));
    int matched = 0;
    int unmatched = 0;
    int singleAux = 0;
    try (LuceneLanguageModel lm = new LuceneLanguageModel(new File(indexPath))) {
        for (String entry : entries) {
            // Column 0 is the participle, column 1 the lemma.
            String[] columns = entry.split("\t");
            String participle = columns[0];
            String lemma = columns[1];
            long habenCount = countHaben(lm, participle, lemma);
            long seinCount = countSein(lm, participle, lemma);
            System.out.println(lemma + ": haben: " + habenCount + ", sein: " + seinCount);
            if (habenCount == 0 && seinCount == 0) {
                unmatched++;
                continue;
            }
            matched++;
            boolean onlySein = habenCount == 0 && seinCount > 0;
            boolean onlyHaben = habenCount > 0 && seinCount == 0;
            if (onlySein || onlyHaben) {
                singleAux++;
            }
        }
    }
    System.out.println("match: " + matched);
    System.out.println("noMatch: " + unmatched);
    System.out.println("----");
    System.out.println("unambiguous: " + singleAux);
}
Also used : LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel) File(java.io.File)

Example 8 with LuceneLanguageModel

use of org.languagetool.languagemodel.LuceneLanguageModel in project languagetool by languagetool-org.

the class GermanAuxVerbGuesser2 method main.

/**
 * Reads tab-separated "participle lemma" lines and prints, per lemma, a dative/accusative factor.
 *
 * <p>The factor is the sum of the "mir" and "dir" counts divided by the sum of the "mich" and
 * "dich" counts, computed in float arithmetic. Only the "mir" and "mich" counts are printed next
 * to the factor.
 *
 * @param args {@code args[0]} is the ngram index directory, {@code args[1]} the lemma file
 * @throws IOException if the lemma file cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        String toolName = GermanAuxVerbGuesser2.class.getName();
        System.out.println("Usage: " + toolName + " <ngramDataIndex> <lemmaFile>");
        System.out.println("   <lemmaFile> is a text file with 'participle2 \\t lemma' per line, e.g. 'getrunken \t trinken'");
        System.exit(1);
    }
    String indexPath = args[0];
    List<String> entries = Files.readAllLines(Paths.get(args[1]));
    System.out.println("# factor lemma Dativ/mir Akkusativ/mich");
    try (LuceneLanguageModel lm = new LuceneLanguageModel(new File(indexPath))) {
        for (String entry : entries) {
            // Column 0 is the participle, column 1 the lemma.
            String[] columns = entry.split("\t");
            String participle = columns[0];
            String lemma = columns[1];
            long mirCount = count(lm, participle, lemma, "mir");
            long michCount = count(lm, participle, lemma, "mich");
            long dirCount = count(lm, participle, lemma, "dir");
            long dichCount = count(lm, participle, lemma, "dich");
            float dativeSum = (float) mirCount + dirCount;
            float accusativeSum = (float) michCount + dichCount;
            float factor = dativeSum / accusativeSum;
            System.out.println(factor + " " + lemma + " " + mirCount + " " + michCount);
        }
    }
}
Also used : LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel) File(java.io.File)

Example 9 with LuceneLanguageModel

use of org.languagetool.languagemodel.LuceneLanguageModel in project languagetool by languagetool-org.

the class GermanReflexiveVerbGuesser method run.

/**
 * Prints one statistics line per lemma read from {@code lemmaListFile}.
 *
 * <p>For each lemma the first and third person singular present forms are synthesized (either
 * may be {@code null} when the synthesizer returns nothing). Two counts are then taken:
 * {@code count1} reduced by the "für" and "vor" counter examples, and {@code count2}. Both are
 * expressed as a percentage of the lemma's own count, and their average is printed first.
 * Judging from the printed header, the two counts presumably cover the reflexive pronoun before
 * and after the verb; confirm against the helper implementations.
 *
 * @param indexTopDir directory passed to the {@code LuceneLanguageModel} constructor
 * @param lemmaListFile text file with one lemma per line
 * @throws IOException if the lemma file cannot be read
 */
private void run(File indexTopDir, File lemmaListFile) throws IOException {
    List<String> lemmas = Files.readAllLines(lemmaListFile.toPath());
    System.out.println("Durchschnitt Prozent | Anzahl Lemma | mich/uns/euch ... | ... mich/uns/euch | Lemma");
    try (LuceneLanguageModel lm = new LuceneLanguageModel(indexTopDir)) {
        for (String lemma : lemmas) {
            // For debugging a single verb, skip all other lemmas here (e.g. "reklamieren").
            String[] firstPersonForms = synthesizer.synthesize(new AnalyzedToken(lemma, "VER:INF:NON", lemma), "VER:1:SIN:PRÄ.*", true);
            String[] thirdPersonForms = synthesizer.synthesize(new AnalyzedToken(lemma, "VER:INF:NON", lemma), "VER:3:SIN:PRÄ.*", true);
            String firstPsSin = null;
            if (firstPersonForms.length > 0) {
                firstPsSin = firstPersonForms[0];
            }
            String thirdPsSin = null;
            if (thirdPersonForms.length > 0) {
                thirdPsSin = thirdPersonForms[0];
            }
            long rawCount1 = count1(lm, lemma, firstPsSin, thirdPsSin);
            long fuerCounter = counterExamples("für", lm, lemma, firstPsSin, thirdPsSin);
            long vorCounter = counterExamples("vor", lm, lemma, firstPsSin, thirdPsSin);
            long reflexiveCount1 = rawCount1 - fuerCounter - vorCounter;
            long reflexiveCount2 = count2(lm, lemma, firstPsSin, thirdPsSin);
            long lemmaCount = lm.getCount(lemma);
            // Float division: a lemma count of 0 yields Infinity/NaN rather than an exception.
            float percent1 = ((float) reflexiveCount1 / lemmaCount) * 100.0f;
            float percent2 = ((float) reflexiveCount2 / lemmaCount) * 100.0f;
            float average = (percent1 + percent2) / 2;
            System.out.printf("%.2f %d %.2f%% %.2f%% %s\n", average, lemmaCount, percent1, percent2, lemma);
        }
    }
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel)

Example 10 with LuceneLanguageModel

use of org.languagetool.languagemodel.LuceneLanguageModel in project languagetool by languagetool-org.

the class NGramLookup method main.

/**
 * Looks up the occurrence count of a single ngram and prints it.
 *
 * <p>The ngram argument is split on single spaces into tokens, and the token list is passed to
 * the language model's {@code getCount}.
 *
 * @param args {@code args[0]} is the ngram (e.g. {@code "my house"}), {@code args[1]} the ngram
 *     index directory
 */
public static void main(String[] args) {
    if (args.length != 2) {
        String toolName = NGramLookup.class.getName();
        System.out.println("Usage: " + toolName + " <ngram> <ngramDataIndex>");
        System.out.println("  Example: " + toolName + " \"my house\" /data/ngram-index");
        System.exit(1);
    }
    File indexDir = new File(args[1]);
    try (LuceneLanguageModel lm = new LuceneLanguageModel(indexDir)) {
        String[] tokens = args[0].split(" ");
        long occurrences = lm.getCount(Arrays.asList(tokens));
        System.out.println(Arrays.toString(tokens) + " -> " + occurrences);
    }
}
Also used : LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel) File(java.io.File)

Aggregations

LuceneLanguageModel (org.languagetool.languagemodel.LuceneLanguageModel): 10, File (java.io.File): 8, Language (org.languagetool.Language): 3, LanguageModel (org.languagetool.languagemodel.LanguageModel): 3, Ignore (org.junit.Ignore): 2, Test (org.junit.Test): 2, InputStream (java.io.InputStream): 1, AnalyzedToken (org.languagetool.AnalyzedToken): 1, ConfusionSet (org.languagetool.rules.ConfusionSet): 1, ConfusionSetLoader (org.languagetool.rules.ConfusionSetLoader): 1, ConfusionString (org.languagetool.rules.ConfusionString): 1