Search in sources :

Example 1 with MorfologikSpeller

Use of org.languagetool.rules.spelling.morfologik.MorfologikSpeller in the project languagetool by languagetool-org.

The method run of the class RareWordsFinder.

/**
 * Scans a frequency list and prints rare, correctly spelled words together with
 * the suggestions the Hunspell dictionary offers for them.
 *
 * <p>Each input line is expected to have the form {@code word<TAB>count}. A word is
 * reported when its count is at most {@code minimum}, it consists only of the
 * lowercase ASCII letters a-z, the Morfologik speller does not flag it as
 * misspelled, and {@code suggestionsMightBeUseful} accepts its suggestions.
 * Lines that do not have the expected form are reported on stderr and skipped
 * instead of aborting the whole scan. Progress is printed every million lines.
 *
 * @param input   the tab-separated frequency list to read
 * @param minimum the largest count for which a word is still considered rare
 * @throws FileNotFoundException if {@code input} cannot be opened
 * @throws CharacterCodingException declared by the original signature; presumably
 *         raised by the speller - TODO confirm
 */
private void run(File input, int minimum) throws FileNotFoundException, CharacterCodingException {
    MorfologikSpeller speller = new MorfologikSpeller(dictInClassPath, 1);
    // Compiled once instead of once per line. The original four String.matches()
    // checks (letters only, not all-uppercase, no uppercase after the first letter,
    // no leading uppercase) together accept exactly the words matching [a-z]+.
    java.util.regex.Pattern lowercaseWord = java.util.regex.Pattern.compile("[a-z]+");
    int lineCount = 0;
    int wordCount = 0;
    try (Scanner s = new Scanner(input)) {
        while (s.hasNextLine()) {
            if (reportIfRareCandidate(s.nextLine(), minimum, speller, lowercaseWord)) {
                wordCount++;
            }
            lineCount++;
            if (lineCount % 1_000_000 == 0) {
                System.out.println("lineCount: " + lineCount + ", words found: " + wordCount);
            }
        }
        System.out.println("Done. lineCount: " + lineCount + ", words found: " + wordCount);
    }
}

/**
 * Handles one line of the frequency list: prints the word, its count and its
 * suggestions if the word qualifies as a rare candidate.
 *
 * @return {@code true} if the word was printed, {@code false} if the line was
 *         malformed or the word did not qualify
 */
private boolean reportIfRareCandidate(String line, int minimum, MorfologikSpeller speller,
        java.util.regex.Pattern lowercaseWord) throws CharacterCodingException {
    String[] parts = line.split("\t");
    if (parts.length < 2) {
        // Blank lines and lines without a tab used to crash with ArrayIndexOutOfBoundsException.
        System.err.println("Skipping line without a tab-separated count: " + line);
        return false;
    }
    String word = parts[0];
    long count;
    try {
        count = Long.parseLong(parts[1]);
    } catch (NumberFormatException e) {
        System.err.println("Skipping line with a non-numeric count: " + line);
        return false;
    }
    if (count > minimum || !lowercaseWord.matcher(word).matches()) {
        return false;
    }
    if (speller.isMisspelled(word)) {
        return false;
    }
    // speller.getSuggestions(word) seems to work only for words that are actually
    // misspellings, so the Hunspell dictionary is asked instead.
    List<String> suggestions = hunspellDict.suggest(word);
    suggestions.remove(word);
    if (!suggestionsMightBeUseful(word, suggestions)) {
        return false;
    }
    System.out.println(word + "\t" + count + " -> " + String.join(", ", suggestions));
    return true;
}
Also used : Scanner(java.util.Scanner) MorfologikSpeller(org.languagetool.rules.spelling.morfologik.MorfologikSpeller)

Aggregations

Scanner (java.util.Scanner): 1, MorfologikSpeller (org.languagetool.rules.spelling.morfologik.MorfologikSpeller): 1