Use of org.languagetool.rules.spelling.morfologik.MorfologikSpeller in project languagetool by languagetool-org.
The method run of the class RareWordsFinder.
/**
 * Scans a tab-separated frequency list (one {@code word<TAB>count} entry per line) and
 * prints every rare word that the speller accepts but for which the hunspell
 * dictionary still offers suggestions that {@code suggestionsMightBeUseful} approves.
 *
 * <p>A line is considered only if its count is {@code <= minimum} and its word consists
 * solely of the lowercase ASCII letters {@code a-z}. Lines that do not have at least two
 * columns, or whose second column is not a valid {@code long}, are skipped and counted
 * instead of aborting the whole run. Progress is printed every 1,000,000 lines.
 *
 * @param input   the frequency list to read
 * @param minimum the highest count for which a word is still treated as rare (inclusive)
 * @throws FileNotFoundException if {@code input} cannot be opened
 * @throws java.io.UncheckedIOException if reading {@code input} fails part-way through
 */
private void run(File input, int minimum) throws FileNotFoundException, CharacterCodingException {
  MorfologikSpeller speller = new MorfologikSpeller(dictInClassPath, 1);
  int lineCount = 0;
  int wordCount = 0;
  int skippedCount = 0;
  try (Scanner s = new Scanner(input)) {
    while (s.hasNextLine()) {
      String line = s.nextLine();
      String[] parts = line.split("\t");
      boolean wellFormed = parts.length >= 2;
      long count = 0;
      if (wellFormed) {
        try {
          count = Long.parseLong(parts[1]);
        } catch (NumberFormatException ignored) {
          // A non-numeric count means the line is malformed; it is tallied below.
          wellFormed = false;
        }
      }
      if (!wellFormed) {
        skippedCount++;
      } else if (count <= minimum && isLowercaseAsciiWord(parts[0])) {
        String word = parts[0];
        // Only words the speller accepts are of interest here.
        if (!speller.isMisspelled(word)) {
          //List<String> suggestions = speller.getSuggestions(word); // seems to work only for words that are actually misspellings
          List<String> suggestions = hunspellDict.suggest(word);
          suggestions.remove(word);
          if (suggestionsMightBeUseful(word, suggestions)) {
            System.out.println(word + "\t" + count + " -> " + String.join(", ", suggestions));
            wordCount++;
          }
        }
      }
      lineCount++;
      if (lineCount % 1_000_000 == 0) {
        System.out.println("lineCount: " + lineCount + ", words found: " + wordCount);
      }
    }
    // Scanner treats an IOException as end of input; surface it instead of
    // reporting a truncated run as complete.
    java.io.IOException readError = s.ioException();
    if (readError != null) {
      throw new java.io.UncheckedIOException("Failed while reading " + input, readError);
    }
    if (skippedCount > 0) {
      System.err.println("Skipped " + skippedCount + " malformed line(s) (expected: word<TAB>count)");
    }
    System.out.println("Done. lineCount: " + lineCount + ", words found: " + wordCount);
  }
}

/**
 * Returns whether {@code word} is non-empty and made up only of the letters {@code a-z}.
 *
 * <p>This is equivalent to the previous chain of four regex checks (letters only, not all
 * uppercase, no uppercase letter after the first position, no leading uppercase letter),
 * without compiling a regular expression for every input line.
 */
private static boolean isLowercaseAsciiWord(String word) {
  if (word.isEmpty()) {
    return false;
  }
  for (int i = 0; i < word.length(); i++) {
    char c = word.charAt(i);
    if (c < 'a' || c > 'z') {
      return false;
    }
  }
  return true;
}
Aggregations