Use of morfologik.fsa.FSA in the languagetool project by languagetool-org.
Class ExportGermanNouns, method getBinaryDictWords.
/**
 * Collects the lowercased leading field of every relevant noun entry found in
 * the binary dictionary {@code DICT_FILENAME}.
 *
 * <p>Each FSA sequence is decoded as ISO-8859-1 text and split on {@code '+'};
 * only the part before the first {@code '+'} is kept (presumably the inflected
 * word form -- confirm against the dictionary layout).
 *
 * @return the set of lowercased terms for entries accepted by {@code isRelevantNoun}
 * @throws IOException if the dictionary resource cannot be read
 */
private Set<String> getBinaryDictWords() throws IOException {
  final Set<String> set = new HashSet<>();
  // The resource stream was previously never closed; keep it open for the whole
  // iteration and let try-with-resources release it afterwards.
  try (java.io.InputStream dictStream =
           JLanguageTool.getDataBroker().getFromResourceDirAsStream(DICT_FILENAME)) {
    final FSA fsa = FSA.read(dictStream);
    for (ByteBuffer buffer : fsa) {
      // Copy the remaining bytes of the buffer so they can be decoded as text.
      final byte[] sequence = new byte[buffer.remaining()];
      buffer.get(sequence);
      final String output = new String(sequence, "iso-8859-1");
      if (isRelevantNoun(output)) {
        final String[] parts = output.split("\\+");
        final String term = parts[0].toLowerCase();
        set.add(term);
      }
    }
  }
  return set;
}
Use of morfologik.fsa.FSA in the languagetool project by languagetool-org.
Class MissingGenitiveFinder, method run.
/**
 * Scans the binary dictionary {@code DICT_FILENAME} for genitive noun entries
 * and prints candidates whose "-es" variant appears in {@code occurrences}
 * but is not reported by {@code hasEsGenitive}.
 *
 * <p>One tab-separated line is written to standard output per candidate:
 * the "-es" form, the word without its trailing "s", and the third
 * {@code '+'}-separated field of the entry (presumably the tag string --
 * confirm against the dictionary layout).
 *
 * @throws IOException if the dictionary resource cannot be read
 */
@SuppressWarnings("UnnecessaryParentheses")
private void run() throws IOException {
  GermanTagger tagger = new GermanTagger();
  // The resource stream was previously never closed; keep it open for the whole
  // iteration and let try-with-resources release it afterwards.
  try (java.io.InputStream dictStream =
           JLanguageTool.getDataBroker().getFromResourceDirAsStream(DICT_FILENAME)) {
    final FSA fsa = FSA.read(dictStream);
    for (ByteBuffer buffer : fsa) {
      // Copy the remaining bytes of the buffer so they can be decoded as text.
      final byte[] sequence = new byte[buffer.remaining()];
      buffer.get(sequence);
      final String output = new String(sequence, "iso-8859-1");
      // COU = Country
      boolean isNoun = output.contains("+SUB:") || (output.contains("+EIG:") && output.contains("COU"));
      if (!isNoun || !output.contains(":GEN:")) {
        continue;
      }
      final String[] parts = output.split("\\+");
      String word = parts[0];
      if (!isRelevantWord(word)) {
        continue;
      }
      // Candidate "-es" genitive: a trailing "s" is replaced by "es".
      String esWord = word.replaceFirst("s$", "es");
      boolean hasEsGenitive = hasEsGenitive(tagger, word);
      // Words ending in "els" are skipped unless they end in "iels".
      boolean ignore1 = word.endsWith("els") && !word.endsWith("iels");
      Integer occurrence = occurrences.get(esWord);
      if (!hasEsGenitive && !ignore1 && occurrence != null) {
        System.out.println(esWord + "\t" + word.replaceFirst("s$", "") + "\t" + parts[2]);
      }
    }
  }
}
Use of morfologik.fsa.FSA in the languagetool project by languagetool-org.
Class GermanSpellerRuleTest, method getDictionary.
/**
 * Builds an in-memory dictionary from the given byte sequences.
 *
 * <p>The list is sorted in place with {@code FSABuilder.LEXICAL_ORDERING},
 * compiled into an FSA, serialized with {@code CFSA2Serializer} into a byte
 * array and then read back together with the supplied info stream.
 *
 * @param lines    the entries to compile; note that this list is sorted in place
 * @param infoFile stream passed to {@code Dictionary.read} alongside the FSA data
 * @return the dictionary read from the serialized automaton and {@code infoFile}
 * @throws IOException if serialization or reading fails
 */
private Dictionary getDictionary(List<byte[]> lines, InputStream infoFile) throws IOException {
  Collections.sort(lines, FSABuilder.LEXICAL_ORDERING);
  FSA automaton = FSABuilder.build(lines);
  CFSA2Serializer serializer = new CFSA2Serializer();
  ByteArrayOutputStream serialized = serializer.serialize(automaton, new ByteArrayOutputStream());
  byte[] automatonBytes = serialized.toByteArray();
  return Dictionary.read(new ByteArrayInputStream(automatonBytes), infoFile);
}
Use of morfologik.fsa.FSA in the languagetool project by languagetool-org.
Class MorfologikMultiSpeller, method getDictionary.
/**
 * Returns the dictionary for {@code dictPath}, building it from {@code lines}
 * on first use and serving it from {@code dicPathToDict} afterwards.
 *
 * <p>On a cache miss the list is sorted in place with
 * {@code FSABuilder.LEXICAL_ORDERING}, compiled into an FSA, serialized with
 * {@code CFSA2Serializer} and read back together with the matching
 * {@code .info} resource (the path with ".dict" replaced by ".info").
 *
 * @param lines    the entries to compile; sorted in place when the dictionary is built
 * @param dictPath cache key and base path used to locate the {@code .info} resource
 * @return the cached or newly built dictionary
 * @throws IOException if serialization or reading of the resources fails
 */
private Dictionary getDictionary(List<byte[]> lines, String dictPath) throws IOException {
  Dictionary dictFromCache = dicPathToDict.get(dictPath);
  if (dictFromCache != null) {
    return dictFromCache;
  }
  // Creating the dictionary at runtime can easily take 50ms for spelling.txt files
  // that are ~50KB. We don't want that overhead for every check of a short sentence,
  // so we cache the result:
  Collections.sort(lines, FSABuilder.LEXICAL_ORDERING);
  FSA fsa = FSABuilder.build(lines);
  ByteArrayOutputStream fsaOutStream = new CFSA2Serializer().serialize(fsa, new ByteArrayOutputStream());
  ByteArrayInputStream fsaInStream = new ByteArrayInputStream(fsaOutStream.toByteArray());
  String infoFile = dictPath.replace(".dict", ".info");
  Dictionary dict;
  // The .info resource stream was previously never closed; release it as soon
  // as the dictionary has been read.
  try (java.io.InputStream infoStream =
           JLanguageTool.getDataBroker().getFromResourceDirAsStream(infoFile)) {
    dict = Dictionary.read(fsaInStream, infoStream);
  }
  dicPathToDict.put(dictPath, dict);
  return dict;
}
Aggregations