Use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.
Class TurkishDictionaryLoaderTest, method getLastItem.
/**
 * Loads all given dictionary lines and returns the first lexicon item that matches the word of
 * the last line.
 *
 * <p>The lookup key is whatever {@code Strings.subStringUntilFirst} yields for the last line and
 * a single space (presumably the text before the first space, i.e. the bare word without its
 * attribute section; confirm against the helper).
 *
 * @param itemStr dictionary lines to load; the last one determines the item that is returned
 * @return the first item reported by the loaded lexicon for the last line's word
 */
public DictionaryItem getLastItem(String... itemStr) {
  TurkishDictionaryLoader dictionaryLoader = new TurkishDictionaryLoader();
  String finalLine = itemStr[itemStr.length - 1];
  String lookupKey = Strings.subStringUntilFirst(finalLine, " ");
  return dictionaryLoader
      .load(itemStr)
      .getMatchingItems(lookupKey)
      .get(0);
}
Use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method guessRootsWithHeuristics.
/**
 * Exploratory script (not a real test) that guesses root candidates for the words listed in
 * {@code out/no-parse-zemberek-freq.txt}.
 *
 * <p>Words shorter than four characters or without a vowel are skipped. For every remaining word,
 * each prefix of length three or more that contains a vowel is tried as a candidate root. The
 * candidate is temporarily added to the stem transitions of a morphology instance built with the
 * single lexicon entry {@code "elma"}: once as a plain entry, once tagged {@code [P:Verb]}, and,
 * when the candidate ends in b/c/ğ/d and the next character of the word is a vowel, once with
 * that last letter replaced by p/ç/k/t. If analysing the word then yields at least one analysis
 * that is not unknown, the pair (candidate, word) is recorded.
 *
 * <p>Processing stops after 100000 words have passed the filters. Candidates that matched at
 * least two words are written to {@code out/root-candidates-words} (candidate plus its words,
 * sorted with the Turkish collator) and {@code out/root-candidates-vocabulary} (candidate only).
 *
 * @throws IOException if the word list cannot be read or an output file cannot be written
 */
@Test
@Ignore("Not a Test.")
public void guessRootsWithHeuristics() throws IOException {
  Path wordFreqFile = DATA_PATH.resolve("out/no-parse-zemberek-freq.txt");
  Log.info("Loading histogram.");
  List<String> words = Files.readAllLines(wordFreqFile);
  TurkishMorphology morphology =
      TurkishMorphology.builder().setLexicon("elma").disableCache().build();
  Multimap<String, String> res = HashMultimap.create(100000, 3);
  int c = 0;
  for (String s : words) {
    if (s.length() < 4) {
      continue;
    }
    if (!TurkishAlphabet.INSTANCE.containsVowel(s)) {
      continue;
    }
    for (int i = 2; i < s.length(); i++) {
      String candidateRoot = s.substring(0, i + 1);
      if (!TurkishAlphabet.INSTANCE.containsVowel(candidateRoot)) {
        continue;
      }
      List<DictionaryItem> items = new ArrayList<>(3);
      // Plain entry; presumably the loader treats it as a noun.
      items.add(TurkishDictionaryLoader.loadFromString(candidateRoot));
      // Same surface form, explicitly tagged as a verb.
      items.add(TurkishDictionaryLoader.loadFromString(candidateRoot + " [P:Verb]"));
      char last = candidateRoot.charAt(candidateRoot.length() - 1);
      if (i < s.length() - 1) {
        char next = s.charAt(candidateRoot.length());
        if (Turkish.Alphabet.isVowel(next)) {
          // The candidate is followed by a vowel: also try the root whose final consonant is
          // the counterpart of the one seen in the word (b->p, c->ç, ğ->k, d->t).
          char replacement = 0;
          if (last == 'b') {
            replacement = 'p';
          } else if (last == 'c') {
            replacement = 'ç';
          } else if (last == 'ğ') {
            replacement = 'k';
          } else if (last == 'd') {
            replacement = 't';
          }
          if (replacement != 0) {
            String withoutLast = candidateRoot.substring(0, candidateRoot.length() - 1);
            items.add(TurkishDictionaryLoader.loadFromString(withoutLast + replacement));
          }
        }
      }
      for (DictionaryItem item : items) {
        // Register the item only for the duration of this single analysis.
        morphology.getMorphotactics().getStemTransitions().addDictionaryItem(item);
        WordAnalysis analyze = morphology.analyze(s);
        for (SingleAnalysis wordAnalysis : analyze) {
          if (!wordAnalysis.isUnknown()) {
            // The pair is keyed by the original prefix, even for the replaced-consonant item.
            res.put(candidateRoot, s);
            // One successful analysis is enough; further puts of the same pair add nothing.
            break;
          }
        }
        morphology.getMorphotactics().getStemTransitions().removeDictionaryItem(item);
      }
    }
    if (++c % 10000 == 0) {
      Log.info(c);
    }
    if (c == 100000) {
      break;
    }
  }
  Log.info("Writing.");
  // Files.newBufferedWriter(Path) encodes as UTF-8, matching Files.readAllLines above, so
  // Turkish letters survive regardless of the platform default charset.
  try (PrintWriter pw1 =
          new PrintWriter(Files.newBufferedWriter(DATA_PATH.resolve("out/root-candidates-words")));
      PrintWriter pw2 =
          new PrintWriter(
              Files.newBufferedWriter(DATA_PATH.resolve("out/root-candidates-vocabulary")))) {
    for (String root : res.keySet()) {
      Collection<String> vals = res.get(root);
      if (vals.size() < 2) {
        continue;
      }
      List<String> wl = new ArrayList<>(vals);
      wl.sort(turkishCollator::compare);
      // Print the sorted copy; the multimap's own value collection is left as-is.
      pw1.println(root + " : " + String.join(", ", wl));
      pw2.println(root);
    }
  }
}
Aggregations