Use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project lucene-solr-analysis-turkish by iorixxx.
From class Zemberek3StemFilterFactory, method inform.
/**
 * Builds the word parser for this factory from the configured dictionary files.
 *
 * <p>If {@code dictionaryFiles} is null or blank, or if the listed files contribute no lines,
 * the parser is taken from {@code TurkishWordParserGenerator.createWithDefaults()}, i.e. the
 * default dictionaries shipped with Zemberek3. Otherwise every collected line is loaded into a
 * {@code RootLexicon}, and the parser is a {@code WordParser} over a {@code DynamicLexiconGraph}
 * filled from that lexicon.
 *
 * @param loader resource loader passed to {@code getLines} for each dictionary file name
 * @throws IOException declared by this method; presumably raised while reading a dictionary file
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
    final boolean dictionariesConfigured =
            !(dictionaryFiles == null || dictionaryFiles.trim().isEmpty());

    // Gather the raw dictionary lines from every configured file, in the order listed.
    final List<String> dictionaryLines = new ArrayList<>();
    if (dictionariesConfigured) {
        for (String fileName : splitFileNames(dictionaryFiles)) {
            dictionaryLines.addAll(getLines(loader, fileName.trim()));
        }
    }

    if (dictionaryLines.isEmpty()) {
        // Use default dictionaries shipped with Zemberek3.
        this.parser = TurkishWordParserGenerator.createWithDefaults().getParser();
        return;
    }

    // Custom dictionaries: lexicon -> lexicon graph -> parser.
    final SuffixProvider suffixes = new TurkishSuffixes();
    final RootLexicon customLexicon = new TurkishDictionaryLoader(suffixes).load(dictionaryLines);
    final DynamicLexiconGraph lexiconGraph = new DynamicLexiconGraph(suffixes);
    lexiconGraph.addDictionaryItems(customLexicon);
    this.parser = new WordParser(lexiconGraph);
}
Use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.
From class TurkishDictionaryLoaderTest, method masterDictionaryLoadTest.
/**
 * Manual, ignored-by-default check over the master dictionary.
 *
 * <p>Loads {@code tr/master-dictionary.dict} into a lexicon, then re-reads the same file line by
 * line. For each non-comment line tagged "Adj" (but not "Compound" or "PropNoun") whose cleaned
 * word has more than one vowel, ends in a stop consonant, is not tagged "Vo"/"VowDrop" and is
 * not among the lexicon items carrying {@code NoVoicing}, it looks up an HTML file named after
 * the word in a hard-coded local directory and prints the line when that file does not contain
 * the expected voiced-suffix marker. Finally prints the lexicon size.
 *
 * @throws IOException declared by this method; presumably raised by the dictionary or HTML reads
 */
@Test
@Ignore("Not a unit Test. Only loads the master dictionary.")
public void masterDictionaryLoadTest() throws IOException {
    final String dictionaryResource = "tr/master-dictionary.dict";
    final String htmlDirectory = "/home/afsina/data/tdk/html";

    TurkishDictionaryLoader loader = new TurkishDictionaryLoader();
    RootLexicon items = loader.load(new File(Resources.getResource(dictionaryResource).getFile()));
    TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;

    // Lemmas of all lexicon items flagged NoVoicing; these are skipped below.
    Set<String> noVoicingLemmas = new HashSet<>();
    for (DictionaryItem entry : items) {
        if (entry.attributes.contains(NoVoicing)) {
            noVoicingLemmas.add(entry.lemma);
        }
    }

    Locale turkish = new Locale("tr");
    List<String> rawLines = SimpleTextReader
            .trimmingUTF8Reader(new File(Resources.getResource(dictionaryResource).getFile()))
            .asStringList();

    for (String line : rawLines) {
        if (line.startsWith("#")) {
            continue;
        }
        // First token of the line, lower-cased with Turkish rules, hyphens and apostrophes removed.
        String word = Strings.subStringUntilFirst(line.trim(), " ")
                .toLowerCase(turkish)
                .replaceAll("[\\-']", "");

        boolean plainAdjective =
                line.contains("Adj") && !line.contains("Compound") && !line.contains("PropNoun");
        if (!plainAdjective) {
            continue;
        }

        TurkishLetterSequence letters = new TurkishLetterSequence(word, alphabet);
        boolean voicingCandidate = letters.vowelCount() > 1
                && letters.lastLetter().isStopConsonant()
                && !line.contains("Vo")
                && !line.contains("VowDrop");
        if (!voicingCandidate || noVoicingLemmas.contains(word)) {
            continue;
        }

        // Look for the word's HTML page; retry with the circumflexed vowels replaced.
        File page = new File(htmlDirectory, word + ".html");
        if (!page.exists()) {
            page = new File(htmlDirectory, word.replaceAll("â", "a").replaceAll("\\u00ee", "i") + ".html");
        }
        if (!page.exists()) {
            System.out.println("Cannot find:" + line);
            continue;
        }

        // Map the final consonant to its voiced counterpart; anything else is reported and kept.
        char lastChar = word.charAt(word.length() - 1);
        char voiced = lastChar;
        if (lastChar == 'k') {
            voiced = 'ğ';
        } else if (lastChar == 'p') {
            voiced = 'b';
        } else if (lastChar == 'ç') {
            voiced = 'c';
        } else if (lastChar == 't') {
            voiced = 'd';
        } else {
            System.out.println("crap:" + line);
        }

        String html = SimpleTextReader.trimmingUTF8Reader(page).asString();
        if (!html.contains("color=DarkBlue>-" + voiced)) {
            System.out.println(line);
        }
    }

    for (DictionaryItem entry : items) {
        boolean nounOrAdjective = entry.primaryPos == Noun || entry.primaryPos == PrimaryPos.Adjective;
        if (nounOrAdjective
                && entry.secondaryPos != SecondaryPos.ProperNoun
                && entry.hasAttribute(RootAttribute.Voicing)) {
            // No action is taken for matching items; only the condition is evaluated.
        }
    }
    System.out.println(items.size());
}
Use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.
From class TurkishDictionaryLoaderTest, method loadNounsFromFileTest.
/**
 * Loads {@code test-lexicon-nouns.txt} (resolved through {@code Resources.getResource}) and
 * checks that the resulting lexicon is non-empty and that every item has {@code Noun} as its
 * primary part of speech.
 *
 * @throws IOException declared by this method; presumably raised if the lexicon file cannot be read
 */
@Test
public void loadNounsFromFileTest() throws IOException {
    TurkishDictionaryLoader loader = new TurkishDictionaryLoader();
    File nounsFile = new File(Resources.getResource("test-lexicon-nouns.txt").getFile());
    RootLexicon items = loader.load(nounsFile);
    Assert.assertFalse(items.isEmpty());
    for (DictionaryItem item : items) {
        // assertEquals (rather than assertTrue on ==) reports the expected and actual values,
        // and the message names the offending lemma when an entry is not a noun.
        Assert.assertEquals("Unexpected primary POS for lemma: " + item.lemma, Noun, item.primaryPos);
    }
}
Use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.
From class TurkishDictionaryLoaderTest, method properNounsShouldNotHaveVoicingAutomaticallyTest.
/**
 * Checks that the entries "Tokat" and "Dink", loaded from a bare string, each produce an item
 * with a lower-cased root, {@code Noun} as primary POS, {@code ProperNoun} as secondary POS,
 * and no {@code Voicing} attribute.
 */
@Test
public void properNounsShouldNotHaveVoicingAutomaticallyTest() {
    TurkishDictionaryLoader loader = new TurkishDictionaryLoader();
    // Each row: { dictionary line, expected root }.
    String[][] inputAndExpectedRoot = {
        {"Tokat", "tokat"},
        {"Dink", "dink"},
    };
    for (String[] row : inputAndExpectedRoot) {
        DictionaryItem item = loader.loadFromString(row[0]);
        Assert.assertEquals(row[1], item.root);
        Assert.assertEquals(Noun, item.primaryPos);
        Assert.assertEquals(SecondaryPos.ProperNoun, item.secondaryPos);
        Assert.assertFalse(item.hasAttribute(RootAttribute.Voicing));
    }
}
Use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.
From class WordAnalyzerTest, method getItems.
/**
 * Turns each dictionary line into a {@code DictionaryItem} using a newly created
 * {@code TurkishDictionaryLoader}.
 *
 * @param lines dictionary lines, each passed to {@code loadFromString}
 * @return the loaded items, in the same order as {@code lines}
 */
private List<DictionaryItem> getItems(String[] lines) {
    final TurkishDictionaryLoader dictionaryLoader = new TurkishDictionaryLoader();
    final List<DictionaryItem> loaded = new ArrayList<>();
    for (int i = 0; i < lines.length; i++) {
        loaded.add(dictionaryLoader.loadFromString(lines[i]));
    }
    return loaded;
}
Aggregations