Use of zemberek.core.turkish.TurkishAlphabet in the project zemberek-nlp by ahmetaa.
From the class TurkishDictionaryLoaderTest, method masterDictionaryLoadTest:
/**
 * Manual consistency check over the master dictionary; not a unit test.
 *
 * <p>Loads {@code tr/master-dictionary.dict} as a lexicon and collects the lemmas that carry the
 * {@code NoVoicing} attribute. The same file is then scanned line by line. For every line that
 * mentions {@code Adj} but neither {@code Compound} nor {@code PropNoun}, whose word has more
 * than one vowel, ends in a stop consonant and has no marker containing {@code "Vo"}, the method
 * looks up {@code <word>.html} in a developer-local directory and prints the dictionary line
 * when that page does not contain {@code color=DarkBlue>-} followed by the letter mapped from
 * the word's final letter (k→ğ, p→b, ç→c, t→d). Finally the lexicon size is printed.
 *
 * <p>The HTML directory is a hard-coded absolute path, which is why the test is ignored.
 *
 * @throws IOException if the dictionary or one of the HTML pages cannot be read
 */
@Test
@Ignore("Not a unit Test. Only loads the master dictionary.")
public void masterDictionaryLoadTest() throws IOException {
  // Resolve the dictionary once; it is both loaded as a lexicon and scanned as raw text.
  File dictionaryFile = new File(Resources.getResource("tr/master-dictionary.dict").getFile());
  RootLexicon items = new TurkishDictionaryLoader().load(dictionaryFile);
  TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;

  Set<String> masterVoicing = new HashSet<>();
  for (DictionaryItem item : items) {
    if (item.attributes.contains(NoVoicing)) {
      masterVoicing.add(item.lemma);
    }
  }

  String htmlDir = "/home/afsina/data/tdk/html";
  Locale tr = new Locale("tr");
  List<String> allZ2 = SimpleTextReader.trimmingUTF8Reader(dictionaryFile).asStringList();
  for (String s : allZ2) {
    if (s.startsWith("#")) {
      continue;
    }
    // First token of the line, lower-cased with Turkish rules, hyphens and apostrophes removed.
    String clean = Strings.subStringUntilFirst(s.trim(), " ")
        .toLowerCase(tr)
        .replaceAll("[\\-']", "");
    if (!s.contains("Adj") || s.contains("Compound") || s.contains("PropNoun")) {
      continue;
    }
    TurkishLetterSequence seq = new TurkishLetterSequence(clean, alphabet);
    // "Vo" is a substring of "Voicing", "NoVoicing" and "VowDrop", so one check covers them all.
    if (seq.vowelCount() <= 1 || !seq.lastLetter().isStopConsonant() || s.contains("Vo")) {
      continue;
    }
    if (masterVoicing.contains(clean)) {
      continue;
    }

    File f = new File(htmlDir, clean + ".html");
    if (!f.exists()) {
      // Retry with circumflexed vowels replaced by their plain forms.
      f = new File(htmlDir, clean.replace('â', 'a').replace('\u00ee', 'i') + ".html");
    }
    if (!f.exists()) {
      System.out.println("Cannot find:" + s);
      continue;
    }

    char c = clean.charAt(clean.length() - 1);
    char vv = c;
    switch (c) {
      case 'k':
        vv = 'ğ';
        break;
      case 'p':
        vv = 'b';
        break;
      case 'ç':
        vv = 'c';
        break;
      case 't':
        vv = 'd';
        break;
      default:
        // Unmapped final letter: report it, then still run the check with the letter unchanged.
        System.out.println("crap:" + s);
    }
    String content = SimpleTextReader.trimmingUTF8Reader(f).asString();
    if (!content.contains("color=DarkBlue>-" + vv)) {
      System.out.println(s);
    }
  }
  System.out.println(items.size());
}
Use of zemberek.core.turkish.TurkishAlphabet in the project zemberek-nlp by ahmetaa.
From the class UnidentifiedTokenAnalyzer, method tryWithoutApostrophe:
/**
 * Analyzes {@code word} by treating its normalized form as a noun dictionary item.
 *
 * <p>Foreign diacritics are converted to Turkish letters (when present) before normalization.
 * A {@link DictionaryItem} is built from the normalized form, capitalized for proper nouns and
 * abbreviations, with a guessed pronunciation computed after dots are stripped. If that
 * pronunciation contains no vowel, a single dummy analysis is returned. Otherwise the normalized
 * form is analyzed; when the lexicon does not already contain the item, it is flagged
 * {@code Runtime} and registered in the analyzer's stem transitions only for the duration of
 * that analysis.
 *
 * @param word the surface form to analyze
 * @param secondaryPos secondary part of speech assigned to the generated item
 * @return the analyses of the normalized word, or a single dummy analysis when the guessed
 *     pronunciation has no vowel
 */
private List<SingleAnalysis> tryWithoutApostrophe(String word, SecondaryPos secondaryPos) {
  TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;

  String converted = alphabet.containsForeignDiacritics(word)
      ? alphabet.foreignDiacriticsToTurkish(word)
      : null;
  String normalized = alphabet.normalize(converted != null ? converted : word);

  boolean capitalize =
      secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation;
  // TODO: should we remove dots with normalization?
  String pronunciation = guessPronunciation(normalized.replace(".", ""));
  DictionaryItem item = new DictionaryItem(
      capitalize ? Turkish.capitalize(normalized) : normalized,
      normalized,
      pronunciation,
      PrimaryPos.Noun,
      secondaryPos);

  if (!alphabet.containsVowel(pronunciation)) {
    List<SingleAnalysis> result = new ArrayList<>(1);
    result.add(SingleAnalysis.dummy(word, item));
    return result;
  }

  if (lexicon.containsItem(item)) {
    return analyzer.analyze(normalized);
  }

  item.attributes.add(RootAttribute.Runtime);
  analyzer.getStemTransitions().addDictionaryItem(item);
  try {
    return analyzer.analyze(normalized);
  } finally {
    // Always unregister the temporary item, even if analysis throws, so it cannot leak
    // into later analyses.
    analyzer.getStemTransitions().removeDictionaryItem(item);
  }
}
Use of zemberek.core.turkish.TurkishAlphabet in the project zemberek-nlp by ahmetaa.
From the class OflazerAnalyzerRunner, method extractRootsFromParse:
/**
 * Collects the distinct root words found in a tab-separated parse list and writes them, sorted,
 * to {@code output}.
 *
 * <p>The second tab-separated column of each line is taken as the parse. When it contains
 * {@code '+'}, the text before the first {@code '+'} is the root and the following segment is
 * the part of speech; verb roots get {@code "mek"} or {@code "mak"} appended depending on
 * whether the root's last vowel is frontal.
 *
 * @param input parse list, one tab-separated entry per line
 * @param output file that receives one root per line
 * @throws IOException if reading or writing fails
 */
public static void extractRootsFromParse(File input, File output) throws IOException {
  System.out.println("Extracting root words from parse list");
  TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
  List<String> lines = SimpleTextReader.trimmingUTF8Reader(input).asStringList();
  Set<String> uniqueRoots = Sets.newHashSet();
  for (String line : lines) {
    List<String> columns = Lists.newArrayList(Splitter.on("\t").split(line));
    String root = columns.get(1);
    if (root.contains("+")) {
      Iterator<String> segments = Splitter.on("+").split(root).iterator();
      root = segments.next();
      boolean isVerb = segments.next().equals("Verb");
      if (isVerb) {
        // Pick the infinitive suffix that matches the root's last vowel.
        String suffix = alphabet.getLastVowel(root).isFrontal() ? "mek" : "mak";
        root = root + suffix;
      }
    }
    uniqueRoots.add(root);
  }
  ArrayList<String> ordered = Lists.newArrayList(uniqueRoots);
  ordered.sort(ctr);
  SimpleTextWriter.oneShotUTF8Writer(output).writeLines(ordered);
}
Use of zemberek.core.turkish.TurkishAlphabet in the project zemberek-nlp by ahmetaa.
From the class OflazerAnalyzerRunner, method extractDictItems:
/**
 * Converts a tab-separated parse list into distinct dictionary-style entries and writes them,
 * sorted, to {@code output}.
 *
 * <p>Every {@code ^DB} marker is removed from a line before its second tab-separated column is
 * read as the parse. A parse without {@code '+'} becomes {@code "<parse> [A:Ext]"}. Otherwise the
 * first segment is the root, the second the part of speech, and the third (when present and
 * contained in {@code secondaryPosSet}) the secondary part of speech. Verb roots get
 * {@code "mek"} or {@code "mak"} appended depending on whether the last vowel is frontal.
 * Nouns and verbs only emit the secondary part of speech; other entries emit the primary one,
 * with {@code Adverb} shortened to {@code Adv}.
 *
 * @param input parse list, one tab-separated entry per line
 * @param output file that receives one entry per line
 * @throws IOException if reading or writing fails
 */
public static void extractDictItems(File input, File output) throws IOException {
  System.out.println("Extracting dict items from parse list");
  TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
  List<String> lines = SimpleTextReader.trimmingUTF8Reader(input).asStringList();
  Set<String> entries = Sets.newHashSet();
  for (String rawLine : lines) {
    String line = rawLine.replaceAll("\\^DB", "");
    List<String> columns = Lists.newArrayList(Splitter.on("\t").split(line));
    String root = columns.get(1);

    if (!root.contains("+")) {
      entries.add(new StringBuilder(root).append(" [A:Ext]").toString());
      continue;
    }

    Iterator<String> segments = Splitter.on("+").split(root).iterator();
    root = segments.next();
    String pos = segments.next();
    String secPos = "";
    if (segments.hasNext()) {
      String candidate = segments.next();
      if (secondaryPosSet.contains(candidate)) {
        secPos = candidate;
      }
    }

    if (pos.equals("Verb")) {
      // Pick the infinitive suffix that matches the root's last vowel.
      root = root + (alphabet.getLastVowel(root).isFrontal() ? "mek" : "mak");
    }

    StringBuilder entry = new StringBuilder();
    entry.append(root);
    boolean nounOrVerb = pos.equals("Noun") || pos.equals("Verb");
    if (nounOrVerb) {
      if (secPos.isEmpty()) {
        entry.append(" [A:Ext]");
      } else {
        entry.append(" [P:").append(secPos).append("; A:Ext]");
      }
    } else {
      String shownPos = pos.equals("Adverb") ? "Adv" : pos;
      entry.append(" [P:").append(shownPos);
      // NOTE(review): this branch requires length > 1 and emits " ," (space before the comma),
      // while the noun/verb branch only requires a non-empty value. Confirm both are intended.
      if (secPos.length() > 1) {
        entry.append(" ,").append(secPos);
      }
      entry.append("; A:Ext]");
    }
    entries.add(entry.toString());
  }
  ArrayList<String> ordered = Lists.newArrayList(entries);
  ordered.sort(ctr);
  SimpleTextWriter.oneShotUTF8Writer(output).writeLines(ordered);
}
Use of zemberek.core.turkish.TurkishAlphabet in the project zemberek-nlp by ahmetaa.
From the class DictionaryOperations, method saveRegular:
/**
 * Writes the lemmas of the default morphology's lexicon to the file {@code zemberek.vocab}.
 *
 * <p>Items carrying the {@code Dummy} attribute and punctuation items are left out. For a lemma
 * that contains a circumflex, its circumflex-normalized form is written as well. Entries are
 * de-duplicated and sorted with {@code Turkish.STRING_COMPARATOR_ASC}, one per line.
 *
 * @throws IOException if the vocabulary file cannot be written
 */
public static void saveRegular() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
  Set<String> vocabulary = new HashSet<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    // Proper nouns and abbreviations are not filtered out; only dummy and punctuation items are.
    boolean excluded = item.attributes.contains(RootAttribute.Dummy)
        || item.primaryPos == PrimaryPos.Punctuation;
    if (excluded) {
      continue;
    }
    String lemma = item.lemma;
    vocabulary.add(lemma);
    if (alphabet.containsCircumflex(lemma)) {
      vocabulary.add(alphabet.normalizeCircumflex(lemma));
    }
  }
  List<String> sortedLemmas = new ArrayList<>(vocabulary);
  sortedLemmas.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.vocab"), sortedLemmas);
}
Aggregations