use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class TurkishMorphologyFunctionalTests method testEmail.
/**
 * Analyzes an e-mail address followed by an apostrophe and a suffix and checks that
 * exactly one analysis is produced, that its dictionary item is tagged as an e-mail,
 * that the lexical form ends with {@code A3sg+Dat}, and that the lemma is the bare address.
 */
@Test
public void testEmail() {
  WordAnalysis analyses = getMorphology().analyze("foo@bar.com'a");

  // The input must be unambiguous.
  Assert.assertEquals(1, analyses.analysisCount());

  SingleAnalysis only = analyses.getAnalysisResults().get(0);
  Assert.assertEquals(SecondaryPos.Email, only.getDictionaryItem().secondaryPos);
  Assert.assertTrue(only.formatLexical().endsWith("A3sg+Dat"));
  // The apostrophe and suffix must not leak into the lemma.
  Assert.assertEquals("foo@bar.com", only.getDictionaryItem().lemma);
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class TurkishMorphologyFunctionalTests method testAsciiTolerantMorphology.
/**
 * Checks the analyzer obtained from {@code getAsciiTolerantMorphology(...)}: inputs whose
 * Turkish-specific letters are replaced with look-alike letters (or vice versa) must still
 * yield analyses whose dictionary lemmas include the expected lexicon entries.
 */
@Test
public void testAsciiTolerantMorphology() {
  // Small lexicon; "sıra" and "şıra" both collapse to "sira" when written without diacritics.
  RuleBasedAnalyzer tolerant =
      getAsciiTolerantMorphology("sıra", "şıra", "armut", "kazan", "ekonomik [P:Adj]", "insan")
          .getAnalyzer();

  Assert.assertTrue(containsAllDictionaryLemma(tolerant.analyze("ekonomık"), "ekonomik"));

  // The ambiguous form must produce exactly two analyses, one per lexicon entry.
  List<SingleAnalysis> siraAnalyses = tolerant.analyze("sira");
  Assert.assertEquals(2, siraAnalyses.size());
  Assert.assertTrue(containsAllDictionaryLemma(siraAnalyses, "sıra", "şıra"));

  Assert.assertTrue(containsAllDictionaryLemma(tolerant.analyze("siraci"), "sıra", "şıra"));
  Assert.assertTrue(containsAllDictionaryLemma(tolerant.analyze("armutcuga"), "armut"));
  Assert.assertTrue(containsAllDictionaryLemma(tolerant.analyze("kazancıga"), "kazan"));
  Assert.assertTrue(containsAllDictionaryLemma(tolerant.analyze("kazanciga"), "kazan"));
  Assert.assertTrue(containsAllDictionaryLemma(tolerant.analyze("kazançiğimizdan"), "kazan"));
  Assert.assertTrue(containsAllDictionaryLemma(tolerant.analyze("ınsanların"), "insan"));
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method parseLargeVocabularyZemberek.
/**
 * Utility script (ignored as a test): loads a word histogram from
 * {@code all-counts-sorted-freq.txt} under {@code DATA_PATH}, analyzes every word with a
 * default {@link TurkishMorphology} instance and keeps the words whose first analysis has a
 * dictionary item with a primary POS other than {@code PrimaryPos.Unknown}.
 *
 * <p>The accepted words are written to {@code out/zemberek-parsed-words.txt} via
 * {@code save} and to {@code out/zemberek-parsed-words.tr.txt} via {@code sortAndSave}.
 *
 * @throws IOException if the output directory cannot be created or file access fails
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberek() throws IOException {
  // Alternative input: DATA_PATH.resolve("vocab.all.freq")
  Path wordFreqFile = DATA_PATH.resolve("all-counts-sorted-freq.txt");
  Path outDir = DATA_PATH.resolve("out");
  Files.createDirectories(outDir);
  TurkishMorphology parser = TurkishMorphology.createWithDefaults();
  Log.info("Loading histogram.");
  Histogram<String> histogram = Histogram.loadFromUtf8File(wordFreqFile, ' ');
  List<String> accepted = new ArrayList<>(histogram.size() / 3);
  // Counts successfully analyzed words only; a word that throws is not counted.
  int c = 0;
  for (String s : histogram) {
    try {
      WordAnalysis parses = parser.analyze(s);
      List<SingleAnalysis> analyses = parses.getAnalysisResults();
      if (!analyses.isEmpty()
          && analyses.get(0).getDictionaryItem().primaryPos != PrimaryPos.Unknown) {
        accepted.add(s);
      }
      if (c > 0 && c % 10000 == 0) {
        Log.info("Processed = " + c);
      }
      c++;
    } catch (Exception e) {
      // Best effort: one bad word must not abort the whole run, but keep the cause
      // in the log so the failure can be diagnosed afterwards.
      Log.info("Exception in %s : %s", s, e);
    }
  }
  save(outDir.resolve("zemberek-parsed-words.txt"), accepted);
  sortAndSave(outDir.resolve("zemberek-parsed-words.tr.txt"), accepted);
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method guessRootsWithHeuristics.
/**
 * Utility script (ignored as a test): guesses possible roots for the words listed in
 * {@code out/no-parse-zemberek-freq.txt} under {@code DATA_PATH}.
 *
 * <p>For every word of at least four characters that contains a vowel, each prefix of
 * length three or more (that also contains a vowel) is tried as a candidate root. The
 * candidate is temporarily added to the analyzer's stem transitions in three forms: as a
 * plain dictionary line, as a {@code [P:Verb]} line and, when the candidate ends in
 * b/c/ğ/d and the next letter of the word is a vowel, with that last letter replaced by
 * p/ç/k/t. If the word then gets a non-unknown analysis, the (candidate root, word) pair
 * is recorded.
 *
 * <p>Roots that explain at least two words are written to
 * {@code out/root-candidates-words} (root plus its words, sorted with
 * {@code turkishCollator}) and {@code out/root-candidates-vocabulary} (root only).
 * Processing stops after 100000 eligible words.
 *
 * @throws IOException if the input cannot be read or the outputs cannot be written
 */
@Test
@Ignore("Not a Test.")
public void guessRootsWithHeuristics() throws IOException {
  Path wordFreqFile = DATA_PATH.resolve("out/no-parse-zemberek-freq.txt");
  Log.info("Loading histogram.");
  List<String> words = Files.readAllLines(wordFreqFile);
  // Minimal lexicon; the cache is disabled because stems are added and removed per candidate.
  TurkishMorphology morphology =
      TurkishMorphology.builder().setLexicon("elma").disableCache().build();
  Multimap<String, String> res = HashMultimap.create(100000, 3);
  int c = 0;
  for (String s : words) {
    if (s.length() < 4) {
      continue;
    }
    if (!TurkishAlphabet.INSTANCE.containsVowel(s)) {
      continue;
    }
    for (int i = 2; i < s.length(); i++) {
      String candidateRoot = s.substring(0, i + 1);
      if (!TurkishAlphabet.INSTANCE.containsVowel(candidateRoot)) {
        continue;
      }
      List<DictionaryItem> items = new ArrayList<>(3);
      // Plain dictionary line (the original comment here said "assumes noun").
      items.add(TurkishDictionaryLoader.loadFromString(candidateRoot));
      // Same root, explicitly marked as a verb.
      items.add(TurkishDictionaryLoader.loadFromString(candidateRoot + " [P:Verb]"));
      // If a vowel follows the candidate, also try the root with its final
      // b/c/ğ/d replaced by p/ç/k/t.
      if (i < s.length() - 1 && Turkish.Alphabet.isVowel(s.charAt(i + 1))) {
        String head = candidateRoot.substring(0, candidateRoot.length() - 1);
        char last = candidateRoot.charAt(candidateRoot.length() - 1);
        String f = "";
        switch (last) {
          case 'b':
            f = head + 'p';
            break;
          case 'c':
            f = head + 'ç';
            break;
          case 'ğ':
            f = head + 'k';
            break;
          case 'd':
            f = head + 't';
            break;
          default:
            break;
        }
        if (f.length() > 0) {
          items.add(TurkishDictionaryLoader.loadFromString(f));
        }
      }
      for (DictionaryItem item : items) {
        morphology.getMorphotactics().getStemTransitions().addDictionaryItem(item);
        try {
          for (SingleAnalysis wordAnalysis : morphology.analyze(s)) {
            if (!wordAnalysis.isUnknown()) {
              res.put(candidateRoot, s);
              // Further hits would only re-insert the same pair.
              break;
            }
          }
        } finally {
          // Always undo the temporary stem so a failure cannot pollute later candidates.
          morphology.getMorphotactics().getStemTransitions().removeDictionaryItem(item);
        }
      }
    }
    if (++c % 10000 == 0) {
      Log.info(c);
    }
    if (c == 100000) {
      break;
    }
  }
  Log.info("Writing.");
  // Write explicitly as UTF-8: the data is Turkish text and the input was read as UTF-8.
  try (PrintWriter pw1 =
          new PrintWriter(DATA_PATH.resolve("out/root-candidates-words").toFile(), "UTF-8");
      PrintWriter pw2 =
          new PrintWriter(DATA_PATH.resolve("out/root-candidates-vocabulary").toFile(), "UTF-8")) {
    for (String root : res.keySet()) {
      Collection<String> vals = res.get(root);
      if (vals.size() < 2) {
        continue;
      }
      List<String> wl = new ArrayList<>(vals);
      wl.sort(turkishCollator::compare);
      // Emit the sorted copy; the original joined the unsorted collection by mistake.
      pw1.println(root + " : " + String.join(", ", wl));
      pw2.println(root);
    }
  }
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class LoadProperNouns method main.
/**
 * Builds a list of frequent proper-noun candidates that the default analyzer does not
 * already know as non-runtime proper nouns.
 *
 * <p>Each input line is expected to be {@code <token> <count>}. Lines starting with
 * {@code _} and blank lines are skipped. The first character of the token is dropped and
 * the rest is capitalized with {@code Turkish.capitalize}. A word is kept only if its count
 * is at least 50, it is not in the ignore list, it has no analysis whose dictionary item is
 * a {@code SecondaryPos.ProperNoun} without {@code RootAttribute.Runtime}, and it is at
 * least four characters long. After accumulation, {@code histogram.removeSmaller(165)} is
 * applied and the sorted result is written to the file {@code proper}.
 *
 * @param args unused
 * @throws IOException if an input file cannot be read or the output cannot be written
 */
public static void main(String[] args) throws IOException {
  TurkishMorphology parserGenerator = TurkishMorphology.createWithDefaults();
  List<String> lines = Files.readAllLines(Paths.get("/home/afsina/Downloads/documents-export-2016-02-17/vocabulary-proper-full.tr.txt"));
  Histogram<String> histogram = new Histogram<>();
  Set<String> ignore = new HashSet<>(Files.readAllLines(Paths.get("morphology/src/main/resources/tr/proper-ignore")));
  for (String line : lines) {
    if (line.startsWith("_")) {
      continue;
    }
    line = line.trim();
    if (line.length() == 0) {
      continue;
    }
    String word = Strings.subStringUntilFirst(line, " ");
    // Trim so that extra spaces between the token and the count do not break parsing.
    int count = Integer.parseInt(Strings.subStringAfterFirst(line, " ").trim());
    // The leading character of the token is discarded before capitalizing.
    word = Turkish.capitalize(word.substring(1));
    if (count < 50) {
      continue;
    }
    if (ignore.contains(word)) {
      continue;
    }
    WordAnalysis parses = parserGenerator.analyze(word);
    boolean found = false;
    for (SingleAnalysis parse : parses) {
      // Enum identity comparison; unlike equals() it cannot throw on a null secondaryPos.
      if (parse.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun
          && !parse.getDictionaryItem().hasAttribute(RootAttribute.Runtime)) {
        found = true;
        break;
      }
    }
    parserGenerator.invalidateCache();
    if (found) {
      continue;
    }
    if (word.length() < 4) {
      continue;
    }
    histogram.add(word, count);
  }
  histogram.removeSmaller(165);
  // Write explicitly as UTF-8: the words are Turkish and the inputs were read as UTF-8.
  try (PrintWriter pw = new PrintWriter("proper", "UTF-8")) {
    histogram.getSortedList(Turkish.STRING_COMPARATOR_ASC).forEach(pw::println);
  }
}
Aggregations