use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class RuleBasedDisambiguatorTest method test.
/**
 * Exploratory run of the rule-based disambiguator on one hard-coded sentence.
 *
 * <p>Prints the sentence-level {@code allIgnoredCount()} first, followed by every line of the
 * training-format output of each ambiguity analysis. Nothing is asserted; the output is meant
 * to be inspected by hand.
 *
 * <p>Other sample sentences that can be substituted for {@code sentence}:
 * <pre>
 *   ABD Açık Serena Williams'ın
 *   Çünkü birbirine tezat oluşturuyor.
 *   O anda gördüm.
 *   Aklımıza ilk gelen emeği öncelemek.
 *   Petrolün Türkiye üzerinden dünya pazarına satılması.
 *   Sadece partimi iktidar yaptım.
 * </pre>
 *
 * @throws IOException declared by the signature; presumably raised while loading morphology
 *     or rule resources (the loaders are not visible here)
 */
@Test
public void test() throws IOException {
  final String sentence = "4 Neden önemli?";

  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  // Rules come from the bundled resources; a hand-built rule set (e.g. one containing
  // PairRule.fromLine("Aklı*|aklı* [akıl:Noun] *")) could be passed here instead.
  RuleBasedDisambiguator ruleDisambiguator =
      new RuleBasedDisambiguator(morphology, Rules.fromResources());

  ResultSentence outcome = ruleDisambiguator.disambiguate(sentence);
  System.out.println(outcome.allIgnoredCount());

  for (AmbiguityAnalysis ambiguity : outcome.results) {
    ambiguity.getForTrainingOutput().forEach(line -> System.out.println(line));
  }
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method findZemberekMissingOrDifferent.
/**
 * Script-style routine (ignored as a test) that lists dictionary entries present in
 * {@code dictionary-from-analysis.txt} but absent from the default Zemberek lexicon.
 *
 * <p>Steps:
 * <ol>
 *   <li>Load the external entries, dropping every line that contains {@code "Prop"}.</li>
 *   <li>Render each lexicon item as {@code "<lemma> [P:<primary>]"} or
 *       {@code "<lemma> [P:<primary>,<secondary>]"}, after circumflex normalization of the
 *       lemma. A plain Noun entry is additionally emitted as Adj, and a plain Adj entry as
 *       Noun, so the two are treated as interchangeable in the comparison.</li>
 *   <li>Sort the rendered entries with {@code turkishCollator} and write them to
 *       {@code found-in-zemberek}.</li>
 *   <li>Write the external entries that have no rendered counterpart to
 *       {@code not-found-in-zemberek}.</li>
 * </ol>
 *
 * @throws IOException if reading the input or writing either output file fails
 */
@Test
@Ignore("Not a Test.")
public void findZemberekMissingOrDifferent() throws IOException {
  Path outDir = DATA_PATH.resolve("out");

  // Insertion-ordered so the "not found" report keeps the order of the input file.
  LinkedHashSet<String> externalEntries =
      TextIO.loadLines(outDir.resolve("dictionary-from-analysis.txt"))
          .stream()
          .filter(line -> !line.contains("Prop"))
          .collect(Collectors.toCollection(LinkedHashSet::new));

  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> zemberekEntries = new ArrayList<>(morphology.getLexicon().size());

  for (DictionaryItem entry : morphology.getLexicon()) {
    String normalizedLemma = TurkishAlphabet.INSTANCE.normalizeCircumflex(entry.lemma);
    String primary = entry.primaryPos.shortForm;

    boolean lacksSecondary = entry.secondaryPos == null
        || entry.secondaryPos == SecondaryPos.UnknownSec
        || entry.secondaryPos == SecondaryPos.None;

    String posTag;
    if (lacksSecondary) {
      posTag = "[P:" + primary + "]";
    } else {
      posTag = "[P:" + primary + "," + entry.secondaryPos.shortForm + "]";
    }

    zemberekEntries.add(normalizedLemma + " " + posTag);

    // Plain nouns and plain adjectives are cross-listed under the other tag as well.
    if (posTag.equals("[P:Noun]")) {
      zemberekEntries.add(normalizedLemma + " [P:Adj]");
    } else if (posTag.equals("[P:Adj]")) {
      zemberekEntries.add(normalizedLemma + " [P:Noun]");
    }
  }

  zemberekEntries.sort(turkishCollator::compare);
  Files.write(outDir.resolve("found-in-zemberek"), zemberekEntries);

  externalEntries.removeAll(new LinkedHashSet<>(zemberekEntries));
  Files.write(outDir.resolve("not-found-in-zemberek"), externalEntries);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method memoryStressTest.
/**
 * Script-style stress run (ignored as a test): analyzes every line of the file {@code dunya}
 * one hundred times with a morphology built from the default dictionary resources.
 *
 * <p>After each pass it logs the elapsed milliseconds and the analyzer's {@code toString()};
 * at the end it logs the accumulated analysis count over all passes.
 *
 * @throws IOException if the word file cannot be read (the lexicon loading may also raise it;
 *     that code is not visible here)
 */
@Test
@Ignore("Not a Test.")
public void memoryStressTest() throws IOException {
  List<String> tokens = Files.readAllLines(Paths.get("dunya"));

  TurkishMorphology morphology = TurkishMorphology.builder()
      .setLexicon(RootLexicon.fromResources(TurkishDictionaryLoader.DEFAULT_DICTIONARY_RESOURCES))
      .build();

  int totalAnalyses = 0;
  for (int pass = 0; pass < 100; pass++) {
    Stopwatch timer = Stopwatch.createStarted();
    for (String token : tokens) {
      totalAnalyses += morphology.analyze(token).analysisCount();
    }
    Log.info(timer.elapsed(TimeUnit.MILLISECONDS));
    Log.info(morphology.toString());
  }

  // Logging the running total also keeps the analysis results observably "used".
  Log.info(totalAnalyses);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method readmeExample2.
/**
 * README sample (ignored as a test): analyzes the word {@code "kitabımızsa"} with the default
 * morphology and prints, for each analysis, its long format followed by tab-indented lines
 * for its stems and lemmas.
 *
 * @throws IOException declared by the signature; nothing in the visible body throws it
 *     directly
 */
@Test
@Ignore("Not a Test")
public void readmeExample2() throws IOException {
  TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
  WordAnalysis analyses = analyzer.analyze("kitabımızsa");

  for (SingleAnalysis single : analyses) {
    System.out.println(single.formatLong());

    String stemsLine = "\tStems = " + single.getStems();
    System.out.println(stemsLine);

    String lemmasLine = "\tLemmas = " + single.getLemmas();
    System.out.println(lemmasLine);
  }
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method ambiguousWords.
/**
 * Script-style routine (ignored as a test) that collects words whose analyses yield more than
 * one distinct stem and writes them out.
 *
 * <p>Input: up to the first 100,000 lines of {@code zemberek-parses.txt} in the {@code out}
 * directory under {@code DATA_PATH}. A file with fewer lines is processed in full instead of
 * failing with an {@link IndexOutOfBoundsException}.
 *
 * <p>Output (UTF-8):
 * <ul>
 *   <li>{@code zemberek-ambigious-analyses.txt}: for each selected word, the analyzed input,
 *       one line per analysis in long format, then a blank line;</li>
 *   <li>{@code zemberek-ambigious-words.txt}: the selected words, one per line.</li>
 * </ul>
 *
 * @throws IOException if the output directory cannot be created, the input cannot be read,
 *     or an output file cannot be opened or written
 */
@Test
@Ignore("Not a Test.")
public void ambiguousWords() throws IOException {
  Path outDir = DATA_PATH.resolve("out");
  Files.createDirectories(outDir);
  Path correct = outDir.resolve("zemberek-parses.txt");
  Path outAmbAn = outDir.resolve("zemberek-ambigious-analyses.txt");
  Path outAmbWord = outDir.resolve("zemberek-ambigious-words.txt");
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  // Clamp the upper bound: subList(0, 100_000) throws when the file is shorter than that.
  final int maxWords = 100_000;
  List<String> allWords = Files.readAllLines(correct);
  List<String> words = allWords.subList(0, Math.min(maxWords, allWords.size()));

  List<String> ambWords = new ArrayList<>();
  List<WordAnalysis> amb = new ArrayList<>();
  for (String word : words) {
    WordAnalysis analysis = morphology.analyze(word);
    // Skip words not reported as correct and words with exactly one analysis.
    if (!analysis.isCorrect() || analysis.analysisCount() == 1) {
      continue;
    }
    HashSet<String> stems = new HashSet<>(4);
    for (SingleAnalysis s : analysis) {
      stems.add(s.getStem());
      // A second distinct stem is enough to select the word; no need to look further.
      if (stems.size() > 1) {
        amb.add(analysis);
        ambWords.add(word);
        break;
      }
    }
  }

  Log.info("Writing %d words", amb.size());
  try (PrintWriter pwa = new PrintWriter(outAmbAn.toFile(), "utf-8")) {
    for (WordAnalysis wa : amb) {
      pwa.println(wa.getInput());
      for (SingleAnalysis analysis : wa) {
        pwa.println(analysis.formatLong());
      }
      pwa.println();
    }
  }
  Files.write(outAmbWord, ambWords, StandardCharsets.UTF_8);
}
Aggregations