use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class GenerateDataWithRules method extractHighlyAmbigiousWordSentences.
/**
 * Collects words whose analyses contain more than {@code minCount} distinct stems and writes
 * each collected word, followed by the long form of all of its analyses, to a file named
 * {@code <inputRoot file name>-amb.txt} under {@code outRoot}.
 *
 * <p>Only regular files directly under {@code inputRoot} (walk depth 1) are read. After each
 * file is fully processed, the histogram size is compared against {@code wordCount}; once it is
 * exceeded no further files are read.
 *
 * @param inputRoot directory containing the sentence files
 * @param outRoot directory in which the output file is created
 * @param minCount a word is collected as soon as its number of distinct stems exceeds this value
 * @param wordCount stop reading further files once the histogram holds more entries than this
 * @throws IOException if the input directory cannot be listed, or reading/writing fails
 */
private void extractHighlyAmbigiousWordSentences(Path inputRoot, Path outRoot, int minCount, int wordCount) throws IOException {
  List<Path> files;
  // Files.walk keeps a directory handle open until the stream is closed, so close it explicitly.
  try (java.util.stream.Stream<Path> walk = Files.walk(inputRoot, 1)) {
    files = walk.filter(s -> s.toFile().isFile()).collect(Collectors.toList());
  }
  Histogram<WordAnalysis> wordAnalyses = new Histogram<>();
  for (Path file : files) {
    Log.info("Processing %s", file);
    LinkedHashSet<String> sentences = getSentences(file);
    // Sentences are handled in chunks of 5000 so that progress can be logged per chunk.
    List<List<String>> group = group(new ArrayList<>(sentences), 5000);
    for (List<String> lines : group) {
      Log.info("Collected %d words.", wordAnalyses.size());
      LinkedHashSet<String> toProcess = getAccpetableSentences(lines);
      for (String sentence : toProcess) {
        try {
          SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(sentence);
          for (SentenceWordAnalysis analysis : sentenceAnalysis) {
            HashSet<String> stems = new HashSet<>(4);
            for (SingleAnalysis s : analysis.getWordAnalysis()) {
              stems.add(s.getStem());
              // Enough distinct stems seen: record the word once and move to the next word.
              if (stems.size() > minCount) {
                wordAnalyses.add(analysis.getWordAnalysis());
                break;
              }
            }
          }
        } catch (Exception e) {
          // Best effort: a failing sentence is skipped, but the cause is reported.
          Log.warn("Error in sentence %s : %s", sentence, e);
        }
      }
    }
    if (wordAnalyses.size() > wordCount) {
      break;
    }
  }
  String s = inputRoot.toFile().getName();
  Path amb = outRoot.resolve(s + "-amb.txt");
  try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
    // One record per word: the input, its analyses, then a blank separator line.
    for (WordAnalysis wa : wordAnalyses.getSortedList()) {
      pwa.println(wa.getInput());
      for (SingleAnalysis analysis : wa) {
        pwa.println(analysis.formatLong());
      }
      pwa.println();
    }
  }
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method readmeExample2.
/**
 * README sample: analyzes the single word "kitabımızsa" with a default morphology instance and
 * prints, for every analysis, its long format followed by its stems and lemmas.
 *
 * @throws IOException declared by the method signature
 */
@Test
@Ignore("Not a Test")
public void readmeExample2() throws IOException {
  WordAnalysis analyses = TurkishMorphology.createWithDefaults().analyze("kitabımızsa");
  analyses.forEach(single -> {
    System.out.println(single.formatLong());
    System.out.println("\tStems = " + single.getStems());
    System.out.println("\tLemmas = " + single.getLemmas());
  });
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method ambiguousWords.
/**
 * Reads up to the first 100,000 lines of {@code zemberek-parses.txt} from the {@code out}
 * directory under {@code DATA_PATH}, analyzes each line as a word, and collects the words whose
 * analyses contain more than one distinct stem.
 *
 * <p>Two files are written to the same directory: {@code zemberek-ambigious-analyses.txt} (each
 * word followed by the long form of its analyses and a blank line) and
 * {@code zemberek-ambigious-words.txt} (the collected words, one per line).
 *
 * @throws IOException if the output directory cannot be created or a file cannot be read/written
 */
@Test
@Ignore("Not a Test.")
public void ambiguousWords() throws IOException {
  Path outDir = DATA_PATH.resolve("out");
  Files.createDirectories(outDir);
  Path correct = outDir.resolve("zemberek-parses.txt");
  Path outAmbAn = outDir.resolve("zemberek-ambigious-analyses.txt");
  Path outAmbWord = outDir.resolve("zemberek-ambigious-words.txt");
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> allWords = Files.readAllLines(correct);
  // Cap at 100,000 entries without failing when the file is shorter than that.
  List<String> words = allWords.subList(0, Math.min(allWords.size(), 100_000));
  List<String> ambWords = new ArrayList<>();
  List<WordAnalysis> amb = new ArrayList<>();
  for (String word : words) {
    WordAnalysis analysis = morphology.analyze(word);
    // Words that are not analyzed correctly, or have a single analysis, are skipped.
    if (!analysis.isCorrect() || analysis.analysisCount() == 1) {
      continue;
    }
    HashSet<String> stems = new HashSet<>(4);
    for (SingleAnalysis s : analysis) {
      stems.add(s.getStem());
      // A second distinct stem is enough; record the word once and stop scanning.
      if (stems.size() > 1) {
        amb.add(analysis);
        ambWords.add(word);
        break;
      }
    }
  }
  Log.info("Writing %d words", amb.size());
  try (PrintWriter pwa = new PrintWriter(outAmbAn.toFile(), "utf-8")) {
    for (WordAnalysis wa : amb) {
      pwa.println(wa.getInput());
      for (SingleAnalysis analysis : wa) {
        pwa.println(analysis.formatLong());
      }
      pwa.println();
    }
  }
  Files.write(outAmbWord, ambWords, StandardCharsets.UTF_8);
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method disambiguationExample.
/**
 * Demo: analyzes the sentence "Yarın kar yağacak.", prints every analysis of every word, then
 * disambiguates the sentence and prints the long form of each best analysis.
 *
 * @throws IOException declared by the method signature
 */
@Test
@Ignore("Not a Test")
public void disambiguationExample() throws IOException {
  String sentence = "Yarın kar yağacak.";
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  System.out.println("Sentence = " + sentence);

  // All candidate analyses, word by word.
  List<WordAnalysis> wordAnalyses = morphology.analyzeSentence(sentence);
  System.out.println("Before disambiguation.");
  for (WordAnalysis word : wordAnalyses) {
    System.out.println("Word = " + word.getInput());
    word.forEach(candidate -> System.out.println(candidate.formatLong()));
  }

  // The analyses selected by disambiguation.
  System.out.println("\nAfter disambiguation.");
  SentenceAnalysis disambiguated = morphology.disambiguate(sentence, wordAnalyses);
  disambiguated.bestAnalysis().forEach(best -> System.out.println(best.formatLong()));
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method failedWordTestIssue124.
/**
 * Re-analyzes the distinct lines of {@code fails.txt} under {@code DATA_PATH} and writes the
 * ones that still have no accepted analysis to {@code fails-reduced.txt}.
 *
 * <p>A word is accepted when at least one of its analyses is neither unknown nor runtime,
 * according to {@code isUnknown()} and {@code isRuntime()}.
 *
 * @throws IOException if the input file cannot be read or the output file cannot be written
 */
@Test
@Ignore("Not a Test.")
public void failedWordTestIssue124() throws IOException {
  Path failPath = DATA_PATH.resolve("fails.txt");
  LinkedHashSet<String> words = new LinkedHashSet<>(Files.readAllLines(failPath, StandardCharsets.UTF_8));
  // Remember the original size; 'words' is reduced below and the summary needs the total.
  int totalCount = words.size();
  LinkedHashSet<String> accepted = new LinkedHashSet<>();
  TurkishMorphology parser = TurkishMorphology.createWithDefaults();
  for (String s : words) {
    WordAnalysis parses = parser.analyze(s);
    List<SingleAnalysis> analyses = parses.getAnalysisResults();
    for (SingleAnalysis parse : analyses) {
      if (!parse.isUnknown() && !parse.isRuntime()) {
        accepted.add(s);
        // One acceptable analysis is enough; no need to inspect the rest.
        break;
      }
    }
  }
  // What remains in 'words' are the entries that were not accepted.
  words.removeAll(accepted);
  Path failReduced = DATA_PATH.resolve("fails-reduced.txt");
  try (PrintWriter pw = new PrintWriter(failReduced.toFile(), "utf-8")) {
    words.forEach(pw::println);
  }
  Log.info("Word count = %d Found = %d Not Found = %d", totalCount, accepted.size(), words.size());
}
Aggregations