use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.
the class Scripts method saveUnambiguous.
/**
 * Writes the morphological analyses of the given sentences to {@code out} (UTF-8).
 *
 * <p>A sentence is skipped entirely when any of its best analyses is unknown. For every kept
 * sentence the output contains an {@code S:<sentence>} line, then for each word its input
 * followed by one line per candidate analysis in long format, and finally an empty line.
 * When a word has more than one candidate, the candidate equal to the best analysis is
 * suffixed with {@code *}.
 *
 * @param sentences sentences to analyze and disambiguate
 * @param morphology analyzer used to analyze and disambiguate each sentence
 * @param out file that receives the output
 * @throws IOException if the output file cannot be opened
 */
public static void saveUnambiguous(List<String> sentences, TurkishMorphology morphology, Path out) throws IOException {
  try (PrintWriter writer = new PrintWriter(out.toFile(), "utf-8")) {
    for (String line : sentences) {
      SentenceAnalysis disambiguated = morphology.analyzeAndDisambiguate(line);
      boolean hasUnknown =
          disambiguated.bestAnalysis().stream().anyMatch(SingleAnalysis::isUnknown);
      if (!hasUnknown) {
        writer.format("S:%s%n", line);
        for (SentenceWordAnalysis entry : disambiguated) {
          WordAnalysis candidates = entry.getWordAnalysis();
          writer.println(candidates.getInput());
          SingleAnalysis chosen = entry.getBestAnalysis();
          // The marker is only meaningful when there is something to choose between.
          boolean ambiguous = candidates.analysisCount() != 1;
          for (SingleAnalysis candidate : candidates) {
            String marker = ambiguous && candidate.equals(chosen) ? "*" : "";
            writer.println(candidate.formatLong() + marker);
          }
        }
        writer.println();
      }
    }
  }
}
use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.
the class PerceptronAmbiguityResolver method disambiguate.
/**
 * Builds a {@link SentenceAnalysis} by pairing every word's analyses with the analysis found
 * at the same index in the decoder's best path.
 *
 * @param sentence the original sentence text, passed through to the result
 * @param allAnalyses analyses of each word, in sentence order
 * @return the sentence analysis holding one (best analysis, word analysis) pair per word
 */
@Override
public SentenceAnalysis disambiguate(String sentence, List<WordAnalysis> allAnalyses) {
  DecodeResult decoded = decoder.bestPath(allAnalyses);
  List<SentenceWordAnalysis> paired = new ArrayList<>(allAnalyses.size());
  int position = 0;
  for (WordAnalysis candidates : allAnalyses) {
    // bestParse is read by position, so it is expected to line up with allAnalyses.
    paired.add(new SentenceWordAnalysis(decoded.bestParse.get(position), candidates));
    position++;
  }
  return new SentenceAnalysis(sentence, paired);
}
use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.
the class ClassificationConsole method replaceWordsWithLemma.
/**
 * Rewrites a sentence so that each analyzed word is replaced by the last lemma of its best
 * analysis. Words whose best analysis is unknown are kept as their original input.
 *
 * @param sentence sentence to analyze and disambiguate
 * @return the replacement tokens joined with single spaces
 */
private String replaceWordsWithLemma(String sentence) {
  List<String> normalized = new ArrayList<>();
  for (SentenceWordAnalysis word : morphology.analyzeAndDisambiguate(sentence)) {
    SingleAnalysis chosen = word.getBestAnalysis();
    String replacement;
    if (chosen.isUnknown()) {
      // No usable analysis: fall back to the surface form.
      replacement = word.getWordAnalysis().getInput();
    } else {
      List<String> lemmas = chosen.getLemmas();
      replacement = lemmas.get(lemmas.size() - 1);
    }
    normalized.add(replacement);
  }
  return String.join(" ", normalized);
}
use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.
the class QuestionClassifier method replaceWordsWithLemma.
/**
 * Lemmatizes a labeled line. The first space-separated token is treated as the label and is
 * excluded from morphological analysis; every remaining word is replaced by the last lemma of
 * its best analysis, or kept as its original input when that analysis is unknown.
 *
 * @param sentence line whose first token is assumed to be the label
 * @return the label followed by the replacement tokens, joined with single spaces; an empty
 *     string when nothing follows the label
 */
private String replaceWordsWithLemma(String sentence) {
  List<String> parts = Splitter.on(" ").splitToList(sentence);
  // Assume the first token is the label; only the remainder is analyzed.
  String label = parts.get(0);
  String body = String.join(" ", parts.subList(1, parts.size()));
  if (body.isEmpty()) {
    // NOTE(review): the label is not part of the returned value here - confirm callers
    // rely on an empty result for label-only lines.
    return body;
  }

  List<String> normalized = new ArrayList<>();
  // The label always leads the output.
  normalized.add(label);
  for (SentenceWordAnalysis word : morphology.analyzeAndDisambiguate(body)) {
    SingleAnalysis chosen = word.getBestAnalysis();
    String replacement;
    if (chosen.isUnknown()) {
      replacement = word.getWordAnalysis().getInput();
    } else {
      List<String> lemmas = chosen.getLemmas();
      replacement = lemmas.get(lemmas.size() - 1);
    }
    normalized.add(replacement);
  }
  return String.join(" ", normalized);
}
use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.
the class AmbiguousExampleFinder method extractSentences.
/**
 * Collects example sentences for the first (at most) 100 words listed in
 * {@code data/ambiguity/zemberek-ambigious-words.txt} and writes two UTF-8 files:
 * <ul>
 *   <li>{@code data/ambiguity/sentences.txt}: each word, its sentences, then an empty line;</li>
 *   <li>{@code data/ambiguity/sentences.morph.txt}: for every sentence without unknown
 *       analyses, an {@code S:<sentence>} line, then each word's input followed by its
 *       candidate analyses in long format. When a word has more than one candidate, the one
 *       equal to the best analysis is suffixed with {@code *}.</li>
 * </ul>
 *
 * @param morphology analyzer used to analyze and disambiguate each sentence
 * @param finder source of example sentences for a word
 * @throws Exception if the word list cannot be read or an output file cannot be opened
 */
private static void extractSentences(TurkishMorphology morphology, AmbiguousExampleFinder finder) throws Exception {
  final int maxWords = 100;
  List<String> allWords = Files.readAllLines(
      Paths.get("data/ambiguity/zemberek-ambigious-words.txt"), StandardCharsets.UTF_8);
  // Clamp the upper bound: subList(0, 100) throws when the file has fewer than 100 lines.
  List<String> ambiguousWords = allWords.subList(0, Math.min(maxWords, allWords.size()));
  Path out = Paths.get("data/ambiguity/sentences.txt");
  Path morph = Paths.get("data/ambiguity/sentences.morph.txt");
  try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8");
      PrintWriter pwMorph = new PrintWriter(morph.toFile(), "utf-8")) {
    for (String word : ambiguousWords) {
      Log.info(word);
      // NOTE(review): the meaning of the numeric arguments (3, 5, 10) is defined by
      // AmbiguousExampleFinder.getSentences - confirm there before changing them.
      List<String> sentences = finder.getSentences(word, 3, 5, 10);
      pw.println(word);
      sentences.forEach(pw::println);
      pw.println();
      for (String sentence : sentences) {
        SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
        if (containsUnkown(analysis)) {
          continue;
        }
        pwMorph.format("S:%s%n", sentence);
        for (SentenceWordAnalysis sw : analysis) {
          WordAnalysis wa = sw.getWordAnalysis();
          pwMorph.println(wa.getInput());
          SingleAnalysis best = sw.getBestAnalysis();
          for (SingleAnalysis singleAnalysis : wa) {
            boolean isBest = singleAnalysis.equals(best);
            if (wa.analysisCount() == 1) {
              // Single candidate: nothing to disambiguate, so no marker is written.
              pwMorph.println(singleAnalysis.formatLong());
            } else {
              pwMorph.format("%s%s%n", singleAnalysis.formatLong(), isBest ? "*" : "");
            }
          }
        }
        pwMorph.println();
      }
    }
  }
}
Aggregations