use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
the class QuestionClassifier method replaceWordsWithLemma.
/**
 * Rewrites a labeled sentence so that each analyzed word is replaced by a lemma.
 *
 * <p>The text before the first space is treated as the label and is copied unchanged to the
 * front of the result. The remaining text is passed to
 * {@code morphology.analyzeAndDisambiguate}. For every word of the result, the last element of
 * the best analysis' {@code getLemmas()} list is emitted; when the best analysis reports
 * {@code isUnknown()}, the word's original input is emitted instead.
 *
 * @param sentence the label followed by the sentence text, separated by a space
 * @return the label and the replacement words joined with single spaces, or an empty string
 *     when nothing follows the label (the label is not part of the result in that case)
 */
private String replaceWordsWithLemma(String sentence) {
  // Everything up to the first space is the label; only the rest is analyzed.
  int firstSpace = sentence.indexOf(' ');
  String label = firstSpace < 0 ? sentence : sentence.substring(0, firstSpace);
  String text = firstSpace < 0 ? "" : sentence.substring(firstSpace + 1);
  if (text.isEmpty()) {
    return text;
  }

  List<String> output = new ArrayList<>();
  output.add(label);
  for (SentenceWordAnalysis wordResult : morphology.analyzeAndDisambiguate(text)) {
    SingleAnalysis best = wordResult.getBestAnalysis();
    if (best.isUnknown()) {
      // No usable analysis: keep the word exactly as it was given.
      output.add(wordResult.getWordAnalysis().getInput());
    } else {
      List<String> lemmas = best.getLemmas();
      output.add(lemmas.get(lemmas.size() - 1));
    }
  }
  return String.join(" ", output);
}
use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
the class AmbiguousExampleFinder method extractSentences.
/**
 * Collects example sentences for ambiguous words and writes them, together with their
 * morphological analyses, to two text files.
 *
 * <p>Words are read from {@code data/ambiguity/zemberek-ambigious-words.txt}; at most the first
 * 100 lines are used. For each word, the sentences returned by {@code finder.getSentences} are
 * written to {@code data/ambiguity/sentences.txt} under the word itself, followed by a blank
 * line. Each sentence is also analyzed; unless {@code containsUnkown(analysis)} returns true, an
 * {@code S:<sentence>} entry is written to {@code data/ambiguity/sentences.morph.txt}, followed
 * by every word's input and the long form of each of its analyses. When a word has more than
 * one analysis, the one equal to the best analysis is suffixed with {@code *}.
 *
 * @param morphology analyzer used to analyze and disambiguate each sentence
 * @param finder source of example sentences for a word
 * @throws Exception if reading the word list or opening the output files fails, or if a callee
 *     throws
 */
private static void extractSentences(TurkishMorphology morphology, AmbiguousExampleFinder finder) throws Exception {
  List<String> allWords = Files.readAllLines(
      Paths.get("data/ambiguity/zemberek-ambigious-words.txt"), StandardCharsets.UTF_8);
  // Clamp the limit: subList(0, 100) would throw when the file has fewer than 100 lines.
  List<String> ambiguousWords = allWords.subList(0, Math.min(100, allWords.size()));
  Path out = Paths.get("data/ambiguity/sentences.txt");
  Path morph = Paths.get("data/ambiguity/sentences.morph.txt");
  try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8");
      PrintWriter pwMorph = new PrintWriter(morph.toFile(), "utf-8")) {
    for (String word : ambiguousWords) {
      Log.info(word);
      List<String> sentences = finder.getSentences(word, 3, 5, 10);
      // Plain listing: the word, its sentences, then a blank separator line.
      pw.println(word);
      sentences.forEach(pw::println);
      pw.println();
      for (String sentence : sentences) {
        SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
        if (containsUnkown(analysis)) {
          continue;
        }
        pwMorph.format("S:%s%n", sentence);
        for (SentenceWordAnalysis sw : analysis) {
          WordAnalysis wa = sw.getWordAnalysis();
          pwMorph.println(wa.getInput());
          SingleAnalysis best = sw.getBestAnalysis();
          for (SingleAnalysis singleAnalysis : wa) {
            boolean isBest = singleAnalysis.equals(best);
            if (wa.analysisCount() == 1) {
              // A single analysis needs no marker.
              pwMorph.println(singleAnalysis.formatLong());
            } else {
              pwMorph.format("%s%s%n", singleAnalysis.formatLong(), isBest ? "*" : "");
            }
          }
        }
        pwMorph.println();
      }
    }
  }
}
use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
the class DistanceBasedStemmer method findStems.
/**
 * Logs scored stem candidates for the words of {@code str}, then logs the dictionary items of
 * every analyzed word.
 *
 * <p>The input is wrapped as {@code "<s> <s> " + str + " </s> </s>"} before analysis, and the
 * first two and last two analyzed entries are skipped in the scoring loop. For each remaining
 * word, every distinct normalized lemma of its analyses that {@code distances} contains is
 * scored as {@code totalDistance(stem, context)} plus {@code distance(word, neighbor)} summed
 * over at most the first 11 entries of {@code distances.getDistance(stem)}. The scored items
 * are sorted with {@code Collections.sort} and logged.
 *
 * @param str the sentence whose words are examined
 */
public void findStems(String str) {
  // NOTE(review): presumably each boundary marker becomes its own analyzed entry, which is
  // what lets the loop below read two entries on either side of every word - confirm.
  String padded = "<s> <s> " + str + " </s> </s>";
  SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(padded);
  List<SentenceWordAnalysis> words = analysis.getWordAnalyses();

  int end = analysis.size() - 2;
  for (int i = 2; i < end; i++) {
    WordAnalysis current = words.get(i).getWordAnalysis();
    String s = current.getInput();

    String before1 = words.get(i - 1).getWordAnalysis().getInput();
    String before2 = words.get(i - 2).getWordAnalysis().getInput();
    String after1 = words.get(i + 1).getWordAnalysis().getInput();
    String after2 = words.get(i + 2).getWordAnalysis().getInput();
    List<String> bigramContext = Lists.newArrayList(
        normalize(before1), normalize(before2), normalize(after1), normalize(after2));
    // Built as in the original code; it is not read again inside this method.
    List<String> unigramContext = Lists.newArrayList(normalize(before1), normalize(after1));

    Set<String> stems = current.stream()
        .map(a -> normalize(a.getDictionaryItem().lemma))
        .collect(Collectors.toSet());

    List<ScoredItem<String>> scores = new ArrayList<>();
    for (String stem : stems) {
      if (this.distances.containsWord(stem)) {
        List<WordDistances.Distance> neighbors = this.distances.getDistance(stem);
        float score = totalDistance(stem, bigramContext);
        int used = 0;
        for (WordDistances.Distance neighbor : neighbors) {
          score += distance(s, neighbor.word);
          used++;
          // Stops after the eleventh neighbor has been added.
          if (used > 10) {
            break;
          }
        }
        scores.add(new ScoredItem<>(stem, score));
      } else {
        Log.info("Cannot find %s in vocab.", stem);
      }
    }
    Collections.sort(scores);

    Log.info("%n%s : ", s);
    for (ScoredItem<String> scored : scores) {
      Log.info("Lemma = %s Score = %.7f", scored.item, scored.score);
    }
  }

  Log.info("==== Z disambiguation result ===== ");
  for (SentenceWordAnalysis word : analysis) {
    WordAnalysis wordAnalysis = word.getWordAnalysis();
    Log.info("%n%s : ", wordAnalysis.getInput());
    // Insertion-ordered set: each dictionary item is logged once, in first-seen order.
    LinkedHashSet<String> items = new LinkedHashSet<>();
    for (SingleAnalysis single : wordAnalysis) {
      items.add(single.getDictionaryItem().toString());
    }
    for (String item : items) {
      Log.info("%s", item);
    }
  }
}
use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
the class ClassificationExampleBase method replaceWordsWithLemma.
/**
 * Rewrites a labeled sentence so that each analyzed word is replaced by a lemma.
 *
 * <p>The first space-separated token is treated as the label and is copied unchanged to the
 * front of the result. The remaining tokens are re-joined and passed to
 * {@code morphology.analyzeAndDisambiguate}. For every word of the result, the first element of
 * the best analysis' {@code getLemmas()} list is emitted; when the best analysis reports
 * {@code isUnknown()}, the word's original input is emitted instead.
 *
 * @param sentence the label followed by the sentence text, separated by single spaces
 * @return the label and the replacement words joined with single spaces, or an empty string
 *     when nothing follows the label (the label is not part of the result in that case)
 */
protected String replaceWordsWithLemma(String sentence) {
  List<String> parts = Splitter.on(" ").splitToList(sentence);
  // The first token is assumed to be the label; it is kept out of the morphological analysis.
  String label = parts.get(0);
  String text = String.join(" ", parts.subList(1, parts.size()));
  if (text.isEmpty()) {
    return text;
  }

  SentenceAnalysis disambiguated = morphology.analyzeAndDisambiguate(text);
  List<String> output = new ArrayList<>();
  output.add(label);
  for (SentenceWordAnalysis wordResult : disambiguated) {
    SingleAnalysis best = wordResult.getBestAnalysis();
    // Unknown words pass through as typed; known words contribute their first lemma.
    String replacement = best.isUnknown()
        ? wordResult.getWordAnalysis().getInput()
        : best.getLemmas().get(0);
    output.add(replacement);
  }
  return String.join(" ", output);
}
use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
the class Z3MarkovModelDisambiguator method getAmbiguousSequence.
/**
 * Builds the array of {@code Ambiguous} entries for a sentence.
 *
 * <p>The result holds {@code startWord} twice, then one entry per sentence entry, then
 * {@code endWord}. Each per-word entry carries, for every parse of that word, the index of a
 * root key in {@code rootLm}'s vocabulary and the index of an inflectional-group key in
 * {@code igLm}'s vocabulary.
 *
 * <p>Root key: the dictionary lemma, followed by the first inflectional group's no-surface
 * format when that group has no suffixes. When it has suffixes, the part of the format before
 * the first {@code ;} plus {@code )} is appended only if the part after it equals
 * {@code A3sg+Pnon+Nom)}; otherwise the bare lemma is used.
 *
 * <p>Inflectional-group key: the no-surface format of the last group, prefixed by the
 * no-surface format of the second-to-last group when that group exists and has no suffixes.
 *
 * @param sentence the analyzed sentence to convert
 * @return an array of length {@code sentence.size() + 3}
 */
public Ambiguous[] getAmbiguousSequence(SentenceAnalysis sentence) {
  // Layout: two start markers, one slot per word, one end marker.
  Ambiguous[] awords = new Ambiguous[sentence.size() + 3];
  awords[0] = startWord;
  awords[1] = startWord;
  int i = 2;
  for (SentenceAnalysis.Entry entry : sentence) {
    int parseCount = entry.parses.size();
    int[] roots = new int[parseCount];
    int[] lastIgs = new int[parseCount];
    int j = 0;
    for (WordAnalysis parse : entry.parses) {
      String rootPart = parse.dictionaryItem.lemma;
      WordAnalysis.InflectionalGroup firstIg = parse.inflectionalGroups.get(0);
      if (firstIg.suffixList.size() == 0) {
        rootPart += firstIg.formatNoSurface();
      } else {
        String s = firstIg.formatNoSurface();
        String suffixPart = Strings.subStringAfterFirst(s, ";");
        if (suffixPart.equals("A3sg+Pnon+Nom)")) {
          rootPart += (Strings.subStringUntilFirst(s, ";") + ")");
        }
      }
      roots[j] = rootLm.getVocabulary().indexOf(rootPart);

      String igPart;
      int igSize = parse.inflectionalGroups.size();
      if (igSize > 1 && parse.inflectionalGroups.get(igSize - 2).suffixList.size() == 0) {
        // Both groups use the no-surface format, matching the single-group branch below.
        // Concatenating the group object itself would append its toString() form instead.
        igPart = parse.inflectionalGroups.get(igSize - 2).formatNoSurface()
            + parse.getLastIg().formatNoSurface();
      } else {
        igPart = parse.getLastIg().formatNoSurface();
      }
      lastIgs[j] = igLm.getVocabulary().indexOf(igPart);
      j++;
    }
    awords[i] = new Ambiguous(roots, lastIgs);
    i++;
  }
  awords[i] = endWord;
  return awords;
}
Aggregations