Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
From the class LuceneLemmaFilter, method addLemmas.
/**
 * Replaces the current term's lemma queue with the distinct lemmas produced by
 * morphological analysis of the term text. Always reports success.
 */
private boolean addLemmas() {
  String word = termAttribute.toString();
  WordAnalysis analysis = morphology.analyze(word);
  // Deduplicate lemmas across all analyses of the word.
  Set<String> lemmaSet = new HashSet<>(5);
  analysis.forEach(single -> lemmaSet.addAll(single.getLemmas()));
  lemmas = new ArrayDeque<>(lemmaSet);
  return true;
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
From the class DistanceBasedStemmer, method findStems.
// Prints, for each word of the given sentence, its candidate stems scored by
// distributional distance to the surrounding words, then prints the
// disambiguation result of the morphology pipeline for comparison.
public void findStems(String str) {
// Pad with two sentence-boundary tokens on each side so that the
// i-2 .. i+2 context lookups below never go out of bounds.
str = "<s> <s> " + str + " </s> </s>";
SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(str);
List<SentenceWordAnalysis> swaList = analysis.getWordAnalyses();
// Start at 2 and stop 2 early to skip the padding tokens themselves.
for (int i = 2; i < analysis.size() - 2; i++) {
SentenceWordAnalysis swa = swaList.get(i);
String s = swaList.get(i).getWordAnalysis().getInput();
// Normalized context words: two on each side (bigram) and one on each side (unigram).
List<String> bigramContext = Lists.newArrayList(normalize(swaList.get(i - 1).getWordAnalysis().getInput()), normalize(swaList.get(i - 2).getWordAnalysis().getInput()), normalize(swaList.get(i + 1).getWordAnalysis().getInput()), normalize(swaList.get(i + 2).getWordAnalysis().getInput()));
// NOTE(review): unigramContext is computed but never used below — confirm whether
// it was meant to feed totalDistance() as an alternative context.
List<String> unigramContext = Lists.newArrayList(normalize(swaList.get(i - 1).getWordAnalysis().getInput()), normalize(swaList.get(i + 1).getWordAnalysis().getInput()));
WordAnalysis wordResults = swa.getWordAnalysis();
// Candidate stems: the normalized lemma of every morphological analysis of the word.
Set<String> stems = wordResults.stream().map(a -> normalize(a.getDictionaryItem().lemma)).collect(Collectors.toSet());
List<ScoredItem<String>> scores = new ArrayList<>();
for (String stem : stems) {
// Stems absent from the distance vocabulary cannot be scored; skip them.
if (!distances.containsWord(stem)) {
Log.info("Cannot find %s in vocab.", stem);
continue;
}
// NOTE(review): this local shadows the 'distances' field used in the check above.
List<WordDistances.Distance> distances = this.distances.getDistance(stem);
// Base score: distance of the stem to the two-word-wide context.
float score = totalDistance(stem, bigramContext);
int k = 0;
// Add distances for at most the first 11 nearest neighbors (k runs 0..10).
for (WordDistances.Distance distance : distances) {
/* if (s.equals(distance.word)) {
continue;
}*/
score += distance(s, distance.word);
if (k++ == 10) {
break;
}
}
scores.add(new ScoredItem<>(stem, score));
}
// Order by ScoredItem's natural ordering before printing.
Collections.sort(scores);
Log.info("%n%s : ", s);
for (ScoredItem<String> score : scores) {
Log.info("Lemma = %s Score = %.7f", score.item, score.score);
}
}
// Second pass: show the disambiguated dictionary items chosen by the pipeline.
Log.info("==== Z disambiguation result ===== ");
for (SentenceWordAnalysis a : analysis) {
Log.info("%n%s : ", a.getWordAnalysis().getInput());
// LinkedHashSet keeps first-seen order while removing duplicate items.
LinkedHashSet<String> items = new LinkedHashSet<>();
for (SingleAnalysis wa : a.getWordAnalysis()) {
items.add(wa.getDictionaryItem().toString());
}
for (String item : items) {
Log.info("%s", item);
}
}
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
From the class AddNewDictionaryItem, method test.
/**
 * Shows the effect of adding {@code newItem} to the live dictionary: analyzes
 * {@code input} before and after the addition and prints both result sets.
 *
 * @param input   surface word to analyze
 * @param newItem dictionary item to register with the stem transitions
 * @throws IOException if the underlying morphology machinery fails on I/O
 */
private void test(String input, DictionaryItem newItem) throws IOException {
  WordAnalysis analysisBefore = morphology.analyze(input);
  Log.info("Parses for " + input + " before adding " + newItem);
  printResults(analysisBefore);
  // The analysis cache must be dropped, otherwise the old parses would be returned.
  morphology.invalidateCache();
  morphology.getMorphotactics().getStemTransitions().addDictionaryItem(newItem);
  WordAnalysis analysisAfter = morphology.analyze(input);
  Log.info("Parses for " + input + " after adding " + newItem);
  printResults(analysisAfter);
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
From the class StemmingAndLemmatization, method main.
/**
 * Demo entry point: analyzes a sample Turkish word and prints, for every
 * morphological analysis, its long format plus its stems and lemmas.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  String word = "kutucuğumuz";
  Log.info("Word = " + word);
  Log.info("Results: ");
  WordAnalysis analyses = morphology.analyze(word);
  for (SingleAnalysis single : analyses) {
    Log.info(single.formatLong());
    Log.info("\tStems = " + single.getStems());
    Log.info("\tLemmas = " + single.getLemmas());
  }
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
From the class GenerateDataWithRules, method extractHighlyAmbigiousWordSentences.
// Scans the corpus files directly under inputRoot, collects word analyses whose
// surface form has more than minCount distinct stems (i.e. highly ambiguous
// words), stops once wordCount such words are gathered, and writes them with
// all their analyses to "<inputRoot name>-amb.txt" under outRoot.
private void extractHighlyAmbigiousWordSentences(Path inputRoot, Path outRoot, int minCount, int wordCount) throws IOException {
// depth 1: only files directly inside inputRoot, no recursion.
List<Path> files = Files.walk(inputRoot, 1).filter(s -> s.toFile().isFile()).collect(Collectors.toList());
// Histogram keyed by WordAnalysis; also used here as a deduplicating counter.
Histogram<WordAnalysis> wordAnalyses = new Histogram<>();
for (Path file : files) {
Log.info("Processing %s", file);
LinkedHashSet<String> sentences = getSentences(file);
// Process sentences in batches of 5000.
List<List<String>> group = group(new ArrayList<>(sentences), 5000);
for (List<String> lines : group) {
Log.info("Collected %d words.", wordAnalyses.size());
LinkedHashSet<String> toProcess = getAccpetableSentences(lines);
for (String sentence : toProcess) {
try {
SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(sentence);
for (SentenceWordAnalysis analysis : sentenceAnalysis) {
// Count distinct stems per word; keep the word once the count
// exceeds minCount (it is considered highly ambiguous).
HashSet<String> stems = new HashSet<>(4);
for (SingleAnalysis s : analysis.getWordAnalysis()) {
stems.add(s.getStem());
if (stems.size() > minCount) {
wordAnalyses.add(analysis.getWordAnalysis());
break;
}
}
}
} catch (Exception e) {
// Best-effort: a failing sentence is logged and skipped, not fatal.
Log.warn("Error in sentence %s", sentence);
}
}
}
// Stop reading further files once enough ambiguous words were collected.
if (wordAnalyses.size() > wordCount) {
break;
}
}
String s = inputRoot.toFile().getName();
Path amb = outRoot.resolve(s + "-amb.txt");
try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
// Most frequent words first; each word followed by all of its analyses.
for (WordAnalysis wa : wordAnalyses.getSortedList()) {
pwa.println(wa.getInput());
for (SingleAnalysis analysis : wa) {
pwa.println(analysis.formatLong());
}
pwa.println();
}
}
}
Aggregations