use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class HunspellOperations method generateAnnotationFileMultiSplit.
/**
 * Generates a stem/ending annotation file from a vocabulary file.
 *
 * <p>Every line of {@code vocab} is treated as one word and analyzed. Words whose analysis is
 * not correct are logged and produce no output line. For each remaining word, every analysis
 * whose dictionary item is neither a proper noun nor an abbreviation is formatted as a
 * space-separated surface sequence. Each prefix of that sequence is concatenated into a
 * candidate stem; when the candidate passes {@code isCorrectAndContainsNoProper} and at least
 * one surface token remains after it, the entry {@code "word stem ending"} is recorded.
 *
 * <p>Entries of one word are de-duplicated (insertion order kept) and joined with commas into
 * a single output line. A word with no entries yields an empty line.
 *
 * @param vocab path of the UTF-8 vocabulary file, one word per line
 * @param annotationsPath path of the UTF-8 annotation file to write
 * @throws IOException if the vocabulary cannot be read or the annotations cannot be written
 */
private static void generateAnnotationFileMultiSplit(Path vocab, Path annotationsPath) throws IOException {
  List<String> vocabulary = Files.readAllLines(vocab, StandardCharsets.UTF_8);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> outputLines = new ArrayList<>();

  for (String word : vocabulary) {
    WordAnalysis wordAnalysis = morphology.analyze(word);
    if (!wordAnalysis.isCorrect()) {
      Log.warn("Cannot analyze %s", word);
      continue;
    }

    LinkedHashSet<String> entries = new LinkedHashSet<>();
    for (SingleAnalysis single : wordAnalysis) {
      SecondaryPos secondaryPos = single.getDictionaryItem().secondaryPos;
      boolean properOrAbbreviation =
          secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation;
      if (properOrAbbreviation) {
        continue;
      }

      List<String> surfaceTokens =
          Splitter.on(" ").splitToList(AnalysisFormatters.SURFACE_SEQUENCE.format(single));
      int tokenCount = surfaceTokens.size();

      // Grow the stem one surface token at a time; whatever follows is the ending.
      StringBuilder stemBuilder = new StringBuilder();
      for (int split = 1; split <= tokenCount; split++) {
        stemBuilder.append(surfaceTokens.get(split - 1));
        String stem = stemBuilder.toString();
        String ending = String.join(" ", surfaceTokens.subList(split, tokenCount));

        // The candidate stem is analyzed for every split, including the last one.
        boolean stemAccepted = isCorrectAndContainsNoProper(morphology.analyze(stem));
        if (stemAccepted && !ending.isEmpty()) {
          entries.add(word + " " + stem + " " + ending);
        }
      }
    }
    outputLines.add(String.join(",", entries));
  }

  Files.write(annotationsPath, outputLines, StandardCharsets.UTF_8);
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class StemDisambiguationExperiment method unambiguous.
/**
 * Checks whether every token of a sentence maps to at most one distinct lemma.
 *
 * <p>The sentence is tokenized with the default tokenizer and each token is analyzed. The
 * normalized lemmas of all analyses of a token are collected; as soon as a token has two or
 * more distinct lemmas the sentence is reported as ambiguous.
 *
 * @param sentence the sentence to inspect
 * @return {@code false} if some token has more than one distinct normalized lemma,
 *     {@code true} otherwise
 */
private boolean unambiguous(String sentence) {
  for (String token : TurkishTokenizer.DEFAULT.tokenizeToStrings(sentence)) {
    Set<String> distinctLemmas = new HashSet<>();
    for (SingleAnalysis candidate : morphology.analyze(token)) {
      String lemma = candidate.getDictionaryItem().normalizedLemma();
      distinctLemmas.add(lemma);
    }
    boolean competingLemmas = distinctLemmas.size() >= 2;
    if (competingLemmas) {
      return false;
    }
  }
  return true;
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class Scripts method saveUnambiguous.
/**
 * Writes sentence analyses to a UTF-8 text file, skipping sentences with unknown words.
 *
 * <p>A sentence is skipped when any of its best analyses is unknown. For every other sentence
 * the output contains a {@code S:<sentence>} line, then for each word its input followed by
 * the long form of each of its analyses, and finally a blank line. When a word has more than
 * one analysis, the one equal to the best analysis is suffixed with {@code *}.
 *
 * <p>{@link PrintWriter} never throws on write failures, so the error flag is checked after
 * writing and surfaced as an {@link IOException} instead of silently leaving a truncated file.
 *
 * @param sentences analyzed sentences to save
 * @param out path of the file to write
 * @throws IOException if the file cannot be opened or a write error occurred
 */
public static void saveUnambiguous(List<SentenceAnalysis> sentences, Path out) throws IOException {
  try (PrintWriter pwMorph = new PrintWriter(out.toFile(), "utf-8")) {
    for (SentenceAnalysis analysis : sentences) {
      if (analysis.bestAnalysis().stream().anyMatch(SingleAnalysis::isUnknown)) {
        continue;
      }
      pwMorph.format("S:%s%n", analysis.getSentence());
      for (SentenceWordAnalysis sw : analysis) {
        WordAnalysis wa = sw.getWordAnalysis();
        pwMorph.println(wa.getInput());
        SingleAnalysis best = sw.getBestAnalysis();
        // The best-analysis marker is only meaningful when there is a choice to make.
        boolean hasAlternatives = wa.analysisCount() != 1;
        for (SingleAnalysis singleAnalysis : wa) {
          boolean markAsBest = hasAlternatives && singleAnalysis.equals(best);
          pwMorph.format("%s%s%n", singleAnalysis.formatLong(), markAsBest ? "*" : "");
        }
      }
      pwMorph.println();
    }
    // checkError() flushes the stream and reports any failure swallowed by PrintWriter.
    if (pwMorph.checkError()) {
      throw new IOException("Failed to write analyses to " + out);
    }
  }
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class ClassificationExampleBase method splitWords.
/**
 * Rewrites a labeled sentence so that each analyzed word is split into lemma and suffix.
 *
 * <p>The first space-separated token is taken as the label and excluded from morphological
 * analysis. The rest is analyzed and disambiguated; for each word the first lemma of the best
 * analysis is emitted, followed by {@code "_" + remainder} when the lemma is shorter than the
 * input word. Words whose best analysis is unknown are emitted unchanged.
 *
 * @param sentence a line of the form {@code "<label> <word> <word> ..."}
 * @return the label followed by the processed words, joined with single spaces; an empty
 *     string (without the label) when nothing follows the label
 */
protected String splitWords(String sentence) {
  List<String> parts = Splitter.on(" ").splitToList(sentence);
  // The leading token is the label; only the remainder is analyzed.
  String label = parts.get(0);
  String text = String.join(" ", parts.subList(1, parts.size()));
  if (text.isEmpty()) {
    return text;
  }

  SentenceAnalysis analyzed = morphology.analyzeAndDisambiguate(text);
  List<String> output = new ArrayList<>();
  output.add(label);

  for (SentenceWordAnalysis wordResult : analyzed) {
    SingleAnalysis best = wordResult.getBestAnalysis();
    String surface = wordResult.getWordAnalysis().getInput();
    if (best.isUnknown()) {
      output.add(surface);
      continue;
    }
    String lemma = best.getLemmas().get(0);
    output.add(lemma);
    // Emit whatever follows the lemma-length prefix as a separate "_suffix" token.
    if (lemma.length() < surface.length()) {
      output.add("_" + surface.substring(lemma.length()));
    }
  }
  return String.join(" ", output);
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class ChangeStem method main.
/**
 * Demonstrates regenerating a word with a different stem.
 *
 * <p>Analyzes the word {@code "simidime"} and, for each analysis, asks the word generator to
 * produce surface forms that use the dictionary item found for {@code "poğaça"} with the same
 * morphemes. The input analysis, the generated surface and its analysis are logged for every
 * generated result.
 *
 * @param args command line arguments (unused)
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  DictionaryItem newStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);

  String word = "simidime";
  Log.info("Input Word = " + word);

  for (SingleAnalysis original : morphology.analyze(word)) {
    List<Result> regenerated =
        morphology.getWordGenerator().generate(newStem, original.getMorphemes());
    for (Result candidate : regenerated) {
      Log.info("Input analysis: " + original.formatLong());
      Log.info("After stem change, word = " + candidate.surface);
      Log.info("After stem change, Analysis = " + candidate.analysis.formatLong());
    }
  }
}
Aggregations