Use of zemberek.morphology.generator.WordGenerator.Result in the zemberek-nlp project by ahmetaa.
From the class GenerateWords, method generateVerbs.
/**
 * Prints the surface forms generated for the stem "oku" over every combination of
 * polarity ("" or "Neg"), tense/mood marker and person marker.
 *
 * <p>A morphology instance is built from the single lexicon entry "okumak". For each
 * combination the non-empty morpheme ids are passed to the word generator; the resulting
 * surfaces are printed space-separated, followed by the morpheme sequence. When the
 * generator returns nothing, a "Cannot generate" line is printed instead.
 */
private static void generateVerbs() {
  System.out.println("Generating Verbs.");
  String[] positiveNegatives = { "", "Neg" };
  String[] times = { "Imp", "Aor", "Past", "Prog1", "Prog2", "Narr", "Fut" };
  String[] persons = { "A1sg", "A2sg", "A3sg", "A1pl", "A2pl", "A3pl" };
  TurkishMorphology morphology = TurkishMorphology.builder().setLexicon("okumak").disableCache().build();
  // The stem never changes, so it is declared once outside the loops.
  String stem = "oku";
  for (String posNeg : positiveNegatives) {
    for (String time : times) {
      for (String person : persons) {
        // The positive case is represented by an empty string, which must not be
        // handed to the generator as a morpheme id.
        List<String> seq = Stream.of(posNeg, time, person)
            .filter(s -> !s.isEmpty())
            .collect(Collectors.toList());
        List<Result> results = morphology.getWordGenerator().generate(stem, seq);
        if (results.isEmpty()) {
          System.out.println("Cannot generate Stem = [" + stem + "] Morphemes = " + seq);
          continue;
        }
        System.out.println(results.stream().map(s -> s.surface).collect(Collectors.joining(" ")) + " " + seq);
      }
    }
  }
}
Use of zemberek.morphology.generator.WordGenerator.Result in the zemberek-nlp project by ahmetaa.
From the class GenerateWords, method generateNouns.
/**
 * Prints every surface form generated for the noun "armut" across all combinations of
 * number, possessive and case markers listed below.
 *
 * <p>The morphology instance is built from the single lexicon entry "armut", and the
 * first matching dictionary item for that word is used as the generation root.
 */
private static void generateNouns() {
  System.out.println("Generating Nouns.");

  String[] numberMarkers = { "A3sg", "A3pl" };
  String[] possessiveMarkers = { "P1sg", "P2sg", "P3sg" };
  String[] caseMarkers = { "Dat", "Loc", "Abl" };

  TurkishMorphology morphology = TurkishMorphology.builder().setLexicon("armut").disableCache().build();
  DictionaryItem root = morphology.getLexicon().getMatchingItems("armut").get(0);

  for (String num : numberMarkers) {
    for (String poss : possessiveMarkers) {
      for (String cs : caseMarkers) {
        // One line of output per generated surface form.
        List<Result> generated = morphology.getWordGenerator().generate(root, num, poss, cs);
        for (Result r : generated) {
          System.out.println(r.surface);
        }
      }
    }
  }
}
Use of zemberek.morphology.generator.WordGenerator.Result in the zemberek-nlp project by ahmetaa.
From the class ChangeStem, method main.
/**
 * Analyzes the word "simidime" and, for each analysis, regenerates the word with the
 * same morphemes on the first dictionary item matching "poğaça", logging the input
 * analysis together with each regenerated surface and its analysis.
 *
 * @param args command line arguments; not used
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  DictionaryItem replacementStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);

  String input = "simidime";
  Log.info("Input Word = " + input);

  WordAnalysis analyses = morphology.analyze(input);
  for (SingleAnalysis analysis : analyses) {
    // Reuse the morphemes of the analysis, but attach them to the replacement stem.
    List<Result> regenerated =
        morphology.getWordGenerator().generate(replacementStem, analysis.getMorphemes());
    for (Result candidate : regenerated) {
      Log.info("Input analysis: " + analysis.formatLong());
      Log.info("After stem change, word = " + candidate.surface);
      Log.info("After stem change, Analysis = " + candidate.analysis.formatLong());
    }
  }
}
Use of zemberek.morphology.generator.WordGenerator.Result in the zemberek-nlp project by ahmetaa.
From the class TurkishSentenceNormalizer, method normalize.
/**
 * Normalizes a sentence by collecting replacement candidates for each token and then
 * decoding the candidate lists into a single output string.
 *
 * <p>For every token, candidates are gathered in this order: manual lookup, graph lookup,
 * ASCII-equivalent lookup, surfaces derived from the informal/ASCII-tolerant analyses and,
 * when the token has no analysis and is longer than three characters, at most three
 * spell checker suggestions. The token itself is added when nothing was found or when the
 * regular morphology reports it as correct.
 *
 * @param sentence the sentence to normalize
 * @return {@code sentence} itself when it is empty after trimming; otherwise the decoded
 *     tokens joined with single spaces
 */
public String normalize(String sentence) {
  // Nothing to do for blank input; hand it back untouched.
  if (sentence.trim().isEmpty()) {
    return sentence;
  }

  List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(preProcess(sentence));
  int tokenCount = tokens.size();
  List<Candidates> perToken = new ArrayList<>();

  for (int idx = 0; idx < tokenCount; idx++) {
    Token token = tokens.get(idx);
    String word = token.getText();

    // Neighbouring token texts; null at the sentence boundaries.
    String following = null;
    if (idx < tokenCount - 1) {
      following = tokens.get(idx + 1).getText();
    }
    String preceding = null;
    if (idx > 0) {
      preceding = tokens.get(idx - 1).getText();
    }

    // Insertion-ordered set: the order in which sources are consulted is kept.
    LinkedHashSet<String> options = new LinkedHashSet<>(2);

    // Matches from the manual lookup.
    options.addAll(lookupManual.get(word));
    // Matches from the random walk graph lookup.
    options.addAll(lookupFromGraph.get(word));
    // Matches from ASCII equivalents.
    // TODO: this may decrease accuracy. Also, this can be eliminated with ascii tolerant analyzer.
    options.addAll(lookupFromAscii.get(word));

    // Surfaces obtained from the informal / ASCII tolerant analyses.
    WordAnalysis wordAnalysis = informalAsciiTolerantMorphology.analyze(word);
    for (SingleAnalysis single : wordAnalysis) {
      if (!single.containsInformalMorpheme()) {
        // No informal morpheme: regenerate from the dictionary item and morphemes.
        List<WordGenerator.Result> regenerated =
            morphology.getWordGenerator().generate(single.getDictionaryItem(), single.getMorphemes());
        for (Result generated : regenerated) {
          options.add(generated.surface);
        }
        continue;
      }
      // Informal analysis: use the converter; it may yield no result.
      WordGenerator.Result converted = analysisConverter.convert(word, single);
      if (converted != null) {
        options.add(converted.surface);
      }
    }

    // Unanalyzable words longer than three characters get at most three spelling suggestions.
    if (wordAnalysis.analysisCount() == 0 && word.length() > 3) {
      List<String> suggestions = spellChecker.suggestForWord(word, preceding, following, lm);
      List<String> top = suggestions.size() > 3
          ? new ArrayList<>(suggestions.subList(0, 3))
          : suggestions;
      options.addAll(top);
    }

    // Keep the word itself when nothing matched or when it is already correct.
    if (options.isEmpty() || morphology.analyze(word).isCorrect()) {
      options.add(word);
    }

    String tokenText = token.getText();
    List<Candidate> wrapped = options.stream().map(Candidate::new).collect(Collectors.toList());
    perToken.add(new Candidates(tokenText, wrapped));
  }

  // Decode the per-token candidate lists (Viterbi decoding, per the original comment)
  // and join the result with spaces.
  return String.join(" ", decode(perToken));
}
Use of zemberek.morphology.generator.WordGenerator.Result in the zemberek-nlp project by ahmetaa.
From the class WordGeneratorTest, method testGeneration4.
/**
 * Checks that generating from the dictionary item "elma_Noun" with the morphemes
 * Noun, A3pl and P1pl yields at least one result and that the first surface is
 * "elmalarımız".
 */
@Test
public void testGeneration4() {
  TurkishMorphotactics morphotactics = getMorphotactics("elma");
  WordGenerator generator = new WordGenerator(morphotactics);

  List<String> morphemeIds = Lists.newArrayList("Noun", "A3pl", "P1pl");
  List<Result> generated = generator.generate(
      morphotactics.getRootLexicon().getItemById("elma_Noun"),
      TurkishMorphotactics.getMorphemes(morphemeIds));

  Assert.assertFalse(generated.isEmpty());
  Assert.assertEquals("elmalarımız", generated.get(0).surface);
}
Aggregations