Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
Class GenerateDataWithRules, method collect.
/**
 * Disambiguates the given sentences in batches of 5000 and records the outcome in
 * {@code batchResult}.
 *
 * <p>For every sentence returned by {@code getAccpetableSentences} the rule based disambiguator is
 * run. A sentence is skipped without being recorded when its ambiguous word count exceeds
 * {@code maxAmbigiousWordCount}, when its zero-analysis count is positive, or (after a warning is
 * logged) when its all-ignored count is positive. A sentence containing a word analysis rejected
 * by any of {@code acceptWordPredicates} is added to {@code batchResult.ignoredSentences}. All
 * remaining sentences are added to {@code batchResult.acceptedSentences} and their disambiguation
 * result to {@code batchResult.results}.
 *
 * @param batchResult accumulator that receives accepted sentences, ignored sentences and results
 * @param sentences sentences to process
 * @param maxAmbigiousWordCount sentences reporting more ambiguous words than this are skipped
 * @param resultLimit when positive, processing stops once {@code batchResult.results} holds more
 *     than this many entries; the comparison is strict, so the list can end up one entry above
 *     the limit
 */
private void collect(BatchResult batchResult, Collection<String> sentences, int maxAmbigiousWordCount, int resultLimit) {
  for (List<String> batch : group(new ArrayList<>(sentences), 5000)) {
    LinkedHashSet<String> pending = getAccpetableSentences(batch);
    Log.info("Processing.. %d found.", batchResult.acceptedSentences.size());

    nextSentence:
    for (String sentence : pending) {
      ResultSentence disambiguated = ruleBasedDisambiguator.disambiguate(sentence);

      // Too ambiguous, or contains a word without any analysis: drop silently.
      if (disambiguated.ambiguousWordCount() > maxAmbigiousWordCount
          || disambiguated.zeroAnalysisCount() > 0) {
        continue;
      }
      if (disambiguated.allIgnoredCount() > 0) {
        Log.warn("Sentence [%s] contains word(s) that all analyses are ignored.", disambiguated.sentence);
        continue;
      }

      // The first word analysis that fails any predicate disqualifies the whole sentence.
      for (WordAnalysis wordAnalysis : disambiguated.sentenceAnalysis) {
        for (Predicate<WordAnalysis> accept : acceptWordPredicates) {
          if (!accept.test(wordAnalysis)) {
            batchResult.ignoredSentences.add(sentence);
            continue nextSentence;
          }
        }
      }

      batchResult.acceptedSentences.add(sentence);
      batchResult.results.add(disambiguated);
      if (resultLimit > 0 && batchResult.results.size() > resultLimit) {
        return;
      }
    }
  }
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
Class Scripts, method saveUnambiguous.
/**
 * Writes the given sentence analyses to {@code out} as UTF-8 text.
 *
 * <p>Sentences whose best analysis contains an unknown analysis are skipped. Every other sentence
 * is written as an {@code S:<sentence>} line, followed for each word by a line with the word input
 * and one line per analysis in long format. When a word has more than one analysis, the analysis
 * equal to the best one is suffixed with {@code *}. A blank line terminates each sentence.
 *
 * @param sentences analyzed sentences to write
 * @param out target file
 * @throws IOException if the file cannot be opened, or if any write to it failed
 */
public static void saveUnambiguous(List<SentenceAnalysis> sentences, Path out) throws IOException {
  try (PrintWriter pwMorph = new PrintWriter(out.toFile(), "utf-8")) {
    for (SentenceAnalysis analysis : sentences) {
      if (analysis.bestAnalysis().stream().anyMatch(SingleAnalysis::isUnknown)) {
        continue;
      }
      pwMorph.format("S:%s%n", analysis.getSentence());
      for (SentenceWordAnalysis sw : analysis) {
        WordAnalysis wa = sw.getWordAnalysis();
        pwMorph.println(wa.getInput());
        SingleAnalysis best = sw.getBestAnalysis();
        // A word with a single analysis needs no marker; decide once per word.
        boolean singleAnalysisOnly = wa.analysisCount() == 1;
        for (SingleAnalysis singleAnalysis : wa) {
          if (singleAnalysisOnly) {
            pwMorph.println(singleAnalysis.formatLong());
          } else {
            boolean isBest = singleAnalysis.equals(best);
            pwMorph.format("%s%s%n", singleAnalysis.formatLong(), isBest ? "*" : "");
          }
        }
      }
      pwMorph.println();
    }
    // PrintWriter never throws on write failures; it only records them. Surface such a failure
    // instead of returning as if the file had been written completely.
    if (pwMorph.checkError()) {
      throw new IOException("I/O error while writing " + out);
    }
  }
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
Class ChangeStem, method main.
/**
 * Demo entry point: analyzes the word "simidime" and, for each of its analyses, generates words
 * that use the first lexicon item matching "poğaça" as stem together with the morphemes of that
 * analysis. The input analysis, the generated surface form and the generated analysis are logged
 * for every generated result.
 *
 * @param args unused
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  // Only the first lexicon match is used as the replacement stem.
  DictionaryItem replacementStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);

  String word = "simidime";
  Log.info("Input Word = " + word);

  for (SingleAnalysis inputAnalysis : morphology.analyze(word)) {
    for (Result generatedWord
        : morphology.getWordGenerator().generate(replacementStem, inputAnalysis.getMorphemes())) {
      Log.info("Input analysis: " + inputAnalysis.formatLong());
      Log.info("After stem change, word = " + generatedWord.surface);
      Log.info("After stem change, Analysis = " + generatedWord.analysis.formatLong());
    }
  }
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
Class TurkishSentenceNormalizer, method normalize.
/**
 * Normalizes a sentence by collecting replacement candidates for every token and decoding the
 * best sequence.
 *
 * <p>For each token, candidates are gathered in this order: the manual lookup, the graph lookup,
 * the ascii-equivalent lookup, and surfaces produced from the informal/ascii tolerant analyses of
 * the token. If the token has no such analysis and is longer than three characters, up to three
 * spell checker suggestions are added. The token itself is added when no candidate was found or
 * when {@code morphology} reports its analysis as correct.
 *
 * @param sentence the input sentence
 * @return {@code sentence} unchanged when it is empty after trimming; otherwise the decoded
 *     candidates joined with single spaces
 */
public String normalize(String sentence) {
  if (sentence.trim().isEmpty()) {
    return sentence;
  }

  List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(preProcess(sentence));
  List<Candidates> candidatesList = new ArrayList<>();
  int lastIndex = tokens.size() - 1;

  for (int i = 0; i <= lastIndex; i++) {
    Token currentToken = tokens.get(i);
    String current = currentToken.getText();

    // Neighbouring token texts are passed to the spell checker; null at the sentence edges.
    String next = null;
    if (i < lastIndex) {
      next = tokens.get(i + 1).getText();
    }
    String previous = null;
    if (i > 0) {
      previous = tokens.get(i - 1).getText();
    }

    LinkedHashSet<String> candidates = new LinkedHashSet<>(2);
    // Matches from the manual lookup.
    candidates.addAll(lookupManual.get(current));
    // Matches from the random walk graph.
    candidates.addAll(lookupFromGraph.get(current));
    // Matches from ascii equivalents.
    // TODO: this may decrease accuracy. Also, this can be eliminated with ascii tolerant analyzer.
    candidates.addAll(lookupFromAscii.get(current));

    // Surfaces derived from the informal / ascii tolerant analyses.
    WordAnalysis analyses = informalAsciiTolerantMorphology.analyze(current);
    for (SingleAnalysis analysis : analyses) {
      if (!analysis.containsInformalMorpheme()) {
        // No informal morpheme: regenerate surfaces from the dictionary item and morphemes.
        for (WordGenerator.Result generated : morphology.getWordGenerator()
            .generate(analysis.getDictionaryItem(), analysis.getMorphemes())) {
          candidates.add(generated.surface);
        }
        continue;
      }
      // Informal analysis: use the converter's surface when a conversion exists.
      WordGenerator.Result converted = analysisConverter.convert(current, analysis);
      if (converted != null) {
        candidates.add(converted.surface);
      }
    }

    // Without any analysis, fall back to at most three spell checker suggestions.
    if (analyses.analysisCount() == 0 && current.length() > 3) {
      List<String> spellCandidates = spellChecker.suggestForWord(current, previous, next, lm);
      int kept = Math.min(3, spellCandidates.size());
      candidates.addAll(spellCandidates.subList(0, kept));
    }

    // Keep the word itself when nothing matched or when it is already a correct word.
    if (candidates.isEmpty() || morphology.analyze(current).isCorrect()) {
      candidates.add(current);
    }

    candidatesList.add(new Candidates(currentToken.getText(), candidates.stream().map(Candidate::new).collect(Collectors.toList())));
  }

  // Apply Viterbi decoding and return result.
  return String.join(" ", decode(candidatesList));
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
Class TurkishMorphologyFunctionalTests, method testHashTag2.
/**
 * Checks that "#123'efefe" gets exactly one analysis, whose dictionary item has the HashTag
 * secondary POS and the unchanged input as its lemma.
 */
@Test
public void testHashTag2() {
  String input = "#123'efefe";
  WordAnalysis result = getMorphology().analyze(input);

  Assert.assertEquals(1, result.analysisCount());

  SingleAnalysis only = result.getAnalysisResults().get(0);
  Assert.assertEquals(SecondaryPos.HashTag, only.getDictionaryItem().secondaryPos);
  Assert.assertEquals(input, only.getDictionaryItem().lemma);
}
Aggregations