Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
From the class TurkishSentenceNormalizer, method normalize.
/**
 * Normalizes a sentence by collecting replacement candidates for each token and then
 * decoding the resulting candidate lists.
 *
 * <p>For every token the candidates are gathered, in this order, from: the manual lookup,
 * the graph (random walk) lookup, the ASCII-equivalent lookup, the informal/ASCII-tolerant
 * morphological analysis and, only when that analysis produced nothing and the token is
 * longer than three characters, at most three spell checker suggestions. The token itself
 * is added when no candidate was found or when {@code morphology} reports it as correct.
 *
 * @param sentence the sentence to normalize
 * @return {@code sentence} itself when it is empty after trimming; otherwise the decoded
 *     tokens joined with single spaces
 */
public String normalize(String sentence) {
  // Nothing to do for blank input; hand back the very same string.
  if (sentence.trim().isEmpty()) {
    return sentence;
  }

  List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(preProcess(sentence));
  int tokenCount = tokens.size();
  List<Candidates> perToken = new ArrayList<>();

  for (int idx = 0; idx < tokenCount; idx++) {
    Token token = tokens.get(idx);
    String word = token.getText();

    // Neighbouring words are passed to the spell checker as context; null at the edges.
    String following = null;
    if (idx < tokenCount - 1) {
      following = tokens.get(idx + 1).getText();
    }
    String preceding = null;
    if (idx > 0) {
      preceding = tokens.get(idx - 1).getText();
    }

    // Insertion-ordered set: duplicates are dropped while source priority is kept.
    LinkedHashSet<String> suggestions = new LinkedHashSet<>(2);
    suggestions.addAll(lookupManual.get(word));
    suggestions.addAll(lookupFromGraph.get(word));
    // TODO: this may decrease accuracy. Also, this can be eliminated with ascii tolerant analyzer.
    suggestions.addAll(lookupFromAscii.get(word));

    // Turn informal / ASCII-tolerant analyses into formal surface forms.
    WordAnalysis analyses = informalAsciiTolerantMorphology.analyze(word);
    for (SingleAnalysis analysis : analyses) {
      if (!analysis.containsInformalMorpheme()) {
        List<WordGenerator.Result> generated = morphology.getWordGenerator()
            .generate(analysis.getDictionaryItem(), analysis.getMorphemes());
        for (Result formal : generated) {
          suggestions.add(formal.surface);
        }
        continue;
      }
      WordGenerator.Result converted = analysisConverter.convert(word, analysis);
      if (converted != null) {
        suggestions.add(converted.surface);
      }
    }

    // Unanalyzable words longer than 3 characters get up to 3 spell checker suggestions.
    if (analyses.analysisCount() == 0 && word.length() > 3) {
      List<String> spelled = spellChecker.suggestForWord(word, preceding, following, lm);
      suggestions.addAll(spelled.subList(0, Math.min(3, spelled.size())));
    }

    // Keep the word itself when nothing matched or when it is already correct.
    boolean keepOriginal = suggestions.isEmpty() || morphology.analyze(word).isCorrect();
    if (keepOriginal) {
      suggestions.add(word);
    }

    List<Candidate> wrapped = suggestions.stream()
        .map(Candidate::new)
        .collect(Collectors.toList());
    perToken.add(new Candidates(word, wrapped));
  }

  // Apply Viterbi decoding and return result.
  List<String> decoded = decode(perToken);
  return String.join(" ", decoded);
}
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
From the class TurkishMorphologyFunctionalTests, method testHashTag2.
/**
 * Analyzing a hashtag that contains an apostrophe must give exactly one analysis whose
 * dictionary item is tagged {@code SecondaryPos.HashTag} and whose lemma is the whole input.
 */
@Test
public void testHashTag2() {
  String input = "#123'efefe";
  WordAnalysis analyses = getMorphology().analyze(input);

  Assert.assertEquals(1, analyses.analysisCount());

  SingleAnalysis only = analyses.getAnalysisResults().get(0);
  Assert.assertEquals(SecondaryPos.HashTag, only.getDictionaryItem().secondaryPos);
  // The apostrophe part is not split off: the lemma equals the full input.
  Assert.assertEquals(input, only.getDictionaryItem().lemma);
}
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
From the class TurkishMorphologyFunctionalTests, method testUrl2.
/**
 * Analyzing a URL followed by an apostrophe and a suffix must give exactly one analysis:
 * the dictionary item is tagged {@code SecondaryPos.Url}, the lexical form ends with
 * {@code A3sg+Loc}, and the lemma is the bare URL.
 */
@Test
public void testUrl2() {
  WordAnalysis analyses = getMorphology().analyze("www.foo.com'da");

  Assert.assertEquals(1, analyses.analysisCount());

  SingleAnalysis only = analyses.getAnalysisResults().get(0);
  Assert.assertEquals(SecondaryPos.Url, only.getDictionaryItem().secondaryPos);

  String lexicalForm = only.formatLexical();
  Assert.assertTrue(lexicalForm.endsWith("A3sg+Loc"));
  // The suffix after the apostrophe is not part of the lemma.
  Assert.assertEquals("www.foo.com", only.getDictionaryItem().lemma);
}
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
From the class TurkishMorphologyFunctionalTests, method testMention.
/**
 * Analyzing a mention followed by an apostrophe and a suffix must give exactly one analysis:
 * the dictionary item is tagged {@code SecondaryPos.Mention}, the lexical form ends with
 * {@code A3sg+Dat} and contains the mention, and the lemma is the bare mention.
 */
@Test
public void testMention() {
  String mention = "@haha_ha";
  WordAnalysis analyses = getMorphology().analyze("@haha_ha'ya");

  Assert.assertEquals(1, analyses.analysisCount());

  SingleAnalysis only = analyses.getAnalysisResults().get(0);
  Assert.assertEquals(SecondaryPos.Mention, only.getDictionaryItem().secondaryPos);

  String lexicalForm = only.formatLexical();
  Assert.assertTrue(lexicalForm.endsWith("A3sg+Dat"));
  Assert.assertEquals(mention, only.getDictionaryItem().lemma);
  Assert.assertTrue(lexicalForm.contains(mention));
}
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
From the class TurkishMorphologyFunctionalTests, method testTime2.
/**
 * Analyzing a clock expression followed by an apostrophe and a suffix must give exactly one
 * analysis whose dictionary item is tagged {@code SecondaryPos.Clock} and whose lexical
 * form ends with {@code A3sg+Loc}.
 */
@Test
public void testTime2() {
  // The morphology is built with a single lexicon entry for this test.
  TurkishMorphology clockMorphology = getMorphology("dört [P:Num,Card;A:Voicing]");
  WordAnalysis analyses = clockMorphology.analyze("10:24'te");

  Assert.assertEquals(1, analyses.analysisCount());

  SingleAnalysis only = analyses.getAnalysisResults().get(0);
  Assert.assertEquals(SecondaryPos.Clock, only.getDictionaryItem().secondaryPos);

  String lexicalForm = only.formatLexical();
  Assert.assertTrue(lexicalForm.endsWith("A3sg+Loc"));
}
Aggregations