use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class WordGenerator method generate.
/**
 * Generates results for the given stem transition candidates and morpheme sequence.
 *
 * <p>One initial {@code GenerationPath} is created per candidate, the paths are handed to
 * {@code search}, and every path it returns is converted to a {@code SingleAnalysis} and wrapped
 * in a {@code Result} together with the analysis' surface form.
 *
 * @param input the input string; only recorded in the debug data when debug mode is on.
 * @param candidates stem transitions used as starting points.
 * @param morphemes morphemes the generated paths are expected to contain.
 * @return one {@code Result} per path returned by {@code search}.
 */
private List<Result> generate(String input, List<StemTransition> candidates, List<Morpheme> morphemes) {
  if (debugMode) {
    debugData = new AnalysisDebugData();
    debugData.input = input;
    debugData.candidateStemTransitions.addAll(candidates);
  }

  // Build one starting path per candidate stem transition.
  List<GenerationPath> initialPaths = new ArrayList<>();
  for (StemTransition stemTransition : candidates) {
    // The tail is set to " " because some morphotactics conditions check the tail's size
    // during the graph walk. As this is generation, those conditions are always let through.
    SearchPath start = SearchPath.initialPath(stemTransition, " ");

    List<Morpheme> remaining;
    if (morphemes.isEmpty()) {
      remaining = new ArrayList<>(0);
    } else if (morphemes.get(0).equals(start.getCurrentState().morpheme)) {
      // The first morpheme is already covered by the initial state of the path, so skip it.
      remaining = morphemes.subList(1, morphemes.size());
    } else {
      remaining = new ArrayList<>(morphemes);
    }
    initialPaths.add(new GenerationPath(start, remaining));
  }

  // Walk the graph.
  List<GenerationPath> successfulPaths = search(initialPaths);

  // Turn every successful path into a result.
  List<Result> generated = new ArrayList<>(successfulPaths.size());
  for (GenerationPath successful : successfulPaths) {
    SingleAnalysis singleAnalysis = SingleAnalysis.fromSearchPath(successful.path);
    generated.add(new Result(singleAnalysis.surfaceForm(), singleAnalysis));
    if (debugMode) {
      debugData.results.add(singleAnalysis);
    }
  }
  return generated;
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class TurkishMorphologyFunctionalTests method testDate2.
// Checks that "1.1.2014'te" gets a single analysis whose dictionary item is a Date
// and whose lexical form ends with "A3sg+Loc".
@Test
public void testDate2() {
  WordAnalysis analyses =
      getMorphology("dört [P:Num,Card;A:Voicing]").analyze("1.1.2014'te");
  Assert.assertEquals(1, analyses.analysisCount());

  SingleAnalysis first = analyses.getAnalysisResults().get(0);
  Assert.assertEquals(SecondaryPos.Date, first.getDictionaryItem().secondaryPos);
  Assert.assertTrue(first.formatLexical().endsWith("A3sg+Loc"));
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class TurkishMorphologyFunctionalTests method testHashTag.
// Checks that "#haha_ha'ya" gets a single analysis whose dictionary item is a HashTag
// with lemma "#haha_ha" and whose lexical form ends with "A3sg+Dat".
@Test
public void testHashTag() {
  WordAnalysis analyses = getMorphology().analyze("#haha_ha'ya");
  Assert.assertEquals(1, analyses.analysisCount());

  SingleAnalysis first = analyses.getAnalysisResults().get(0);
  Assert.assertEquals(SecondaryPos.HashTag, first.getDictionaryItem().secondaryPos);
  Assert.assertTrue(first.formatLexical().endsWith("A3sg+Dat"));
  Assert.assertEquals("#haha_ha", first.getDictionaryItem().lemma);
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class UnsupervisedKeyPhraseExtractor method collectCorpusStatisticsForLemmas.
/**
 * Collects lemma based term and document frequency statistics from a corpus.
 *
 * <p>For every document, sentences are extracted from the document lines and disambiguated.
 * Each analysis that passes {@code analysisAcceptable} and whose surface form is not in
 * {@code TurkishStopWords.DEFAULT} contributes its last lemma to a per-document histogram.
 * The histogram is added to {@code termFrequencies}, and each distinct lemma of the document
 * is added once to {@code documentFrequencies}.
 *
 * @param corpus corpus whose documents are processed.
 * @param analyzer morphology instance used for sentence analysis and disambiguation.
 * @param count maximum number of documents to process; a value of zero or less means no limit.
 * @return the collected statistics.
 * @throws IOException declared by the signature; presumably raised while reading the corpus
 *     (not visible in this method - confirm against callees).
 */
static CorpusStatistics collectCorpusStatisticsForLemmas(WebCorpus corpus, TurkishMorphology analyzer, int count) throws IOException {
  CorpusStatistics statistics = new CorpusStatistics(1_000_000);
  int docCount = 0;
  for (WebDocument document : corpus.getDocuments()) {
    Histogram<String> docHistogram = new Histogram<>();
    List<String> sentences = extractor.fromParagraphs(document.getLines());
    for (String sentence : sentences) {
      List<SingleAnalysis> analysis = analyzer.analyzeAndDisambiguate(sentence).bestAnalysis();
      for (SingleAnalysis w : analysis) {
        if (!analysisAcceptable(w)) {
          continue;
        }
        // Stop words are checked against the surface form, not the lemma.
        String s = w.getStemAndEnding().concat();
        if (TurkishStopWords.DEFAULT.contains(s)) {
          continue;
        }
        // The last lemma in the list is the one that is counted.
        List<String> lemmas = w.getLemmas();
        docHistogram.add(lemmas.get(lemmas.size() - 1));
      }
    }
    statistics.termFrequencies.add(docHistogram);
    // Each distinct lemma of this document counts once towards document frequency.
    for (String s : docHistogram) {
      statistics.documentFrequencies.add(s);
    }
    if (docCount++ % 500 == 0) {
      Log.info("Doc count = %d", docCount);
    }
    // docCount now holds the number of documents processed so far. Stop as soon as the limit
    // is reached; comparing with '>' would process one document more than requested and
    // make the frequencies disagree with the documentCount reported below.
    if (count > 0 && docCount >= count) {
      break;
    }
  }
  statistics.documentCount = count > 0 ? Math.min(count, corpus.documentCount()) : corpus.documentCount();
  return statistics;
}
use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
the class UnsupervisedKeyPhraseExtractor method lemmaNgrams.
/**
 * Builds lemma n-gram histograms for the given paragraphs.
 *
 * <p>Sentences are extracted from the paragraphs and disambiguated. For each order from 1 to
 * {@code order}, n-grams are formed from the last lemma of consecutive analyses. An n-gram is
 * dropped if any of its tokens fails {@code analysisAcceptable} or has a surface form that is in
 * {@code TurkishStopWords.DEFAULT}. When a term is seen for the first time, its first occurrence
 * index is set to the position of its first token, counted over all sentences processed so far.
 *
 * @param paragraphs paragraphs to extract sentences from.
 * @return a list of {@code order} histograms; the histogram at index i holds the (i+1)-grams.
 */
private List<Histogram<Term>> lemmaNgrams(List<String> paragraphs) {
  List<Histogram<Term>> ngrams = new ArrayList<>(order + 1);
  for (int i = 0; i < order; i++) {
    ngrams.add(new Histogram<>(100));
  }
  // Number of tokens in the sentences that precede the current one.
  int tokenCount = 0;
  List<String> sentences = extractor.fromParagraphs(paragraphs);
  for (String sentence : sentences) {
    List<SingleAnalysis> analysis = morphology.analyzeAndDisambiguate(sentence).bestAnalysis();
    for (int i = 0; i < order; i++) {
      int currentOrder = i + 1;
      // NOTE(review): this bound leaves out the n-gram that ends on the last token of the
      // sentence. Kept as is because it may be deliberate (e.g. sentence final punctuation);
      // confirm before changing.
      for (int j = 0; j < analysis.size() - currentOrder; j++) {
        String[] words = new String[currentOrder];
        boolean fail = false;
        for (int k = 0; k < currentOrder; k++) {
          SingleAnalysis a = analysis.get(j + k);
          if (!analysisAcceptable(a)) {
            fail = true;
            break;
          }
          String surface = a.getStemAndEnding().concat();
          if (TurkishStopWords.DEFAULT.contains(surface)) {
            fail = true;
            break;
          }
          List<String> lemmas = a.getLemmas();
          words[k] = lemmas.get(lemmas.size() - 1);
        }
        if (!fail) {
          Term term = new Term(words);
          int count = ngrams.get(i).add(term);
          if (count == 1) {
            // if this is the first time, set the first occurrence index.
            term.setFirstOccurrenceIndex(tokenCount + j);
          }
        }
      }
    }
    // Advance the offset once per sentence. Doing this inside the position loop would add the
    // sentence length for every start position and every order, corrupting occurrence indexes.
    tokenCount += analysis.size();
  }
  return ngrams;
}
Aggregations