Use of zemberek.morphology._analyzer._SingleAnalysis in project zemberek-nlp by ahmetaa.
Class _SingleAnalysis, method fromSearchPath:
// Here we generate a _SingleAnalysis from a search path.
public static _SingleAnalysis fromSearchPath(SearchPath searchPath) {

  List<MorphemeSurface> morphemes = new ArrayList<>(searchPath.transitions.size());
  int derivationCount = 0;

  for (SurfaceTransition transition : searchPath.getTransitions()) {

    if (transition.isDerivative()) {
      derivationCount++;
    }

    Morpheme morpheme = transition.getMorpheme();

    // if empty, use the cache.
    if (transition.surface.isEmpty()) {
      MorphemeSurface suffixSurface = emptyMorphemeCache.get(morpheme);
      if (suffixSurface == null) {
        suffixSurface = new MorphemeSurface(morpheme, "");
        emptyMorphemeCache.put(morpheme, suffixSurface);
      }
      morphemes.add(suffixSurface);
      continue;
    }

    MorphemeSurface suffixSurface = new MorphemeSurface(morpheme, transition.surface);
    morphemes.add(suffixSurface);
  }

  int[] groupBoundaries = new int[derivationCount + 1];
  // we assume there is always an IG
  groupBoundaries[0] = 0;

  int morphemeCounter = 0, derivationCounter = 1;
  for (SurfaceTransition transition : searchPath.getTransitions()) {
    if (transition.isDerivative()) {
      groupBoundaries[derivationCounter] = morphemeCounter;
      derivationCounter++;
    }
    morphemeCounter++;
  }

  return new _SingleAnalysis(searchPath.getDictionaryItem(), morphemes, groupBoundaries);
}
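To make the group-boundary logic above concrete, here is a minimal, self-contained sketch (plain Java, no zemberek types; the morpheme surfaces and derivative flags are made-up stand-ins for "okutmuyor") that computes the same groupBoundaries array:

import java.util.Arrays;
import java.util.List;

public class GroupBoundarySketch {

  public static void main(String[] args) {
    // Hypothetical stand-ins for the transitions of "okutmuyor":
    // "oku" (stem), "t" (derivational), "muyor" (inflectional).
    List<String> surfaces = Arrays.asList("oku", "t", "muyor");
    boolean[] derivative = {false, true, false};

    // Count derivations, exactly like the first loop in fromSearchPath.
    int derivationCount = 0;
    for (boolean d : derivative) {
      if (d) {
        derivationCount++;
      }
    }

    // One boundary per group; index 0 always starts the first group.
    int[] groupBoundaries = new int[derivationCount + 1];
    groupBoundaries[0] = 0;

    int morphemeCounter = 0, derivationCounter = 1;
    for (boolean d : derivative) {
      if (d) {
        groupBoundaries[derivationCounter] = morphemeCounter;
        derivationCounter++;
      }
      morphemeCounter++;
    }

    // Prints [0, 1]: group 0 covers "oku", group 1 covers "t" + "muyor".
    System.out.println(Arrays.toString(groupBoundaries));
  }
}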
Use of zemberek.morphology._analyzer._SingleAnalysis in project zemberek-nlp by ahmetaa.
Class _SingleAnalysisTest, method morphemeGroupTest:
@Test
public void morphemeGroupTest() {
  InterpretingAnalyzer analyzer = getAnalyzer("kitap");
  _SingleAnalysis analysis = analyzer.analyze("kitaplarda").get(0);
  MorphemeGroup group = analysis.getGroup(0);
  Assert.assertEquals("kitaplarda", group.surface());

  analyzer = getAnalyzer("okumak");
  analysis = analyzer.analyze("okutmuyor").get(0);
  Assert.assertEquals(2, analysis.getMorphemeGroupCount());
  MorphemeGroup group0 = analysis.getGroup(0);
  Assert.assertEquals("oku", group0.surface());
  MorphemeGroup group1 = analysis.getGroup(1);
  Assert.assertEquals("tmuyor", group1.surface());
}
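Using only the accessors the test exercises (getMorphemeGroupCount, getGroup, surface), a small hypothetical helper like the one below could dump every group of an analysis; the analysis itself is assumed to be obtained via the getAnalyzer(...).analyze(...) pattern shown in the test:

// Hypothetical helper, not part of zemberek-nlp: prints one line per morpheme group.
static void printGroups(_SingleAnalysis analysis) {
  for (int i = 0; i < analysis.getMorphemeGroupCount(); i++) {
    MorphemeGroup group = analysis.getGroup(i);
    System.out.println("group " + i + " = " + group.surface());
  }
}

For "okutmuyor" this would print "oku" for group 0 and "tmuyor" for group 1, matching the assertions above.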
Use of zemberek.morphology._analyzer._SingleAnalysis in project zemberek-nlp by ahmetaa.
Class AnalyzerTestBase, method lastMorphemeIs:
boolean lastMorphemeIs(_SingleAnalysis result, String morphemeName) {
  List<MorphemeSurface> morphemes = result.getMorphemesSurfaces();
  if (morphemes.size() == 0) {
    return false;
  }
  MorphemeSurface last = morphemes.get(morphemes.size() - 1);
  return last.morpheme.id.equalsIgnoreCase(morphemeName);
}
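A sketch of how a test might call this helper, following the getAnalyzer(...).analyze(...) pattern used elsewhere in the test base; the morpheme id "A3pl" is an assumed example, and the exact id to check depends on zemberek's morphotactics:

// Sketch: assert that the top analysis of "kitaplar" ends with the assumed plural morpheme id.
_SingleAnalysis analysis = getAnalyzer("kitap").analyze("kitaplar").get(0);
Assert.assertTrue(lastMorphemeIs(analysis, "A3pl"));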
Use of zemberek.morphology._analyzer._SingleAnalysis in project zemberek-nlp by ahmetaa.
Class InterpretingAnalyzer, method analyze:
public List<_SingleAnalysis> analyze(String input, AnalysisDebugData debugData) {

  // get stem candidates.
  List<StemTransition> candidates = Lists.newArrayListWithCapacity(3);
  for (int i = 1; i <= input.length(); i++) {
    String stem = input.substring(0, i);
    candidates.addAll(getMatchingStemTransitions(stem));
  }

  if (debugData != null) {
    debugData.input = input;
    debugData.candidateStemTransitions.addAll(candidates);
  }

  // generate initial search paths.
  List<SearchPath> paths = new ArrayList<>();
  for (StemTransition candidate : candidates) {
    int length = candidate.surface.length();
    String head = input.substring(0, length);
    String tail = input.substring(length);
    paths.add(SearchPath.initialPath(candidate, head, tail));
  }

  // search graph.
  List<SearchPath> resultPaths = search(paths, debugData);

  // generate results from successful paths.
  List<_SingleAnalysis> result = new ArrayList<>(resultPaths.size());
  for (SearchPath path : resultPaths) {
    _SingleAnalysis analysis = _SingleAnalysis.fromSearchPath(path);
    result.add(analysis);
    if (debugData != null) {
      debugData.results.add(analysis);
    }
  }

  return result;
}
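The stem-candidate step above simply tries every prefix of the input. The self-contained sketch below (plain Java, no zemberek lookup) shows which strings would be handed to getMatchingStemTransitions for the input "kitaplarda":

import java.util.ArrayList;
import java.util.List;

public class StemCandidateSketch {

  public static void main(String[] args) {
    String input = "kitaplarda";
    // Every prefix of the input is a potential stem surface.
    List<String> prefixes = new ArrayList<>();
    for (int i = 1; i <= input.length(); i++) {
      prefixes.add(input.substring(0, i));
    }
    // Prints: k, ki, kit, ..., kitaplarda. Only prefixes matching a StemTransition survive.
    System.out.println(String.join(", ", prefixes));
  }
}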
Use of zemberek.morphology._analyzer._SingleAnalysis in project zemberek-nlp by ahmetaa.
Class _MorphologicalAmbiguityResolverExperiment, method extracData:
public void extracData(Path p, Path outRoot, int maxAnalysisCount, int resultLimit)
    throws IOException {

  List<Path> files = Files.walk(p, 1)
      .filter(s -> s.toFile().isFile() && s.toFile().getName().endsWith(".corpus"))
      .collect(Collectors.toList());

  LinkedHashSet<SingleAnalysisSentence> result = new LinkedHashSet<>();

  int i = 0;
  for (Path file : files) {
    List<SingleAnalysisSentence> collect = collect(file, maxAnalysisCount);
    result.addAll(collect);
    i++;
    Log.info("%d of %d", i, files.size());
    if (resultLimit > 0 && result.size() > resultLimit) {
      break;
    }
  }

  String s = p.toFile().getName();

  Path out = outRoot.resolve(s + "-unambigious.txt");
  try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
    for (SingleAnalysisSentence sentence : result) {
      pw.println(sentence.sentence);
      for (Single single : sentence.tokens) {
        for (_SingleAnalysis r : single.res) {
          pw.println(r.formatSurfaceSequence());
        }
      }
      pw.println();
    }
  }

  // saving failed words.
  failedWords.saveSortedByKeys(outRoot.resolve(s + "-failed.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  // saving failed words by frequency.
  failedWords.saveSortedByCounts(outRoot.resolve(s + "-failed.freq.txt"), " ");
}
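As a standalone illustration of the file selection above, the sketch below lists .corpus files in a directory the same way; the directory path "corpora" is a made-up example:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;

public class CorpusListingSketch {

  public static void main(String[] args) throws IOException {
    // Hypothetical corpus root; like extracData, we walk only one level deep.
    Path root = Paths.get("corpora");
    List<Path> files = Files.walk(root, 1)
        .filter(s -> s.toFile().isFile() && s.toFile().getName().endsWith(".corpus"))
        .collect(Collectors.toList());
    files.forEach(System.out::println);
  }
}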