Search in sources:

Example 1 with zemberek.morphology._analyzer._SingleAnalysis

use of zemberek.morphology._analyzer._SingleAnalysis in project zemberek-nlp by ahmetaa.

the class _SingleAnalysis method fromSearchPath.

// Here we generate a _SingleAnalysis from a search path.
public static _SingleAnalysis fromSearchPath(SearchPath searchPath) {
    List<MorphemeSurface> morphemes = new ArrayList<>(searchPath.transitions.size());
    int derivationCount = 0;
    for (SurfaceTransition transition : searchPath.getTransitions()) {
        if (transition.isDerivative()) {
            derivationCount++;
        }
        Morpheme morpheme = transition.getMorpheme();
        // if empty, use the cache.
        if (transition.surface.isEmpty()) {
            MorphemeSurface suffixSurface = emptyMorphemeCache.get(morpheme);
            if (suffixSurface == null) {
                suffixSurface = new MorphemeSurface(morpheme, "");
                emptyMorphemeCache.put(morpheme, suffixSurface);
            }
            morphemes.add(suffixSurface);
            continue;
        }
        MorphemeSurface suffixSurface = new MorphemeSurface(morpheme, transition.surface);
        morphemes.add(suffixSurface);
    }
    int[] groupBoundaries = new int[derivationCount + 1];
    // we assume there is always an IG
    groupBoundaries[0] = 0;
    int morphemeCounter = 0, derivationCounter = 1;
    for (SurfaceTransition transition : searchPath.getTransitions()) {
        if (transition.isDerivative()) {
            groupBoundaries[derivationCounter] = morphemeCounter;
            derivationCounter++;
        }
        morphemeCounter++;
    }
    return new _SingleAnalysis(searchPath.getDictionaryItem(), morphemes, groupBoundaries);
}
Also used : Morpheme(zemberek.morphology._morphotactics.Morpheme) ArrayList(java.util.ArrayList)

Example 2 with zemberek.morphology._analyzer._SingleAnalysis

use of zemberek.morphology._analyzer._SingleAnalysis in project zemberek-nlp by ahmetaa.

the class _SingleAnalysisTest method morphemeGroupTest.

@Test
public void morphemeGroupTest() {
    InterpretingAnalyzer analyzer = getAnalyzer("kitap");
    _SingleAnalysis analysis = analyzer.analyze("kitaplarda").get(0);
    MorphemeGroup group = analysis.getGroup(0);
    Assert.assertEquals("kitaplarda", group.surface());
    analyzer = getAnalyzer("okumak");
    analysis = analyzer.analyze("okutmuyor").get(0);
    Assert.assertEquals(2, analysis.getMorphemeGroupCount());
    MorphemeGroup group0 = analysis.getGroup(0);
    Assert.assertEquals("oku", group0.surface());
    MorphemeGroup group1 = analysis.getGroup(1);
    Assert.assertEquals("tmuyor", group1.surface());
}
Also used : MorphemeGroup(zemberek.morphology._analyzer._SingleAnalysis.MorphemeGroup) Test(org.junit.Test)

Example 3 with zemberek.morphology._analyzer._SingleAnalysis

use of zemberek.morphology._analyzer._SingleAnalysis in project zemberek-nlp by ahmetaa.

the class AnalyzerTestBase method lastMorphemeIs.

boolean lastMorphemeIs(_SingleAnalysis result, String morphemeName) {
    List<MorphemeSurface> morphemes = result.getMorphemesSurfaces();
    if (morphemes.size() == 0) {
        return false;
    }
    MorphemeSurface last = morphemes.get(morphemes.size() - 1);
    return last.morpheme.id.equalsIgnoreCase(morphemeName);
}
Also used : MorphemeSurface(zemberek.morphology._analyzer._SingleAnalysis.MorphemeSurface)

Example 4 with zemberek.morphology._analyzer._SingleAnalysis

use of zemberek.morphology._analyzer._SingleAnalysis in project zemberek-nlp by ahmetaa.

the class InterpretingAnalyzer method analyze.

public List<_SingleAnalysis> analyze(String input, AnalysisDebugData debugData) {
    // get stem candidates.
    List<StemTransition> candidates = Lists.newArrayListWithCapacity(3);
    for (int i = 1; i <= input.length(); i++) {
        String stem = input.substring(0, i);
        candidates.addAll(getMatchingStemTransitions(stem));
    }
    if (debugData != null) {
        debugData.input = input;
        debugData.candidateStemTransitions.addAll(candidates);
    }
    // generate initial search paths.
    List<SearchPath> paths = new ArrayList<>();
    for (StemTransition candidate : candidates) {
        int length = candidate.surface.length();
        String head = input.substring(0, length);
        String tail = input.substring(length);
        paths.add(SearchPath.initialPath(candidate, head, tail));
    }
    // search graph.
    List<SearchPath> resultPaths = search(paths, debugData);
    // generate results from successful paths.
    List<_SingleAnalysis> result = new ArrayList<>(resultPaths.size());
    for (SearchPath path : resultPaths) {
        _SingleAnalysis analysis = _SingleAnalysis.fromSearchPath(path);
        result.add(analysis);
        if (debugData != null) {
            debugData.results.add(analysis);
        }
    }
    return result;
}
Also used : StemTransition(zemberek.morphology._morphotactics.StemTransition) ArrayList(java.util.ArrayList)

Example 5 with zemberek.morphology._analyzer._SingleAnalysis

use of zemberek.morphology._analyzer._SingleAnalysis in project zemberek-nlp by ahmetaa.

the class _MorphologicalAmbiguityResolverExperiment method extracData.

public void extracData(Path p, Path outRoot, int maxAnalysisCount, int resultLimit) throws IOException {
    List<Path> files = Files.walk(p, 1).filter(s -> s.toFile().isFile() && s.toFile().getName().endsWith(".corpus")).collect(Collectors.toList());
    LinkedHashSet<SingleAnalysisSentence> result = new LinkedHashSet<>();
    int i = 0;
    for (Path file : files) {
        List<SingleAnalysisSentence> collect = collect(file, maxAnalysisCount);
        result.addAll(collect);
        i++;
        Log.info("%d of %d", i, files.size());
        if (resultLimit > 0 && result.size() > resultLimit) {
            break;
        }
    }
    String s = p.toFile().getName();
    Path out = outRoot.resolve(s + "-unambigious.txt");
    try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
        for (SingleAnalysisSentence sentence : result) {
            pw.println(sentence.sentence);
            for (Single single : sentence.tokens) {
                for (_SingleAnalysis r : single.res) {
                    pw.println(r.formatSurfaceSequence());
                }
            }
            pw.println();
        }
    }
    // saving failed words.
    failedWords.saveSortedByKeys(outRoot.resolve(s + "-failed.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    // saving failed words by frequency.
    failedWords.saveSortedByCounts(outRoot.resolve(s + "-failed.freq.txt"), " ");
}
Also used : Path(java.nio.file.Path) Token(org.antlr.v4.runtime.Token) Strings(zemberek.core.io.Strings) HashMap(java.util.HashMap) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) ArrayList(java.util.ArrayList) InterpretingAnalyzer(zemberek.morphology._analyzer.InterpretingAnalyzer) zemberek.morphology._analyzer._SingleAnalysis(zemberek.morphology._analyzer._SingleAnalysis) Map(java.util.Map) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) Objects(java.util.Objects) List(java.util.List) Turkish(zemberek.morphology.structure.Turkish) Paths(java.nio.file.Paths) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) LanguageIdentifier(zemberek.langid.LanguageIdentifier) Pattern(java.util.regex.Pattern) RootLexicon(zemberek.morphology.lexicon.RootLexicon) LinkedHashSet(java.util.LinkedHashSet) zemberek.morphology._analyzer._SingleAnalysis(zemberek.morphology._analyzer._SingleAnalysis) PrintWriter(java.io.PrintWriter)

Aggregations

ArrayList (java.util.ArrayList): 3; IOException (java.io.IOException): 1; PrintWriter (java.io.PrintWriter): 1; StandardCharsets (java.nio.charset.StandardCharsets): 1; Files (java.nio.file.Files): 1; Path (java.nio.file.Path): 1; Paths (java.nio.file.Paths): 1; HashMap (java.util.HashMap): 1; LinkedHashSet (java.util.LinkedHashSet): 1; List (java.util.List): 1; Map (java.util.Map): 1; Objects (java.util.Objects): 1; Pattern (java.util.regex.Pattern): 1; Collectors (java.util.stream.Collectors): 1; Token (org.antlr.v4.runtime.Token): 1; Test (org.junit.Test): 1; Histogram (zemberek.core.collections.Histogram): 1; Strings (zemberek.core.io.Strings): 1; Log (zemberek.core.logging.Log): 1; SecondaryPos (zemberek.core.turkish.SecondaryPos): 1