Search in sources :

Example 6 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class SpeedTest method testNewsCorpus.

/**
 * Speed test over the bundled CNN-Turk sentence corpus.
 *
 * <p>Every sentence is tokenized, punctuation tokens are skipped, and each remaining token is
 * analyzed. Tokens whose analysis reports {@code isCorrect() == false} are counted and collected
 * in a histogram. At the end the elapsed time, parse ratio and tokens/second are logged and the
 * collected tokens are written to the files {@code unk.freq} and {@code unk}.
 *
 * @throws IOException declared by the method; raised by the sentence loading or the histogram
 *     saving calls if they fail
 */
@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpus() throws IOException {
    Path corpus = Paths.get("src/test/resources/corpora/cnn-turk-10k");
    List<String> lines = getSentences(corpus);
    TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();

    // The stopwatch is started before the counters and histogram are set up.
    Stopwatch timer = Stopwatch.createStarted();
    int analyzed = 0;
    int failed = 0;
    int processed = 0;
    Histogram<String> unknown = new Histogram<>(100000);

    for (String line : lines) {
        for (Token tok : TurkishTokenizer.DEFAULT.tokenize(line)) {
            // Punctuation is excluded from both the token count and the analysis.
            if (tok.getType() != Token.Type.Punctuation) {
                analyzed++;
                String text = tok.getText();
                WordAnalysis analysis = analyzer.analyze(text);
                if (!analysis.isCorrect()) {
                    failed++;
                    unknown.add(text);
                }
            }
        }
        // Progress message once every 2000 sentences.
        if (++processed % 2000 == 0) {
            Log.info("%d tokens analyzed.", analyzed);
        }
    }

    double elapsedSeconds = timer.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
    double tokensPerSecond = analyzed / elapsedSeconds;
    // Percentage of tokens that did receive a correct analysis.
    double parsedPercent = 100 - (failed * 100d / analyzed);

    Log.info("%nElapsed = %.2f seconds", elapsedSeconds);
    Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n", analyzed, parsedPercent, tokensPerSecond);

    Log.info("Saving Unknown Tokens");
    unknown.saveSortedByCounts(Paths.get("unk.freq"), " ");
    unknown.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used : Path(java.nio.file.Path) Histogram(zemberek.core.collections.Histogram) Stopwatch(com.google.common.base.Stopwatch) Token(zemberek.tokenization.Token) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 7 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class SpeedTest method main.

/**
 * Runs {@code testForVisualVm} ten times over the CNN-Turk corpus with a single shared analyzer.
 *
 * <p>After each run the analyzer cache is invalidated and the program blocks on
 * {@code System.in.read()} until input arrives (presumably so a profiler can be inspected
 * between runs).
 *
 * @param args unused
 * @throws IOException declared by the method; {@code System.in.read()} can raise it
 */
public static void main(String[] args) throws IOException {
    TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
    Path corpus = Paths.get("morphology/src/test/resources/corpora/cnn-turk-10k");

    int remainingRuns = 10;
    while (remainingRuns > 0) {
        testForVisualVm(corpus, analyzer);
        // Drop cached analyses before the next run.
        analyzer.invalidateCache();
        // Pause until something is read from standard input.
        System.in.read();
        remainingRuns--;
    }
}
Also used : Path(java.nio.file.Path) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 8 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class WordAnalysisSurfaceFormatterTest method formatVerbs.

/**
 * Checks formatting of "olarak" / "Olarak" against a cache-less analyzer whose lexicon contains
 * only "olmak". The expected outputs equal the inputs, both without an apostrophe argument
 * ({@code null}) and with one.
 */
@Test
public void formatVerbs() {
    TurkishMorphology morphology = TurkishMorphology.builder()
        .disableCache()
        .setLexicon("olmak")
        .build();

    String[] words = new String[] { "olarak", "Olarak" };
    String[] formatted = new String[] { "olarak", "Olarak" };

    // First pass: no apostrophe supplied.
    check(morphology, words, formatted, null);
    // Second pass: supplying an apostrophe should not affect the output.
    check(morphology, words, formatted, "'");
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Example 9 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class WordAnalysisSurfaceFormatterTest method formatKnownProperNouns.

/**
 * Checks formatting of words derived from proper nouns present in the lexicon ("Ankara",
 * "Iphone", "Google"). With an apostrophe supplied, the expected outputs use the lexicon's
 * capitalization and place the apostrophe before the suffix (e.g. "ankarada" becomes
 * "Ankara'da"); bare forms such as "Google" are expected unchanged.
 */
@Test
public void formatKnownProperNouns() {
    TurkishMorphology morphology = TurkishMorphology.builder()
        .disableCache()
        .setLexicon(
            "Ankara",
            "Iphone [Pr:ayfon, A:LocaleEn]",
            "Google [Pr:gugıl]")
        .build();

    // The two arrays are index-aligned: words[i] is expected to format to formatted[i].
    String[] words = new String[] {
        "ankarada", "ıphonumun", "googledan", "Iphone", "Google", "Googlesa"
    };
    String[] formatted = new String[] {
        "Ankara'da", "Iphone'umun", "Google'dan", "Iphone", "Google", "Google'sa"
    };

    check(morphology, words, formatted, "'");
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Example 10 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class PerceptronAmbiguityResolverEvaluation method main.

/**
 * Trains a perceptron ambiguity resolver, saves it, and tests the saved models.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Build a morphology instance from the listed dictionary resources.</li>
 *   <li>Load every training file into one data set and load the dev set.</li>
 *   <li>Train with the training set, the dev set and the argument {@code 7}.</li>
 *   <li>Prune near-zero weights and save the model as text.</li>
 *   <li>Reload the text model, write a compressed serialization of it, and test it.</li>
 *   <li>Load the compressed model and test it on the same test file.</li>
 * </ol>
 *
 * <p>Most input files are resolved against a hard-coded absolute root directory.
 *
 * @param args unused
 * @throws IOException declared by the method; the loading and saving calls may raise it
 */
public static void main(String[] args) throws IOException {
    Path root = Paths.get("/media/ahmetaa/depo/ambiguity");

    // Training inputs: one project-relative gold file plus the corpora under the root directory.
    List<Path> trainingFiles = Lists.newArrayList(
        Paths.get("data/gold/gold1.txt"),
        root.resolve("www.aljazeera.com.tr-rule-result.txt"),
        root.resolve("wowturkey.com-rule-result.txt"),
        root.resolve("open-subtitles-tr-2018-rule-result.txt"),
        root.resolve("sak.train"),
        root.resolve("www.haberturk.com-rule-result.txt"),
        root.resolve("www.cnnturk.com-rule-result.txt"));
    Path devFile = root.resolve("sak.dev");
    Path testFile = root.resolve("sak.test");

    // Output locations for the text model and its compressed form.
    Path textModel = Paths.get("morphology/src/main/resources/tr/ambiguity/model");
    Path compressedModel = Paths.get("morphology/src/main/resources/tr/ambiguity/model-compressed");

    TurkishMorphology morphology = TurkishMorphology.create(
        RootLexicon.builder()
            .addTextDictionaryResources(
                "tr/master-dictionary.dict",
                "tr/non-tdk.dict",
                "tr/proper.dict",
                "tr/proper-from-corpus.dict",
                "tr/abbreviations.dict",
                "tr/person-names.dict")
            .build());

    DataSet training = new DataSet();
    for (Path file : trainingFiles) {
        training.add(DataSet.load(file, morphology));
    }
    DataSet development = DataSet.load(devFile, morphology);

    PerceptronAmbiguityResolverTrainer trainer = new PerceptronAmbiguityResolverTrainer(morphology);
    PerceptronAmbiguityResolver trained = trainer.train(training, development, 7);

    // Prune before saving so the text model on disk is the pruned one.
    Weights trainedWeights = (Weights) trained.getModel();
    trainedWeights.pruneNearZeroWeights();
    trainedWeights.saveAsText(textModel);

    System.out.println("Load model and test");
    PerceptronAmbiguityResolver reloaded = PerceptronAmbiguityResolver.fromModelFile(textModel);
    // The compressed file is produced from the reloaded text model, before that model is tested.
    Weights reloadedWeights = (Weights) reloaded.getModel();
    reloadedWeights.compress().serialize(compressedModel);
    PerceptronAmbiguityResolverTrainer.test(testFile, morphology, reloaded);

    System.out.println("Load compressed model and test");
    PerceptronAmbiguityResolver fromCompressed = PerceptronAmbiguityResolver.fromModelFile(compressedModel);
    PerceptronAmbiguityResolverTrainer.test(testFile, morphology, fromCompressed);
}
Also used : Path(java.nio.file.Path) Weights(zemberek.core.data.Weights) DataSet(zemberek.morphology.ambiguity.PerceptronAmbiguityResolverTrainer.DataSet) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology) 87, Test (org.junit.Test) 38, Path (java.nio.file.Path) 34, ArrayList (java.util.ArrayList) 23, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis) 23, WordAnalysis (zemberek.morphology.analysis.WordAnalysis) 23, Ignore (org.junit.Ignore) 21, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem) 15, LinkedHashSet (java.util.LinkedHashSet) 13, PrintWriter (java.io.PrintWriter) 10, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis) 10, Stopwatch (com.google.common.base.Stopwatch) 8, Histogram (zemberek.core.collections.Histogram) 8, Token (zemberek.tokenization.Token) 8, HashSet (java.util.HashSet) 7, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis) 7, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer) 7, ScoredItem (zemberek.core.ScoredItem) 6, IOException (java.io.IOException) 5, BlockTextLoader (zemberek.core.text.BlockTextLoader) 5