Search in sources:

Example 21 with TurkishMorphology

Use of zemberek.morphology.analysis.tr.TurkishMorphology in project zemberek-nlp by ahmetaa.

The class TurkishSentenceAnalyzerTest, method setUp.

/**
 * Test fixture: assigns {@code parser} a {@code TurkishSentenceAnalyzer} built from
 * {@code TurkishMorphology.createWithDefaults()} and a new
 * {@code Z3MarkovModelDisambiguator}.
 *
 * @throws Exception if any of the collaborators fails to initialize
 */
@Before
public void setUp() throws Exception {
    parser = new TurkishSentenceAnalyzer(
            TurkishMorphology.createWithDefaults(),
            new Z3MarkovModelDisambiguator());
}
Also used : Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Before(org.junit.Before)

Example 22 with TurkishMorphology

Use of zemberek.morphology.analysis.tr.TurkishMorphology in project zemberek-nlp by ahmetaa.

The class UnidentifiedTokenAnalyzerTest, method shouldParseSmallCaseProperNounsWithSingleQuote.

/**
 * Verifies how {@code UnidentifiedTokenAnalyzer} treats proper nouns written with an
 * apostrophe, both capitalized and in lower case, using a morphology built from
 * {@code dev-lexicon.txt}.
 *
 * <ul>
 *   <li>"İstanbul'un" / "istanbul'un": two analyses, each one of the expected forms.</li>
 *   <li>"Ankara'ya" / "ankara'ya": exactly one dative analysis.</li>
 *   <li>"Karaman" and "karaman'a": one analysis each; bare "karaman": none.</li>
 * </ul>
 */
@Test
public void shouldParseSmallCaseProperNounsWithSingleQuote() throws IOException {
    HashSet<String> istanbulForms = Sets.newHashSet("[(İstanbul:istanbul) (Noun,Prop;A3sg+P2sg:un+Nom)]", "[(İstanbul:istanbul) (Noun,Prop;A3sg+Pnon+Gen:un)]");
    TurkishMorphology morphology = TurkishMorphology.builder().addTextDictionaryResources("dev-lexicon.txt").build();
    UnidentifiedTokenAnalyzer analyzer = new UnidentifiedTokenAnalyzer(morphology);

    // Capitalized and lower-case input must both produce the same two analyses.
    for (String token : new String[] {"İstanbul'un", "istanbul'un"}) {
        List<WordAnalysis> analyses = analyzer.analyze(token);
        Assert.assertEquals(2, analyses.size());
        for (WordAnalysis analysis : analyses) {
            Assert.assertTrue(istanbulForms.contains(analysis.formatLong()));
        }
    }

    String ankaraDative = "[(Ankara:ankara) (Noun,Prop;A3sg+Pnon+Dat:ya)]";
    for (String token : new String[] {"Ankara'ya", "ankara'ya"}) {
        List<WordAnalysis> analyses = analyzer.analyze(token);
        Assert.assertEquals(1, analyses.size());
        Assert.assertEquals(ankaraDative, analyses.get(0).formatLong());
    }

    // Karaman does not exist in dictionary
    List<WordAnalysis> karamanBare = analyzer.analyze("Karaman");
    Assert.assertEquals(1, karamanBare.size());
    Assert.assertEquals("[(Karaman:karaman) (Noun,Prop;A3sg+Pnon+Nom)]", karamanBare.get(0).formatLong());

    List<WordAnalysis> karamanDative = analyzer.analyze("karaman'a");
    Assert.assertEquals(1, karamanDative.size());
    Assert.assertEquals("[(Karaman:karaman) (Noun,Prop;A3sg+Pnon+Dat:a)]", karamanDative.get(0).formatLong());

    // Lower case without an apostrophe yields no analysis.
    List<WordAnalysis> karamanLower = analyzer.analyze("karaman");
    Assert.assertEquals(0, karamanLower.size());
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) UnidentifiedTokenAnalyzer(zemberek.morphology.analysis.tr.UnidentifiedTokenAnalyzer) Test(org.junit.Test)

Example 23 with TurkishMorphology

Use of zemberek.morphology.analysis.tr.TurkishMorphology in project zemberek-nlp by ahmetaa.

The class ZemberekNlpScripts, method parseLargeVocabularyZemberekForMorfessor.

/**
 * Script (ignored by JUnit) that writes root/suffix segmentations of words from
 * {@code vocab.all.freq} to {@code out/morfessor-annotation.txt} under {@code DATA_PATH}.
 *
 * <p>For every histogram entry of at least four characters, the word is analyzed. If the
 * first analysis is not {@code PrimaryPos.Unknown}, each analysis whose lemma is longer
 * than one character contributes a segmentation {@code "<root> <suffix> <suffix> ..."}.
 * Distinct segmentations are joined with {@code ", "} and stored as
 * {@code "<word> <segmentations>"}, unless the joined text equals the word itself or is
 * shorter than three characters.
 *
 * @throws IOException if the output directory cannot be created or the word frequency
 *     file cannot be read
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberekForMorfessor() throws IOException {
    Path wordFreqFile = DATA_PATH.resolve("vocab.all.freq");
    Path outDir = DATA_PATH.resolve("out");
    Files.createDirectories(outDir);
    TurkishMorphology parser = TurkishMorphology.createWithDefaults();
    Log.info("Loading histogram.");
    Histogram<String> histogram = Histogram.loadFromUtf8File(wordFreqFile, ' ');
    // NOTE(review): presumably drops words seen fewer than 1000 times - confirm against Histogram.
    histogram.removeSmaller(1000);
    List<String> accepted = new ArrayList<>(histogram.size());
    // Counts only the words that pass the length filter below.
    int c = 0;
    for (String s : histogram) {
        s = s.trim();
        if (s.length() < 4) {
            continue;
        }
        List<WordAnalysis> parses = parser.analyze(s);
        if (!parses.isEmpty() && parses.get(0).dictionaryItem.primaryPos != PrimaryPos.Unknown) {
            // Insertion-ordered set: keeps the analysis order while dropping duplicates.
            LinkedHashSet<String> k = new LinkedHashSet<>(2);
            for (WordAnalysis parse : parses) {
                if (parse.dictionaryItem.lemma.length() > 1) {
                    // Normalize the WHOLE segmentation. Without the parentheses the
                    // replaceAll/trim chain binds only to the joined suffixes, so a parse
                    // without suffix surfaces produced "root " with a trailing space, which
                    // leaked into the output and defeated de-duplication.
                    String str = (parse.root + " " + String.join(" ", parse.suffixSurfaceList()))
                            .replaceAll("[ ]+", " ")
                            .trim();
                    k.add(str);
                }
            }
            String join = String.join(", ", k).trim();
            if (!s.equals(join) && join.length() > 2) {
                accepted.add(s + " " + join);
            }
        }
        if (c > 0 && c % 10000 == 0) {
            Log.info("Processed = " + c);
        }
        c++;
    }
    sortAndSave(outDir.resolve("morfessor-annotation.txt"), accepted);
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 24 with TurkishMorphology

Use of zemberek.morphology.analysis.tr.TurkishMorphology in project zemberek-nlp by ahmetaa.

The class ZemberekNlpScripts, method generatorTest.

/**
 * Scratch script (ignored by JUnit): analyzes the word "besiciliği" with a default
 * morphology and logs the {@code inflectionalGroups} of the first analysis returned.
 */
@Test
@Ignore("Not a Test.")
public void generatorTest() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    WordAnalysis firstAnalysis = morphology.analyze("besiciliği").get(0);
    Log.info(firstAnalysis.inflectionalGroups);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 25 with TurkishMorphology

Use of zemberek.morphology.analysis.tr.TurkishMorphology in project zemberek-nlp by ahmetaa.

The class PerceptronNer, method main.

/**
 * Entry point: trains a perceptron NER model on the bracket-format training corpus,
 * runs it over the bracket-format test corpus and writes a report to
 * {@code test-result.txt}.
 *
 * <p>All input and output paths are hard-coded relative to the working directory.
 *
 * @param args ignored
 * @throws IOException if a corpus, gazetteer or report file cannot be read or written
 */
public static void main(String[] args) throws IOException {
    // Alternative corpora live next to these files as reyyan.train.txt / reyyan.test.txt.
    Path trainingCorpus = Paths.get("experiment/src/main/resources/ner/NE-bracket.train.txt");
    NerDataSet trainingSet = NerDataSet.loadBracketTurkishCorpus(trainingCorpus);
    NerDataSet.Info trainingInfo = new NerDataSet.Info(trainingSet);
    trainingInfo.log();

    Path testCorpus = Paths.get("experiment/src/main/resources/ner/NE-bracket.test.txt");
    NerDataSet testSet = NerDataSet.loadBracketTurkishCorpus(testCorpus);
    NerDataSet.Info testInfo = new NerDataSet.Info(testSet);
    testInfo.log();

    Path locationWords = Paths.get("experiment/src/main/resources/ner/location-words.txt");
    Path organizationWords = Paths.get("experiment/src/main/resources/ner/organization-words.txt");
    Gazetteers gazetteers = new Gazetteers(locationWords, organizationWords);

    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    // NOTE(review): 10 and 0.1f look like iteration count and learning rate - confirm in train().
    Map<String, ClassWeights> model = PerceptronNer.train(morphology, gazetteers, trainingSet, testSet, 10, 0.1f);
    PerceptronNer ner = new PerceptronNer(model, morphology, gazetteers);

    Log.info("Testing %d sentences.", testSet.sentences.size());
    NerDataSet predicted = ner.test(testSet);
    Path reportFile = Paths.get("experiment/src/main/resources/ner/test-result.txt");
    testReport(testSet, predicted, reportFile);
    Log.info("Done.");
}
Also used : Path(java.nio.file.Path) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology)

Aggregations

TurkishMorphology (zemberek.morphology.analysis.tr.TurkishMorphology)26 Test (org.junit.Test)13 Ignore (org.junit.Ignore)5 Z3MarkovModelDisambiguator (zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator)5 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)5 TurkishSentenceAnalyzer (zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer)5 Path (java.nio.file.Path)2 UnidentifiedTokenAnalyzer (zemberek.morphology.analysis.tr.UnidentifiedTokenAnalyzer)2 DictionaryItem (zemberek.morphology.lexicon.DictionaryItem)2 Stopwatch (com.google.common.base.Stopwatch)1 BufferedOutputStream (java.io.BufferedOutputStream)1 File (java.io.File)1 FileOutputStream (java.io.FileOutputStream)1 ArrayList (java.util.ArrayList)1 LinkedHashSet (java.util.LinkedHashSet)1 Before (org.junit.Before)1 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)1 LexiconProto (zemberek.morphology.lexicon.proto.LexiconProto)1 Dictionary (zemberek.morphology.lexicon.proto.LexiconProto.Dictionary)1 TurkishSpellChecker (zemberek.normalization.TurkishSpellChecker)1