Search in sources :

Example 61 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class PerceptronNer method loadModel.

/**
 * Loads every class model file found directly under {@code modelRoot} and builds a
 * {@link PerceptronNer} from them.
 *
 * <p>Only entries whose file name ends with {@code ".ner.model"} are loaded. Each loaded
 * {@code ClassModel} is stored in the weights map under its own {@code id}.
 *
 * @param modelRoot directory to scan; the walk is limited to depth 1, so sub-directories
 *     are not descended into
 * @param morphology morphology instance passed to the {@link PerceptronNer} constructor
 * @return a new {@link PerceptronNer} built from the loaded class models
 * @throws IOException if walking {@code modelRoot} fails, or if propagated from
 *     {@code ClassModel.load}
 */
public static PerceptronNer loadModel(Path modelRoot, TurkishMorphology morphology) throws IOException {
    List<Path> files;
    // Files.walk holds an open directory handle until the stream is closed, so it must be
    // used inside try-with-resources; otherwise the handle leaks on every call.
    try (java.util.stream.Stream<Path> entries = Files.walk(modelRoot, 1)) {
        files = entries
            .filter(s -> s.toFile().getName().endsWith(".ner.model"))
            .collect(Collectors.toList());
    }
    Map<String, ClassModel> weightsMap = new HashMap<>();
    for (Path file : files) {
        ClassModel weights = ClassModel.load(file);
        weightsMap.put(weights.id, weights);
    }
    return new PerceptronNer(weightsMap, morphology);
}
Also used : Path(java.nio.file.Path) CompressedWeights(zemberek.core.data.CompressedWeights) Files(java.nio.file.Files) TextUtil(zemberek.core.text.TextUtil) TurkishMorphology(zemberek.morphology.TurkishMorphology) IOException(java.io.IOException) HashMap(java.util.HashMap) WordAnalysisSurfaceFormatter(zemberek.morphology.analysis.WordAnalysisSurfaceFormatter) Collectors(java.util.stream.Collectors) Weights(zemberek.core.data.Weights) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) WeightLookup(zemberek.core.data.WeightLookup) List(java.util.List) TextIO(zemberek.core.text.TextIO) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Map(java.util.Map) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) ScoredItem(zemberek.core.ScoredItem) Path(java.nio.file.Path) HashMap(java.util.HashMap)

Example 62 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class CharacterGraphDecoderTest method stemEndingTest1.

/**
 * Builds a stem-ending graph from a two-verb lexicon and the endings "acak"/"ecek", then
 * verifies that the decoder returns exactly one suggestion, "bakacak", for the
 * misspelled input "bakcaak".
 */
@Test
public void stemEndingTest1() {
    TurkishMorphology morphology = TurkishMorphology.builder()
        .setLexicon("bakmak", "gelmek")
        .build();
    List<String> suffixes = Lists.newArrayList("acak", "ecek");
    StemEndingGraph stemEndingGraph = new StemEndingGraph(morphology, suffixes);
    CharacterGraphDecoder decoder = new CharacterGraphDecoder(stemEndingGraph.stemGraph);

    List<String> suggestions = decoder.getSuggestions("bakcaak");

    Assert.assertEquals(1, suggestions.size());
    Assert.assertEquals("bakacak", suggestions.get(0));
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Example 63 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class NerExperiment method trainAndTest.

/**
 * Trains a {@link PerceptronNer} on the training set, saves it as text under
 * {@code modelRoot}, evaluates it on the test set and writes an evaluation report.
 *
 * <p>Both data sets are read with {@code AnnotationStyle.BRACKET}.
 *
 * @param trainPath path of the training data set
 * @param testPath path of the test data set
 * @param modelRoot directory the model is saved into; created if it does not exist
 * @param reportPath path passed to {@code PerceptronNerTrainer.evaluationReport}
 * @throws IOException declared by the method; presumably raised by the load/save steps
 */
public static void trainAndTest(Path trainPath, Path testPath, Path modelRoot, Path reportPath) throws IOException {
    NerDataSet trainData = NerDataSet.load(trainPath, AnnotationStyle.BRACKET);
    Log.info(trainData.info());

    NerDataSet testData = NerDataSet.load(testPath, AnnotationStyle.BRACKET);
    Log.info(testData.info());

    PerceptronNerTrainer trainer = new PerceptronNerTrainer(TurkishMorphology.createWithDefaults());
    // NOTE(review): 7 and 0.1f look like iteration count and learning rate; confirm
    // against PerceptronNerTrainer.train.
    PerceptronNer model = trainer.train(trainData, testData, 7, 0.1f);

    Files.createDirectories(modelRoot);
    model.saveModelAsText(modelRoot);

    Log.info("Testing %d sentences.", testData.sentences.size());
    NerDataSet predicted = model.evaluate(testData);
    PerceptronNerTrainer.evaluationReport(testData, predicted, reportPath);
    Log.info("Done.");
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 64 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class NerExperiment method main.

/**
 * Experiment entry point: trains and tests a model, reloads it from disk, then tags every
 * sentence of an input file and writes the bracket-annotated result to an output file.
 *
 * <p>Sentences that already contain {@code '['} or {@code ']'} are skipped. Elapsed time
 * (measured from just before the input is read) and the total token count of the
 * processed sentences are printed to standard output.
 *
 * @param args not used
 * @throws IOException if reading the input or creating the output file fails, or if
 *     propagated from training/model loading
 */
public static void main(String[] args) throws IOException {
    final Path dataRoot = Paths.get("/home/ahmetaa/data/nlp/ner");
    final Path modelDir = dataRoot.resolve("ner/model-toy");

    trainAndTest(
        dataRoot.resolve("sentences.20k.result.txt"),
        dataRoot.resolve("reyyan.test.txt"),
        modelDir,
        dataRoot.resolve("test-result.txt"));

    PerceptronNer ner = PerceptronNer.loadModel(modelDir, TurkishMorphology.createWithDefaults());

    Stopwatch timer = Stopwatch.createStarted();
    Path inputFile = dataRoot.resolve("sentences.1k");
    Path outputFile = dataRoot.resolve("sentences.1k.result.txt");
    List<String> lines = Files.readAllLines(inputFile);

    int tokenCount = 0;
    try (PrintWriter writer = new PrintWriter(outputFile.toFile(), "UTF-8")) {
        for (String line : lines) {
            // Only sentences without bracket characters are processed.
            boolean hasBracket = line.contains("[") || line.contains("]");
            if (!hasBracket) {
                tokenCount += TurkishTokenizer.DEFAULT.tokenize(line).size();
                NerSentence tagged = ner.findNamedEntities(line);
                writer.println(tagged.getAsTrainingSentence(AnnotationStyle.BRACKET));
            }
        }
    }

    System.out.println("Elapsed = " + timer.elapsed(TimeUnit.MILLISECONDS));
    System.out.println("TokenCount = " + tokenCount);
}
Also used : Path(java.nio.file.Path) Stopwatch(com.google.common.base.Stopwatch) TurkishMorphology(zemberek.morphology.TurkishMorphology) PrintWriter(java.io.PrintWriter)

Example 65 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class NormalizationVocabularyGenerator method main.

/**
 * Entry point: builds a {@link NormalizationVocabularyGenerator} and creates vocabularies
 * from the corpora listed in {@code clean-list} under the corpus root, writing into the
 * output root (created if missing).
 *
 * <p>The worker thread count is half of the available processors, clamped to the range
 * {@code [1, 22]}.
 *
 * @param args not used
 * @throws Exception declared by the method; propagated from the calls it makes
 */
public static void main(String[] args) throws Exception {
    TurkishMorphology morphology = getTurkishMorphology();
    NormalizationVocabularyGenerator generator = new NormalizationVocabularyGenerator(morphology);
    Path corporaRoot = Paths.get("/home/aaa/data/normalization/corpus");
    Path outRoot = Paths.get("/home/aaa/data/normalization/vocab-clean");
    Path rootList = corporaRoot.resolve("clean-list");
    BlockTextLoader corpusProvider = BlockTextLoader.fromDirectoryRoot(corporaRoot, rootList, 30_000);
    Files.createDirectories(outRoot);
    // create vocabularies
    // availableProcessors() / 2 is 0 on a single-CPU machine; keep at least one worker so
    // a zero thread count is never passed on. The upper bound of 22 is unchanged.
    int halfOfCpus = Runtime.getRuntime().availableProcessors() / 2;
    int threadCount = Math.max(1, Math.min(22, halfOfCpus));
    generator.createVocabulary(corpusProvider, threadCount, outRoot);
}
Also used : Path(java.nio.file.Path) BlockTextLoader(zemberek.core.text.BlockTextLoader) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology): 87, Test (org.junit.Test): 38, Path (java.nio.file.Path): 34, ArrayList (java.util.ArrayList): 23, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 23, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 23, Ignore (org.junit.Ignore): 21, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 15, LinkedHashSet (java.util.LinkedHashSet): 13, PrintWriter (java.io.PrintWriter): 10, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 10, Stopwatch (com.google.common.base.Stopwatch): 8, Histogram (zemberek.core.collections.Histogram): 8, Token (zemberek.tokenization.Token): 8, HashSet (java.util.HashSet): 7, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 7, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 7, ScoredItem (zemberek.core.ScoredItem): 6, IOException (java.io.IOException): 5, BlockTextLoader (zemberek.core.text.BlockTextLoader): 5