Search in sources:

Example 1 with ScoredItem

use of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa.

the class CharacterGraphDecoderTest method stemEndingTest2.

/**
 * Decodes "yüzdüm" against a graph built from the lexicon entries "üzmek", "yüz", "güz" and the
 * single ending "düm", and expects exactly three suggestions.
 */
@Test
public void stemEndingTest2() {
    TurkishMorphology morphology = TurkishMorphology.builder()
            .setLexicon("üzmek", "yüz", "güz")
            .build();
    StemEndingGraph stemEndings = new StemEndingGraph(morphology, Lists.newArrayList("düm"));
    CharacterGraphDecoder decoder = new CharacterGraphDecoder(stemEndings.stemGraph);

    List<ScoredItem<String>> suggestions = decoder.getSuggestionsWithScores("yüzdüm");

    // The size check pins the result to the three expected candidates and nothing else.
    Assert.assertEquals(3, suggestions.size());
    assertContainsAll(suggestions, "yüzdüm", "üzdüm", "güzdüm");
}
Also used : ScoredItem(zemberek.core.ScoredItem) TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Example 2 with ScoredItem

use of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa.

the class CharacterGraphDecoderTest method stemEndingTest3.

/**
 * Decodes "olarak" against a graph built from the lexicon entries "o", "ol", "ola" and the
 * endings "arak" and "acak", and checks that the expected candidates are among the suggestions.
 */
@Test
public void stemEndingTest3() {
    TurkishMorphology morphology = TurkishMorphology.builder()
            .setLexicon("o", "ol", "ola")
            .build();
    StemEndingGraph stemEndings =
            new StemEndingGraph(morphology, Lists.newArrayList("arak", "acak"));
    CharacterGraphDecoder decoder = new CharacterGraphDecoder(stemEndings.stemGraph);

    List<ScoredItem<String>> suggestions = decoder.getSuggestionsWithScores("olarak");

    // No size assertion here: only the presence of these three candidates is checked.
    assertContainsAll(suggestions, "olarak", "olacak", "olaarak");
}
Also used : ScoredItem(zemberek.core.ScoredItem) TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Example 3 with ScoredItem

use of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa.

the class CharacterGraphDecoderTest method stemEndingTest.

/**
 * Decodes the lower-case input "türkiyede" against a graph built from the lexicon entries
 * "Türkiye" and "Bayram" with the endings "ında" and "de", and checks that the input itself is
 * among the suggestions.
 */
@Test
public void stemEndingTest() {
    TurkishMorphology morphology = TurkishMorphology.builder()
            .setLexicon("Türkiye", "Bayram")
            .build();
    StemEndingGraph stemEndings =
            new StemEndingGraph(morphology, Lists.newArrayList("ında", "de"));
    CharacterGraphDecoder decoder = new CharacterGraphDecoder(stemEndings.stemGraph);

    List<ScoredItem<String>> suggestions = decoder.getSuggestionsWithScores("türkiyede");

    assertContainsAll(suggestions, "türkiyede");
}
Also used : ScoredItem(zemberek.core.ScoredItem) TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Example 4 with ScoredItem

use of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa.

the class DocumentSimilarityExperiment method checkSimilarity.

/**
 * Builds one vector per document of the corpus and writes, for at most the first 100 vectorized
 * documents, the document followed by the entries returned by {@code nearestK(..., 5)}.
 *
 * @param model path of the fastText model to load
 * @param corpusFile path of the web corpus file to load documents from
 * @param outPath file the report is written to (opened with the "utf-8" charset name)
 * @throws IOException propagated from model/corpus loading or from opening the output file
 */
public void checkSimilarity(Path model, Path corpusFile, Path outPath) throws IOException {
    FastText fastText = FastText.load(model);
    List<WebDocument> docs = WebCorpus.loadDocuments(corpusFile);
    List<DocumentSimilarity> sims = new ArrayList<>();
    Log.info("Calculating document vectors.");
    for (WebDocument doc : docs) {
        // Content is replaced by the output of hack(...) before the length filter is applied.
        doc.setContent(hack(doc.getLines()));
        if (doc.contentLength() >= 500) {
            String content = doc.getContentAsString();
            // Only the first 200 characters (or the whole text if shorter) are vectorized.
            String head = content.substring(0, Math.min(content.length(), 200));
            float[] vec = fastText.sentenceVector(head).clone();
            // float[] vec = fastText.textVectors(doc.getLines()).data_.clone();
            sims.add(new DocumentSimilarity(doc, vec));
        }
    }
    try (PrintWriter pw = new PrintWriter(outPath.toFile(), "utf-8")) {
        // The report covers no more than the first 100 entries of sims.
        int limit = Math.min(sims.size(), 100);
        for (int idx = 0; idx < limit; idx++) {
            DocumentSimilarity sim = sims.get(idx);
            List<ScoredItem<WebDocument>> nearest = nearestK(sim, sims, 5);
            pw.println("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
            pw.println(String.join("\n", sim.document.getLines()));
            for (ScoredItem<WebDocument> neighbour : nearest) {
                pw.println("----------------------------------");
                pw.println(String.join("\n", neighbour.item.getLines()));
            }
        }
    }
}
Also used : ArrayList(java.util.ArrayList) ScoredItem(zemberek.core.ScoredItem) WebDocument(zemberek.corpus.WebDocument) FastText(zemberek.core.embeddings.FastText) PrintWriter(java.io.PrintWriter)

Example 5 with ScoredItem

use of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa.

the class CategoryPredictionExperiment method runExperiment.

/**
 * Runs the category prediction experiment end to end: extracts the category corpus, generates
 * the train/test/raw-title sets, loads an existing model from {@code category.model} or trains
 * and saves a new one, logs the evaluation result, and writes per-document predictions to
 * {@code category.predictions}.
 *
 * <p>Test lines that do not contain an id followed by a space are logged and skipped instead of
 * aborting the whole run with a {@link StringIndexOutOfBoundsException}.
 *
 * @throws Exception propagated from corpus extraction, training, model I/O or file access
 */
private void runExperiment() throws Exception {
    Path corpusPath = experimentRoot.resolve("category.corpus");
    Path train = experimentRoot.resolve("category.train");
    Path test = experimentRoot.resolve("category.test");
    Path titleRaw = experimentRoot.resolve("category.title");
    Path modelPath = experimentRoot.resolve("category.model");
    Path predictionPath = experimentRoot.resolve("category.predictions");
    extractCategoryDocuments(rawCorpusRoot, corpusPath);
    boolean useOnlyTitles = true;
    boolean useLemmas = true;
    generateSets(corpusPath, train, test, useOnlyTitles, useLemmas);
    generateRawSet(corpusPath, titleRaw);
    FastText fastText;
    if (modelPath.toFile().exists()) {
        // A model file from a previous run is reused rather than retrained.
        Log.info("Reusing existing model %s", modelPath);
        fastText = FastText.load(modelPath);
    } else {
        Args argz = Args.forSupervised();
        argz.thread = 4;
        argz.model = Args.model_name.supervised;
        argz.loss = Args.loss_name.softmax;
        argz.epoch = 50;
        argz.wordNgrams = 2;
        argz.minCount = 0;
        argz.lr = 0.5;
        argz.dim = 100;
        argz.bucket = 5_000_000;
        fastText = new FastTextTrainer(argz).train(train);
        fastText.saveModel(modelPath);
    }
    EvaluationResult result = fastText.test(test, 1);
    Log.info(result.toString());
    WebCorpus corpus = new WebCorpus("corpus", "labeled");
    corpus.addDocuments(WebCorpus.loadDocuments(corpusPath));
    Log.info("Testing started.");
    List<String> testLines = Files.readAllLines(test, StandardCharsets.UTF_8);
    try (PrintWriter pw =
            new PrintWriter(predictionPath.toFile(), StandardCharsets.UTF_8.name())) {
        for (String testLine : testLines) {
            // The id is the first space-delimited token with its leading character dropped.
            int idEnd = testLine.indexOf(' ');
            if (idEnd < 1) {
                // Blank line, no space at all, or a leading space: there is no id to extract.
                Log.info("Skipping malformed test line: %s", testLine);
                continue;
            }
            String id = testLine.substring(1, idEnd);
            // NOTE(review): assumes every id in the test set exists in the corpus; confirm
            // whether getDocument can return null for an unknown id.
            WebDocument doc = corpus.getDocument(id);
            List<ScoredItem<String>> res = fastText.predict(testLine, 3);
            List<String> predictedCategories = new ArrayList<>();
            for (ScoredItem<String> re : res) {
                // Predictions scoring below -10 are left out of the report.
                if (re.score < -10) {
                    continue;
                }
                // Literal replacement: strip the label prefix, then turn underscores into spaces.
                String category = re.item.replace("__label__", "").replace('_', ' ');
                predictedCategories.add(
                        String.format(Locale.ENGLISH, "%s (%.2f)", category, re.score));
            }
            pw.println("id = " + id);
            pw.println();
            pw.println(doc.getTitle());
            pw.println();
            pw.println("Actual Category = " + doc.getCategory());
            pw.println("Predictions   = " + String.join(", ", predictedCategories));
            pw.println();
            pw.println("------------------------------------------------------");
            pw.println();
        }
    }
    Log.info("Done.");
}
Also used : Path(java.nio.file.Path) Args(zemberek.core.embeddings.Args) ScoredItem(zemberek.core.ScoredItem) ArrayList(java.util.ArrayList) FastTextTrainer(zemberek.core.embeddings.FastTextTrainer) EvaluationResult(zemberek.core.embeddings.FastText.EvaluationResult) WebDocument(zemberek.corpus.WebDocument) WebCorpus(zemberek.corpus.WebCorpus) FastText(zemberek.core.embeddings.FastText) PrintWriter(java.io.PrintWriter)

Aggregations

ScoredItem (zemberek.core.ScoredItem) 15, ArrayList (java.util.ArrayList) 11, PrintWriter (java.io.PrintWriter) 6, TurkishMorphology (zemberek.morphology.TurkishMorphology) 6, Path (java.nio.file.Path) 5, IOException (java.io.IOException) 3, LinkedHashSet (java.util.LinkedHashSet) 3, List (java.util.List) 3, Set (java.util.Set) 3, Collectors (java.util.stream.Collectors) 3, Test (org.junit.Test) 3, FastTextClassifier (zemberek.classification.FastTextClassifier) 3, Log (zemberek.core.logging.Log) 3, TurkishAlphabet (zemberek.core.turkish.TurkishAlphabet) 3, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis) 3, WordAnalysis (zemberek.morphology.analysis.WordAnalysis) 3, Paths (java.nio.file.Paths) 2, Scanner (java.util.Scanner) 2, IntVector (zemberek.core.collections.IntVector) 2, FastText (zemberek.core.embeddings.FastText) 2