Use of zemberek.corpus.WebCorpus in project zemberek-nlp by ahmetaa.
Class AutomaticLabelingExperiment, method extractLabeledDocuments:
private void extractLabeledDocuments(Path root, Path labeledFile) throws IOException {
  List<Path> files = Files.walk(root)
      .filter(s -> s.toFile().isFile())
      .collect(Collectors.toList());
  files.sort(Comparator.comparing(Path::toString));
  WebCorpus corpus = new WebCorpus("label", "label");
  for (Path file : files) {
    if (file.toFile().isDirectory()) {
      continue;
    }
    Log.info("Adding %s", file);
    List<WebDocument> doc = WebCorpus.loadDocuments(file);
    List<WebDocument> labeled = doc.stream()
        .filter(s -> s.getLabels().size() > 0 && s.getContentAsString().length() > 200)
        .collect(Collectors.toList());
    corpus.addDocuments(labeled);
  }
Log.info("Total amount of files = %d", corpus.getDocuments().size());
  WebCorpus noDuplicates = corpus.copyNoDuplicates();
  Log.info("Corpus size = %d, After removing duplicates = %d",
      corpus.documentCount(), noDuplicates.documentCount());
  Log.info("Saving corpus to %s", labeledFile);
  noDuplicates.save(labeledFile, false);
}
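For reference, a minimal sketch (with a hypothetical file location) of loading the corpus that extractLabeledDocuments writes and inspecting the documents it kept; it only uses WebCorpus calls that already appear above:

  Path labeledFile = Paths.get("labeled.corpus"); // hypothetical path to the saved corpus
  WebCorpus labeledCorpus = new WebCorpus("label", "label");
  labeledCorpus.addDocuments(WebCorpus.loadDocuments(labeledFile));
  for (WebDocument d : labeledCorpus.getDocuments()) {
    // every kept document has at least one label and more than 200 characters of content
    Log.info("%d labels, %d chars", d.getLabels().size(), d.getContentAsString().length());
  }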
Use of zemberek.corpus.WebCorpus in project zemberek-nlp by ahmetaa.
Class CategoryPredictionExperiment, method runExperiment:
private void runExperiment() throws Exception {
  Path corpusPath = experimentRoot.resolve("category.corpus");
  Path train = experimentRoot.resolve("category.train");
  Path test = experimentRoot.resolve("category.test");
  Path modelPath = experimentRoot.resolve("category.model");
  Path predictionPath = experimentRoot.resolve("category.predictions");
  extractCategoryDocuments(rawCorpusRoot, corpusPath);
  boolean useOnlyTitles = true;
  boolean useLemmas = true;
  generateSets(corpusPath, train, test, useOnlyTitles, useLemmas);
  FastText fastText;
  if (modelPath.toFile().exists()) {
    Log.info("Reusing existing model %s", modelPath);
    fastText = FastText.load(modelPath);
  } else {
    Args argz = Args.forSupervised();
    argz.thread = 4;
    argz.model = Args.model_name.sup;
    argz.loss = Args.loss_name.softmax;
    argz.threadSafe = false;
    argz.epoch = 50;
    argz.wordNgrams = 2;
    argz.minCount = 0;
    argz.lr = 0.2;
    argz.dim = 100;
    argz.bucket = 5_000_000;
    fastText = FastText.train(train, argz);
    fastText.saveModel(modelPath);
  }
  fastText.test(test, 1);
  WebCorpus corpus = new WebCorpus("corpus", "labeled");
  corpus.addDocuments(WebCorpus.loadDocuments(corpusPath));
  Log.info("Testing started.");
  List<String> testLines = Files.readAllLines(test, StandardCharsets.UTF_8);
  try (PrintWriter pw = new PrintWriter(predictionPath.toFile(), "utf-8")) {
    for (String testLine : testLines) {
      String id = testLine.substring(0, testLine.indexOf(' ')).substring(1);
      WebDocument doc = corpus.getDocument(id);
      List<ScoredItem<String>> res = fastText.predict(testLine, 3);
      List<String> predictedCategories = new ArrayList<>();
      for (ScoredItem<String> re : res) {
        if (re.score < -10) {
          continue;
        }
        predictedCategories.add(String.format("%s (%.2f)",
            re.item.replaceAll("__label__", "").replaceAll("_", " "), re.score));
      }
      pw.println("id = " + id);
      pw.println();
      pw.println(doc.getTitle());
      pw.println();
      pw.println("Actual Category = " + doc.getCategory());
      pw.println("Predictions = " + String.join(", ", predictedCategories));
      pw.println();
      pw.println("------------------------------------------------------");
      pw.println();
    }
  }
  Log.info("Done.");
}
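For quick reference, a condensed sketch of the train-or-reuse decision and a single prediction, reusing the modelPath and train paths from the method above; the parameter values here are illustrative, not a recommendation:

  Args argz = Args.forSupervised();
  argz.epoch = 50;     // illustrative value
  argz.lr = 0.2;       // illustrative value
  argz.wordNgrams = 2; // illustrative value
  FastText ft = modelPath.toFile().exists()
      ? FastText.load(modelPath)
      : FastText.train(train, argz);
  List<ScoredItem<String>> top3 = ft.predict("dolar kur yükseliş", 3); // illustrative input text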
Use of zemberek.corpus.WebCorpus in project zemberek-nlp by ahmetaa.
Class CategoryPredictionExperiment, method generateSets:
private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useRoots)
    throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishSentenceAnalyzer analyzer =
      new TurkishSentenceAnalyzer(morphology, new Z3MarkovModelDisambiguator());
  WebCorpus corpus = new WebCorpus("category", "category");
  Log.info("Loading corpus from %s", input);
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
  Histogram<String> categoryCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    String category = document.getCategory();
    if (category.length() > 0) {
      categoryCounts.add(category);
    }
  }
  Log.info("All category count = %d", categoryCounts.size());
  categoryCounts.removeSmaller(20);
  Log.info("Reduced label count = %d", categoryCounts.size());
  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  for (WebDocument document : corpus.getDocuments()) {
    if (document.getCategory().length() == 0) {
      continue;
    }
    if (useOnlyTitle && document.getTitle().length() == 0) {
      continue;
    }
    String content = document.getContentAsString();
    String title = document.getTitle();
    List<Token> docTokens = useOnlyTitle ? lexer.tokenize(title) : lexer.tokenize(content);
    List<String> reduced = new ArrayList<>(docTokens.size());
    String category = document.getCategory();
    if (categoryCounts.contains(category)) {
      category = "__label__" +
          document.getCategory().replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
    } else {
      continue;
    }
    for (Token token : docTokens) {
      if (token.getType() == TurkishLexer.PercentNumeral
          || token.getType() == TurkishLexer.Number
          || token.getType() == TurkishLexer.Punctuation
          || token.getType() == TurkishLexer.RomanNumeral
          || token.getType() == TurkishLexer.Time
          || token.getType() == TurkishLexer.UnknownWord
          || token.getType() == TurkishLexer.Unknown) {
        continue;
      }
      String tokenStr = token.getText();
      reduced.add(tokenStr);
    }
    String join = String.join(" ", reduced);
    if (useRoots) {
      SentenceAnalysis analysis = analyzer.analyze(join);
      analyzer.disambiguate(analysis);
      List<String> res = new ArrayList<>();
      for (SentenceAnalysis.Entry e : analysis) {
        WordAnalysis best = e.parses.get(0);
        if (best.isUnknown()) {
          res.add(e.input);
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.size() == 0) {
          continue;
        }
        res.add(lemmas.get(lemmas.size() - 1));
      }
      join = String.join(" ", res);
    }
    set.add("#" + document.getId() + " " + category + " "
        + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
    if (c++ % 1000 == 0) {
      Log.info("%d of %d processed.", c, corpus.documentCount());
    }
  }
  Log.info("Generate train and test set.");
  saveSets(train, test, new LinkedHashSet<>(set));
}
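Each entry added to the set above therefore has the shape "#<documentId> __label__<category> <cleaned, lowercased text>". An illustrative line (the id, category, and text are made up) could look like:

  #doc-42 __label__dünya başbakan berlin ziyaret et

The saveSets call (not shown here) receives these lines together with the train and test paths.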
Use of zemberek.corpus.WebCorpus in project zemberek-nlp by ahmetaa.
Class WordHistogram, method getParagraphsFromCorpus:
private static List<String> getParagraphsFromCorpus(Path input) throws IOException {
  WebCorpus corpus = new WebCorpus("a", "a");
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  Set<Long> contentHash = new HashSet<>();
  List<String> paragraphs = new ArrayList<>(100000);
  for (WebDocument document : corpus.getDocuments()) {
    Long hash = document.getHash();
    if (contentHash.contains(hash)) {
      continue;
    }
    contentHash.add(hash);
    paragraphs.add(document.getContentAsString());
  }
  return paragraphs;
}
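A minimal sketch (with a hypothetical input path) of feeding the de-duplicated paragraphs into a word Histogram, using only the tokenizer and Histogram calls seen in the snippets above:

  List<String> paragraphs = getParagraphsFromCorpus(Paths.get("some.corpus")); // hypothetical path
  Histogram<String> wordCounts = new Histogram<>();
  for (String paragraph : paragraphs) {
    for (Token token : TurkishTokenizer.DEFAULT.tokenize(paragraph)) {
      wordCounts.add(token.getText());
    }
  }
  wordCounts.removeSmaller(5); // drop rare words; the threshold of 5 is illustrative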
Use of zemberek.corpus.WebCorpus in project zemberek-nlp by ahmetaa.
Class AutomaticLabelingExperiment, method test:
private void test(Path corpusPath, Path testData, Path predictionPath, FastText fastText)
    throws IOException {
  WebCorpus corpus = new WebCorpus("label", "label");
  corpus.addDocuments(WebCorpus.loadDocuments(corpusPath));
  Log.info("Testing started.");
  List<String> testLines = Files.readAllLines(testData, StandardCharsets.UTF_8);
  Stopwatch sw = Stopwatch.createStarted();
  try (PrintWriter pw = new PrintWriter(predictionPath.toFile(), "utf-8")) {
    for (String testLine : testLines) {
      String id = testLine.substring(0, testLine.indexOf(' ')).substring(1);
      WebDocument doc = corpus.getDocument(id);
      List<ScoredItem<String>> res = fastText.predict(testLine, 7);
      List<String> predictedLabels = new ArrayList<>();
      for (ScoredItem<String> re : res) {
        predictedLabels.add(String.format("%s (%.2f)",
            re.item.replaceAll("__label__", "").replaceAll("_", " "), re.score));
      }
      pw.println("id = " + id);
      pw.println();
      pw.println(doc.getContentAsString().replaceAll("[\n\r]+", "\n"));
      pw.println();
      pw.println("Actual Labels = " + String.join(", ", doc.getLabels()));
      pw.println("Predictions = " + String.join(", ", predictedLabels));
      pw.println();
      pw.println("------------------------------------------------------");
      pw.println();
    }
  }
Log.info("Done. in %d ms.", sw.elapsed(TimeUnit.MILLISECONDS));
}
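The prediction strings written above strip fastText's label prefix before printing; a small sketch of that formatting step on a single ScoredItem (the label and score in the comment are illustrative):

  ScoredItem<String> re = fastText.predict(testLine, 1).get(0);
  String pretty = String.format("%s (%.2f)",
      re.item.replaceAll("__label__", "").replaceAll("_", " "), re.score);
  // e.g. "__label__dis_politika" would be printed as "dis politika (-0.43)"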