
Example 11 with WebDocument

Use of zemberek.corpus.WebDocument in the zemberek-nlp project by ahmetaa.

From the class DocumentSimilarityExperiment, method prepareCorpus:

public void prepareCorpus(Path root, Path target) throws IOException {
    Set<Long> hashes = new HashSet<>();
    List<Path> files = new ArrayList<>();
    if (root.toFile().isFile()) {
        files.add(root);
    } else {
        files.addAll(Files.walk(root).filter(s -> s.toFile().isFile()).collect(Collectors.toList()));
    }
    files.sort(Comparator.comparing(Path::toString));
    WebCorpus corpus = new WebCorpus("web-news", "all");
    int duplicateCount = 0;
    TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
    for (Path file : files) {
        Log.info("Adding %s", file);
        List<WebDocument> docs = WebCorpus.loadDocuments(file);
        for (WebDocument doc : docs) {
            doc.setContent(extractor.fromParagraphs(doc.getLines()));
            doc.setContent(normalizeLines(doc.getLines()));
            if (hashes.contains(doc.getHash())) {
                duplicateCount++;
                continue;
            }
            if (doc.contentLength() < 50) {
                continue;
            }
            hashes.add(doc.getHash());
            corpus.addDocument(doc);
        }
        Log.info("Total doc count = %d Duplicate count= %d", corpus.documentCount(), duplicateCount);
    }
    Log.info("Total amount of files = %d", corpus.getDocuments().size());
    corpus.save(target, false);
}
Also used : Path(java.nio.file.Path) WebDocument(zemberek.corpus.WebDocument) ArrayList(java.util.ArrayList) WebCorpus(zemberek.corpus.WebCorpus) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)
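
A minimal standalone sketch of the same hash-based de-duplication, assuming a single corpus file instead of a directory walk. It is not part of the project; the class name and paths are placeholders, and only WebCorpus and WebDocument calls that appear in the example above are used.

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import zemberek.corpus.WebCorpus;
import zemberek.corpus.WebDocument;

public class DedupSketch {

    public static void main(String[] args) throws IOException {
        // Hypothetical input and output paths.
        Path input = Paths.get("corpora/news.corpus");
        Path output = Paths.get("corpora/news-nodup.corpus");
        WebCorpus corpus = new WebCorpus("web-news", "all");
        Set<Long> hashes = new HashSet<>();
        List<WebDocument> docs = WebCorpus.loadDocuments(input);
        for (WebDocument doc : docs) {
            // Skip documents whose content hash was already seen, and very short ones.
            if (!hashes.add(doc.getHash()) || doc.contentLength() < 50) {
                continue;
            }
            corpus.addDocument(doc);
        }
        // Same save flag as in prepareCorpus above.
        corpus.save(output, false);
    }
}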

Example 12 with WebDocument

Use of zemberek.corpus.WebDocument in the zemberek-nlp project by ahmetaa.

From the class CategoryPredictionExperiment, method extractCategoryDocuments:

private void extractCategoryDocuments(Path root, Path categoryFile) throws IOException {
    List<Path> files = Files.walk(root).filter(s -> s.toFile().isFile()).sorted(Comparator.comparing(Path::toString)).collect(Collectors.toList());
    WebCorpus corpus = new WebCorpus("category", "category");
    for (Path file : files) {
        if (file.toFile().isDirectory()) {
            continue;
        }
        Log.info("Adding %s", file);
        List<WebDocument> doc = WebCorpus.loadDocuments(file);
        List<WebDocument> labeled = doc.stream().filter(s -> s.getCategory().length() > 0 && s.getContentAsString().length() > 200).collect(Collectors.toList());
        corpus.addDocuments(labeled);
    }
    Log.info("Total amount of files = %d", corpus.getDocuments().size());
    WebCorpus noDuplicates = corpus.copyNoDuplicates();
    Log.info("Corpus size = %d, After removing duplicates = %d", corpus.documentCount(), noDuplicates.documentCount());
    Log.info("Saving corpus to %s", categoryFile);
    noDuplicates.save(categoryFile, false);
}
Also used : Path(java.nio.file.Path) EvaluationResult(zemberek.core.embeddings.FastText.EvaluationResult) FastTextTrainer(zemberek.core.embeddings.FastTextTrainer) WebCorpus(zemberek.corpus.WebCorpus) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) WebDocument(zemberek.corpus.WebDocument) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) ArrayList(java.util.ArrayList) Turkish(zemberek.core.turkish.Turkish) Token(zemberek.tokenization.Token) Locale(java.util.Locale) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) FastText(zemberek.core.embeddings.FastText) PrintWriter(java.io.PrintWriter) Args(zemberek.core.embeddings.Args) AutomaticLabelingExperiment.saveSets(zemberek.embedding.fasttext.AutomaticLabelingExperiment.saveSets) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) List(java.util.List) Paths(java.nio.file.Paths) ScoredItem(zemberek.core.ScoredItem) Comparator(java.util.Comparator)
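
A condensed sketch of the same labeled-document filtering, again assuming a single input file; the class name and paths are placeholders, and only calls shown in the example above are used.

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import zemberek.corpus.WebCorpus;
import zemberek.corpus.WebDocument;

public class CategoryFilterSketch {

    public static void main(String[] args) throws IOException {
        // Hypothetical input and output paths.
        Path input = Paths.get("corpora/news.corpus");
        Path categoryFile = Paths.get("corpora/category.corpus");
        WebCorpus corpus = new WebCorpus("category", "category");
        // Keep only documents that carry a category label and enough content.
        List<WebDocument> labeled = WebCorpus.loadDocuments(input).stream()
                .filter(d -> d.getCategory().length() > 0 && d.getContentAsString().length() > 200)
                .collect(Collectors.toList());
        corpus.addDocuments(labeled);
        // Drop duplicates before saving, as extractCategoryDocuments does.
        WebCorpus noDuplicates = corpus.copyNoDuplicates();
        noDuplicates.save(categoryFile, false);
    }
}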

Example 13 with WebDocument

Use of zemberek.corpus.WebDocument in the zemberek-nlp project by ahmetaa.

From the class CategoryPredictionExperiment, method generateRawSet:

private void generateRawSet(Path input, Path train) throws IOException {
    WebCorpus corpus = new WebCorpus("category", "category");
    Log.info("Loading corpus from %s", input);
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    Histogram<String> categoryCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        if (category.length() > 0) {
            categoryCounts.add(category);
        }
    }
    Log.info("All category count = %d", categoryCounts.size());
    categoryCounts.removeSmaller(20);
    for (String c : categoryCounts.getSortedList()) {
        System.out.println(c + " " + categoryCounts.getCount(c));
    }
    Log.info("Reduced label count = %d", categoryCounts.size());
    Log.info("Extracting data from %d documents ", corpus.documentCount());
    int c = 0;
    for (WebDocument document : corpus.getDocuments()) {
        if (document.getCategory().length() == 0) {
            continue;
        }
        if (document.getTitle().length() == 0) {
            continue;
        }
        String title = document.getTitle();
        String category = document.getCategory();
        if (category.contains("CNN") || category.contains("Güncel") || category.contains("Euro 2016") || category.contains("Yazarlar") || category.contains("Ajanda")) {
            continue;
        }
        if (category.equals("İyilik Sağlık")) {
            category = "Sağlık";
        }
        if (category.equals("Spor Diğer")) {
            category = "Spor";
        }
        if (category.equals("İyilik Sağlık")) {
            category = "Sağlık";
        }
        if (categoryCounts.contains(category)) {
            category = "__label__" + category.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
        } else {
            continue;
        }
        set.add(category + " " + title);
        if (c++ % 1000 == 0) {
            Log.info("%d of %d processed.", c, corpus.documentCount());
        }
    }
    Log.info("Generate raw set.");
    Files.write(train, set, StandardCharsets.UTF_8);
}
Also used : Histogram(zemberek.core.collections.Histogram) WebDocument(zemberek.corpus.WebDocument) ArrayList(java.util.ArrayList) WebCorpus(zemberek.corpus.WebCorpus)
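
A small end-to-end sketch that writes the fastText-style "__label__<category> <title>" lines produced by generateRawSet. Paths are placeholders; the rare-category threshold of 20 and all calls are taken from the example above, while the site-specific category remapping is left out.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import zemberek.core.collections.Histogram;
import zemberek.core.turkish.Turkish;
import zemberek.corpus.WebCorpus;
import zemberek.corpus.WebDocument;

public class LabelSetSketch {

    public static void main(String[] args) throws IOException {
        // Hypothetical input corpus and output training file.
        Path input = Paths.get("corpora/category.corpus");
        Path train = Paths.get("sets/category.train");
        WebCorpus corpus = new WebCorpus("category", "category");
        corpus.addDocuments(WebCorpus.loadDocuments(input));

        // Count category labels and drop rare ones, as generateRawSet does.
        Histogram<String> categoryCounts = new Histogram<>();
        for (WebDocument document : corpus.getDocuments()) {
            if (document.getCategory().length() > 0) {
                categoryCounts.add(document.getCategory());
            }
        }
        categoryCounts.removeSmaller(20);

        List<String> set = new ArrayList<>();
        for (WebDocument document : corpus.getDocuments()) {
            String category = document.getCategory();
            String title = document.getTitle();
            if (category.length() == 0 || title.length() == 0 || !categoryCounts.contains(category)) {
                continue;
            }
            // fastText label prefix; spaces in the category become underscores.
            String label = "__label__" + category.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
            set.add(label + " " + title);
        }
        Files.write(train, set, StandardCharsets.UTF_8);
    }
}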

Example 14 with WebDocument

Use of zemberek.corpus.WebDocument in the zemberek-nlp project by ahmetaa.

From the class WordHistogram, method getParagraphsFromCorpus:

private static List<String> getParagraphsFromCorpus(Path input) throws IOException {
    WebCorpus corpus = new WebCorpus("a", "a");
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    Set<Long> contentHash = new HashSet<>();
    List<String> paragraphs = new ArrayList<>(100000);
    for (WebDocument document : corpus.getDocuments()) {
        Long hash = document.getHash();
        if (contentHash.contains(hash)) {
            continue;
        }
        contentHash.add(hash);
        paragraphs.add(document.getContentAsString());
    }
    return paragraphs;
}
Also used : WebDocument(zemberek.corpus.WebDocument) ArrayList(java.util.ArrayList) WebCorpus(zemberek.corpus.WebCorpus) HashSet(java.util.HashSet)
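
Given the class name WordHistogram, a natural next step is to count words over the returned paragraphs. The sketch below inlines the de-duplication from the example and uses a plain whitespace split as a crude stand-in for zemberek's tokenizer; the class name and input path are placeholders.

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import zemberek.core.collections.Histogram;
import zemberek.corpus.WebCorpus;
import zemberek.corpus.WebDocument;

public class WordCountSketch {

    public static void main(String[] args) throws IOException {
        // Hypothetical input path.
        Path input = Paths.get("corpora/news.corpus");
        WebCorpus corpus = new WebCorpus("a", "a");
        corpus.addDocuments(WebCorpus.loadDocuments(input));

        // De-duplicate by content hash, as getParagraphsFromCorpus does.
        Set<Long> contentHash = new HashSet<>();
        List<String> paragraphs = new ArrayList<>();
        for (WebDocument document : corpus.getDocuments()) {
            if (contentHash.add(document.getHash())) {
                paragraphs.add(document.getContentAsString());
            }
        }

        // Count words with a whitespace split; a real run would use zemberek's tokenizer.
        Histogram<String> words = new Histogram<>();
        for (String paragraph : paragraphs) {
            for (String word : paragraph.split("\\s+")) {
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        }
        // Print the 20 most frequent words.
        for (String w : words.getSortedList().subList(0, Math.min(20, words.size()))) {
            System.out.println(w + " " + words.getCount(w));
        }
    }
}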

Aggregations

WebDocument (zemberek.corpus.WebDocument): 14
ArrayList (java.util.ArrayList): 11
WebCorpus (zemberek.corpus.WebCorpus): 10
Histogram (zemberek.core.collections.Histogram): 9
PrintWriter (java.io.PrintWriter): 7
ScoredItem (zemberek.core.ScoredItem): 7
Path (java.nio.file.Path): 6
HashSet (java.util.HashSet): 5
LinkedHashSet (java.util.LinkedHashSet): 5
FastText (zemberek.core.embeddings.FastText): 5
SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 5
SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 5
Token (zemberek.tokenization.Token): 5
TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 5
Stopwatch (com.google.common.base.Stopwatch): 4
IOException (java.io.IOException): 4
StandardCharsets (java.nio.charset.StandardCharsets): 4
Files (java.nio.file.Files): 4
Paths (java.nio.file.Paths): 4
Comparator (java.util.Comparator): 4