Search in sources :

Example 6 with WebCorpus

Use of zemberek.corpus.WebCorpus in the project zemberek-nlp by ahmetaa.

The method generateSetForLabelExperiment of the class AutomaticLabelingExperiment.

/**
 * Builds a fastText-style labeled training set from a {@link WebCorpus} file.
 * <p>
 * Labels are lower-cased (Turkish locale), counted, and labels occurring fewer
 * than 15 times are discarded. For every unique document (by content hash) that
 * carries at least one surviving label and whose processed content is at least
 * 200 characters long, a line of the form
 * {@code #<id> __label__<l1> __label__<l2> ... <processed content>} is emitted.
 *
 * @param input    path of the corpus file to load documents from
 * @param analyzer sentence analyzer handed to {@code processContent}
 * @param useRoots whether {@code processContent} should emit word roots
 * @return shuffled (fixed seed 1) set of training lines
 * @throws IOException if the corpus cannot be read or label files cannot be written
 */
Set<String> generateSetForLabelExperiment(Path input, TurkishSentenceAnalyzer analyzer, boolean useRoots) throws IOException {
    WebCorpus corpus = new WebCorpus("label", "labeled");
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    Log.info("Extracting data.");
    Histogram<String> labelCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        List<String> labels = document.getLabels();
        // Histogram keys are lower-cased (Turkish locale) labels longer than one character.
        List<String> lowerCase = labels.stream()
                .filter(s -> s.length() > 1)
                .map(s -> s.toLowerCase(Turkish.LOCALE))
                .collect(Collectors.toList());
        labelCounts.add(lowerCase);
    }
    labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-all"), " ");
    Log.info("All label count = %d", labelCounts.size());
    // Drop rare labels so the classifier only sees reasonably frequent classes.
    labelCounts.removeSmaller(15);
    Log.info("Reduced label count = %d", labelCounts.size());
    labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-reduced"), " ");
    Log.info("Extracting data from %d documents ", corpus.documentCount());
    int c = 0;
    Set<Long> contentHash = new HashSet<>();
    for (WebDocument document : corpus.getDocuments()) {
        Long hash = document.getHash();
        // Set.add returns false for a duplicate; skips documents with identical content.
        if (!contentHash.add(hash)) {
            continue;
        }
        List<String> labelTags = new ArrayList<>();
        boolean labelFound = false;
        for (String label : document.getLabels()) {
            // BUG FIX: the histogram was built from lower-cased labels, so the lookup
            // must also use the lower-cased form. The original checked the raw label,
            // silently dropping every label containing an upper-case character.
            String lower = label.toLowerCase(Turkish.LOCALE);
            if (labelCounts.contains(lower)) {
                // Spaces inside a label become underscores for the fastText __label__ format.
                labelTags.add("__label__" + lower.replaceAll("[ ]+", "_"));
                labelFound = true;
            }
        }
        if (!labelFound) {
            continue;
        }
        String labelStr = String.join(" ", labelTags);
        String content = document.getContentAsString();
        String processed = processContent(analyzer, content, useRoots);
        // Very short processed documents carry too little signal; skip them.
        if (processed.length() < 200) {
            continue;
        }
        set.add("#" + document.getId() + " " + labelStr + " " + processed);
        if (c++ % 1000 == 0) {
            Log.info("%d processed.", c);
        }
    }
    Log.info("Generate train and test set.");
    // Fixed seed keeps the shuffle (and thus the train/test split) reproducible.
    Collections.shuffle(set, new Random(1));
    // LinkedHashSet preserves the shuffled order while removing exact duplicates.
    return new LinkedHashSet<>(set);
}
Also used : TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) Stopwatch(com.google.common.base.Stopwatch) WebCorpus(zemberek.corpus.WebCorpus) Token(org.antlr.v4.runtime.Token) Random(java.util.Random) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) WebDocument(zemberek.corpus.WebDocument) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) Set(java.util.Set) TurkishLexer(zemberek.tokenization.antlr.TurkishLexer) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Turkish(zemberek.morphology.structure.Turkish) Paths(java.nio.file.Paths) ScoredItem(zemberek.core.ScoredItem) Comparator(java.util.Comparator) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) ArrayList(java.util.ArrayList) WebDocument(zemberek.corpus.WebDocument) Random(java.util.Random) WebCorpus(zemberek.corpus.WebCorpus) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 7 with WebCorpus

Use of zemberek.corpus.WebCorpus in the project zemberek-nlp by ahmetaa.

The method extractCategoryDocuments of the class CategoryPredictionExperiment.

/**
 * Collects all category-labeled documents under {@code root} into a single corpus,
 * removes duplicates, and saves the result to {@code categoryFile}.
 * <p>
 * Only documents with a non-empty category and more than 200 characters of
 * content are kept.
 *
 * @param root         directory tree (or single file) containing corpus files
 * @param categoryFile output path for the de-duplicated corpus
 * @throws IOException if the tree cannot be walked or files cannot be read/written
 */
private void extractCategoryDocuments(Path root, Path categoryFile) throws IOException {
    // Files.walk returns a lazily-populated Stream backed by open directory
    // handles; it must be closed. The original leaked the stream.
    List<Path> files;
    try (Stream<Path> walk = Files.walk(root)) {
        files = walk.filter(s -> s.toFile().isFile()).collect(Collectors.toList());
    }
    // Deterministic processing order regardless of file-system enumeration order.
    files.sort(Comparator.comparing(Path::toString));
    WebCorpus corpus = new WebCorpus("category", "category");
    for (Path file : files) {
        // NOTE: the original also re-checked isDirectory() here; that check was
        // dead code because the walk above already kept regular files only.
        Log.info("Adding %s", file);
        List<WebDocument> doc = WebCorpus.loadDocuments(file);
        List<WebDocument> labeled = doc.stream()
                .filter(s -> s.getCategory().length() > 0 && s.getContentAsString().length() > 200)
                .collect(Collectors.toList());
        corpus.addDocuments(labeled);
    }
    Log.info("Total amount of files = %d", corpus.getDocuments().size());
    WebCorpus noDuplicates = corpus.copyNoDuplicates();
    Log.info("Corpus size = %d, After removing duplicates = %d", corpus.documentCount(), noDuplicates.documentCount());
    Log.info("Saving corpus to %s", categoryFile);
    noDuplicates.save(categoryFile, false);
}
Also used : Path(java.nio.file.Path) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) WebCorpus(zemberek.corpus.WebCorpus) Token(org.antlr.v4.runtime.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) WebDocument(zemberek.corpus.WebDocument) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) PrintWriter(java.io.PrintWriter) AutomaticLabelingExperiment.saveSets(zemberek.embedding.fasttext.AutomaticLabelingExperiment.saveSets) Files(java.nio.file.Files) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishLexer(zemberek.tokenization.antlr.TurkishLexer) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) List(java.util.List) Turkish(zemberek.morphology.structure.Turkish) Paths(java.nio.file.Paths) ScoredItem(zemberek.core.ScoredItem) Comparator(java.util.Comparator) WebDocument(zemberek.corpus.WebDocument) WebCorpus(zemberek.corpus.WebCorpus)

Example 8 with WebCorpus

Use of zemberek.corpus.WebCorpus in the project zemberek-nlp by ahmetaa.

The method onlySentences of the class DocumentSimilarityExperiment.

/**
 * Loads every document from {@code input} into a "web-news" corpus and saves
 * it to {@code output} (passing {@code true} to {@code WebCorpus.save}).
 *
 * @param input  corpus file to load
 * @param output destination path for the saved corpus
 * @throws IOException if reading or writing fails
 */
public void onlySentences(Path input, Path output) throws IOException {
    WebCorpus webNews = new WebCorpus("web-news", "all");
    List<WebDocument> loaded = WebCorpus.loadDocuments(input);
    webNews.addDocuments(loaded);
    Log.info("Corpus loaded. There are %d documents.", webNews.documentCount());
    webNews.save(output, true);
}
Also used : WebCorpus(zemberek.corpus.WebCorpus)

Example 9 with WebCorpus

Use of zemberek.corpus.WebCorpus in the project zemberek-nlp by ahmetaa.

The method prepareCorpus of the class DocumentSimilarityExperiment.

/**
 * Builds a de-duplicated "web-news" corpus from {@code root} (a single corpus
 * file or a directory tree of them) and saves it to {@code target}.
 * <p>
 * Each document's content is re-extracted into sentences and normalized;
 * duplicates (by content hash) and documents shorter than 50 characters are
 * skipped.
 *
 * @param root   corpus file or directory tree of corpus files
 * @param target output path for the prepared corpus
 * @throws IOException if the tree cannot be walked or files cannot be read/written
 */
public void prepareCorpus(Path root, Path target) throws IOException {
    Set<Long> hashes = new HashSet<>();
    List<Path> files = new ArrayList<>();
    if (root.toFile().isFile()) {
        files.add(root);
    } else {
        // Files.walk must be closed — it holds open directory handles.
        // The original leaked the stream.
        try (Stream<Path> walk = Files.walk(root)) {
            files.addAll(walk.filter(s -> s.toFile().isFile()).collect(Collectors.toList()));
        }
    }
    // Deterministic processing order regardless of file-system enumeration order.
    files.sort(Comparator.comparing(Path::toString));
    WebCorpus corpus = new WebCorpus("web-news", "all");
    int duplicateCount = 0;
    TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
    for (Path file : files) {
        Log.info("Adding %s", file);
        List<WebDocument> docs = WebCorpus.loadDocuments(file);
        for (WebDocument doc : docs) {
            // Re-segment paragraphs into sentences, then normalize the lines.
            doc.setContent(extractor.fromParagraphs(doc.getLines()));
            doc.setContent(normalizeLines(doc.getLines()));
            if (hashes.contains(doc.getHash())) {
                duplicateCount++;
                continue;
            }
            // Too-short documents are skipped; note their hash is deliberately
            // NOT recorded, matching the original check order.
            if (doc.contentLength() < 50) {
                continue;
            }
            hashes.add(doc.getHash());
            corpus.addDocument(doc);
        }
        Log.info("Total doc count = %d Duplicate count= %d", corpus.documentCount(), duplicateCount);
    }
    Log.info("Total amount of files = %d", corpus.getDocuments().size());
    corpus.save(target, false);
}
Also used : Path(java.nio.file.Path) WebDocument(zemberek.corpus.WebDocument) ArrayList(java.util.ArrayList) WebCorpus(zemberek.corpus.WebCorpus) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Aggregations

WebCorpus (zemberek.corpus.WebCorpus)9 ArrayList (java.util.ArrayList)8 WebDocument (zemberek.corpus.WebDocument)8 PrintWriter (java.io.PrintWriter)5 Path (java.nio.file.Path)5 ScoredItem (zemberek.core.ScoredItem)5 HashSet (java.util.HashSet)4 LinkedHashSet (java.util.LinkedHashSet)4 Token (org.antlr.v4.runtime.Token)4 Histogram (zemberek.core.collections.Histogram)4 Z3MarkovModelDisambiguator (zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator)4 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)4 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)4 TurkishMorphology (zemberek.morphology.analysis.tr.TurkishMorphology)4 TurkishSentenceAnalyzer (zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer)4 TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)4 Stopwatch (com.google.common.base.Stopwatch)3 IOException (java.io.IOException)3 StandardCharsets (java.nio.charset.StandardCharsets)3 Files (java.nio.file.Files)3