Use of zemberek.corpus.WebCorpus in project zemberek-nlp by ahmetaa.
From class AutomaticLabelingExperiment, method generateSetForLabelExperiment:
Set<String> generateSetForLabelExperiment(Path input, TurkishSentenceAnalyzer analyzer, boolean useRoots)
    throws IOException {
  WebCorpus corpus = new WebCorpus("label", "labeled");
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  Log.info("Extracting data.");
  // Count label frequencies over the whole corpus.
  Histogram<String> labelCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    List<String> labels = document.getLabels();
    List<String> lowerCase = labels.stream()
        .filter(s -> s.length() > 1)
        .map(s -> s.toLowerCase(Turkish.LOCALE))
        .collect(Collectors.toList());
    labelCounts.add(lowerCase);
  }
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-all"), " ");
  Log.info("All label count = %d", labelCounts.size());
  // Drop rare labels (fewer than 15 occurrences).
  labelCounts.removeSmaller(15);
  Log.info("Reduced label count = %d", labelCounts.size());
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-reduced"), " ");
  Log.info("Extracting data from %d documents", corpus.documentCount());
  int c = 0;
  Set<Long> contentHash = new HashSet<>();
  for (WebDocument document : corpus.getDocuments()) {
    // Skip documents whose content hash was seen before (duplicates).
    Long hash = document.getHash();
    if (contentHash.contains(hash)) {
      continue;
    }
    contentHash.add(hash);
    // Collect the document's surviving labels as fastText-style "__label__" tags.
    List<String> labelTags = new ArrayList<>();
    boolean labelFound = false;
    for (String label : document.getLabels()) {
      if (labelCounts.contains(label)) {
        labelTags.add("__label__" + label.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE));
        labelFound = true;
      }
    }
    if (!labelFound) {
      continue;
    }
    String labelStr = String.join(" ", labelTags);
    String content = document.getContentAsString();
    String processed = processContent(analyzer, content, useRoots);
    // Skip documents whose processed content is too short.
    if (processed.length() < 200) {
      continue;
    }
    set.add("#" + document.getId() + " " + labelStr + " " + processed);
    if (c++ % 1000 == 0) {
      Log.info("%d processed.", c);
    }
  }
  Log.info("Generate train and test set.");
  // Deterministic shuffle so the train/test split is reproducible.
  Collections.shuffle(set, new Random(1));
  return new LinkedHashSet<>(set);
}
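A minimal calling sketch for the method above; the experiment instance, the analyzer, and the file paths are illustrative assumptions, not taken from the project:

// Hypothetical driver. Assumes an AutomaticLabelingExperiment instance
// ("experiment") and a ready TurkishSentenceAnalyzer ("analyzer") exist;
// both names and both paths are assumptions for illustration.
Path input = Paths.get("/data/labeled-documents");
Set<String> lines = experiment.generateSetForLabelExperiment(input, analyzer, false);
Files.write(Paths.get("/data/label.train"), lines, StandardCharsets.UTF_8);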
Use of zemberek.corpus.WebCorpus in project zemberek-nlp by ahmetaa.
From class CategoryPredictionExperiment, method extractCategoryDocuments:
private void extractCategoryDocuments(Path root, Path categoryFile) throws IOException {
  List<Path> files = Files.walk(root)
      .filter(s -> s.toFile().isFile())
      .collect(Collectors.toList());
  files.sort(Comparator.comparing(Path::toString));
  WebCorpus corpus = new WebCorpus("category", "category");
  for (Path file : files) {
    if (file.toFile().isDirectory()) {
      continue;
    }
    Log.info("Adding %s", file);
    List<WebDocument> doc = WebCorpus.loadDocuments(file);
    // Keep only documents that carry a category and have more than 200 characters of content.
    List<WebDocument> labeled = doc.stream()
        .filter(s -> s.getCategory().length() > 0 && s.getContentAsString().length() > 200)
        .collect(Collectors.toList());
    corpus.addDocuments(labeled);
  }
  Log.info("Total document count = %d", corpus.getDocuments().size());
  WebCorpus noDuplicates = corpus.copyNoDuplicates();
  Log.info("Corpus size = %d, After removing duplicates = %d",
      corpus.documentCount(), noDuplicates.documentCount());
  Log.info("Saving corpus to %s", categoryFile);
  noDuplicates.save(categoryFile, false);
}
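The same load, filter, deduplicate, save pipeline can be exercised in isolation with only the WebCorpus calls shown above; a sketch assuming hypothetical input and output paths:

// Standalone sketch; both paths are assumptions for illustration.
WebCorpus corpus = new WebCorpus("category", "category");
corpus.addDocuments(WebCorpus.loadDocuments(Paths.get("/data/raw-category")));
WebCorpus unique = corpus.copyNoDuplicates();             // drop content duplicates
unique.save(Paths.get("/data/category-corpus"), false);   // same save flag as above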
Use of zemberek.corpus.WebCorpus in project zemberek-nlp by ahmetaa.
From class DocumentSimilarityExperiment, method onlySentences:
public void onlySentences(Path input, Path output) throws IOException {
  WebCorpus corpus = new WebCorpus("web-news", "all");
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  Log.info("Corpus loaded. There are %d documents.", corpus.documentCount());
  corpus.save(output, true);
}
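A possible invocation, assuming DocumentSimilarityExperiment can be instantiated directly and using illustrative paths:

// Hypothetical usage; constructor availability and both paths are assumptions.
DocumentSimilarityExperiment experiment = new DocumentSimilarityExperiment();
experiment.onlySentences(Paths.get("/data/web-news"), Paths.get("/data/web-news-sentences"));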
Use of zemberek.corpus.WebCorpus in project zemberek-nlp by ahmetaa.
From class DocumentSimilarityExperiment, method prepareCorpus:
public void prepareCorpus(Path root, Path target) throws IOException {
  Set<Long> hashes = new HashSet<>();
  List<Path> files = new ArrayList<>();
  if (root.toFile().isFile()) {
    files.add(root);
  } else {
    files.addAll(Files.walk(root)
        .filter(s -> s.toFile().isFile())
        .collect(Collectors.toList()));
  }
  files.sort(Comparator.comparing(Path::toString));
  WebCorpus corpus = new WebCorpus("web-news", "all");
  int duplicateCount = 0;
  TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
  for (Path file : files) {
    Log.info("Adding %s", file);
    List<WebDocument> docs = WebCorpus.loadDocuments(file);
    for (WebDocument doc : docs) {
      // First split paragraphs into sentences, then normalize the resulting lines.
      doc.setContent(extractor.fromParagraphs(doc.getLines()));
      doc.setContent(normalizeLines(doc.getLines()));
      // Skip content duplicates and very short documents.
      if (hashes.contains(doc.getHash())) {
        duplicateCount++;
        continue;
      }
      if (doc.contentLength() < 50) {
        continue;
      }
      hashes.add(doc.getHash());
      corpus.addDocument(doc);
    }
    Log.info("Total doc count = %d Duplicate count = %d", corpus.documentCount(), duplicateCount);
  }
  Log.info("Total document count = %d", corpus.getDocuments().size());
  corpus.save(target, false);
}
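The HashSet of content hashes is what performs the deduplication here; a compact sketch of the same idiom on its own, built only from calls shown in the snippet and an assumed input path:

// Standalone dedup-and-filter sketch; the input path is an assumption.
Set<Long> seen = new HashSet<>();
List<WebDocument> unique = new ArrayList<>();
for (WebDocument doc : WebCorpus.loadDocuments(Paths.get("/data/raw-docs"))) {
  // Set.add returns false when the hash was already present, i.e. a duplicate.
  if (!seen.add(doc.getHash()) || doc.contentLength() < 50) {
    continue;
  }
  unique.add(doc);
}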