Usage of zemberek.corpus.WebDocument in the project zemberek-nlp by ahmetaa.
From the class DocumentSimilarityExperiment, method prepareCorpus:
/**
 * Builds a deduplicated web-news corpus from raw document files and saves it to {@code target}.
 * <p>
 * If {@code root} is a regular file it is processed alone; otherwise the whole tree under it is
 * walked. For every document the content is sentence-split and normalized, then documents that are
 * duplicates (by content hash) or too short (&lt; 50 chars) are dropped.
 *
 * @param root   a single corpus file, or a directory tree of corpus files
 * @param target path the resulting corpus is written to
 * @throws IOException if walking the tree, reading a corpus file, or saving fails
 */
public void prepareCorpus(Path root, Path target) throws IOException {
  Set<Long> hashes = new HashSet<>();
  List<Path> files = new ArrayList<>();
  if (root.toFile().isFile()) {
    files.add(root);
  } else {
    // Files.walk returns a lazily populated Stream holding open directory handles;
    // close it with try-with-resources to avoid a resource leak.
    try (java.util.stream.Stream<Path> walk = Files.walk(root)) {
      files.addAll(walk.filter(s -> s.toFile().isFile()).collect(Collectors.toList()));
    }
  }
  // Deterministic processing order regardless of file-system traversal order.
  files.sort(Comparator.comparing(Path::toString));
  WebCorpus corpus = new WebCorpus("web-news", "all");
  int duplicateCount = 0;
  TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
  for (Path file : files) {
    Log.info("Adding %s", file);
    List<WebDocument> docs = WebCorpus.loadDocuments(file);
    for (WebDocument doc : docs) {
      // Sentence-split the raw paragraphs, then normalize the resulting lines.
      doc.setContent(extractor.fromParagraphs(doc.getLines()));
      doc.setContent(normalizeLines(doc.getLines()));
      // Set.add returns false when the hash was already present: a duplicate document.
      if (!hashes.add(doc.getHash())) {
        duplicateCount++;
        continue;
      }
      // Skip documents with too little content to be useful.
      if (doc.contentLength() < 50) {
        continue;
      }
      corpus.addDocument(doc);
    }
    Log.info("Total doc count = %d Duplicate count= %d", corpus.documentCount(), duplicateCount);
  }
  Log.info("Total amount of files = %d", corpus.getDocuments().size());
  corpus.save(target, false);
}
Usage of zemberek.corpus.WebDocument in the project zemberek-nlp by ahmetaa.
From the class CategoryPredictionExperiment, method extractCategoryDocuments:
/**
 * Collects category-labeled documents from every corpus file under {@code root}, drops
 * duplicates, and saves the result to {@code categoryFile}.
 * <p>
 * Only documents that carry a non-empty category label and have more than 200 characters of
 * content are kept.
 *
 * @param root         directory tree containing raw corpus files
 * @param categoryFile path the labeled, deduplicated corpus is written to
 * @throws IOException if walking the tree, reading a corpus file, or saving fails
 */
private void extractCategoryDocuments(Path root, Path categoryFile) throws IOException {
  // Files.walk must be closed (try-with-resources) or the open directory handles leak.
  List<Path> files;
  try (java.util.stream.Stream<Path> walk = Files.walk(root)) {
    files = walk
        .filter(s -> s.toFile().isFile())
        .sorted(Comparator.comparing(Path::toString))
        .collect(Collectors.toList());
  }
  WebCorpus corpus = new WebCorpus("category", "category");
  for (Path file : files) {
    // Note: the earlier isDirectory() re-check was dead code — the list above
    // already contains only regular files.
    Log.info("Adding %s", file);
    List<WebDocument> doc = WebCorpus.loadDocuments(file);
    // Keep only documents that are labeled and long enough to train on.
    List<WebDocument> labeled = doc.stream()
        .filter(s -> s.getCategory().length() > 0 && s.getContentAsString().length() > 200)
        .collect(Collectors.toList());
    corpus.addDocuments(labeled);
  }
  Log.info("Total amount of files = %d", corpus.getDocuments().size());
  WebCorpus noDuplicates = corpus.copyNoDuplicates();
  Log.info("Corpus size = %d, After removing duplicates = %d",
      corpus.documentCount(), noDuplicates.documentCount());
  Log.info("Saving corpus to %s", categoryFile);
  noDuplicates.save(categoryFile, false);
}
Usage of zemberek.corpus.WebDocument in the project zemberek-nlp by ahmetaa.
From the class CategoryPredictionExperiment, method generateRawSet:
/**
 * Generates a fastText-style training set ({@code __label__category title} lines) from a labeled
 * web corpus and writes it to {@code train}.
 * <p>
 * Categories with fewer than 20 documents, several noisy categories, and documents without a
 * title are skipped. A few near-duplicate category names are normalized before counting toward
 * the output.
 *
 * @param input path of the labeled corpus to load
 * @param train path the raw training lines are written to (UTF-8)
 * @throws IOException if loading the corpus or writing the output fails
 */
private void generateRawSet(Path input, Path train) throws IOException {
  WebCorpus corpus = new WebCorpus("category", "category");
  Log.info("Loading corpus from %s", input);
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  // First pass: count documents per category so rare labels can be filtered out.
  Histogram<String> categoryCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    String category = document.getCategory();
    if (category.length() > 0) {
      categoryCounts.add(category);
    }
  }
  Log.info("All category count = %d", categoryCounts.size());
  // Drop labels with fewer than 20 documents — too sparse to learn from.
  categoryCounts.removeSmaller(20);
  for (String c : categoryCounts.getSortedList()) {
    System.out.println(c + " " + categoryCounts.getCount(c));
  }
  Log.info("Reduced label count = %d", categoryCounts.size());
  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  // Second pass: emit one "__label__category title" line per usable document.
  for (WebDocument document : corpus.getDocuments()) {
    if (document.getCategory().length() == 0) {
      continue;
    }
    if (document.getTitle().length() == 0) {
      continue;
    }
    String title = document.getTitle();
    String category = document.getCategory();
    // Skip categories known to be noisy or off-topic.
    if (category.contains("CNN") || category.contains("Güncel")
        || category.contains("Euro 2016") || category.contains("Yazarlar")
        || category.contains("Ajanda")) {
      continue;
    }
    // Normalize near-duplicate category names. (The original code repeated the
    // "İyilik Sağlık" check twice; the redundant copy is removed here.)
    if (category.equals("İyilik Sağlık")) {
      category = "Sağlık";
    }
    if (category.equals("Spor Diğer")) {
      category = "Spor";
    }
    if (categoryCounts.contains(category)) {
      // fastText label format: spaces become underscores, lowercased with Turkish rules.
      category = "__label__" + category.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
    } else {
      continue;
    }
    set.add(category + " " + title);
    if (c++ % 1000 == 0) {
      Log.info("%d of %d processed.", c, corpus.documentCount());
    }
  }
  Log.info("Generate raw set.");
  Files.write(train, set, StandardCharsets.UTF_8);
}
Usage of zemberek.corpus.WebDocument in the project zemberek-nlp by ahmetaa.
From the class WordHistogram, method getParagraphsFromCorpus:
/**
 * Loads a web corpus from {@code input} and returns the content of each document as one string,
 * skipping documents whose content hash was already seen.
 *
 * @param input path of the corpus file to load
 * @return deduplicated document contents, in corpus order
 * @throws IOException if the corpus cannot be read
 */
private static List<String> getParagraphsFromCorpus(Path input) throws IOException {
  WebCorpus corpus = new WebCorpus("a", "a");
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  Set<Long> seenHashes = new HashSet<>();
  List<String> result = new ArrayList<>(100000);
  for (WebDocument doc : corpus.getDocuments()) {
    // Set.add returns false if the hash was already present — a duplicate document.
    if (seenHashes.add(doc.getHash())) {
      result.add(doc.getContentAsString());
    }
  }
  return result;
}
Aggregations