Usage example of zemberek.tokenization.TurkishSentenceExtractor from the project zemberek-nlp by ahmetaa:
the prepareCorpus method of the class DocumentSimilarityExperiment.
/**
 * Builds a deduplicated web-news corpus: loads every corpus file under {@code root},
 * splits raw paragraphs into sentences, normalizes them, drops duplicate and
 * too-short documents, and saves the resulting corpus to {@code target}.
 *
 * @param root a single corpus file, or a directory walked recursively for corpus files.
 * @param target path the resulting corpus is written to.
 * @throws IOException if walking {@code root}, loading documents, or saving fails.
 */
public void prepareCorpus(Path root, Path target) throws IOException {
  Set<Long> hashes = new HashSet<>();
  List<Path> files = new ArrayList<>();
  if (root.toFile().isFile()) {
    files.add(root);
  } else {
    // Files.walk returns a lazily populated Stream backed by open directory
    // handles; it must be closed, so wrap it in try-with-resources to avoid
    // a resource leak (the original leaked the stream).
    try (java.util.stream.Stream<Path> walk = Files.walk(root)) {
      files.addAll(walk.filter(s -> s.toFile().isFile()).collect(Collectors.toList()));
    }
  }
  // Sort by path string for a deterministic processing order regardless of
  // file-system traversal order.
  files.sort(Comparator.comparing(Path::toString));
  WebCorpus corpus = new WebCorpus("web-news", "all");
  int duplicateCount = 0;
  TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
  for (Path file : files) {
    Log.info("Adding %s", file);
    List<WebDocument> docs = WebCorpus.loadDocuments(file);
    for (WebDocument doc : docs) {
      // First pass: paragraph text -> sentence lines; second pass: normalize
      // those sentence lines in place as the document content.
      doc.setContent(extractor.fromParagraphs(doc.getLines()));
      doc.setContent(normalizeLines(doc.getLines()));
      // Skip documents whose content hash was already seen (duplicates).
      if (hashes.contains(doc.getHash())) {
        duplicateCount++;
        continue;
      }
      // Skip documents too short to be useful (content length threshold 50).
      if (doc.contentLength() < 50) {
        continue;
      }
      hashes.add(doc.getHash());
      corpus.addDocument(doc);
    }
    Log.info("Total doc count = %d Duplicate count= %d", corpus.documentCount(), duplicateCount);
  }
  Log.info("Total amount of files = %d", corpus.getDocuments().size());
  corpus.save(target, false);
}
Aggregations