Use of zemberek.corpus.WebDocument in project zemberek-nlp by ahmetaa.
Class UnsupervisedKeyPhraseExtractor, method collectCorpusStatisticsForLemmas.
static CorpusStatistics collectCorpusStatisticsForLemmas(
    WebCorpus corpus, TurkishSentenceAnalyzer analyzer, int count) throws IOException {
  CorpusStatistics statistics = new CorpusStatistics(1_000_000);
  int docCount = 0;
  for (WebDocument document : corpus.getDocuments()) {
    // Per-document term frequencies.
    Histogram<String> docHistogram = new Histogram<>();
    List<String> sentences = extractor.fromParagraphs(document.getLines());
    for (String sentence : sentences) {
      List<WordAnalysis> analysis = analyzer.bestParse(sentence);
      for (WordAnalysis w : analysis) {
        if (!analysisAcceptable(w)) {
          continue;
        }
        String s = w.getSurfaceForm();
        if (TurkishStopWords.DEFAULT.contains(s)) {
          continue;
        }
        // Count the last lemma of the best analysis.
        List<String> lemmas = w.getLemmas();
        docHistogram.add(lemmas.get(lemmas.size() - 1));
      }
    }
    statistics.termFrequencies.add(docHistogram);
    // Each distinct term in the document contributes once to document frequency.
    for (String s : docHistogram) {
      statistics.documentFrequencies.add(s);
    }
    if (docCount++ % 500 == 0) {
      Log.info("Doc count = %d", docCount);
    }
    // A positive count caps how many documents are processed.
    if (count > 0 && docCount > count) {
      break;
    }
  }
  statistics.documentCount =
      count > 0 ? Math.min(count, corpus.documentCount()) : corpus.documentCount();
  return statistics;
}
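The term and document frequencies collected above are the ingredients of TF-IDF style scoring. A minimal, hypothetical IDF helper built on the returned object, assuming the documentFrequencies histogram and documentCount field are accessed exactly as above and that zemberek's Histogram offers a getCount lookup (the helper itself is not part of zemberek):

  // Hypothetical helper; assumes Histogram.getCount(T) returns the stored count.
  static double inverseDocumentFrequency(CorpusStatistics stats, String lemma) {
    int df = stats.documentFrequencies.getCount(lemma);
    // Add 1 to the denominator so lemmas never seen in any document do not divide by zero.
    return Math.log((double) stats.documentCount / (1 + df));
  }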
Use of zemberek.corpus.WebDocument in project zemberek-nlp by ahmetaa.
Class UnsupervisedKeyPhraseExtractor, method collectCorpusStatistics.
static CorpusStatistics collectCorpusStatistics(WebCorpus corpus) throws IOException {
  CorpusStatistics statistics = new CorpusStatistics(1_000_000);
  for (WebDocument document : corpus.getDocuments()) {
    Histogram<String> docHistogram = new Histogram<>();
    List<String> sentences = extractor.fromParagraphs(document.getLines());
    for (String sentence : sentences) {
      List<Token> tokens = lexer.tokenize(sentence);
      for (Token token : tokens) {
        if (!tokenTypeAccpetable(token)) {
          continue;
        }
        // This variant counts normalized surface forms; no morphological analysis.
        String s = normalize(token.getText());
        if (TurkishStopWords.DEFAULT.contains(s)) {
          continue;
        }
        docHistogram.add(s);
      }
    }
    statistics.termFrequencies.add(docHistogram);
    for (String s : docHistogram) {
      statistics.documentFrequencies.add(s);
    }
  }
  statistics.documentCount = corpus.documentCount();
  return statistics;
}
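The normalize(...) helper used above is not shown in this snippet. A plausible minimal stand-in, assuming all it needs to do is Turkish-locale lower-casing (the real project method may do more):

  // Hypothetical stand-in for the normalize(...) helper above.
  // Lower-cases with the Turkish locale so dotted/dotless i is handled correctly.
  static String normalize(String input) {
    return input.toLowerCase(new Locale("tr"));
  }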
Use of zemberek.corpus.WebDocument in project zemberek-nlp by ahmetaa.
Class UnsupervisedKeyPhraseExtractor, method collectCorpusStatisticsForLemmas (TurkishMorphology overload).
static CorpusStatistics collectCorpusStatisticsForLemmas(
    WebCorpus corpus, TurkishMorphology analyzer, int count) throws IOException {
  CorpusStatistics statistics = new CorpusStatistics(1_000_000);
  int docCount = 0;
  for (WebDocument document : corpus.getDocuments()) {
    Histogram<String> docHistogram = new Histogram<>();
    List<String> sentences = extractor.fromParagraphs(document.getLines());
    for (String sentence : sentences) {
      // Disambiguate the sentence, then keep only the best analysis of each token.
      List<SingleAnalysis> analysis = analyzer.analyzeAndDisambiguate(sentence).bestAnalysis();
      for (SingleAnalysis w : analysis) {
        if (!analysisAcceptable(w)) {
          continue;
        }
        String s = w.getStemAndEnding().concat();
        if (TurkishStopWords.DEFAULT.contains(s)) {
          continue;
        }
        // Count the last lemma of the best analysis.
        List<String> lemmas = w.getLemmas();
        docHistogram.add(lemmas.get(lemmas.size() - 1));
      }
    }
    statistics.termFrequencies.add(docHistogram);
    for (String s : docHistogram) {
      statistics.documentFrequencies.add(s);
    }
    if (docCount++ % 500 == 0) {
      Log.info("Doc count = %d", docCount);
    }
    if (count > 0 && docCount > count) {
      break;
    }
  }
  statistics.documentCount =
      count > 0 ? Math.min(count, corpus.documentCount()) : corpus.documentCount();
  return statistics;
}
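A hypothetical call site for this newer TurkishMorphology-based overload. The corpus file path is an assumption; createWithDefaults, loadDocuments, and addDocuments are used as elsewhere on this page:

  // Hypothetical driver; the corpus file path is an assumption.
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  WebCorpus corpus = new WebCorpus("web", "web");
  corpus.addDocuments(WebCorpus.loadDocuments(Paths.get("corpus/sample.corpus")));
  // count = 0 processes every document; a positive value caps the document count.
  CorpusStatistics stats = collectCorpusStatisticsForLemmas(corpus, morphology, 0);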
Use of zemberek.corpus.WebDocument in project zemberek-nlp by ahmetaa.
Class AutomaticLabelingExperiment, method extractLabeledDocuments.
private void extractLabeledDocuments(Path root, Path labeledFile) throws IOException {
  List<Path> files =
      Files.walk(root).filter(s -> s.toFile().isFile()).collect(Collectors.toList());
  files.sort(Comparator.comparing(Path::toString));
  WebCorpus corpus = new WebCorpus("label", "label");
  for (Path file : files) {
    if (file.toFile().isDirectory()) {
      continue; // defensive; the stream above already filters to regular files
    }
    Log.info("Adding %s", file);
    List<WebDocument> doc = WebCorpus.loadDocuments(file);
    // Keep only documents that carry labels and have some substance.
    List<WebDocument> labeled = doc.stream()
        .filter(s -> s.getLabels().size() > 0 && s.getContentAsString().length() > 200)
        .collect(Collectors.toList());
    corpus.addDocuments(labeled);
  }
  Log.info("Total amount of files = %d", corpus.getDocuments().size());
  WebCorpus noDuplicates = corpus.copyNoDuplicates();
  Log.info("Corpus size = %d, After removing duplicates = %d",
      corpus.documentCount(), noDuplicates.documentCount());
  Log.info("Saving corpus to %s", labeledFile);
  noDuplicates.save(labeledFile, false);
}
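Once saved, the labeled corpus can be inspected. A hypothetical follow-up that reloads it and counts documents per label, relying only on loadDocuments and getLabels as used in the method above:

  // Hypothetical sketch; reload the saved corpus and tally documents per label.
  Map<String, Integer> labelCounts = new HashMap<>();
  for (WebDocument d : WebCorpus.loadDocuments(labeledFile)) {
    for (String label : d.getLabels()) {
      labelCounts.merge(label, 1, Integer::sum);
    }
  }
  labelCounts.forEach((label, c) -> Log.info("%s = %d", label, c));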
Use of zemberek.corpus.WebDocument in project zemberek-nlp by ahmetaa.
Class DocumentSimilarityExperiment, method checkSimilarity.
public void checkSimilarity(Path model, Path corpusFile, Path outPath) throws IOException {
  FastText fastText = FastText.load(model);
  List<WebDocument> docs = WebCorpus.loadDocuments(corpusFile);
  List<DocumentSimilarity> sims = new ArrayList<>();
  Log.info("Calculating document vectors.");
  for (WebDocument doc : docs) {
    doc.setContent(hack(doc.getLines()));
    // Skip very short documents.
    if (doc.contentLength() < 500) {
      continue;
    }
    // Use at most the first 200 characters to build the sentence vector.
    String str = doc.getContentAsString();
    str = str.length() > 200 ? str.substring(0, 200) : str;
    float[] vec = fastText.sentenceVector(str).clone();
    // float[] vec = fastText.textVectors(doc.getLines()).data_.clone();
    sims.add(new DocumentSimilarity(doc, vec));
  }
  try (PrintWriter pw = new PrintWriter(outPath.toFile(), "utf-8")) {
    int i = 0;
    for (DocumentSimilarity sim : sims) {
      // Print each document followed by its 5 nearest neighbors.
      List<ScoredItem<WebDocument>> nearest = nearestK(sim, sims, 5);
      pw.println("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
      pw.println(String.join("\n", sim.document.getLines()));
      for (ScoredItem<WebDocument> w : nearest) {
        pw.println("----------------------------------");
        pw.println(String.join("\n", w.item.getLines()));
      }
      i++;
      if (i == 100) { // report only the first 100 documents
        break;
      }
    }
  }
}
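The nearestK helper is not part of this snippet. A minimal sketch using cosine similarity over the vectors computed above; the "vector" field name on DocumentSimilarity and the (item, score) ScoredItem constructor are assumptions, not the project's confirmed API:

  // Hypothetical nearestK; assumes DocumentSimilarity stores its vector in a
  // field named "vector" and ScoredItem exposes (item, score).
  static List<ScoredItem<WebDocument>> nearestK(
      DocumentSimilarity target, List<DocumentSimilarity> all, int k) {
    List<ScoredItem<WebDocument>> scored = new ArrayList<>();
    for (DocumentSimilarity candidate : all) {
      if (candidate == target) {
        continue; // do not report a document as its own neighbor
      }
      scored.add(new ScoredItem<>(candidate.document, cosine(target.vector, candidate.vector)));
    }
    scored.sort((a, b) -> Float.compare(b.score, a.score)); // most similar first
    return scored.subList(0, Math.min(k, scored.size()));
  }

  static float cosine(float[] a, float[] b) {
    float dot = 0, normA = 0, normB = 0;
    for (int i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    // Small epsilon guards against zero vectors.
    return (float) (dot / (Math.sqrt(normA) * Math.sqrt(normB) + 1e-9));
  }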