Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa:
class UnsupervisedKeyPhraseExtractor, method collectCorpusStatisticsForLemmas.
/**
 * Collects corpus-wide term and document frequency statistics, counting the last lemma of
 * every acceptable, non-stop-word token in each document.
 *
 * @param corpus   the documents to scan.
 * @param analyzer morphology used for analysis and disambiguation.
 * @param count    if positive, limits how many documents are processed; otherwise all are used.
 * @return aggregated term frequencies, document frequencies and the document count.
 * @throws IOException if reading the corpus fails.
 */
static CorpusStatistics collectCorpusStatisticsForLemmas(
    WebCorpus corpus, TurkishMorphology analyzer, int count) throws IOException {
  CorpusStatistics statistics = new CorpusStatistics(1_000_000);
  int processed = 0;
  for (WebDocument document : corpus.getDocuments()) {
    // Per-document term counts; iterating it afterwards yields the distinct-term set
    // used for document frequencies.
    Histogram<String> termCounts = new Histogram<>();
    for (String sentence : extractor.fromParagraphs(document.getLines())) {
      for (SingleAnalysis analysis : analyzer.analyzeAndDisambiguate(sentence).bestAnalysis()) {
        if (!analysisAcceptable(analysis)) {
          continue;
        }
        String surface = analysis.getStemAndEnding().concat();
        if (TurkishStopWords.DEFAULT.contains(surface)) {
          continue;
        }
        List<String> lemmas = analysis.getLemmas();
        // Only the last lemma of the analysis is counted.
        termCounts.add(lemmas.get(lemmas.size() - 1));
      }
    }
    statistics.termFrequencies.add(termCounts);
    for (String term : termCounts) {
      statistics.documentFrequencies.add(term);
    }
    processed++;
    // Log progress on the 1st, 501st, 1001st ... document (same cadence as before).
    if ((processed - 1) % 500 == 0) {
      Log.info("Doc count = %d", processed);
    }
    // NOTE(review): with a positive limit this breaks only after count + 1 documents have been
    // processed, while documentCount below records count — confirm whether that is intended.
    if (count > 0 && processed > count) {
      break;
    }
  }
  statistics.documentCount =
      count > 0 ? Math.min(count, corpus.documentCount()) : corpus.documentCount();
  return statistics;
}
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa:
class UnsupervisedKeyPhraseExtractor, method lemmaNgrams.
/**
 * Builds lemma n-gram histograms of orders 1..order from the given paragraphs.
 * An n-gram is rejected when any of its tokens has an unacceptable analysis or is a stop word.
 * For each term's first occurrence, the global token index of its start is recorded.
 *
 * @param paragraphs input paragraphs; split into sentences before analysis.
 * @return one histogram per n-gram order (index 0 holds unigrams, etc.).
 */
private List<Histogram<Term>> lemmaNgrams(List<String> paragraphs) {
  List<Histogram<Term>> ngrams = new ArrayList<>(order + 1);
  for (int i = 0; i < order; i++) {
    ngrams.add(new Histogram<>(100));
  }
  // Global token offset of the current sentence within the whole input.
  int tokenCount = 0;
  List<String> sentences = extractor.fromParagraphs(paragraphs);
  for (String sentence : sentences) {
    List<SingleAnalysis> analysis = morphology.analyzeAndDisambiguate(sentence).bestAnalysis();
    for (int i = 0; i < order; i++) {
      int currentOrder = i + 1;
      // Fix: use <= so the n-gram ending at the last token is included. The original `<`
      // bound silently dropped it — e.g. the final unigram of every sentence was never counted.
      for (int j = 0; j <= analysis.size() - currentOrder; j++) {
        String[] words = new String[currentOrder];
        boolean fail = false;
        for (int k = 0; k < currentOrder; k++) {
          SingleAnalysis a = analysis.get(j + k);
          if (!analysisAcceptable(a)) {
            fail = true;
            break;
          }
          String surface = a.getStemAndEnding().concat();
          if (TurkishStopWords.DEFAULT.contains(surface)) {
            fail = true;
            break;
          }
          List<String> lemmas = a.getLemmas();
          words[k] = lemmas.get(lemmas.size() - 1);
        }
        if (!fail) {
          Term term = new Term(words);
          int count = ngrams.get(i).add(term);
          if (count == 1) {
            // First sighting: remember the global token index where this term starts.
            term.setFirstOccurrenceIndex(tokenCount + j);
          }
        }
      }
    }
    // Fix: advance the global token offset exactly once per sentence. The original incremented
    // this inside the start-index loop (once per j, per order), corrupting every
    // first-occurrence index computed above.
    tokenCount += analysis.size();
  }
  return ngrams;
}
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa:
class GenerateDataWithRules, method extractHighlyAmbigiousWordSentences.
/**
 * Scans sentence files under inputRoot and collects word analyses whose analysis set spans more
 * than minCount distinct stems (i.e. highly ambiguous words). Stops collecting once more than
 * wordCount words are gathered (checked after each input file), then writes the words and their
 * full analyses to "&lt;inputRoot-name&gt;-amb.txt" under outRoot.
 *
 * @param inputRoot directory whose immediate files are scanned for sentences.
 * @param outRoot   directory receiving the "-amb.txt" output file.
 * @param minCount  a word qualifies when its analyses contain more than this many distinct stems.
 * @param wordCount soft cap on how many ambiguous words to collect.
 * @throws IOException if directory traversal or output writing fails.
 */
private void extractHighlyAmbigiousWordSentences(
    Path inputRoot, Path outRoot, int minCount, int wordCount) throws IOException {
  List<Path> files;
  // Fix: Files.walk returns a Stream backed by open directory handles; the original never
  // closed it, leaking the handles. try-with-resources guarantees closure.
  try (java.util.stream.Stream<Path> paths = Files.walk(inputRoot, 1)) {
    files = paths.filter(s -> s.toFile().isFile()).collect(Collectors.toList());
  }
  Histogram<WordAnalysis> wordAnalyses = new Histogram<>();
  for (Path file : files) {
    Log.info("Processing %s", file);
    LinkedHashSet<String> sentences = getSentences(file);
    // Process in chunks of 5000 sentences to bound per-batch work.
    List<List<String>> group = group(new ArrayList<>(sentences), 5000);
    for (List<String> lines : group) {
      Log.info("Collected %d words.", wordAnalyses.size());
      LinkedHashSet<String> toProcess = getAccpetableSentences(lines);
      for (String sentence : toProcess) {
        try {
          SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(sentence);
          for (SentenceWordAnalysis analysis : sentenceAnalysis) {
            HashSet<String> stems = new HashSet<>(4);
            for (SingleAnalysis s : analysis.getWordAnalysis()) {
              stems.add(s.getStem());
              // More than minCount distinct stems: this word is ambiguous enough; record it.
              if (stems.size() > minCount) {
                wordAnalyses.add(analysis.getWordAnalysis());
                break;
              }
            }
          }
        } catch (Exception e) {
          // Best-effort: a single bad sentence must not abort the whole extraction.
          Log.warn("Error in sentence %s", sentence);
        }
      }
    }
    // The cap is only checked between files, so the total may exceed wordCount.
    if (wordAnalyses.size() > wordCount) {
      break;
    }
  }
  String s = inputRoot.toFile().getName();
  Path amb = outRoot.resolve(s + "-amb.txt");
  try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
    for (WordAnalysis wa : wordAnalyses.getSortedList()) {
      pwa.println(wa.getInput());
      for (SingleAnalysis analysis : wa) {
        pwa.println(analysis.formatLong());
      }
      pwa.println();
    }
  }
}
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa:
class ZemberekNlpScripts, method unknownZemberek.
/**
 * Loads a word frequency histogram, removes every word Zemberek could parse, and writes the
 * remaining (unparseable) words twice: sorted by frequency and sorted with the Turkish collator.
 * Marked @Ignore — this is a data-preparation script, not a test.
 */
@Test
@Ignore("Not a Test.")
public void unknownZemberek() throws IOException {
  Path wordFreqFile = DATA_PATH.resolve("vocab.all.freq");
  Log.info("Loading histogram.");
  Histogram<String> histogram = Histogram.loadFromUtf8File(wordFreqFile, ' ');
  Path dir = DATA_PATH.resolve("out");
  Log.info("Loading parseable.");
  histogram.removeAll(Files.readAllLines(dir.resolve("zemberek-parsed-words.txt")));
  // histogram.removeSmaller(10);
  Log.info("Saving.");
  Files.write(dir.resolve("no-parse-zemberek-freq.txt"), histogram.getSortedList());
  Files.write(
      dir.resolve("no-parse-zemberek-tr.txt"),
      histogram.getSortedList(turkishCollator::compare));
}
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa:
class DocumentSimilarityExperiment, method removeDuplicates.
/**
 * Copies lines from input to output, keeping at most k occurrences of each distinct line while
 * preserving the original line order.
 *
 * @param input  file whose lines are deduplicated.
 * @param output destination file (UTF-8).
 * @param k      maximum number of times any single line may appear in the output.
 * @throws IOException if reading or writing fails.
 */
public void removeDuplicates(Path input, Path output, int k) throws IOException {
  List<String> lines = Files.readAllLines(input);
  Log.info("Sentence count = %d", lines.size());
  // Each line's remaining "allowance": its total count, capped at k below.
  Histogram<String> remaining = new Histogram<>(10_000_000);
  remaining.add(lines);
  for (String line : remaining.getSortedList()) {
    if (remaining.getCount(line) > k) {
      remaining.set(line, k);
    }
  }
  int written = 0;
  try (PrintWriter pw = new PrintWriter(output.toFile(), "utf-8")) {
    for (String line : lines) {
      // Emit the line only while it still has allowance left, then consume one unit of it.
      if (remaining.getCount(line) > 0) {
        pw.println(line);
        remaining.decrementIfPositive(line);
        written++;
      }
    }
  }
  Log.info("New count = %d", written);
}
Aggregations