Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class UnsupervisedKeyPhraseExtractor, method collectCorpusStatisticsForLemmas:
static CorpusStatistics collectCorpusStatisticsForLemmas(
    WebCorpus corpus, TurkishSentenceAnalyzer analyzer, int count) throws IOException {
  CorpusStatistics statistics = new CorpusStatistics(1_000_000);
  int docCount = 0;
  for (WebDocument document : corpus.getDocuments()) {
    Histogram<String> docHistogram = new Histogram<>();
    List<String> sentences = extractor.fromParagraphs(document.getLines());
    for (String sentence : sentences) {
      List<WordAnalysis> analysis = analyzer.bestParse(sentence);
      for (WordAnalysis w : analysis) {
        if (!analysisAcceptable(w)) {
          continue;
        }
        String s = w.getSurfaceForm();
        if (TurkishStopWords.DEFAULT.contains(s)) {
          continue;
        }
        List<String> lemmas = w.getLemmas();
        docHistogram.add(lemmas.get(lemmas.size() - 1));
      }
    }
    statistics.termFrequencies.add(docHistogram);
    for (String s : docHistogram) {
      statistics.documentFrequencies.add(s);
    }
    if (docCount++ % 500 == 0) {
      Log.info("Doc count = %d", docCount);
    }
    if (count > 0 && docCount > count) {
      break;
    }
  }
  statistics.documentCount =
      count > 0 ? Math.min(count, corpus.documentCount()) : corpus.documentCount();
  return statistics;
}
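The method above shows the standard term/document-frequency pattern with Histogram: a fresh per-document Histogram counts occurrences within one document, Histogram.add(Histogram) merges those counts into the corpus-wide term-frequency table, and iterating the per-document Histogram visits each distinct key once, so document frequencies grow by at most one per document. A minimal, self-contained sketch of just that pattern, with made-up lemma lists standing in for the corpus and analyzer:

import java.util.Arrays;
import java.util.List;
import zemberek.core.collections.Histogram;

public class TermDocumentFrequencies {
  public static void main(String[] args) {
    Histogram<String> termFrequencies = new Histogram<>();
    Histogram<String> documentFrequencies = new Histogram<>();
    // Hypothetical lemma lists standing in for analyzed documents.
    List<List<String>> documents = Arrays.asList(
        Arrays.asList("ev", "git", "ev"),
        Arrays.asList("git", "gel"));
    for (List<String> lemmas : documents) {
      Histogram<String> docHistogram = new Histogram<>();
      docHistogram.add(lemmas);            // counts duplicates within a document
      termFrequencies.add(docHistogram);   // merges counts into the corpus histogram
      for (String lemma : docHistogram) {  // iterates distinct keys only,
        documentFrequencies.add(lemma);    // so each document contributes at most 1
      }
    }
    System.out.println(termFrequencies.getCount("ev"));       // 2
    System.out.println(documentFrequencies.getCount("ev"));   // 1
    System.out.println(documentFrequencies.getCount("git"));  // 2
  }
}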
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class GenerateVocabulary, method run:
@Override
protected void run() throws Exception {
  if (!corpus.exists()) {
    throw new IllegalArgumentException("Can not find the corpus file: " + corpus);
  }
  if (top < -1 || top == 0) {
    throw new IllegalArgumentException("Illegal value for n: " + top);
  }
  Set<String> wordsToInclude = getWordsFromFile(includeFile);
  Log.info("Amount of words to include using include file: %d", wordsToInclude.size());
  Set<String> wordsToExclude = getWordsFromFile(excludeFile);
  Log.info("Amount of words to exclude using exclude file: %d", wordsToExclude.size());
  Set<String> intersection = Sets.newHashSet(wordsToExclude);
  intersection.retainAll(wordsToInclude);
  if (intersection.size() != 0) {
    Log.warn("There are matching words in both include and exclude files: " + intersection.toString());
  }
  Collator collator = Collator.getInstance(Locale.ENGLISH);
  if (sortLocale != null) {
    collator = Collator.getInstance(new Locale(sortLocale));
  }
  Log.info("Processing corpus: %s", corpus);
  try (BufferedReader reader = new BufferedReader(
      new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
    String line;
    Histogram<String> histogram = new Histogram<>(50000);
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    int count = 0;
    while ((line = reader.readLine()) != null) {
      List<String> words = Lists.newArrayList(tokenizer.split(line));
      if (words.isEmpty()) {
        continue;
      }
      histogram.add(words);
      if (count % 500000 == 0 && count != 0) {
        Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
      }
      count++;
    }
    Log.info("A total of %d lines have been processed. Vocabulary Size: %d", count, histogram.size());
    if (top >= histogram.size()) {
      top = histogram.size();
    } else {
      Log.info("Top %d words will be used.", top);
    }
    List<String> mostFrequent = histogram.getTop(top);
    Log.info("Coverage: %.3f", 100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
    LinkedHashSet<String> resultSet = Sets.newLinkedHashSet(mostFrequent);
    resultSet.addAll(wordsToInclude);
    resultSet.removeAll(wordsToExclude);
    List<String> result = Lists.newArrayList(resultSet);
    Log.info("Total size of vocabulary: %d", result.size());
    if (ordered) {
      Log.info("Sorting file with word order.");
      Collections.sort(result, collator);
    }
    com.google.common.io.Files.createParentDirs(outFile);
    Log.info("Saving to vocabulary file: %s", outFile);
    SimpleTextWriter.utf8Builder(outFile).addNewLineBeforClose().build().writeLines(result);
    Log.info("Done.");
  }
}
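Stripped of logging, validation, and the include/exclude bookkeeping, the core of the vocabulary builder is three Histogram calls: add(list) to count tokens, getTop(n) for the n most frequent keys, and totalCount(list) over totalCount() for coverage. A condensed sketch under those assumptions, using a hypothetical corpus.txt and plain JDK whitespace splitting in place of SpaceTabTokenizer:

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import zemberek.core.collections.Histogram;

public class VocabularySketch {
  public static void main(String[] args) throws IOException {
    Histogram<String> histogram = new Histogram<>(50_000);
    try (BufferedReader reader =
        Files.newBufferedReader(Paths.get("corpus.txt"), StandardCharsets.UTF_8)) {
      String line;
      while ((line = reader.readLine()) != null) {
        line = line.trim();
        if (line.isEmpty()) {
          continue;
        }
        histogram.add(Arrays.asList(line.split("\\s+")));
      }
    }
    int top = Math.min(10_000, histogram.size());
    List<String> mostFrequent = histogram.getTop(top);
    // Fraction of all token occurrences covered by the top keys.
    double coverage = 100d * histogram.totalCount(mostFrequent) / histogram.totalCount();
    System.out.printf("Vocabulary: %d, coverage: %.3f%%%n", top, coverage);
  }
}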
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class TurkishSentenceAnalyzerTest, method doParseSentencesInCorpus:
private void doParseSentencesInCorpus(File ntvmsnbcCorpus) throws IOException {
  List<String> sentences = SimpleTextReader.trimmingUTF8Reader(ntvmsnbcCorpus).asStringList();
  Stopwatch sw = Stopwatch.createStarted();
  long wc = 0;
  int s = 0;
  Histogram<String> unknownStuff = new Histogram<>();
  for (String sentence : sentences) {
    SentenceAnalysis parse = parser.analyze(sentence);
    for (SentenceAnalysis.Entry entry : parse) {
      List<WordAnalysis> parses = entry.parses;
      for (WordAnalysis wordAnalysis : parses) {
        if (wordAnalysis.dictionaryItem == DictionaryItem.UNKNOWN) {
          unknownStuff.add(wordAnalysis.getSurfaceForm());
        }
      }
    }
    wc += parse.size();
    // parser.disambiguate(parse);
    s++;
    if (s % 10000 == 0) {
      System.out.println(s);
      System.out.println(sw.elapsed(TimeUnit.MILLISECONDS) / 1000d);
    }
  }
  try (PrintWriter pw = new PrintWriter("unknown.txt", "utf-8")) {
    for (String s1 : unknownStuff.getSortedList()) {
      pw.println(s1 + " " + unknownStuff.getCount(s1));
    }
  }
  System.out.println("Word count = " + wc);
  System.out.println("Elapsed Time =" + sw.elapsed(TimeUnit.MILLISECONDS));
  System.out.println("Parse and disambiguate per second = " + (wc * 1000d) / (sw.elapsed(TimeUnit.MILLISECONDS)));
}
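The unknown-word report at the end leans on two Histogram methods: getSortedList, which returns the keys ordered by count, and getCount for each key's total. A tiny illustrative sketch (the most-frequent-first ordering matches how the test uses it; treat that as an assumption):

import zemberek.core.collections.Histogram;

public class SortedDumpSketch {
  public static void main(String[] args) {
    Histogram<String> unknown = new Histogram<>();
    unknown.add("foo");
    unknown.add("foo");
    unknown.add("bar");
    // Keys ordered by count (assumed most frequent first, as in the test above).
    for (String key : unknown.getSortedList()) {
      System.out.println(key + " " + unknown.getCount(key));  // foo 2, then bar 1
    }
  }
}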
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class SpeedTest, method testNewsCorpus:
@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpus() throws IOException {
  // Path p = Paths.get("/media/aaa/Data/corpora/me-sentences/www.aljazeera.com.tr/2018-02-22");
  Path p = Paths.get("src/test/resources/corpora/cnn-turk-10k");
  List<String> sentences = getSentences(p);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Stopwatch sw = Stopwatch.createStarted();
  int tokenCount = 0;
  int noAnalysis = 0;
  int sentenceCount = 0;
  Histogram<String> failedWords = new Histogram<>(100000);
  for (String sentence : sentences) {
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    for (Token token : tokens) {
      if (token.getType() == Token.Type.Punctuation) {
        continue;
      }
      tokenCount++;
      WordAnalysis results = morphology.analyze(token.getText());
      if (!results.isCorrect()) {
        noAnalysis++;
        failedWords.add(token.getText());
      }
    }
    sentenceCount++;
    if (sentenceCount % 2000 == 0) {
      Log.info("%d tokens analyzed.", tokenCount);
    }
  }
  double seconds = sw.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
  double speed = tokenCount / seconds;
  double parseRatio = 100 - (noAnalysis * 100d / tokenCount);
  Log.info("%nElapsed = %.2f seconds", seconds);
  Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n",
      tokenCount, parseRatio, speed);
  Log.info("Saving Unknown Tokens");
  failedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
  failedWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
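For persisting results, Histogram offers the two writers used above: saveSortedByCounts(path, delimiter) writes key-delimiter-count lines ordered by frequency, and saveSortedByKeys(path, delimiter, comparator) orders the same lines by a key comparator (here Turkish collation). A minimal sketch with hypothetical output paths and a natural-order comparator standing in for Turkish.STRING_COMPARATOR_ASC:

import java.io.IOException;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Comparator;
import zemberek.core.collections.Histogram;

public class SaveHistogramSketch {
  public static void main(String[] args) throws IOException {
    Histogram<String> failedWords = new Histogram<>(100);
    failedWords.add(Arrays.asList("foo", "bar", "foo"));
    failedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");  // most frequent first
    failedWords.saveSortedByKeys(Paths.get("unk"), " ", Comparator.naturalOrder());
  }
}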
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class UnsupervisedKeyPhraseExtractor, method collectCorpusStatistics:
static CorpusStatistics collectCorpusStatistics(WebCorpus corpus) throws IOException {
  CorpusStatistics statistics = new CorpusStatistics(1_000_000);
  for (WebDocument document : corpus.getDocuments()) {
    Histogram<String> docHistogram = new Histogram<>();
    List<String> sentences = extractor.fromParagraphs(document.getLines());
    for (String sentence : sentences) {
      List<Token> tokens = lexer.tokenize(sentence);
      for (Token token : tokens) {
        if (!tokenTypeAccpetable(token)) {
          continue;
        }
        String s = normalize(token.getText());
        if (TurkishStopWords.DEFAULT.contains(s)) {
          continue;
        }
        docHistogram.add(s);
      }
    }
    statistics.termFrequencies.add(docHistogram);
    for (String s : docHistogram) {
      statistics.documentFrequencies.add(s);
    }
  }
  statistics.documentCount = corpus.documentCount();
  return statistics;
}
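Together, termFrequencies, documentFrequencies, and documentCount are exactly the inputs of a standard TF-IDF weighting, which is presumably what the extractor scores candidate phrases with. A hedged sketch, not taken from the project, assuming only the CorpusStatistics fields and Histogram calls shown above:

// tf * log(N / df), computed from the Histogram counts gathered above.
static double tfIdf(String term, Histogram<String> docHistogram, CorpusStatistics statistics) {
  double tf = docHistogram.getCount(term);
  double df = Math.max(1, statistics.documentFrequencies.getCount(term));  // avoid log of N/0
  return tf * Math.log((double) statistics.documentCount / df);
}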