Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa: class SpeedTest, method testNewsCorpusNoCache.
@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpusNoCache() throws IOException {
  Path p = Paths.get("src/main/resources/corpora/cnn-turk-10k");
  List<String> sentences = getSentences(p);
  RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
  InterpretingAnalyzer analyzer = new InterpretingAnalyzer(lexicon);
  Stopwatch sw = Stopwatch.createStarted();
  int tokenCount = 0;
  int noAnalysis = 0;
  int sentenceCount = 0;
  // Collects the tokens the analyzer could not analyze, together with their frequencies.
  Histogram<String> failedWords = new Histogram<>(100000);
  for (String sentence : sentences) {
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    for (Token token : tokens) {
      if (token.getType() == TurkishLexer.Punctuation) {
        continue;
      }
      tokenCount++;
      List<_SingleAnalysis> results = analyzer.analyze(token.getText());
      if (results.size() == 0) {
        noAnalysis++;
        failedWords.add(token.getText());
      }
    }
    sentenceCount++;
    if (sentenceCount % 2000 == 0) {
      Log.info("%d tokens analyzed.", tokenCount);
    }
  }
  double seconds = sw.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
  double speed = tokenCount / seconds;
  double parseRatio = 100 - (noAnalysis * 100d / tokenCount);
  Log.info("%nElapsed = %.2f seconds", seconds);
  Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n",
      tokenCount, parseRatio, speed);
  Log.info("Saving Unknown Tokens");
  // Write the unknown tokens twice: once sorted by frequency, once sorted alphabetically.
  failedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
  failedWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
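The Histogram usage in this test reduces to three calls: add for every token that could not be analyzed, then saveSortedByCounts and saveSortedByKeys to write the results. Below is a minimal, self-contained sketch of just those calls; the sample words and the output file name are invented for illustration, and the output format (key and count per line, separated by the given delimiter) is an assumption based on how the files are used above.

import java.nio.file.Paths;
import java.util.Arrays;
import zemberek.core.collections.Histogram;

public class FailedWordSketch {
  public static void main(String[] args) throws Exception {
    Histogram<String> failedWords = new Histogram<>(1000);
    // Pretend these are tokens the analyzer failed on; duplicates raise the count.
    for (String token : Arrays.asList("foo", "bar", "foo", "baz", "foo")) {
      failedWords.add(token);
    }
    // Most frequent entries first, key and count separated by the delimiter.
    failedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
    System.out.println("distinct failed tokens = " + failedWords.size());
  }
}

saveSortedByKeys works the same way but additionally takes a Comparator (Turkish.STRING_COMPARATOR_ASC in the test) to control key order.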
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa: class AutomaticLabelingExperiment, method generateSetForLabelExperiment.
Set<String> generateSetForLabelExperiment(Path input, TurkishSentenceAnalyzer analyzer, boolean useRoots)
    throws IOException {
  WebCorpus corpus = new WebCorpus("label", "labeled");
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  Log.info("Extracting data.");
  // Count how many documents carry each (lowercased) label.
  Histogram<String> labelCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    List<String> labels = document.getLabels();
    List<String> lowerCase = labels.stream()
        .filter(s -> s.length() > 1)
        .map(s -> s.toLowerCase(Turkish.LOCALE))
        .collect(Collectors.toList());
    labelCounts.add(lowerCase);
  }
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-all"), " ");
  Log.info("All label count = %d", labelCounts.size());
  // Keep only labels that occur at least 15 times.
  labelCounts.removeSmaller(15);
  Log.info("Reduced label count = %d", labelCounts.size());
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-reduced"), " ");
  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  Set<Long> contentHash = new HashSet<>();
  for (WebDocument document : corpus.getDocuments()) {
    // Skip documents with duplicate content.
    Long hash = document.getHash();
    if (contentHash.contains(hash)) {
      continue;
    }
    contentHash.add(hash);
    List<String> labelTags = new ArrayList<>();
    boolean labelFound = false;
    for (String label : document.getLabels()) {
      if (labelCounts.contains(label)) {
        labelTags.add("__label__" + label.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE));
        labelFound = true;
      }
    }
    if (!labelFound) {
      continue;
    }
    String labelStr = String.join(" ", labelTags);
    String content = document.getContentAsString();
    String processed = processContent(analyzer, content, useRoots);
    if (processed.length() < 200) {
      continue;
    }
    set.add("#" + document.getId() + " " + labelStr + " " + processed);
    if (c++ % 1000 == 0) {
      Log.info("%d processed.", c);
    }
  }
  Log.info("Generate train and test set.");
  Collections.shuffle(set, new Random(1));
  return new LinkedHashSet<>(set);
}
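Three Histogram methods carry this experiment: the bulk add(Collection) overload that increments the count of every element, removeSmaller to prune rare labels, and contains to test whether a label survived pruning. A minimal sketch of that pattern follows; the label lists are invented, and it assumes removeSmaller drops keys whose count is below the given threshold, which matches how it is paired with a minimum-frequency value elsewhere in the project.

import java.util.Arrays;
import java.util.List;
import zemberek.core.collections.Histogram;

public class LabelPruningSketch {
  public static void main(String[] args) {
    Histogram<String> labelCounts = new Histogram<>();
    List<List<String>> documentLabels = Arrays.asList(
        Arrays.asList("spor", "futbol"),
        Arrays.asList("spor"),
        Arrays.asList("ekonomi"));
    for (List<String> labels : documentLabels) {
      labelCounts.add(labels); // bulk add: one increment per list element
    }
    labelCounts.removeSmaller(2); // keep only labels seen at least twice
    System.out.println(labelCounts.contains("spor"));    // true
    System.out.println(labelCounts.contains("ekonomi")); // false after pruning
  }
}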
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa: class GenerateVocabulary, method run.
@Override
public void run() throws Exception {
  if (!corpus.exists()) {
    throw new IllegalArgumentException("Cannot find the corpus file: " + corpus);
  }
  if (top < -1 || top == 0) {
    throw new IllegalArgumentException("Illegal value for -top: " + top);
  }
  Set<String> wordsToInclude = getWordsFromFile(includeFile);
  if (wordsToInclude.size() > 0) {
    Log.info("Amount of words to include using include file: %d", wordsToInclude.size());
  }
  Set<String> wordsToExclude = getWordsFromFile(excludeFile);
  if (wordsToExclude.size() > 0) {
    Log.info("Amount of words to exclude using exclude file: %d", wordsToExclude.size());
  }
  Set<String> intersection = Sets.newHashSet(wordsToExclude);
  intersection.retainAll(wordsToInclude);
  if (intersection.size() != 0) {
    Log.warn("There are matching words in both include and exclude files: " + intersection.toString());
  }
  Collator collator = Collator.getInstance(Locale.ENGLISH);
  if (sortLocale != null) {
    collator = Collator.getInstance(new Locale(sortLocale));
  }
  Log.info("Processing corpus: %s", corpus);
  try (BufferedReader reader = new BufferedReader(
      new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
    String line;
    // Word frequencies over the whole corpus.
    Histogram<String> histogram = new Histogram<>(50000);
    int count = 0;
    while ((line = reader.readLine()) != null) {
      List<String> words = new ArrayList<>(10);
      for (String word : Splitter.on(" ").omitEmptyStrings().trimResults().split(line)) {
        if (word.length() > 30) {
          Log.warn("Too long word %s", word);
        }
        if (!countMetaWords) {
          if (word.contains("<") || word.equalsIgnoreCase(">")) {
            continue;
          }
        }
        words.add(word);
      }
      if (words.isEmpty()) {
        continue;
      }
      histogram.add(words);
      if (count % 500000 == 0 && count != 0) {
        Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
      }
      count++;
    }
    Log.info("A total of %d lines have been processed. Vocabulary Size: %d", count, histogram.size());
    if (minFreq > 1) {
      histogram.removeSmaller(minFreq);
    }
    if (top >= histogram.size() || top == -1) {
      top = histogram.size();
      Log.info("All %d words will be in the vocabulary.", top);
    } else {
      Log.info("Top %d words will be used in the vocabulary.", top);
    }
    List<String> mostFrequent;
    if (top > 0) {
      mostFrequent = histogram.getTop(top);
    } else {
      mostFrequent = histogram.getSortedList();
    }
    Log.info("Coverage: %.3f",
        100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
    LinkedHashSet<String> resultSet = Sets.newLinkedHashSet(mostFrequent);
    resultSet.addAll(wordsToInclude);
    resultSet.removeAll(wordsToExclude);
    List<String> result = Lists.newArrayList(resultSet);
    Log.info("Total size of vocabulary: %d", result.size());
    if (ordered) {
      Log.info("Sorting file with word order.");
      Collections.sort(result, collator);
    }
    com.google.common.io.Files.createParentDirs(outFile);
    Log.info("Saving to vocabulary file: %s", outFile);
    if (!writeFrequencies) {
      SimpleTextWriter.utf8Builder(outFile).addNewLineBeforClose().build().writeLines(result);
    } else {
      Log.info("Frequency values will be written with words.");
      try (SimpleTextWriter stw = SimpleTextWriter.keepOpenUTF8Writer(outFile)) {
        for (String s : result) {
          stw.writeLine(s + frequencyFileDelimiter + histogram.getCount(s));
        }
      }
    }
    Log.info("Done.");
  }
}
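Vocabulary selection above leans on getTop, getSortedList and the two totalCount variants; the Coverage log line divides the token count covered by the selected keys by the total token count. A stripped-down version of that computation, with a tiny invented word list:

import java.util.Arrays;
import java.util.List;
import zemberek.core.collections.Histogram;

public class CoverageSketch {
  public static void main(String[] args) {
    Histogram<String> histogram = new Histogram<>();
    histogram.add(Arrays.asList("a", "a", "a", "b", "b", "c", "d"));
    List<String> top = histogram.getTop(2); // two most frequent keys: a, b
    double coverage = 100d * histogram.totalCount(top) / histogram.totalCount();
    System.out.printf("Coverage: %.3f%n", coverage); // 5 of 7 tokens, about 71.4
    System.out.println(histogram.getSortedList());   // all keys, most frequent first
  }
}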
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa: class AmbiguityStats, method ambiguousGroupStats.
public void ambiguousGroupStats(String filename) throws IOException {
  List<String> lines = readAll(filename);
  Histogram<String> uniques = new Histogram<>(1000000);
  Map<String, Histogram<String>> ambiguityGroups = Maps.newHashMap();
  int total = 0;
  for (String line : lines) {
    for (String s : splitter.split(line)) {
      WordAnalysis results = parser.analyze(s);
      if (++total % 50000 == 0) {
        System.out.println("Processed: " + total);
      }
      if (results.analysisCount() > 1) {
        String key = generateKeyFromParse(results);
        uniques.add(key);
        Histogram<String> members = ambiguityGroups.get(key);
        if (members == null) {
          members = new Histogram<>();
          ambiguityGroups.put(key, members);
        }
        members.add(s);
      }
    }
  }
  System.out.println("Total: " + total);
  Stats st = new Stats(0.1);
  st.allCounts = (int) uniques.totalCount();
  st.allUniques = uniques.size();
  for (String s : uniques.getSortedList()) {
    int count = uniques.getCount(s);
    if (st.overCutoff(count)) {
      String p1 = percentStr(count, st.allCounts);
      st.significantCounts += count;
      st.significantUniques++;
      System.out.println(s + " : " + count + " " + pp(p1));
      Histogram<String> members = ambiguityGroups.get(s);
      for (String member : members.getSortedList()) {
        int memberCount = members.getCount(member);
        if (pct(memberCount, count) > 0.1) {
          System.out.println(member + " : " + members.getCount(member));
        }
      }
      System.out.println();
    }
  }
  st.dump();
}
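The grouping pattern here keeps one Histogram per ambiguity key inside a plain Map, using a get/null-check/put sequence. The same structure can also be written with Map.computeIfAbsent; the sketch below uses invented keys and surface forms rather than the real output of generateKeyFromParse, and only relies on Histogram methods already shown above (add, getSortedList, getCount).

import java.util.HashMap;
import java.util.Map;
import zemberek.core.collections.Histogram;

public class GroupedCountsSketch {
  public static void main(String[] args) {
    Histogram<String> keyCounts = new Histogram<>();
    Map<String, Histogram<String>> groups = new HashMap<>();
    String[][] observations = {
        {"keyA", "yüzü"}, {"keyA", "boyu"}, {"keyB", "dolu"}};
    for (String[] obs : observations) {
      String key = obs[0];
      String word = obs[1];
      keyCounts.add(key);
      // Create the per-key histogram on first use, then count the member word.
      groups.computeIfAbsent(key, k -> new Histogram<>()).add(word);
    }
    for (String key : keyCounts.getSortedList()) {
      System.out.println(key + " : " + keyCounts.getCount(key));
      for (String member : groups.get(key).getSortedList()) {
        System.out.println("  " + member + " : " + groups.get(key).getCount(member));
      }
    }
  }
}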
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa: class AmbiguityStats, method noParse.
public void noParse(String... filename) throws IOException {
  Histogram<String> uniques = new Histogram<>(1000000);
  int total = 0;
  for (String file : filename) {
    List<String> lines = readAll(file);
    Splitter splitter = Splitter.on(" ").omitEmptyStrings().trimResults();
    for (String line : lines) {
      for (String s : splitter.split(line)) {
        WordAnalysis results = parser.analyze(s);
        total++;
        if (total % 50000 == 0) {
          System.out.println("Processed: " + total);
        }
        if (results.analysisCount() == 0) {
          uniques.add(s);
        }
      }
    }
    System.out.println("Total: " + total);
  }
  Stats st = new Stats(0.0002);
  st.allCounts = (int) uniques.totalCount();
  st.allUniques = uniques.size();
  for (String s : uniques.getSortedList()) {
    int count = uniques.getCount(s);
    if (count > 5) {
      st.significantCounts += count;
      st.significantUniques++;
      System.out.println(s + " : " + count);
    }
  }
  st.dump();
}