Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
Class _WordCollector, method extracData.
/**
 * Collects a word-frequency histogram from all ".corpus" files located directly under {@code p}
 * (depth 1, no recursion) and writes four report files into {@code outRoot}.
 *
 * @param p           directory containing ".corpus" files.
 * @param outRoot     output directory for the count/word report files.
 * @param resultLimit currently unused — TODO(review): either apply it when saving or remove it
 *                    once all callers are updated.
 * @return histogram of normalized words and their corpus frequencies.
 * @throws IOException if reading a corpus file or writing a report file fails.
 */
public Histogram<String> extracData(Path p, Path outRoot, int resultLimit) throws IOException {
  Histogram<String> words = new Histogram<>(5_000_000);
  List<Path> files;
  // Files.walk returns a lazily-populated Stream backed by open directory handles;
  // it must be closed, so collect inside try-with-resources. (Fixes a resource leak.)
  try (java.util.stream.Stream<Path> paths = Files.walk(p, 1)) {
    files = paths
        .filter(s -> s.toFile().isFile() && s.toFile().getName().endsWith(".corpus"))
        .collect(Collectors.toList());
  }
  for (Path file : files) {
    Log.info("Processing %s", file);
    // Drop metadata/markup lines (they start with '<') before sentence extraction.
    List<String> lines = Files.readAllLines(file, StandardCharsets.UTF_8).stream()
        .filter(s -> !s.startsWith("<"))
        .collect(Collectors.toList());
    List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(lines);
    for (String sentence : sentences) {
      // Normalize whitespace runs, slashes, hyphens and non-breaking spaces to a single space,
      // then strip soft hyphens entirely.
      sentence = sentence.replaceAll("[\\s/\\-\\u00a0]+", " ");
      sentence = sentence.replaceAll("[\\u00ad]", "");
      List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
      for (Token token : tokens) {
        String rawWord = token.getText();
        // Skip tokens that contain digits or underscores.
        if (!Strings.containsNone(rawWord, "0123456789_")) {
          continue;
        }
        // Normalize case with Turkish-aware rules (e.g. dotted/dotless i).
        String word = Character.isUpperCase(rawWord.charAt(0))
            ? Turkish.capitalize(rawWord)
            : rawWord.toLowerCase(Turkish.LOCALE);
        words.add(word);
      }
    }
    Log.info("Count = %d", words.size());
  }
  String s = p.toFile().getName();
  Log.info("Saving words.");
  // Words with counts, sorted alphabetically (Turkish collation).
  words.saveSortedByKeys(outRoot.resolve(s + "-counts-sorted-name.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  // Words with counts, sorted by descending frequency.
  words.saveSortedByCounts(outRoot.resolve(s + "-counts-sorted-freq.txt"), " ");
  // Plain word lists (no counts), by frequency and alphabetically.
  Files.write(outRoot.resolve(s + "-words-sorted-freq.txt"), words.getSortedList());
  Files.write(outRoot.resolve(s + "-words-sorted-name.txt"), words.getSortedList(Turkish.STRING_COMPARATOR_ASC));
  return words;
}
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
Class _WordCollector, method mergeHistorgrams.
/**
 * Merges all per-corpus "-counts-sorted-freq.txt" histogram files found directly under
 * {@code path} (depth 1) into a single histogram and saves it with the id "all".
 *
 * @param path directory containing the individual histogram files.
 * @throws IOException if a histogram file cannot be read or the merged result cannot be saved.
 */
public static void mergeHistorgrams(Path path) throws IOException {
  List<Path> files;
  // Files.walk must be closed to release directory handles. (Fixes a resource leak.)
  try (java.util.stream.Stream<Path> paths = Files.walk(path, 1)) {
    files = paths
        .filter(s -> s.toFile().isFile() && s.toFile().getName().endsWith("-counts-sorted-freq.txt"))
        .collect(Collectors.toList());
  }
  Histogram<String> words = new Histogram<>(10_000_000);
  for (Path file : files) {
    // Typo fix: "Laoding" -> "Loading".
    Log.info("Loading histogram for %s", file);
    Histogram<String> h = Histogram.loadFromUtf8File(file, ' ');
    words.add(h);
  }
  saveHistogram(path, "all", words);
}
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
Class CharNgramCountModel, method load.
/**
 * Reads a CharNgramCountModel from its custom binary serialization format.
 * Layout: order (int), model id (UTF), then for each gram order 1..order a key count (int)
 * followed by that many (key UTF, count int) pairs.
 *
 * @param is InputStream to load data.
 * @return a CharNgramCountModel generated from file.
 */
public static CharNgramCountModel load(InputStream is) throws IOException {
  try (DataInputStream in = new DataInputStream(new BufferedInputStream(is))) {
    int order = in.readInt();
    String id = in.readUTF();
    // Index 0 is intentionally left null; gram orders are 1-based.
    Histogram<String>[] counts = new Histogram[order + 1];
    for (int gram = 1; gram <= order; gram++) {
      int keyCount = in.readInt();
      Histogram<String> histogram = new Histogram<>(keyCount * 2);
      for (int k = 0; k < keyCount; k++) {
        // Argument evaluation is left-to-right, so the key is read before its count.
        histogram.add(in.readUTF(), in.readInt());
      }
      counts[gram] = histogram;
    }
    return new CharNgramCountModel(id, order, counts);
  }
}
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
Class ConfusionTest, method testAll.
/**
 * Runs every test set against every known language and prints, per language:
 * identification throughput, false-positive count, total piece count, and
 * precision/recall-style percentages. Output goes to stdout; nothing is asserted.
 */
public void testAll() throws IOException {
int sliceLength = 1000;
int maxSliceCount = 1000;
List<TestSet> sets = allSets(maxSliceCount, sliceLength);
Set<String> languages = identifier.getLanguages();
for (String language : languages) {
System.out.println(language);
Stopwatch sw = Stopwatch.createStarted();
int falsePositives = 0;
int totalCount = 0;
int correctlyFound = 0;
int correctAmount = 0;
for (TestSet set : sets) {
/* if(!set.modelId.equals("tr"))
continue;*/
totalCount += set.size();
// Histogram of identified-language ids for this set's pieces.
Histogram<String> result = new Histogram<>();
for (String s : set.testPieces) {
/*
LanguageIdentifier.IdResult idResult = identifier.identifyFullConf(s);
result.add(idResult.id);
*/
String t = identifier.identify(s);
// Empty branch kept for the commented-out diagnostics below.
if (set.modelId.equals(language) && !t.equals(language)) {
/* if (identifier.containsLanguage(s, "tr", 100, -1))
System.out.println("Has tr slice!");
System.out.println(t + " " + s);*/
}
result.add(t);
// result.add(identifier.identifyWithSampling(s,sliceLength));
// result.add(identifier.identifyWithSampling(s, 4));
}
// Matching set: record how many pieces were correctly identified and dump the
// per-language breakdown. The `continue` skips the false-positive tally below,
// so a set never counts against its own language.
if (set.modelId.equals(language)) {
System.out.println("Lang test size:" + set.size());
correctlyFound = result.getCount(language);
correctAmount = set.size();
List<String> sorted = result.getSortedList();
for (String s : sorted) {
System.out.println(s + " : " + result.getCount(s));
}
continue;
} else {
// NOTE(review): this else is redundant given the `continue` above, but is kept as-is.
int fpcount = result.getCount(language);
if (fpcount > 0) {
System.out.println(set.modelId + " " + fpcount);
}
}
// Pieces from a foreign set identified as `language` are false positives.
falsePositives += result.getCount(language);
}
double elapsed = sw.elapsed(TimeUnit.MILLISECONDS);
System.out.println(String.format(Locale.ENGLISH, "Id per second: %.2f", (1000d * totalCount / elapsed)));
System.out.println("False positive count: " + falsePositives);
System.out.println("All: " + totalCount);
// NOTE(review): correctlyFound/correctAmount come from the single matching set only.
System.out.println(String.format(Locale.ENGLISH, "Precision:%.2f ", (100d * correctlyFound / correctAmount)));
System.out.println(String.format(Locale.ENGLISH, "Recall: %.2f", (100d * (totalCount - falsePositives) / totalCount)));
}
}
Aggregations