Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
Class CompressedCharNgramModel, method compress().
public static void compress(MapBasedCharNgramLanguageModel model, File output) throws IOException {
  Mphf[] mphfs = new MultiLevelMphf[model.getOrder() + 1];
  DoubleLookup[] lookups = new DoubleLookup[model.getOrder() + 1];
  try (DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(output)))) {
    dos.writeInt(model.getOrder());
    dos.writeUTF(model.getId());
    for (int i = 1; i <= model.getOrder(); i++) {
      // Collect the distinct log-probability values of order-i grams.
      Histogram<Double> histogram = new Histogram<>();
      histogram.add(model.gramLogProbs[i].values.values());
      double[] lookup = new double[histogram.size()];
      int j = 0;
      for (Double key : histogram) {
        lookup[j] = key;
        j++;
      }
      // Quantize the distinct values to 8 bits.
      Quantizer quantizer = BinningQuantizer.linearBinning(lookup, 8);
      lookups[i] = quantizer.getDequantizer();
      List<String> keys = Lists.newArrayList(model.gramLogProbs[i].values.keySet());
      int[] fingerprints = new int[keys.size()];
      int[] probabilityIndexes = new int[keys.size()];
      // Build a minimal perfect hash over the gram keys; store a fingerprint and a
      // quantized probability index per key.
      mphfs[i] = MultiLevelMphf.generate(new StringListKeyProvider(keys));
      for (final String key : keys) {
        final int index = mphfs[i].get(key);
        fingerprints[index] = MultiLevelMphf.hash(key, -1) & FINGER_PRINT_MASK;
        probabilityIndexes[index] = quantizer.getQuantizationIndex(model.gramLogProbs[i].values.get(key));
      }
      lookups[i].save(dos);
      dos.writeInt(keys.size());
      for (int k = 0; k < keys.size(); k++) {
        dos.writeShort(fingerprints[k] & 0xffff);
        dos.writeByte(probabilityIndexes[k]);
      }
      mphfs[i].serialize(dos);
    }
  }
}
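In this method the Histogram is used as a set of distinct double values: adding a collection de-duplicates the log probabilities, and iterating the histogram lays them out in an array that the quantizer can bin. Below is a minimal standalone sketch of just that pattern. It relies only on the Histogram calls already exercised above (add with a collection, iteration, size()); the class name, variable names and sample values are illustrative, not from the project.

import java.util.Arrays;
import zemberek.core.collections.Histogram;

public class DistinctValueSketch {
  public static void main(String[] args) {
    Histogram<Double> histogram = new Histogram<>();
    // Adding a collection, as in compress(); duplicates only raise counts,
    // so iteration visits each distinct value once.
    histogram.add(Arrays.asList(-1.5, -0.75, -1.5, -2.25, -0.75));
    double[] lookup = new double[histogram.size()]; // one slot per distinct value
    int j = 0;
    for (Double value : histogram) {
      lookup[j++] = value;
    }
    System.out.println(Arrays.toString(lookup)); // three distinct values
  }
}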
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
Class ConfusionTest, method testContains().
public void testContains() throws IOException {
  int sliceLength = 1000;
  int maxSliceCount = 1000;
  List<TestSet> sets = allSets(maxSliceCount, sliceLength);
  Set<String> languages = identifier.getLanguages();
  for (String language : languages) {
    System.out.println(language);
    Stopwatch sw = Stopwatch.createStarted();
    int falsePositives = 0;
    int totalCount = 0;
    int correctlyFound = 0;
    int correctAmount = 0;
    for (TestSet set : sets) {
      /* if (!set.modelId.equals("tr"))
           continue; */
      totalCount += set.size();
      Histogram<String> result = new Histogram<>();
      for (String s : set.testPieces) {
        /*
        LanguageIdentifier.IdResult idResult = identifier.identifyFullConf(s);
        result.add(idResult.id);
        */
        // String t = identifier.identify(s, 100);
        // String t = identifier.identify(s);
        String t = "tr";
        identifier.containsLanguage(s, "tr", 100, -1);
        if (set.modelId.equals(language) && !t.equals(language)) {
          /* if (identifier.containsLanguage(s, "tr", 100, -1))
               System.out.println("Has tr slice!");
             System.out.println(t + " " + s); */
        }
        result.add(t);
        // result.add(identifier.identifyWithSampling(s, sliceLength));
        // result.add(identifier.identifyWithSampling(s, 4));
      }
      if (set.modelId.equals(language)) {
        System.out.println("Lang test size:" + set.size());
        correctlyFound = result.getCount(language);
        correctAmount = set.size();
        List<String> sorted = result.getSortedList();
        for (String s : sorted) {
          System.out.println(s + " : " + result.getCount(s));
        }
        continue;
      } else {
        int fpcount = result.getCount(language);
        if (fpcount > 0) {
          System.out.println(set.modelId + " " + fpcount);
        }
      }
      falsePositives += result.getCount(language);
    }
    double elapsed = sw.elapsed(TimeUnit.MILLISECONDS);
    System.out.println(String.format(Locale.ENGLISH, "Id per second: %.2f", (1000d * totalCount / elapsed)));
    System.out.println("False positive count: " + falsePositives);
    System.out.println("All: " + totalCount);
    System.out.println(String.format(Locale.ENGLISH, "Precision:%.2f ", (100d * correctlyFound / correctAmount)));
    System.out.println(String.format(Locale.ENGLISH, "Recall: %.2f", (100d * (totalCount - falsePositives) / totalCount)));
  }
}
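Here the Histogram acts as a per-language prediction counter: every predicted id is add()-ed, getCount() reads a single tally back, and getSortedList() walks the labels by frequency. A minimal standalone sketch of that counting pattern follows; the class name and label values are invented for illustration, and only Histogram calls already used in the test appear.

import zemberek.core.collections.Histogram;

public class LabelCountSketch {
  public static void main(String[] args) {
    Histogram<String> result = new Histogram<>();
    for (String predicted : new String[]{"tr", "tr", "en", "tr", "az"}) {
      result.add(predicted); // one increment per prediction
    }
    System.out.println("tr count: " + result.getCount("tr")); // 3
    for (String label : result.getSortedList()) { // labels sorted by frequency
      System.out.println(label + " : " + result.getCount(label));
    }
  }
}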
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
Class SpellCheckerPerformanceTests, method correctWordFindingTest().
@Test
@Ignore(value = "Not a test.")
public void correctWordFindingTest() throws Exception {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology);
  TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
  TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
  Path path = new File(Resources.getResource("spell-checker-test.txt").getFile()).toPath();
  List<String> lines = Files.readAllLines(path);
  List<String> sentences = extractor.fromParagraphs(lines);
  Stopwatch sw = Stopwatch.createStarted();
  Histogram<String> incorrectFound = new Histogram<>();
  Histogram<String> correctFound = new Histogram<>();
  for (String sentence : sentences) {
    List<Token> tokens = tokenizer.tokenize(sentence);
    for (Token token : tokens) {
      String text = token.getText();
      if (!spellChecker.check(text)) {
        incorrectFound.add(text);
      } else {
        correctFound.add(text);
      }
    }
  }
  Log.info("Elapsed = %d", sw.elapsed(TimeUnit.MILLISECONDS));
  Log.info("Incorrect (total/unique) = %d / %d", incorrectFound.totalCount(), incorrectFound.size());
  Log.info("Correct (total/unique) = %d / %d", correctFound.totalCount(), correctFound.size());
  incorrectFound.saveSortedByCounts(Paths.get("incorrect.txt"), " : ");
  correctFound.saveSortedByCounts(Paths.get("correct.txt"), " : ");
  /*
  Path lmPath = Paths.get(ClassLoader.getSystemResource("lm-bigram.slm").toURI());
  SmoothLm model = SmoothLm.builder(lmPath.toFile()).build();
  */
}
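The two histograms above distinguish total occurrences (totalCount()) from unique tokens (size()), and saveSortedByCounts() dumps "token : count" lines ordered by count into a file. A small standalone sketch of that reporting step; the class name, sample tokens, and output file name are placeholders, and only Histogram calls already used in the test are relied on.

import java.io.IOException;
import java.nio.file.Paths;
import zemberek.core.collections.Histogram;

public class SpellReportSketch {
  public static void main(String[] args) throws IOException {
    Histogram<String> incorrectFound = new Histogram<>();
    for (String token : new String[]{"yalnış", "yalnış", "herkez"}) {
      incorrectFound.add(token);
    }
    // 3 total occurrences, 2 unique tokens.
    System.out.println(incorrectFound.totalCount() + " / " + incorrectFound.size());
    // Writes one "token : count" line per key, ordered by count.
    incorrectFound.saveSortedByCounts(Paths.get("incorrect-sample.txt"), " : ");
  }
}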
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
Class AmbiguityStats, method ambiguousWordStats().
public void ambiguousWordStats(String filename) throws IOException {
  List<String> lines = readAll(filename);
  Histogram<String> uniques = new Histogram<>(1000000);
  int total = 0;
  Splitter splitter = Splitter.on(" ").omitEmptyStrings().trimResults();
  for (String line : lines) {
    for (String s : splitter.split(line)) {
      List<WordAnalysis> results = parser.getWordAnalyzer().analyze(TurkishAlphabet.INSTANCE.normalize(s));
      total++;
      if (total % 50000 == 0) {
        System.out.println("Processed: " + total);
      }
      if (results.size() > 1) {
        uniques.add(s);
      }
    }
  }
  System.out.println("Total: " + total);
  Stats st = new Stats(0.002);
  st.allCounts = (int) uniques.totalCount();
  st.allUniques = uniques.size();
  for (String s : uniques.getSortedList()) {
    int count = uniques.getCount(s);
    if (st.overCutoff(count)) {
      String p1 = percentStr3(count, st.allCounts);
      st.significantCounts += count;
      st.significantUniques++;
      System.out.println(s + " : " + count + " " + pp(p1));
    }
  }
  st.dump();
}
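In this method the Histogram is pre-sized for a large vocabulary and then traversed in frequency order, keeping only entries above a relative cutoff. Below is a standalone sketch of that traversal with the percentage computed inline; the class name, cutoff value, and sample words are invented, and the project's Stats and percentStr3 helpers are deliberately not used.

import zemberek.core.collections.Histogram;

public class CutoffReportSketch {
  public static void main(String[] args) {
    // Initial capacity hint, as in ambiguousWordStats (1000000 there).
    Histogram<String> uniques = new Histogram<>(1000);
    for (String word : new String[]{"yüzü", "yüzü", "yüzü", "dolu", "eski"}) {
      uniques.add(word);
    }
    long all = uniques.totalCount();
    double cutoff = 0.2; // keep entries covering at least 20% of all counts; illustrative only
    for (String word : uniques.getSortedList()) {
      int count = uniques.getCount(word);
      if ((double) count / all >= cutoff) {
        System.out.println(String.format("%s : %d (%.1f%%)", word, count, 100d * count / all));
      }
    }
  }
}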
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
Class AmbiguityStats, method noParse().
public void noParse(String... filename) throws IOException {
  Histogram<String> uniques = new Histogram<>(1000000);
  int total = 0;
  for (String file : filename) {
    List<String> lines = readAll(file);
    Splitter splitter = Splitter.on(" ").omitEmptyStrings().trimResults();
    for (String line : lines) {
      for (String s : splitter.split(line)) {
        List<WordAnalysis> results = parser.getWordAnalyzer().analyze(TurkishAlphabet.INSTANCE.normalize(s));
        total++;
        if (total % 50000 == 0) {
          System.out.println("Processed: " + total);
        }
        if (results.size() == 0) {
          uniques.add(s);
        }
      }
    }
    System.out.println("Total: " + total);
  }
  Stats st = new Stats(0.0002);
  st.allCounts = (int) uniques.totalCount();
  st.allUniques = uniques.size();
  for (String s : uniques.getSortedList()) {
    int count = uniques.getCount(s);
    if (count > 5) {
      st.significantCounts += count;
      st.significantUniques++;
      System.out.println(s + " : " + count);
    }
  }
  st.dump();
}