Usage of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa:
class FastText, method predict().
/**
 * Predicts up to {@code k} labels for the given input line.
 *
 * @param line input text to classify.
 * @param k maximum number of predictions to return.
 * @param threshold minimum probability; predictions below it are dropped by the model.
 * @return label/score pairs, or an empty list when the line yields no known tokens.
 */
public List<ScoredItem<String>> predict(String line, int k, float threshold) {
  // Tokenize the input into word ids and label ids via the dictionary.
  IntVector tokenIds = new IntVector();
  IntVector labelIds = new IntVector();
  dict_.getLine(line, tokenIds, labelIds);
  // Nothing to score if no known tokens were extracted from the line.
  if (tokenIds.isempty()) {
    return Collections.emptyList();
  }
  List<Model.FloatIntPair> predictions = model_.predict(tokenIds.copyOf(), threshold, k);
  // Map each (score, labelIndex) pair back to its label string.
  List<ScoredItem<String>> scored = new ArrayList<>(predictions.size());
  for (Model.FloatIntPair prediction : predictions) {
    scored.add(new ScoredItem<>(dict_.getLabel(prediction.second), prediction.first));
  }
  return scored;
}
Usage of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa:
class NormalizationScripts, method splitWords().
/**
 * Scans vocabulary words for tokens that are likely two words erroneously joined together
 * and, using bigram language-model probabilities, writes "word = head tail" split suggestions
 * to {@code splitFile}; the same suggestions with frequency counts go to "&lt;splitFile&gt;freq".
 */
static void splitWords(Path noisyVocab, Path cleanVocab, Path splitFile, Path lmPath, Path asciiMapPath, TurkishMorphology morphology, int minWordCount) throws IOException {
// Keys of the ascii-map file (text before '=' on each line); such words are skipped below.
Set<String> asciiMapKeys = Files.readAllLines(asciiMapPath).stream().map(s -> s.substring(0, s.indexOf('='))).collect(Collectors.toSet());
// Natural-log base so bigram scores can be compared against the -7 cutoff further down.
SmoothLm lm = SmoothLm.builder(lmPath).logBase(Math.E).build();
Log.info("Language model = %s", lm.info());
// NOTE(review): both histograms load the "incorrect" file — for cleanVocab this may have been
// intended to be "correct"; confirm against how the vocabulary directories are produced.
Histogram<String> wordFreq = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
wordFreq.add(Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' '));
Log.info("%d words loaded.", wordFreq.size());
// Drop rare words; they carry too little evidence to justify a split.
wordFreq.removeSmaller(minWordCount);
if (minWordCount > 1) {
Log.info("%d words left after removing counts less than %d.", wordFreq.size(), minWordCount);
}
int unkIndex = lm.getVocabulary().getUnknownWordIndex();
try (PrintWriter pw = new PrintWriter(splitFile.toFile(), "utf-8");
PrintWriter pwFreq = new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
for (String word : wordFreq.getSortedList()) {
// Words covered by the ascii map are handled elsewhere.
if (asciiMapKeys.contains(word)) {
continue;
}
// Too short to split meaningfully, or already hyphenated.
if (word.length() < 5 || word.contains("-")) {
continue;
}
// Candidate splits, scored by bigram log-probability of (head, tail).
List<ScoredItem<String>> k = new ArrayList<>();
// Try every interior split point, keeping head and tail non-empty
// (tail gets at least 2 chars because i stops at length - 2).
for (int i = 1; i < word.length() - 1; i++) {
String head = word.substring(0, i);
String tail = word.substring(i);
// Blacklisted tails that should never be split off.
if (noSplitTails.contains(tail)) {
continue;
}
int hi = lm.getVocabulary().indexOf(head);
int ti = lm.getVocabulary().indexOf(tail);
// Both parts must be known to the language model.
if (hi == unkIndex || ti == unkIndex) {
continue;
}
// "de"/"da" clitics: if the head alone is a correct word, the joined form is
// probably a legitimate suffixed word, not a missing space.
if ((tail.equals("de") || tail.equals("da")) && morphology.analyze(head).isCorrect()) {
continue;
}
if (lm.ngramExists(hi, ti)) {
k.add(new ScoredItem<>(head + " " + tail, lm.getProbability(hi, ti)));
}
}
// Best candidate first (highest log-probability).
if (k.size() > 1) {
k.sort((a, b) -> Double.compare(b.score, a.score));
}
if (k.size() > 0) {
ScoredItem<String> best = k.get(0);
// -7 (natural-log) probability cutoff: weaker evidence is discarded.
if (best.score > -7) {
pw.println(word + " = " + best.item);
pwFreq.println(word + " = " + best.item + " " + wordFreq.getCount(word));
}
}
}
}
}
Usage of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa:
class ItemFindExperiment, method testGrams().
/**
 * Runs a saved FastText classifier over every line of a test file and records, per line,
 * the n-grams whose predicted probability for {@code labelTarget} exceeds 0.45.
 *
 * @param root directory containing "&lt;name&gt;.model"; "&lt;name&gt;.predictions" is written there.
 * @param test file with one input per line; the first token of each line is an id and is skipped.
 * @param labelTarget label whose hits are collected.
 * @param name base name of the model and prediction files.
 */
static void testGrams(Path root, Path test, String labelTarget, String name) throws IOException {
  Path modelPath = root.resolve(name + ".model");
  Path predictionPath = root.resolve(name + ".predictions");
  FastTextClassifier classifier = FastTextClassifier.load(modelPath);
  try (PrintWriter writer = new PrintWriter(predictionPath.toFile(), "utf-8")) {
    for (String line : Files.readAllLines(test)) {
      List<String> tokens = Splitter.on(" ").splitToList(line);
      // Grams are built from everything after the leading id token.
      List<String> grams = getGrams(tokens.subList(1, tokens.size()), 7);
      List<Hit> hits = new ArrayList<>();
      for (String gram : grams) {
        // Top-2 predictions per gram; scores are log-probabilities, hence the exp().
        for (ScoredItem<String> item : classifier.predict(gram, 2)) {
          float probability = (float) Math.exp(item.score);
          if (item.item.equals(labelTarget) && probability > 0.45) {
            hits.add(new Hit(gram, item));
          }
        }
      }
      writer.println(line);
      for (Hit hit : hits) {
        writer.println(hit);
      }
      writer.println("-----------------------");
    }
  }
}
Usage of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa:
class TurkishSpellChecker, method suggestForWord().
/**
 * Suggests corrections for {@code word}, ranked by language-model probability within the
 * given left/right context. Falls back to unranked suggestions when no model is provided,
 * and to unigram ranking when the model order is less than 2.
 *
 * @param word word to correct.
 * @param leftContext previous word, or null to use the sentence-start token.
 * @param rightContext next word, or null to use the sentence-end token.
 * @param lm language model used for ranking; may be null.
 * @return suggestions, best first.
 */
public List<String> suggestForWord(String word, String leftContext, String rightContext, NgramLanguageModel lm) {
  List<String> unRanked = getUnrankedSuggestions(word);
  if (lm == null) {
    // Fixed typo in the original message ("unraked").
    Log.warn("No language model provided. Returning unranked results.");
    return unRanked;
  }
  if (lm.getOrder() < 2) {
    Log.warn("Language model order is 1. For context ranking it should be at least 2. "
        + "Unigram ranking will be applied.");
    return suggestForWord(word, lm);
  }
  LmVocabulary vocabulary = lm.getVocabulary();
  // Context tokens are loop-invariant: resolve and normalize them once, instead of
  // re-assigning and re-normalizing the already-normalized parameters on every
  // candidate as the original loop did. (Assumes normalizeForLm is idempotent,
  // which the original relied on implicitly.)
  String left = leftContext == null ? vocabulary.getSentenceStart() : normalizeForLm(leftContext);
  String right = rightContext == null ? vocabulary.getSentenceEnd() : normalizeForLm(rightContext);
  int leftIndex = vocabulary.indexOf(left);
  int rightIndex = vocabulary.indexOf(right);
  List<ScoredItem<String>> results = new ArrayList<>(unRanked.size());
  for (String candidate : unRanked) {
    int wordIndex = vocabulary.indexOf(normalizeForLm(candidate));
    float score;
    if (lm.getOrder() == 2) {
      // Bigram model: sum of left->word and word->right log-probabilities.
      score = lm.getProbability(leftIndex, wordIndex) + lm.getProbability(wordIndex, rightIndex);
    } else {
      // Trigram (or higher) model: single left-word-right probability.
      score = lm.getProbability(leftIndex, wordIndex, rightIndex);
    }
    results.add(new ScoredItem<>(candidate, score));
  }
  results.sort(ScoredItem.STRING_COMP_DESCENDING);
  return results.stream().map(s -> s.item).collect(Collectors.toList());
}
Usage of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa:
class AutomaticLabelingExperiment, method test().
/**
 * Predicts labels for each test line with the given FastText model and writes a
 * human-readable report (document content, actual labels, predicted labels with
 * probabilities) to {@code predictionPath}.
 *
 * @param corpusPath path of the web corpus the test documents belong to.
 * @param testData file with one test instance per line; first token is "&lt;marker&gt;id".
 * @param predictionPath output report file.
 * @param fastText trained model used for prediction.
 */
private void test(Path corpusPath, Path testData, Path predictionPath, FastText fastText) throws IOException {
  WebCorpus corpus = new WebCorpus("label", "label");
  corpus.addDocuments(WebCorpus.loadDocuments(corpusPath));
  Log.info("Testing started.");
  List<String> testLines = Files.readAllLines(testData, StandardCharsets.UTF_8);
  Stopwatch watch = Stopwatch.createStarted();
  try (PrintWriter writer = new PrintWriter(predictionPath.toFile(), "utf-8")) {
    for (String line : testLines) {
      // Document id is the first token, minus its leading marker character.
      String id = line.substring(0, line.indexOf(' ')).substring(1);
      WebDocument document = corpus.getDocument(id);
      // Format the top-7 predictions as "label (probability)".
      List<String> predictedLabels = new ArrayList<>();
      for (ScoredItem<String> prediction : fastText.predict(line, 7)) {
        String label = prediction.item.replaceAll("__label__", "").replaceAll("_", " ");
        predictedLabels.add(String.format(Locale.ENGLISH, "%s (%.2f)", label, prediction.score));
      }
      writer.println("id = " + id);
      writer.println();
      writer.println(document.getContentAsString().replaceAll("[\n\r]+", "\n"));
      writer.println();
      writer.println("Actual Labels = " + String.join(", ", document.getLabels()));
      writer.println("Predictions = " + String.join(", ", predictedLabels));
      writer.println();
      writer.println("------------------------------------------------------");
      writer.println();
    }
  }
  Log.info("Done. in %d ms.", watch.elapsed(TimeUnit.MILLISECONDS));
}
Aggregations