Use of zemberek.lm.compression.SmoothLm in project zemberek-nlp by ahmetaa.
From the class ConvertToSmoothLmTest, method testConversion.
@Test
public void testConversion() throws IOException {
  File arpaFile = getTinyArpaFile();
  File sm = new File(System.currentTimeMillis() + "-test-lm.smooth");
  sm.deleteOnExit();
  // ConvertToSmoothLm takes explicit -arpaFile / -smoothFile flags.
  new ConvertToSmoothLm().execute(
      "-arpaFile", arpaFile.getAbsolutePath(),
      "-smoothFile", sm.getAbsolutePath(),
      "-spaceUsage", "16-16-16");
  Assert.assertTrue(sm.exists());
  SmoothLm lm = SmoothLm.builder(sm).build();
  Assert.assertEquals(3, lm.getOrder());
}
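Once the smooth file is built, the model can be queried directly. A minimal sketch, reusing the lm variable from the test above and only SmoothLm calls that appear elsewhere on this page (getVocabulary().indexOf, ngramExists, getProbability); the token strings are placeholders, since the contents of the tiny ARPA file are not shown here:

// Placeholder tokens; the tiny ARPA vocabulary may contain different words.
int w1 = lm.getVocabulary().indexOf("bir");
int w2 = lm.getVocabulary().indexOf("kaç");
// Score the bigram only if the compressed model actually contains it.
if (lm.ngramExists(w1, w2)) {
  double logProb = lm.getProbability(w1, w2);
  System.out.println("log probability = " + logProb);
}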
Use of zemberek.lm.compression.SmoothLm in project zemberek-nlp by ahmetaa.
From the class CompressLmTest, method testConversion.
@Test
public void testConversion() throws IOException {
  File arpaFile = getTinyArpaFile();
  File sm = new File(System.currentTimeMillis() + "-test-lm.smooth");
  sm.deleteOnExit();
  new CompressLm().execute(
      "-in", arpaFile.getAbsolutePath(),
      "-out", sm.getAbsolutePath(),
      "-spaceUsage", "16-16-16");
  Assert.assertTrue(sm.exists());
  SmoothLm lm = SmoothLm.builder(sm).build();
  Assert.assertEquals(3, lm.getOrder());
}
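Both tests build a trigram model (getOrder() returns 3), so a natural next step is scoring a token sequence as a sum of log probabilities over sliding trigram windows. A minimal sketch under that assumption, reusing the sm file from the test above; the sentence is a placeholder, and getProbability is assumed to accept an index array through the same varargs signature used in the two-argument calls seen below:

SmoothLm lm = SmoothLm.builder(sm).build();
int order = lm.getOrder(); // 3 for these test models
String[] tokens = {"<s>", "bugün", "hava", "güzel", "</s>"}; // placeholder sentence
double total = 0;
for (int i = 0; i + order <= tokens.length; i++) {
  int[] ids = new int[order];
  for (int j = 0; j < order; j++) {
    ids[j] = lm.getVocabulary().indexOf(tokens[i + j]);
  }
  // Back-off is handled inside getProbability when the full trigram is unseen.
  total += lm.getProbability(ids);
}
System.out.println("sentence log probability = " + total);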
Use of zemberek.lm.compression.SmoothLm in project zemberek-nlp by ahmetaa.
From the class NormalizationScripts, method splitWords.
static void splitWords(Path noisyVocab, Path cleanVocab, Path splitFile, Path lmPath,
    Path asciiMapPath, TurkishMorphology morphology, int minWordCount) throws IOException {

  // Collect the keys of the ascii map file (lines are formatted as "key=value").
  Set<String> asciiMapKeys = Files.readAllLines(asciiMapPath).stream()
      .map(s -> s.substring(0, s.indexOf('=')))
      .collect(Collectors.toSet());

  // Load the language model with natural log base, so scores are in ln space.
  SmoothLm lm = SmoothLm.builder(lmPath).logBase(Math.E).build();
  Log.info("Language model = %s", lm.info());

  // Merge the "incorrect" word histograms from the noisy and clean vocabularies.
  Histogram<String> wordFreq = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
  wordFreq.add(Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' '));
  Log.info("%d words loaded.", wordFreq.size());

  wordFreq.removeSmaller(minWordCount);
  if (minWordCount > 1) {
    Log.info("%d words left after removing counts less than %d.", wordFreq.size(), minWordCount);
  }

  int unkIndex = lm.getVocabulary().getUnknownWordIndex();

  try (PrintWriter pw = new PrintWriter(splitFile.toFile(), "utf-8");
      PrintWriter pwFreq =
          new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
    for (String word : wordFreq.getSortedList()) {
      // Skip words already covered by the ascii map, too short, or hyphenated.
      if (asciiMapKeys.contains(word)) {
        continue;
      }
      if (word.length() < 5 || word.contains("-")) {
        continue;
      }
      // Try every split point and score each (head, tail) pair with the language model.
      List<ScoredItem<String>> k = new ArrayList<>();
      for (int i = 1; i < word.length() - 1; i++) {
        String head = word.substring(0, i);
        String tail = word.substring(i);
        // noSplitTails is a static set defined elsewhere in the class.
        if (noSplitTails.contains(tail)) {
          continue;
        }
        int hi = lm.getVocabulary().indexOf(head);
        int ti = lm.getVocabulary().indexOf(tail);
        if (hi == unkIndex || ti == unkIndex) {
          continue;
        }
        // Do not split a "de/da" clitic off a head that is already a valid word.
        if ((tail.equals("de") || tail.equals("da")) && morphology.analyze(head).isCorrect()) {
          continue;
        }
        if (lm.ngramExists(hi, ti)) {
          k.add(new ScoredItem<>(head + " " + tail, lm.getProbability(hi, ti)));
        }
      }
      // Pick the highest-scoring split and keep it only if it clears the threshold
      // (ln-space, since the model was built with logBase(Math.E)).
      if (k.size() > 1) {
        k.sort((a, b) -> Double.compare(b.score, a.score));
      }
      if (k.size() > 0) {
        ScoredItem<String> best = k.get(0);
        if (best.score > -7) {
          pw.println(word + " = " + best.item);
          pwFreq.println(word + " = " + best.item + " " + wordFreq.getCount(word));
        }
      }
    }
  }
}
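A hypothetical invocation, for orientation only: every path and the count threshold below are placeholders, and TurkishMorphology.createWithDefaults() is assumed to be the usual zemberek-morphology entry point:

TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
splitWords(
    Paths.get("vocab-noisy"),   // placeholder; must contain an "incorrect" histogram file
    Paths.get("vocab-clean"),   // placeholder; must contain an "incorrect" histogram file
    Paths.get("splits.txt"),    // output; a second file named "splits.txtfreq" is also written
    Paths.get("lm.smooth"),     // compressed model, e.g. produced by CompressLm above
    Paths.get("ascii-map.txt"), // placeholder "key=value" map file
    morphology,
    3);                         // drop words seen fewer than 3 times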