Use of zemberek.core.SpaceTabTokenizer in project zemberek-nlp by ahmetaa.
From the class GenerateVocabulary, method run():
@Override
protected void run() throws Exception {
  if (!corpus.exists()) {
    throw new IllegalArgumentException("Can not find the corpus file: " + corpus);
  }
  if (top < -1 || top == 0) {
    throw new IllegalArgumentException("Illegal value for n: " + top);
  }
  Set<String> wordsToInclude = getWordsFromFile(includeFile);
  Log.info("Amount of words to include using include file: %d", wordsToInclude.size());
  Set<String> wordsToExclude = getWordsFromFile(excludeFile);
  Log.info("Amount of words to exclude using exclude file: %d", wordsToExclude.size());
  Set<String> intersection = Sets.newHashSet(wordsToExclude);
  intersection.retainAll(wordsToInclude);
  if (intersection.size() != 0) {
    Log.warn("There are matching words in both include and exclude files: " + intersection.toString());
  }
  Collator collator = Collator.getInstance(Locale.ENGLISH);
  if (sortLocale != null) {
    collator = Collator.getInstance(new Locale(sortLocale));
  }
  Log.info("Processing corpus: %s", corpus);
  try (BufferedReader reader = new BufferedReader(
      new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
    String line;
    Histogram<String> histogram = new Histogram<>(50000);
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    int count = 0;
    while ((line = reader.readLine()) != null) {
      List<String> words = Lists.newArrayList(tokenizer.split(line));
      if (words.isEmpty()) {
        continue;
      }
      histogram.add(words);
      if (count % 500000 == 0 && count != 0) {
        Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
      }
      count++;
    }
    Log.info("A total of %d lines have been processed. Vocabulary Size: %d", count, histogram.size());
    if (top >= histogram.size()) {
      top = histogram.size();
    } else {
      Log.info("Top %d words will be used.", top);
    }
    List<String> mostFrequent = histogram.getTop(top);
    Log.info("Coverage: %.3f",
        100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
    LinkedHashSet<String> resultSet = Sets.newLinkedHashSet(mostFrequent);
    resultSet.addAll(wordsToInclude);
    resultSet.removeAll(wordsToExclude);
    List<String> result = Lists.newArrayList(resultSet);
    Log.info("Total size of vocabulary: %d", result.size());
    if (ordered) {
      Log.info("Sorting file with word order.");
      Collections.sort(result, collator);
    }
    com.google.common.io.Files.createParentDirs(outFile);
    Log.info("Saving to vocabulary file: %s", outFile);
    SimpleTextWriter.utf8Builder(outFile).addNewLineBeforClose().build().writeLines(result);
    Log.info("Done.");
  }
}
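All the examples on this page rely on the same small tokenizer API: split(String) returns a String[] of tokens separated by spaces and tabs, and splitToList(String) returns the same tokens as a mutable List. A minimal, self-contained sketch of the two calls (the sample input and expected outputs are illustrative only):

import java.util.List;
import zemberek.core.SpaceTabTokenizer;

public class TokenizerSketch {

  public static void main(String[] args) {
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    // split returns an array; splitToList returns a mutable list,
    // which is why readFromFile below can append EOS to it.
    String[] tokens = tokenizer.split("bir iki\tüç");
    List<String> tokenList = tokenizer.splitToList("bir iki\tüç");
    System.out.println(tokens.length);    // expected: 3
    System.out.println(tokenList.get(2)); // expected: üç
  }
}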
Use of zemberek.core.SpaceTabTokenizer in project zemberek-nlp by ahmetaa.
From the class Dictionary, method readFromFile():
static Dictionary readFromFile(Path file, final Args args) throws IOException {
  Log.info("Initialize dictionary and histograms.");
  Dictionary dictionary = new Dictionary(args);
  Log.info("Loading text.");
  BlockTextLoader loader = new BlockTextLoader(file, 100_000);
  SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
  int blockCounter = 1;
  for (List<String> lines : loader) {
    for (String line : lines) {
      List<String> split = tokenizer.splitToList(line);
      split.add(EOS);
      for (String word : split) {
        // Skip tokens starting with '#'.
        if (word.startsWith("#")) {
          continue;
        }
        dictionary.addWithCount(word, 1);
      }
    }
    Log.info("Lines read: %d (thousands) ", blockCounter * 100);
    blockCounter++;
  }
  Log.info("Word + Label count = %d", dictionary.words_.size());
  Log.info("Removing word and labels with small counts. Min word = %d, Min Label = %d",
      args.minCount, args.minCountLabel);
  // now we have the histograms. Remove based on count.
  // Sort by type first, then by descending count within each type.
  dictionary.words_.sort((e1, e2) -> {
    if (e1.type != e2.type) {
      return Integer.compare(e1.type, e2.type);
    } else {
      return Long.compare(e2.count, e1.count);
    }
  });
  LinkedHashSet<Entry> all = new LinkedHashSet<>(dictionary.words_);
  List<Entry> toRemove = dictionary.words_.stream()
      .filter(s -> (s.type == TYPE_WORD && s.count < args.minCount
          || s.type == TYPE_LABEL && s.count < args.minCountLabel))
      .collect(Collectors.toList());
  all.removeAll(toRemove);
  dictionary.words_ = new ArrayList<>(all);
  dictionary.size_ = 0;
  dictionary.nwords_ = 0;
  dictionary.nlabels_ = 0;
  // Rebuild the word-to-id mapping after pruning.
  Arrays.fill(dictionary.word2int_, -1);
  for (Entry e : dictionary.words_) {
    int i = dictionary.find(e.word);
    dictionary.word2int_[i] = dictionary.size_++;
    if (e.type == TYPE_WORD) {
      dictionary.nwords_++;
    }
    if (e.type == TYPE_LABEL) {
      dictionary.nlabels_++;
    }
  }
  Log.info("Word count = %d , Label count = %d", dictionary.nwords(), dictionary.nlabels());
  dictionary.initTableDiscard();
  Log.info("Adding character n-grams for words.");
  dictionary.initNgrams();
  Log.info("Done.");
  return dictionary;
}
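Stripped of the Dictionary internals, the loading loop above follows a simple pattern: iterate the corpus in 100,000-line blocks and feed each tokenized line into a counter. A hedged sketch of just that pattern, using Histogram (as in the GenerateVocabulary example) in place of the dictionary's internal counts; the corpus path is hypothetical and import paths may differ between zemberek versions:

import java.nio.file.Paths;
import java.util.List;
import zemberek.core.SpaceTabTokenizer;
import zemberek.core.collections.Histogram;
import zemberek.core.text.BlockTextLoader;

public class BlockCountSketch {

  public static void main(String[] args) {
    // Hypothetical corpus path; the loader yields blocks of up to 100_000 lines.
    BlockTextLoader loader = new BlockTextLoader(Paths.get("corpus.txt"), 100_000);
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    Histogram<String> counts = new Histogram<>(50_000);
    for (List<String> block : loader) {
      for (String line : block) {
        counts.add(tokenizer.splitToList(line));
      }
    }
    System.out.println("Distinct tokens: " + counts.size());
  }
}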
Use of zemberek.core.SpaceTabTokenizer in project zemberek-nlp by ahmetaa.
From the class SmoothLmTest, method testActualData():
@Test
@Ignore
public void testActualData() throws IOException {
  Stopwatch sw = Stopwatch.createStarted();
  File lmFile = new File("/home/ahmetaa/data/lm/smoothnlp-test/lm1.slm");
  File tmp = new File("/tmp");
  if (!lmFile.exists()) {
    final File arpaFile = new File("/home/ahmetaa/data/lm/smoothnlp-test/lm1.arpa");
    UncompressedToSmoothLmConverter converter = new UncompressedToSmoothLmConverter(lmFile, tmp);
    converter.convertLarge(
        MultiFileUncompressedLm.generate(arpaFile, tmp, "utf-8", 4).dir,
        new UncompressedToSmoothLmConverter.NgramDataBlock(2, 1, 1), 20);
  }
  SmoothLm lm = SmoothLm.builder(lmFile).build();
  System.out.println(sw.elapsed(TimeUnit.MILLISECONDS));
  sw.reset();
  final int order = 3;
  final int gramCount = 1000000;
  int[][] ids = new int[gramCount][order];
  long[] trigrams = new long[gramCount];
  LineIterator li = SimpleTextReader.trimmingUTF8LineIterator(
      new File("/home/ahmetaa/data/lm/smoothnlp-test/corpus-lowercase_1000000_2000000"));
  SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
  int i = 0;
  while (i < gramCount) {
    String line = li.next();
    String[] tokens = tokenizer.split(line);
    if (tokens.length < order) {
      continue;
    }
    // Collect order-grams from the line, skipping those with out-of-vocabulary words.
    for (int j = 0; j < tokens.length - order - 1; j++) {
      String[] words = new String[order];
      System.arraycopy(tokens, j, words, 0, order);
      int[] indexes = lm.getVocabulary().toIndexes(words);
      if (!lm.getVocabulary().containsAll(indexes)) {
        continue;
      }
      ids[i] = indexes;
      if (order == 3) {
        trigrams[i] = lm.getVocabulary().encodeTrigram(indexes);
      }
      i++;
      if (i == gramCount) {
        break;
      }
    }
  }
  sw.start();
  double tr = 0;
  for (int[] id : ids) {
    tr += lm.getProbability(id);
  }
  System.out.println(sw.elapsed(TimeUnit.MILLISECONDS));
  System.out.println("tr = " + tr);
}
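The measurement loop above boils down to three vocabulary and model calls per n-gram: toIndexes to map words to ids, containsAll to reject n-grams with unknown words, and getProbability to score. A minimal sketch of that sequence for a single trigram; the model path is hypothetical, and the SmoothLm package name is an assumption based on the current zemberek-nlp layout:

import java.io.File;
import zemberek.core.SpaceTabTokenizer;
import zemberek.lm.compression.SmoothLm;

public class ScoreOneTrigram {

  public static void main(String[] args) throws Exception {
    // Hypothetical model file; built exactly as in the test above.
    SmoothLm lm = SmoothLm.builder(new File("lm.slm")).build();
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    String[] words = tokenizer.split("bir iki üç");
    int[] indexes = lm.getVocabulary().toIndexes(words);
    // Skip n-grams that contain out-of-vocabulary words, as the test does.
    if (lm.getVocabulary().containsAll(indexes)) {
      System.out.println("log probability = " + lm.getProbability(indexes));
    }
  }
}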
Use of zemberek.core.SpaceTabTokenizer in project zemberek-nlp by ahmetaa.
From the class Dictionary, method readFromFile() (a second variant of the method shown earlier):
static Dictionary readFromFile(Path file, final Args args) {
  Log.info("Initialize dictionary and histograms.");
  Dictionary dictionary = new Dictionary(args);
  Log.info("Loading text.");
  BlockTextLoader loader = BlockTextLoader.fromPath(file, 100_000);
  SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
  int blockCounter = 1;
  for (TextChunk lines : loader) {
    for (String line : lines) {
      List<String> split = tokenizer.splitToList(line);
      split.add(EOS);
      for (String word : split) {
        // Skip tokens starting with '#'.
        if (word.startsWith("#")) {
          continue;
        }
        dictionary.add(word);
      }
    }
    Log.info("Lines read: %d (thousands) ", blockCounter * 100);
    blockCounter++;
  }
  Log.info("Word + Label count = %d", dictionary.words_.size());
  Log.info("Removing word and labels with small counts. Min word = %d, Min Label = %d",
      args.minCount, args.minCountLabel);
  // now we have the histograms. Remove based on count.
  // Sort by type first, then by descending count within each type.
  dictionary.words_.sort((e1, e2) -> {
    if (e1.type != e2.type) {
      return Integer.compare(e1.type, e2.type);
    } else {
      return Long.compare(e2.count, e1.count);
    }
  });
  // TODO: add threshold method.
  LinkedHashSet<Entry> all = new LinkedHashSet<>(dictionary.words_);
  List<Entry> toRemove = dictionary.words_.stream()
      .filter(s -> (s.type == TYPE_WORD && s.count < args.minCount
          || s.type == TYPE_LABEL && s.count < args.minCountLabel))
      .collect(Collectors.toList());
  all.removeAll(toRemove);
  dictionary.words_ = new ArrayList<>(all);
  dictionary.size_ = 0;
  dictionary.nwords_ = 0;
  dictionary.nlabels_ = 0;
  // Rebuild the word-to-id mapping after pruning.
  Arrays.fill(dictionary.word2int_, -1);
  for (Entry e : dictionary.words_) {
    int i = dictionary.find(e.word);
    dictionary.word2int_[i] = dictionary.size_++;
    if (e.type == TYPE_WORD) {
      dictionary.nwords_++;
    }
    if (e.type == TYPE_LABEL) {
      dictionary.nlabels_++;
    }
  }
  Log.info("Word count = %d , Label count = %d", dictionary.nwords(), dictionary.nlabels());
  dictionary.initTableDiscard();
  dictionary.initNgrams();
  return dictionary;
}
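Compared with the earlier readFromFile, this variant swaps the BlockTextLoader constructor for the static fromPath factory, iterates TextChunk blocks instead of List<String>, and calls dictionary.add(word) rather than addWithCount(word, 1); the pruning and re-indexing logic is unchanged. A short sketch of the newer iteration style in isolation (corpus path hypothetical, import paths version-dependent):

import java.nio.file.Paths;
import zemberek.core.SpaceTabTokenizer;
import zemberek.core.text.BlockTextLoader;
import zemberek.core.text.TextChunk;

public class ChunkIterationSketch {

  public static void main(String[] args) {
    // Hypothetical corpus path; fromPath is the factory used above.
    BlockTextLoader loader = BlockTextLoader.fromPath(Paths.get("corpus.txt"), 100_000);
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    long tokenCount = 0;
    for (TextChunk chunk : loader) { // each chunk is an iterable block of lines
      for (String line : chunk) {
        tokenCount += tokenizer.splitToList(line).size();
      }
    }
    System.out.println("Token count: " + tokenCount);
  }
}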