Use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa: class FakeLm, method generateArpa().
public void generateArpa(File fileName) throws IOException {
  System.out.println("unigrams = " + unigrams.length);
  SimpleTextWriter sw = SimpleTextWriter.keepOpenUTF8Writer(fileName);
  /*
    \data\
    ngram 1= 4
    ngram 2= 3
    ngram 3= 2
  */
  sw.writeLine("\\data\\");
  for (int o = 1; o <= order; o++) {
    sw.writeLine("ngram " + o + "=" + o * unigramLength);
  }
  for (int o = 1; o <= order; o++) {
    FakeGram[] probs = getNgramProbs(o);
    System.out.println("Validating..");
    validate(probs);
    System.out.println("Writing " + o + " grams.");
    sw.writeLine();
    sw.writeLine("\\" + o + "-grams:\n");
    for (FakeGram prob : probs) {
      if (o < order) {
        sw.writeLine(String.format(Locale.ENGLISH, "%.4f %s %.4f", prob.prob, Joiner.on(" ").join(prob.vals), prob.backoff));
      } else {
        sw.writeLine(String.format(Locale.ENGLISH, "%.4f %s", prob.prob, Joiner.on(" ").join(prob.vals)));
      }
    }
  }
  sw.writeLine();
  sw.writeLine("\\end\\");
}
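The block comment above shows the ARPA header this method emits. As a standalone illustration, here is a minimal sketch that writes the same file skeleton with plain java.io, independent of FakeLm; the orders, counts, and log probabilities are made-up placeholder values, not output of the real class.

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Locale;

public class ArpaSkeletonDemo {
  public static void main(String[] args) throws IOException {
    File out = new File("tiny.arpa");
    try (PrintWriter pw = new PrintWriter(out, "UTF-8")) {
      // Header: one "ngram N=count" line per order, as in generateArpa.
      pw.println("\\data\\");
      pw.println("ngram 1=2");
      pw.println("ngram 2=1");
      // One section per order; all but the highest order carry a back-off weight.
      pw.println();
      pw.println("\\1-grams:");
      pw.println(String.format(Locale.ENGLISH, "%.4f %s %.4f", -1.2, "<s>", -0.3));
      pw.println(String.format(Locale.ENGLISH, "%.4f %s %.4f", -0.9, "hello", -0.2));
      pw.println();
      pw.println("\\2-grams:");
      // Highest order: no back-off column, mirroring the o < order check above.
      pw.println(String.format(Locale.ENGLISH, "%.4f %s", -0.5, "<s> hello"));
      pw.println();
      pw.println("\\end\\");
    }
  }
}

Running it produces a tiny.arpa file with the same section layout (\data\ header, per-order n-gram blocks, \end\ footer) that generateArpa writes through SimpleTextWriter.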
Use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa: class GenerateVocabulary, method run().
@Override
public void run() throws Exception {
  if (!corpus.exists()) {
    throw new IllegalArgumentException("Can not find the corpus file: " + corpus);
  }
  if (top < -1 || top == 0)
    throw new IllegalArgumentException("Illegal value for -top: " + top);
  Set<String> wordsToInclude = getWordsFromFile(includeFile);
  if (wordsToInclude.size() > 0) {
    Log.info("Amount of words to include using include file: %d", wordsToInclude.size());
  }
  Set<String> wordsToExclude = getWordsFromFile(excludeFile);
  if (wordsToExclude.size() > 0) {
    Log.info("Amount of words to exclude using exclude file: %d", wordsToExclude.size());
  }
  Set<String> intersection = Sets.newHashSet(wordsToExclude);
  intersection.retainAll(wordsToInclude);
  if (intersection.size() != 0) {
    Log.warn("There are matching words in both include and exclude files: " + intersection.toString());
  }
  Collator collator = Collator.getInstance(Locale.ENGLISH);
  if (sortLocale != null) {
    collator = Collator.getInstance(new Locale(sortLocale));
  }
  Log.info("Processing corpus: %s", corpus);
  try (BufferedReader reader = new BufferedReader(
      new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
    String line;
    Histogram<String> histogram = new Histogram<>(50000);
    int count = 0;
    while ((line = reader.readLine()) != null) {
      List<String> words = new ArrayList<>(10);
      for (String word : Splitter.on(" ").omitEmptyStrings().trimResults().split(line)) {
        if (word.length() > 30) {
          Log.warn("Too long word %s", word);
        }
        if (!countMetaWords) {
          if (word.contains("<") || word.equalsIgnoreCase(">")) {
            continue;
          }
        }
        words.add(word);
      }
      if (words.isEmpty())
        continue;
      histogram.add(words);
      if (count % 500000 == 0 && count != 0)
        Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
      count++;
    }
    Log.info("A total of %d lines have been processed. Vocabulary Size: %d", count, histogram.size());
    if (minFreq > 1) {
      histogram.removeSmaller(minFreq);
    }
    if (top >= histogram.size() || top == -1) {
      top = histogram.size();
      Log.info("All %d words will be in the vocabulary.", top);
    } else
      Log.info("Top %d words will be used in the vocabulary.", top);
    List<String> mostFrequent;
    if (top > 0) {
      mostFrequent = histogram.getTop(top);
    } else {
      mostFrequent = histogram.getSortedList();
    }
    Log.info("Coverage: %.3f", 100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
    LinkedHashSet<String> resultSet = Sets.newLinkedHashSet(mostFrequent);
    resultSet.addAll(wordsToInclude);
    resultSet.removeAll(wordsToExclude);
    List<String> result = Lists.newArrayList(resultSet);
    Log.info("Total size of vocabulary: %d", result.size());
    if (ordered) {
      Log.info("Sorting file with word order.");
      Collections.sort(result, collator);
    }
    com.google.common.io.Files.createParentDirs(outFile);
    Log.info("Saving to vocabulary file: %s", outFile);
    if (!writeFrequencies) {
      SimpleTextWriter.utf8Builder(outFile).addNewLineBeforClose().build().writeLines(result);
    } else {
      Log.info("Frequency values will be written with words.");
      try (SimpleTextWriter stw = SimpleTextWriter.keepOpenUTF8Writer(outFile)) {
        for (String s : result) {
          stw.writeLine(s + frequencyFileDelimiter + histogram.getCount(s));
        }
      }
    }
    Log.info("Done.");
  }
}
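Most of run() is option handling; the core is the word-frequency count and the top-N cut done with zemberek's Histogram. The sketch below reproduces only that core with plain java.util collections, so it can be read without the zemberek or Guava dependencies; corpus.txt and the topN value are placeholder assumptions.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class TopWordsDemo {
  public static void main(String[] args) throws IOException {
    Path corpus = Paths.get("corpus.txt"); // assumed input file, one sentence per line
    int topN = 10;                         // illustrative cut-off, like the -top option
    Map<String, Integer> counts = new HashMap<>();
    try (Stream<String> lines = Files.lines(corpus, StandardCharsets.UTF_8)) {
      lines.forEach(line -> {
        for (String word : line.trim().split("\\s+")) {
          if (!word.isEmpty()) {
            counts.merge(word, 1, Integer::sum); // histogram.add(words) in the original
          }
        }
      });
    }
    // Sort by descending frequency and keep the topN words,
    // the plain-Java counterpart of histogram.getTop(top).
    List<String> vocabulary = counts.entrySet().stream()
        .sorted(Map.Entry.<String, Integer>comparingByValue(Comparator.reverseOrder()))
        .limit(topN)
        .map(Map.Entry::getKey)
        .collect(Collectors.toList());
    vocabulary.forEach(System.out::println);
  }
}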
Use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa: class ConllTreebankReader, method generateCrossValidationSets().
public void generateCrossValidationSets(List<DependencySentence> trainingSentences, List<DependencySentence> testSentences, File directory, String name, int split) throws IOException {
  directory.mkdirs();
  int chunkSizeTest = testSentences.size() / split;
  for (int i = 0; i < split; i++) {
    SimpleTextWriter stwTest = SimpleTextWriter.keepOpenUTF8Writer(new File(directory, name + "-test-" + i + ".conll"));
    SimpleTextWriter stwTrain = SimpleTextWriter.keepOpenUTF8Writer(new File(directory, name + "-train-" + i + ".conll"));
    int start = chunkSizeTest * i;
    int end = start + chunkSizeTest;
    if (i == split - 1) {
      end = testSentences.size();
    }
    List<DependencySentence> testSet = testSentences.subList(start, end);
    for (DependencySentence dependencySentence : testSet) {
      stwTest.writeLine(dependencySentence.getAsConnlString());
      stwTest.writeLine();
    }
    LinkedHashSet<DependencySentence> trainingSet = new LinkedHashSet<>(trainingSentences);
    for (DependencySentence dependencySentence : testSet) {
      trainingSet.remove(dependencySentence);
    }
    for (DependencySentence dependencySentence : trainingSet) {
      stwTrain.writeLine(dependencySentence.getAsConnlString());
      stwTrain.writeLine();
    }
  }
}
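The fold boundaries here are plain integer arithmetic: fold i covers indices [chunkSizeTest * i, chunkSizeTest * (i + 1)) of the test sentences, and the last fold absorbs the remainder. A tiny standalone sketch of that arithmetic, using a hypothetical 10-sentence test set split into 3 folds:

public class FoldRangesDemo {
  public static void main(String[] args) {
    int testSize = 10; // hypothetical number of test sentences
    int split = 3;     // number of cross-validation folds
    int chunkSize = testSize / split;
    for (int i = 0; i < split; i++) {
      int start = chunkSize * i;
      int end = (i == split - 1) ? testSize : start + chunkSize;
      // Prints: fold 0 -> [0, 3), fold 1 -> [3, 6), fold 2 -> [6, 10)
      System.out.printf("fold %d -> [%d, %d)%n", i, start, end);
    }
  }
}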
Use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa: class ConllTreebankReader, method saveSentences().
public void saveSentences(List<DependencySentence> sentences, File file) throws IOException {
  SimpleTextWriter stw = SimpleTextWriter.keepOpenUTF8Writer(file);
  for (DependencySentence sentence : sentences) {
    stw.writeLine(sentence.getAsConnlString());
    stw.writeLine();
  }
  stw.close();
}
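Because SimpleTextWriter is closed manually here, an exception thrown by writeLine would leak the underlying stream. Since the GenerateVocabulary example above already uses the class in try-with-resources, a caller could write the same loop that way; the following is only a sketch of that variant, not the project's code.

public void saveSentences(List<DependencySentence> sentences, File file) throws IOException {
  // Same output as above; the writer is closed even if writeLine throws.
  try (SimpleTextWriter stw = SimpleTextWriter.keepOpenUTF8Writer(file)) {
    for (DependencySentence sentence : sentences) {
      stw.writeLine(sentence.getAsConnlString());
      stw.writeLine(); // blank line between sentences, as in the CoNLL format
    }
  }
}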
Use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa: class ConllTreebankReader, method dumpStats().
public void dumpStats(List<DependencySentence> sentences, File statFile) throws IOException {
  Histogram<CoarsePosTag> coarsePos = new Histogram<>();
  Histogram<PosTag> pos = new Histogram<>();
  Histogram<DependencyRelation> depRelations = new Histogram<>();
  Histogram<String> morphItems = new Histogram<>();
  for (DependencySentence sentence : sentences) {
    for (DependencyItem item : sentence.items) {
      coarsePos.add(item.coarsePosTag);
      pos.add(item.posTag);
      depRelations.add(item.depRelation);
      morphItems.add(Lists.newArrayList(Splitter.on("|").trimResults().omitEmptyStrings().split(item.feats)));
    }
  }
  SimpleTextWriter writer = SimpleTextWriter.keepOpenUTF8Writer(statFile);
  writer.writeLine("Sentence count:" + sentences.size());
  writer.writeLine("\nCoarse POS values:\n");
  for (CoarsePosTag coarsePo : coarsePos.getSortedList()) {
    writer.writeLine(coarsePo.getAsConnlValue() + " : " + coarsePos.getCount(coarsePo));
  }
  writer.writeLine("\nPOS values:\n");
  for (PosTag posTag : pos.getSortedList()) {
    writer.writeLine(posTag.getAsConnlValue() + " : " + pos.getCount(posTag));
  }
  writer.writeLine("\nDEP Rel values:\n");
  for (DependencyRelation depRel : depRelations.getSortedList()) {
    writer.writeLine(depRel.getAsConnlString() + " : " + depRelations.getCount(depRel));
  }
  for (String morphItem : morphItems.getSortedList()) {
    writer.writeLine(morphItem + " : " + morphItems.getCount(morphItem));
  }
  writer.close();
}
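The one non-obvious step above is that the FEATS column is split on "|" before counting, so each morphological feature is tallied on its own rather than as one opaque string. A minimal plain-Java equivalent of that split; the sample FEATS value is made up:

import java.util.ArrayList;
import java.util.List;

public class FeatsSplitDemo {
  public static void main(String[] args) {
    String feats = "A3sg|Pnon|Nom"; // hypothetical CoNLL FEATS value
    List<String> items = new ArrayList<>();
    for (String piece : feats.split("\\|")) {
      String trimmed = piece.trim();
      if (!trimmed.isEmpty()) {
        items.add(trimmed); // each feature is counted separately in dumpStats
      }
    }
    System.out.println(items); // prints [A3sg, Pnon, Nom]
  }
}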