Search in sources:

Example 6 with SimpleTextWriter

use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.

From the class FakeLm, the method generateArpa.

/**
 * Writes this fake language model to {@code fileName} in ARPA format:
 * a {@code \data\} header with per-order n-gram counts, one section per
 * order listing probability / tokens / back-off, and a {@code \end\} marker.
 * <p>
 * Fix: the writer is now managed with try-with-resources; the original
 * created it via {@code keepOpenUTF8Writer} and never closed it, leaking
 * the underlying file handle.
 *
 * @param fileName target ARPA file to create/overwrite.
 * @throws IOException if the file cannot be written.
 */
public void generateArpa(File fileName) throws IOException {
    System.out.println("unigrams = " + unigrams.length);
    try (SimpleTextWriter sw = SimpleTextWriter.keepOpenUTF8Writer(fileName)) {
        /*
        Expected header layout, e.g.:
            \data\
            ngram 1= 4
            ngram 2= 3
            ngram 3= 2
        */
        sw.writeLine("\\data\\");
        for (int o = 1; o <= order; o++) {
            // Fake counts: order o contributes o * unigramLength n-grams.
            sw.writeLine("ngram " + o + "=" + o * unigramLength);
        }
        for (int o = 1; o <= order; o++) {
            FakeGram[] probs = getNgramProbs(o);
            System.out.println("Validating..");
            validate(probs);
            System.out.println("Writing " + o + " grams.");
            sw.writeLine();
            // NOTE(review): the trailing "\n" adds a blank line right after the
            // section header; most ARPA readers tolerate it, but confirm it is intended.
            sw.writeLine("\\" + o + "-grams:\n");
            for (FakeGram prob : probs) {
                if (o < order) {
                    // Non-final orders also carry a back-off weight.
                    sw.writeLine(String.format(Locale.ENGLISH, "%.4f %s %.4f",
                        prob.prob, Joiner.on(" ").join(prob.vals), prob.backoff));
                } else {
                    // Highest order: probability and tokens only.
                    sw.writeLine(String.format(Locale.ENGLISH, "%.4f %s",
                        prob.prob, Joiner.on(" ").join(prob.vals)));
                }
            }
        }
        sw.writeLine();
        sw.writeLine("\\end\\");
    }
}
Also used : SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Example 7 with SimpleTextWriter

use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.

From the class GenerateVocabulary, the method run.

/**
 * Builds a word vocabulary from the corpus file and writes it to {@code outFile}.
 * Counts whitespace-separated tokens into a histogram, optionally drops rare
 * words ({@code minFreq}) and keeps only the most frequent ({@code top}),
 * applies include/exclude word lists, optionally sorts with a locale-aware
 * collator, and writes one word (optionally with its frequency) per line.
 *
 * @throws Exception if the corpus is missing, arguments are invalid, or I/O fails.
 */
@Override
public void run() throws Exception {
    // Fail fast on missing input.
    if (!corpus.exists()) {
        throw new IllegalArgumentException("Can not find the corpus file: " + corpus);
    }
    // -top must be -1 (meaning "all words") or a positive count.
    if (top < -1 || top == 0)
        throw new IllegalArgumentException("Illegal value for -top: " + top);
    Set<String> wordsToInclude = getWordsFromFile(includeFile);
    if (wordsToInclude.size() > 0) {
        Log.info("Amount of words to include using include file: %d", wordsToInclude.size());
    }
    Set<String> wordsToExclude = getWordsFromFile(excludeFile);
    if (wordsToExclude.size() > 0) {
        Log.info("Amount of words to exclude using exclude file: %d", wordsToExclude.size());
    }
    // Warn (but proceed) when a word is in both lists; exclusion effectively wins
    // below because removeAll runs after addAll.
    Set<String> intersection = Sets.newHashSet(wordsToExclude);
    intersection.retainAll(wordsToInclude);
    if (intersection.size() != 0) {
        Log.warn("There are matching words in both include and exclude files: " + intersection.toString());
    }
    // Collator for the optional sorted output; defaults to English.
    Collator collator = Collator.getInstance(Locale.ENGLISH);
    if (sortLocale != null) {
        collator = Collator.getInstance(new Locale(sortLocale));
    }
    Log.info("Processing corpus: %s", corpus);
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
        String line;
        Histogram<String> histogram = new Histogram<>(50000);
        int count = 0;
        while ((line = reader.readLine()) != null) {
            List<String> words = new ArrayList<>(10);
            for (String word : Splitter.on(" ").omitEmptyStrings().trimResults().split(line)) {
                if (word.length() > 30) {
                    Log.warn("Too long word %s", word);
                }
                if (!countMetaWords) {
                    // NOTE(review): asymmetric filter — any word *containing* '<' is
                    // skipped, but only a word exactly equal to ">" is skipped.
                    // Likely both sides were meant to use contains(); confirm intent.
                    if (word.contains("<") || word.equalsIgnoreCase(">")) {
                        continue;
                    }
                }
                words.add(word);
            }
            if (words.isEmpty())
                continue;
            histogram.add(words);
            // Progress report every 500k lines.
            if (count % 500000 == 0 && count != 0)
                Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
            count++;
        }
        Log.info("A total of %d lines have been processed. Vocabulary Size: %d", count, histogram.size());
        // Drop words rarer than -minFreq before applying -top.
        if (minFreq > 1) {
            histogram.removeSmaller(minFreq);
        }
        // Normalize -top: -1 or "more than available" means the whole vocabulary.
        if (top >= histogram.size() || top == -1) {
            top = histogram.size();
            Log.info("All %d words will be in the vocabulary.", top);
        } else
            Log.info("Top %d words will be used in the vocabulary.", top);
        List<String> mostFrequent;
        if (top > 0) {
            mostFrequent = histogram.getTop(top);
        } else {
            // top can only be 0 here if the histogram itself is empty.
            mostFrequent = histogram.getSortedList();
        }
        // Percentage of all corpus tokens covered by the selected words.
        Log.info("Coverage: %.3f", 100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
        // Apply include/exclude lists while preserving frequency order.
        LinkedHashSet<String> resultSet = Sets.newLinkedHashSet(mostFrequent);
        resultSet.addAll(wordsToInclude);
        resultSet.removeAll(wordsToExclude);
        List<String> result = Lists.newArrayList(resultSet);
        Log.info("Total size of vocabulary: %d", result.size());
        if (ordered) {
            Log.info("Sorting file with word order.");
            Collections.sort(result, collator);
        }
        com.google.common.io.Files.createParentDirs(outFile);
        Log.info("Saving to vocabulary file: %s", outFile);
        if (!writeFrequencies) {
            SimpleTextWriter.utf8Builder(outFile).addNewLineBeforClose().build().writeLines(result);
        } else {
            Log.info("Frequency values will be written with words.");
            try (SimpleTextWriter stw = SimpleTextWriter.keepOpenUTF8Writer(outFile)) {
                for (String s : result) {
                    stw.writeLine(s + frequencyFileDelimiter + histogram.getCount(s));
                }
            }
        }
        Log.info("Done.");
    }
}
Also used : Histogram(zemberek.core.collections.Histogram) InputStreamReader(java.io.InputStreamReader) FileInputStream(java.io.FileInputStream) Collator(java.text.Collator) SimpleTextWriter(zemberek.core.io.SimpleTextWriter) BufferedReader(java.io.BufferedReader)

Example 8 with SimpleTextWriter

use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.

From the class ConllTreebankReader, the method generateCrossValidationSets.

/**
 * Generates {@code split} cross-validation folds under {@code directory}.
 * For fold i it writes {@code name-test-i.conll} (a contiguous slice of the
 * test sentences; the last fold absorbs the remainder) and
 * {@code name-train-i.conll} (all training sentences minus that slice).
 * <p>
 * Fix: both writers are now closed per fold with try-with-resources; the
 * original never closed them, leaking two file handles per iteration.
 *
 * @param trainingSentences full pool of training sentences.
 * @param testSentences sentences to partition into test slices.
 * @param directory output directory (created if absent).
 * @param name file-name prefix for the generated fold files.
 * @param split number of folds to produce.
 * @throws IOException if any fold file cannot be written.
 */
public void generateCrossValidationSets(List<DependencySentence> trainingSentences, List<DependencySentence> testSentences, File directory, String name, int split) throws IOException {
    // Best effort; a failed mkdirs surfaces as an IOException from the writers below.
    directory.mkdirs();
    int chunkSizeTest = testSentences.size() / split;
    for (int i = 0; i < split; i++) {
        try (SimpleTextWriter stwTest = SimpleTextWriter.keepOpenUTF8Writer(new File(directory, name + "-test-" + i + ".conll"));
                SimpleTextWriter stwTrain = SimpleTextWriter.keepOpenUTF8Writer(new File(directory, name + "-train-" + i + ".conll"))) {
            // Fold i covers [start, end) of the test sentences; the last fold
            // takes everything left so no sentence is dropped by the division.
            int start = chunkSizeTest * i;
            int end = start + chunkSizeTest;
            if (i == split - 1) {
                end = testSentences.size();
            }
            List<DependencySentence> testSet = testSentences.subList(start, end);
            for (DependencySentence dependencySentence : testSet) {
                stwTest.writeLine(dependencySentence.getAsConnlString());
                // Blank line terminates each CoNLL sentence block.
                stwTest.writeLine();
            }
            // Training set = all training sentences minus this fold's test slice.
            LinkedHashSet<DependencySentence> trainingSet = new LinkedHashSet<>(trainingSentences);
            for (DependencySentence dependencySentence : testSet) {
                trainingSet.remove(dependencySentence);
            }
            for (DependencySentence dependencySentence : trainingSet) {
                stwTrain.writeLine(dependencySentence.getAsConnlString());
                stwTrain.writeLine();
            }
        }
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) File(java.io.File) SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Example 9 with SimpleTextWriter

use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.

From the class ConllTreebankReader, the method saveSentences.

/**
 * Writes the sentences to {@code file} in CoNLL format, one blank line
 * after each sentence block.
 * <p>
 * Fix: the writer is now closed with try-with-resources; the original only
 * called {@code close()} on the success path, leaking the handle if any
 * {@code writeLine} threw.
 *
 * @param sentences sentences to write.
 * @param file target file.
 * @throws IOException if the file cannot be written.
 */
public void saveSentences(List<DependencySentence> sentences, File file) throws IOException {
    try (SimpleTextWriter stw = SimpleTextWriter.keepOpenUTF8Writer(file)) {
        for (DependencySentence sentence : sentences) {
            stw.writeLine(sentence.getAsConnlString());
            // Blank line terminates each CoNLL sentence block.
            stw.writeLine();
        }
    }
}
Also used : SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Example 10 with SimpleTextWriter

use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.

From the class ConllTreebankReader, the method dumpStats.

/**
 * Writes frequency statistics over the sentences to {@code statFile}:
 * coarse POS tags, POS tags, dependency relations, and individual
 * morphological feature items, each sorted by frequency.
 * <p>
 * Fixes: the writer is now closed with try-with-resources (the original
 * leaked it if any {@code writeLine} threw before the final {@code close()}),
 * and the morph-item section now gets a header line like every other section.
 *
 * @param sentences sentences to collect statistics from.
 * @param statFile target file for the human-readable stats dump.
 * @throws IOException if the file cannot be written.
 */
public void dumpStats(List<DependencySentence> sentences, File statFile) throws IOException {
    Histogram<CoarsePosTag> coarsePos = new Histogram<>();
    Histogram<PosTag> pos = new Histogram<>();
    Histogram<DependencyRelation> depRelations = new Histogram<>();
    Histogram<String> morphItems = new Histogram<>();
    // Tally tags over every item of every sentence.
    for (DependencySentence sentence : sentences) {
        for (DependencyItem item : sentence.items) {
            coarsePos.add(item.coarsePosTag);
            pos.add(item.posTag);
            depRelations.add(item.depRelation);
            // feats is a '|'-separated feature string; count each feature separately.
            morphItems.add(Lists.newArrayList(Splitter.on("|").trimResults().omitEmptyStrings().split(item.feats)));
        }
    }
    try (SimpleTextWriter writer = SimpleTextWriter.keepOpenUTF8Writer(statFile)) {
        writer.writeLine("Sentence count:" + sentences.size());
        writer.writeLine("\nCoarse POS values:\n");
        for (CoarsePosTag coarsePo : coarsePos.getSortedList()) {
            writer.writeLine(coarsePo.getAsConnlValue() + " : " + coarsePos.getCount(coarsePo));
        }
        writer.writeLine("\nPOS values:\n");
        for (PosTag posTag : pos.getSortedList()) {
            writer.writeLine(posTag.getAsConnlValue() + " : " + pos.getCount(posTag));
        }
        writer.writeLine("\nDEP Rel values:\n");
        for (DependencyRelation depRel : depRelations.getSortedList()) {
            writer.writeLine(depRel.getAsConnlString() + " : " + depRelations.getCount(depRel));
        }
        // Fix: this section previously had no header, unlike the three above.
        writer.writeLine("\nMorph item values:\n");
        for (String morphItem : morphItems.getSortedList()) {
            writer.writeLine(morphItem + " : " + morphItems.getCount(morphItem));
        }
    }
}
Also used : Histogram(zemberek.core.collections.Histogram) SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Aggregations

SimpleTextWriter (zemberek.core.io.SimpleTextWriter)12 File (java.io.File)2 ArrayList (java.util.ArrayList)2 Histogram (zemberek.core.collections.Histogram)2 LineIterator (zemberek.core.io.LineIterator)2 SimpleTextReader (zemberek.core.io.SimpleTextReader)2 BufferedReader (java.io.BufferedReader)1 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1 InputStreamReader (java.io.InputStreamReader)1 Collator (java.text.Collator)1 LinkedHashSet (java.util.LinkedHashSet)1