Search in sources :

Example 1 with SpaceTabTokenizer

use of zemberek.core.SpaceTabTokenizer in project zemberek-nlp by ahmetaa.

In the class GenerateVocabulary, method run.

/**
 * Builds a vocabulary file from the tokens of {@code corpus}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Validates that the corpus file exists and that {@code top} is either -1 or positive.</li>
 *   <li>Loads the include and exclude word sets and warns when they overlap.</li>
 *   <li>Splits every corpus line with a {@link SpaceTabTokenizer} and counts the tokens in a
 *       {@link Histogram}.</li>
 *   <li>Takes the {@code top} most frequent tokens, adds the include words, removes the
 *       exclude words, optionally sorts the result with a {@link Collator}, and writes it to
 *       {@code outFile}.</li>
 * </ol>
 *
 * @throws IllegalArgumentException if the corpus file is missing or {@code top} is 0 or below -1
 * @throws Exception if reading the corpus or writing the vocabulary fails
 */
@Override
protected void run() throws Exception {
    if (!corpus.exists()) {
        throw new IllegalArgumentException("Can not find the corpus file: " + corpus);
    }
    // -1 is deliberately let through here; it is handled as "no limit" further below.
    if (top < -1 || top == 0) {
        throw new IllegalArgumentException("Illegal value for n: " + top);
    }
    Set<String> wordsToInclude = getWordsFromFile(includeFile);
    Log.info("Amount of words to include using include file: %d", wordsToInclude.size());
    Set<String> wordsToExclude = getWordsFromFile(excludeFile);
    Log.info("Amount of words to exclude using exclude file: %d", wordsToExclude.size());
    // Words present in both files end up excluded, because exclusion is applied last.
    Set<String> intersection = Sets.newHashSet(wordsToExclude);
    intersection.retainAll(wordsToInclude);
    if (intersection.size() != 0) {
        Log.warn("There are matching words in both include and exclude files: " + intersection.toString());
    }
    // English collation is the default; an explicit sortLocale overrides it.
    Collator collator = Collator.getInstance(Locale.ENGLISH);
    if (sortLocale != null) {
        collator = Collator.getInstance(new Locale(sortLocale));
    }
    Log.info("Processing corpus: %s", corpus);
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
        String line;
        Histogram<String> histogram = new Histogram<>(50000);
        SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
        // Counts only the lines that produced at least one token.
        int count = 0;
        while ((line = reader.readLine()) != null) {
            List<String> words = Lists.newArrayList(tokenizer.split(line));
            if (words.isEmpty()) {
                continue;
            }
            histogram.add(words);
            if (count % 500000 == 0 && count != 0) {
                Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
            }
            count++;
        }
        Log.info("A total of %d lines have been processed. Vocabulary Size: %d", count, histogram.size());
        // top == -1 passes the validation above, so it must not reach getTop() as a negative
        // size; it is treated as "use every word", the same as a limit larger than the vocabulary.
        // NOTE(review): -1 is assumed to mean "no limit" - confirm against the option's help text.
        if (top == -1 || top >= histogram.size()) {
            top = histogram.size();
        } else {
            Log.info("Top %d words will be used.", top);
        }
        List<String> mostFrequent = histogram.getTop(top);
        Log.info("Coverage: %.3f", 100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
        // Linked set keeps the frequency order of the histogram unless sorting is requested.
        LinkedHashSet<String> resultSet = Sets.newLinkedHashSet(mostFrequent);
        resultSet.addAll(wordsToInclude);
        resultSet.removeAll(wordsToExclude);
        List<String> result = Lists.newArrayList(resultSet);
        Log.info("Total size of vocabulary: %d", result.size());
        if (ordered) {
            Log.info("Sorting file with word order.");
            Collections.sort(result, collator);
        }
        com.google.common.io.Files.createParentDirs(outFile);
        Log.info("Saving to vocabulary file: %s", outFile);
        SimpleTextWriter.utf8Builder(outFile).addNewLineBeforClose().build().writeLines(result);
        Log.info("Done.");
    }
}
Also used : Locale(java.util.Locale) Histogram(zemberek.core.collections.Histogram) InputStreamReader(java.io.InputStreamReader) FileInputStream(java.io.FileInputStream) Collator(java.text.Collator) BufferedReader(java.io.BufferedReader) SpaceTabTokenizer(zemberek.core.SpaceTabTokenizer)

Example 2 with SpaceTabTokenizer

use of zemberek.core.SpaceTabTokenizer in project zemberek-nlp by ahmetaa.

In the class Dictionary, method readFromFile.

/**
 * Creates a {@link Dictionary} from a text file.
 *
 * <p>Every line is tokenized on spaces/tabs, an {@code EOS} token is appended, and each token
 * that does not start with {@code '#'} is added with a count of 1. Afterwards the entries are
 * sorted by type and then by descending count, entries below the configured minimum counts
 * are dropped, the lookup table is rebuilt, and the discard table and n-grams are initialized.
 *
 * @param file text file to read; presumably loaded in blocks of 100_000 lines (second
 *     constructor argument of the loader) - confirm against {@code BlockTextLoader}
 * @param args supplies {@code minCount} and {@code minCountLabel} thresholds
 * @return the populated dictionary
 * @throws IOException declared by this method; raised by the underlying text loading
 */
static Dictionary readFromFile(Path file, final Args args) throws IOException {
    Log.info("Initialize dictionary and histograms.");
    Dictionary dict = new Dictionary(args);
    Log.info("Loading text.");
    BlockTextLoader loader = new BlockTextLoader(file, 100_000);
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    int blocksRead = 0;
    for (List<String> block : loader) {
        for (String line : block) {
            List<String> tokens = tokenizer.splitToList(line);
            tokens.add(EOS);
            for (String token : tokens) {
                // Tokens beginning with '#' are never counted.
                if (!token.startsWith("#")) {
                    dict.addWithCount(token, 1);
                }
            }
        }
        blocksRead++;
        Log.info("Lines read: %d (thousands) ", blocksRead * 100);
    }
    Log.info("Word + Label count = %d", dict.words_.size());
    Log.info("Removing word and labels with small counts. Min word = %d, Min Label = %d", args.minCount, args.minCountLabel);
    // Group by type first; inside a type the most frequent entries come first.
    dict.words_.sort((a, b) -> a.type == b.type
            ? Long.compare(b.count, a.count)
            : Integer.compare(a.type, b.type));
    // Collect every entry that falls below the threshold of its own type.
    List<Entry> belowThreshold = new ArrayList<>();
    for (Entry entry : dict.words_) {
        boolean rareWord = entry.type == TYPE_WORD && entry.count < args.minCount;
        boolean rareLabel = entry.type == TYPE_LABEL && entry.count < args.minCountLabel;
        if (rareWord || rareLabel) {
            belowThreshold.add(entry);
        }
    }
    LinkedHashSet<Entry> survivors = new LinkedHashSet<>(dict.words_);
    survivors.removeAll(belowThreshold);
    dict.words_ = new ArrayList<>(survivors);
    // Rebuild the word -> index table and the counters from the surviving entries.
    Arrays.fill(dict.word2int_, -1);
    dict.size_ = 0;
    dict.nwords_ = 0;
    dict.nlabels_ = 0;
    for (Entry entry : dict.words_) {
        int slot = dict.find(entry.word);
        dict.word2int_[slot] = dict.size_++;
        if (entry.type == TYPE_WORD) {
            dict.nwords_++;
        }
        if (entry.type == TYPE_LABEL) {
            dict.nlabels_++;
        }
    }
    Log.info("Word count = %d , Label count = %d", dict.nwords(), dict.nlabels());
    dict.initTableDiscard();
    Log.info("Adding character n-grams for words.");
    dict.initNgrams();
    Log.info("Done.");
    return dict;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) DataInputStream(java.io.DataInputStream) Arrays(java.util.Arrays) IOException(java.io.IOException) Random(java.util.Random) UIntIntMap(zemberek.core.collections.UIntIntMap) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) SpaceTabTokenizer(zemberek.core.SpaceTabTokenizer) List(java.util.List) DataOutputStream(java.io.DataOutputStream) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) IntVector(zemberek.core.collections.IntVector) LinkedHashSet(java.util.LinkedHashSet) BlockTextLoader(zemberek.core.text.BlockTextLoader) BlockTextLoader(zemberek.core.text.BlockTextLoader) SpaceTabTokenizer(zemberek.core.SpaceTabTokenizer)

Example 3 with SpaceTabTokenizer

use of zemberek.core.SpaceTabTokenizer in project zemberek-nlp by ahmetaa.

In the class SmoothLmTest, method testActualData.

/**
 * Manual timing run against a language model and corpus stored at hard-coded local paths;
 * disabled with {@code @Ignore}.
 *
 * <p>If the compressed model file is missing it is first generated from the ARPA file. The
 * method then prints the elapsed milliseconds up to model load, collects up to
 * {@code gramCount} n-grams of length {@code order} from the corpus whose words are all in
 * the model vocabulary, and prints the time needed to query the probability of each one,
 * followed by the probability sum.
 *
 * @throws IOException if the model or corpus files cannot be read
 */
@Test
@Ignore
public void testActualData() throws IOException {
    Stopwatch sw = Stopwatch.createStarted();
    File lmFile = new File("/home/ahmetaa/data/lm/smoothnlp-test/lm1.slm");
    File tmp = new File("/tmp");
    if (!lmFile.exists()) {
        final File arpaFile = new File("/home/ahmetaa/data/lm/smoothnlp-test/lm1.arpa");
        UncompressedToSmoothLmConverter converter = new UncompressedToSmoothLmConverter(lmFile, tmp);
        converter.convertLarge(MultiFileUncompressedLm.generate(arpaFile, tmp, "utf-8", 4).dir, new UncompressedToSmoothLmConverter.NgramDataBlock(2, 1, 1), 20);
    }
    SmoothLm lm = SmoothLm.builder(lmFile).build();
    System.out.println(sw.elapsed(TimeUnit.MILLISECONDS));
    // The timer is reset here and only restarted right before the query loop,
    // so corpus reading below is not part of the second measurement.
    sw.reset();
    final int order = 3;
    final int gramCount = 1000000;
    int[][] ids = new int[gramCount][order];
    long[] trigrams = new long[gramCount];
    LineIterator li = SimpleTextReader.trimmingUTF8LineIterator(new File("/home/ahmetaa/data/lm/smoothnlp-test/corpus-lowercase_1000000_2000000"));
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    int i = 0;
    while (i < gramCount) {
        String line = li.next();
        String[] tokens = tokenizer.split(line);
        if (tokens.length < order) {
            continue;
        }
        // Slide a window of `order` tokens over the line. The last valid start index is
        // tokens.length - order, so a line with exactly `order` tokens yields one n-gram.
        for (int j = 0; j <= tokens.length - order; j++) {
            String[] words = new String[order];
            System.arraycopy(tokens, j, words, 0, order);
            int[] indexes = lm.getVocabulary().toIndexes(words);
            // Skip n-grams containing a word the vocabulary does not know.
            if (!lm.getVocabulary().containsAll(indexes)) {
                continue;
            }
            ids[i] = indexes;
            if (order == 3) {
                trigrams[i] = lm.getVocabulary().encodeTrigram(indexes);
            }
            i++;
            if (i == gramCount) {
                break;
            }
        }
    }
    sw.start();
    double tr = 0;
    for (int[] id : ids) {
        tr += lm.getProbability(id);
    }
    System.out.println(sw.elapsed(TimeUnit.MILLISECONDS));
    System.out.println("tr = " + tr);
}
Also used : Stopwatch(com.google.common.base.Stopwatch) LineIterator(zemberek.core.io.LineIterator) File(java.io.File) SpaceTabTokenizer(zemberek.core.SpaceTabTokenizer) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 4 with SpaceTabTokenizer

use of zemberek.core.SpaceTabTokenizer in project zemberek-nlp by ahmetaa.

In the class Dictionary, method readFromFile.

/**
 * Creates a {@link Dictionary} from a text file.
 *
 * <p>Each line of every loaded chunk is tokenized on spaces/tabs, an {@code EOS} token is
 * appended, and every token that does not start with {@code '#'} is added to the dictionary.
 * The entries are then sorted by type and descending count, entries below the configured
 * minimum counts are removed, the lookup table is rebuilt, and the discard table and n-grams
 * are initialized.
 *
 * @param file text file to read; presumably loaded in chunks of 100_000 lines (second
 *     argument of {@code BlockTextLoader.fromPath}) - confirm against that method
 * @param args supplies {@code minCount} and {@code minCountLabel} thresholds
 * @return the populated dictionary
 */
static Dictionary readFromFile(Path file, final Args args) {
    Log.info("Initialize dictionary and histograms.");
    Dictionary dict = new Dictionary(args);
    Log.info("Loading text.");
    BlockTextLoader loader = BlockTextLoader.fromPath(file, 100_000);
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    int chunksRead = 0;
    for (TextChunk chunk : loader) {
        for (String line : chunk) {
            List<String> tokens = tokenizer.splitToList(line);
            tokens.add(EOS);
            for (String token : tokens) {
                // Tokens beginning with '#' are never added.
                if (!token.startsWith("#")) {
                    dict.add(token);
                }
            }
        }
        chunksRead++;
        Log.info("Lines read: %d (thousands) ", chunksRead * 100);
    }
    Log.info("Word + Label count = %d", dict.words_.size());
    Log.info("Removing word and labels with small counts. Min word = %d, Min Label = %d", args.minCount, args.minCountLabel);
    // Group by type first; inside a type the most frequent entries come first.
    dict.words_.sort((a, b) -> a.type == b.type
            ? Long.compare(b.count, a.count)
            : Integer.compare(a.type, b.type));
    // TODO: add threshold method.
    // Collect every entry that falls below the threshold of its own type.
    List<Entry> belowThreshold = new ArrayList<>();
    for (Entry entry : dict.words_) {
        boolean rareWord = entry.type == TYPE_WORD && entry.count < args.minCount;
        boolean rareLabel = entry.type == TYPE_LABEL && entry.count < args.minCountLabel;
        if (rareWord || rareLabel) {
            belowThreshold.add(entry);
        }
    }
    LinkedHashSet<Entry> survivors = new LinkedHashSet<>(dict.words_);
    survivors.removeAll(belowThreshold);
    dict.words_ = new ArrayList<>(survivors);
    // Rebuild the word -> index table and the counters from the surviving entries.
    Arrays.fill(dict.word2int_, -1);
    dict.size_ = 0;
    dict.nwords_ = 0;
    dict.nlabels_ = 0;
    for (Entry entry : dict.words_) {
        int slot = dict.find(entry.word);
        dict.word2int_[slot] = dict.size_++;
        if (entry.type == TYPE_WORD) {
            dict.nwords_++;
        }
        if (entry.type == TYPE_LABEL) {
            dict.nlabels_++;
        }
    }
    Log.info("Word count = %d , Label count = %d", dict.nwords(), dict.nlabels());
    dict.initTableDiscard();
    dict.initNgrams();
    return dict;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) IntIntMap(zemberek.core.collections.IntIntMap) DataInputStream(java.io.DataInputStream) Arrays(java.util.Arrays) IOException(java.io.IOException) Random(java.util.Random) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) SpaceTabTokenizer(zemberek.core.SpaceTabTokenizer) List(java.util.List) DataOutputStream(java.io.DataOutputStream) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) Path(java.nio.file.Path) IntVector(zemberek.core.collections.IntVector) LinkedHashSet(java.util.LinkedHashSet) BlockTextLoader(zemberek.core.text.BlockTextLoader) BlockTextLoader(zemberek.core.text.BlockTextLoader) TextChunk(zemberek.core.text.TextChunk) SpaceTabTokenizer(zemberek.core.SpaceTabTokenizer)

Aggregations

SpaceTabTokenizer (zemberek.core.SpaceTabTokenizer): 4, DataInputStream (java.io.DataInputStream): 2, DataOutputStream (java.io.DataOutputStream): 2, IOException (java.io.IOException): 2, Path (java.nio.file.Path): 2, ArrayList (java.util.ArrayList): 2, Arrays (java.util.Arrays): 2, LinkedHashSet (java.util.LinkedHashSet): 2, List (java.util.List): 2, Random (java.util.Random): 2, Collectors (java.util.stream.Collectors): 2, IntVector (zemberek.core.collections.IntVector): 2, Log (zemberek.core.logging.Log): 2, BlockTextLoader (zemberek.core.text.BlockTextLoader): 2, Stopwatch (com.google.common.base.Stopwatch): 1, BufferedReader (java.io.BufferedReader): 1, File (java.io.File): 1, FileInputStream (java.io.FileInputStream): 1, InputStreamReader (java.io.InputStreamReader): 1, Collator (java.text.Collator): 1