Example 36 with TaggedWord

Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class FindTreebankTree, method main:

public static void main(String[] args) {
    // Args specified with -tagSeparator, -encoding, etc. are assigned
    // to the appropriate option.  Otherwise, the first arg found is
    // the sentence to look for, and all other args are paths in which
    // to look for that sentence.
    String needle = "";
    String tagSeparator = "_";
    String encoding = "utf-8";
    String fileRegex = "";
    List<String> paths = new ArrayList<>();
    for (int i = 0; i < args.length; ++i) {
        if ((args[i].equalsIgnoreCase("-tagSeparator") || args[i].equalsIgnoreCase("--tagSeparator")) && i + 1 < args.length) {
            tagSeparator = args[i + 1];
            ++i;
        } else if ((args[i].equalsIgnoreCase("-encoding") || args[i].equalsIgnoreCase("--encoding")) && i + 1 < args.length) {
            encoding = args[i + 1];
            ++i;
        } else if ((args[i].equalsIgnoreCase("-fileRegex") || args[i].equalsIgnoreCase("--fileRegex")) && i + 1 < args.length) {
            fileRegex = args[i + 1];
            ++i;
        } else if (needle.equals("")) {
            needle = args[i].trim();
        } else {
            paths.add(args[i]);
        }
    }
    TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
    // If the user specified a regex, build a FileFilter from it.  The
    // filter also accepts directories so that loadPath still recurses into them
    FileFilter filter = null;
    if (!fileRegex.equals("")) {
        final Pattern filePattern = Pattern.compile(fileRegex);
        filter = pathname -> (pathname.isDirectory() || filePattern.matcher(pathname.getName()).matches());
    }
    for (String path : paths) {
        // Start a new treebank with the given path, encoding, filter, etc
        DiskTreebank treebank = new DiskTreebank(trf, encoding);
        treebank.loadPath(path, filter);
        Iterator<Tree> treeIterator = treebank.iterator();
        int treeCount = 0;
        String currentFile = "";
        while (treeIterator.hasNext()) {
            // keep track of which file we are currently looking at
            if (!currentFile.equals(treebank.getCurrentFilename())) {
                currentFile = treebank.getCurrentFilename();
                treeCount = 0;
            }
            ++treeCount;
            Tree tree = treeIterator.next();
            List<TaggedWord> sentence = tree.taggedYield();
            boolean found = false;
            // The tree can match in one of three ways: tagged, untagged,
            // or untagged and unsegmented (which is useful for Chinese,
            // for example)
            String haystack = SentenceUtils.listToString(sentence, true);
            found = needle.equals(haystack);
            haystack = haystack.replaceAll(" ", "");
            found = found || needle.equals(haystack);
            haystack = SentenceUtils.listToString(sentence, false, tagSeparator);
            found = found || needle.equals(haystack);
            if (found) {
                System.out.println("needle found in " + currentFile + " tree " + treeCount);
            }
        }
    }
}
Also used: Pattern (java.util.regex.Pattern), ArrayList (java.util.ArrayList), TaggedWord (edu.stanford.nlp.ling.TaggedWord), FileFilter (java.io.FileFilter)
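
The three match modes the comments above describe (untagged, untagged-and-unsegmented, tagged) can be reproduced in isolation. A minimal sketch; the class name NeedleFormsDemo and the sample sentence are invented for illustration:

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.TaggedWord;

public class NeedleFormsDemo {
    public static void main(String[] args) {
        List<TaggedWord> sentence = Arrays.asList(
                new TaggedWord("I", "PRP"),
                new TaggedWord("eat", "VBP"),
                new TaggedWord("rice", "NN"));
        // Untagged, space-separated: "I eat rice"
        System.out.println(SentenceUtils.listToString(sentence, true));
        // Untagged and unsegmented: "Ieatrice" (the form useful for Chinese)
        System.out.println(SentenceUtils.listToString(sentence, true).replaceAll(" ", ""));
        // Tagged, using the tag separator: "I_PRP eat_VBP rice_NN"
        System.out.println(SentenceUtils.listToString(sentence, false, "_"));
    }
}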

Example 37 with TaggedWord

Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class ChineseCharacterBasedLexicon, method finishTraining:

@Override
public void finishTraining() {
    Timing.tick("Counting characters...");
    ClassicCounter<Symbol> charCounter = new ClassicCounter<>();
    // count every character occurrence; characters that occur only
    // once are extracted as singletons below
    for (List<TaggedWord> labels : trainingSentences) {
        for (TaggedWord label : labels) {
            String word = label.word();
            if (word.equals(BOUNDARY)) {
                continue;
            }
            for (int j = 0, length = word.length(); j < length; j++) {
                Symbol sym = Symbol.cannonicalSymbol(word.charAt(j));
                charCounter.incrementCount(sym);
            }
            charCounter.incrementCount(Symbol.END_WORD);
        }
    }
    Set<Symbol> singletons = Counters.keysBelow(charCounter, 1.5);
    knownChars = Generics.newHashSet(charCounter.keySet());
    Timing.tick("Counting nGrams...");
    GeneralizedCounter[] POSspecificCharNGrams = new GeneralizedCounter[CONTEXT_LENGTH + 1];
    for (int i = 0; i <= CONTEXT_LENGTH; i++) {
        POSspecificCharNGrams[i] = new GeneralizedCounter(i + 2);
    }
    ClassicCounter<String> POSCounter = new ClassicCounter<>();
    List<Serializable> context = new ArrayList<>(CONTEXT_LENGTH + 1);
    for (List<TaggedWord> words : trainingSentences) {
        for (TaggedWord taggedWord : words) {
            String word = taggedWord.word();
            String tag = taggedWord.tag();
            tagIndex.add(tag);
            if (word.equals(BOUNDARY)) {
                continue;
            }
            POSCounter.incrementCount(tag);
            for (int i = 0, size = word.length(); i <= size; i++) {
                Symbol sym;
                Symbol unknownCharClass = null;
                context.clear();
                context.add(tag);
                if (i < size) {
                    char thisCh = word.charAt(i);
                    sym = Symbol.cannonicalSymbol(thisCh);
                    if (singletons.contains(sym)) {
                        unknownCharClass = unknownCharClass(sym);
                        charCounter.incrementCount(unknownCharClass);
                    }
                } else {
                    sym = Symbol.END_WORD;
                }
                // POS-specific 1-gram
                POSspecificCharNGrams[0].incrementCount(context, sym);
                if (unknownCharClass != null) {
                    // for unknown ch model
                    POSspecificCharNGrams[0].incrementCount(context, unknownCharClass);
                }
                // this could be made faster using .sublist like in score
                for (int j = 1; j <= CONTEXT_LENGTH; j++) {
                    // poly grams
                    if (i - j < 0) {
                        context.add(Symbol.BEGIN_WORD);
                        POSspecificCharNGrams[j].incrementCount(context, sym);
                        if (unknownCharClass != null) {
                            // for unknown ch model
                            POSspecificCharNGrams[j].incrementCount(context, unknownCharClass);
                        }
                        break;
                    } else {
                        Symbol prev = Symbol.cannonicalSymbol(word.charAt(i - j));
                        if (singletons.contains(prev)) {
                            context.add(unknownCharClass(prev));
                        } else {
                            context.add(prev);
                        }
                        POSspecificCharNGrams[j].incrementCount(context, sym);
                        if (unknownCharClass != null) {
                            // for unknown ch model
                            POSspecificCharNGrams[j].incrementCount(context, unknownCharClass);
                        }
                    }
                }
            }
        }
    }
    POSDistribution = Distribution.getDistribution(POSCounter);
    Timing.tick("Creating character prior distribution...");
    charDistributions = Generics.newHashMap();
    //    charDistributions = Generics.newHashMap();  // 1.5
    //    charCounter.incrementCount(Symbol.UNKNOWN, singletons.size());
    int numberOfKeys = charCounter.size() + singletons.size();
    Distribution<Symbol> prior = Distribution.goodTuringSmoothedCounter(charCounter, numberOfKeys);
    charDistributions.put(Collections.EMPTY_LIST, prior);
    for (int i = 0; i <= CONTEXT_LENGTH; i++) {
        Set<Map.Entry<List<Serializable>, ClassicCounter<Symbol>>> counterEntries = POSspecificCharNGrams[i].lowestLevelCounterEntrySet();
        Timing.tick("Creating " + counterEntries.size() + " character " + (i + 1) + "-gram distributions...");
        for (Map.Entry<List<Serializable>, ClassicCounter<Symbol>> entry : counterEntries) {
            context = entry.getKey();
            ClassicCounter<Symbol> c = entry.getValue();
            Distribution<Symbol> thisPrior = charDistributions.get(context.subList(0, context.size() - 1));
            double priorWeight = thisPrior.getNumberOfKeys() / 200.0;
            Distribution<Symbol> newDist = Distribution.dynamicCounterWithDirichletPrior(c, thisPrior, priorWeight);
            charDistributions.put(context, newDist);
        }
    }
}
Also used: GeneralizedCounter (edu.stanford.nlp.stats.GeneralizedCounter), TaggedWord (edu.stanford.nlp.ling.TaggedWord), ClassicCounter (edu.stanford.nlp.stats.ClassicCounter), RadicalMap (edu.stanford.nlp.trees.international.pennchinese.RadicalMap)
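
The singleton detection above relies on Counters.keysBelow, which returns the keys whose count is at or below the threshold, so 1.5 selects characters seen exactly once. A standalone sketch; the class name SingletonDemo and the input string are invented for illustration:

import java.util.Set;

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counters;

public class SingletonDemo {
    public static void main(String[] args) {
        ClassicCounter<Character> counts = new ClassicCounter<>();
        for (char ch : "abracadabra".toCharArray()) {
            counts.incrementCount(ch);
        }
        // Counts <= 1.5 means "seen exactly once": here 'c' and 'd'
        Set<Character> singletons = Counters.keysBelow(counts, 1.5);
        System.out.println(singletons);
    }
}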

Example 38 with TaggedWord

Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class ChineseMaxentLexicon, method train:

/**
   * Add the given sentence to the statistics counted.  Can
   * be called multiple times with different sentences.
   */
@Override
public void train(List<TaggedWord> sentence, double weight) {
    featExtractor.train(sentence, weight);
    for (TaggedWord word : sentence) {
        datumCounter.incrementCount(word, weight);
        tagsForWord.add(word.word(), word.tag());
    }
}
Also used: TaggedWord (edu.stanford.nlp.ling.TaggedWord)
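
The weighted increment in train lets a single sentence contribute fractional counts, e.g. when training trees are down-weighted. A sketch of just that accumulation; the counter, sentence, and weight are invented, and the real method also updates its feature extractor and tag map:

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.stats.ClassicCounter;

public class WeightedTrainDemo {
    public static void main(String[] args) {
        ClassicCounter<TaggedWord> datumCounter = new ClassicCounter<>();
        List<TaggedWord> sentence = Arrays.asList(
                new TaggedWord("cat", "NN"),
                new TaggedWord("sleeps", "VBZ"));
        // Each datum in a half-weight sentence contributes 0.5 to its count
        double weight = 0.5;
        for (TaggedWord word : sentence) {
            datumCounter.incrementCount(word, weight);
        }
        System.out.println(datumCounter.totalCount());  // 1.0
    }
}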

Example 39 with TaggedWord

Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class ChineseWordFeatureExtractor, method train:

public void train(List<TaggedWord> sentence, double weight) {
    for (TaggedWord word : sentence) {
        String wordString = word.word();
        wordCounter.incrementCount(wordString, weight);
    }
}
Also used: TaggedWord (edu.stanford.nlp.ling.TaggedWord)
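
ClassicCounter returns 0.0 for unseen keys, which is why the extractor can accumulate counts with no initialization or null checks. A brief sketch; the words here are invented for illustration:

import edu.stanford.nlp.stats.ClassicCounter;

public class WordCounterDemo {
    public static void main(String[] args) {
        ClassicCounter<String> wordCounter = new ClassicCounter<>();
        wordCounter.incrementCount("北京", 1.0);
        wordCounter.incrementCount("北京", 1.0);
        wordCounter.incrementCount("上海", 1.0);
        System.out.println(wordCounter.getCount("北京"));  // 2.0
        System.out.println(wordCounter.getCount("广州"));  // 0.0: unseen keys default to zero
        System.out.println(wordCounter.totalCount());      // 3.0
    }
}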

Example 40 with TaggedWord

Use of edu.stanford.nlp.ling.TaggedWord in project CoreNLP by stanfordnlp.

From the class BaseLexicon, method trainUnannotated:

@Override
public final void trainUnannotated(List<TaggedWord> sentence, double weight) {
    uwModelTrainer.incrementTreesRead(weight);
    int loc = 0;
    for (TaggedWord tw : sentence) {
        String baseTag = op.langpack().basicCategory(tw.tag());
        Counter<String> counts = baseTagCounts.get(baseTag);
        if (counts == null) {
            ++loc;
            continue;
        }
        double totalCount = counts.totalCount();
        if (totalCount == 0) {
            ++loc;
            continue;
        }
        for (String tag : counts.keySet()) {
            TaggedWord newTW = new TaggedWord(tw.word(), tag);
            train(newTW, loc, weight * counts.getCount(tag) / totalCount);
        }
        ++loc;
    }
}
Also used: TaggedWord (edu.stanford.nlp.ling.TaggedWord)
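
The redistribution step above splits an unannotated word's weight across every refined tag sharing its base category, in proportion to how often each refined tag was observed. A worked sketch of that arithmetic; the word, tag names, and counts are invented for illustration:

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class RedistributeDemo {
    public static void main(String[] args) {
        // Suppose base tag "VB" was observed refined as VB 6 times and VB^g 2 times
        Counter<String> counts = new ClassicCounter<>();
        counts.incrementCount("VB", 6.0);
        counts.incrementCount("VB^g", 2.0);
        double totalCount = counts.totalCount();  // 8.0
        double weight = 1.0;
        for (String tag : counts.keySet()) {
            TaggedWord newTW = new TaggedWord("run", tag);
            // VB receives 0.75 of the weight, VB^g the remaining 0.25
            System.out.println(newTW + " : " + (weight * counts.getCount(tag) / totalCount));
        }
    }
}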

Aggregations

TaggedWord (edu.stanford.nlp.ling.TaggedWord): 43 usages
HasWord (edu.stanford.nlp.ling.HasWord): 9 usages
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 5 usages
DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor): 5 usages
MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger): 5 usages
Tree (edu.stanford.nlp.trees.Tree): 5 usages
ArrayList (java.util.ArrayList): 5 usages
Label (edu.stanford.nlp.ling.Label): 4 usages
WordTag (edu.stanford.nlp.ling.WordTag): 4 usages
List (java.util.List): 4 usages
HasTag (edu.stanford.nlp.ling.HasTag): 3 usages
TaggedFileRecord (edu.stanford.nlp.tagger.io.TaggedFileRecord): 3 usages
File (java.io.File): 3 usages
StringReader (java.io.StringReader): 3 usages
Word (edu.stanford.nlp.ling.Word): 2 usages
Morphology (edu.stanford.nlp.process.Morphology): 2 usages
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 2 usages
GrammaticalStructure (edu.stanford.nlp.trees.GrammaticalStructure): 2 usages
BufferedReader (java.io.BufferedReader): 2 usages
IOException (java.io.IOException): 2 usages