Search in sources :

Example 26 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class ExhaustivePCFGParser method initializeChart.

/**
 * Seeds the parse chart with lexical (tag) scores for every start position of the sentence.
 *
 * <p>For each {@code start} in {@code [0, length)} one of two strategies is used:
 * <ul>
 *   <li>If {@code op.testOptions.maxSpanForTags > 1}, the tokens of each candidate span are
 *       concatenated into one string and that string is scored as a single word against every
 *       tag state whose inside score is still negative infinity.</li>
 *   <li>Otherwise only the {@code [start, start + 1]} cell is filled: taggings proposed by the
 *       lexicon are scored (optionally filtered by a supplied tag and/or a candidate-tag regex),
 *       then a fallback over all tag states runs if nothing scored, then optional score
 *       scaling ({@code op.dcTags}), optional tag flooding, and finally unary rules are applied
 *       on top of the cell.</li>
 * </ul>
 *
 * <p>Whenever a state receives a finite score, the narrow/wide left/right extent arrays for that
 * state are updated alongside {@code iScore}.
 *
 * <p>NOTE(review): {@code length}, {@code words}, {@code tags}, {@code iScore}, the extent arrays,
 * {@code isTag}, {@code numStates}, {@code floodTags}, {@code dumpTagging} and {@code spillGuts}
 * are declared outside this block; they are presumably set up for the current sentence before
 * this method is called -- confirm against the caller.
 *
 * @param sentence the tokens being parsed; elements may additionally implement {@code HasTag},
 *                 {@code HasContext} or be a {@code CoreLabel}, in which case the extra
 *                 information is used to constrain or inform tagging
 */
private void initializeChart(List<? extends HasWord> sentence) {
    // Index of the sentence-boundary symbol; it is excluded from tag flooding below.
    int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);
    for (int start = 0; start < length; start++) {
        if (op.testOptions.maxSpanForTags > 1) {
            // note we don't look for "words" including the end symbol!
            // The second disjunct of the loop condition guarantees that the one-token span
            // [start, start + 1] is always visited, even when the first disjunct is false.
            for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags) || (start + 1 == end); end++) {
                StringBuilder word = new StringBuilder();
                //wsg: Feb 2010 - Appears to support character-level parsing
                // Concatenate the tokens of [start, end) into a single candidate word.
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                // Score the concatenated string under every tag state that has no score yet.
                for (int state = 0; state < numStates; state++) {
                    float iS = iScore[start][end][state];
                    if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
                        IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex, tagIndex);
                        iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
                        if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
                            // Note: these extents are start + 1 / end - 1, unlike the
                            // single-word branch below, which uses end / start.
                            narrowRExtent[start][state] = start + 1;
                            narrowLExtent[end][state] = end - 1;
                            wideRExtent[start][state] = start + 1;
                            wideLExtent[end][state] = end - 1;
                        }
                    }
                }
            }
        } else {
            // "normal" chart initialization of the [start,start+1] cell
            int word = words[start];
            int end = start + 1;
            // Reset the record of which tags were proposed for this position.
            Arrays.fill(tags[start], false);
            // Local aliases for the rows of the chart arrays touched by this cell.
            float[] iScore_start_end = iScore[start][end];
            int[] narrowRExtent_start = narrowRExtent[start];
            int[] narrowLExtent_end = narrowLExtent[end];
            int[] wideRExtent_start = wideRExtent[start];
            int[] wideLExtent_end = wideLExtent[end];
            //Force tags
            // A token carrying a non-empty tag restricts which taggings are accepted below.
            String trueTagStr = null;
            if (sentence.get(start) instanceof HasTag) {
                trueTagStr = ((HasTag) sentence.get(start)).tag();
                if ("".equals(trueTagStr)) {
                    trueTagStr = null;
                }
            }
            // Another option for forcing tags: supply a regex
            String candidateTagRegex = null;
            if (sentence.get(start) instanceof CoreLabel) {
                candidateTagRegex = ((CoreLabel) sentence.get(start)).get(ParserAnnotations.CandidatePartOfSpeechAnnotation.class);
                if ("".equals(candidateTagRegex)) {
                    candidateTagRegex = null;
                }
            }
            //Word context (e.g., morphosyntactic info)
            // Taken from HasContext.originalText(); an empty string is treated as absent.
            String wordContextStr = null;
            if (sentence.get(start) instanceof HasContext) {
                wordContextStr = ((HasContext) sentence.get(start)).originalText();
                if ("".equals(wordContextStr))
                    wordContextStr = null;
            }
            boolean assignedSomeTag = false;
            if (!floodTags || word == boundary) {
                // Regular tagging: score only the taggings the lexicon proposes for this word,
                // which may itself be tagging flexibly or using a strict lexicon.
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]", "UTF-8");
                }
                for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start, wordContextStr); taggingI.hasNext(); ) {
                    IntTaggedWord tagging = taggingI.next();
                    // Map the lexicon's tag id to the grammar's state id via the tag string.
                    int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
                    // If a tag was supplied, skip any tagging that is
                    // not basicCategory() compatible with supplied tag.
                    // With forceTagBeginnings the full tag string must instead start with it.
                    if (trueTagStr != null) {
                        if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
                            }
                            continue;
                        }
                    }
                    // Likewise skip taggings whose (basic or full) tag does not match the regex.
                    if (candidateTagRegex != null) {
                        if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging + " as it doesn't match candidateTagRegex: " + candidateTagRegex, "UTF-8");
                            }
                            continue;
                        }
                    }
                    // try {
                    // score the cell according to P(word|tag) in the lexicon
                    float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr);
                    if (lexScore > Float.NEGATIVE_INFINITY) {
                        assignedSomeTag = true;
                        iScore_start_end[state] = lexScore;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                    // } catch (Exception e) {
                    // e.printStackTrace();
                    // System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag));
                    // }
                    // The tag is recorded as proposed even when its score was negative infinity.
                    int tag = tagging.tag;
                    tags[start][tag] = true;
                    if (dumpTagging) {
                        EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8");
                    }
                //if (start == length-2 && tagging.parent == puncTag)
                //  lastIsPunc = true;
                }
            }
            if (!assignedSomeTag) {
                // Nothing scored above negative infinity (or regular tagging was skipped):
                // try every tag state, still honoring any specified taggings
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        // NOTE(review): unlike the regular path, this filter always compares
                        // basicCategory() and ignores op.testOptions.forceTagBeginnings -- confirm
                        // whether that asymmetry is intended.
                        if (trueTagStr != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).equals(trueTagStr)) {
                                continue;
                            }
                        }
                        // NOTE(review): the score is computed before the regex filter below, so it
                        // is discarded for states the regex rejects.
                        float lexScore = lex.score(new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start, wordIndex.get(word), wordContextStr);
                        if (candidateTagRegex != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                                continue;
                            }
                        }
                        if (lexScore > Float.NEGATIVE_INFINITY) {
                            iScore_start_end[state] = lexScore;
                            narrowRExtent_start[state] = end;
                            narrowLExtent_end[state] = start;
                            wideRExtent_start[state] = end;
                            wideLExtent_end[state] = start;
                        }
                        if (dumpTagging) {
                            EncodingPrintWriter.err.println("Word pos " + start + " tagging " + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state)))) + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8");
                        }
                    }
                }
            }
            // tag multi-counting
            // Every tag state's score is scaled by (1 + depWeight); negative infinity stays as is.
            if (op.dcTags) {
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state]) {
                        iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
                    }
                }
            }
            if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
                // Recovery tagging: give every still-unscored tag state a fixed low score.
                // Search above for "floodTags = true".
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        iScore_start_end[state] = -1000.0f;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                }
            }
            // Apply unary rules in diagonal cells of chart
            if (spillGuts) {
                tick("Terminal Unary...");
            }
            for (int state = 0; state < numStates; state++) {
                float iS = iScore_start_end[state];
                if (iS == Float.NEGATIVE_INFINITY) {
                    continue;
                }
                // For each rule with this state as child, keep the better parent score.
                UnaryRule[] unaries = ug.closedRulesByChild(state);
                for (UnaryRule ur : unaries) {
                    int parentState = ur.parent;
                    float pS = ur.score;
                    float tot = iS + pS;
                    if (tot > iScore_start_end[parentState]) {
                        iScore_start_end[parentState] = tot;
                        narrowRExtent_start[parentState] = end;
                        narrowLExtent_end[parentState] = start;
                        wideRExtent_start[parentState] = end;
                        wideLExtent_end[parentState] = start;
                    }
                }
            }
            if (spillGuts) {
                tick("Next word...");
            }
        }
    }
// end for start
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) HasTag(edu.stanford.nlp.ling.HasTag) ParserConstraint(edu.stanford.nlp.parser.common.ParserConstraint) CoreLabel(edu.stanford.nlp.ling.CoreLabel) HasContext(edu.stanford.nlp.ling.HasContext)

Example 27 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class ShiftReduceParser method initialStateFromTaggedSentence.

/**
 * Builds the initial parser {@code State} from an already-tagged sentence.
 *
 * <p>Each input token becomes a two-level tree: a preterminal node labelled with the token's
 * tag, with a single child labelled with the word. Both labels receive the token's 1-based
 * position as their index and carry head-word / head-tag annotations that point at the word
 * label and the tag label respectively.
 *
 * @param words the tagged tokens; each must be a {@code CoreLabel} or implement {@code HasTag}
 * @return a {@code State} constructed from the list of preterminal trees
 * @throws IllegalArgumentException if a token is neither a {@code CoreLabel} nor a
 *         {@code HasTag}, or if its tag is {@code null}
 */
public static State initialStateFromTaggedSentence(List<? extends HasWord> words) {
    List<Tree> preterminals = Generics.newArrayList();
    int position = 0;
    for (HasWord token : words) {
        // Index from 1.  Tools downstream from the parser expect that.
        position++;

        final CoreLabel leafLabel;
        final String pos;
        if (token instanceof CoreLabel) {
            // Reuse the caller's label object rather than copying it.
            leafLabel = (CoreLabel) token;
            pos = leafLabel.tag();
        } else {
            leafLabel = new CoreLabel();
            leafLabel.setValue(token.word());
            leafLabel.setWord(token.word());
            if (!(token instanceof HasTag)) {
                throw new IllegalArgumentException("Expected tagged words");
            }
            pos = ((HasTag) token).tag();
            leafLabel.setTag(pos);
        }
        if (pos == null) {
            throw new IllegalArgumentException("Input word not tagged");
        }

        CoreLabel preterminalLabel = new CoreLabel();
        preterminalLabel.setValue(pos);

        // Internally this parser uses the index, so any index already present on a
        // reused label is overwritten with the position in this sentence.
        leafLabel.setIndex(position);
        preterminalLabel.setIndex(position);

        LabeledScoredTreeNode leaf = new LabeledScoredTreeNode(leafLabel);
        LabeledScoredTreeNode preterminal = new LabeledScoredTreeNode(preterminalLabel);
        preterminal.addChild(leaf);

        // TODO: can we get away with not setting these on the wordLabel?
        leafLabel.set(TreeCoreAnnotations.HeadWordLabelAnnotation.class, leafLabel);
        leafLabel.set(TreeCoreAnnotations.HeadTagLabelAnnotation.class, preterminalLabel);
        preterminalLabel.set(TreeCoreAnnotations.HeadWordLabelAnnotation.class, leafLabel);
        preterminalLabel.set(TreeCoreAnnotations.HeadTagLabelAnnotation.class, preterminalLabel);

        preterminals.add(preterminal);
    }
    return new State(preterminals);
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Tree(edu.stanford.nlp.trees.Tree) HasTag(edu.stanford.nlp.ling.HasTag) LabeledScoredTreeNode(edu.stanford.nlp.trees.LabeledScoredTreeNode) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations)

Example 28 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class DocumentPreprocessorTest method runTest.

/**
 * Splits {@code input} into sentences with a {@code DocumentPreprocessor} and checks the
 * result against {@code expected}.
 *
 * @param input text to split
 * @param expected the expected sentences, each rendered via {@code SentenceUtils.listToString}
 * @param sentenceFinalPuncWords if non-null, passed to
 *        {@code setSentenceFinalPuncWords}; otherwise the preprocessor's setting is left alone
 * @param whitespaceTokenize if true, the tokenizer factory is set to {@code null} and
 *        {@code "\n"} is used as the sentence delimiter
 */
private static void runTest(String input, String[] expected, String[] sentenceFinalPuncWords, boolean whitespaceTokenize) {
    BufferedReader reader = new BufferedReader(new StringReader(input));
    DocumentPreprocessor preprocessor = new DocumentPreprocessor(reader);
    if (sentenceFinalPuncWords != null) {
        preprocessor.setSentenceFinalPuncWords(sentenceFinalPuncWords);
    }
    if (whitespaceTokenize) {
        preprocessor.setTokenizerFactory(null);
        preprocessor.setSentenceDelimiter("\n");
    }

    // Render every produced sentence as a single string.
    List<String> actual = new ArrayList<>();
    for (List<HasWord> tokens : preprocessor) {
        actual.add(SentenceUtils.listToString(tokens));
    }

    // Check the count first so a mismatch reports the whole result list.
    int actualCount = actual.size();
    String countMessage = "Should be " + expected.length + " sentences but got " + actualCount + ": " + actual;
    assertEquals(countMessage, expected.length, actualCount);

    int position = 0;
    for (String sentence : actual) {
        assertEquals("Failed on sentence " + position, expected[position], sentence);
        position++;
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) ArrayList(java.util.ArrayList) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader)

Example 29 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class TaggerDemo method main.

/**
 * Tags every sentence of a text file and prints the tagged sentences to standard output.
 *
 * <p>The input file is opened with try-with-resources so the reader is closed even when
 * tokenizing or tagging throws.
 *
 * @param args {@code args[0]} is the tagger model file, {@code args[1]} the file to tag;
 *             with any other argument count a usage message is logged and the method returns
 * @throws Exception if the model or the input file cannot be read, or tagging fails
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        log.info("usage: java TaggerDemo modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    // The reader stays open for the whole loop, so this does not depend on whether
    // tokenizeText consumes its input eagerly.
    try (BufferedReader reader = new BufferedReader(new FileReader(args[1]))) {
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(reader);
        for (List<HasWord> sentence : sentences) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            System.out.println(SentenceUtils.listToString(tSentence, false));
        }
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) TaggedWord(edu.stanford.nlp.ling.TaggedWord) MaxentTagger(edu.stanford.nlp.tagger.maxent.MaxentTagger) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) List(java.util.List)

Example 30 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class TaggerDemo2 method main.

/**
 * Tags every sentence of a UTF-8 text file using an explicitly configured PTB tokenizer,
 * writes the tagged sentences to standard output as UTF-8, and then prints the words tagged
 * with a tag starting with {@code "JJ"} from one extra hard-coded sentence.
 *
 * <p>The input reader and the output writer are managed by try-with-resources: the reader is
 * always closed, and the writer is flushed and closed even if tagging throws. As before, closing
 * the writer also closes {@code System.out}.
 *
 * @param args {@code args[0]} is the tagger model file, {@code args[1]} the file to tag;
 *             with any other argument count a usage message is logged and the method returns
 * @throws Exception if the model or the input file cannot be read, or tagging fails
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        log.info("usage: java TaggerDemo2 modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
         PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
        documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
        for (List<HasWord> sentence : documentPreprocessor) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            pw.println(SentenceUtils.listToString(tSentence, false));
        }
        // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
        List<HasWord> sent = SentenceUtils.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
        List<TaggedWord> taggedSent = tagger.tagSentence(sent);
        for (TaggedWord tw : taggedSent) {
            if (tw.tag().startsWith("JJ")) {
                pw.println(tw.word());
            }
        }
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreLabelTokenFactory(edu.stanford.nlp.process.CoreLabelTokenFactory) InputStreamReader(java.io.InputStreamReader) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) TaggedWord(edu.stanford.nlp.ling.TaggedWord) MaxentTagger(edu.stanford.nlp.tagger.maxent.MaxentTagger) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor) PrintWriter(java.io.PrintWriter)

Aggregations

HasWord (edu.stanford.nlp.ling.HasWord)57 CoreLabel (edu.stanford.nlp.ling.CoreLabel)17 TaggedWord (edu.stanford.nlp.ling.TaggedWord)15 ArrayList (java.util.ArrayList)14 HasTag (edu.stanford.nlp.ling.HasTag)13 Tree (edu.stanford.nlp.trees.Tree)13 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)11 StringReader (java.io.StringReader)11 Label (edu.stanford.nlp.ling.Label)10 Word (edu.stanford.nlp.ling.Word)10 List (java.util.List)8 BufferedReader (java.io.BufferedReader)6 MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)5 File (java.io.File)5 PrintWriter (java.io.PrintWriter)5 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)4 Pair (edu.stanford.nlp.util.Pair)4 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 HasIndex (edu.stanford.nlp.ling.HasIndex)3 Sentence (edu.stanford.nlp.ling.Sentence)3