Search in sources:

Example 1 with HasContext

Use of edu.stanford.nlp.ling.HasContext in project CoreNLP by stanfordnlp.

From the class BiLexPCFGParser, the method makeInitialItems:

/**
 * Builds the initial chart items (one per admissible word/tag pairing) for the
 * given sentence, populating the instance arrays {@code words},
 * {@code taggedWordList} and {@code originalLabels} as a side effect.
 *
 * @param wordList the sentence to seed the chart from; elements that are
 *     CoreLabels are remembered, and elements implementing HasContext pass
 *     extra context to the lexicon's tagging iterator
 * @return the list of initial (terminal) items for the agenda
 */
protected List<Item> makeInitialItems(List<? extends HasWord> wordList) {
    final int length = wordList.size();
    final int numTags = tagIndex.size();
    List<Item> initialItems = new ArrayList<>();
    words = new int[length];
    taggedWordList = new List[length];
    originalLabels = new CoreLabel[length];
    int tagEdgeCount = 0;  // number of tag edges entered into the chart
    for (int pos = 0; pos < length; pos++) {
        taggedWordList[pos] = new ArrayList<>(numTags);
        HasWord token = wordList.get(pos);
        if (token instanceof CoreLabel) {
            originalLabels[pos] = (CoreLabel) token;
        }
        String wordStr = token.word();
        // Word context (e.g., morphosyntactic info); empty string means none.
        String context = null;
        if (token instanceof HasContext) {
            String original = ((HasContext) token).originalText();
            if (!"".equals(original)) {
                context = original;
            }
        }
        // Map out-of-vocabulary words to the unknown-word symbol.
        if (!wordIndex.contains(wordStr)) {
            wordStr = Lexicon.UNKNOWN_WORD;
        }
        int wordId = wordIndex.indexOf(wordStr);
        words[pos] = wordId;
        Iterator<IntTaggedWord> taggings = lex.ruleIteratorByWord(wordId, pos, context);
        while (taggings.hasNext()) {
            IntTaggedWord tagging = taggings.next();
            int tag = tagging.tag;
            int state = stateIndex.indexOf(tagIndex.get(tag));
            // Score the terminal edge via the shared scratch edge.
            // THIS WILL CAUSE BUGS!!!  Don't use with another A* scorer
            tempEdge.state = state;
            tempEdge.head = pos;
            tempEdge.start = pos;
            tempEdge.end = pos + 1;
            tempEdge.tag = tag;
            initialItems.add(makeInitialItem(pos, tag, state, scorer.iScore(tempEdge)));
            tagEdgeCount++;
            taggedWordList[pos].add(new IntTaggedWord(wordId, tag));
        }
    }
    if (op.testOptions.verbose) {
        log.info("Terminals (# of tag edges in chart): " + tagEdgeCount);
    }
    return initialItems;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) HasContext(edu.stanford.nlp.ling.HasContext)

Example 2 with HasContext

Use of edu.stanford.nlp.ling.HasContext in project CoreNLP by stanfordnlp.

From the class ExhaustivePCFGParser, the method initializeChart:

/**
 * Seeds the diagonal (terminal) cells of the parse chart with tag scores for
 * each word of the sentence, then closes each cell under unary rules.
 * Fields used here ({@code length}, {@code iScore}, {@code words},
 * {@code tags}, {@code isTag}, the extent arrays, {@code floodTags}, etc.)
 * are declared elsewhere in this class; presumably {@code length} includes
 * the boundary symbol — TODO confirm against the enclosing class.
 *
 * @param sentence the words to parse; elements may additionally implement
 *     HasTag (a forced/gold tag), carry a CoreLabel candidate-tag regex
 *     annotation, or implement HasContext (extra context for the lexicon)
 */
private void initializeChart(List<? extends HasWord> sentence) {
    int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);
    for (int start = 0; start < length; start++) {
        if (op.testOptions.maxSpanForTags > 1) {
            // Multi-word spans: score each span [start,end) directly with the
            // lexicon rather than using the precomputed word ids.
            // note we don't look for "words" including the end symbol!
            for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags) || (start + 1 == end); end++) {
                // Concatenate the surface forms of the tokens in [start,end).
                StringBuilder word = new StringBuilder();
                //wsg: Feb 2010 - Appears to support character-level parsing
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                for (int state = 0; state < numStates; state++) {
                    float iS = iScore[start][end][state];
                    // Only fill cells still empty and whose state is a tag state.
                    if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
                        IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex, tagIndex);
                        iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
                        if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
                            // Record reachability of this span/state in the extent arrays.
                            narrowRExtent[start][state] = start + 1;
                            narrowLExtent[end][state] = end - 1;
                            wideRExtent[start][state] = start + 1;
                            wideLExtent[end][state] = end - 1;
                        }
                    }
                }
            }
        } else {
            // "normal" chart initialization of the [start,start+1] cell
            int word = words[start];
            int end = start + 1;
            Arrays.fill(tags[start], false);
            // Cache the rows/columns of the chart arrays touched by this cell.
            float[] iScore_start_end = iScore[start][end];
            int[] narrowRExtent_start = narrowRExtent[start];
            int[] narrowLExtent_end = narrowLExtent[end];
            int[] wideRExtent_start = wideRExtent[start];
            int[] wideLExtent_end = wideLExtent[end];
            //Force tags
            // A supplied gold tag (HasTag) restricts which taggings are kept.
            String trueTagStr = null;
            if (sentence.get(start) instanceof HasTag) {
                trueTagStr = ((HasTag) sentence.get(start)).tag();
                if ("".equals(trueTagStr)) {
                    trueTagStr = null;
                }
            }
            // Another option for forcing tags: supply a regex
            String candidateTagRegex = null;
            if (sentence.get(start) instanceof CoreLabel) {
                candidateTagRegex = ((CoreLabel) sentence.get(start)).get(ParserAnnotations.CandidatePartOfSpeechAnnotation.class);
                if ("".equals(candidateTagRegex)) {
                    candidateTagRegex = null;
                }
            }
            //Word context (e.g., morphosyntactic info)
            String wordContextStr = null;
            if (sentence.get(start) instanceof HasContext) {
                wordContextStr = ((HasContext) sentence.get(start)).originalText();
                if ("".equals(wordContextStr))
                    wordContextStr = null;
            }
            boolean assignedSomeTag = false;
            if (!floodTags || word == boundary) {
                // Normal tagging via the lexicon's tagging iterator,
                // which may itself be tagging flexibly or using a strict lexicon.
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]", "UTF-8");
                }
                for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start, wordContextStr); taggingI.hasNext(); ) {
                    IntTaggedWord tagging = taggingI.next();
                    int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
                    // Skip a tagging whose tag is
                    // not basicCategory() compatible with supplied tag.
                    if (trueTagStr != null) {
                        if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
                            }
                            continue;
                        }
                    }
                    // Similarly skip taggings rejected by the candidate-tag regex.
                    if (candidateTagRegex != null) {
                        if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging + " as it doesn't match candidateTagRegex: " + candidateTagRegex, "UTF-8");
                            }
                            continue;
                        }
                    }
                    // try {
                    // score the cell according to P(word|tag) in the lexicon
                    float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr);
                    if (lexScore > Float.NEGATIVE_INFINITY) {
                        assignedSomeTag = true;
                        iScore_start_end[state] = lexScore;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                    // } catch (Exception e) {
                    // e.printStackTrace();
                    // System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag));
                    // }
                    int tag = tagging.tag;
                    tags[start][tag] = true;
                    if (dumpTagging) {
                        EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8");
                    }
                //if (start == length-2 && tagging.parent == puncTag)
                //  lastIsPunc = true;
                }
            }
            if (!assignedSomeTag) {
                // Fallback: the lexicon assigned nothing, so try every tag state,
                // subject to any
                // specified taggings
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        if (trueTagStr != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).equals(trueTagStr)) {
                                continue;
                            }
                        }
                        // NOTE(review): lexScore is computed before the
                        // candidateTagRegex filter below, so a rejected state
                        // still pays for a lexicon lookup; checking the regex
                        // first looks safe (assuming lex.score has no needed
                        // side effects) — confirm before reordering.
                        float lexScore = lex.score(new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start, wordIndex.get(word), wordContextStr);
                        if (candidateTagRegex != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                                continue;
                            }
                        }
                        if (lexScore > Float.NEGATIVE_INFINITY) {
                            iScore_start_end[state] = lexScore;
                            narrowRExtent_start[state] = end;
                            narrowLExtent_end[state] = start;
                            wideRExtent_start[state] = end;
                            wideLExtent_end[state] = start;
                        }
                        if (dumpTagging) {
                            EncodingPrintWriter.err.println("Word pos " + start + " tagging " + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state)))) + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8");
                        }
                    }
                }
            }
            // Scale tag scores to compensate for
            // tag multi-counting
            if (op.dcTags) {
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state]) {
                        iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
                    }
                }
            }
            // Recovery tagging: give every empty tag state a large penalty score.
            if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
                // Search above for "floodTags = true".
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        iScore_start_end[state] = -1000.0f;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                }
            }
            // Apply unary rules in diagonal cells of chart
            if (spillGuts) {
                tick("Terminal Unary...");
            }
            for (int state = 0; state < numStates; state++) {
                float iS = iScore_start_end[state];
                if (iS == Float.NEGATIVE_INFINITY) {
                    continue;
                }
                UnaryRule[] unaries = ug.closedRulesByChild(state);
                for (UnaryRule ur : unaries) {
                    int parentState = ur.parent;
                    float pS = ur.score;
                    float tot = iS + pS;
                    // Keep the best (maximum) score per parent state.
                    if (tot > iScore_start_end[parentState]) {
                        iScore_start_end[parentState] = tot;
                        narrowRExtent_start[parentState] = end;
                        narrowLExtent_end[parentState] = start;
                        wideRExtent_start[parentState] = end;
                        wideLExtent_end[parentState] = start;
                    }
                }
            }
            if (spillGuts) {
                tick("Next word...");
            }
        }
    }
// end for start
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) HasTag(edu.stanford.nlp.ling.HasTag) ParserConstraint(edu.stanford.nlp.parser.common.ParserConstraint) CoreLabel(edu.stanford.nlp.ling.CoreLabel) HasContext(edu.stanford.nlp.ling.HasContext)

Aggregations

CoreLabel (edu.stanford.nlp.ling.CoreLabel)2 HasContext (edu.stanford.nlp.ling.HasContext)2 HasWord (edu.stanford.nlp.ling.HasWord)2 HasTag (edu.stanford.nlp.ling.HasTag)1 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)1