Use of edu.stanford.nlp.ling.HasContext in project CoreNLP by stanfordnlp, from the class BiLexPCFGParser, method makeInitialItems:
protected List<Item> makeInitialItems(List<? extends HasWord> wordList) {
  List<Item> itemList = new ArrayList<>();
  int length = wordList.size();
  int numTags = tagIndex.size();
  words = new int[length];
  taggedWordList = new List[length];
  int terminalCount = 0;
  originalLabels = new CoreLabel[wordList.size()];
  for (int i = 0; i < length; i++) {
    taggedWordList[i] = new ArrayList<>(numTags);
    HasWord wordObject = wordList.get(i);
    if (wordObject instanceof CoreLabel) {
      originalLabels[i] = (CoreLabel) wordObject;
    }
    String wordStr = wordObject.word();
    // Word context (e.g., morphosyntactic info)
    String wordContextStr = null;
    if (wordObject instanceof HasContext) {
      wordContextStr = ((HasContext) wordObject).originalText();
      if ("".equals(wordContextStr))
        wordContextStr = null;
    }
    if (!wordIndex.contains(wordStr)) {
      wordStr = Lexicon.UNKNOWN_WORD;
    }
    int word = wordIndex.indexOf(wordStr);
    words[i] = word;
    for (Iterator<IntTaggedWord> tagI = lex.ruleIteratorByWord(word, i, wordContextStr); tagI.hasNext(); ) {
      IntTaggedWord tagging = tagI.next();
      int tag = tagging.tag;
      //String curTagStr = tagIndex.get(tag);
      //if (!tagStr.equals("") && !tagStr.equals(curTagStr))
      //  continue;
      int state = stateIndex.indexOf(tagIndex.get(tag));
      //itemList.add(makeInitialItem(i,tag,state,1.0*tagging.score));
      // THIS WILL CAUSE BUGS!!! Don't use with another A* scorer
      tempEdge.state = state;
      tempEdge.head = i;
      tempEdge.start = i;
      tempEdge.end = i + 1;
      tempEdge.tag = tag;
      itemList.add(makeInitialItem(i, tag, state, scorer.iScore(tempEdge)));
      terminalCount++;
      taggedWordList[i].add(new IntTaggedWord(word, tag));
    }
  }
  if (op.testOptions.verbose) {
    log.info("Terminals (# of tag edges in chart): " + terminalCount);
  }
  return itemList;
}
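
For orientation, here is a minimal sketch (not part of the CoreNLP sources) of how a caller could attach the per-word context that makeInitialItems reads back through HasContext. It assumes CoreLabel is used as the token type; CoreLabel implements HasContext, so its originalText() value is what ends up in wordContextStr and is passed to lex.ruleIteratorByWord. The class and method names in the sketch are hypothetical.

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;

// Hypothetical helper class, not part of CoreNLP.
public class ContextTokensSketch {

  // Builds a token list whose elements carry word context via HasContext.
  // Each CoreLabel's originalText() is what the parser above reads into
  // wordContextStr before querying the lexicon's rule iterator.
  public static List<HasWord> tokensWithContext(String[] words, String[] contexts) {
    List<HasWord> sentence = new ArrayList<>();
    for (int i = 0; i < words.length; i++) {
      CoreLabel token = new CoreLabel();
      token.setWord(words[i]);
      // An empty context string is treated as null (no context) by the parser.
      token.setOriginalText(contexts[i]);
      sentence.add(token);
    }
    return sentence;
  }
}
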
Use of edu.stanford.nlp.ling.HasContext in project CoreNLP by stanfordnlp, from the class ExhaustivePCFGParser, method initializeChart:
private void initializeChart(List<? extends HasWord> sentence) {
  int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);
  for (int start = 0; start < length; start++) {
    if (op.testOptions.maxSpanForTags > 1) {
      // note we don't look for "words" including the end symbol!
      for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags) || (start + 1 == end); end++) {
        StringBuilder word = new StringBuilder();
        // wsg: Feb 2010 - Appears to support character-level parsing
        for (int i = start; i < end; i++) {
          if (sentence.get(i) instanceof HasWord) {
            HasWord cl = sentence.get(i);
            word.append(cl.word());
          } else {
            word.append(sentence.get(i).toString());
          }
        }
        for (int state = 0; state < numStates; state++) {
          float iS = iScore[start][end][state];
          if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
            IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex, tagIndex);
            iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
            if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
              narrowRExtent[start][state] = start + 1;
              narrowLExtent[end][state] = end - 1;
              wideRExtent[start][state] = start + 1;
              wideLExtent[end][state] = end - 1;
            }
          }
        }
      }
    } else {
      // "normal" chart initialization of the [start,start+1] cell
      int word = words[start];
      int end = start + 1;
      Arrays.fill(tags[start], false);
      float[] iScore_start_end = iScore[start][end];
      int[] narrowRExtent_start = narrowRExtent[start];
      int[] narrowLExtent_end = narrowLExtent[end];
      int[] wideRExtent_start = wideRExtent[start];
      int[] wideLExtent_end = wideLExtent[end];
      // Force tags
      String trueTagStr = null;
      if (sentence.get(start) instanceof HasTag) {
        trueTagStr = ((HasTag) sentence.get(start)).tag();
        if ("".equals(trueTagStr)) {
          trueTagStr = null;
        }
      }
      // Another option for forcing tags: supply a regex
      String candidateTagRegex = null;
      if (sentence.get(start) instanceof CoreLabel) {
        candidateTagRegex = ((CoreLabel) sentence.get(start)).get(ParserAnnotations.CandidatePartOfSpeechAnnotation.class);
        if ("".equals(candidateTagRegex)) {
          candidateTagRegex = null;
        }
      }
      // Word context (e.g., morphosyntactic info)
      String wordContextStr = null;
      if (sentence.get(start) instanceof HasContext) {
        wordContextStr = ((HasContext) sentence.get(start)).originalText();
        if ("".equals(wordContextStr))
          wordContextStr = null;
      }
      boolean assignedSomeTag = false;
      if (!floodTags || word == boundary) {
        // Generate taggings from the lexicon, which may itself be tagging
        // flexibly or using a strict lexicon.
        if (dumpTagging) {
          EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]", "UTF-8");
        }
        for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start, wordContextStr); taggingI.hasNext(); ) {
          IntTaggedWord tagging = taggingI.next();
          int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
          // If a tag was supplied for this word, skip any tagging that is
          // not basicCategory() compatible with the supplied tag.
          if (trueTagStr != null) {
            if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
              if (dumpTagging) {
                EncodingPrintWriter.err.println(" Skipping " + tagging + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
              }
              continue;
            }
          }
          if (candidateTagRegex != null) {
            if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
              if (dumpTagging) {
                EncodingPrintWriter.err.println(" Skipping " + tagging + " as it doesn't match candidateTagRegex: " + candidateTagRegex, "UTF-8");
              }
              continue;
            }
          }
          // try {
          // score the cell according to P(word|tag) in the lexicon
          float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr);
          if (lexScore > Float.NEGATIVE_INFINITY) {
            assignedSomeTag = true;
            iScore_start_end[state] = lexScore;
            narrowRExtent_start[state] = end;
            narrowLExtent_end[state] = start;
            wideRExtent_start[state] = end;
            wideLExtent_end[state] = start;
          }
          // } catch (Exception e) {
          //   e.printStackTrace();
          //   System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag));
          // }
          int tag = tagging.tag;
          tags[start][tag] = true;
          if (dumpTagging) {
            EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8");
          }
          //if (start == length-2 && tagging.parent == puncTag)
          //  lastIsPunc = true;
        }
      }
      if (!assignedSomeTag) {
        // No tagging scored above -Inf: fall back to flexible tagging over all
        // tag states the lexicon will score, not just seen or specified taggings.
        if (dumpTagging) {
          EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
        }
        for (int state = 0; state < numStates; state++) {
          if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
            if (trueTagStr != null) {
              String tagString = stateIndex.get(state);
              if (!tlp.basicCategory(tagString).equals(trueTagStr)) {
                continue;
              }
            }
            float lexScore = lex.score(new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start, wordIndex.get(word), wordContextStr);
            if (candidateTagRegex != null) {
              String tagString = stateIndex.get(state);
              if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                continue;
              }
            }
            if (lexScore > Float.NEGATIVE_INFINITY) {
              iScore_start_end[state] = lexScore;
              narrowRExtent_start[state] = end;
              narrowLExtent_end[state] = start;
              wideRExtent_start[state] = end;
              wideLExtent_end[state] = start;
            }
            if (dumpTagging) {
              EncodingPrintWriter.err.println("Word pos " + start + " tagging " + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state)))) + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8");
            }
          }
        }
      }
      // If dependency scoring is on (dcTags), scale tag scores by (1 + depWeight)
      // to account for tag multi-counting
      if (op.dcTags) {
        for (int state = 0; state < numStates; state++) {
          if (isTag[state]) {
            iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
          }
        }
      }
      if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
        // Recovery tagging: give every still-unscored tag state a low score so
        // that parsing can proceed. Search above for "floodTags = true".
        if (dumpTagging) {
          EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
        }
        for (int state = 0; state < numStates; state++) {
          if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
            iScore_start_end[state] = -1000.0f;
            narrowRExtent_start[state] = end;
            narrowLExtent_end[state] = start;
            wideRExtent_start[state] = end;
            wideLExtent_end[state] = start;
          }
        }
      }
      // Apply unary rules in diagonal cells of chart
      if (spillGuts) {
        tick("Terminal Unary...");
      }
      for (int state = 0; state < numStates; state++) {
        float iS = iScore_start_end[state];
        if (iS == Float.NEGATIVE_INFINITY) {
          continue;
        }
        UnaryRule[] unaries = ug.closedRulesByChild(state);
        for (UnaryRule ur : unaries) {
          int parentState = ur.parent;
          float pS = ur.score;
          float tot = iS + pS;
          if (tot > iScore_start_end[parentState]) {
            iScore_start_end[parentState] = tot;
            narrowRExtent_start[parentState] = end;
            narrowLExtent_end[parentState] = start;
            wideRExtent_start[parentState] = end;
            wideLExtent_end[parentState] = start;
          }
        }
      }
      if (spillGuts) {
        tick("Next word...");
      }
    }
  } // end for start
}
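
As a usage note (a sketch under assumptions, not taken from the CoreNLP sources): initializeChart also honors tags supplied on the input tokens. A token implementing HasTag with a non-empty tag() becomes trueTagStr and restricts the taggings considered for its chart cell. The sketch below builds such tokens with CoreLabel; the class and method names are hypothetical.

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;

// Hypothetical helper class, not part of CoreNLP.
public class ForcedTagSketch {

  // Tokens whose tag() is non-empty become trueTagStr in initializeChart,
  // so only taggings whose basic category matches that tag survive chart
  // initialization for the [start, start+1] cell.
  public static List<CoreLabel> sentenceWithGoldTags() {
    CoreLabel the = new CoreLabel();
    the.setWord("the");
    the.setTag("DT");   // keeps only DT-compatible taggings for this cell

    CoreLabel run = new CoreLabel();
    run.setWord("run");
    run.setTag("NN");   // suppresses the verbal taggings during initialization

    return Arrays.asList(the, run);
  }
}
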