Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class ExhaustivePCFGParser, method initializeChart:
private void initializeChart(List<? extends HasWord> sentence) {
  int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);
  for (int start = 0; start < length; start++) {
    if (op.testOptions.maxSpanForTags > 1) {
      // note we don't look for "words" including the end symbol!
      for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags) || (start + 1 == end); end++) {
        StringBuilder word = new StringBuilder();
        // wsg: Feb 2010 - Appears to support character-level parsing
        for (int i = start; i < end; i++) {
          if (sentence.get(i) instanceof HasWord) {
            HasWord cl = sentence.get(i);
            word.append(cl.word());
          } else {
            word.append(sentence.get(i).toString());
          }
        }
        for (int state = 0; state < numStates; state++) {
          float iS = iScore[start][end][state];
          if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
            IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex, tagIndex);
            iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
            if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
              narrowRExtent[start][state] = start + 1;
              narrowLExtent[end][state] = end - 1;
              wideRExtent[start][state] = start + 1;
              wideLExtent[end][state] = end - 1;
            }
          }
        }
      }
    } else {
      // "normal" chart initialization of the [start,start+1] cell
      int word = words[start];
      int end = start + 1;
      Arrays.fill(tags[start], false);
      float[] iScore_start_end = iScore[start][end];
      int[] narrowRExtent_start = narrowRExtent[start];
      int[] narrowLExtent_end = narrowLExtent[end];
      int[] wideRExtent_start = wideRExtent[start];
      int[] wideLExtent_end = wideLExtent[end];
      // Force tags
      String trueTagStr = null;
      if (sentence.get(start) instanceof HasTag) {
        trueTagStr = ((HasTag) sentence.get(start)).tag();
        if ("".equals(trueTagStr)) {
          trueTagStr = null;
        }
      }
      // Another option for forcing tags: supply a regex
      String candidateTagRegex = null;
      if (sentence.get(start) instanceof CoreLabel) {
        candidateTagRegex = ((CoreLabel) sentence.get(start)).get(ParserAnnotations.CandidatePartOfSpeechAnnotation.class);
        if ("".equals(candidateTagRegex)) {
          candidateTagRegex = null;
        }
      }
      // Word context (e.g., morphosyntactic info)
      String wordContextStr = null;
      if (sentence.get(start) instanceof HasContext) {
        wordContextStr = ((HasContext) sentence.get(start)).originalText();
        if ("".equals(wordContextStr)) {
          wordContextStr = null;
        }
      }
      boolean assignedSomeTag = false;
      if (!floodTags || word == boundary) {
        // Generate the taggings from the lexicon,
        // which may itself be tagging flexibly or using a strict lexicon.
        if (dumpTagging) {
          EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]", "UTF-8");
        }
        for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start, wordContextStr); taggingI.hasNext(); ) {
          IntTaggedWord tagging = taggingI.next();
          int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
          // If the word was supplied with a tag, skip any tagging
          // not basicCategory() compatible with the supplied tag.
          if (trueTagStr != null) {
            if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
              if (dumpTagging) {
                EncodingPrintWriter.err.println(" Skipping " + tagging + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
              }
              continue;
            }
          }
          if (candidateTagRegex != null) {
            if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
              if (dumpTagging) {
                EncodingPrintWriter.err.println(" Skipping " + tagging + " as it doesn't match candidateTagRegex: " + candidateTagRegex, "UTF-8");
              }
              continue;
            }
          }
          // Score the cell according to P(word|tag) in the lexicon
          float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr);
          if (lexScore > Float.NEGATIVE_INFINITY) {
            assignedSomeTag = true;
            iScore_start_end[state] = lexScore;
            narrowRExtent_start[state] = end;
            narrowLExtent_end[state] = start;
            wideRExtent_start[state] = end;
            wideLExtent_end[state] = start;
          }
          int tag = tagging.tag;
          tags[start][tag] = true;
          if (dumpTagging) {
            EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8");
          }
          // if (start == length-2 && tagging.parent == puncTag)
          //   lastIsPunc = true;
        }
      }
      if (!assignedSomeTag) {
        // No tagging was admitted (e.g., a forced tag never seen with this
        // word in training), so fall back to any tag state with a finite
        // lexicon score, not just seen or specified taggings.
        if (dumpTagging) {
          EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
        }
        for (int state = 0; state < numStates; state++) {
          if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
            if (trueTagStr != null) {
              String tagString = stateIndex.get(state);
              if (!tlp.basicCategory(tagString).equals(trueTagStr)) {
                continue;
              }
            }
            float lexScore = lex.score(new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start, wordIndex.get(word), wordContextStr);
            if (candidateTagRegex != null) {
              String tagString = stateIndex.get(state);
              if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                continue;
              }
            }
            if (lexScore > Float.NEGATIVE_INFINITY) {
              iScore_start_end[state] = lexScore;
              narrowRExtent_start[state] = end;
              narrowLExtent_end[state] = start;
              wideRExtent_start[state] = end;
              wideLExtent_end[state] = start;
            }
            if (dumpTagging) {
              EncodingPrintWriter.err.println("Word pos " + start + " tagging " + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state)))) + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8");
            }
          }
        }
      }
      // tag multi-counting
      if (op.dcTags) {
        for (int state = 0; state < numStates; state++) {
          if (isTag[state]) {
            iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
          }
        }
      }
      if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
        // Recovery tagging: an earlier parse failed on tag coverage, so
        // flood every tag state with a low fixed score.
        // Search above for "floodTags = true".
        if (dumpTagging) {
          EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
        }
        for (int state = 0; state < numStates; state++) {
          if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
            iScore_start_end[state] = -1000.0f;
            narrowRExtent_start[state] = end;
            narrowLExtent_end[state] = start;
            wideRExtent_start[state] = end;
            wideLExtent_end[state] = start;
          }
        }
      }
      // Apply unary rules in diagonal cells of chart
      if (spillGuts) {
        tick("Terminal Unary...");
      }
      for (int state = 0; state < numStates; state++) {
        float iS = iScore_start_end[state];
        if (iS == Float.NEGATIVE_INFINITY) {
          continue;
        }
        UnaryRule[] unaries = ug.closedRulesByChild(state);
        for (UnaryRule ur : unaries) {
          int parentState = ur.parent;
          float pS = ur.score;
          float tot = iS + pS;
          if (tot > iScore_start_end[parentState]) {
            iScore_start_end[parentState] = tot;
            narrowRExtent_start[parentState] = end;
            narrowLExtent_end[parentState] = start;
            wideRExtent_start[parentState] = end;
            wideLExtent_end[parentState] = start;
          }
        }
      }
      if (spillGuts) {
        tick("Next word...");
      }
    }
  } // end for start
}
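The CandidatePartOfSpeechAnnotation branch above lets a caller constrain which taggings initializeChart admits for a token. A minimal sketch of that usage, assuming a standard englishPCFG model is on the classpath (the model path, class name, and example sentence are illustrative, not from the source):

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.ParserAnnotations;
import edu.stanford.nlp.trees.Tree;

public class CandidateTagDemo {

  public static void main(String[] args) {
    // Model path is an assumption; substitute whichever PCFG model you have.
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    List<CoreLabel> sentence = new ArrayList<>();
    for (String w : new String[] {"They", "saw", "her", "duck", "."}) {
      CoreLabel token = new CoreLabel();
      token.setValue(w);
      token.setWord(w);
      sentence.add(token);
    }
    // Constrain "duck" to verbal tags; initializeChart reads this annotation
    // and skips any lexicon tagging whose category fails the regex.
    sentence.get(3).set(ParserAnnotations.CandidatePartOfSpeechAnnotation.class, "VB.*");
    Tree tree = lp.parse(sentence);
    tree.pennPrint();
  }
}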
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class ShiftReduceParser, method initialStateFromTaggedSentence:
public static State initialStateFromTaggedSentence(List<? extends HasWord> words) {
  List<Tree> preterminals = Generics.newArrayList();
  for (int index = 0; index < words.size(); ++index) {
    HasWord hw = words.get(index);
    CoreLabel wordLabel;
    String tag;
    if (hw instanceof CoreLabel) {
      wordLabel = (CoreLabel) hw;
      tag = wordLabel.tag();
    } else {
      wordLabel = new CoreLabel();
      wordLabel.setValue(hw.word());
      wordLabel.setWord(hw.word());
      if (!(hw instanceof HasTag)) {
        throw new IllegalArgumentException("Expected tagged words");
      }
      tag = ((HasTag) hw).tag();
      wordLabel.setTag(tag);
    }
    if (tag == null) {
      throw new IllegalArgumentException("Input word not tagged");
    }
    CoreLabel tagLabel = new CoreLabel();
    tagLabel.setValue(tag);
    // Index from 1, since tools downstream from the parser expect that.
    // The parser uses the index internally, so overwrite any incorrect
    // index if the label has already been indexed.
    wordLabel.setIndex(index + 1);
    tagLabel.setIndex(index + 1);
    LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
    LabeledScoredTreeNode tagNode = new LabeledScoredTreeNode(tagLabel);
    tagNode.addChild(wordNode);
    // TODO: can we get away with not setting these on the wordLabel?
    wordLabel.set(TreeCoreAnnotations.HeadWordLabelAnnotation.class, wordLabel);
    wordLabel.set(TreeCoreAnnotations.HeadTagLabelAnnotation.class, tagLabel);
    tagLabel.set(TreeCoreAnnotations.HeadWordLabelAnnotation.class, wordLabel);
    tagLabel.set(TreeCoreAnnotations.HeadTagLabelAnnotation.class, tagLabel);
    preterminals.add(tagNode);
  }
  return new State(preterminals);
}
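Since the method only requires HasWord plus HasTag, a plain TaggedWord list is enough to build the initial state. A minimal sketch (the class name, sentence, and tags are illustrative):

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
import edu.stanford.nlp.parser.shiftreduce.State;

public class InitialStateDemo {

  public static void main(String[] args) {
    // TaggedWord implements both HasWord and HasTag, so it passes the
    // "Expected tagged words" check above.
    List<TaggedWord> words = Arrays.asList(
        new TaggedWord("My", "PRP$"),
        new TaggedWord("dog", "NN"),
        new TaggedWord("barks", "VBZ"));
    State state = ShiftReduceParser.initialStateFromTaggedSentence(words);
    System.out.println(state);
  }
}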
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class DocumentPreprocessorTest, method runTest:
private static void runTest(String input, String[] expected, String[] sentenceFinalPuncWords, boolean whitespaceTokenize) {
  List<String> results = new ArrayList<>();
  DocumentPreprocessor document = new DocumentPreprocessor(new BufferedReader(new StringReader(input)));
  if (sentenceFinalPuncWords != null) {
    document.setSentenceFinalPuncWords(sentenceFinalPuncWords);
  }
  if (whitespaceTokenize) {
    document.setTokenizerFactory(null);
    document.setSentenceDelimiter("\n");
  }
  for (List<HasWord> sentence : document) {
    results.add(SentenceUtils.listToString(sentence));
  }
  assertEquals("Should be " + expected.length + " sentences but got " + results.size() + ": " + results, expected.length, results.size());
  for (int i = 0; i < results.size(); ++i) {
    assertEquals("Failed on sentence " + i, expected[i], results.get(i));
  }
}
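A test calling this helper might look as follows (hypothetical example; the expected strings assume the default PTB tokenization, which splits sentence-final punctuation into its own token, and listToString joining tokens with single spaces):

public void testTwoSentences() {
  String input = "This is a sentence. So is this one.";
  String[] expected = {"This is a sentence .", "So is this one ."};
  runTest(input, expected, null, false);
}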
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class TaggerDemo, method main:
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    log.info("usage: java TaggerDemo modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new BufferedReader(new FileReader(args[1])));
  for (List<HasWord> sentence : sentences) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    System.out.println(SentenceUtils.listToString(tSentence, false));
  }
}
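This demo is run with a tagger model file as the first argument, e.g. an english-left3words-distsim.tagger file from the tagger distribution (the exact path varies by release). Because listToString is called with false for its second argument, each token is printed as word/tag rather than the bare word.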
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class TaggerDemo2, method main:
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    log.info("usage: java TaggerDemo2 modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
  BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    pw.println(SentenceUtils.listToString(tSentence, false));
  }
  // Print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
  List<HasWord> sent = SentenceUtils.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
  List<TaggedWord> taggedSent = tagger.tagSentence(sent);
  for (TaggedWord tw : taggedSent) {
    if (tw.tag().startsWith("JJ")) {
      pw.println(tw.word());
    }
  }
  pw.close();
}
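Two details worth noting: the "untokenizable=noneKeep" option tells PTBTokenizer to keep characters it cannot tokenize rather than deleting them, without logging a warning for each; and with a standard English model the final loop would be expected to print slimy, long, and green, assuming the tagger assigns JJ tags to those tokens.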