Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class TreeAnnotator, method transformTreeHelper.
/**
* Do the category splitting of the tree passed in.
* This is initially called on the root node of a tree, and it recursively
* calls itself on children. A depth first left-to-right traversal is
* done whereby a tree node's children are first transformed and then
* the parent is transformed. At the time of calling, the original root
* always sits above the current node. This routine can be assumed to,
* and does, change the tree passed in: it destructively modifies tree nodes,
* and makes new tree structure when it needs to.
*
* @param t The tree node to subcategorize.
* @param root The root of the tree. It must contain {@code t} or
* this code will throw a NullPointerException.
* @return The annotated tree.
*/
private Tree transformTreeHelper(Tree t, Tree root) {
if (t == null) {
// handle null
return null;
}
if (t.isLeaf()) {
//No need to change the label
return t;
}
String cat = t.label().value();
Tree parent;
String parentStr;
String grandParentStr;
if (root == null || t.equals(root)) {
parent = null;
parentStr = "";
} else {
parent = t.parent(root);
parentStr = parent.label().value();
}
if (parent == null || parent.equals(root)) {
grandParentStr = "";
} else {
grandParentStr = parent.parent(root).label().value();
}
String baseParentStr = tlpParams.treebankLanguagePack().basicCategory(parentStr);
String baseGrandParentStr = tlpParams.treebankLanguagePack().basicCategory(grandParentStr);
if (t.isPreTerminal()) {
// handle tags
// recurse
Tree childResult = transformTreeHelper(t.children()[0], null);
// it would be nicer if this used Word or CategoryWordTag
String word = childResult.value();
if (!trainOptions.noTagSplit) {
if (trainOptions.tagPA) {
String test = cat + "^" + baseParentStr;
if (!trainOptions.tagSelectiveSplit || trainOptions.splitters.contains(test)) {
cat = test;
}
}
// guard against a null parent (possible when t is the root)
if (trainOptions.markUnaryTags && parent != null && parent.numChildren() == 1) {
cat = cat + "^U";
}
}
// otherwise, leave the tags alone!
// Label label = new CategoryWordTag(cat, word, cat);
Label label = t.label().labelFactory().newLabel(t.label());
label.setValue(cat);
if (label instanceof HasCategory)
((HasCategory) label).setCategory(cat);
if (label instanceof HasWord)
((HasWord) label).setWord(word);
if (label instanceof HasTag)
((HasTag) label).setTag(cat);
t.setLabel(label);
// just in case word is changed
t.setChild(0, childResult);
if (trainOptions.noTagSplit) {
return t;
} else {
// language-specific transforms
return tlpParams.transformTree(t, root);
}
}
// end isPreTerminal()
// handle phrasal categories
Tree[] kids = t.children();
for (int childNum = 0; childNum < kids.length; childNum++) {
Tree child = kids[childNum];
// recursive call
Tree childResult = transformTreeHelper(child, root);
t.setChild(childNum, childResult);
}
Tree headChild = hf.determineHead(t);
if (headChild == null || headChild.label() == null) {
throw new RuntimeException("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
}
Label headLabel = headChild.label();
if (!(headLabel instanceof HasWord))
throw new RuntimeException("TreeAnnotator: Head label lacks a Word annotation!");
if (!(headLabel instanceof HasTag))
throw new RuntimeException("TreeAnnotator: Head label lacks a Tag annotation!");
String word = ((HasWord) headLabel).word();
String tag = ((HasTag) headLabel).tag();
// String baseTag = tlpParams.treebankLanguagePack().basicCategory(tag);
String baseCat = tlpParams.treebankLanguagePack().basicCategory(cat);
/* Sister annotation. Potential problem: if multiple sisters are
* strong indicators for a single category's expansions. This
* happens concretely in the Chinese Treebank when NP (object)
* has left sisters VV and AS. Could lead to too much
* sparseness. The ideal solution would be to give the
* splitting list an ordering, and take only the highest (~most
* informative/reliable) sister annotation.
*/
if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.length() > 0) {
List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));
List<String> leftAnn = new ArrayList<>();
List<String> rightAnn = new ArrayList<>();
for (String s : leftSis) {
//s = baseCat+"=l="+tlpParams.treebankLanguagePack().basicCategory(s);
leftAnn.add(baseCat + "=l=" + tlpParams.treebankLanguagePack().basicCategory(s));
//System.out.println("left-annotated test string " + s);
}
for (String s : rightSis) {
//s = baseCat+"=r="+tlpParams.treebankLanguagePack().basicCategory(s);
rightAnn.add(baseCat + "=r=" + tlpParams.treebankLanguagePack().basicCategory(s));
}
// debugging only (the old iterator form would loop forever with its body commented out):
// for (String s : rightAnn) { System.out.println("new rightsis " + s); }
for (String annCat : trainOptions.sisterSplitters) {
//System.out.println("annotated test string " + annCat);
if (leftAnn.contains(annCat) || rightAnn.contains(annCat)) {
cat = cat + annCat.replaceAll("^" + baseCat, "");
break;
}
}
}
if (trainOptions.PA && !trainOptions.smoothing && baseParentStr.length() > 0) {
String cat2 = baseCat + "^" + baseParentStr;
if (!trainOptions.selectiveSplit || trainOptions.splitters.contains(cat2)) {
cat = cat + "^" + baseParentStr;
}
}
if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.length() > 0) {
if (trainOptions.selectiveSplit) {
String cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
if (cat.contains("^") && trainOptions.splitters.contains(cat2)) {
cat = cat + "~" + baseGrandParentStr;
}
} else {
cat = cat + "~" + baseGrandParentStr;
}
}
if (trainOptions.markUnary > 0) {
if (trainOptions.markUnary == 1 && kids.length == 1 && kids[0].depth() >= 2) {
cat = cat + "-U";
} else if (trainOptions.markUnary == 2 && parent != null && parent.numChildren() == 1 && t.depth() >= 2) {
cat = cat + "-u";
}
}
if (trainOptions.rightRec && rightRec(t, baseCat)) {
cat = cat + "-R";
}
if (trainOptions.leftRec && leftRec(t, baseCat)) {
cat = cat + "-L";
}
if (trainOptions.splitPrePreT && t.isPrePreTerminal()) {
cat = cat + "-PPT";
}
// Label label = new CategoryWordTag(cat, word, tag);
Label label = t.label().labelFactory().newLabel(t.label());
label.setValue(cat);
if (label instanceof HasCategory)
((HasCategory) label).setCategory(cat);
if (label instanceof HasWord)
((HasWord) label).setWord(word);
if (label instanceof HasTag)
((HasTag) label).setTag(tag);
t.setLabel(label);
return tlpParams.transformTree(t, root);
}
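To make these splits concrete, here is a standalone sketch (not CoreNLP API; the category and marker values are hypothetical) of how the category string grows as the annotations above fire in order: sister annotation, parent annotation, grandparent annotation, then the unary marker.

public class CategorySplitSketch {
  public static void main(String[] args) {
    String cat = "NP";       // base category of the node
    cat = cat + "=l=VV";     // sister annotation: left sister VV (the Chinese Treebank case above)
    cat = cat + "^VP";       // parent annotation (trainOptions.PA)
    cat = cat + "~S";        // grandparent annotation (trainOptions.gPA)
    cat = cat + "-U";        // unary marker (trainOptions.markUnary == 1)
    System.out.println(cat); // prints NP=l=VV^VP~S-U
  }
}

The selective-split options match against such composed strings, as in the trainOptions.splitters.contains() checks above.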
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class ExhaustivePCFGParser, method extractBestParse.
private Tree extractBestParse(int goal, int start, int end) {
// find the source of the inside score for this span
// (no backtraces are stored, to speed up parsing in its primary use,
// so the best parse is reconstructed by re-deriving each score)
double bestScore = iScore[start][end][goal];
double normBestScore = op.testOptions.lengthNormalization ? (bestScore / wordsInSpan[start][end][goal]) : bestScore;
String goalStr = stateIndex.get(goal);
// check tags
if (end - start <= op.testOptions.maxSpanForTags && tagIndex.contains(goalStr)) {
if (op.testOptions.maxSpanForTags > 1) {
Tree wordNode = null;
if (sentence != null) {
StringBuilder word = new StringBuilder();
for (int i = start; i < end; i++) {
if (sentence.get(i) instanceof HasWord) {
HasWord cl = (HasWord) sentence.get(i);
word.append(cl.word());
} else {
word.append(sentence.get(i).toString());
}
}
wordNode = tf.newLeaf(word.toString());
} else if (lr != null) {
List<LatticeEdge> latticeEdges = lr.getEdgesOverSpan(start, end);
for (LatticeEdge edge : latticeEdges) {
IntTaggedWord itw = new IntTaggedWord(edge.word, stateIndex.get(goal), wordIndex, tagIndex);
float tagScore = (floodTags) ? -1000.0f : lex.score(itw, start, edge.word, null);
if (matches(bestScore, tagScore + (float) edge.weight)) {
wordNode = tf.newLeaf(edge.word);
if (wordNode.label() instanceof CoreLabel) {
CoreLabel cl = (CoreLabel) wordNode.label();
cl.setBeginPosition(start);
cl.setEndPosition(end);
}
break;
}
}
if (wordNode == null) {
throw new RuntimeException("could not find matching word from lattice in parse reconstruction");
}
} else {
throw new RuntimeException("attempt to get word when sentence and lattice are null!");
}
Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
tagNode.setScore(bestScore);
if (originalTags[start] != null) {
tagNode.label().setValue(originalTags[start].tag());
}
return tagNode;
} else {
// normal case: the lexicon scores single words
IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr));
String contextStr = getCoreLabel(start).originalText();
float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr);
if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
// return a pre-terminal tree
CoreLabel terminalLabel = getCoreLabel(start);
Tree wordNode = tf.newLeaf(terminalLabel);
Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
tagNode.setScore(bestScore);
if (terminalLabel.tag() != null) {
tagNode.label().setValue(terminalLabel.tag());
}
if (tagNode.label() instanceof HasTag) {
((HasTag) tagNode.label()).setTag(tagNode.label().value());
}
return tagNode;
}
}
}
// check binaries first
for (int split = start + 1; split < end; split++) {
for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext(); ) {
BinaryRule br = binaryI.next();
double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild];
boolean matches;
if (op.testOptions.lengthNormalization) {
double normScore = score / (wordsInSpan[start][split][br.leftChild] + wordsInSpan[split][end][br.rightChild]);
matches = matches(normScore, normBestScore);
} else {
matches = matches(score, bestScore);
}
if (matches) {
// build binary split
Tree leftChildTree = extractBestParse(br.leftChild, start, split);
Tree rightChildTree = extractBestParse(br.rightChild, split, end);
List<Tree> children = new ArrayList<>();
children.add(leftChildTree);
children.add(rightChildTree);
Tree result = tf.newTreeNode(goalStr, children);
result.setScore(score);
// log.info(" Found Binary node: "+result);
return result;
}
}
}
// for (Iterator<UnaryRule> unaryI = ug.closedRuleIteratorByParent(goal); unaryI.hasNext(); ) {
for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext(); ) {
UnaryRule ur = unaryI.next();
// log.info(" Trying " + ur + " dtr score: " + iScore[start][end][ur.child]);
double score = ur.score + iScore[start][end][ur.child];
boolean matches;
if (op.testOptions.lengthNormalization) {
double normScore = score / wordsInSpan[start][end][ur.child];
matches = matches(normScore, normBestScore);
} else {
matches = matches(score, bestScore);
}
if (ur.child != ur.parent && matches) {
// build unary
Tree childTree = extractBestParse(ur.child, start, end);
Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree));
// log.info(" Matched! Unary node: "+result);
result.setScore(score);
return result;
}
}
log.info("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start + ", " + end + "] looking for " + goalStr);
return null;
}
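Because no backtraces are stored, reconstruction re-derives each candidate score and asks whether it equals the recorded best score up to floating-point rounding; that is what the matches(...) calls above do. A minimal sketch of such a predicate, with an assumed tolerance (the epsilon in the real ExhaustivePCFGParser may differ):

// hedged sketch of a score-equality test for backtrace-free reconstruction;
// the 1e-4 tolerance is an assumption, not CoreNLP's actual value
private static boolean matches(double score, double bestScore) {
  // scores mix double and float log-probabilities, so exact equality
  // would be too strict; accept anything within a small epsilon
  return Math.abs(score - bestScore) < 1e-4;
}

The design trades chart memory (no backpointers) for a little recomputation when the best parse is extracted.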
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class ParserDemo, method demoDP.
/**
* demoDP demonstrates turning a file into tokens and then parse
* trees. Note that the trees are printed by calling pennPrint on
* the Tree object. It is also possible to pass a PrintWriter to
* pennPrint if you want to capture the output.
* This code will work with any supported language.
*/
public static void demoDP(LexicalizedParser lp, String filename) {
// This option shows loading, sentence-segmenting and tokenizing
// a file using DocumentPreprocessor.
// for English, treebankLanguagePack() returns a PennTreebankLanguagePack
TreebankLanguagePack tlp = lp.treebankLanguagePack();
GrammaticalStructureFactory gsf = null;
if (tlp.supportsGrammaticalStructures()) {
gsf = tlp.grammaticalStructureFactory();
}
// You could also create a tokenizer here and pass it to DocumentPreprocessor.
for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
Tree parse = lp.apply(sentence);
parse.pennPrint();
System.out.println();
if (gsf != null) {
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
System.out.println(tdl);
System.out.println();
}
}
}
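A hypothetical driver for demoDP, placed in the same class; the model path is the stock English PCFG from the CoreNLP models jar, and the default input filename is an assumption:

public static void main(String[] args) {
  // load the standard English PCFG (the path assumes the models jar is on the classpath)
  LexicalizedParser lp = LexicalizedParser.loadModel(
      "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
  // parse the file named on the command line, or an assumed sample file
  demoDP(lp, args.length > 0 ? args[0] : "sample.txt");
}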
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class ParserDemo2, method main.
/** This example shows a few more ways of providing input to a parser.
*
* Usage: ParserDemo2 [grammar [textFile]]
*/
public static void main(String[] args) throws IOException {
String grammar = args.length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
String[] options = { "-maxLength", "80", "-retainTmpSubcategories" };
LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
TreebankLanguagePack tlp = lp.getOp().langpack();
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
Iterable<List<? extends HasWord>> sentences;
if (args.length > 1) {
DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
List<List<? extends HasWord>> tmp = new ArrayList<>();
for (List<HasWord> sentence : dp) {
tmp.add(sentence);
}
sentences = tmp;
} else {
// Showing tokenization and parsing in code a couple of different ways.
String[] sent = { "This", "is", "an", "easy", "sentence", "." };
List<HasWord> sentence = new ArrayList<>();
for (String word : sent) {
sentence.add(new Word(word));
}
String sent2 = "This is a slightly longer and more complex sentence requiring tokenization.";
// Use the default tokenizer for this TreebankLanguagePack
Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(sent2));
List<? extends HasWord> sentence2 = toke.tokenize();
String[] sent3 = { "It", "can", "can", "it", "." };
// Parser gets second "can" wrong without help
String[] tag3 = { "PRP", "MD", "VB", "PRP", "." };
List<TaggedWord> sentence3 = new ArrayList<>();
for (int i = 0; i < sent3.length; i++) {
sentence3.add(new TaggedWord(sent3[i], tag3[i]));
}
Tree parse = lp.parse(sentence3);
parse.pennPrint();
List<List<? extends HasWord>> tmp = new ArrayList<>();
tmp.add(sentence);
tmp.add(sentence2);
tmp.add(sentence3);
sentences = tmp;
}
for (List<? extends HasWord> sentence : sentences) {
Tree parse = lp.parse(sentence);
parse.pennPrint();
System.out.println();
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
System.out.println(tdl);
System.out.println();
System.out.println("The words of the sentence:");
for (Label lab : parse.yield()) {
if (lab instanceof CoreLabel) {
System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
} else {
System.out.println(lab);
}
}
System.out.println();
System.out.println(parse.taggedYield());
System.out.println();
}
// This method turns the String into a single sentence using the
// default tokenizer for the TreebankLanguagePack.
String sent3 = "This is one last test!";
lp.parse(sent3).pennPrint();
}
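Assuming the CoreNLP code and models jars are on the classpath (the jar names below are illustrative, and the package prefix of ParserDemo2 may differ in your checkout), an invocation supplying both optional arguments might look like:

java -cp stanford-corenlp.jar:stanford-corenlp-models.jar ParserDemo2 edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz input.txt

With no arguments, the demo falls back to the default English grammar and its built-in example sentences.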
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class MaxMatchSegmenter, method segment.
@Override
public List<HasWord> segment(String s) {
List<Word> segmentedWords = new ArrayList<>();
for (int start = 0, length = s.length(); start < length; ) {
int end = Math.min(length, start + maxLength);
while (end > start + 1) {
String nextWord = s.substring(start, end);
if (words.contains(nextWord)) {
segmentedWords.add(new Word(nextWord));
break;
}
end--;
}
if (end == start + 1) {
// no dictionary word matched: emit a single code point
// (two chars when the character is non-BMP, i.e. a surrogate pair)
if (s.codePointAt(start) >= 0x10000) {
segmentedWords.add(new Word(s.substring(start, start + 2)));
start += 2;
} else {
segmentedWords.add(new Word(s.substring(start, start + 1)));
start++;
}
}
} else {
start = end;
}
}
return new ArrayList<>(segmentedWords);
}
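The greedy maximum-match strategy is easy to study in isolation. Below is a self-contained sketch that mirrors the loop above, including the single-code-point fallback; it is not the CoreNLP class, and the dictionary contents are made up:

import java.util.*;

public class MaxMatchSketch {
  static List<String> segment(String s, Set<String> dict, int maxLength) {
    List<String> out = new ArrayList<>();
    for (int start = 0, n = s.length(); start < n; ) {
      int end = Math.min(n, start + maxLength);
      // shrink the window until it matches a dictionary word of length >= 2
      while (end > start + 1 && !dict.contains(s.substring(start, end))) {
        end--;
      }
      if (end == start + 1) {
        // no multi-character match: emit one code point
        // (two chars when it is a non-BMP character, i.e. a surrogate pair)
        end = start + Character.charCount(s.codePointAt(start));
      }
      out.add(s.substring(start, end));
      start = end;
    }
    return out;
  }

  public static void main(String[] args) {
    Set<String> dict = new HashSet<>(Arrays.asList("中国", "人民"));
    System.out.println(segment("中国人民!", dict, 4)); // [中国, 人民, !]
  }
}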