Use of edu.stanford.nlp.semgraph.SemanticGraphEdge in project CoreNLP by stanfordnlp.
The class FeatureExtractor, method getFeatures.
private Counter<String> getFeatures(Document doc, Mention m, Map<Integer, List<Mention>> mentionsByHeadIndex) {
  Counter<String> features = new ClassicCounter<>();
  // type features
  features.incrementCount("mention-type=" + m.mentionType);
  features.incrementCount("gender=" + m.gender);
  features.incrementCount("person-fine=" + m.person);
  features.incrementCount("head-ne-type=" + m.nerString);
  List<String> singletonFeatures = m.getSingletonFeatures(dictionaries);
  for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
    if (e.getKey() < singletonFeatures.size()) {
      features.incrementCount(e.getValue() + "=" + singletonFeatures.get(e.getKey()));
    }
  }
  // length and location features
  addNumeric(features, "mention-length", m.spanToString().length());
  addNumeric(features, "mention-words", m.originalSpan.size());
  addNumeric(features, "sentence-words", m.sentenceWords.size());
  features.incrementCount("sentence-words=" + bin(m.sentenceWords.size()));
  features.incrementCount("mention-position", m.mentionNum / (double) doc.predictedMentions.size());
  features.incrementCount("sentence-position", m.sentNum / (double) doc.numSentences);
  // lexical features
  CoreLabel firstWord = firstWord(m);
  CoreLabel lastWord = lastWord(m);
  CoreLabel headWord = headWord(m);
  CoreLabel prevWord = prevWord(m);
  CoreLabel nextWord = nextWord(m);
  CoreLabel prevprevWord = prevprevWord(m);
  CoreLabel nextnextWord = nextnextWord(m);
  String headPOS = getPOS(headWord);
  String firstPOS = getPOS(firstWord);
  String lastPOS = getPOS(lastWord);
  String prevPOS = getPOS(prevWord);
  String nextPOS = getPOS(nextWord);
  String prevprevPOS = getPOS(prevprevWord);
  String nextnextPOS = getPOS(nextnextWord);
  features.incrementCount("first-word=" + wordIndicator(firstWord, firstPOS));
  features.incrementCount("last-word=" + wordIndicator(lastWord, lastPOS));
  features.incrementCount("head-word=" + wordIndicator(headWord, headPOS));
  features.incrementCount("next-word=" + wordIndicator(nextWord, nextPOS));
  features.incrementCount("prev-word=" + wordIndicator(prevWord, prevPOS));
  features.incrementCount("next-bigram=" + wordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
  features.incrementCount("prev-bigram=" + wordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
  features.incrementCount("next-pos=" + nextPOS);
  features.incrementCount("prev-pos=" + prevPOS);
  features.incrementCount("first-pos=" + firstPOS);
  features.incrementCount("last-pos=" + lastPOS);
  features.incrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
  features.incrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
  addDependencyFeatures(features, "parent", getDependencyParent(m), true);
  addFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
  addFeature(features, "is-generic", m.originalSpan.size() == 1 && firstPOS.equals("NNS"));
  // syntax features
  IndexedWord w = m.headIndexedWord;
  String depPath = "";
  int depth = 0;
  while (w != null) {
    SemanticGraphEdge e = getDependencyParent(m, w);
    depth++;
    if (depth <= 3 && e != null) {
      depPath += (depPath.isEmpty() ? "" : "_") + e.getRelation().toString();
      features.incrementCount("dep-path=" + depPath);
      w = e.getSource();
    } else {
      w = null;
    }
  }
  if (useConstituencyParse) {
    int fullEmbeddingLevel = headEmbeddingLevel(m.contextParseTree, m.headIndex);
    int mentionEmbeddingLevel = headEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
    if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1) {
      features.incrementCount("mention-embedding-level=" + bin(fullEmbeddingLevel - mentionEmbeddingLevel));
      features.incrementCount("head-embedding-level=" + bin(mentionEmbeddingLevel));
    } else {
      features.incrementCount("undetermined-embedding-level");
    }
    features.incrementCount("num-embedded-nps=" + bin(numEmbeddedNps(m.mentionSubTree)));
    String syntaxPath = "";
    Tree tree = m.contextParseTree;
    Tree head = tree.getLeaves().get(m.headIndex).ancestor(1, tree);
    depth = 0;
    for (Tree node : tree.pathNodeToNode(head, tree)) {
      syntaxPath += node.value() + "-";
      features.incrementCount("syntax-path=" + syntaxPath);
      depth++;
      if (depth >= 4 || node.value().equals("S")) {
        break;
      }
    }
  }
  // mention containment features
  addFeature(features, "contained-in-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)));
  addFeature(features, "contains-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m2.insideIn(m)));
  // features from dcoref rules
  addFeature(features, "bare-plural", m.originalSpan.size() == 1 && headPOS.equals("NNS"));
  addFeature(features, "quantifier-start", dictionaries.quantifiers.contains(firstWord.word().toLowerCase()));
  addFeature(features, "negative-start", firstWord.word().toLowerCase().matches("none|no|nothing|not"));
  addFeature(features, "partitive", RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dictionaries));
  addFeature(features, "adjectival-demonym", dictionaries.isAdjectivalDemonym(m.spanToString()));
  if (doc.docType != DocType.ARTICLE && m.person == Person.YOU && nextWord != null && nextWord.word().equalsIgnoreCase("know")) {
    features.incrementCount("generic-you");
  }
  return features;
}
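The helpers addNumeric, bin, addFeature, and wordIndicator are called above but not shown in this excerpt. A minimal sketch of what they plausibly do, for orientation only (these bodies, and the vocabulary set, are assumptions, not the actual CoreNLP implementations):

// Hypothetical reconstructions of the unshown helpers; the real
// FeatureExtractor in CoreNLP may differ in details.
private final Set<String> vocabulary = new HashSet<>(); // assumed frequent-word set

private static void addFeature(Counter<String> features, String name, boolean value) {
  // Boolean features become simple indicator counts.
  if (value) {
    features.incrementCount(name);
  }
}

private static void addNumeric(Counter<String> features, String name, int value) {
  // Numeric features are emitted as binned indicators.
  features.incrementCount(name + "=" + bin(value));
}

private static String bin(int value) {
  // Collapse large values into coarse buckets so sparse counts share statistics.
  if (value <= 5) {
    return String.valueOf(value);
  }
  return value <= 10 ? "6-10" : "10+";
}

private String wordIndicator(CoreLabel word, String pos) {
  // Back off missing or out-of-vocabulary words to the POS tag; the two-word
  // overload used for the bigram features would concatenate two of these.
  if (word == null) {
    return "NONE";
  }
  String w = word.word().toLowerCase();
  return vocabulary.contains(w) ? w : pos;
}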
Use of edu.stanford.nlp.semgraph.SemanticGraphEdge in project CoreNLP by stanfordnlp.
The class OpenIE, method entailmentsFromClause.
/**
 * Returns all of the entailed shortened clauses (as per natural logic) from the given clause.
 * This runs the forward entailment component of the OpenIE system only.
 * It is usually chained together with the clause splitting component: {@link OpenIE#clausesInSentence(CoreMap)}.
 *
 * @param clause The premise clause, as a sentence fragment in itself.
 *
 * @return A list of entailed clauses.
 */
@SuppressWarnings("unchecked")
public List<SentenceFragment> entailmentsFromClause(SentenceFragment clause) {
  if (clause.parseTree.isEmpty()) {
    return Collections.emptyList();
  } else {
    // Get the forward entailments
    List<SentenceFragment> list = new ArrayList<>();
    if (entailmentsPerSentence > 0) {
      list.addAll(forwardEntailer.apply(clause.parseTree, true).search().stream().map(x -> x.changeScore(x.score * clause.score)).collect(Collectors.toList()));
    }
    list.add(clause);
    // A special case for adjective entailments
    List<SentenceFragment> adjFragments = new ArrayList<>();
    SemgrexMatcher matcher = adjectivePattern.matcher(clause.parseTree);
    OUTER: while (matcher.find()) {
      // (get nodes)
      IndexedWord subj = matcher.getNode("subj");
      IndexedWord be = matcher.getNode("be");
      IndexedWord adj = matcher.getNode("adj");
      IndexedWord obj = matcher.getNode("obj");
      IndexedWord pobj = matcher.getNode("pobj");
      String prep = matcher.getRelnString("prep");
      // (if the adjective, or any earlier adjective, is privative, then all bets are off)
      for (SemanticGraphEdge edge : clause.parseTree.outgoingEdgeIterable(obj)) {
        if ("amod".equals(edge.getRelation().toString()) && edge.getDependent().index() <= adj.index() && Util.PRIVATIVE_ADJECTIVES.contains(edge.getDependent().word().toLowerCase())) {
          continue OUTER;
        }
      }
      // (create the core tree)
      SemanticGraph tree = new SemanticGraph();
      tree.addRoot(adj);
      tree.addVertex(subj);
      tree.addVertex(be);
      tree.addEdge(adj, be, GrammaticalRelation.valueOf(Language.English, "cop"), Double.NEGATIVE_INFINITY, false);
      tree.addEdge(adj, subj, GrammaticalRelation.valueOf(Language.English, "nsubj"), Double.NEGATIVE_INFINITY, false);
      // (add pp attachment, if it existed)
      if (pobj != null) {
        assert prep != null;
        tree.addEdge(adj, pobj, GrammaticalRelation.valueOf(Language.English, prep), Double.NEGATIVE_INFINITY, false);
      }
      // (check for monotonicity)
      if (adj.get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards() && be.get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards()) {
        // (add tree)
        adjFragments.add(new SentenceFragment(tree, clause.assumedTruth, false));
      }
    }
    list.addAll(adjFragments);
    return list;
  }
}
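This method is normally reached through the OpenIE annotator rather than called directly. A minimal sketch of that entry point, assuming a default pipeline configuration (natlog must precede openie, since the code above reads PolarityAnnotation off each token):

// Sketch only; imports from edu.stanford.nlp.* omitted for brevity,
// matching the excerpts above.
public static void main(String[] args) {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation doc = new Annotation("Obama was born in Hawaii.");
  pipeline.annotate(doc);
  for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
    // Each fragment is produced by the clause splitter plus the forward
    // entailer shown above.
    for (SentenceFragment fragment : sentence.get(NaturalLogicAnnotations.EntailedSentencesAnnotation.class)) {
      System.out.println(fragment.score + "\t" + fragment);
    }
  }
}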
Use of edu.stanford.nlp.semgraph.SemanticGraphEdge in project CoreNLP by stanfordnlp.
The class OpenIE, method canonicalizeCoref.
/**
 * Create a copy of the passed parse tree, canonicalizing pronominal nodes with their canonical mention.
 * Canonical mentions are tied together with the <i>compound</i> dependency arc; otherwise, the structure of
 * the tree remains unchanged.
 *
 * @param parse The original dependency parse of the sentence.
 * @param canonicalMentionMap The map from tokens to their canonical mentions.
 *
 * @return A <b>copy</b> of the passed parse tree, with pronouns replaced with their canonical mention.
 */
private static SemanticGraph canonicalizeCoref(SemanticGraph parse, Map<CoreLabel, List<CoreLabel>> canonicalMentionMap) {
  parse = new SemanticGraph(parse);
  for (IndexedWord node : new HashSet<>(parse.vertexSet())) {
    // copy the vertex set to prevent ConcurrentModificationExceptions
    if (node.tag() != null && node.tag().startsWith("PRP")) {
      List<CoreLabel> canonicalMention = canonicalMentionMap.get(node.backingLabel());
      if (canonicalMention != null) {
        // Case: this node is a pronoun with a valid antecedent.
        // 1. Save the attaching edges
        List<SemanticGraphEdge> incomingEdges = parse.incomingEdgeList(node);
        List<SemanticGraphEdge> outgoingEdges = parse.outgoingEdgeList(node);
        // 2. Remove the node
        parse.removeVertex(node);
        // 3. Add the new head word
        IndexedWord headWord = new IndexedWord(canonicalMention.get(canonicalMention.size() - 1));
        headWord.setPseudoPosition(node.pseudoPosition());
        parse.addVertex(headWord);
        for (SemanticGraphEdge edge : incomingEdges) {
          parse.addEdge(edge.getGovernor(), headWord, edge.getRelation(), edge.getWeight(), edge.isExtra());
        }
        for (SemanticGraphEdge edge : outgoingEdges) {
          parse.addEdge(headWord, edge.getDependent(), edge.getRelation(), edge.getWeight(), edge.isExtra());
        }
        // 4. Add other words
        double pseudoPosition = headWord.pseudoPosition() - 1e-3;
        for (int i = canonicalMention.size() - 2; i >= 0; --i) {
          // Create the node
          IndexedWord dependent = new IndexedWord(canonicalMention.get(i));
          // Set its pseudo position appropriately
          dependent.setPseudoPosition(pseudoPosition);
          pseudoPosition -= 1e-3;
          // Add the node to the graph
          parse.addVertex(dependent);
          parse.addEdge(headWord, dependent, UniversalEnglishGrammaticalRelations.COMPOUND_MODIFIER, 1.0, false);
        }
      }
    }
  }
  return parse;
}
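The fractional pseudo-positions are what keep a multi-token canonical mention in surface order when the rewritten graph is later linearized. A small illustration of the bookkeeping, assuming coref resolved the pronoun "He" (pseudo-position 1.0) to the mention "President Obama" (values illustrative, not from a real run):

// After the rewrite above:
//   "Obama"     (new head)           pseudoPosition() == 1.0    (inherits the pronoun's slot)
//   "President" (compound dependent) pseudoPosition() == 0.999  (1.0 - 1e-3)
// Sorting vertices by pseudo-position therefore yields the inserted tokens
// in order, e.g. "President Obama spoke".
static String linearize(SemanticGraph parse) {
  List<IndexedWord> sorted = new ArrayList<>(parse.vertexSet());
  sorted.sort(Comparator.comparingDouble(IndexedWord::pseudoPosition));
  return sorted.stream().map(IndexedWord::word).collect(Collectors.joining(" "));
}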
Use of edu.stanford.nlp.semgraph.SemanticGraphEdge in project CoreNLP by stanfordnlp.
The class RelationTripleSegmenter, method getValidChunk.
/**
 * @see RelationTripleSegmenter#getValidSubjectChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
 * @see RelationTripleSegmenter#getValidObjectChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
 * @see RelationTripleSegmenter#getValidAdverbChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
 */
@SuppressWarnings("StatementWithEmptyBody")
protected Optional<List<IndexedWord>> getValidChunk(SemanticGraph parse, IndexedWord originalRoot, Set<String> validArcs, Optional<String> ignoredArc, boolean allowExtraArcs) {
  PriorityQueue<IndexedWord> chunk = new FixedPrioritiesPriorityQueue<>();
  Set<Double> seenIndices = new HashSet<>();
  Queue<IndexedWord> fringe = new LinkedList<>();
  IndexedWord root = originalRoot;
  fringe.add(root);
  boolean isCopula = false;
  IndexedWord primaryCase = null;
  for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(originalRoot)) {
    String shortName = edge.getRelation().getShortName();
    if (shortName.equals("cop") || shortName.equals("auxpass")) {
      isCopula = true;
    }
    if (shortName.equals("case")) {
      primaryCase = edge.getDependent();
    }
  }
  while (!fringe.isEmpty()) {
    root = fringe.poll();
    chunk.add(root, -root.pseudoPosition());
    // Sanity check to prevent infinite loops
    if (seenIndices.contains(root.pseudoPosition())) {
      // TODO(gabor) Indicates a cycle in the tree!
      return Optional.empty();
    }
    seenIndices.add(root.pseudoPosition());
    // Check outgoing edges
    boolean hasConj = false;
    boolean hasCC = false;
    for (SemanticGraphEdge edge : parse.getOutEdgesSorted(root)) {
      String shortName = edge.getRelation().getShortName();
      String name = edge.getRelation().toString();
      if (shortName.startsWith("conj")) {
        hasConj = true;
      }
      if (shortName.equals("cc")) {
        hasCC = true;
      }
      //noinspection StatementWithEmptyBody
      if (isCopula && (shortName.equals("cop") || shortName.contains("subj") || shortName.equals("auxpass"))) {
        // noop; ignore nsubj, cop for extractions with copula
      } else if (edge.getDependent() == primaryCase) {
        // noop: ignore case edge
      } else if (ignoredArc.isPresent() && (ignoredArc.get().equals(name) || (ignoredArc.get().startsWith("conj") && name.equals("cc")))) {
        // noop; ignore explicitly requested noop arc, or "CC" if the noop arc is a conj:*
      } else if (!validArcs.contains(edge.getRelation().getShortName()) && !validArcs.contains(edge.getRelation().getShortName().replaceAll(":.*", ":*"))) {
        if (!allowExtraArcs) {
          return Optional.empty();
        } else {
          // noop: just some dangling arc
        }
      } else {
        fringe.add(edge.getDependent());
      }
    }
    // Ensure that we don't have a conj without a cc, or vice versa
    if (Boolean.logicalXor(hasConj, hasCC)) {
      return Optional.empty();
    }
  }
  return Optional.of(chunk.toSortedList());
}
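A sketch of how a caller might invoke this helper to expand a subject head into its full noun chunk. The arc whitelist below is illustrative only; the real whitelists live in constants such as VALID_SUBJECT_ARCS:

// Illustrative caller; the arc names are examples, not the actual whitelist.
void demoChunk(SemanticGraph parse, IndexedWord subjectHead) {
  Set<String> validArcs = new HashSet<>(Arrays.asList("amod", "det", "compound", "nummod", "nmod:poss"));
  Optional<List<IndexedWord>> chunk = getValidChunk(parse, subjectHead, validArcs, Optional.empty(), false);
  // An empty Optional signals a disallowed arc, a cycle in the graph,
  // or a conj without a matching cc (or vice versa).
  chunk.ifPresent(words -> System.out.println(
      words.stream().map(IndexedWord::word).collect(Collectors.joining(" "))));
}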
Use of edu.stanford.nlp.semgraph.SemanticGraphEdge in project CoreNLP by stanfordnlp.
The class RelationTripleSegmenter, method extract.
/**
 * Extract the nominal patterns from this sentence.
 *
 * @see RelationTripleSegmenter#NOUN_TOKEN_PATTERNS
 * @see RelationTripleSegmenter#NOUN_DEPENDENCY_PATTERNS
 *
 * @param parse The parse tree of the sentence to annotate.
 * @param tokens The tokens of the sentence to annotate.
 * @return A list of {@link RelationTriple}s. Note that these do not have an associated tree with them.
 */
@SuppressWarnings("unchecked")
public List<RelationTriple> extract(SemanticGraph parse, List<CoreLabel> tokens) {
  List<RelationTriple> extractions = new ArrayList<>();
  Set<Triple<Span, String, Span>> alreadyExtracted = new HashSet<>();
  // Run the token-sequence (surface) patterns
  for (TokenSequencePattern tokenPattern : NOUN_TOKEN_PATTERNS) {
    TokenSequenceMatcher tokenMatcher = tokenPattern.matcher(tokens);
    while (tokenMatcher.find()) {
      boolean missingPrefixBe;
      boolean missingSuffixOf = false;
      // Create subject
      List<? extends CoreMap> subject = tokenMatcher.groupNodes("$subject");
      Span subjectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) subject.get(0)).index() - 1, ((CoreLabel) subject.get(subject.size() - 1)).index()));
      List<CoreLabel> subjectTokens = new ArrayList<>();
      for (int i : subjectSpan) {
        subjectTokens.add(tokens.get(i));
      }
      // Create object
      List<? extends CoreMap> object = tokenMatcher.groupNodes("$object");
      Span objectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) object.get(0)).index() - 1, ((CoreLabel) object.get(object.size() - 1)).index()));
      if (Span.overlaps(subjectSpan, objectSpan)) {
        continue;
      }
      List<CoreLabel> objectTokens = new ArrayList<>();
      for (int i : objectSpan) {
        objectTokens.add(tokens.get(i));
      }
      // Create relation
      if (subjectTokens.size() > 0 && objectTokens.size() > 0) {
        List<CoreLabel> relationTokens = new ArrayList<>();
        // (add the 'be')
        missingPrefixBe = true;
        // (add a complement to the 'be')
        List<? extends CoreMap> beofComp = tokenMatcher.groupNodes("$beof_comp");
        if (beofComp != null) {
          // (add the complement)
          for (CoreMap token : beofComp) {
            if (token instanceof CoreLabel) {
              relationTokens.add((CoreLabel) token);
            } else {
              relationTokens.add(new CoreLabel(token));
            }
          }
          // (add the 'of')
          missingSuffixOf = true;
        }
        // Add extraction
        String relationGloss = StringUtils.join(relationTokens.stream().map(CoreLabel::word), " ");
        if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
          RelationTriple extraction = new RelationTriple(subjectTokens, relationTokens, objectTokens);
          //noinspection ConstantConditions
          extraction.isPrefixBe(missingPrefixBe);
          extraction.isSuffixOf(missingSuffixOf);
          extractions.add(extraction);
          alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
        }
      }
    }
    // Run the dependency (semgrex) patterns
    for (SemgrexPattern semgrex : NOUN_DEPENDENCY_PATTERNS) {
      SemgrexMatcher matcher = semgrex.matcher(parse);
      while (matcher.find()) {
        boolean missingPrefixBe = false;
        boolean missingSuffixBe = false;
        boolean istmod = false;
        // Get relaux if applicable
        String relaux = matcher.getRelnString("relaux");
        String ignoredArc = relaux;
        if (ignoredArc == null) {
          ignoredArc = matcher.getRelnString("arc");
        }
        // Create subject
        IndexedWord subject = matcher.getNode("subject");
        List<IndexedWord> subjectTokens = new ArrayList<>();
        Span subjectSpan;
        if (subject.ner() != null && !"O".equals(subject.ner())) {
          subjectSpan = Util.extractNER(tokens, Span.fromValues(subject.index() - 1, subject.index()));
          for (int i : subjectSpan) {
            subjectTokens.add(new IndexedWord(tokens.get(i)));
          }
        } else {
          subjectTokens = getValidChunk(parse, subject, VALID_SUBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(subject));
          subjectSpan = Util.tokensToSpan(subjectTokens);
        }
        // Create object
        IndexedWord object = matcher.getNode("object");
        List<IndexedWord> objectTokens = new ArrayList<>();
        Span objectSpan;
        if (object.ner() != null && !"O".equals(object.ner())) {
          objectSpan = Util.extractNER(tokens, Span.fromValues(object.index() - 1, object.index()));
          for (int i : objectSpan) {
            objectTokens.add(new IndexedWord(tokens.get(i)));
          }
        } else {
          objectTokens = getValidChunk(parse, object, VALID_OBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(object));
          objectSpan = Util.tokensToSpan(objectTokens);
        }
        // Check that the pair is valid
        if (Span.overlaps(subjectSpan, objectSpan)) {
          // We extracted an identity
          continue;
        }
        if (subjectSpan.end() == objectSpan.start() - 1 && (tokens.get(subjectSpan.end()).word().matches("[\\.,:;\\('\"]") || "CC".equals(tokens.get(subjectSpan.end()).tag()))) {
          // We're straddling a clause
          continue;
        }
        if (objectSpan.end() == subjectSpan.start() - 1 && (tokens.get(objectSpan.end()).word().matches("[\\.,:;\\('\"]") || "CC".equals(tokens.get(objectSpan.end()).tag()))) {
          // We're straddling a clause
          continue;
        }
        // Get any prepositional edges
        String expected = relaux == null ? "" : relaux.substring(relaux.indexOf(":") + 1).replace("_", " ");
        IndexedWord prepWord = null;
        // (these usually come from the object)
        boolean prepositionIsPrefix = false;
        for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(object)) {
          if (edge.getRelation().toString().equals("case")) {
            prepWord = edge.getDependent();
          }
        }
        // (...but sometimes from the subject)
        if (prepWord == null) {
          for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(subject)) {
            if (edge.getRelation().toString().equals("case")) {
              prepositionIsPrefix = true;
              prepWord = edge.getDependent();
            }
          }
        }
        List<IndexedWord> prepChunk = Collections.EMPTY_LIST;
        if (prepWord != null && !expected.equals("tmod")) {
          Optional<List<IndexedWord>> optionalPrepChunk = getValidChunk(parse, prepWord, Collections.singleton("mwe"), Optional.empty(), true);
          if (!optionalPrepChunk.isPresent()) {
            continue;
          }
          prepChunk = optionalPrepChunk.get();
          Collections.sort(prepChunk, (a, b) -> {
            double val = a.pseudoPosition() - b.pseudoPosition();
            if (val < 0) {
              return -1;
            }
            if (val > 0) {
              return 1;
            } else {
              return 0;
            }
          });
          // ascending sort
        }
        // Get the relation
        if (subjectTokens.size() > 0 && objectTokens.size() > 0) {
          LinkedList<IndexedWord> relationTokens = new LinkedList<>();
          IndexedWord relNode = matcher.getNode("relation");
          if (relNode != null) {
            // Case: we have a grounded relation span
            // (add the relation)
            relationTokens.add(relNode);
            // (add any prepositional case markings)
            if (prepositionIsPrefix) {
              // We're almost certainly missing a suffix 'be'
              missingSuffixBe = true;
              for (int i = prepChunk.size() - 1; i >= 0; --i) {
                relationTokens.addFirst(prepChunk.get(i));
              }
            } else {
              relationTokens.addAll(prepChunk);
            }
            if (expected.equalsIgnoreCase("tmod")) {
              istmod = true;
            }
          } else {
            // (mark it as missing a preceding 'be')
            if (!expected.equals("poss")) {
              missingPrefixBe = true;
            }
            // (add any prepositional case markings)
            if (prepositionIsPrefix) {
              for (int i = prepChunk.size() - 1; i >= 0; --i) {
                relationTokens.addFirst(prepChunk.get(i));
              }
            } else {
              relationTokens.addAll(prepChunk);
            }
            if (expected.equalsIgnoreCase("tmod")) {
              istmod = true;
            }
            // (some fine-tuning)
            if (allowNominalsWithoutNER && "of".equals(expected)) {
              // prohibit things like "conductor of electricity" -> "conductor; be of; electricity"
              continue;
            }
          }
          // Add extraction
          String relationGloss = StringUtils.join(relationTokens.stream().map(IndexedWord::word), " ");
          if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
            RelationTriple extraction = new RelationTriple(subjectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()), relationTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()), objectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()));
            extraction.istmod(istmod);
            extraction.isPrefixBe(missingPrefixBe);
            extraction.isSuffixBe(missingSuffixBe);
            extractions.add(extraction);
            alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
          }
        }
      }
    }
  }
  //
  // Filter downward polarity extractions
  //
  Iterator<RelationTriple> iter = extractions.iterator();
  while (iter.hasNext()) {
    RelationTriple term = iter.next();
    boolean shouldRemove = true;
    for (CoreLabel token : term) {
      if (token.get(NaturalLogicAnnotations.PolarityAnnotation.class) == null || !token.get(NaturalLogicAnnotations.PolarityAnnotation.class).isDownwards()) {
        shouldRemove = false;
      }
    }
    if (shouldRemove) {
      // Don't extract things in downward polarity contexts.
      iter.remove();
    }
  }
  // Return
  return extractions;
}
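Downstream code usually reads the finished triples off the sentence annotation rather than calling extract directly. A minimal consumption sketch, assuming sentence is a CoreMap from a pipeline run with the openie annotator as in the earlier sketch:

void printTriples(CoreMap sentence) {
  // RelationTriplesAnnotation holds the extractions for this sentence.
  Collection<RelationTriple> triples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
  for (RelationTriple triple : triples) {
    System.out.println(triple.confidence + "\t" + triple.subjectGloss() + "\t" + triple.relationGloss() + "\t" + triple.objectGloss());
  }
}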