Usage of edu.stanford.nlp.ling.IndexedWord in the stanfordnlp/CoreNLP project:
class RelationTripleSegmenter, method getValidChunk.
/**
 * Gather the dependency subtree rooted at {@code originalRoot}, following only arcs whose
 * relation is in {@code validArcs}, and return its nodes in sentence order. Returns
 * {@link Optional#empty()} when the subtree contains a cycle, an unexpected arc (unless
 * {@code allowExtraArcs} is set), or a coordination fragment (a conj without a cc, or vice versa).
 *
 * @see RelationTripleSegmenter#getValidSubjectChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
 * @see RelationTripleSegmenter#getValidObjectChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
 * @see RelationTripleSegmenter#getValidAdverbChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
 */
@SuppressWarnings("StatementWithEmptyBody")
protected Optional<List<IndexedWord>> getValidChunk(SemanticGraph parse, IndexedWord originalRoot, Set<String> validArcs, Optional<String> ignoredArc, boolean allowExtraArcs) {
  PriorityQueue<IndexedWord> chunk = new FixedPrioritiesPriorityQueue<>();
  Set<Double> visitedPositions = new HashSet<>();
  Queue<IndexedWord> toVisit = new LinkedList<>();
  toVisit.add(originalRoot);

  // Inspect the root's direct children once up front: a cop/auxpass child flags a copular
  // construction (so subject/copula arcs below get skipped), and the (last) "case" child is
  // remembered so it can be excluded from the chunk.
  boolean rootIsCopula = false;
  IndexedWord caseMarker = null;
  for (SemanticGraphEdge rootEdge : parse.outgoingEdgeIterable(originalRoot)) {
    String rel = rootEdge.getRelation().getShortName();
    if ("cop".equals(rel) || "auxpass".equals(rel)) {
      rootIsCopula = true;
    }
    if ("case".equals(rel)) {
      caseMarker = rootEdge.getDependent();
    }
  }

  // Breadth-first traversal of the subtree; the priority queue keeps nodes in sentence order.
  while (!toVisit.isEmpty()) {
    IndexedWord node = toVisit.poll();
    chunk.add(node, -node.pseudoPosition());
    // Sanity check to prevent infinite loops: a repeated position means a cycle in the tree.
    if (!visitedPositions.add(node.pseudoPosition())) {
      return Optional.empty();
    }
    boolean sawConj = false;
    boolean sawCC = false;
    for (SemanticGraphEdge edge : parse.getOutEdgesSorted(node)) {
      String shortName = edge.getRelation().getShortName();
      String fullName = edge.getRelation().toString();
      if (shortName.startsWith("conj")) {
        sawConj = true;
      }
      if (shortName.equals("cc")) {
        sawCC = true;
      }
      //noinspection StatementWithEmptyBody
      if (rootIsCopula && (shortName.equals("cop") || shortName.contains("subj") || shortName.equals("auxpass"))) {
        // skip: nsubj/cop/auxpass are not part of a copular extraction
      } else if (edge.getDependent() == caseMarker) {
        // skip: the case marker is handled separately by the caller
      } else if (ignoredArc.isPresent()
          && (ignoredArc.get().equals(fullName) || (ignoredArc.get().startsWith("conj") && fullName.equals("cc")))) {
        // skip: the explicitly requested ignored arc, plus "cc" when the ignored arc is a conj:*
      } else if (!validArcs.contains(shortName) && !validArcs.contains(shortName.replaceAll(":.*", ":*"))) {
        if (!allowExtraArcs) {
          return Optional.empty();
        }
        // otherwise: a dangling arc we are permitted to leave out of the chunk
      } else {
        toVisit.add(edge.getDependent());
      }
    }
    // A conj without a cc (or vice versa) indicates a fragmentary coordination; reject it.
    if (sawConj != sawCC) {
      return Optional.empty();
    }
  }
  return Optional.of(chunk.toSortedList());
}
Usage of edu.stanford.nlp.ling.IndexedWord in the stanfordnlp/CoreNLP project:
class RelationTripleSegmenter, method extract.
/**
 * Extract the nominal patterns from this sentence.
 *
 * @see RelationTripleSegmenter#NOUN_TOKEN_PATTERNS
 * @see RelationTripleSegmenter#NOUN_DEPENDENCY_PATTERNS
 *
 * @param parse The parse tree of the sentence to annotate.
 * @param tokens The tokens of the sentence to annotate.
 * @return A list of {@link RelationTriple}s. Note that these do not have an associated tree with them.
 */
@SuppressWarnings("unchecked")
public List<RelationTriple> extract(SemanticGraph parse, List<CoreLabel> tokens) {
  List<RelationTriple> extractions = new ArrayList<>();
  // Deduplication key: (subject span, relation gloss, object span).
  Set<Triple<Span, String, Span>> alreadyExtracted = new HashSet<>();
  //
  // Pass 1: surface-level (TokensRegex) noun patterns
  //
  for (TokenSequencePattern tokenPattern : NOUN_TOKEN_PATTERNS) {
    TokenSequenceMatcher tokenMatcher = tokenPattern.matcher(tokens);
    while (tokenMatcher.find()) {
      boolean missingPrefixBe;
      boolean missingSuffixOf = false;
      // Create subject
      List<? extends CoreMap> subject = tokenMatcher.groupNodes("$subject");
      Span subjectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) subject.get(0)).index() - 1, ((CoreLabel) subject.get(subject.size() - 1)).index()));
      List<CoreLabel> subjectTokens = new ArrayList<>();
      for (int i : subjectSpan) {
        subjectTokens.add(tokens.get(i));
      }
      // Create object
      List<? extends CoreMap> object = tokenMatcher.groupNodes("$object");
      Span objectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) object.get(0)).index() - 1, ((CoreLabel) object.get(object.size() - 1)).index()));
      if (Span.overlaps(subjectSpan, objectSpan)) {
        // Subject and object overlap: this would be an identity extraction
        continue;
      }
      List<CoreLabel> objectTokens = new ArrayList<>();
      for (int i : objectSpan) {
        objectTokens.add(tokens.get(i));
      }
      // Create relation
      if (!subjectTokens.isEmpty() && !objectTokens.isEmpty()) {
        List<CoreLabel> relationTokens = new ArrayList<>();
        // (add the 'be')
        missingPrefixBe = true;
        // (add a complement to the 'be')
        List<? extends CoreMap> beofComp = tokenMatcher.groupNodes("$beof_comp");
        if (beofComp != null) {
          // (add the complement
          for (CoreMap token : beofComp) {
            if (token instanceof CoreLabel) {
              relationTokens.add((CoreLabel) token);
            } else {
              relationTokens.add(new CoreLabel(token));
            }
          }
          // (add the 'of')
          missingSuffixOf = true;
        }
        // Add extraction; Set.add returns false if this triple was already emitted,
        // so a single lookup replaces the old contains()+add() pair.
        String relationGloss = StringUtils.join(relationTokens.stream().map(CoreLabel::word), " ");
        Triple<Span, String, Span> key = Triple.makeTriple(subjectSpan, relationGloss, objectSpan);
        if (alreadyExtracted.add(key)) {
          RelationTriple extraction = new RelationTriple(subjectTokens, relationTokens, objectTokens);
          //noinspection ConstantConditions
          extraction.isPrefixBe(missingPrefixBe);
          extraction.isSuffixOf(missingSuffixOf);
          extractions.add(extraction);
        }
      }
    }
  }
  //
  // Pass 2: dependency-level (Semgrex) noun patterns
  //
  for (SemgrexPattern semgrex : NOUN_DEPENDENCY_PATTERNS) {
    SemgrexMatcher matcher = semgrex.matcher(parse);
    while (matcher.find()) {
      boolean missingPrefixBe = false;
      boolean missingSuffixBe = false;
      boolean istmod = false;
      // Get relaux if applicable
      String relaux = matcher.getRelnString("relaux");
      String ignoredArc = relaux;
      if (ignoredArc == null) {
        ignoredArc = matcher.getRelnString("arc");
      }
      // Create subject; named entities expand to their whole NER span,
      // otherwise we chunk the dependency subtree.
      IndexedWord subject = matcher.getNode("subject");
      List<IndexedWord> subjectTokens = new ArrayList<>();
      Span subjectSpan;
      if (subject.ner() != null && !"O".equals(subject.ner())) {
        subjectSpan = Util.extractNER(tokens, Span.fromValues(subject.index() - 1, subject.index()));
        for (int i : subjectSpan) {
          subjectTokens.add(new IndexedWord(tokens.get(i)));
        }
      } else {
        subjectTokens = getValidChunk(parse, subject, VALID_SUBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(subject));
        subjectSpan = Util.tokensToSpan(subjectTokens);
      }
      // Create object (same strategy as the subject)
      IndexedWord object = matcher.getNode("object");
      List<IndexedWord> objectTokens = new ArrayList<>();
      Span objectSpan;
      if (object.ner() != null && !"O".equals(object.ner())) {
        objectSpan = Util.extractNER(tokens, Span.fromValues(object.index() - 1, object.index()));
        for (int i : objectSpan) {
          objectTokens.add(new IndexedWord(tokens.get(i)));
        }
      } else {
        objectTokens = getValidChunk(parse, object, VALID_OBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(object));
        objectSpan = Util.tokensToSpan(objectTokens);
      }
      // Check that the pair is valid
      if (Span.overlaps(subjectSpan, objectSpan)) {
        // We extracted an identity
        continue;
      }
      if (subjectSpan.end() == objectSpan.start() - 1 && (tokens.get(subjectSpan.end()).word().matches("[\\.,:;\\('\"]") || "CC".equals(tokens.get(subjectSpan.end()).tag()))) {
        // We're straddling a clause
        continue;
      }
      if (objectSpan.end() == subjectSpan.start() - 1 && (tokens.get(objectSpan.end()).word().matches("[\\.,:;\\('\"]") || "CC".equals(tokens.get(objectSpan.end()).tag()))) {
        // We're straddling a clause
        continue;
      }
      // Get any prepositional edges
      String expected = relaux == null ? "" : relaux.substring(relaux.indexOf(":") + 1).replace("_", " ");
      IndexedWord prepWord = null;
      // (these usually come from the object)
      boolean prepositionIsPrefix = false;
      for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(object)) {
        if (edge.getRelation().toString().equals("case")) {
          prepWord = edge.getDependent();
        }
      }
      // (...but sometimes from the subject)
      if (prepWord == null) {
        for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(subject)) {
          if (edge.getRelation().toString().equals("case")) {
            prepositionIsPrefix = true;
            prepWord = edge.getDependent();
          }
        }
      }
      // emptyList() instead of the raw-typed Collections.EMPTY_LIST (unchecked assignment)
      List<IndexedWord> prepChunk = Collections.emptyList();
      if (prepWord != null && !expected.equals("tmod")) {
        Optional<List<IndexedWord>> optionalPrepChunk = getValidChunk(parse, prepWord, Collections.singleton("mwe"), Optional.empty(), true);
        if (!optionalPrepChunk.isPresent()) {
          continue;
        }
        prepChunk = optionalPrepChunk.get();
        // ascending sort by sentence position
        prepChunk.sort((a, b) -> Double.compare(a.pseudoPosition(), b.pseudoPosition()));
      }
      // Get the relation
      if (!subjectTokens.isEmpty() && !objectTokens.isEmpty()) {
        LinkedList<IndexedWord> relationTokens = new LinkedList<>();
        IndexedWord relNode = matcher.getNode("relation");
        if (relNode != null) {
          // Case: we have a grounded relation span
          // (add the relation)
          relationTokens.add(relNode);
          // (add any prepositional case markings)
          if (prepositionIsPrefix) {
            // We're almost certainly missing a suffix 'be'
            missingSuffixBe = true;
            for (int i = prepChunk.size() - 1; i >= 0; --i) {
              relationTokens.addFirst(prepChunk.get(i));
            }
          } else {
            relationTokens.addAll(prepChunk);
          }
          if (expected.equalsIgnoreCase("tmod")) {
            istmod = true;
          }
        } else {
          // (mark it as missing a preceding 'be'
          if (!expected.equals("poss")) {
            missingPrefixBe = true;
          }
          // (add any prepositional case markings)
          if (prepositionIsPrefix) {
            for (int i = prepChunk.size() - 1; i >= 0; --i) {
              relationTokens.addFirst(prepChunk.get(i));
            }
          } else {
            relationTokens.addAll(prepChunk);
          }
          if (expected.equalsIgnoreCase("tmod")) {
            istmod = true;
          }
          // (some fine-tuning)
          if (allowNominalsWithoutNER && "of".equals(expected)) {
            // prohibit things like "conductor of electricity" -> "conductor; be of; electricity"
            continue;
          }
        }
        // Add extraction (single-lookup dedup, as above)
        String relationGloss = StringUtils.join(relationTokens.stream().map(IndexedWord::word), " ");
        Triple<Span, String, Span> key = Triple.makeTriple(subjectSpan, relationGloss, objectSpan);
        if (alreadyExtracted.add(key)) {
          RelationTriple extraction = new RelationTriple(subjectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()), relationTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()), objectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()));
          extraction.istmod(istmod);
          extraction.isPrefixBe(missingPrefixBe);
          extraction.isSuffixBe(missingSuffixBe);
          extractions.add(extraction);
        }
      }
    }
  }
  //
  // Filter downward polarity extractions
  //
  Iterator<RelationTriple> iter = extractions.iterator();
  while (iter.hasNext()) {
    RelationTriple term = iter.next();
    // Remove a triple only when EVERY token sits in a downward-polarity context.
    boolean shouldRemove = true;
    for (CoreLabel token : term) {
      if (token.get(NaturalLogicAnnotations.PolarityAnnotation.class) == null || !token.get(NaturalLogicAnnotations.PolarityAnnotation.class).isDownwards()) {
        shouldRemove = false;
      }
    }
    if (shouldRemove) {
      // Don't extract things in downward polarity contexts.
      iter.remove();
    }
  }
  // Return
  return extractions;
}
Usage of edu.stanford.nlp.ling.IndexedWord in the stanfordnlp/CoreNLP project:
class NaturalLogicAnnotator, method annotateUnaries.
/**
 * Annotate any unary quantifiers that weren't found in the main {@link NaturalLogicAnnotator#annotateOperators(CoreMap)} method.
 * @param sentence The sentence to annotate.
 */
private static void annotateUnaries(CoreMap sentence) {
  // Prefer the basic dependencies; fall back to enhanced if absent.
  SemanticGraph tree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  if (tree == null) {
    tree = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  }
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);

  // Build a mask of token indices already covered by some operator's quantifier span.
  boolean[] coveredByOperator = new boolean[tokens.size()];
  for (int tokIdx = 0; tokIdx < coveredByOperator.length; ++tokIdx) {
    OperatorSpec existing = tokens.get(tokIdx).get(OperatorAnnotation.class);
    if (existing == null) {
      continue;
    }
    for (int k = existing.quantifierBegin; k < existing.quantifierEnd; ++k) {
      coveredByOperator[k] = true;
    }
  }

  // Semgrex pass: look for unary quantifier candidates in the dependency tree.
  SemgrexMatcher matcher = UNARY_PATTERN.matcher(tree);
  while (matcher.find()) {
    IndexedWord quantifier = matcher.getNode("quantifier");
    String surface = quantifier.word().toLowerCase();
    if (surface.equals("a") || surface.equals("an") || surface.equals("the") || "CD".equals(quantifier.tag())) {
      // These are absurdly common, and uninformative, and we're just going to shoot ourselves in the foot from parsing errors and idiomatic expressions.
      continue;
    }
    IndexedWord subject = matcher.getNode("subject");
    // ... If there is not already an operator there
    if (coveredByOperator[quantifier.index() - 1]) {
      continue;
    }
    Optional<Triple<Operator, Integer, Integer>> quantifierInfo = validateQuantifierByHead(sentence, quantifier);
    // ... and if we found a quantifier span
    if (quantifierInfo.isPresent()) {
      // Then add the unary operator!
      OperatorSpec scope = computeScope(tree, quantifierInfo.get().first, subject,
          Pair.makePair(quantifierInfo.get().second, quantifierInfo.get().third),
          null, false, null, tokens.size());
      tokens.get(quantifier.index() - 1).set(OperatorAnnotation.class, scope);
    }
  }

  // TokensRegex pass: "doubt"-style triggers introduce general negative polarity.
  TokenSequenceMatcher tokenMatcher = DOUBT_PATTERN.matcher(tokens);
  while (tokenMatcher.find()) {
    List<CoreLabel> doubt = (List<CoreLabel>) tokenMatcher.groupNodes("$doubt");
    List<CoreLabel> target = (List<CoreLabel>) tokenMatcher.groupNodes("$target");
    for (CoreLabel trigger : doubt) {
      OperatorSpec spec = new OperatorSpec(Operator.GENERAL_NEG_POLARITY,
          trigger.index() - 1, trigger.index(),
          target.get(0).index() - 1, target.get(target.size() - 1).index(),
          0, 0, tokens.size());
      trigger.set(OperatorAnnotation.class, spec);
    }
  }
}
Usage of edu.stanford.nlp.ling.IndexedWord in the stanfordnlp/CoreNLP project:
class NaturalLogicAnnotator, method annotateOperators.
/**
 * Find the operators in this sentence, annotating the head word (only!) of each operator with the
 * {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.OperatorAnnotation}.
 *
 * @param sentence As in {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotator#doOneSentence(edu.stanford.nlp.pipeline.Annotation, edu.stanford.nlp.util.CoreMap)}
 */
private void annotateOperators(CoreMap sentence) {
  // Prefer the basic dependencies; fall back to enhanced if they are missing.
  SemanticGraph tree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  if (tree == null) {
    tree = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  }
  for (SemgrexPattern pattern : PATTERNS) {
    SemgrexMatcher matcher = pattern.matcher(tree);
    while (matcher.find()) {
      // Get terms
      IndexedWord properSubject = matcher.getNode("Subject");
      IndexedWord quantifier, subject;
      boolean namedEntityQuantifier = false;
      if (properSubject != null) {
        quantifier = subject = properSubject;
        namedEntityQuantifier = true;
      } else {
        quantifier = matcher.getNode("quantifier");
        subject = matcher.getNode("subject");
      }
      // Validate quantifier.
      // At the end of this block, quantifierInfo holds the operator and its span, if valid.
      Optional<Triple<Operator, Integer, Integer>> quantifierInfo;
      if (namedEntityQuantifier) {
        // named entities have the "all" semantics by default.
        if (!neQuantifiers) {
          continue;
        }
        // note: empty quantifier span given
        quantifierInfo = Optional.of(Triple.makeTriple(Operator.IMPLICIT_NAMED_ENTITY, quantifier.index(), quantifier.index()));
      } else {
        // find the quantifier, and return some info about it.
        quantifierInfo = validateQuantifierByHead(sentence, quantifier);
      }
      // (fix up 'there are')
      if ("be".equals(subject == null ? null : subject.lemma())) {
        boolean hasExpl = false;
        IndexedWord newSubject = null;
        for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(subject)) {
          if ("nsubj".equals(outgoingEdge.getRelation().toString())) {
            newSubject = outgoingEdge.getDependent();
          } else if ("expl".equals(outgoingEdge.getRelation().toString())) {
            hasExpl = true;
          }
        }
        if (hasExpl) {
          // NOTE(review): if no nsubj edge was found, subject becomes null here;
          // the null-checks below tolerate that.
          subject = newSubject;
        }
      }
      // (fix up '$n$ of')
      if ("CD".equals(subject == null ? null : subject.tag())) {
        for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(subject)) {
          String rel = outgoingEdge.getRelation().toString();
          if (rel.startsWith("nmod")) {
            subject = outgoingEdge.getDependent();
          }
        }
      }
      // Set tokens
      if (quantifierInfo.isPresent()) {
        // Compute span
        OperatorSpec scope = computeScope(tree, quantifierInfo.get().first, matcher.getNode("pivot"), Pair.makePair(quantifierInfo.get().second, quantifierInfo.get().third), subject, namedEntityQuantifier, matcher.getNode("object"), tokens.size());
        // Set annotation on the quantifier's head token (only!)
        CoreLabel token = sentence.get(CoreAnnotations.TokensAnnotation.class).get(quantifier.index() - 1);
        OperatorSpec oldScope = token.get(OperatorAnnotation.class);
        if (oldScope == null || oldScope.quantifierLength() < scope.quantifierLength() || oldScope.instance != scope.instance) {
          token.set(OperatorAnnotation.class, scope);
        } else {
          token.set(OperatorAnnotation.class, OperatorSpec.merge(oldScope, scope));
        }
      }
    }
  }
  // Ensure we didn't select overlapping quantifiers. For example, "a" and "a few" can often overlap.
  // In these cases, take the longer quantifier match.
  List<OperatorSpec> quantifiers = new ArrayList<>();
  sentence.get(CoreAnnotations.TokensAnnotation.class).stream().filter(token -> token.containsKey(OperatorAnnotation.class)).forEach(token -> quantifiers.add(token.get(OperatorAnnotation.class)));
  // Longest quantifier first; Integer.compare avoids the overflow risk of subtraction-based comparators.
  quantifiers.sort((x, y) -> Integer.compare(y.quantifierLength(), x.quantifierLength()));
  for (OperatorSpec quantifier : quantifiers) {
    for (int i = quantifier.quantifierBegin; i < quantifier.quantifierEnd; ++i) {
      if (i != quantifier.quantifierHead) {
        tokens.get(i).remove(OperatorAnnotation.class);
      }
    }
  }
}
Usage of edu.stanford.nlp.ling.IndexedWord in the stanfordnlp/CoreNLP project:
class NaturalLogicAnnotator, method annotatePolarity.
/**
 * Annotate every token for its polarity, based on the operators found. This function will set the
 * {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.PolarityAnnotation} for every token.
 *
 * @param sentence As in {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotator#doOneSentence(edu.stanford.nlp.pipeline.Annotation, edu.stanford.nlp.util.CoreMap)}
 */
private static void annotatePolarity(CoreMap sentence) {
  // Collect all the operators in this sentence
  List<OperatorSpec> operators = new ArrayList<>();
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  for (CoreLabel token : tokens) {
    OperatorSpec specOrNull = token.get(OperatorAnnotation.class);
    if (specOrNull != null) {
      operators.add(specOrNull);
    }
  }
  // Give every dependency-graph vertex a default polarity first, since the
  // dependency tree is put together haphazardly.
  if (sentence.containsKey(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class)) {
    for (IndexedWord token : sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class).vertexSet()) {
      token.set(PolarityAnnotation.class, Polarity.DEFAULT);
    }
  }
  if (sentence.containsKey(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class)) {
    for (IndexedWord token : sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class).vertexSet()) {
      token.set(PolarityAnnotation.class, Polarity.DEFAULT);
    }
  }
  if (sentence.containsKey(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class)) {
    for (IndexedWord token : sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class).vertexSet()) {
      token.set(PolarityAnnotation.class, Polarity.DEFAULT);
    }
  }
  // Set polarity for each token
  for (int i = 0; i < tokens.size(); ++i) {
    CoreLabel token = tokens.get(i);
    // Get operators in scope
    List<Triple<Integer, Monotonicity, MonotonicityType>> inScope = new ArrayList<>(4);
    for (OperatorSpec operator : operators) {
      if (i >= operator.subjectBegin && i < operator.subjectEnd) {
        inScope.add(Triple.makeTriple(operator.subjectEnd - operator.subjectBegin, operator.instance.subjMono, operator.instance.subjType));
      } else if (i >= operator.objectBegin && i < operator.objectEnd) {
        inScope.add(Triple.makeTriple(operator.objectEnd - operator.objectBegin, operator.instance.objMono, operator.instance.objType));
      }
    }
    // Sort the operators by their scope (approximated by the size of their argument span),
    // widest scope first. Integer.compare avoids the overflow risk of subtraction comparators.
    inScope.sort((x, y) -> Integer.compare(y.first, x.first));
    // Create polarity
    List<Pair<Monotonicity, MonotonicityType>> info = new ArrayList<>(inScope.size());
    for (Triple<Integer, Monotonicity, MonotonicityType> term : inScope) {
      info.add(Pair.makePair(term.second, term.third));
    }
    Polarity polarity = new Polarity(info);
    // Set polarity
    token.set(PolarityAnnotation.class, polarity);
  }
}
Aggregations