use of edu.stanford.nlp.ie.machinereading.structure.Span in project CoreNLP by stanfordnlp.
the class ProtobufAnnotationSerializer method fromProto.
/**
 * Rebuild a {@link RelationTriple} from its serialized representation.
 * Both the containing document and a sentence index are required so that
 * (1) the docid is available and the dependency tree can be accurately rebuilt,
 * and (2) the triple can reference the actual tokens of the document.
 *
 * @param proto The serialized relation triple.
 * @param doc The document we are deserializing. This document should already
 * have a docid annotation set, if there is one.
 * @param sentenceIndex The index of the sentence this extraction should be attached to.
 *
 * @return A relation triple as a Java object, corresponding to the serialized proto.
 */
public static RelationTriple fromProto(CoreNLPProtos.RelationTriple proto, Annotation doc, int sentenceIndex) {
  if (Thread.interrupted()) {
    throw new RuntimeInterruptedException();
  }
  // Resolve each serialized (sentence index, token index) pair back to the document's CoreLabel.
  List<CoreLabel> subjectTokens = proto.getSubjectTokensList().stream()
      .map(where -> doc.get(SentencesAnnotation.class)
          .get(where.getSentenceIndex())
          .get(TokensAnnotation.class)
          .get(where.getTokenIndex()))
      .collect(Collectors.toList());
  List<CoreLabel> objectTokens = proto.getObjectTokensList().stream()
      .map(where -> doc.get(SentencesAnnotation.class)
          .get(where.getSentenceIndex())
          .get(TokensAnnotation.class)
          .get(where.getTokenIndex()))
      .collect(Collectors.toList());
  // A relation serialized without a token span gets a single dummy token carrying the relation gloss.
  List<CoreLabel> relationTokens;
  if (proto.getRelationTokensCount() > 0) {
    relationTokens = proto.getRelationTokensList().stream()
        .map(where -> doc.get(SentencesAnnotation.class)
            .get(where.getSentenceIndex())
            .get(TokensAnnotation.class)
            .get(where.getTokenIndex()))
        .collect(Collectors.toList());
  } else {
    relationTokens = Collections.singletonList(new CoreLabel(new Word(proto.getRelation())));
  }
  // Build the extraction, attaching a dependency tree when one was serialized.
  double confidence = proto.getConfidence();
  RelationTriple extraction;
  if (proto.hasTree()) {
    SemanticGraph tree = fromProto(
        proto.getTree(),
        doc.get(SentencesAnnotation.class).get(sentenceIndex).get(TokensAnnotation.class),
        doc.get(DocIDAnnotation.class),
        Optional.of(doc));
    extraction = new RelationTriple.WithTree(subjectTokens, relationTokens, objectTokens, tree, confidence);
  } else {
    extraction = new RelationTriple(subjectTokens, relationTokens, objectTokens, confidence);
  }
  // Restore the optional flags that tweak the extraction.
  if (proto.hasIstmod()) {
    extraction.istmod(proto.getIstmod());
  }
  if (proto.hasPrefixBe()) {
    extraction.isPrefixBe(proto.getPrefixBe());
  }
  if (proto.hasSuffixBe()) {
    extraction.isSuffixBe(proto.getSuffixBe());
  }
  if (proto.hasSuffixOf()) {
    extraction.isSuffixOf(proto.getSuffixOf());
  }
  return extraction;
}
use of edu.stanford.nlp.ie.machinereading.structure.Span in project CoreNLP by stanfordnlp.
the class ProtobufAnnotationSerializer method fromProto.
/**
 * Deserialize a relation mention from its protocol buffer. The sentence
 * containing the mention must be supplied alongside the buffer.
 *
 * @param proto The serialized relation mention.
 * @param sentence The sentence this mention is attached to.
 * @return The relation mention corresponding to the serialized object.
 */
private RelationMention fromProto(CoreNLPProtos.Relation proto, CoreMap sentence) {
  List<ExtractionObject> arguments = proto.getArgList().stream()
      .map(argProto -> fromProto(argProto, sentence))
      .collect(Collectors.toList());
  // Every optional field falls back to null when it is absent from the proto.
  RelationMention mention = new RelationMention(
      proto.hasObjectID() ? proto.getObjectID() : null,
      sentence,
      proto.hasExtentStart() ? new Span(proto.getExtentStart(), proto.getExtentEnd()) : null,
      proto.hasType() ? proto.getType() : null,
      proto.hasSubtype() ? proto.getSubtype() : null,
      arguments);
  if (proto.hasSignature()) {
    mention.setSignature(proto.getSignature());
  }
  // Restore argument names when present, or when there are no arguments at all.
  if (proto.getArgCount() == 0 || proto.getArgNameCount() > 0) {
    mention.setArgNames(proto.getArgNameList());
  }
  return mention;
}
use of edu.stanford.nlp.ie.machinereading.structure.Span in project CoreNLP by stanfordnlp.
the class KBPStatisticalExtractor method surfaceFeatures.
/**
 * Compute surface (token-level) features over the span between the two mentions
 * in {@code input}, writing indicator features into {@code feats}.
 *
 * @param input The featurizer input, holding the sentence and the subject/object spans.
 * @param simpleSentence The sentence being featurized (currently unused; kept for signature parity).
 * @param feats The feature counter indicator features are added to.
 */
@SuppressWarnings("UnusedParameters")
private static void surfaceFeatures(KBPInput input, Sentence simpleSentence, ClassicCounter<String> feats) {
  List<String> lemmaSpan = spanBetweenMentions(input, CoreLabel::lemma);
  List<String> nerSpan = spanBetweenMentions(input, CoreLabel::ner);
  List<String> posSpan = spanBetweenMentions(input, CoreLabel::tag);
  // Unigram features of the sentence
  for (CoreLabel token : input.sentence.asCoreLabels(Sentence::lemmas, Sentence::nerTags)) {
    indicator(feats, "sentence_unigram", token.lemma());
  }
  // NOTE: a full-lemma-span feature was tried here and removed ( -0.3 F1 )
  surfaceLemmaNGramFeatures(input, lemmaSpan, feats);
  surfaceNerLemmaBigramFeatures(input, lemmaSpan, nerSpan, posSpan, feats);
  surfaceDistanceFeature(lemmaSpan, feats);
  surfacePunctuationFeatures(lemmaSpan, feats);
  surfaceIntercedingNerFeatures(nerSpan, feats);
  surfaceContextFeatures(input, lemmaSpan, feats);
}

/** Lemma unigrams and bigrams over the span between the mentions, with boundary markers. */
private static void surfaceLemmaNGramFeatures(KBPInput input, List<String> lemmaSpan, ClassicCounter<String> feats) {
  String lastLemma = "_^_";
  for (String lemma : lemmaSpan) {
    indicator(feats, "lemma_bigram", withMentionsPositioned(input, lastLemma + " " + lemma));
    indicator(feats, "lemma_unigram", withMentionsPositioned(input, lemma));
    lastLemma = lemma;
  }
  indicator(feats, "lemma_bigram", withMentionsPositioned(input, lastLemma + " _$_"));
}

/** NER + lemma bigrams: fire when an entity tag abuts a preposition ("IN" POS tag). */
private static void surfaceNerLemmaBigramFeatures(KBPInput input, List<String> lemmaSpan, List<String> nerSpan,
                                                 List<String> posSpan, ClassicCounter<String> feats) {
  for (int i = 0; i < lemmaSpan.size() - 1; ++i) {
    if (!"O".equals(nerSpan.get(i)) && "O".equals(nerSpan.get(i + 1)) && "IN".equals(posSpan.get(i + 1))) {
      indicator(feats, "ner/lemma_bigram", withMentionsPositioned(input, nerSpan.get(i) + " " + lemmaSpan.get(i + 1)));
    }
    if (!"O".equals(nerSpan.get(i + 1)) && "O".equals(nerSpan.get(i)) && "IN".equals(posSpan.get(i))) {
      indicator(feats, "ner/lemma_bigram", withMentionsPositioned(input, lemmaSpan.get(i) + " " + nerSpan.get(i + 1)));
    }
  }
}

/** Bucketed distance (in tokens) between the two mentions. */
private static void surfaceDistanceFeature(List<String> lemmaSpan, ClassicCounter<String> feats) {
  String distanceBucket = ">10";
  if (lemmaSpan.isEmpty()) {
    distanceBucket = "0";
  } else if (lemmaSpan.size() <= 3) {
    distanceBucket = "<=3";
  } else if (lemmaSpan.size() <= 5) {
    distanceBucket = "<=5";
  } else if (lemmaSpan.size() <= 10) {
    distanceBucket = "<=10";
  } else if (lemmaSpan.size() <= 15) {
    distanceBucket = "<=15";
  }
  indicator(feats, "distance_between_entities_bucket", distanceBucket);
}

/** Comma/quote parity and parenthesis balance of the span between the mentions. */
private static void surfacePunctuationFeatures(List<String> lemmaSpan, ClassicCounter<String> feats) {
  int numCommasInSpan = 0;
  int numQuotesInSpan = 0;
  int parenParity = 0;
  for (String lemma : lemmaSpan) {
    if (lemma.equals(",")) {
      numCommasInSpan += 1;
    }
    if (lemma.equals("\"") || lemma.equals("``") || lemma.equals("''")) {
      numQuotesInSpan += 1;
    }
    if (lemma.equals("(") || lemma.equals("-LRB-")) {
      parenParity += 1;
    }
    if (lemma.equals(")") || lemma.equals("-RRB-")) {
      parenParity -= 1;
    }
  }
  indicator(feats, "comma_parity", numCommasInSpan % 2 == 0 ? "even" : "odd");
  indicator(feats, "quote_parity", numQuotesInSpan % 2 == 0 ? "even" : "odd");
  indicator(feats, "paren_parity", "" + parenParity);
}

/** Whether (and by which entity types) the span between the mentions is broken by an entity. */
private static void surfaceIntercedingNerFeatures(List<String> nerSpan, ClassicCounter<String> feats) {
  Set<String> intercedingNERTags = nerSpan.stream().filter(ner -> !ner.equals("O")).collect(Collectors.toSet());
  if (!intercedingNERTags.isEmpty()) {
    indicator(feats, "has_interceding_ner", "t");
  }
  for (String ner : intercedingNERTags) {
    indicator(feats, "interceding_ner", ner);
  }
}

/** Left/right context lemmas of both mentions, plus the one-word skip pattern. */
private static void surfaceContextFeatures(KBPInput input, List<String> lemmaSpan, ClassicCounter<String> feats) {
  List<CoreLabel> sentence = input.sentence.asCoreLabels(Sentence::nerTags);
  // "^" and "$" mark sentence start and end respectively.
  if (input.subjectSpan.start() == 0) {
    indicator(feats, "subj_left", "^");
  } else {
    indicator(feats, "subj_left", sentence.get(input.subjectSpan.start() - 1).lemma());
  }
  if (input.subjectSpan.end() == sentence.size()) {
    indicator(feats, "subj_right", "$");
  } else {
    indicator(feats, "subj_right", sentence.get(input.subjectSpan.end()).lemma());
  }
  if (input.objectSpan.start() == 0) {
    indicator(feats, "obj_left", "^");
  } else {
    indicator(feats, "obj_left", sentence.get(input.objectSpan.start() - 1).lemma());
  }
  if (input.objectSpan.end() == sentence.size()) {
    indicator(feats, "obj_right", "$");
  } else {
    indicator(feats, "obj_right", sentence.get(input.objectSpan.end()).lemma());
  }
  // Skip-word pattern: subject and object separated by exactly one word.
  if (lemmaSpan.size() == 1 && input.subjectSpan.isBefore(input.objectSpan)) {
    String left = input.subjectSpan.start() == 0 ? "^" : sentence.get(input.subjectSpan.start() - 1).lemma();
    indicator(feats, "X<subj>Y<obj>", left + "_" + lemmaSpan.get(0));
  }
}
use of edu.stanford.nlp.ie.machinereading.structure.Span in project CoreNLP by stanfordnlp.
the class KBPStatisticalExtractor method spanBetweenMentions.
/**
 * Get information from the span between the two mentions.
 * Canonically, get the words in this span.
 * For instance, for "Obama was born in Hawaii", this would return a list
 * "was born in" if the selector is {@code CoreLabel::word};
 * or "be bear in" if the selector is {@code CoreLabel::lemma}.
 *
 * @param input The featurizer input.
 * @param selector The field to compute for each element in the span. A good default is
 *                 {@code CoreLabel::word} or {@code CoreLabel::lemma}.
 * @param <E> The type of element returned by the selector.
 *
 * @return A list of elements between the two mentions; empty if the mentions
 *         overlap or are adjacent.
 */
private static <E> List<E> spanBetweenMentions(KBPInput input, Function<CoreLabel, E> selector) {
  List<CoreLabel> sentence = input.sentence.asCoreLabels(Sentence::lemmas, Sentence::nerTags);
  Span subjSpan = input.subjectSpan;
  Span objSpan = input.objectSpan;
  // Corner case: overlapping mentions have no span between them.
  // emptyList() is type-safe, unlike the raw Collections.EMPTY_LIST.
  if (Span.overlaps(subjSpan, objSpan)) {
    return Collections.emptyList();
  }
  // Get the range between the subject and object, whichever order they occur in.
  int begin = subjSpan.end();
  int end = objSpan.start();
  if (begin > end) {
    begin = objSpan.end();
    end = subjSpan.start();
  }
  if (begin > end) {
    throw new IllegalArgumentException("Gabor sucks at logic and he should feel bad about it: " + subjSpan + " and " + objSpan);
  } else if (begin == end) {
    // Adjacent mentions: nothing in between.
    return Collections.emptyList();
  }
  // Compute the return value
  List<E> rtn = new ArrayList<>(end - begin);
  for (int i = begin; i < end; ++i) {
    rtn.add(selector.apply(sentence.get(i)));
  }
  return rtn;
}
use of edu.stanford.nlp.ie.machinereading.structure.Span in project CoreNLP by stanfordnlp.
the class RothCONLL04Reader method setHeadWord.
/*
 * Sets the head word and the index for an entity, given the parse tree for
 * the sentence containing the entity. The extent token indices on the entity
 * are treated as inclusive on both ends throughout this method.
 *
 * This code is no longer used, but I've kept it around (at least for now) as
 * reference when we modify preProcessSentences().
 */
@SuppressWarnings("unused")
private void setHeadWord(EntityMention entity, Tree tree) {
List<Tree> leaves = tree.getLeaves();
// Smallest subtree spanning the entity's full extent; its head terminal is the candidate head word.
Tree argRoot = tree.joinNode(leaves.get(entity.getExtentTokenStart()), leaves.get(entity.getExtentTokenEnd()));
Tree headWordNode = argRoot.headTerminal(headFinder);
int headWordIndex = getIndexByObjectEquality(leaves, headWordNode);
// If the extent ends in punctuation and the candidate head fell outside the extent,
// retry with the trailing punctuation token dropped from the span.
if (StringUtils.isPunct(leaves.get(entity.getExtentTokenEnd()).label().value().trim()) && (headWordIndex >= entity.getExtentTokenEnd() || headWordIndex < entity.getExtentTokenStart())) {
argRoot = tree.joinNode(leaves.get(entity.getExtentTokenStart()), leaves.get(entity.getExtentTokenEnd() - 1));
headWordNode = argRoot.headTerminal(headFinder);
headWordIndex = getIndexByObjectEquality(leaves, headWordNode);
if (headWordIndex >= entity.getExtentTokenStart() && headWordIndex <= entity.getExtentTokenEnd() - 1) {
entity.setHeadTokenPosition(headWordIndex);
entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
}
}
// Accept the head if it landed inside the extent. Note this deliberately falls through from
// the punctuation branch above, so a head found there is (harmlessly) recorded a second time.
if (headWordIndex >= entity.getExtentTokenStart() && headWordIndex <= entity.getExtentTokenEnd()) {
entity.setHeadTokenPosition(headWordIndex);
// Head span is a single token: [headWordIndex, headWordIndex + 1).
entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
} else {
// Re-parse the argument words by themselves
// Get the list of words in the arg by looking at the leaves between
// arg.getExtentTokenStart() and arg.getExtentTokenEnd() inclusive
List<String> argWords = new ArrayList<>();
for (int i = entity.getExtentTokenStart(); i <= entity.getExtentTokenEnd(); i++) {
argWords.add(leaves.get(i).label().value());
}
// Drop a trailing punctuation token before re-parsing, as above.
if (StringUtils.isPunct(argWords.get(argWords.size() - 1))) {
argWords.remove(argWords.size() - 1);
}
Tree argTree = parseStrings(argWords);
headWordNode = argTree.headTerminal(headFinder);
// Map the head index in the re-parsed fragment back into sentence coordinates
// by offsetting with the extent start.
headWordIndex = getIndexByObjectEquality(argTree.getLeaves(), headWordNode) + entity.getExtentTokenStart();
entity.setHeadTokenPosition(headWordIndex);
entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
}
}
Aggregations