Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
Class Rules, method entityHaveExtraProperNoun.
/** Have extra proper noun except strings involved in semantic match */
public static boolean entityHaveExtraProperNoun(Mention m, Mention a, Set<String> exceptWords) {
  Set<String> mProper = Generics.newHashSet();
  Set<String> aProper = Generics.newHashSet();
  String mString = m.spanToString();
  String aString = a.spanToString();
  // Collect the text of every proper-noun token (POS tag starting with "NNP") in each mention
  for (CoreLabel w : m.originalSpan) {
    if (w.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) {
      mProper.add(w.get(CoreAnnotations.TextAnnotation.class));
    }
  }
  for (CoreLabel w : a.originalSpan) {
    if (w.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) {
      aProper.add(w.get(CoreAnnotations.TextAnnotation.class));
    }
  }
  // A mention has an "extra" proper noun if one of its proper nouns appears neither in the
  // other mention's span nor in the exception list
  boolean mHasExtra = false;
  boolean aHasExtra = false;
  for (String s : mProper) {
    if (!aString.contains(s) && !exceptWords.contains(s.toLowerCase())) {
      mHasExtra = true;
      break;
    }
  }
  for (String s : aProper) {
    if (!mString.contains(s) && !exceptWords.contains(s.toLowerCase())) {
      aHasExtra = true;
      break;
    }
  }
  if (mHasExtra && aHasExtra) {
    return true;
  }
  return false;
}
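For context, the per-token filter above depends only on the part-of-speech and text annotations, so it can be exercised on hand-built CoreLabels. The sketch below reproduces just the NNP-collection step; the class name ProperNounSketch is illustrative only, and it bypasses Mention entirely because constructing a real Mention requires a parsed document.

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.Generics;

public class ProperNounSketch {
  public static void main(String[] args) {
    // "President Barack Obama" -- NNP tags mark the proper nouns
    String[] words = {"President", "Barack", "Obama"};
    String[] tags  = {"NNP", "NNP", "NNP"};
    List<CoreLabel> span = new ArrayList<>();
    for (int i = 0; i < words.length; i++) {
      CoreLabel token = new CoreLabel();
      token.set(CoreAnnotations.TextAnnotation.class, words[i]);
      token.set(CoreAnnotations.PartOfSpeechAnnotation.class, tags[i]);
      span.add(token);
    }
    // Same filter the method uses: keep tokens whose POS tag starts with "NNP"
    Set<String> proper = Generics.newHashSet();
    for (CoreLabel w : span) {
      if (w.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) {
        proper.add(w.get(CoreAnnotations.TextAnnotation.class));
      }
    }
    System.out.println(proper);  // [President, Barack, Obama]
  }
}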
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
Class Rules, method isAcronym.
public static boolean isAcronym(List<CoreLabel> first, List<CoreLabel> second) {
  if (first.size() > 1 && second.size() > 1) {
    return false;
  }
  if (first.size() == 0 && second.size() == 0) {
    return false;
  }
  List<CoreLabel> longer;
  List<CoreLabel> shorter;
  if (first.size() == second.size()) {
    String firstWord = first.get(0).get(CoreAnnotations.TextAnnotation.class);
    String secondWord = second.get(0).get(CoreAnnotations.TextAnnotation.class);
    longer = (firstWord.length() > secondWord.length()) ? first : second;
    shorter = (firstWord.length() > secondWord.length()) ? second : first;
  } else {
    longer = (first.size() > 0 && first.size() > second.size()) ? first : second;
    shorter = (second.size() > 0 && first.size() > second.size()) ? second : first;
  }
  String acronym = shorter.size() > 0 ? shorter.get(0).get(CoreAnnotations.TextAnnotation.class) : "<UNK>";
  // This check is not strictly necessary, but it saves a chunk of time iterating through the text of the longer mention
  for (int acronymPos = 0; acronymPos < acronym.length(); ++acronymPos) {
    if (acronym.charAt(acronymPos) < 'A' || acronym.charAt(acronymPos) > 'Z') {
      return false;
    }
  }
  int acronymPos = 0;
  for (CoreLabel aLonger1 : longer) {
    String word = aLonger1.get(CoreAnnotations.TextAnnotation.class);
    for (int charNum = 0; charNum < word.length(); ++charNum) {
      if (word.charAt(charNum) >= 'A' && word.charAt(charNum) <= 'Z') {
        // This triggers if there were more capital letters in the longer mention than in the shorter mention
        if (acronymPos >= acronym.length()) {
          return false;
        }
        if (acronym.charAt(acronymPos) != word.charAt(charNum)) {
          return false;
        }
        ++acronymPos;
      }
    }
  }
  if (acronymPos != acronym.length()) {
    return false;
  }
  // Reject if the acronym string appears verbatim inside any token of the longer mention
  for (CoreLabel aLonger : longer) {
    if (aLonger.get(CoreAnnotations.TextAnnotation.class).contains(acronym)) {
      return false;
    }
  }
  return true;
}
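A hedged usage sketch of the method above: build the two token lists by hand and compare. The class name AcronymSketch and the tokens(...) helper are illustrative only, and the import for Rules is left as an assumption because its package depends on the CoreNLP version this listing was taken from.

import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
// import edu.stanford.nlp.dcoref.Rules;  // assumed package -- adjust to match the Rules class shown above

public class AcronymSketch {
  private static List<CoreLabel> tokens(String... words) {
    List<CoreLabel> out = new ArrayList<>();
    for (String w : words) {
      CoreLabel token = new CoreLabel();
      token.set(CoreAnnotations.TextAnnotation.class, w);
      out.add(token);
    }
    return out;
  }

  public static void main(String[] args) {
    List<CoreLabel> full = tokens("National", "Basketball", "Association");
    // Expected true: "NBA" matches the capital letters of the longer mention, in order
    System.out.println(Rules.isAcronym(full, tokens("NBA")));
    // Expected false: "NFL" does not match the second capital letter
    System.out.println(Rules.isAcronym(full, tokens("NFL")));
  }
}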
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
Class KBPStatisticalExtractor, method surfaceFeatures.
@SuppressWarnings("UnusedParameters")
private static void surfaceFeatures(KBPInput input, Sentence simpleSentence, ClassicCounter<String> feats) {
  List<String> lemmaSpan = spanBetweenMentions(input, CoreLabel::lemma);
  List<String> nerSpan = spanBetweenMentions(input, CoreLabel::ner);
  List<String> posSpan = spanBetweenMentions(input, CoreLabel::tag);
  // Unigram features of the sentence
  List<CoreLabel> tokens = input.sentence.asCoreLabels(Sentence::lemmas, Sentence::nerTags);
  for (CoreLabel token : tokens) {
    indicator(feats, "sentence_unigram", token.lemma());
  }
  // Full lemma span ( -0.3 F1 )
  // if (lemmaSpan.size() <= 5) {
  //   indicator(feats, "full_lemma_span", withMentionsPositioned(input, StringUtils.join(lemmaSpan, " ")));
  // }
  // Lemma n-grams
  String lastLemma = "_^_";
  for (String lemma : lemmaSpan) {
    indicator(feats, "lemma_bigram", withMentionsPositioned(input, lastLemma + " " + lemma));
    indicator(feats, "lemma_unigram", withMentionsPositioned(input, lemma));
    lastLemma = lemma;
  }
  indicator(feats, "lemma_bigram", withMentionsPositioned(input, lastLemma + " _$_"));
  // NER + lemma bi-grams
  for (int i = 0; i < lemmaSpan.size() - 1; ++i) {
    if (!"O".equals(nerSpan.get(i)) && "O".equals(nerSpan.get(i + 1)) && "IN".equals(posSpan.get(i + 1))) {
      indicator(feats, "ner/lemma_bigram", withMentionsPositioned(input, nerSpan.get(i) + " " + lemmaSpan.get(i + 1)));
    }
    if (!"O".equals(nerSpan.get(i + 1)) && "O".equals(nerSpan.get(i)) && "IN".equals(posSpan.get(i))) {
      indicator(feats, "ner/lemma_bigram", withMentionsPositioned(input, lemmaSpan.get(i) + " " + nerSpan.get(i + 1)));
    }
  }
  // Distance between mentions
  String distanceBucket = ">10";
  if (lemmaSpan.size() == 0) {
    distanceBucket = "0";
  } else if (lemmaSpan.size() <= 3) {
    distanceBucket = "<=3";
  } else if (lemmaSpan.size() <= 5) {
    distanceBucket = "<=5";
  } else if (lemmaSpan.size() <= 10) {
    distanceBucket = "<=10";
  } else if (lemmaSpan.size() <= 15) {
    distanceBucket = "<=15";
  }
  indicator(feats, "distance_between_entities_bucket", distanceBucket);
  // Punctuation features
  int numCommasInSpan = 0;
  int numQuotesInSpan = 0;
  int parenParity = 0;
  for (String lemma : lemmaSpan) {
    if (lemma.equals(",")) {
      numCommasInSpan += 1;
    }
    if (lemma.equals("\"") || lemma.equals("``") || lemma.equals("''")) {
      numQuotesInSpan += 1;
    }
    if (lemma.equals("(") || lemma.equals("-LRB-")) {
      parenParity += 1;
    }
    if (lemma.equals(")") || lemma.equals("-RRB-")) {
      parenParity -= 1;
    }
  }
  indicator(feats, "comma_parity", numCommasInSpan % 2 == 0 ? "even" : "odd");
  indicator(feats, "quote_parity", numQuotesInSpan % 2 == 0 ? "even" : "odd");
  indicator(feats, "paren_parity", "" + parenParity);
  // Is broken by entity
  Set<String> intercedingNERTags = nerSpan.stream().filter(ner -> !ner.equals("O")).collect(Collectors.toSet());
  if (!intercedingNERTags.isEmpty()) {
    indicator(feats, "has_interceding_ner", "t");
  }
  for (String ner : intercedingNERTags) {
    indicator(feats, "interceding_ner", ner);
  }
  // Left and right context
  List<CoreLabel> sentence = input.sentence.asCoreLabels(Sentence::nerTags);
  if (input.subjectSpan.start() == 0) {
    indicator(feats, "subj_left", "^");
  } else {
    indicator(feats, "subj_left", sentence.get(input.subjectSpan.start() - 1).lemma());
  }
  if (input.subjectSpan.end() == sentence.size()) {
    indicator(feats, "subj_right", "$");
  } else {
    indicator(feats, "subj_right", sentence.get(input.subjectSpan.end()).lemma());
  }
  if (input.objectSpan.start() == 0) {
    indicator(feats, "obj_left", "^");
  } else {
    indicator(feats, "obj_left", sentence.get(input.objectSpan.start() - 1).lemma());
  }
  if (input.objectSpan.end() == sentence.size()) {
    indicator(feats, "obj_right", "$");
  } else {
    indicator(feats, "obj_right", sentence.get(input.objectSpan.end()).lemma());
  }
  // Skip-word patterns
  if (lemmaSpan.size() == 1 && input.subjectSpan.isBefore(input.objectSpan)) {
    String left = input.subjectSpan.start() == 0 ? "^" : sentence.get(input.subjectSpan.start() - 1).lemma();
    indicator(feats, "X<subj>Y<obj>", left + "_" + lemmaSpan.get(0));
  }
}
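Every feature above is fired through indicator(feats, name, value), whose definition is not shown on this page. The sketch below is a guess at its shape: a one-count binary feature keyed on the name/value pair in a ClassicCounter. The class name IndicatorSketch and the separator character are assumptions, not the actual KBPStatisticalExtractor implementation.

import edu.stanford.nlp.stats.ClassicCounter;

public class IndicatorSketch {
  // Assumed helper: fire a single binary feature for this (name, value) pair
  static void indicator(ClassicCounter<String> feats, String name, String value) {
    feats.incrementCount(name + "::" + value);
  }

  public static void main(String[] args) {
    ClassicCounter<String> feats = new ClassicCounter<>();
    indicator(feats, "distance_between_entities_bucket", "<=3");
    indicator(feats, "comma_parity", "even");
    System.out.println(feats);  // e.g. {distance_between_entities_bucket::<=3=1.0, comma_parity::even=1.0}
  }
}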
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
Class KBPStatisticalExtractor, method spanBetweenMentions.
/**
 * Get information from the span between the two mentions.
 * Canonically, get the words in this span.
 * For instance, for "Obama was born in Hawaii", this would return a list
 * "was born in" if the selector is <code>CoreLabel::word</code>;
 * or "be bear in" if the selector is <code>CoreLabel::lemma</code>.
 *
 * @param input The featurizer input.
 * @param selector The field to compute for each element in the span. A good default is <code>CoreLabel::word</code> or <code>CoreLabel::lemma</code>.
 * @param <E> The type of element returned by the selector.
 *
 * @return A list of elements between the two mentions.
 */
@SuppressWarnings("unchecked")
private static <E> List<E> spanBetweenMentions(KBPInput input, Function<CoreLabel, E> selector) {
  List<CoreLabel> sentence = input.sentence.asCoreLabels(Sentence::lemmas, Sentence::nerTags);
  Span subjSpan = input.subjectSpan;
  Span objSpan = input.objectSpan;
  // Corner cases
  if (Span.overlaps(subjSpan, objSpan)) {
    return Collections.EMPTY_LIST;
  }
  // Get the range between the subject and object
  int begin = subjSpan.end();
  int end = objSpan.start();
  if (begin > end) {
    begin = objSpan.end();
    end = subjSpan.start();
  }
  if (begin > end) {
    throw new IllegalArgumentException("Gabor sucks at logic and he should feel bad about it: " + subjSpan + " and " + objSpan);
  } else if (begin == end) {
    return Collections.EMPTY_LIST;
  }
  // Compute the return value
  List<E> rtn = new ArrayList<>();
  for (int i = begin; i < end; ++i) {
    rtn.add(selector.apply(sentence.get(i)));
  }
  return rtn;
}
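A worked example of the index arithmetic in the method above, using plain ints instead of Span objects so it needs nothing from the KBP classes. The class name BetweenSpanSketch is illustrative only; the spans follow the javadoc example, "Obama was born in Hawaii" with subject [0, 1) and object [4, 5).

public class BetweenSpanSketch {
  public static void main(String[] args) {
    String[] words = {"Obama", "was", "born", "in", "Hawaii"};
    int subjStart = 0, subjEnd = 1;   // "Obama"
    int objStart = 4, objEnd = 5;     // "Hawaii"
    // Same logic as the method: start from the end of the earlier mention,
    // stop at the start of the later one
    int begin = subjEnd, end = objStart;
    if (begin > end) {                // object precedes subject
      begin = objEnd;
      end = subjStart;
    }
    StringBuilder between = new StringBuilder();
    for (int i = begin; i < end; i++) {
      if (between.length() > 0) between.append(' ');
      between.append(words[i]);
    }
    System.out.println(between);      // prints "was born in"
  }
}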
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
Class KBPTokensregexExtractor, method classify.
@Override
public Pair<String, Double> classify(KBPInput input) {
  // Annotate Sentence
  CoreMap sentenceAsMap = input.sentence.asCoreMap(Sentence::nerTags);
  List<CoreLabel> tokens = sentenceAsMap.get(CoreAnnotations.TokensAnnotation.class);
  // Annotate where the subject is
  for (int i : input.subjectSpan) {
    tokens.get(i).set(Subject.class, "true");
    if ("O".equals(tokens.get(i).ner())) {
      tokens.get(i).setNER(input.subjectType.name);
    }
  }
  // Annotate where the object is
  for (int i : input.objectSpan) {
    tokens.get(i).set(Object.class, "true");
    if ("O".equals(tokens.get(i).ner())) {
      tokens.get(i).setNER(input.objectType.name);
    }
  }
  // Run Rules
  for (RelationType rel : RelationType.values()) {
    if (rules.containsKey(rel) && rel.entityType == input.subjectType && rel.validNamedEntityLabels.contains(input.objectType)) {
      CoreMapExpressionExtractor extractor = rules.get(rel);
      @SuppressWarnings("unchecked")
      List<MatchedExpression> extractions = extractor.extractExpressions(sentenceAsMap);
      if (extractions != null && extractions.size() > 0) {
        MatchedExpression best = MatchedExpression.getBestMatched(extractions, MatchedExpression.EXPR_WEIGHT_SCORER);
        // Un-Annotate Sentence
        for (CoreLabel token : tokens) {
          token.remove(Subject.class);
          token.remove(Object.class);
        }
        return Pair.makePair(rel.canonicalName, best.getWeight());
      }
    }
  }
  // Un-Annotate Sentence
  for (CoreLabel token : tokens) {
    token.remove(Subject.class);
    token.remove(Object.class);
  }
  return Pair.makePair(NO_RELATION, 1.0);
}
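The classify method marks tokens with custom Subject and Object annotation keys before running the TokensRegex rules. The sketch below shows how such keys are typically declared in CoreNLP, as small classes implementing CoreAnnotation with the value type they store; the actual declarations live in KBPTokensregexExtractor and may differ in detail, and the wrapper class AnnotationKeySketch is hypothetical.

import edu.stanford.nlp.ling.CoreAnnotation;

public class AnnotationKeySketch {
  /** Marks a token as part of the subject mention; the stored value is the string "true". */
  public static class Subject implements CoreAnnotation<String> {
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /** Marks a token as part of the object mention (note: this deliberately shadows java.lang.Object, as the usage above suggests). */
  public static class Object implements CoreAnnotation<String> {
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }
}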