Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
Class RuleBasedCorefMentionFinder, method filterPredictedMentions.
/** When mention boundaries are given. */
public List<List<Mention>> filterPredictedMentions(List<List<Mention>> allGoldMentions, Annotation doc, Dictionaries dict, Properties props) {
  List<List<Mention>> predictedMentions = new ArrayList<>();
  for (int i = 0; i < allGoldMentions.size(); i++) {
    CoreMap s = doc.get(CoreAnnotations.SentencesAnnotation.class).get(i);
    List<Mention> goldMentions = allGoldMentions.get(i);
    List<Mention> mentions = new ArrayList<>();
    predictedMentions.add(mentions);
    mentions.addAll(goldMentions);
    findHead(s, mentions);
    // todo [cdm 2013]: This block seems to do nothing - the two sets are never used
    Set<IntPair> mentionSpanSet = Generics.newHashSet();
    Set<IntPair> namedEntitySpanSet = Generics.newHashSet();
    for (Mention m : mentions) {
      mentionSpanSet.add(new IntPair(m.startIndex, m.endIndex));
      if (!m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("O")) {
        namedEntitySpanSet.add(new IntPair(m.startIndex, m.endIndex));
      }
    }
    setBarePlural(mentions);
  }
  removeSpuriousMentions(doc, predictedMentions, dict, CorefProperties.removeNestedMentions(props), lang);
  return predictedMentions;
}
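The spans above are tracked as IntPair keys in hash sets, which relies on IntPair comparing by value rather than by object identity. A minimal sketch of that behaviour, assuming only the IntPair(int, int) constructor used above and that edu.stanford.nlp.util.Generics and edu.stanford.nlp.util.IntPair are imported:

// Sketch: two IntPairs with the same endpoints act as the same span key.
Set<IntPair> mentionSpanSet = Generics.newHashSet();
mentionSpanSet.add(new IntPair(3, 7));
boolean sameSpan = mentionSpanSet.contains(new IntPair(3, 7)); // true: value-based equals/hashCode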
Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
Class Document, method findTwinMentionsStrict.
/** Mark twin mentions: all mention boundaries should be matched. */
private void findTwinMentionsStrict() {
  for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
    List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
    List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);
    // For CoNLL training there are some documents with gold mentions that have the same position offsets.
    // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
    // (Packwood - Roth)
    CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
    for (Mention g : golds) {
      IntPair ip = new IntPair(g.startIndex, g.endIndex);
      if (goldMentionPositions.containsKey(ip)) {
        StringBuilder existingMentions = new StringBuilder();
        for (Mention eg : goldMentionPositions.get(ip)) {
          if (existingMentions.length() > 0) {
            existingMentions.append(",");
          }
          existingMentions.append(eg.mentionID);
        }
        SieveCoreferenceSystem.logger.warning("WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
      }
      //assert(!goldMentionPositions.containsKey(ip));
      goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
    }
    for (Mention p : predicts) {
      IntPair pos = new IntPair(p.startIndex, p.endIndex);
      if (goldMentionPositions.containsKey(pos)) {
        Collection<Mention> cm = goldMentionPositions.get(pos);
        Mention g = cm.iterator().next();
        cm.remove(g);
        p.mentionID = g.mentionID;
        p.twinless = false;
        g.twinless = false;
      }
    }
    // temp: to make twinless mentions easy to recognize
    for (Mention p : predicts) {
      if (p.twinless)
        p.mentionID += 10000;
    }
  }
}
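The matching above hinges on CollectionValuedMap keeping every gold mention that shares a span, instead of overwriting earlier entries as a plain Map would. A small sketch of that pattern, using only the add/get/containsKey calls seen above; the string ids are made up for illustration:

CollectionValuedMap<IntPair, String> goldByPosition = new CollectionValuedMap<>();
goldByPosition.add(new IntPair(0, 2), "gold-1");
goldByPosition.add(new IntPair(0, 2), "gold-2"); // same span: both values are kept
if (goldByPosition.containsKey(new IntPair(0, 2))) {
  Collection<String> candidates = goldByPosition.get(new IntPair(0, 2)); // contains gold-1 and gold-2
}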
Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
Class IOBUtils, method TokenSpansForIOB.
public static List<IntPair> TokenSpansForIOB(List<CoreLabel> labeledSequence) {
  List<IntPair> spans = CollectionUtils.makeList();
  String lastLabel = "";
  boolean inToken = false;
  int tokenStart = 0;
  final int sequenceLength = labeledSequence.size();
  for (int i = 0; i < sequenceLength; ++i) {
    CoreLabel labeledChar = labeledSequence.get(i);
    String token = labeledChar.get(CoreAnnotations.CharAnnotation.class);
    String label = labeledChar.get(CoreAnnotations.AnswerAnnotation.class);
    if (token.equals(BoundaryChar)) {
      if (inToken) {
        spans.add(new IntPair(tokenStart, i));
      }
      inToken = false;
    } else {
      switch (label) {
        case BeginSymbol:
          if (lastLabel.equals(ContinuationSymbol) || lastLabel.equals(BeginSymbol) || lastLabel.equals(RewriteSymbol)) {
            if (inToken) {
              spans.add(new IntPair(tokenStart, i));
            }
            inToken = true;
            tokenStart = i;
          } else if (!inToken) {
            inToken = true;
            tokenStart = i;
          }
          break;
        case ContinuationSymbol:
          if (!inToken) {
            inToken = true;
            tokenStart = i;
          }
          break;
        case BoundarySymbol:
        case NosegSymbol:
          if (inToken) {
            spans.add(new IntPair(tokenStart, i));
          }
          inToken = true;
          tokenStart = i;
          break;
        case RewriteSymbol:
        case "REWAL":
        case "REWTA":
          if (token.equals("ل")) {
            if (inToken) {
              spans.add(new IntPair(tokenStart, i));
            }
            inToken = true;
            tokenStart = i;
          } else if (!inToken) {
            inToken = true;
            tokenStart = i;
          }
          break;
      }
    }
    lastLabel = label;
  }
  if (inToken) {
    spans.add(new IntPair(tokenStart, sequenceLength));
  }
  return spans;
}
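The span bookkeeping above reduces to one idea: open a span at the first character of a token and close it, exclusive, where the token ends. A stripped-down sketch of just that idea, without the IOB label constants (boundary is a hypothetical stand-in for BoundaryChar, not part of IOBUtils):

// Group contiguous non-boundary characters into [start, end) IntPair spans.
static List<IntPair> charSpans(String sequence, char boundary) {
  List<IntPair> spans = new ArrayList<>();
  int tokenStart = -1;
  for (int i = 0; i < sequence.length(); i++) {
    if (sequence.charAt(i) == boundary) {
      if (tokenStart >= 0) {
        spans.add(new IntPair(tokenStart, i)); // close the open span at the boundary
      }
      tokenStart = -1;
    } else if (tokenStart < 0) {
      tokenStart = i; // open a new span at the first in-token character
    }
  }
  if (tokenStart >= 0) {
    spans.add(new IntPair(tokenStart, sequence.length())); // flush a trailing span
  }
  return spans;
}
// e.g. charSpans("ab cd", ' ') returns the spans (0, 2) and (3, 5)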
Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
Class MultiWordStringMatcher, method findTargetStringOffsetsExct.
/**
 * Finds the target string in the text span from character start to end (exclusive) and returns offsets
 * (does EXCT string matching).
 * @param text - String in which to look for the target string
 * @param targetString - Target string to look for
 * @param start - position to start search
 * @param end - position to end search
 * @return list of integer pairs indicating the character offsets (begin, end - exclusive)
 *         at which the targetString can be found
 */
protected List<IntPair> findTargetStringOffsetsExct(String text, String targetString, int start, int end) {
  if (start > text.length())
    return null;
  if (end > text.length())
    return null;
  List<IntPair> offsets = null;
  int i = text.indexOf(targetString, start);
  if (i >= 0 && i < end) {
    offsets = new ArrayList<>();
  }
  while (i >= 0 && i < end) {
    boolean matched = true;
    if (i > 0) {
      char charBefore = text.charAt(i - 1);
      if (Character.isLetterOrDigit(charBefore)) {
        matched = false;
      }
    }
    if (i + targetString.length() < text.length()) {
      char charAfter = text.charAt(i + targetString.length());
      if (Character.isLetterOrDigit(charAfter)) {
        matched = false;
      }
    }
    if (matched) {
      offsets.add(new IntPair(i, i + targetString.length()));
      i += targetString.length();
    } else {
      i++;
    }
    i = text.indexOf(targetString, i);
  }
  return offsets;
}
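The matched flag above implements a whole-word check: a hit is discarded when the character immediately before or after it is a letter or digit. A worked example of that gating on a made-up string:

String text = "catalog cat";
String target = "cat";
int first = text.indexOf(target);             // 0, but text.charAt(3) == 'a' is a letter, so this hit is rejected
int second = text.indexOf(target, first + 1); // 8, preceded by a space and at the end of the text, so it is accepted
IntPair accepted = new IntPair(second, second + target.length()); // (8, 11), the only offset the method would return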
Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
Class MultiWordStringMatcher, method findOffsets.
/**
 * Finds the pattern in the text span from character start to end (exclusive) and returns offsets.
 * @param pattern - pattern to look for
 * @param text - String in which to look for the pattern
 * @param start - position to start search
 * @param end - position to end search
 * @return list of integer pairs indicating the character offsets (begin, end - exclusive)
 *         at which the pattern can be found
 */
public static List<IntPair> findOffsets(Pattern pattern, String text, int start, int end) {
  Matcher matcher = pattern.matcher(text);
  List<IntPair> offsets = null;
  matcher.region(start, end);
  int i = (matcher.find()) ? matcher.start() : -1;
  if (i >= 0 && i < end) {
    offsets = new ArrayList<>();
  }
  while (i >= 0 && i < end) {
    boolean matched = true;
    int matchEnd = matcher.end();
    if (i > 0) {
      char charBefore = text.charAt(i - 1);
      if (Character.isLetterOrDigit(charBefore)) {
        matched = false;
      }
    }
    if (matchEnd < text.length()) {
      char charAfter = text.charAt(matchEnd);
      if (Character.isLetterOrDigit(charAfter)) {
        matched = false;
      }
    }
    if (matched) {
      offsets.add(new IntPair(i, matchEnd));
    }
    i = (matcher.find()) ? matcher.start() : -1;
  }
  return offsets;
}
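Since findOffsets is public and static, it can be called directly. A hypothetical usage sketch (the pattern and text are made up, and getSource()/getTarget() are assumed to return an IntPair's begin and exclusive end offsets):

Pattern pattern = Pattern.compile(Pattern.quote("New York"), Pattern.CASE_INSENSITIVE);
String text = "New York is not in new yorkshire.";
List<IntPair> spans = MultiWordStringMatcher.findOffsets(pattern, text, 0, text.length());
if (spans != null) { // the method returns null, not an empty list, when nothing matches
  for (IntPair span : spans) {
    System.out.println(text.substring(span.getSource(), span.getTarget()));
  }
}
// Prints "New York" once; the lowercase hit inside "new yorkshire" is rejected
// because the character after the match is a letter.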