Use of edu.stanford.nlp.util.CollectionValuedMap in project CoreNLP by stanfordnlp.
The class ApplyPatternsMulti, method call().
@Override
public Pair<TwoDimensionalCounter<Pair<String, String>, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>> call() throws Exception {
  CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
  TwoDimensionalCounter<Pair<String, String>, E> allFreq = new TwoDimensionalCounter<>();
  for (String sentid : sentids) {
    List<CoreLabel> sent = sents.get(sentid).getTokens();
    // FIND_ALL is faster than FIND_NONOVERLAP
    Iterable<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.find(sent, SequenceMatcher.FindType.FIND_ALL);
    for (SequenceMatchResult<CoreMap> m : matched) {
      int s = m.start("$term");
      int e = m.end("$term");
      E matchedPat = patterns.get(m.pattern());
      matchedTokensByPat.add(matchedPat, new Triple<>(sentid, s, e));
      String phrase = "";
      String phraseLemma = "";
      boolean useWordNotLabeled = false;
      boolean doNotUse = false;
      // If the neighboring words are already labeled, club them together with the match
      if (constVars.clubNeighboringLabeledWords) {
        for (int i = s - 1; i >= 0; i--) {
          if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
            s = i + 1;
            break;
          }
        }
        for (int i = e; i < sent.size(); i++) {
          if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
            e = i;
            break;
          }
        }
      }
      // Discard phrases with stop words in between, but keep phrases whose stop words were removed
      // at the ends when removeStopWordsFromSelectedPhrases is true
      boolean[] addedindices = new boolean[e - s];
      Arrays.fill(addedindices, false);
      for (int i = s; i < e; i++) {
        CoreLabel l = sent.get(i);
        l.set(PatternsAnnotations.MatchedPattern.class, true);
        if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class))
          l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
        l.get(PatternsAnnotations.MatchedPatterns.class).add(matchedPat);
        for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
          if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
            doNotUse = true;
          }
        }
        boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
        if (removePhrasesWithStopWords && containsStop) {
          doNotUse = true;
        } else {
          if (!containsStop || !removeStopWordsFromSelectedPhrases) {
            if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
              useWordNotLabeled = true;
            }
            phrase += " " + l.word();
            phraseLemma += " " + l.lemma();
            addedindices[i - s] = true;
          }
        }
      }
      for (int i = 0; i < addedindices.length; i++) {
        if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
          doNotUse = true;
          break;
        }
      }
      if (!doNotUse && useWordNotLabeled) {
        phrase = phrase.trim();
        phraseLemma = phraseLemma.trim();
        allFreq.incrementCount(new Pair<>(phrase, phraseLemma), matchedPat, 1.0);
      }
    }
  }
  return new Pair<>(allFreq, matchedTokensByPat);
}
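In both the matching loop and the return value above, CollectionValuedMap acts as a multimap: add(key, value) appends the value to the collection stored under that key, creating the collection the first time the key is seen, and get(key) returns everything collected so far. A minimal, self-contained sketch of that accumulation pattern, with the pattern type simplified to String rather than the generic E used above (the class and the data here are illustrative, not CoreNLP code):

import java.util.Collection;

import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.Triple;

public class MatchedTokensSketch {
  public static void main(String[] args) {
    // Pattern string -> all (sentence id, start, end) spans it matched; values are collected per key.
    CollectionValuedMap<String, Triple<String, Integer, Integer>> matchedTokensByPat =
        new CollectionValuedMap<>();

    // add() creates the underlying collection for a key the first time it is seen.
    matchedTokensByPat.add("X was founded in", new Triple<>("sent-1", 0, 1));
    matchedTokensByPat.add("X was founded in", new Triple<>("sent-7", 3, 5));
    matchedTokensByPat.add("the city of X", new Triple<>("sent-2", 4, 5));

    // get() returns the whole collection of spans recorded for one pattern.
    Collection<Triple<String, Integer, Integer>> spans = matchedTokensByPat.get("X was founded in");
    System.out.println(spans.size() + " spans: " + spans);  // 2 spans
  }
}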
Use of edu.stanford.nlp.util.CollectionValuedMap in project CoreNLP by stanfordnlp.
The class CoNLLMentionExtractor, method extractGoldMentions().
public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) {
  List<CoreMap> sentences = conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class);
  List<List<Mention>> allGoldMentions = new ArrayList<>();
  CollectionValuedMap<String, CoreMap> corefChainMap = conllDoc.getCorefChainMap();
  for (int i = 0; i < sentences.size(); i++) {
    allGoldMentions.add(new ArrayList<>());
  }
  int maxCorefClusterId = -1;
  for (String corefIdStr : corefChainMap.keySet()) {
    int id = Integer.parseInt(corefIdStr);
    if (id > maxCorefClusterId) {
      maxCorefClusterId = id;
    }
  }
  int newMentionID = maxCorefClusterId + 1;
  for (Map.Entry<String, Collection<CoreMap>> idChainEntry : corefChainMap.entrySet()) {
    int id = Integer.parseInt(idChainEntry.getKey());
    int clusterMentionCnt = 0;
    for (CoreMap m : idChainEntry.getValue()) {
      clusterMentionCnt++;
      Mention mention = new Mention();
      mention.goldCorefClusterID = id;
      if (clusterMentionCnt == 1) {
        // First mention in cluster
        mention.mentionID = id;
        mention.originalRef = -1;
      } else {
        mention.mentionID = newMentionID;
        mention.originalRef = id;
        newMentionID++;
      }
      if (maxID < mention.mentionID)
        maxID = mention.mentionID;
      int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class);
      CoreMap sent = sentences.get(sentIndex);
      mention.startIndex = m.get(CoreAnnotations.TokenBeginAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class);
      mention.endIndex = m.get(CoreAnnotations.TokenEndAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class);
      // will be set by arrange
      mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class);
      // Mention dependency graph is the enhanced dependency graph of the sentence
      mention.dependency = sentences.get(sentIndex).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
      allGoldMentions.get(sentIndex).add(mention);
    }
  }
  return allGoldMentions;
}
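Here the CollectionValuedMap arrives pre-built from conllDoc.getCorefChainMap(), grouping every mention CoreMap under its coreference chain id. A small sketch of how such a grouped map behaves, using plain strings as stand-ins for the mention CoreMaps (the values and the way the map is populated are assumptions for illustration; the real map is constructed inside the CoNLL reader):

import java.util.Collection;
import java.util.Map;

import edu.stanford.nlp.util.CollectionValuedMap;

public class CorefChainGroupingSketch {
  public static void main(String[] args) {
    // Hypothetical stand-in for conllDoc.getCorefChainMap(): chain id (as a string) -> mentions in that chain.
    CollectionValuedMap<String, String> corefChainMap = new CollectionValuedMap<>();
    corefChainMap.add("3", "mention A");
    corefChainMap.add("3", "mention B");
    corefChainMap.add("7", "mention C");

    // keySet() gives every chain id, as used above to compute maxCorefClusterId.
    int maxId = -1;
    for (String idStr : corefChainMap.keySet()) {
      maxId = Math.max(maxId, Integer.parseInt(idStr));
    }

    // entrySet() exposes each chain together with its grouped mentions, as in the main loop above.
    for (Map.Entry<String, Collection<String>> chain : corefChainMap.entrySet()) {
      System.out.println("chain " + chain.getKey() + " has " + chain.getValue().size() + " mentions");
    }
    System.out.println("next free mention id: " + (maxId + 1));
  }
}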
Use of edu.stanford.nlp.util.CollectionValuedMap in project CoreNLP by stanfordnlp.
The class Document, method findTwinMentionsStrict().
/** Mark twin mentions: All mention boundaries should be matched */
private void findTwinMentionsStrict() {
  for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
    List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
    List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);
    // For CoNLL training there are some documents with gold mentions with the same position offsets
    // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
    // (Packwood - Roth)
    CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
    for (Mention g : golds) {
      IntPair ip = new IntPair(g.startIndex, g.endIndex);
      if (goldMentionPositions.containsKey(ip)) {
        StringBuilder existingMentions = new StringBuilder();
        for (Mention eg : goldMentionPositions.get(ip)) {
          if (existingMentions.length() > 0) {
            existingMentions.append(",");
          }
          existingMentions.append(eg.mentionID);
        }
        SieveCoreferenceSystem.logger.warning("WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
      }
      //assert(!goldMentionPositions.containsKey(ip));
      goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
    }
    for (Mention p : predicts) {
      IntPair pos = new IntPair(p.startIndex, p.endIndex);
      if (goldMentionPositions.containsKey(pos)) {
        Collection<Mention> cm = goldMentionPositions.get(pos);
        Mention g = cm.iterator().next();
        cm.remove(g);
        p.mentionID = g.mentionID;
        p.twinless = false;
        g.twinless = false;
      }
    }
    // temp: to make twinless mentions easy to recognize
    for (Mention p : predicts) {
      if (p.twinless)
        p.mentionID += 10000;
    }
  }
}
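The twin-matching trick is to bucket gold mentions by their exact token span and then let each predicted mention consume one gold mention from its bucket; cm.remove(g) works because get() hands back the live backing collection, so a consumed gold mention cannot be paired twice. A compact sketch of that consume-one-twin pattern, with integer ids standing in for Mention objects (hypothetical data, not the coreference code itself):

import java.util.Collection;

import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.IntPair;

public class TwinMatchingSketch {
  public static void main(String[] args) {
    // Gold span -> ids of gold mentions at that exact position (duplicate offsets do occur in CoNLL data).
    CollectionValuedMap<IntPair, Integer> goldMentionPositions = new CollectionValuedMap<>();
    goldMentionPositions.add(new IntPair(2, 5), 10);
    goldMentionPositions.add(new IntPair(2, 5), 11);  // same offsets, different gold mention
    goldMentionPositions.add(new IntPair(7, 8), 12);

    // A predicted mention at (2, 5) consumes one gold twin so it cannot be reused.
    IntPair predictedSpan = new IntPair(2, 5);
    if (goldMentionPositions.containsKey(predictedSpan)) {
      Collection<Integer> candidates = goldMentionPositions.get(predictedSpan);
      Integer twin = candidates.iterator().next();
      candidates.remove(twin);  // assumes get() returns the live backing collection, as the code above does
      System.out.println("paired predicted mention with gold id " + twin
          + "; remaining golds at that span: " + candidates.size());
    }
  }
}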
Use of edu.stanford.nlp.util.CollectionValuedMap in project CoreNLP by stanfordnlp.
The class DocumentPreprocessor, method findTwinMentionsStrict().
/** Mark twin mentions: All mention boundaries should be matched */
private static void findTwinMentionsStrict(Document doc) {
  for (int sentNum = 0; sentNum < doc.goldMentions.size(); sentNum++) {
    List<Mention> golds = doc.goldMentions.get(sentNum);
    List<Mention> predicts = doc.predictedMentions.get(sentNum);
    // For CoNLL training there are some documents with gold mentions with the same position offsets
    // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
    // (Packwood - Roth)
    CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
    for (Mention g : golds) {
      IntPair ip = new IntPair(g.startIndex, g.endIndex);
      if (goldMentionPositions.containsKey(ip)) {
        StringBuilder existingMentions = new StringBuilder();
        for (Mention eg : goldMentionPositions.get(ip)) {
          if (existingMentions.length() > 0) {
            existingMentions.append(",");
          }
          existingMentions.append(eg.mentionID);
        }
        Redwood.log("debug-preprocessor", "WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
      }
      //assert(!goldMentionPositions.containsKey(ip));
      goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
    }
    for (Mention p : predicts) {
      IntPair pos = new IntPair(p.startIndex, p.endIndex);
      if (goldMentionPositions.containsKey(pos)) {
        Collection<Mention> cm = goldMentionPositions.get(pos);
        int minId = Integer.MAX_VALUE;
        Mention g = null;
        for (Mention m : cm) {
          if (m.mentionID < minId) {
            g = m;
            minId = m.mentionID;
          }
        }
        cm.remove(g);
        p.mentionID = g.mentionID;
        p.hasTwin = true;
        g.hasTwin = true;
      }
    }
  }
}
Use of edu.stanford.nlp.util.CollectionValuedMap in project CoreNLP by stanfordnlp.
The class ApplyPatterns, method call().
@Override
public Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> call() throws Exception {
  try {
    Set<CandidatePhrase> alreadyLabeledPhrases = new HashSet<>();
    TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
    CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
    for (String sentid : sentids) {
      List<CoreLabel> sent = sents.get(sentid).getTokens();
      for (Entry<TokenSequencePattern, E> pEn : patterns.entrySet()) {
        if (pEn.getKey() == null)
          throw new RuntimeException("why is the pattern " + pEn + " null?");
        TokenSequenceMatcher m = pEn.getKey().getMatcher(sent);
        // Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
        // m.setFindType(SequenceMatcher.FindType.FIND_ALL);
        // Higher branch values make matching faster but use more memory
        m.setBranchLimit(5);
        while (m.find()) {
          int s = m.start("$term");
          int e = m.end("$term");
          assert e - s <= PatternFactory.numWordsCompoundMapped.get(label) : "How come the pattern " + pEn.getKey() + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped.get(label) + " for label " + label;
          String phrase = "";
          String phraseLemma = "";
          boolean useWordNotLabeled = false;
          boolean doNotUse = false;
          // If the neighboring words are already labeled, club them together with the match
          if (constVars.clubNeighboringLabeledWords) {
            for (int i = s - 1; i >= 0; i--) {
              if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                s = i + 1;
                break;
              }
            }
            for (int i = e; i < sent.size(); i++) {
              if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                e = i;
                break;
              }
            }
          }
          // Discard phrases with stop words in between, but keep phrases whose stop words were removed
          // at the ends when removeStopWordsFromSelectedPhrases is true
          boolean[] addedindices = new boolean[e - s];
          Arrays.fill(addedindices, false);
          for (int i = s; i < e; i++) {
            CoreLabel l = sent.get(i);
            l.set(PatternsAnnotations.MatchedPattern.class, true);
            if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
              l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
            SurfacePattern pSur = (SurfacePattern) pEn.getValue();
            assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
            assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
            l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
            for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
              if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                doNotUse = true;
              }
            }
            boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
            if (removePhrasesWithStopWords && containsStop) {
              doNotUse = true;
            } else {
              if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                  useWordNotLabeled = true;
                }
                phrase += " " + l.word();
                phraseLemma += " " + l.lemma();
                addedindices[i - s] = true;
              }
            }
          }
          for (int i = 0; i < addedindices.length; i++) {
            if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
              doNotUse = true;
              break;
            }
          }
          if (!doNotUse) {
            matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
            phrase = phrase.trim();
            if (!phrase.isEmpty()) {
              phraseLemma = phraseLemma.trim();
              CandidatePhrase candPhrase = CandidatePhrase.createOrGet(phrase, phraseLemma);
              allFreq.incrementCount(candPhrase, pEn.getValue(), 1.0);
              if (!useWordNotLabeled)
                alreadyLabeledPhrases.add(candPhrase);
            }
          }
        }
      }
    }
    return new Triple<>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
  } catch (Exception e) {
    e.printStackTrace();
    throw e;
  }
}
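For context, a caller of this task receives the Triple built above and typically merges the per-thread statistics into global counts. A hypothetical consumer, with CandidatePhrase and the pattern type E both simplified to String and assuming the usual TwoDimensionalCounter accessors (firstKeySet, getCounter), might look like this:

import java.util.Collection;
import java.util.Map;
import java.util.Set;

import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.Triple;

public class ApplyPatternsResultSketch {
  // Hypothetical consumer of the Triple returned by call(); phrase and pattern types simplified to String.
  static void report(Triple<TwoDimensionalCounter<String, String>,
                            CollectionValuedMap<String, Triple<String, Integer, Integer>>,
                            Set<String>> result) {
    TwoDimensionalCounter<String, String> phraseByPattern = result.first();
    CollectionValuedMap<String, Triple<String, Integer, Integer>> matchedTokensByPat = result.second();

    // How often each candidate phrase was extracted, summed over all patterns that matched it.
    for (String phrase : phraseByPattern.firstKeySet()) {
      System.out.println(phrase + " -> " + phraseByPattern.getCounter(phrase).totalCount());
    }

    // Which sentence spans each pattern matched, read straight out of the CollectionValuedMap.
    for (Map.Entry<String, Collection<Triple<String, Integer, Integer>>> en : matchedTokensByPat.entrySet()) {
      System.out.println(en.getKey() + " matched " + en.getValue().size() + " spans");
    }

    System.out.println(result.third().size() + " phrases were already labeled");
  }
}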