use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class PrepSRLAnnotator method addView.
/**
 * Builds the preposition view for {@code ta}: collects single-token, bigram and trigram
 * preposition candidates starting at every token, labels each candidate with the classifier,
 * and stores every candidate whose predicted label is not {@code DataReader.CANDIDATE} as a
 * span label in a new {@link SpanLabelView} registered under {@code viewName}.
 *
 * @param ta the text annotation to read tokens from and to attach the new view to
 * @throws AnnotatorException declared by the overridden method
 */
@Override
protected void addView(TextAnnotation ta) throws AnnotatorException {
    // Phase 1: gather every candidate span, token by token.
    List<Constituent> prepCandidates = new ArrayList<>();
    for (Constituent token : ta.getView(ViewNames.TOKENS).getConstituents()) {
        int start = token.getStartSpan();

        if (PrepSRLDataReader.isPrep(ta, start)) {
            prepCandidates.add(
                    token.cloneForNewViewWithDestinationLabel(viewName, DataReader.CANDIDATE));
        }

        // Multi-word prepositions are checked regardless of the single-token result.
        Constituent bigram = PrepSRLDataReader.isBigramPrep(ta, start, viewName);
        if (bigram != null) {
            prepCandidates.add(bigram);
        }

        Constituent trigram = PrepSRLDataReader.isTrigramPrep(ta, start, viewName);
        if (trigram != null) {
            prepCandidates.add(trigram);
        }
    }

    // Phase 2: classify the candidates and keep only those that received a real role.
    SpanLabelView roleView =
            new SpanLabelView(viewName, viewName + "-annotator", ta, 1.0, true);
    for (Constituent candidate : prepCandidates) {
        String predictedRole = classifier.discreteValue(candidate);
        if (predictedRole.equals(DataReader.CANDIDATE)) {
            // The classifier left the placeholder label in place: nothing to record.
            continue;
        }
        roleView.addSpanLabel(
                candidate.getStartSpan(), candidate.getEndSpan(), predictedRole, 1.0);
    }

    ta.addView(viewName, roleView);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class CorefMUCEvaluator method evaluate.
/**
 * Accumulates MUC-style link counts for one gold/predicted coreference view pair and records
 * them on {@code tester} under the category {@code "coref"}.
 *
 * <p>Recall counts: {@code sum_i(|s_i| - |p(s_i)|)} over {@code sum_i(|s_i| - 1)}, where
 * {@code s_i} is a gold cluster and {@code p(s_i)} is the set of predicted clusters that share
 * at least one mention with it. Precision counts are obtained dually by swapping the roles of
 * gold and prediction.
 *
 * <p>Both views are stored in the {@code gold} and {@code prediction} fields as a side effect.
 *
 * @param tester receives {@code (denominator1, denominator2, numerator1)}; presumably these are
 *        the gold, predicted and correct link counts -- confirm against
 *        {@code ClassificationTester.recordCount}
 * @param goldView the gold annotation; must be a {@link CoreferenceView}
 * @param predictionView the predicted annotation; must be a {@link CoreferenceView}
 * @throws ClassCastException if either view is not a {@link CoreferenceView}
 */
public void evaluate(ClassificationTester tester, View goldView, View predictionView) {
    this.gold = (CoreferenceView) goldView;
    this.prediction = (CoreferenceView) predictionView;

    // Recall side: iterate gold clusters, subtract one per overlapping predicted cluster.
    int numerator1 = 0;
    int denominator1 = 0;
    for (Constituent goldCanonicalCons : gold.getCanonicalEntitiesViaRelations()) {
        Set<Constituent> consInGoldCluster =
                new HashSet<>(gold.getCoreferentMentionsViaRelations(goldCanonicalCons));
        for (Constituent predCanonicalCons : prediction.getCanonicalEntitiesViaRelations()) {
            Set<Constituent> consInPredCluster =
                    new HashSet<>(prediction.getCoreferentMentionsViaRelations(predCanonicalCons));
            if (clustersOverlap(consInGoldCluster, consInPredCluster)) {
                numerator1 -= 1;
            }
        }
        numerator1 += consInGoldCluster.size();
        denominator1 += consInGoldCluster.size() - 1;
    }

    // Precision side: same computation with gold and prediction swapped.
    int numerator2 = 0;
    int denominator2 = 0;
    for (Constituent predCanonicalCons : prediction.getCanonicalEntitiesViaRelations()) {
        Set<Constituent> consInPredCluster =
                new HashSet<>(prediction.getCoreferentMentionsViaRelations(predCanonicalCons));
        for (Constituent goldCanonicalCons : gold.getCanonicalEntitiesViaRelations()) {
            Set<Constituent> consInGoldCluster =
                    new HashSet<>(gold.getCoreferentMentionsViaRelations(goldCanonicalCons));
            if (clustersOverlap(consInPredCluster, consInGoldCluster)) {
                numerator2 -= 1;
            }
        }
        numerator2 += consInPredCluster.size();
        denominator2 += consInPredCluster.size() - 1;
    }

    // The ratios themselves are not computed here; only the raw counts are handed to the
    // tester. The two numerators are expected to agree, which is checked when assertions
    // are enabled.
    assert (numerator1 == numerator2);
    tester.recordCount("coref", denominator1, denominator2, numerator1);
}

/**
 * Tells whether any mention of {@code outer} matches any mention of {@code inner}, ignoring
 * attributes. Stops at the first match instead of materialising the full intersection.
 * The comparison is invoked on the {@code inner} element with the {@code outer} element as
 * argument, which is the direction the evaluation loops have always used.
 */
private static boolean clustersOverlap(Set<Constituent> outer, Set<Constituent> inner) {
    for (Constituent outerMention : outer) {
        for (Constituent innerMention : inner) {
            if (innerMention.equalsWithoutAttributeEqualityCheck(outerMention)) {
                return true;
            }
        }
    }
    return false;
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class AnnotationFixer method rationalizeBoundaryAnnotations.
/**
 * Corrects automated sentence-splitting annotations based on gold annotations of entity
 * mentions: whenever a sentence starts strictly inside a mention, that sentence is merged with
 * the sentence preceding it so the mention no longer crosses a sentence boundary.
 *
 * <p>The SENTENCE view of {@code ta} is modified in place and {@code ta.setSentences()} is
 * called afterwards. The merged sentence takes its label/score information from the first
 * sentence and receives the attributes of both sentences (the second sentence's attributes
 * are added after the first's).
 *
 * @param ta TextAnnotation with annotated clean text
 * @param viewName name of the view whose constituents (the mentions) must not be split by a
 *        sentence boundary
 * @return a map of view names to indexes indicating where violations were found/corrected;
 *         currently holds a single entry for {@code ViewNames.SENTENCE} containing the start
 *         token indexes of the offending sentences
 * @throws IllegalStateException if a sentence takes part in more than one merge (more than
 *         two consecutive sentences with boundary errors); raised before the view is modified
 */
public static Map<String, Set<Integer>> rationalizeBoundaryAnnotations(TextAnnotation ta, String viewName) {
    Map<String, Set<Integer>> violations = new HashMap<>();
    Set<Integer> badSentenceStartIndexes = new HashSet<>();
    violations.put(ViewNames.SENTENCE, badSentenceStartIndexes);

    // Index sentences by start token so they can be walked in document order.
    View sentences = ta.getView(ViewNames.SENTENCE);
    TreeMap<Integer, Constituent> sentenceStarts = new TreeMap<>();
    for (Constituent s : sentences) {
        sentenceStarts.put(s.getStartSpan(), s);
    }

    Set<Pair<Constituent, Constituent>> sentencesToMerge = new HashSet<>();
    View nerMention = ta.getView(viewName);
    for (Constituent m : nerMention.getConstituents()) {
        int mentStart = m.getStartSpan();
        int mentEnd = m.getEndSpan();
        Constituent lastSent = null;
        for (Map.Entry<Integer, Constituent> entry : sentenceStarts.entrySet()) {
            int sentStart = entry.getKey();
            // ordered sentence list, so stop after the mention
            if (sentStart > mentEnd) {
                break;
            }
            Constituent currentSent = entry.getValue();
            if (sentStart > mentStart && sentStart < mentEnd) {
                // Without a preceding sentence there is nothing to merge with; the violation
                // is still reported.
                if (lastSent != null) {
                    sentencesToMerge.add(new Pair<>(lastSent, currentSent));
                }
                badSentenceStartIndexes.add(sentStart);
            }
            lastSent = currentSent;
        }
    }

    // Validate first so that an unsupported configuration leaves the view untouched.
    Set<Integer> sentStartsProcessed = new HashSet<>();
    for (Pair<Constituent, Constituent> sentPair : sentencesToMerge) {
        int firstStart = sentPair.getFirst().getStartSpan();
        int secondStart = sentPair.getSecond().getStartSpan();
        if (sentStartsProcessed.contains(firstStart) || sentStartsProcessed.contains(secondStart)) {
            throw new IllegalStateException("more complex boundary constraints than I can currently handle -- " + "more than two consecutive sentences with boundary errors.");
        }
        sentStartsProcessed.add(firstStart);
        sentStartsProcessed.add(secondStart);
    }

    for (Pair<Constituent, Constituent> sentPair : sentencesToMerge) {
        Constituent first = sentPair.getFirst();
        Constituent second = sentPair.getSecond();

        // Span runs from the start of the first sentence to the end of the second.
        Constituent combinedSent;
        if (null == first.getLabelsToScores()) {
            combinedSent = new Constituent(first.getLabel(), first.getConstituentScore(), ViewNames.SENTENCE, first.getTextAnnotation(), first.getStartSpan(), second.getEndSpan());
        } else {
            combinedSent = new Constituent(first.getLabelsToScores(), ViewNames.SENTENCE, first.getTextAnnotation(), first.getStartSpan(), second.getEndSpan());
        }

        for (String k : first.getAttributeKeys()) {
            combinedSent.addAttribute(k, first.getAttribute(k));
        }
        // Values for the second sentence's keys must be read from the second sentence.
        for (String k : second.getAttributeKeys()) {
            combinedSent.addAttribute(k, second.getAttribute(k));
        }

        sentences.removeConstituent(first);
        sentences.removeConstituent(second);
        sentences.addConstituent(combinedSent);
    }

    ta.setSentences();
    return violations;
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class SimpleGazetteerAnnotator method addView.
/**
 * Creates the gazetteer view for {@code ta}. The view will consist of potentially overlapping
 * constituents representing those tokens that matched entries in the gazetteers; some tokens
 * will match against several gazetteers.
 *
 * <p>For every token position, each case-sensitive dictionary is consulted immediately before
 * the case-insensitive dictionary stored at the same index.
 *
 * @param ta the text annotation whose TOKENS view is matched and which receives the new view
 * @throws AnnotatorException declared by the overridden method
 */
@Override
public void addView(TextAnnotation ta) throws AnnotatorException {
    List<Constituent> tokens = ta.getView(ViewNames.TOKENS).getConstituents();
    SpanLabelView matches =
            new SpanLabelView(this.getViewName(), this.getClass().getName(), ta, 1d, true);

    for (int tokenPos = 0; tokenPos < tokens.size(); tokenPos++) {
        for (int d = 0; d < dictionaries.size(); d++) {
            // Each matcher writes whatever it finds at tokenPos straight into the view.
            dictionaries.get(d).match(tokens, tokenPos, matches);
            dictionariesIgnoreCase.get(d).match(tokens, tokenPos, matches);
        }
    }

    ta.addView(matches.getViewName(), matches);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class CurrencyIndicator method getFeatures.
/**
 * Returns a set containing the {@code CURRENCY} feature when a currency span inside {@code c}
 * is accompanied by a number, and an empty set otherwise.
 *
 * <p>On first use the currency list is loaded, and the currency view ({@code VIEW_NAME}) is
 * added to the constituent's {@link TextAnnotation} if it is not already present.
 *
 * @param c the constituent to extract features for
 * @return an insertion-ordered set holding at most the {@code CURRENCY} feature
 * @throws EdisonException if loading the currency list or building the currency view fails;
 *         the underlying exception is attached as the cause
 */
@Override
public Set<Feature> getFeatures(Constituent c) throws EdisonException {
    try {
        // Load lazily; the flag is re-checked under the lock so only one thread loads.
        // NOTE(review): the declaration of `loaded` is not visible here -- confirm it is
        // volatile, otherwise the unsynchronised first check is not safe.
        if (!loaded) {
            synchronized (this) {
                // now it is changed to be loaded from the datastore.
                if (!loaded) {
                    loadCurrency(gzip, true);
                }
            }
        }
    } catch (Exception ex) {
        throw new EdisonException(ex);
    }

    TextAnnotation ta = c.getTextAnnotation();
    if (!ta.hasView(VIEW_NAME)) {
        try {
            addCurrencyView(ta);
        } catch (Exception e) {
            // Report the failure like the loading step above instead of printing the stack
            // trace and then reading a view that was never created.
            throw new EdisonException(e);
        }
    }

    SpanLabelView view = (SpanLabelView) ta.getView(VIEW_NAME);
    Set<Feature> features = new LinkedHashSet<>();
    for (Constituent cc : view.where(Queries.containedInConstituent(c))) {
        if (cc.getEndSpan() == c.getEndSpan()) {
            // NOTE(review): for a span contained in c this condition looks unsatisfiable;
            // possibly `cc.getStartSpan() - 1 >= c.getStartSpan()` was intended. Left as is
            // because changing it would alter the extracted features -- confirm intent.
            if (cc.getStartSpan() - 1 > c.getEndSpan()) {
                // check if the token before the currency span is a number
                if (WordLists.NUMBERS.contains(ta.getToken(cc.getStartSpan() - 1).toLowerCase())) {
                    features.add(CURRENCY);
                    break;
                }
            }
        } else if (WordFeatureExtractorFactory.numberNormalizer.getWordFeatures(ta, cc.getEndSpan()).size() > 0) {
            // The token at the currency span's end index yields number features.
            features.add(CURRENCY);
            break;
        }
    }
    return features;
}
Aggregations