use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
the class AnnotationFixer method rationalizeBoundaryAnnotations.
/**
 * Correct automated sentence-boundary annotations based on gold annotations of entity mentions.
 * <p>
 * Whenever a sentence start falls strictly inside a mention from the constraining view, that
 * sentence and the one preceding it are replaced in the SENTENCE view by a single constituent
 * spanning both. The merged sentence takes its label/score information from the first sentence
 * and carries the attributes of both (the second sentence's value wins on a key clash).
 *
 * @param ta TextAnnotation with annotated clean text
 * @param viewName name of the view whose constituents (mentions) must not cross sentence boundaries
 * @return a map of view names to indexes indicating where violations were found/corrected; the
 *         only key populated here is {@code ViewNames.SENTENCE}, holding the offending sentence
 *         start spans
 * @throws IllegalStateException if a sentence would take part in more than one merge, i.e. more
 *         than two consecutive sentences have boundary errors
 */
public static Map<String, Set<Integer>> rationalizeBoundaryAnnotations(TextAnnotation ta, String viewName) {
    Map<String, Set<Integer>> violations = new HashMap<>();
    Set<Integer> badSentenceStartIndexes = new HashSet<>();
    violations.put(ViewNames.SENTENCE, badSentenceStartIndexes);

    // Index sentences by start span; TreeMap iteration gives them in ascending start order.
    View sentences = ta.getView(ViewNames.SENTENCE);
    TreeMap<Integer, Constituent> sentenceStarts = new TreeMap<>();
    for (Constituent s : sentences) {
        sentenceStarts.put(s.getStartSpan(), s);
    }

    Set<Pair<Constituent, Constituent>> sentencesToMerge = new HashSet<>();
    View nerMention = ta.getView(viewName);
    for (Constituent m : nerMention.getConstituents()) {
        // Mention span is invariant across the sentence scan, so read it once.
        int mentStart = m.getStartSpan();
        int mentEnd = m.getEndSpan();
        Constituent lastSent = null;
        for (Map.Entry<Integer, Constituent> entry : sentenceStarts.entrySet()) {
            int sentStart = entry.getKey();
            // ordered sentence list, so stop after the mention has been passed
            if (sentStart > mentEnd) {
                break;
            }
            Constituent currentSent = entry.getValue();
            // A sentence starting strictly inside the mention splits it: merge with its predecessor.
            if (sentStart > mentStart && sentStart < mentEnd) {
                sentencesToMerge.add(new Pair<>(lastSent, currentSent));
                badSentenceStartIndexes.add(sentStart);
            }
            lastSent = currentSent;
        }
    }

    Set<Integer> sentStartsProcessed = new HashSet<>();
    for (Pair<Constituent, Constituent> sentPair : sentencesToMerge) {
        Constituent first = sentPair.getFirst();
        Constituent second = sentPair.getSecond();
        int firstStart = first.getStartSpan();
        int secondStart = second.getStartSpan();
        if (sentStartsProcessed.contains(firstStart) || sentStartsProcessed.contains(secondStart)) {
            throw new IllegalStateException("more complex boundary constraints than I can currently handle -- " + "more than two consecutive sentences with boundary errors.");
        }

        Constituent combinedSent;
        if (null == first.getLabelsToScores()) {
            combinedSent = new Constituent(first.getLabel(), first.getConstituentScore(), ViewNames.SENTENCE, first.getTextAnnotation(), first.getStartSpan(), second.getEndSpan());
        } else {
            combinedSent = new Constituent(first.getLabelsToScores(), ViewNames.SENTENCE, first.getTextAnnotation(), first.getStartSpan(), second.getEndSpan());
        }
        for (String k : first.getAttributeKeys()) {
            combinedSent.addAttribute(k, first.getAttribute(k));
        }
        // Copy the second sentence's attributes from the second sentence itself.
        for (String k : second.getAttributeKeys()) {
            combinedSent.addAttribute(k, second.getAttribute(k));
        }

        sentences.removeConstituent(first);
        sentences.removeConstituent(second);
        sentences.addConstituent(combinedSent);

        // Record both sentences as consumed so the guard above can detect chained merges.
        sentStartsProcessed.add(firstStart);
        sentStartsProcessed.add(secondStart);
    }
    // NOTE(review): presumably rebuilds the TextAnnotation's sentence bookkeeping from the
    // modified SENTENCE view -- confirm against TextAnnotation.setSentences().
    ta.setSentences();
    return violations;
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
the class BasicAnnotatorService method addView.
/**
 * Adds the named view to the given TextAnnotation, first adding any views the providing
 * annotator lists as required. DOES NOT CACHE THE ADDED VIEW!!!
 * <p>
 * Requests for the SENTENCE and TOKENS views are ignored and report no modification.
 *
 * @param textAnnotation textAnnotation to be modified
 * @param viewName name of view to be added
 * @return 'true' if textAnnotation was modified
 * @throws AnnotatorException if no registered provider can produce the requested view
 */
@Override
public boolean addView(TextAnnotation textAnnotation, String viewName) throws AnnotatorException {
    if (ViewNames.SENTENCE.equals(viewName) || ViewNames.TOKENS.equals(viewName)) {
        return false;
    }

    // Nothing to do when the view is already present and no refresh is being forced.
    if (textAnnotation.hasView(viewName) && !forceUpdate) {
        return false;
    }

    if (!viewProviders.containsKey(viewName)) {
        throw new AnnotatorException("View '" + viewName + "' cannot be provided by this AnnotatorService.");
    }

    Annotator provider = viewProviders.get(viewName);
    // Satisfy prerequisites first; whether each one was newly added is not needed here.
    for (String requiredView : provider.getRequiredViews()) {
        addView(textAnnotation, requiredView);
    }

    View producedView = provider.getView(textAnnotation);
    textAnnotation.addView(provider.getViewName(), producedView);

    // The annotation was modified at this point; optionally signal that it did not come from cache.
    if (throwExceptionIfNotCached) {
        throwNotCachedException(textAnnotation.getCorpusId(), textAnnotation.getId(), textAnnotation.getText());
    }
    return true;
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
the class AnnotationFixerTest method testAnnotationFixer.
/**
 * Checks that AnnotationFixer leaves sentences untouched for a view that respects sentence
 * boundaries, and merges two sentences when a constituent straddles the boundary between them.
 */
@Test
public void testAnnotationFixer() {
    TextAnnotation ta = DummyTextAnnotationGenerator.generateAnnotatedTextAnnotation(false, 3);

    // Build a span that begins two tokens before the second sentence and ends one token into it.
    View sentenceView = ta.getView(ViewNames.SENTENCE);
    int secondSentenceStart = sentenceView.getConstituents().get(1).getStartSpan();
    int spanStart = secondSentenceStart - 2;
    int spanEnd = secondSentenceStart + 1;

    // First, run the fixer with an existing view (which must respect sentence boundaries):
    // the sentence count is expected to stay at three.
    AnnotationFixer.rationalizeBoundaryAnnotations(ta, ViewNames.PSEUDO_PARSE_STANFORD);
    assertEquals(3, ta.getNumberOfSentences());

    // Then add a view holding the boundary-crossing constituent and run the fixer against it.
    View crossingView = new View(VNAME, VNAME, ta, 1.0);
    ta.addView(VNAME, crossingView);
    Constituent crossing = new Constituent("CONSTRAINT", VNAME, ta, spanStart, spanEnd);
    crossingView.addConstituent(crossing);

    AnnotationFixer.rationalizeBoundaryAnnotations(ta, VNAME);
    assertEquals(2, ta.getNumberOfSentences());
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
the class JsonSerializerTest method verifySerializedJSONObject.
/**
 * Behavior specific to unit tests only. Use with caution.
 * <p>
 * Verifies a JSON-serialized TextAnnotation against the annotation it came from: token offset
 * count, the surface form of one sample token, the absence of label-to-score maps on a POS
 * constituent, and the label-to-score maps stored in the "rhyme" view.
 *
 * @param jobj the parsed JSON form of the serialized TextAnnotation
 * @param ta the TextAnnotation that was serialized
 */
public static void verifySerializedJSONObject(JsonObject jobj, TextAnnotation ta) {
    assertNotNull(jobj);

    JsonArray tokenOffsetArray = jobj.get(JsonSerializer.TOKENOFFSETS).getAsJsonArray();
    assertNotNull(tokenOffsetArray);
    assertEquals(ta.getTokens().length, tokenOffsetArray.size());

    // Map each serialized (start, end) character span to its serialized token form.
    Map<IntPair, String> formsByCharSpan = new HashMap<>();
    for (int idx = 0; idx < tokenOffsetArray.size(); idx++) {
        JsonObject entry = (JsonObject) tokenOffsetArray.get(idx);
        formsByCharSpan.put(
                new IntPair(
                        entry.get(JsonSerializer.STARTCHAROFFSET).getAsInt(),
                        entry.get(JsonSerializer.ENDCHAROFFSET).getAsInt()),
                entry.get(JsonSerializer.FORM).getAsString());
    }

    // Spot-check the token at index 6: its character span must map to its surface form.
    Constituent tokenAtSix = ta.getView(ViewNames.TOKENS).getConstituents().get(6);
    IntPair expectedSpan = new IntPair(tokenAtSix.getStartCharOffset(), tokenAtSix.getEndCharOffset());
    String expectedForm = tokenAtSix.getSurfaceForm();
    String serializedForm = formsByCharSpan.get(expectedSpan);
    assertNotNull(serializedForm);
    assertEquals(expectedForm, serializedForm);

    // The POS constituent at index 3 is expected to carry no label-to-score map.
    Constituent posAtThree = ta.getView(ViewNames.POS).getConstituents().get(3);
    assertEquals(null, posAtThree.getLabelsToScores());

    // The "rhyme" view's first relation and its source constituent must both keep their maps.
    View rhymeView = ta.getView("rhyme");
    assertNotNull(rhymeView);

    Relation firstRelation = rhymeView.getRelations().get(0);
    Map<String, Double> relationScores = firstRelation.getLabelsToScores();
    assertNotNull(relationScores);
    assertEquals(2, relationScores.size());

    Constituent relationSource = firstRelation.getSource();
    Map<String, Double> sourceScores = relationSource.getLabelsToScores();
    assertNotNull(sourceScores);
    assertEquals(4, sourceScores.size());
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
the class JsonSerializerTest method testSerializerWithCharOffsets.
/**
 * Builds a "rhyme" view containing two constituents with label-to-score maps and a relation
 * between them, serializes the TextAnnotation to JSON, and verifies the parsed result.
 */
@Test
public void testSerializerWithCharOffsets() {
    View rhymes = new View("rhyme", "test", ta, 0.4);

    String[] labels = { "eeny", "meeny", "miny", "mo" };
    double[] scores = { 0.15, 0.15, 0.3, 0.4 };

    Map<String, Double> labelScores = new TreeMap<>();
    for (int k = 0; k < labels.length; k++) {
        labelScores.put(labels[k], scores[k]);
    }
    Constituent source = new Constituent(labelScores, "rhyme", ta, 2, 4);
    rhymes.addConstituent(source);

    // No constraint on scores -- they don't have to sum to 1.0. Every label except the first
    // gets the score from the mirrored position; the same map instance is reused on purpose.
    for (int k = 1; k < labels.length; k++) {
        labelScores.put(labels[k], scores[3 - k]);
    }
    Constituent target = new Constituent(labelScores, "rhyme", ta, 2, 4);
    rhymes.addConstituent(target);

    Map<String, Double> relationScores = new TreeMap<>();
    relationScores.put("Yes", 0.8);
    relationScores.put("No", 0.2);
    Relation rhymeRelation = new Relation(relationScores, source, target);
    rhymes.addRelation(rhymeRelation);

    ta.addView("rhyme", rhymes);

    String serialized = SerializationHelper.serializeToJson(ta, true);
    logger.info(serialized);

    JsonObject parsed = (JsonObject) new JsonParser().parse(serialized);
    JsonSerializerTest.verifySerializedJSONObject(parsed, ta);
}
Aggregations