use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class OntonotesNamedEntityReader method nextAnnotation.
/**
* parse the Penn Treebank parse file, producing an annotation covering the entire file.
* @param data the contents of the file.
* @param docid the id representing the document name.
* @return the text annotation.
* @throws AnnotatorException if the annotation could not be created.
*/
private XmlTextAnnotation nextAnnotation(String data, String docid) throws AnnotatorException {
// we keep everything.
XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
StatefulTokenizer st = new StatefulTokenizer();
TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
// read the file and create the annotation.
XmlTextAnnotation xta = xtam.createTextAnnotation(data, "OntoNotes 5.0", docid);
TextAnnotation ta = xta.getTextAnnotation();
List<SpanInfo> fudge = xta.getXmlMarkup();
// create the named entity view.
View nerView = new SpanLabelView(VIEW_NAME, ta);
for (SpanInfo si : fudge) {
if ("enamex".equalsIgnoreCase(si.label)) {
IntPair charOffsets = si.spanOffsets;
Pair<String, IntPair> neLabelPair = si.attributes.get("type");
String neLabel = neLabelPair.getFirst();
int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
// StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
if (cleanTextNeTokStart == -1 || cleanTextNeTokEnd == -1) {
for (Constituent c : nerView.getConstituents()) {
System.err.println(c);
}
System.err.println("Something wonky in \"" + docid + "\", at " + charOffsets + ", " + cleanTextCharStart + " - " + cleanTextCharEnd + " = " + ta.text.substring(cleanTextCharStart, cleanTextCharEnd));
} else {
if (entityCounts.containsKey(neLabel)) {
entityCounts.put(neLabel, (entityCounts.get(neLabel) + 1));
} else {
entityCounts.put(neLabel, 1);
}
// constituent token indexing uses one-past-the-end
Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
nerView.addConstituent(neCon);
}
}
}
ta.addView(VIEW_NAME, nerView);
return xta;
}
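For orientation, here is a minimal, self-contained sketch of the same XmlTextAnnotationMaker pipeline run over a tiny ENAMEX fragment. The tag-set configuration, the class and document names, and the import paths are assumptions for illustration; the reader's actual tagsWithText, tagsWithAtts and dropTags fields are configured elsewhere in the class.
import java.util.*;
import edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation;
import edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor;
import edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo;
import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer;
import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder;

public class EnamexPipelineSketch {
    public static void main(String[] args) {
        // Assumed configuration: retain enamex tags and their "type" attribute, nothing else.
        Map<String, Set<String>> tagsWithAtts = new HashMap<>();
        tagsWithAtts.put("enamex", Collections.singleton("type"));
        XmlDocumentProcessor xmlProcessor =
                new XmlDocumentProcessor(Collections.emptySet(), tagsWithAtts, Collections.emptySet(), true);
        XmlTextAnnotationMaker xtam =
                new XmlTextAnnotationMaker(new TokenizerTextAnnotationBuilder(new StatefulTokenizer()), xmlProcessor);
        String data = "<ENAMEX TYPE=\"PERSON\">Paula Zahn</ENAMEX> spoke .";
        XmlTextAnnotation xta = xtam.createTextAnnotation(data, "OntoNotes 5.0", "demo-doc");
        TextAnnotation ta = xta.getTextAnnotation();
        // Project each retained tag's offsets from the original xml into the clean text.
        for (SpanInfo si : xta.getXmlMarkup()) {
            int start = xta.getXmlSt().computeModifiedOffsetFromOriginal(si.spanOffsets.getFirst());
            int end = xta.getXmlSt().computeModifiedOffsetFromOriginal(si.spanOffsets.getSecond());
            System.out.println(si.label + " -> \"" + ta.getText().substring(start, end) + "\"");
        }
    }
}
The point of the split is that the tokenizer only ever sees the cleaned text, while the StringTransformation keeps enough information to map any original-xml offset into it.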
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class OntonotesCorefReader method parseLines.
/**
* Line boundaries have no meaning, since the input data is XML. We concatenate
* the lines into a single buffer (separated by spaces) and hand that to an XML
* parser to produce the data.
*/
@Override
protected TextAnnotation parseLines(ArrayList<String> lines) throws AnnotatorException {
// get the treebank parse data.
if (!this.otr.hasNext())
throw new RuntimeException("There were not as many treebank files as there were coref files.");
// get the treebank parse using the ontonotes treebank reader.
TextAnnotation resultTA = this.otr.next();
if (resultTA == null)
return null;
View nerView = null;
String[] nerTokens = null;
TextAnnotation nerTA = null;
// get the named entity data.
if (!this.oner.hasNext()) {
// no NER, return.
return null;
} else {
// All this code is just to get the named entity
XmlTextAnnotation xmlta = this.oner.next();
if (xmlta != null) {
nerTA = xmlta.getTextAnnotation();
if (nerTA == null) {
logger.error("There was no NER text annotation in \"" + this.oner.currentfile + "\"");
return null;
} else {
nerView = nerTA.getView(OntonotesNamedEntityReader.VIEW_NAME);
if (nerView == null) {
logger.error("There was no NER view in \"" + this.oner.currentfile + "\"");
return null;
}
nerTokens = nerTA.getTokens();
}
} else {
// the file did not exist.
return null;
}
}
// nothing to work on, just return.
if (lines.size() == 0)
return null;
// construct a single string
StringBuffer sb = new StringBuffer(lines.get(0));
for (int i = 1; i < lines.size(); i++) {
sb.append(" ");
sb.append(lines.get(i));
}
// produce a document object.
String text = sb.toString();
Document doc = null;
try {
doc = SimpleXMLParser.getDocument(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8.name())));
} catch (UnsupportedEncodingException | XMLException e) {
throw new AnnotatorException("Could not decode the text from the XML document.");
}
if (doc == null) {
throw new AnnotatorException("The XML parser produced no document.");
}
// Get a list of coref mention wrapper objects; these contain all the info we need
// to construct the coref chains.
ArrayList<CorefMention> hits = new ArrayList<>();
traverse(resultTA, 0, hits, doc.getDocumentElement(), "");
// we have all the hits, organize them into referent chains.
HashMap<String, ArrayList<CorefMention>> chains = new HashMap<>();
for (CorefMention cm : hits) {
ArrayList<CorefMention> chain = chains.get(cm.id);
if (chain == null) {
chain = new ArrayList<CorefMention>();
chains.put(cm.id, chain);
}
chain.add(cm);
}
CoreferenceView corefView = new CoreferenceView(VIEW_NAME, VIEW_NAME, resultTA, 0.0);
for (Entry<String, ArrayList<CorefMention>> entry : chains.entrySet()) {
ArrayList<CorefMention> mentions = entry.getValue();
CorefMention head = mentions.get(0);
Constituent headconst = new Constituent(head.id, VIEW_NAME, resultTA, head.location.getFirst(), head.location.getSecond());
head.constituent = headconst;
// These are added by the addCorefEdges call. corefView.addConstituent(headconst);
if (debug)
System.out.println(head + " -> " + head.constituent.getSurfaceForm());
ArrayList<Constituent> referants = new ArrayList<>();
ArrayList<HashMap<String, String>> attributes = new ArrayList<>();
for (int i = 1; i < mentions.size(); i++) {
CorefMention cm = mentions.get(i);
Constituent constituent = new Constituent(cm.id, VIEW_NAME, resultTA, cm.location.getFirst(), cm.location.getSecond());
cm.constituent = constituent;
// These are added by the addCorefEdges call. corefView.addConstituent(constituent);
referants.add(constituent);
// set up the attributes for the relation, just type and subtype.
HashMap<String, String> attribute = new HashMap<>();
if (cm.type != null)
attribute.put("TYPE", cm.type);
if (cm.subtype != null)
attribute.put("SUBTYPE", cm.subtype);
if (cm.speaker != null)
attribute.put("SPEAKER", cm.speaker);
attributes.add(attribute);
logger.debug(" " + cm + " -> " + cm.constituent.getSurfaceForm());
}
corefView.addCorefEdges(headconst, referants, attributes);
}
// now for each constituent in our view, determine what type of mention it is.
// Here we will project the named entities from the ".name" file onto this annotation
// and create a named entity view
String[] coreftokens = resultTA.getTokens();
// align the two tokenizations: for each ner token, record the index of the
// coref token containing its start.
int[] tokenAlignment = new int[nerTokens.length];
for (int ci = 0, ni = 0; ci < coreftokens.length && ni < nerTokens.length; ) {
tokenAlignment[ni] = ci;
if (coreftokens[ci].equals(nerTokens[ni])) {
ni++;
ci++;
} else {
// our tokens didn't align. Some symbols are treated differently
// so where we see "&", "$", "-" and so on, there may be different
// tokenizations, so try appending successive ner tokens to see if they
// then match.
String ctok = coreftokens[ci];
String ntok = nerTokens[ni];
int niplus = 0;
// as long as the coref token contains the appended ner tokens, a match is still possible
while (true) {
if (ctok.equals(ntok)) {
break;
} else {
if (ctok.contains(ntok)) {
niplus++;
if (ni + niplus >= nerTokens.length)
// give up.
break;
ntok += nerTokens[ni + niplus];
} else if (ntok.contains("-")) {
// check for XML escapes.
if (this.compareWithXMLEscapesIgnoreGarbageIn(ctok, ntok)) {
break;
} else {
niplus++;
if (ni + niplus >= nerTokens.length)
// give up.
break;
ntok += nerTokens[ni + niplus];
}
} else {
StringBuffer stringbuffer = new StringBuffer("\nTokens were simply different in " + this.currentfile + " around " + ni + " and " + ci + "\n");
// clamp the diagnostic context window to the array bounds.
for (int cci = Math.max(0, ci - 10); cci < Math.min(coreftokens.length, ci + 20); cci++) {
stringbuffer.append(" " + coreftokens[cci]);
}
stringbuffer.append('\n');
for (int nni = Math.max(0, ni - 8); nni < Math.min(nerTokens.length, ni + 22); nni++) {
if (nni == ni)
stringbuffer.append(" *");
else
stringbuffer.append(" ");
stringbuffer.append(nerTokens[nni]);
}
stringbuffer.append('\n');
logger.error(stringbuffer.toString());
return null;
}
}
}
if (ctok.equals(ntok) || this.compareWithXMLEscapes(ctok, ntok)) {
// we matched
ni += niplus;
ni++;
ci++;
} else {
StringBuffer stringbuffer = new StringBuffer("\nNo alignment in " + this.currentfile + " around " + ni + " and " + ci);
// clamp the diagnostic context window to the array bounds.
for (int cci = Math.max(0, ci - 10); cci < Math.min(coreftokens.length, ci + 20); cci++) {
stringbuffer.append(" " + coreftokens[cci]);
}
stringbuffer.append('\n');
for (int nni = Math.max(0, ni - 8); nni < Math.min(nerTokens.length, ni + 22); nni++) {
if (nni == ni)
stringbuffer.append(" *");
else
stringbuffer.append(" ");
stringbuffer.append(nerTokens[nni]);
}
stringbuffer.append('\n');
logger.error(stringbuffer.toString());
return null;
}
}
}
// now transpose the NER view to the coref tokenization.
SpanLabelView tv = new SpanLabelView(OntonotesNamedEntityReader.VIEW_NAME, this.getClass().getCanonicalName(), resultTA, 1.0, true);
for (Constituent c : nerView.getConstituents()) {
int start = tokenAlignment[c.getStartSpan()];
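// a constituent's end span is exclusive (one past the last token), so it can
// index one past the end of the alignment array; the ternary below clamps it.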
int end = c.getEndSpan() >= tokenAlignment.length ? tokenAlignment[tokenAlignment.length - 1] : tokenAlignment[c.getEndSpan()];
try {
String lbl = c.getLabel();
tv.addSpanLabel(start, end, lbl, c.getConstituentScore());
} catch (IllegalArgumentException iae) {
logger.error("Overlapping labels are not supported.", iae);
}
}
if (resultTA != null) {
resultTA.addView(OntonotesCorefReader.VIEW_NAME, corefView);
resultTA.addView(OntonotesNamedEntityReader.VIEW_NAME, tv);
View posView = resultTA.getView(OntonotesTreebankReader.VIEW_NAME);
// now identify mention types.
for (Constituent c : corefView.getConstituents()) {
this.setMentionType(c, tv, posView);
}
processed++;
}
return resultTA;
}
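The trickiest step above is aligning the coref tokenization with the NER tokenization. The following stripped-down sketch isolates that greedy-merge idea; alignToNer is a hypothetical helper written for illustration and omits the XML-escape handling of the real reader.
import java.util.Arrays;

public class TokenAlignmentSketch {
    // For each ner token, record the index of the coref token it falls inside.
    // When the tokenizations disagree, greedily concatenate successive ner tokens
    // until they rebuild the current coref token. Returns null on failure.
    static int[] alignToNer(String[] corefTokens, String[] nerTokens) {
        int[] alignment = new int[nerTokens.length];
        int ci = 0, ni = 0;
        while (ci < corefTokens.length && ni < nerTokens.length) {
            alignment[ni] = ci;
            if (corefTokens[ci].equals(nerTokens[ni])) {
                ci++;
                ni++;
                continue;
            }
            StringBuilder merged = new StringBuilder(nerTokens[ni]);
            int extra = 0;
            while (!corefTokens[ci].contentEquals(merged)) {
                if (!corefTokens[ci].contains(merged) || ni + extra + 1 >= nerTokens.length)
                    return null; // tokens simply differ; the real reader logs context here.
                extra++;
                alignment[ni + extra] = ci; // merged sub-tokens map to the same coref token.
                merged.append(nerTokens[ni + extra]);
            }
            ni += extra + 1;
            ci++;
        }
        return alignment;
    }

    public static void main(String[] args) {
        // "AT&T" is one coref token but three ner tokens.
        String[] coref = { "AT&T", "rose" };
        String[] ner = { "AT", "&", "T", "rose" };
        System.out.println(Arrays.toString(alignToNer(coref, ner))); // prints [0, 0, 0, 1]
    }
}
The real loop additionally tolerates XML escapes (compareWithXMLEscapes) and prints a window of surrounding tokens before giving up.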
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class TACReaderTest method main.
public static void main(String[] args) {
TACReader tacReader = null;
try {
tacReader = new TACReader(CORPUS_ROOT, true);
} catch (Exception e) {
e.printStackTrace();
System.err.println("ERROR: " + NAME + ": couldn't instantiate TACReader: " + e.getMessage());
}
String wantedId = "ENG_NW_001278_20130318_F00012HTB.xml";
XmlTextAnnotation outputXmlTa = null;
do {
try {
outputXmlTa = tacReader.next();
} catch (IllegalStateException e) {
e.printStackTrace();
}
} while ((outputXmlTa == null || !outputXmlTa.getTextAnnotation().getId().equals(wantedId)) && tacReader.hasNext());
if (outputXmlTa == null || !outputXmlTa.getTextAnnotation().getId().equals(wantedId))
fail("ERROR: didn't find corpus entry with id '" + wantedId + "'.");
TextAnnotation output = outputXmlTa.getTextAnnotation();
StringTransformation xmlSt = outputXmlTa.getXmlSt();
String origXml = xmlSt.getOrigText();
List<XmlDocumentProcessor.SpanInfo> markup = outputXmlTa.getXmlMarkup();
Map<IntPair, XmlDocumentProcessor.SpanInfo> markupInfo = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
Map<IntPair, Set<String>> markupAttributes = XmlDocumentProcessor.compileAttributeValues(markup);
Set<String> docIdReported = markupAttributes.get(IDOFFSETS);
assert (docIdReported.contains(ID));
assertEquals(DATETIMEVAL, origXml.substring(DATETIMEOFFSETS.getFirst(), DATETIMEOFFSETS.getSecond()));
assertEquals(AUTHORVAL, origXml.substring(AUTHOROFFSETS.getFirst(), AUTHOROFFSETS.getSecond()));
}
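For reference, compileOffsetSpanMapping and compileAttributeValues just reshape the markup list into offset-keyed maps. Below is a hedged sketch of querying the first of those maps; the "author" attribute name and the class name are assumptions for illustration, and the import paths mirror those used above.
import java.util.List;
import java.util.Map;
import edu.illinois.cs.cogcomp.core.datastructures.IntPair;
import edu.illinois.cs.cogcomp.core.datastructures.Pair;
import edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor;
import edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo;

public class MarkupLookupSketch {
    // Print every markup span that carries an "author" attribute, with the attribute's value.
    static void printAuthors(List<SpanInfo> markup) {
        Map<IntPair, SpanInfo> byOffset = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
        for (Map.Entry<IntPair, SpanInfo> e : byOffset.entrySet()) {
            // attributes maps name -> (value, presumably the value's offsets in the original xml).
            Pair<String, IntPair> author = e.getValue().attributes.get("author");
            if (author != null)
                System.out.println(e.getValue().label + " at " + e.getKey() + ": " + author.getFirst());
        }
    }
}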
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class XmlTextAnnotationMakerOntonotesTest method testNestedNames.
/**
* edit offsets can get garbled when there are nested tags; verify they are
* computed correctly.
*/
@Test
public void testNestedNames() {
String text = "He spoke with Paul <ENAMEX TYPE=\"PERSON\"><ENAMEX TYPE=\"PERSON\" E_OFF=\"1\">Paula</ENAMEX> Zahn</ENAMEX> .";
// we keep everything.
XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
StatefulTokenizer st = new StatefulTokenizer();
TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
// read the file and create the annotation.
XmlTextAnnotation xta = xtam.createTextAnnotation(text, "OntoNotes 5.0", "test");
TextAnnotation ta = xta.getTextAnnotation();
List<XmlDocumentProcessor.SpanInfo> fudge = xta.getXmlMarkup();
StringTransformation xst = xta.getXmlSt();
for (XmlDocumentProcessor.SpanInfo si : fudge) {
int newTextStart = xst.computeModifiedOffsetFromOriginal(si.spanOffsets.getFirst());
int newTextEnd = xst.computeModifiedOffsetFromOriginal(si.spanOffsets.getSecond());
String neStr = ta.getText().substring(newTextStart, newTextEnd);
assertTrue(REF_ENTITIES.contains(neStr));
}
}
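REF_ENTITIES is a fixture defined elsewhere in the test class. Given the test text, a plausible definition (an assumption, not the actual fixture; java.util imports assumed) would hold the clean-text surface strings of both the nested and the enclosing ENAMEX spans:
// Assumed fixture: with the tags stripped, the clean text reads
// "He spoke with Paul Paula Zahn .", so the inner ENAMEX should map to
// "Paula" and the outer one to "Paula Zahn".
private static final Set<String> REF_ENTITIES =
        new HashSet<>(Arrays.asList("Paula", "Paula Zahn"));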