use of edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo in project cogcomp-nlp by CogComp.
the class ConvertOntonotesToColumn method getNameTextAnnotation.
/**
 * Read the file indicated by the argument, which gives the file name and path.
 * @param file the file to read.
 * @return the XmlTextAnnotation containing the text annotation and the XML markup offset data.
 * @throws IOException if the file cannot be read.
 */
private static XmlTextAnnotation getNameTextAnnotation(File file) throws IOException {
    String document = LineIO.slurp(file.getCanonicalPath());
    // we keep everything.
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
    // read the file and create the annotation.
    XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<SpanInfo> fudge = xta.getXmlMarkup();
    // create the named entity view
    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    for (SpanInfo si : fudge) {
        if ("enamex".equalsIgnoreCase(si.label)) {
            IntPair charOffsets = si.spanOffsets;
            String neLabel = si.attributes.get("type").getFirst();
            int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
            int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
            int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
            // StringTransformation returns a one-past-the-end offset; TextAnnotation maps an at-the-end offset
            int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
            // constituent token indexing uses one-past-the-end
            Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
            nerView.addConstituent(neCon);
        }
    }
    ta.addView(ViewNames.NER_ONTONOTES, nerView);
    return xta;
}
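The offset bookkeeping above is the part that is easiest to get wrong. The following is a minimal sketch, not part of cogcomp-nlp, that isolates the same mapping from original-XML character offsets to cleaned-text token spans; the helper name enamexTokenSpan is hypothetical, and it assumes an XmlTextAnnotation and SpanInfo produced by an XmlTextAnnotationMaker exactly as in getNameTextAnnotation.
// Hypothetical helper (not in cogcomp-nlp): isolates the offset-mapping pattern used above.
// Assumes the same imports as getNameTextAnnotation (SpanInfo, IntPair, TextAnnotation, XmlTextAnnotation).
private static IntPair enamexTokenSpan(XmlTextAnnotation xta, SpanInfo si) {
    TextAnnotation ta = xta.getTextAnnotation();
    // character offsets of the span in the original, markup-bearing source text
    IntPair charOffsets = si.spanOffsets;
    // map those offsets onto the cleaned text the tokenizer actually saw
    int cleanStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
    int cleanEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
    // cleanEnd is one past the end; getTokenIdFromCharacterOffset expects an offset inside the last token
    int tokStart = ta.getTokenIdFromCharacterOffset(cleanStart);
    int tokEnd = ta.getTokenIdFromCharacterOffset(cleanEnd - 1);
    // return a [start, one-past-end) token span, the convention Constituent uses
    return new IntPair(tokStart, tokEnd + 1);
}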
use of edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo in project cogcomp-nlp by CogComp.
the class OntonotesNamedEntityReader method nextAnnotation.
/**
 * Parse the Penn Treebank parse file, producing an annotation covering the entire file.
 * @param data the data read from the file.
 * @param docid the id representing the document name.
 * @return the text annotation.
 * @throws AnnotatorException if the annotation cannot be produced.
 */
private XmlTextAnnotation nextAnnotation(String data, String docid) throws AnnotatorException {
    // we keep everything.
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
    // read the file and create the annotation.
    XmlTextAnnotation xta = xtam.createTextAnnotation(data, "OntoNotes 5.0", docid);
    TextAnnotation ta = xta.getTextAnnotation();
    List<SpanInfo> fudge = xta.getXmlMarkup();
    // create the named entity view
    View nerView = new SpanLabelView(VIEW_NAME, ta);
    for (SpanInfo si : fudge) {
        if ("enamex".equalsIgnoreCase(si.label)) {
            IntPair charOffsets = si.spanOffsets;
            Pair<String, IntPair> neLabelPair = si.attributes.get("type");
            String neLabel = neLabelPair.getFirst();
            int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
            int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
            int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
            // StringTransformation returns a one-past-the-end offset; TextAnnotation maps an at-the-end offset
            int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
            if (cleanTextNeTokStart == -1 || cleanTextNeTokEnd == -1) {
                for (Constituent c : nerView.getConstituents()) {
                    System.err.println(c);
                }
                System.err.println("Something wonky in \"" + docid + "\", at " + charOffsets + ", " + cleanTextCharStart
                        + " - " + cleanTextCharEnd + " = " + ta.text.substring(cleanTextCharStart, cleanTextCharEnd));
            } else {
                // tally the number of entities seen for this label
                if (entityCounts.containsKey(neLabel)) {
                    entityCounts.put(neLabel, (entityCounts.get(neLabel) + 1));
                } else {
                    entityCounts.put(neLabel, 1);
                }
                // constituent token indexing uses one-past-the-end
                Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
                nerView.addConstituent(neCon);
            }
        }
    }
    ta.addView(VIEW_NAME, nerView);
    return xta;
}
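A minimal sketch of consuming the view this method builds, assuming xta stands for the XmlTextAnnotation it returns and VIEW_NAME is the same view name the reader populated; the snippet is illustrative and not taken from the reader class itself.
// Hypothetical consumer of the named-entity view produced by nextAnnotation.
TextAnnotation ta = xta.getTextAnnotation();
if (ta.hasView(VIEW_NAME)) {
    for (Constituent c : ta.getView(VIEW_NAME).getConstituents()) {
        // label, [start, end) token span, and covered text of each named entity
        System.out.println(c.getLabel() + "\t[" + c.getStartSpan() + ", " + c.getEndSpan() + ")\t" + c.getSurfaceForm());
    }
}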