use of edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker in project cogcomp-nlp by CogComp.
the class OntonotesNerReaderExample method main.
/**
 * Example driver: reads one OntoNotes 5.0 ".name" file (hard-coded path), converts its xml markup
 * into an {@link XmlTextAnnotation}, and builds a {@link SpanLabelView} named
 * {@link ViewNames#NER_ONTONOTES} holding one {@link Constituent} per "enamex" span. The resulting
 * view is printed to stdout; it is not attached to the TextAnnotation by this method.
 *
 * @param args unused.
 * @throws IOException if the input file cannot be read.
 */
public static void main(String[] args) throws ClassNotFoundException, SQLException, IOException {
    String inFile = "/shared/corpora/corporaWeb/multi-mode/multi/ontonotes-release-5.0/data/files/data/english/annotations/nw/wsj/00/wsj_0061.name";
    int counter = 0;
    long start = System.currentTimeMillis();

    // define all tags with text (none for this corpus).
    Set<String> tagsWithText = new HashSet<>();

    // define the attributes we want to keep for the tags we have.
    Map<String, Set<String>> tagsWithAtts = new HashMap<>();
    Set<String> docAttrs = new HashSet<>();
    docAttrs.add("docno");
    tagsWithAtts.put("doc", docAttrs);
    Set<String> nameAttrs = new HashSet<>();
    nameAttrs.add("type");
    tagsWithAtts.put("enamex", nameAttrs);

    boolean throwExceptionOnXmlParseFail = true;
    // we keep everything.
    Set<String> dropTags = new HashSet<>();
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, throwExceptionOnXmlParseFail);
    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);

    String document = LineIO.slurp(inFile);
    XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> fudge = xta.getXmlMarkup();
    System.out.println(ta + "\n");

    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    String cleanText = ta.getText();
    for (XmlDocumentProcessor.SpanInfo si : fudge) {
        if (!"enamex".equalsIgnoreCase(si.label)) {
            continue;
        }
        // an enamex tag without a 'type' attribute cannot be turned into a labeled constituent.
        if (si.attributes == null || si.attributes.get("type") == null) {
            System.err.println("skipping enamex span without 'type' attribute at " + si.spanOffsets);
            continue;
        }
        IntPair charOffsets = si.spanOffsets;
        String neLabel = si.attributes.get("type").getFirst();
        // map offsets in the original xml text to offsets in the cleaned text.
        int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
        int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
        // an empty span has no last character to look up below.
        if (cleanTextCharEnd <= cleanTextCharStart) {
            System.err.println("skipping empty enamex span at " + charOffsets);
            continue;
        }
        System.err.println("ne string: '" + cleanText.substring(cleanTextCharStart, cleanTextCharEnd) + "'");
        int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
        // constituent token indexing uses one-past-the-end
        Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
        nerView.addConstituent(neCon);
    }

    // report once per document, after all spans have been processed (not once per xml span).
    counter++;
    System.out.println("Read " + counter + " documents in " + (System.currentTimeMillis() - start));
    System.out.println(nerView.toString());
}
use of edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker in project cogcomp-nlp by CogComp.
the class EREDocumentReader method buildXmlTextAnnotationMaker.
/**
 * Creates an {@link XmlTextAnnotationMaker} set up for ERE-annotated source xml. The supplied
 * {@link TextAnnotationBuilder} must already be configured for the target language.
 *
 * @param textAnnotationBuilder a TextAnnotationBuilder whose tokenizer suits the target language.
 * @param ereCorpus the ERE release being read; when it equals {@code EreCorpus.ENR3} the QUOTE tag
 *        is added to the deletable span tags.
 * @param throwExceptionOnXmlParseFail passed through to the {@link XmlDocumentProcessor}; if 'true',
 *        the XmlTextAnnotationMaker will throw an exception if any errors are found in the source xml.
 * @return an XmlTextAnnotationMaker configured to parse an ERE corpus.
 */
public static XmlTextAnnotationMaker buildXmlTextAnnotationMaker(TextAnnotationBuilder textAnnotationBuilder, EreCorpus ereCorpus, boolean throwExceptionOnXmlParseFail) {
    // attributes to retain, one set per tag
    Set<String> postAttributes = new HashSet<>();
    postAttributes.add(AUTHOR);
    postAttributes.add(ID);
    postAttributes.add(DATETIME);

    Set<String> docAttributes = new HashSet<>();
    docAttributes.add(ID);

    Set<String> quoteAttributes = new HashSet<>();
    quoteAttributes.add(ORIG_AUTHOR);

    Map<String, Set<String>> retainedAttributesByTag = new HashMap<>();
    retainedAttributesByTag.put(POST, postAttributes);
    retainedAttributesByTag.put(DOC, docAttributes);
    retainedAttributesByTag.put(QUOTE, quoteAttributes);

    // implies "delete spans enclosed by these tags"
    Set<String> ignoredTags = new HashSet<>();
    ignoredTags.add(IMG);
    ignoredTags.add(SNIP);
    ignoredTags.add(SQUISH);

    Set<String> spanTagsToDelete = new HashSet<>();
    // for release 3 only, quoted blocks are NOT annotated
    if (EreCorpus.ENR3.equals(ereCorpus)) {
        spanTagsToDelete.add(QUOTE);
    }

    return new XmlTextAnnotationMaker(
            textAnnotationBuilder,
            new XmlDocumentProcessor(spanTagsToDelete, retainedAttributesByTag, ignoredTags, throwExceptionOnXmlParseFail));
}
use of edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker in project cogcomp-nlp by CogComp.
the class XmlTextAnnotationMakerTest method main.
// public void testNerReader() {
/**
 * Manual driver rather than a unit test, because it requires the ERE corpus to be available.
 * Builds an ERE {@link XmlTextAnnotationMaker} for the ENR3 release and runs
 * {@code testWithFile} on XML_FILE2 and then XML_FILE. If the maker cannot be built, the stack
 * trace is printed and the JVM exits with status -1.
 *
 * @param args unused.
 */
public static void main(String[] args) {
    final boolean failOnXmlTagMiss = true;
    final String corpusName = EREDocumentReader.EreCorpus.ENR3.name();

    final XmlTextAnnotationMaker ereMaker;
    try {
        ereMaker = EREDocumentReader.buildEreXmlTextAnnotationMaker(corpusName, failOnXmlTagMiss);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
        // not reached at runtime; present only so the compiler sees ereMaker as definitely assigned below
        return;
    }

    testWithFile(ereMaker, XML_FILE2);
    testWithFile(ereMaker, XML_FILE);
}
Aggregations