use of edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor in project cogcomp-nlp by CogComp.
the class XmlDocumentProcessorTest method testXmlDocumentProcessor.
/**
 * Runs {@code XmlDocumentProcessor.processXml} over {@code ORIG_TEXT} and checks that the requested
 * attributes are retained, that the cleaned text matches {@code CLEAN_TEXT}, and that offsets in the
 * cleaned text can be mapped back to the corresponding substrings of the original text.
 */
@Test
public void testXmlDocumentProcessor() {
    /*
     * Excerpt of the kind of markup being processed:
     *
     * <doc id="ENG_DF_001241_20150407_F0000007T">
     * <headline>
     * cuba
     * </headline>
     * <post id="p1" author="chatmasta" datetime="2015-04-07T14:42:00">
     */

    // attributes to retain, keyed by the tag that carries them
    Map<String, Set<String>> tagsWithAtts = new HashMap<>();
    Set<String> attributeNames = new HashSet<>();
    attributeNames.add("author");
    attributeNames.add("id");
    attributeNames.add("datetime");
    tagsWithAtts.put("post", attributeNames);
    attributeNames = new HashSet<>();
    attributeNames.add("id");
    tagsWithAtts.put("doc", attributeNames);

    Set<String> deletableSpanTags = new HashSet<>();
    deletableSpanTags.add("quote");
    deletableSpanTags.add("distraction");

    Set<String> tagsToIgnore = new HashSet<>();
    tagsToIgnore.add("img");
    tagsToIgnore.add("snip");

    boolean throwExceptionOnXmlTagMiss = true;
    XmlDocumentProcessor proc = new XmlDocumentProcessor(deletableSpanTags, tagsWithAtts, tagsToIgnore, throwExceptionOnXmlTagMiss);
    Pair<StringTransformation, List<XmlDocumentProcessor.SpanInfo>> nt = proc.processXml(ORIG_TEXT);

    // check that we retained the right attributes, cleaned up the text, generated a sensible cleaned text, and can
    // recover the offsets of strings in the original text.
    StringTransformation st = nt.getFirst();
    List<XmlDocumentProcessor.SpanInfo> retainedTagInfo = nt.getSecond();
    String cleanText = st.getTransformedText();
    assertEquals(ORIG_TEXT, st.getOrigText());
    assertEquals(CLEAN_TEXT, cleanText);

    Map<IntPair, XmlDocumentProcessor.SpanInfo> offsetToSpans = XmlDocumentProcessor.compileOffsetSpanMapping(retainedTagInfo);

    // the post span must carry the author attribute, with its value and its offsets in the original text
    assertTrue(offsetToSpans.containsKey(POST_OFFSETS));
    XmlDocumentProcessor.SpanInfo spanInfo = offsetToSpans.get(POST_OFFSETS);
    assertTrue(spanInfo.attributes.containsKey(AUTHOR));
    assertEquals(NAME, spanInfo.attributes.get(AUTHOR).getFirst());
    assertEquals(AUTHOR_OFFSETS, spanInfo.attributes.get(AUTHOR).getSecond());
    String origAuthStr = st.getOrigText().substring(AUTHOR_OFFSETS.getFirst(), AUTHOR_OFFSETS.getSecond());
    assertEquals(NAME, origAuthStr);

    // the distraction span is still reported, labelled, and locatable in the original text
    assertTrue(offsetToSpans.containsKey(DISTR_OFFSETS));
    spanInfo = offsetToSpans.get(DISTR_OFFSETS);
    // assertEquals (rather than assertTrue(equals)) reports the actual label on failure and tolerates null
    assertEquals("distraction", spanInfo.label);
    assertEquals(DISTR_SUBSTR, ORIG_TEXT.substring(DISTR_OFFSETS.getFirst(), DISTR_OFFSETS.getSecond()));

    // the span at IQ_OFFSETS was deleted: both of its ends map to the same position in the cleaned text
    assertTrue(offsetToSpans.containsKey(IQ_OFFSETS));
    int iqStart = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getFirst());
    int iqEnd = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getSecond());
    assertEquals("", cleanText.substring(iqStart, iqEnd));
    assertEquals(ORIG_TEXT.indexOf("Whassup"), IQ_OFFSETS.getFirst());

    // a string found in the cleaned text maps back to the same string in the original text
    int doStart = cleanText.indexOf("do?");
    // fail with a clear message instead of passing -1 on to the offset mapping
    assertTrue("cleaned text should contain \"do?\"", doStart >= 0);
    int doEnd = doStart + 3;
    IntPair origDoOffsets = st.getOriginalOffsets(doStart, doEnd);
    assertEquals("do?", ORIG_TEXT.substring(origDoOffsets.getFirst(), origDoOffsets.getSecond()));
}
use of edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor in project cogcomp-nlp by CogComp.
the class OntonotesNerReaderExample method main.
/**
 * Example driver: reads one OntoNotes ".name" file, builds a {@link TextAnnotation} from its XML
 * markup, and turns every "enamex" span into a constituent of a {@code ViewNames.NER_ONTONOTES}
 * span-label view. The annotation, each named-entity string, and the resulting view are printed.
 *
 * @param args unused
 * @throws IOException if the input file cannot be read
 * @throws ClassNotFoundException declared for interface compatibility; not thrown by the visible code
 * @throws SQLException declared for interface compatibility; not thrown by the visible code
 */
public static void main(String[] args) throws ClassNotFoundException, SQLException, IOException {
    String inFile = "/shared/corpora/corporaWeb/multi-mode/multi/ontonotes-release-5.0/data/files/data/english/annotations/nw/wsj/00/wsj_0061.name";
    int counter = 0;
    long start = System.currentTimeMillis();

    // Left empty. NOTE(review): the other callers of this constructor name its first argument
    // "deletableSpanTags" -- confirm against XmlDocumentProcessor.
    Set<String> deletableSpanTags = new HashSet<>();

    // define the attributes we want to keep for the tags we have.
    Map<String, Set<String>> tagsWithAtts = new HashMap<>();
    {
        Set<String> docAttrs = new HashSet<>();
        docAttrs.add("docno");
        tagsWithAtts.put("doc", docAttrs);
    }
    {
        Set<String> nameAttrs = new HashSet<>();
        nameAttrs.add("type");
        tagsWithAtts.put("enamex", nameAttrs);
    }

    boolean throwExceptionOnXmlParseFail = true;
    // we keep everything.
    Set<String> dropTags = new HashSet<>();
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(deletableSpanTags, tagsWithAtts, dropTags, throwExceptionOnXmlParseFail);

    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);

    String document = LineIO.slurp(inFile);
    XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> xmlMarkup = xta.getXmlMarkup();
    System.out.println(ta + "\n");

    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    String cleanText = ta.getText();
    for (XmlDocumentProcessor.SpanInfo si : xmlMarkup) {
        if (!"enamex".equalsIgnoreCase(si.label)) {
            continue;
        }
        IntPair charOffsets = si.spanOffsets;
        String neLabel = si.attributes.get("type").getFirst();

        // span offsets refer to the original xml; map them into the cleaned text
        int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
        int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
        System.err.println("ne string: '" + cleanText.substring(cleanTextCharStart, cleanTextCharEnd) + "'");

        int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
        // constituent token indexing uses one-past-the-end
        Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
        nerView.addConstituent(neCon);
    }

    // Exactly one document was processed: count and report it once, after all of its spans have
    // been handled, rather than once per markup span.
    counter++;
    System.out.println("Read " + counter + " documents in " + (System.currentTimeMillis() - start));
    System.out.println(nerView.toString());
}
use of edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor in project cogcomp-nlp by CogComp.
the class EREDocumentReader method buildXmlTextAnnotationMaker.
/**
 * Creates an {@link XmlTextAnnotationMaker} set up for ERE-annotated xml. The supplied
 * {@link TextAnnotationBuilder} must already be configured for the target language.
 *
 * @param textAnnotationBuilder a TextAnnotationBuilder whose tokenizer suits the target language.
 * @param ereCorpus the ERE release being read; for {@code EreCorpus.ENR3} the QUOTE tag is added to
 *        the deletable span tags.
 * @param throwExceptionOnXmlParseFail if 'true', the XmlTextAnnotationMaker will throw an exception
 *        if any errors are found in the source xml.
 * @return an XmlTextAnnotationMaker configured to parse an ERE corpus.
 */
public static XmlTextAnnotationMaker buildXmlTextAnnotationMaker(TextAnnotationBuilder textAnnotationBuilder, EreCorpus ereCorpus, boolean throwExceptionOnXmlParseFail) {
    Set<String> spanTagsToDelete = new HashSet<>();
    // for release 3 only, quoted blocks are NOT annotated
    if (EreCorpus.ENR3.equals(ereCorpus)) {
        spanTagsToDelete.add(QUOTE);
    }

    // one attribute set per tag whose attributes are passed to the processor
    Set<String> postAttributes = new HashSet<>();
    postAttributes.add(AUTHOR);
    postAttributes.add(ID);
    postAttributes.add(DATETIME);

    Set<String> docAttributes = new HashSet<>();
    docAttributes.add(ID);

    Set<String> quoteAttributes = new HashSet<>();
    quoteAttributes.add(ORIG_AUTHOR);

    Map<String, Set<String>> attributesByTag = new HashMap<>();
    attributesByTag.put(POST, postAttributes);
    attributesByTag.put(DOC, docAttributes);
    attributesByTag.put(QUOTE, quoteAttributes);

    // implies "delete spans enclosed by these tags"
    Set<String> ignoredTags = new HashSet<>();
    ignoredTags.add(IMG);
    ignoredTags.add(SNIP);
    ignoredTags.add(SQUISH);

    return new XmlTextAnnotationMaker(
            textAnnotationBuilder,
            new XmlDocumentProcessor(spanTagsToDelete, attributesByTag, ignoredTags, throwExceptionOnXmlParseFail));
}
Aggregations