use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
the class ERENerReader method getTokenOffsets.
/**
 * Finds the token offsets in the TextAnnotation that correspond to the source character offsets of the
 * given mention.
 *
 * <p>If the source substring begins with a space or a newline, both source offsets are shifted right by one
 * before they are mapped into the transformed (cleaned-up) text.
 *
 * <p>NOTE: if the token lookup throws an {@link IllegalArgumentException}, the error is logged and the JVM
 * is terminated via {@code System.exit(1)}. Any other {@link RuntimeException} is counted in
 * {@code numOffsetErrors}, logged together with the neighbouring tokens, and results in {@code null}.
 *
 * @param origStartOffset start character offset from xml markup
 * @param origEndOffset end character offset from xml markup
 * @param mentionForm mention form from xml markup (used only in log messages)
 * @param xmlTa XmlTextAnnotation object storing original xml, transformed text, extracted xml markup,
 *        and corresponding TextAnnotation
 * @return IntPair(-1, -1) if both offsets map to the same position in the transformed text (the span was
 *         probably deleted when the cleaned-up text was created, e.g. a name mention in xml metadata such
 *         as a post author); null if no mapped tokens could be found (possibly, indexes refer to the
 *         middle of a single token because the tokenizer can't segment some strings); or the
 *         corresponding token indexes
 */
protected IntPair getTokenOffsets(int origStartOffset, int origEndOffset, String mentionForm, XmlTextAnnotation xmlTa) {
    StringTransformation st = xmlTa.getXmlSt();
    String origStr = st.getOrigText().substring(origStartOffset, origEndOffset);
    if (origStr.startsWith(" ") || origStr.startsWith("\n")) {
        origStartOffset += 1;
        origEndOffset += 1;
    }
    int adjStart = st.computeModifiedOffsetFromOriginal(origStartOffset);
    int adjEnd = st.computeModifiedOffsetFromOriginal(origEndOffset);
    if (adjStart == adjEnd) {
        // probably, maps to span deleted when creating cleaned-up text
        return new IntPair(-1, -1);
    }
    IntPair returnOffset = null;
    int si = 0, ei = 0;
    // Set once findStartIndex() has returned normally. Inferring this from "si == 0" is wrong, because 0 is
    // also a legitimate start index (a mention that begins at the first token).
    boolean startResolved = false;
    TextAnnotation ta = xmlTa.getTextAnnotation();
    String rawText = ta.getText();
    String rawStr = rawText.substring(adjStart, adjEnd);
    logger.debug("source xml str: '" + origStr + "' (" + origStartOffset + "," + origEndOffset + ")");
    try {
        si = findStartIndex(adjStart);
        startResolved = true;
        ei = findEndIndex(adjEnd, rawText);
        returnOffset = new IntPair(si, ei);
    } catch (IllegalArgumentException iae) {
        logger.error("could not find token offsets for mention form '" + mentionForm + "', start, end orig: (" + origStartOffset + "," + origEndOffset + "); adjusted: (" + adjStart + "," + adjEnd + ").");
        // Deliberately fatal in the original code; kept so existing callers see the same behavior.
        System.exit(1);
    } catch (RuntimeException re) {
        numOffsetErrors++;
        logger.error("Error finding text for '{}' at offsets {}:", rawStr, (adjStart + "-" + adjEnd));
        // Best-effort lookups, used only to print the tokens surrounding the problem span.
        si = findStartIndexIgnoreError(adjStart);
        ei = findEndIndexIgnoreError(adjEnd);
        if (!startResolved) {
            logger.error("Could not find start token : text='" + mentionForm + "' at adjusted offsets " + adjStart + " to " + adjEnd);
        } else {
            logger.error("Could not find end token : text='" + mentionForm + "' at adjusted offsets " + adjStart + " to " + adjEnd);
        }
        // Show a window of up to two tokens either side, marking the start token with a leading ':' and
        // the end token with a trailing ':'.
        int max = ta.getTokens().length;
        int start = si >= 2 ? si - 2 : 0;
        int end = (ei + 2) < max ? ei + 2 : max;
        StringBuilder bldr = new StringBuilder();
        for (int jj = start; jj < end; jj++) {
            bldr.append(" ");
            if (jj == si) {
                bldr.append(":");
            }
            bldr.append(ta.getToken(jj));
            if (jj == ei) {
                bldr.append(":");
            }
            bldr.append(" ");
        }
        bldr.append("\n");
        logger.error(bldr.toString());
    }
    return returnOffset;
}
use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
the class EREReaderTest method runTest.
/**
 * Reads the first document of the given ERE corpus with an EREMentionRelationReader, checks that mapping
 * the TextAnnotation back to the source text preserves token and sentence counts, and prints every
 * mention constituent next to the source text recovered from its mapped offsets.
 *
 * @param ereCorpus the ERE release being read
 * @param corpusRoot root directory of the corpus
 * @return the XmlTextAnnotation produced for the first document
 * @throws IllegalStateException if the reader cannot be instantiated (the original cause is chained)
 */
private static XmlTextAnnotation runTest(EreCorpus ereCorpus, String corpusRoot) {
    boolean addNominalMentions = true;
    boolean throwExceptionOnXmlTagMismatch = true;
    ERENerReader nerReader;
    try {
        nerReader = new EREMentionRelationReader(ereCorpus, corpusRoot, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        // Fail fast with the real cause; continuing would only produce a NullPointerException on the
        // first use of the reader below.
        throw new IllegalStateException("ERROR: " + NAME + ": couldn't instantiate ERENerReader for ERE release " + ereCorpus.name() + ": " + e.getMessage(), e);
    }
    XmlTextAnnotation outputXmlTa = nerReader.next();
    TextAnnotation output = outputXmlTa.getTextAnnotation();
    // Test TextAnnotationUtilities.mapTransformedTextAnnotationToSource()
    TextAnnotation mappedTa = TextAnnotationUtilities.mapTransformedTextAnnotationToSource(output, outputXmlTa.getXmlSt());
    assertEquals(mappedTa.getView(ViewNames.TOKENS).getNumberOfConstituents(), output.getView(ViewNames.TOKENS).getNumberOfConstituents());
    assertEquals(mappedTa.getView(ViewNames.SENTENCE).getNumberOfConstituents(), output.getView(ViewNames.SENTENCE).getNumberOfConstituents());
    // NOTE(review): the checks below use the Java `assert` keyword, so they only run when the JVM has
    // assertions enabled (-ea) -- confirm the test runner enables them.
    View nerEre = null;
    if (addNominalMentions) {
        assert (output.hasView(ViewNames.MENTION_ERE));
        nerEre = output.getView(ViewNames.MENTION_ERE);
    } else {
        assert (output.hasView(ViewNames.NER_ERE));
        nerEre = output.getView(ViewNames.NER_ERE);
    }
    assert (nerEre.getConstituents().size() > 0);
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXmlStr = xmlSt.getOrigText();
    System.out.println("ERENerReader found " + nerEre.getConstituents().size() + " NER constituents: ");
    for (Constituent c : nerEre.getConstituents()) {
        System.out.println(TextAnnotationPrintHelper.printConstituent(c));
        int start = c.getStartCharOffset();
        int end = c.getEndCharOffset();
        // Map the constituent's offsets in the clean text back to offsets in the original xml.
        IntPair origOffsets = xmlSt.getOriginalOffsets(start, end);
        String origStr = origXmlStr.substring(origOffsets.getFirst(), origOffsets.getSecond());
        System.out.println("Constituent (clean) text: '" + c.getSurfaceForm() + "'");
        System.out.println("Original text: '" + origStr + "'\n---------\n");
    }
    System.out.println("Report: " + nerReader.generateReport());
    return outputXmlTa;
}
use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
the class XmlTextAnnotationMakerTest method testWithFile.
/**
 * Builds an XmlTextAnnotation from the contents of {@code xmlFile} and prints diagnostics: the first
 * sentence, the third token with its counterpart in the original text, every sentence with its source
 * span, and the attributes recorded for each retained markup span. A mismatch between the third token and
 * its source substring is reported on stderr only.
 *
 * @param maker the annotation maker under test
 * @param xmlFile path of the xml file to read; the JVM exits with status -1 if it is not found
 */
private static void testWithFile(XmlTextAnnotationMaker maker, String xmlFile) {
    String xmlContents = null;
    try {
        xmlContents = LineIO.slurp(xmlFile);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        System.exit(-1);
    }
    XmlTextAnnotation xmlTa = maker.createTextAnnotation(xmlContents, "test", "test");
    TextAnnotation ta = xmlTa.getTextAnnotation();
    System.out.println(ta.getSentence(0).getText());

    // Third token (token span 2-3): compare its clean-text form with the mapped source substring.
    Constituent thirdToken = ta.getView(ViewNames.TOKENS).getConstituentsCoveringSpan(2, 3).get(0);
    int tokenStart = thirdToken.getStartCharOffset();
    int tokenEnd = thirdToken.getEndCharOffset();
    String tokenForm = thirdToken.getSurfaceForm();
    StringTransformation st = xmlTa.getXmlSt();
    IntPair tokenSourceSpan = st.getOriginalOffsets(tokenStart, tokenEnd);
    String sourceForm = st.getOrigText().substring(tokenSourceSpan.getFirst(), tokenSourceSpan.getSecond());
    System.out.println("Third word: " + tokenForm);
    String transformedForm = st.getTransformedText().substring(tokenStart, tokenEnd);
    System.out.println("corresponding substring from transformed text: " + transformedForm);
    System.out.println("original text substring using mapped offsets: " + sourceForm);
    if (!transformedForm.equals(sourceForm)) {
        System.err.println("ERROR: test failed: word '" + transformedForm + "' not identical to original word '" + sourceForm + "'. ");
    }

    // Each sentence, followed by the source text found at its mapped offsets.
    View sentenceView = xmlTa.getTextAnnotation().getView(ViewNames.SENTENCE);
    for (Constituent sentence : sentenceView.getConstituents()) {
        int cleanStart = sentence.getStartCharOffset();
        int cleanEnd = sentence.getEndCharOffset();
        String cleanForm = sentence.getSurfaceForm();
        IntPair sourceSpan = st.getOriginalOffsets(cleanStart, cleanEnd);
        System.out.println("------\nclean: " + cleanForm + ", (" + cleanStart + ", " + cleanEnd + ")");
        String sourceText = st.getOrigText().substring(sourceSpan.getFirst(), sourceSpan.getSecond());
        System.out.println("------\nsource: " + sourceText + ", (" + sourceSpan.getFirst() + ", " + sourceSpan.getSecond() + ")\n");
    }

    // Attribute names and values for each retained markup span, keyed by the span's offsets.
    List<XmlDocumentProcessor.SpanInfo> markup = xmlTa.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> spansByOffsets = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    for (Map.Entry<IntPair, XmlDocumentProcessor.SpanInfo> spanEntry : spansByOffsets.entrySet()) {
        IntPair offsets = spanEntry.getKey();
        System.out.print(offsets.getFirst() + "-" + offsets.getSecond() + ": ");
        Map<String, Pair<String, IntPair>> attributes = spanEntry.getValue().attributes;
        for (Map.Entry<String, Pair<String, IntPair>> attribute : attributes.entrySet()) {
            System.out.println(attribute.getKey() + ": " + attribute.getValue().getFirst());
        }
        System.out.println();
    }
}
use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
the class XmlDocumentProcessorTest method testXmlDocumentProcessor.
/**
 * Runs XmlDocumentProcessor over ORIG_TEXT and checks the cleaned text, the retained tag attributes, and
 * the mapping of offsets between the original and the cleaned text.
 */
@Test
public void testXmlDocumentProcessor() {
    /*
     * Sample of the kind of markup being processed:
     *
     * <doc id="ENG_DF_001241_20150407_F0000007T">
     * <headline>
     * cuba
     * </headline>
     * <post id="p1" author="chatmasta" datetime="2015-04-07T14:42:00">
     */

    // Tags handed to the processor as "deletable span" tags.
    Set<String> deletableTags = new HashSet<>();
    deletableTags.add("quote");
    deletableTags.add("distraction");

    // Tags handed to the processor as tags to ignore.
    Set<String> ignoredTags = new HashSet<>();
    ignoredTags.add("img");
    ignoredTags.add("snip");

    // Attribute names of interest, per tag.
    Set<String> postAttributes = new HashSet<>();
    postAttributes.add("author");
    postAttributes.add("id");
    postAttributes.add("datetime");
    Set<String> docAttributes = new HashSet<>();
    docAttributes.add("id");
    Map<String, Set<String>> attributesByTag = new HashMap<>();
    attributesByTag.put("post", postAttributes);
    attributesByTag.put("doc", docAttributes);

    boolean failOnTagMismatch = true;
    XmlDocumentProcessor processor = new XmlDocumentProcessor(deletableTags, attributesByTag, ignoredTags, failOnTagMismatch);
    Pair<StringTransformation, List<XmlDocumentProcessor.SpanInfo>> result = processor.processXml(ORIG_TEXT);

    // check that we retained the right attributes, generated a sensible cleaned text, and can recover the
    // offsets of strings in the original text.
    StringTransformation st = result.getFirst();
    List<XmlDocumentProcessor.SpanInfo> retainedSpans = result.getSecond();
    String cleanText = st.getTransformedText();
    assertEquals(ORIG_TEXT, st.getOrigText());
    assertEquals(CLEAN_TEXT, cleanText);

    Map<IntPair, XmlDocumentProcessor.SpanInfo> spansByOffsets = XmlDocumentProcessor.compileOffsetSpanMapping(retainedSpans);

    // the post span keeps its author attribute, with offsets pointing at the name in the original text
    assertTrue(spansByOffsets.containsKey(POST_OFFSETS));
    XmlDocumentProcessor.SpanInfo span = spansByOffsets.get(POST_OFFSETS);
    assertTrue(span.attributes.containsKey(AUTHOR));
    assertEquals(NAME, span.attributes.get(AUTHOR).getFirst());
    assertEquals(AUTHOR_OFFSETS, span.attributes.get(AUTHOR).getSecond());
    String authorInSource = st.getOrigText().substring(AUTHOR_OFFSETS.getFirst(), AUTHOR_OFFSETS.getSecond());
    assertEquals(NAME, authorInSource);

    // the distraction span is recorded with its label and original offsets
    assertTrue(spansByOffsets.containsKey(DISTR_OFFSETS));
    span = spansByOffsets.get(DISTR_OFFSETS);
    assertTrue(span.label.equals("distraction"));
    assertEquals(DISTR_SUBSTR, ORIG_TEXT.substring(DISTR_OFFSETS.getFirst(), DISTR_OFFSETS.getSecond()));

    // the span at IQ_OFFSETS was deleted, so it maps to an empty stretch of the clean text
    assertTrue(spansByOffsets.containsKey(IQ_OFFSETS));
    int iqCleanStart = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getFirst());
    int iqCleanEnd = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getSecond());
    assertEquals("", cleanText.substring(iqCleanStart, iqCleanEnd));
    assertEquals(ORIG_TEXT.indexOf("Whassup"), IQ_OFFSETS.getFirst());

    // a string found in the clean text maps back to the same string in the original text
    int doCleanStart = cleanText.indexOf("do?");
    int doCleanEnd = doCleanStart + 3;
    IntPair doSourceSpan = st.getOriginalOffsets(doCleanStart, doCleanEnd);
    assertEquals("do?", ORIG_TEXT.substring(doSourceSpan.getFirst(), doSourceSpan.getSecond()));
}
use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
the class StringTransformationTest method testReduce.
/**
 * Replaces the first 25 characters of REDUCE with the 3-character string "WWW" and checks the resulting
 * text plus the offset mappings in both directions.
 */
@Test
public void testReduce() {
    // REDUCE is expected to look like: "http://org.edu.net/killit say it's a leg"
    StringTransformation transformation = new StringTransformation(REDUCE);
    transformation.transformString(0, 25, "WWW");
    String transformed = transformation.getTransformedText();

    // The original is untouched; the result is 22 characters shorter (25 removed, 3 inserted).
    assertEquals(REDUCE, transformation.getOrigText());
    assertEquals(REDUCE.length() - 22, transformed.length());
    assertEquals(MODREDUCE, transformed);

    // original -> modified: the edges of the replaced span
    int mappedStart = transformation.computeModifiedOffsetFromOriginal(0);
    int mappedEnd = transformation.computeModifiedOffsetFromOriginal(25);
    assertEquals(0, mappedStart);
    assertEquals(3, mappedEnd);

    // original -> modified: an offset inside the replaced span. The expected value is 3, the same
    // offset that the end of the replaced span maps to.
    int mappedInside = transformation.computeModifiedOffsetFromOriginal(20);
    assertEquals(3, mappedInside);

    // modified -> original: the whole replacement maps back onto the whole replaced span
    IntPair sourceSpan = transformation.getOriginalOffsets(0, 3);
    assertEquals(0, sourceSpan.getFirst());
    assertEquals(25, sourceSpan.getSecond());

    // intermediate edit chars map to same offsets, treated like replacements
    sourceSpan = transformation.getOriginalOffsets(1, 2);
    assertEquals(1, sourceSpan.getFirst());
    assertEquals(2, sourceSpan.getSecond());

    // 1 past the end of the edit
    sourceSpan = transformation.getOriginalOffsets(1, 4);
    assertEquals(26, sourceSpan.getSecond());
}
Aggregations