use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class EREReaderTest method runTest.
/**
 * Reads the first document from an ERE corpus, checks that mapping the cleaned
 * TextAnnotation back to the source text keeps the token and sentence counts, and prints
 * every mention constituent next to the corresponding substring of the original XML.
 *
 * @param ereCorpus the ERE release being read
 * @param corpusRoot root directory of the corpus, passed to the reader
 * @return the XmlTextAnnotation produced by the reader's first call to {@code next()}
 * @throws IllegalStateException if the reader cannot be instantiated (original cause attached)
 */
private static XmlTextAnnotation runTest(EreCorpus ereCorpus, String corpusRoot) {
    // Fixed to true in this test, so the MENTION_ERE branch below is the one exercised.
    boolean addNominalMentions = true;
    boolean throwExceptionOnXmlTagMismatch = true;

    ERENerReader nerReader;
    try {
        nerReader = new EREMentionRelationReader(ereCorpus, corpusRoot, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        String message = "ERROR: " + NAME + ": couldn't instantiate ERENerReader for ERE release " + ereCorpus.name() + ": " + e.getMessage();
        System.err.println(message);
        // Abort here with the real cause instead of continuing with a null reader and
        // failing on the next line with an uninformative NullPointerException.
        throw new IllegalStateException(message, e);
    }

    XmlTextAnnotation outputXmlTa = nerReader.next();
    TextAnnotation output = outputXmlTa.getTextAnnotation();
    StringTransformation xmlSt = outputXmlTa.getXmlSt();

    // Test TextAnnotationUtilities.mapTransformedTextAnnotationToSource()
    TextAnnotation mappedTa = TextAnnotationUtilities.mapTransformedTextAnnotationToSource(output, xmlSt);
    assertEquals(mappedTa.getView(ViewNames.TOKENS).getNumberOfConstituents(), output.getView(ViewNames.TOKENS).getNumberOfConstituents());
    assertEquals(mappedTa.getView(ViewNames.SENTENCE).getNumberOfConstituents(), output.getView(ViewNames.SENTENCE).getNumberOfConstituents());

    // NOTE: these are Java 'assert' statements, so they only fire when the JVM runs with -ea.
    View nerEre;
    if (addNominalMentions) {
        assert (output.hasView(ViewNames.MENTION_ERE));
        nerEre = output.getView(ViewNames.MENTION_ERE);
    } else {
        assert (output.hasView(ViewNames.NER_ERE));
        nerEre = output.getView(ViewNames.NER_ERE);
    }
    assert (nerEre.getConstituents().size() > 0);

    String origXmlStr = xmlSt.getOrigText();
    System.out.println("ERENerReader found " + nerEre.getConstituents().size() + " NER constituents: ");
    for (Constituent c : nerEre.getConstituents()) {
        System.out.println(TextAnnotationPrintHelper.printConstituent(c));
        // Map the constituent's character span in the cleaned text back to the source XML.
        int start = c.getStartCharOffset();
        int end = c.getEndCharOffset();
        IntPair origOffsets = xmlSt.getOriginalOffsets(start, end);
        String origStr = origXmlStr.substring(origOffsets.getFirst(), origOffsets.getSecond());
        System.out.println("Constituent (clean) text: '" + c.getSurfaceForm() + "'");
        System.out.println("Original text: '" + origStr + "'\n---------\n");
    }
    System.out.println("Report: " + nerReader.generateReport());
    return outputXmlTa;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class XmlTextAnnotationMakerTest method testWithFile.
/**
 * Builds an XmlTextAnnotation from the contents of an XML file and prints diagnostics that
 * compare spans in the cleaned (transformed) text with the corresponding spans of the
 * original XML: the third token, every sentence, and finally the retained XML markup with
 * its attribute values.
 *
 * @param maker the annotation maker under test
 * @param xmlFile path of the XML file to read
 * @throws IllegalStateException if the file cannot be found (original cause attached)
 */
private static void testWithFile(XmlTextAnnotationMaker maker, String xmlFile) {
    String xmlStr;
    try {
        xmlStr = LineIO.slurp(xmlFile);
    } catch (FileNotFoundException e) {
        // Throw rather than System.exit(): exiting would terminate the whole test JVM and
        // hide which test failed and why.
        throw new IllegalStateException("could not read test file '" + xmlFile + "'", e);
    }

    XmlTextAnnotation output = maker.createTextAnnotation(xmlStr, "test", "test");
    TextAnnotation ta = output.getTextAnnotation();
    Sentence firstSentence = ta.getSentence(0);
    String firstSentenceText = firstSentence.getText();
    System.out.println(firstSentenceText);

    // Token span (2, 3) is the third token of the document.
    Constituent thirdWord = ta.getView(ViewNames.TOKENS).getConstituentsCoveringSpan(2, 3).get(0);
    int thirdStartChar = thirdWord.getStartCharOffset();
    int thirdEndChar = thirdWord.getEndCharOffset();
    String thirdWordForm = thirdWord.getSurfaceForm();

    StringTransformation st = output.getXmlSt();
    IntPair origSpan = st.getOriginalOffsets(thirdStartChar, thirdEndChar);
    String origWordForm = st.getOrigText().substring(origSpan.getFirst(), origSpan.getSecond());
    System.out.println("Third word: " + thirdWordForm);
    String transformStr = st.getTransformedText().substring(thirdStartChar, thirdEndChar);
    System.out.println("corresponding substring from transformed text: " + transformStr);
    System.out.println("original text substring using mapped offsets: " + origWordForm);
    // A mismatch is only reported on stderr; it does not abort the run.
    if (!transformStr.equals(origWordForm)) {
        System.err.println("ERROR: test failed: word '" + transformStr + "' not identical to original word '" + origWordForm + "'. ");
    }

    // Print each sentence with its clean offsets, then the mapped source text and offsets.
    View sentenceView = ta.getView(ViewNames.SENTENCE);
    for (Constituent c : sentenceView.getConstituents()) {
        int start = c.getStartCharOffset();
        int end = c.getEndCharOffset();
        String cleanForm = c.getSurfaceForm();
        IntPair sourceSpan = st.getOriginalOffsets(start, end);
        System.out.println("------\nclean: " + cleanForm + ", (" + start + ", " + end + ")");
        System.out.println("------\nsource: " + st.getOrigText().substring(sourceSpan.getFirst(), sourceSpan.getSecond()) + ", (" + sourceSpan.getFirst() + ", " + sourceSpan.getSecond() + ")\n");
    }

    // Dump the retained markup: one line of offsets, then one line per attribute value.
    List<XmlDocumentProcessor.SpanInfo> markup = output.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupMap = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    for (Map.Entry<IntPair, XmlDocumentProcessor.SpanInfo> span : markupMap.entrySet()) {
        IntPair offsets = span.getKey();
        System.out.print(offsets.getFirst() + "-" + offsets.getSecond() + ": ");
        Map<String, Pair<String, IntPair>> attVals = span.getValue().attributes;
        for (Map.Entry<String, Pair<String, IntPair>> att : attVals.entrySet()) {
            System.out.println(att.getKey() + ": " + att.getValue().getFirst());
        }
        System.out.println();
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class OntonotesNerReaderExample method main.
/**
 * Example driver: reads one OntoNotes ".name" file, strips its XML markup while keeping the
 * "enamex" spans, and builds a span-label view holding one constituent per "enamex" span,
 * labelled with that span's "type" attribute. The view is printed once at the end.
 *
 * @param args optional; if present, {@code args[0]} is the file to read, otherwise a default
 *        path is used
 */
public static void main(String[] args) throws ClassNotFoundException, SQLException, IOException {
    String defaultInFile = "/shared/corpora/corporaWeb/multi-mode/multi/ontonotes-release-5.0/data/files/data/english/annotations/nw/wsj/00/wsj_0061.name";
    String inFile = (args != null && args.length > 0) ? args[0] : defaultInFile;
    long start = System.currentTimeMillis();

    // define all tags with text.
    Set<String> tagsWithText = new HashSet<>();

    // define the attributes we want to keep for the tags we have.
    Map<String, Set<String>> tagsWithAtts = new HashMap<>();
    Set<String> docAttrs = new HashSet<>();
    docAttrs.add("docno");
    tagsWithAtts.put("doc", docAttrs);
    Set<String> nameAttrs = new HashSet<>();
    nameAttrs.add("type");
    tagsWithAtts.put("enamex", nameAttrs);

    boolean throwExceptionOnXmlParseFail = true;
    // we keep everything.
    Set<String> dropTags = new HashSet<>();
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, throwExceptionOnXmlParseFail);
    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);

    String document = LineIO.slurp(inFile);
    XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> markup = xta.getXmlMarkup();
    StringTransformation xmlSt = xta.getXmlSt();
    System.out.println(ta + "\n");

    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    String cleanText = ta.getText();
    for (XmlDocumentProcessor.SpanInfo si : markup) {
        if (!"enamex".equalsIgnoreCase(si.label)) {
            continue;
        }
        IntPair charOffsets = si.spanOffsets;
        String neLabel = si.attributes.get("type").getFirst();
        // Span offsets refer to the original XML; convert them to offsets in the clean text.
        int cleanTextCharStart = xmlSt.computeModifiedOffsetFromOriginal(charOffsets.getFirst());
        int cleanTextCharEnd = xmlSt.computeModifiedOffsetFromOriginal(charOffsets.getSecond());
        System.err.println("ne string: '" + cleanText.substring(cleanTextCharStart, cleanTextCharEnd) + "'");
        int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
        // constituent token indexing uses one-past-the-end
        Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
        nerView.addConstituent(neCon);
    }

    // Exactly one document is processed, so report once after all spans have been added
    // (reporting inside the loop counted markup spans and reprinted the view each time).
    int documentsRead = 1;
    System.out.println("Read " + documentsRead + " documents in " + (System.currentTimeMillis() - start));
    System.out.println(nerView.toString());
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class StanfordAnalyzer method tokenizeTextSpan.
/**
 * Annotates a span of text with the configured pipeline and returns the result as a
 * {@link Tokenization}: the token surface strings in order, one character-offset pair per
 * token (taken from each token's begin and end position), and, for every sentence, the
 * running token count at the point where that sentence ends.
 *
 * @param textSpan the text to tokenize and split into sentences
 * @return the tokens, their character offsets and the sentence end indexes
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    Annotation annotated = new Annotation(textSpan);
    pipeline.annotate(annotated);

    List<CoreMap> sentenceMaps = annotated.get(CoreAnnotations.SentencesAnnotation.class);
    int[] sentenceEndTokens = new int[sentenceMaps.size()];
    List<CoreLabel> allTokens = new ArrayList<>();
    int sentenceCount = 0;
    for (CoreMap sentenceMap : sentenceMaps) {
        allTokens.addAll(sentenceMap.get(CoreAnnotations.TokensAnnotation.class));
        // One-past-the-last token index of this sentence within the flattened token list.
        sentenceEndTokens[sentenceCount] = allTokens.size();
        sentenceCount++;
    }

    int tokenCount = allTokens.size();
    String[] forms = new String[tokenCount];
    IntPair[] charSpans = new IntPair[tokenCount];
    int position = 0;
    for (CoreLabel label : allTokens) {
        forms[position] = label.originalText();
        charSpans[position] = new IntPair(label.beginPosition(), label.endPosition());
        position++;
    }
    return new Tokenization(forms, charSpans, sentenceEndTokens);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ThaiTokenizer method getTextAnnotation.
/**
 * Splits the text with a Thai-locale word BreakIterator and wraps the result in a
 * TextAnnotation. Segments that are empty after trimming are dropped; every remaining
 * segment becomes a token with its character span in {@code text}. If at least one token was
 * found, the whole text is recorded as a single sentence ending after the last token.
 *
 * @param text the text to tokenize
 * @return a TextAnnotation over {@code text} with empty corpus and document ids
 */
public TextAnnotation getTextAnnotation(String text) {
    List<String> tokenForms = new ArrayList<>();
    List<IntPair> tokenSpans = new ArrayList<>();
    List<Integer> sentenceEnds = new ArrayList<>();

    BreakIterator wordBreaks = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
    wordBreaks.setText(text);
    int from = wordBreaks.first();
    int to = wordBreaks.next();
    while (to != BreakIterator.DONE) {
        String piece = text.substring(from, to);
        // Keep only segments with visible content; whitespace-only segments are skipped.
        if (!piece.trim().isEmpty()) {
            tokenForms.add(piece);
            tokenSpans.add(new IntPair(from, to));
        }
        from = to;
        to = wordBreaks.next();
    }

    // Close the final sentence unless an end at this token count is already recorded.
    if (!tokenForms.isEmpty()) {
        boolean lastEndMissing = sentenceEnds.isEmpty()
                || sentenceEnds.get(sentenceEnds.size() - 1) != tokenForms.size();
        if (lastEndMissing) {
            sentenceEnds.add(tokenForms.size());
        }
    }

    IntPair[] spanArray = tokenSpans.toArray(new IntPair[0]);
    String[] formArray = tokenForms.toArray(new String[0]);
    int[] endArray = new int[sentenceEnds.size()];
    int slot = 0;
    for (Integer sentenceEnd : sentenceEnds) {
        endArray[slot++] = sentenceEnd;
    }
    return new TextAnnotation("", "", text, spanArray, formArray, endArray);
}
Aggregations