use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class OntonotesNamedEntityReader method main.
/**
 * Reads the OntoNotes data found under the input directory and writes one output file per
 * document beneath the output directory, substituting the output directory for the input
 * directory in each document's path so the original directory layout is retained.
 * <p>
 * Output format is selected by the local {@code producejson} flag, which is currently hard-wired
 * to {@code true}: each document's TextAnnotation is serialized to JSON. The alternative branch
 * (kept for reference) writes the NER view in CoNLL column format to {@code <path>.conll}.
 * <p>
 * A document that fails to serialize or write is reported and skipped; processing continues with
 * the next document.
 *
 * @param args command line args: input data directory, language, and output directory.
 * @throws IOException declared for the reader and the CoNLL writer; failures inside the JSON
 *         branch are caught and reported instead of propagating.
 */
public static void main(String[] args) throws IOException {
    if (args.length < 3) {
        System.err.println("This executable requires three arguments:\n" + " OntonotesNamedEntityReader <OntoNotes Directory> <language> <output_directory>");
        System.exit(-1);
    }
    String topdir = args[0];
    String outputdir = args[2];
    OntonotesNamedEntityReader otr = new OntonotesNamedEntityReader(topdir, args[1]);
    int count = 0;
    final boolean producejson = true;
    while (otr.hasNext()) {
        XmlTextAnnotation xta = otr.next();
        String path = otr.currentfile;
        if (producejson) {
            try {
                String json = SerializationHelper.serializeToJson(xta.getTextAnnotation());
                String outfile = path.replace(topdir, outputdir);
                File outputfile = new File(outfile);
                outputfile.getParentFile().mkdirs();
                // Write UTF-8 explicitly: PrintWriter(File) would fall back to the platform
                // default charset and could mangle non-ASCII text in the JSON.
                try (PrintWriter out = new PrintWriter(outputfile, "UTF-8")) {
                    out.print(json);
                }
            } catch (Throwable t) {
                // Deliberately broad so one bad document does not abort the whole run, but the
                // real cause is reported as well: not every failure here is an offset problem.
                System.out.println(path + " produced the incorrect offset.");
                System.err.println("Failed to produce JSON for " + path + ": " + t);
            }
        } else {
            TextAnnotation ta = xta.getTextAnnotation();
            path = outputdir + path.substring(topdir.length());
            path += ".conll";
            CoNLL2002Writer.writeViewInCoNLL2003Format(ta.getView(VIEW_NAME), ta, path);
        }
        count++;
        // progress report every ten documents
        if ((count % 10) == 0)
            System.out.println("Completed " + count + " of " + otr.filelist.size());
    }
    System.out.println(otr.generateReport());
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class ConvertOntonotesToColumn method getNameTextAnnotation.
/**
 * Reads the named file, strips its xml markup to build a TextAnnotation, and attaches an
 * {@code NER_ONTONOTES} view holding one constituent per {@code enamex} element found in the
 * markup. Each constituent is labeled with the first value of the element's {@code type}
 * attribute.
 *
 * @param file the file to read.
 * @return the XmlTextAnnotation containing the text annotation, and xml markup offset data.
 * @throws IOException declared by this method; presumably raised when the file cannot be
 *         resolved or read.
 */
private static XmlTextAnnotation getNameTextAnnotation(File file) throws IOException {
    final String rawXml = LineIO.slurp(file.getCanonicalPath());

    // Original author's note: "we keep everything" (presumably what the trailing 'true'
    // requests -- confirm against XmlDocumentProcessor).
    final XmlDocumentProcessor markupProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    final XmlTextAnnotationMaker maker = new XmlTextAnnotationMaker(
            new TokenizerTextAnnotationBuilder(new StatefulTokenizer()), markupProcessor);

    // Parse the document into cleaned text plus the xml markup spans.
    final XmlTextAnnotation result = maker.createTextAnnotation(rawXml, "OntoNotes 5.0", "test");
    final TextAnnotation cleanTa = result.getTextAnnotation();
    final List<SpanInfo> markupSpans = result.getXmlMarkup();

    // Build the named entity view from the enamex spans.
    final View entityView = new SpanLabelView(ViewNames.NER_ONTONOTES, cleanTa);
    for (SpanInfo span : markupSpans) {
        if (!"enamex".equalsIgnoreCase(span.label)) {
            continue;
        }
        final IntPair sourceOffsets = span.spanOffsets;
        final String entityType = span.attributes.get("type").getFirst();

        // Map character offsets in the original xml onto the cleaned text.
        final int cleanStart = result.getXmlSt().computeModifiedOffsetFromOriginal(sourceOffsets.getFirst());
        final int cleanEnd = result.getXmlSt().computeModifiedOffsetFromOriginal(sourceOffsets.getSecond());

        final int firstToken = cleanTa.getTokenIdFromCharacterOffset(cleanStart);
        // The mapped end offset is one past the last character, so look up the token
        // containing the final character itself.
        final int lastToken = cleanTa.getTokenIdFromCharacterOffset(cleanEnd - 1);

        // Constituent token spans are end-exclusive, hence lastToken + 1.
        entityView.addConstituent(
                new Constituent(entityType, entityView.getViewName(), cleanTa, firstToken, lastToken + 1));
    }
    cleanTa.addView(ViewNames.NER_ONTONOTES, entityView);
    return result;
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class MultilingualEreReaderTest method testReader.
/**
 * Checks the first document produced by the reader: it must carry both the ERE mention view
 * and the ERE event view, each with a minimum number of constituents and relations.
 *
 * @param reader the reader under test; it must have at least one document available.
 */
private static void testReader(EREEventReader reader) {
    assertTrue(reader.hasNext());
    final TextAnnotation annotation = reader.next().getTextAnnotation();

    // mention view: more than 5 constituents and at least one relation
    assertTrue(annotation.hasView(ViewNames.MENTION_ERE));
    assertTrue(annotation.getView(ViewNames.MENTION_ERE).getConstituents().size() > 5);
    assertTrue(annotation.getView(ViewNames.MENTION_ERE).getRelations().size() > 0);

    // event view: more than 1 constituent and more than 2 relations
    assertTrue(annotation.hasView(ViewNames.EVENT_ERE));
    assertTrue(annotation.getView(ViewNames.EVENT_ERE).getConstituents().size() > 1);
    assertTrue(annotation.getView(ViewNames.EVENT_ERE).getRelations().size() > 2);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class MentionDetectionTest method testHeadInference.
/**
 * Runs the POS annotator and the mention annotator over every document of the ERE test corpus
 * and fails if any document ends up with fewer than 60 constituents in its "MENTION" view.
 * <p>
 * Any exception raised while building the reader or annotating is turned into a test failure
 * rather than merely being printed, so a broken setup can no longer pass silently.
 */
@Test
public void testHeadInference() {
    try {
        String path = "src/test/resources/ERE";
        EREMentionRelationReader ereMentionRelationReader = new EREMentionRelationReader(EREDocumentReader.EreCorpus.ENR3, path, false);
        POSAnnotator posAnnotator = new POSAnnotator();
        MentionAnnotator mentionAnnotator = new MentionAnnotator();
        for (XmlTextAnnotation xta : ereMentionRelationReader) {
            TextAnnotation ta = xta.getTextAnnotation();
            ta.addView(posAnnotator);
            mentionAnnotator.addView(ta);
            if (ta.getView("MENTION").getNumberOfConstituents() < 60) {
                fail("Mention Head predicted performance dropped");
            }
        }
    } catch (Exception e) {
        // fail() above throws an Error and is not caught here; this handler only sees
        // genuine setup/annotation failures, which must fail the test with their cause attached.
        throw new AssertionError("testHeadInference could not run to completion: " + e, e);
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class XmlTextAnnotationMaker method createTextAnnotation.
/**
 * Builds an {@link XmlTextAnnotation} from raw xml: the xml processor separates the text from
 * the markup, and the resulting transformed text is tokenized into a {@link TextAnnotation}.
 *
 * @param xmlText Raw xml text from corpus document
 * @param corpusId corpus identifier
 * @param docId text identifier
 * @return an XmlTextAnnotation with the cleaned text (StringTransformation), TextAnnotation for
 *         the cleaned text, and xml markup extracted from source
 */
public XmlTextAnnotation createTextAnnotation(String xmlText, String corpusId, String docId) {
    logger.debug("processing text from document {}", docId);

    // First element: the string transformation; second element: the extracted markup spans.
    Pair<StringTransformation, List<XmlDocumentProcessor.SpanInfo>> processed = xmlProcessor.processXml(xmlText);
    StringTransformation cleanedText = processed.getFirst();
    List<XmlDocumentProcessor.SpanInfo> markup = processed.getSecond();

    TextAnnotation annotation = taBuilder.createTextAnnotation(corpusId, docId, cleanedText.getTransformedText());
    return new XmlTextAnnotation(cleanedText, annotation, markup);
}
Aggregations