Search in sources :

Example 1 with XmlTextAnnotationMaker

use of edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker in project cogcomp-nlp by CogComp.

the class OntonotesNerReaderExample method main.

/**
 * Example driver: reads one OntoNotes ".name" file from a hard-coded path, builds an
 * {@link XmlTextAnnotation} from it, and turns every "enamex" xml span into a constituent of a
 * {@link SpanLabelView} named {@code ViewNames.NER_ONTONOTES}. The text annotation, the recovered
 * entity strings, and the resulting view are printed to stdout/stderr.
 *
 * @param args unused.
 * @throws IOException if the input file cannot be read.
 */
public static void main(String[] args) throws ClassNotFoundException, SQLException, IOException {
    String inFile = "/shared/corpora/corporaWeb/multi-mode/multi/ontonotes-release-5.0/data/files/data/english/annotations/nw/wsj/00/wsj_0061.name";
    int counter = 0;
    long start = System.currentTimeMillis();
    // define all tags with text.
    Set<String> tagsWithText = new HashSet<>();
    // define the attributes we want to keep for the tags we have.
    Map<String, Set<String>> tagsWithAtts = new HashMap<>();
    {
        Set<String> docAttrs = new HashSet<>();
        docAttrs.add("docno");
        tagsWithAtts.put("doc", docAttrs);
    }
    {
        Set<String> nameAttrs = new HashSet<>();
        nameAttrs.add("type");
        tagsWithAtts.put("enamex", nameAttrs);
    }
    boolean throwExceptionOnXmlParseFail = true;
    // we keep everything.
    Set<String> dropTags = new HashSet<>();
    // pass the named flag rather than a bare literal so the setting above actually takes effect
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, throwExceptionOnXmlParseFail);
    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
    String document = LineIO.slurp(inFile);
    XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> xmlMarkup = xta.getXmlMarkup();
    System.out.println(ta + "\n");
    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    String cleanText = ta.getText();
    for (XmlDocumentProcessor.SpanInfo si : xmlMarkup) {
        if ("enamex".equalsIgnoreCase(si.label)) {
            // span offsets refer to the original xml text; map them into the cleaned text
            IntPair charOffsets = si.spanOffsets;
            String neLabel = si.attributes.get("type").getFirst();
            int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
            int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
            System.err.println("ne string: '" + cleanText.substring(cleanTextCharStart, cleanTextCharEnd) + "'");
            int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
            // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
            int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
            //constituent token indexing uses one-past-the-end
            Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
            nerView.addConstituent(neCon);
        }
    }
    // one document has been processed at this point; report once, after all spans are handled,
    // instead of once per xml span.
    counter++;
    System.out.println("Read " + counter + " documents in " + (System.currentTimeMillis() - start));
    System.out.println(nerView.toString());
}
Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)

Example 2 with XmlTextAnnotationMaker

use of edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker in project cogcomp-nlp by CogComp.

the class EREDocumentReader method buildXmlTextAnnotationMaker.

/**
 * Creates an {@link XmlTextAnnotationMaker} set up for ERE-annotated xml. The supplied
 * {@link TextAnnotationBuilder} must be configured for the target language.
 *
 * @param textAnnotationBuilder a TextAnnotationBuilder with tokenizer suited to target language.
 * @param ereCorpus the ERE release being read; when it equals {@code EreCorpus.ENR3}, the QUOTE tag
 *                  is added to the set of deletable span tags.
 * @param throwExceptionOnXmlParseFail if 'true', the XmlTextAnnotationMaker will throw an exception if any
 *                                     errors are found in the source xml.
 * @return an XmlTextAnnotationMaker configured to parse an ERE corpus.
 */
public static XmlTextAnnotationMaker buildXmlTextAnnotationMaker(TextAnnotationBuilder textAnnotationBuilder, EreCorpus ereCorpus, boolean throwExceptionOnXmlParseFail) {
    // attribute names to record, one set per tag
    Set<String> postAttributes = new HashSet<>();
    postAttributes.add(AUTHOR);
    postAttributes.add(ID);
    postAttributes.add(DATETIME);

    Set<String> docAttributes = new HashSet<>();
    docAttributes.add(ID);

    Set<String> quoteAttributes = new HashSet<>();
    quoteAttributes.add(ORIG_AUTHOR);

    Map<String, Set<String>> attributesByTag = new HashMap<>();
    attributesByTag.put(POST, postAttributes);
    attributesByTag.put(DOC, docAttributes);
    attributesByTag.put(QUOTE, quoteAttributes);

    // tags whose enclosed spans are to be deleted;
    // for release 3 only, quoted blocks are NOT annotated
    Set<String> spanTagsToDelete = new HashSet<>();
    if (EreCorpus.ENR3.equals(ereCorpus)) {
        spanTagsToDelete.add(QUOTE);
    }

    Set<String> ignoredTags = new HashSet<>();
    ignoredTags.add(IMG);
    ignoredTags.add(SNIP);
    ignoredTags.add(SQUISH);

    return new XmlTextAnnotationMaker(
            textAnnotationBuilder,
            new XmlDocumentProcessor(spanTagsToDelete, attributesByTag, ignoredTags, throwExceptionOnXmlParseFail));
}
Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)

Example 3 with XmlTextAnnotationMaker

use of edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker in project cogcomp-nlp by CogComp.

the class XmlTextAnnotationMakerTest method main.

// public void testNerReader() {
/**
 * Manual driver rather than a unit test, because it needs the ERE corpus to be available.
 * Builds an ERE {@link XmlTextAnnotationMaker} for the ENR3 release and runs
 * {@code testWithFile} on {@code XML_FILE2} and then {@code XML_FILE}.
 *
 * @param args unused.
 */
public static void main(String[] args) {
    // stays null only if construction fails, in which case the process exits below
    XmlTextAnnotationMaker ereMaker = null;
    final boolean failOnTagMiss = true;
    final String corpusName = EREDocumentReader.EreCorpus.ENR3.name();
    try {
        ereMaker = EREDocumentReader.buildEreXmlTextAnnotationMaker(corpusName, failOnTagMiss);
    } catch (Exception buildFailure) {
        buildFailure.printStackTrace();
        System.exit(-1);
    }
    testWithFile(ereMaker, XML_FILE2);
    testWithFile(ereMaker, XML_FILE);
}
Also used : XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker) FileNotFoundException(java.io.FileNotFoundException)

Aggregations

XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker): 3, XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor): 2, IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 1, StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer): 1, TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder): 1, FileNotFoundException (java.io.FileNotFoundException): 1