Search in sources:

Example 26 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class EREReaderTest method runTest.

/**
 * Pulls one {@link XmlTextAnnotation} from an {@link EREMentionRelationReader} built over the
 * given corpus and sanity-checks it.
 *
 * <p>Checks performed: the annotation mapped back to the source text has the same number of
 * TOKENS and SENTENCE constituents as the cleaned annotation, and the expected mention view is
 * present and non-empty. Every constituent of that view is then printed next to the substring
 * of the original XML text that its character offsets map back to.
 *
 * @param ereCorpus ERE release, passed to the reader constructor
 * @param corpusRoot corpus location, passed to the reader constructor
 * @return the annotation returned by the reader's {@code next()}
 * @throws IllegalStateException if the reader cannot be instantiated
 * @throws AssertionError if the expected mention view is missing or empty
 */
private static XmlTextAnnotation runTest(EreCorpus ereCorpus, String corpusRoot) {
    boolean addNominalMentions = true;
    boolean throwExceptionOnXmlTagMismatch = true;
    ERENerReader nerReader;
    try {
        nerReader = new EREMentionRelationReader(ereCorpus, corpusRoot, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        String message = "ERROR: " + NAME + ": couldn't instantiate ERENerReader for ERE release " + ereCorpus.name() + ": " + e.getMessage();
        System.err.println(message);
        // Fail here with the real cause instead of continuing with a null reader and
        // dying on an unrelated NullPointerException one line later.
        throw new IllegalStateException(message, e);
    }
    XmlTextAnnotation outputXmlTa = nerReader.next();
    TextAnnotation output = outputXmlTa.getTextAnnotation();
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    // Test TextAnnotationUtilities.mapTransformedTextAnnotationToSource()
    TextAnnotation mappedTa = TextAnnotationUtilities.mapTransformedTextAnnotationToSource(output, xmlSt);
    assertEquals(mappedTa.getView(ViewNames.TOKENS).getNumberOfConstituents(), output.getView(ViewNames.TOKENS).getNumberOfConstituents());
    assertEquals(mappedTa.getView(ViewNames.SENTENCE).getNumberOfConstituents(), output.getView(ViewNames.SENTENCE).getNumberOfConstituents());
    // Explicit throws rather than the `assert` keyword, so the checks still run when the
    // JVM is started without -ea.
    View nerEre;
    if (addNominalMentions) {
        if (!output.hasView(ViewNames.MENTION_ERE)) {
            throw new AssertionError("expected view " + ViewNames.MENTION_ERE + " is missing");
        }
        nerEre = output.getView(ViewNames.MENTION_ERE);
    } else {
        if (!output.hasView(ViewNames.NER_ERE)) {
            throw new AssertionError("expected view " + ViewNames.NER_ERE + " is missing");
        }
        nerEre = output.getView(ViewNames.NER_ERE);
    }
    if (nerEre.getConstituents().isEmpty()) {
        throw new AssertionError("mention view contains no constituents");
    }
    String origXmlStr = xmlSt.getOrigText();
    System.out.println("ERENerReader found " + nerEre.getConstituents().size() + " NER constituents: ");
    for (Constituent c : nerEre.getConstituents()) {
        System.out.println(TextAnnotationPrintHelper.printConstituent(c));
        // Map the constituent's offsets in the cleaned text back onto the original XML text.
        IntPair origOffsets = xmlSt.getOriginalOffsets(c.getStartCharOffset(), c.getEndCharOffset());
        String origStr = origXmlStr.substring(origOffsets.getFirst(), origOffsets.getSecond());
        System.out.println("Constituent (clean) text: '" + c.getSurfaceForm() + "'");
        System.out.println("Original text: '" + origStr + "'\n---------\n");
    }
    System.out.println("Report: " + nerReader.generateReport());
    return outputXmlTa;
}
Also used : EREMentionRelationReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader) ERENerReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.ERENerReader) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IOException(java.io.IOException)

Example 27 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class XmlTextAnnotationMakerTest method testWithFile.

/**
 * Builds an {@link XmlTextAnnotation} from the contents of {@code xmlFile} and prints
 * diagnostics comparing offsets in the cleaned text with offsets in the original XML text.
 *
 * <p>Printed, in order: the first sentence; the third token together with the matching
 * substrings of the transformed and original text (a mismatch is reported on stderr only);
 * every SENTENCE constituent with its clean and source spans; and every markup span with the
 * first element of each of its attribute values.
 *
 * <p>If the file cannot be found the stack trace is printed and the JVM exits with status -1.
 *
 * @param maker builder used to create the annotation from the XML string
 * @param xmlFile path of the XML file to read
 */
private static void testWithFile(XmlTextAnnotationMaker maker, String xmlFile) {
    String xmlContents = null;
    try {
        xmlContents = LineIO.slurp(xmlFile);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        System.exit(-1);
    }
    XmlTextAnnotation xmlTa = maker.createTextAnnotation(xmlContents, "test", "test");
    TextAnnotation ta = xmlTa.getTextAnnotation();
    System.out.println(ta.getSentence(0).getText());

    // Third token (token span [2, 3)) and its character offsets in the cleaned text.
    Constituent token = ta.getView(ViewNames.TOKENS).getConstituentsCoveringSpan(2, 3).get(0);
    int tokenStart = token.getStartCharOffset();
    int tokenEnd = token.getEndCharOffset();
    String tokenForm = token.getSurfaceForm();

    StringTransformation transformation = xmlTa.getXmlSt();
    IntPair tokenSourceSpan = transformation.getOriginalOffsets(tokenStart, tokenEnd);
    String sourceForm = transformation.getOrigText().substring(tokenSourceSpan.getFirst(), tokenSourceSpan.getSecond());
    System.out.println("Third word: " + tokenForm);
    String transformedForm = transformation.getTransformedText().substring(tokenStart, tokenEnd);
    System.out.println("corresponding substring from transformed text: " + transformedForm);
    System.out.println("original text substring using mapped offsets: " + sourceForm);
    if (!transformedForm.equals(sourceForm)) {
        System.err.println("ERROR: test failed: word '" + transformedForm + "' not identical to original word '" + sourceForm + "'. ");
    }

    // Dump every sentence with its clean span and the span it maps to in the source text.
    View sentenceView = xmlTa.getTextAnnotation().getView(ViewNames.SENTENCE);
    for (Constituent sentence : sentenceView.getConstituents()) {
        int cleanStart = sentence.getStartCharOffset();
        int cleanEnd = sentence.getEndCharOffset();
        String cleanText = sentence.getSurfaceForm();
        IntPair mapped = transformation.getOriginalOffsets(cleanStart, cleanEnd);
        System.out.println("------\nclean: " + cleanText + ", (" + cleanStart + ", " + cleanEnd + ")");
        System.out.println("------\nsource: " + transformation.getOrigText().substring(mapped.getFirst(), mapped.getSecond()) + ", (" + mapped.getFirst() + ", " + mapped.getSecond() + ")\n");
    }

    // Dump the markup spans keyed by offset pair, one attribute per line.
    List<XmlDocumentProcessor.SpanInfo> markup = xmlTa.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> spansByOffsets = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    for (Map.Entry<IntPair, XmlDocumentProcessor.SpanInfo> spanEntry : spansByOffsets.entrySet()) {
        IntPair span = spanEntry.getKey();
        System.out.print(span.getFirst() + "-" + span.getSecond() + ": ");
        Map<String, Pair<String, IntPair>> attributes = spanEntry.getValue().attributes;
        for (Map.Entry<String, Pair<String, IntPair>> attribute : attributes.entrySet()) {
            System.out.println(attribute.getKey() + ": " + attribute.getValue().getFirst());
        }
        System.out.println();
    }
}
Also used: FileNotFoundException(java.io.FileNotFoundException) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 28 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class OntonotesNerReaderExample method main.

/**
 * Example: reads a single OntoNotes {@code .name} file, turns it into an
 * {@link XmlTextAnnotation}, and builds an NER view from its {@code enamex} markup spans.
 *
 * <p>For each {@code enamex} span, the span's offsets in the original text are converted to
 * offsets in the cleaned text, then to token indexes, and a {@link Constituent} labelled with
 * the span's {@code type} attribute is added to the view. The elapsed time and the finished
 * view are printed once, after all spans have been processed.
 *
 * @param args unused; the input path is hard-coded
 * @throws IOException declared for the call that reads the input file
 */
public static void main(String[] args) throws ClassNotFoundException, SQLException, IOException {
    String inFile = "/shared/corpora/corporaWeb/multi-mode/multi/ontonotes-release-5.0/data/files/data/english/annotations/nw/wsj/00/wsj_0061.name";
    long start = System.currentTimeMillis();
    // define all tags with text (none for this format).
    Set<String> tagsWithText = new HashSet<>();
    // define the attributes we want to keep for the tags we have.
    Map<String, Set<String>> tagsWithAtts = new HashMap<>();
    {
        Set<String> docAttrs = new HashSet<>();
        docAttrs.add("docno");
        tagsWithAtts.put("doc", docAttrs);
    }
    {
        Set<String> nameAttrs = new HashSet<>();
        nameAttrs.add("type");
        tagsWithAtts.put("enamex", nameAttrs);
    }
    // we keep everything.
    Set<String> dropTags = new HashSet<>();
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
    String document = LineIO.slurp(inFile);
    XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> markup = xta.getXmlMarkup();
    System.out.println(ta + "\n");
    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    String cleanText = ta.getText();
    for (XmlDocumentProcessor.SpanInfo si : markup) {
        if (!"enamex".equalsIgnoreCase(si.label)) {
            continue;
        }
        IntPair charOffsets = si.spanOffsets;
        String neLabel = si.attributes.get("type").getFirst();
        int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
        int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
        System.err.println("ne string: '" + cleanText.substring(cleanTextCharStart, cleanTextCharEnd) + "'");
        int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
        // constituent token indexing uses one-past-the-end
        Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
        nerView.addConstituent(neCon);
    }
    // Exactly one document is processed. Report it once, after the loop, rather than
    // bumping the document count and dumping the whole view for every markup span.
    int counter = 1;
    System.out.println("Read " + counter + " documents in " + (System.currentTimeMillis() - start));
    System.out.println(nerView.toString());
}
Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)

Example 29 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class StanfordAnalyzer method tokenizeTextSpan.

/**
 * Runs the pipeline over a span of text and packages the result as a {@link Tokenization}.
 *
 * <p>The returned object is built from three parallel pieces: the tokens' original text in
 * document order, one {@link IntPair} per token holding the begin and end positions reported
 * by the token, and for each sentence the running token count at the end of that sentence
 * (i.e. a one-past-the-end token index).
 *
 * @param textSpan the text to tokenize and sentence-split
 * @return the tokens, their character offsets, and the sentence end indexes
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    Annotation document = new Annotation(textSpan);
    pipeline.annotate(document);

    // Flatten the per-sentence token lists, remembering where each sentence stops.
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    List<CoreLabel> allTokens = new ArrayList<>();
    int[] sentenceEnds = new int[sentences.size()];
    int sentenceCount = 0;
    for (CoreMap sentence : sentences) {
        allTokens.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class));
        sentenceEnds[sentenceCount] = allTokens.size();
        sentenceCount++;
    }

    // Split each token into its surface string and its (begin, end) position pair.
    int tokenCount = allTokens.size();
    String[] tokenTexts = new String[tokenCount];
    IntPair[] tokenSpans = new IntPair[tokenCount];
    int position = 0;
    for (CoreLabel token : allTokens) {
        tokenTexts[position] = token.originalText();
        tokenSpans[position] = new IntPair(token.beginPosition(), token.endPosition());
        position++;
    }
    return new Tokenization(tokenTexts, tokenSpans, sentenceEnds);
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 30 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ThaiTokenizer method getTextAnnotation.

/**
 * Tokenizes {@code text} with a Thai-locale word {@link BreakIterator} and wraps the result in
 * a {@link TextAnnotation}.
 *
 * <p>Every segment between two consecutive boundaries becomes a token unless it is empty after
 * trimming. No sentence splitting is done: when at least one token was found, a single
 * sentence end is recorded after the last token.
 *
 * @param text the raw text to tokenize
 * @return a TextAnnotation built with empty corpus and document identifiers, the given text,
 *         the token offsets and surfaces, and the sentence end indexes
 */
public TextAnnotation getTextAnnotation(String text) {
    List<String> tokenTexts = new ArrayList<>();
    List<IntPair> tokenSpans = new ArrayList<>();
    List<Integer> sentenceEnds = new ArrayList<>();

    BreakIterator wordBreaks = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
    wordBreaks.setText(text);
    int from = wordBreaks.first();
    int to = wordBreaks.next();
    while (to != BreakIterator.DONE) {
        String piece = text.substring(from, to);
        // Whitespace-only segments are not tokens.
        if (!piece.trim().isEmpty()) {
            tokenTexts.add(piece);
            tokenSpans.add(new IntPair(from, to));
        }
        from = to;
        to = wordBreaks.next();
    }

    // Close the final sentence unless an end was already recorded at the last token.
    int tokenCount = tokenTexts.size();
    boolean lastSentenceClosed = !sentenceEnds.isEmpty() && sentenceEnds.get(sentenceEnds.size() - 1) == tokenCount;
    if (tokenCount > 0 && !lastSentenceClosed) {
        sentenceEnds.add(tokenCount);
    }

    IntPair[] spanArray = tokenSpans.toArray(new IntPair[tokenSpans.size()]);
    String[] textArray = tokenTexts.toArray(new String[tokenTexts.size()]);
    int[] endArray = new int[sentenceEnds.size()];
    int slot = 0;
    for (Integer sentenceEnd : sentenceEnds) {
        endArray[slot] = sentenceEnd;
        slot++;
    }
    return new TextAnnotation("", "", text, spanArray, textArray, endArray);
}
Also used : Locale(java.util.Locale) ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) BreakIterator(java.text.BreakIterator) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 129, Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair): 39, ArrayList (java.util.ArrayList): 27, TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 26, Test (org.junit.Test): 21, Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent): 18, StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation): 14, XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor): 8, SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView): 7, Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree): 6, TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder): 5, Matcher (java.util.regex.Matcher): 5, View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View): 4, XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation): 4, Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence): 4, FileNotFoundException (java.io.FileNotFoundException): 4, IOException (java.io.IOException): 4, JsonObject (com.google.gson.JsonObject): 3, TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder): 3, PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView): 3