Search in sources :

Example 11 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class XmlTextAnnotationMakerTest method testWithFile.

/**
 * Smoke-tests the offset mapping of an {@link XmlTextAnnotationMaker} on one XML file.
 *
 * <p>Reads the file, builds the annotation, and prints (a) the first sentence, (b) the third
 * token together with the substring recovered from the original text through the
 * {@link StringTransformation} offsets, (c) every sentence constituent next to its source-text
 * counterpart, and (d) the attributes of every XML markup span.
 *
 * <p>A mismatch between the transformed and the original third word is only reported on
 * {@code System.err}; it does not throw. If the file cannot be found the JVM exits with -1.
 *
 * @param maker   annotation maker under test
 * @param xmlFile path of the XML file to read
 */
private static void testWithFile(XmlTextAnnotationMaker maker, String xmlFile) {
    String xmlContent = null;
    try {
        xmlContent = LineIO.slurp(xmlFile);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        System.exit(-1);
    }

    XmlTextAnnotation output = maker.createTextAnnotation(xmlContent, "test", "test");
    TextAnnotation ta = output.getTextAnnotation();

    // First sentence of the cleaned text.
    Sentence openingSentence = ta.getSentence(0);
    System.out.println(openingSentence.getText());

    // Third token (token span [2, 3)) and its character offsets in the cleaned text.
    Constituent thirdToken = ta.getView(ViewNames.TOKENS).getConstituentsCoveringSpan(2, 3).get(0);
    int tokenStart = thirdToken.getStartCharOffset();
    int tokenEnd = thirdToken.getEndCharOffset();
    String tokenForm = thirdToken.getSurfaceForm();

    // Map the cleaned-text offsets back onto the original XML text.
    StringTransformation st = output.getXmlSt();
    IntPair originalSpan = st.getOriginalOffsets(tokenStart, tokenEnd);
    String originalForm = st.getOrigText().substring(originalSpan.getFirst(), originalSpan.getSecond());
    System.out.println("Third word: " + tokenForm);
    String transformedForm = st.getTransformedText().substring(tokenStart, tokenEnd);
    System.out.println("corresponding substring from transformed text: " + transformedForm);
    System.out.println("original text substring using mapped offsets: " + originalForm);
    if (!transformedForm.equals(originalForm)) {
        System.err.println("ERROR: test failed: word '" + transformedForm + "' not identical to original word '" + originalForm + "'. ");
    }

    // Print each sentence constituent alongside the source text it maps back to.
    View sentenceView = output.getTextAnnotation().getView(ViewNames.SENTENCE);
    for (Constituent sentence : sentenceView.getConstituents()) {
        int cleanStart = sentence.getStartCharOffset();
        int cleanEnd = sentence.getEndCharOffset();
        String cleanForm = sentence.getSurfaceForm();
        IntPair sourceSpan = st.getOriginalOffsets(cleanStart, cleanEnd);
        System.out.println("------\nclean: " + cleanForm + ", (" + cleanStart + ", " + cleanEnd + ")");
        System.out.println("------\nsource: " + st.getOrigText().substring(sourceSpan.getFirst(), sourceSpan.getSecond()) + ", (" + sourceSpan.getFirst() + ", " + sourceSpan.getSecond() + ")\n");
    }

    // Dump the XML markup: one "start-end: " header per span followed by its attributes.
    List<XmlDocumentProcessor.SpanInfo> markup = output.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupMap = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    for (Map.Entry<IntPair, XmlDocumentProcessor.SpanInfo> spanEntry : markupMap.entrySet()) {
        IntPair offsets = spanEntry.getKey();
        System.out.print(offsets.getFirst() + "-" + offsets.getSecond() + ": ");
        Map<String, Pair<String, IntPair>> attVals = spanEntry.getValue().attributes;
        for (Map.Entry<String, Pair<String, IntPair>> attribute : attVals.entrySet()) {
            System.out.println(attribute.getKey() + ": " + attribute.getValue().getFirst());
        }
        System.out.println();
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 12 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class HeadFinderDependencyViewGenerator method getDependencyTree.

/**
 * Builds a dependency {@link TreeView} from an existing parse-tree view.
 *
 * <p>For every sentence that has a parse tree, the root constituent is converted into a
 * labeled dependency tree by a {@link CollinsHeadDependencyParser} and stored in the new view.
 * Sentences without a parse tree are skipped. After each sentence the number of constituents in
 * the dependency view is compared with the number of tokens processed so far; a mismatch is
 * logged together with the tokens whose index is not the start span of any constituent.
 *
 * @param input              text annotation holding the parse view
 * @param parseViewName      name of the parse-tree view to read
 * @param dependencyViewName name given to the dependency view that is created
 * @return the newly created dependency view
 * @throws IllegalStateException rethrown from {@code setDependencyTree} after diagnostic output
 *                               has been written to {@code System.err}
 */
public static TreeView getDependencyTree(TextAnnotation input, String parseViewName, String dependencyViewName) {
    CollinsHeadDependencyParser headParser = new CollinsHeadDependencyParser(false);
    TreeView parseView = (TreeView) input.getView(parseViewName);
    TreeView dependencyView = new TreeView(dependencyViewName, viewGenerator, input, 1d);

    // Running count of tokens in the sentences converted so far.
    int tokensSoFar = 0;
    for (int sentenceId = 0; sentenceId < input.getNumberOfSentences(); sentenceId++) {
        if (parseView.getTree(sentenceId) == null) {
            continue;
        }

        Constituent root = parseView.getRootConstituent(sentenceId);
        Tree<Pair<String, Integer>> labeledTree = headParser.getLabeledDependencyTree(root);
        try {
            dependencyView.setDependencyTree(sentenceId, labeledTree);
        } catch (IllegalStateException e) {
            System.err.println(parseView);
            System.err.println("Unlabeled dependency tree (for debugging): ");
            System.err.println(headParser.getDependencyTree(root));
            throw e;
        }

        tokensSoFar += input.getSentence(sentenceId).size();
        int nodeCount = dependencyView.getNumberOfConstituents();
        if (nodeCount == tokensSoFar) {
            continue;
        }

        logger.error("{} nodes in dependency tree, {} tokens in text so far", nodeCount, tokensSoFar);

        // Start with every token index seen so far, then drop those that start a constituent.
        Set<Integer> uncovered = new LinkedHashSet<>();
        for (int tokenId = 0; tokenId < tokensSoFar; tokenId++) {
            uncovered.add(tokenId);
        }
        for (Constituent node : dependencyView.getConstituents()) {
            uncovered.remove(node.getStartSpan());
        }

        StringBuilder missingTokens = new StringBuilder();
        for (int tokenId : uncovered) {
            missingTokens.append(input.getToken(tokenId)).append(" ");
        }
        logger.error("Dependency tree does not cover tokens: {}", missingTokens.toString());
    }
    return dependencyView;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) CollinsHeadDependencyParser(edu.illinois.cs.cogcomp.nlp.utilities.CollinsHeadDependencyParser) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 13 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class NombankFields method createPredicate.

/**
 * Creates the single-token "Predicate" constituent for this record.
 *
 * <p>The predicate's start token is the first element of the span stored on the leaf at index
 * {@code predicateTerminal} of {@code yield}; the constituent covers exactly that one token and
 * carries the lemma and sense as attributes.
 *
 * @param ta       text annotation the constituent belongs to
 * @param viewName name of the view the constituent is created for
 * @param yield    leaves of the tree, each labeled with a string and a span
 * @return the predicate constituent
 */
@Override
public Constituent createPredicate(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield) {
    IntPair leafSpan = yield.get(predicateTerminal).getLabel().getSecond();
    int predicateStart = leafSpan.getFirst();

    Constituent result = new Constituent("Predicate", viewName, ta, predicateStart, predicateStart + 1);
    result.addAttribute(PropbankReader.LemmaIdentifier, lemma);
    result.addAttribute(PropbankReader.SenseIdentifier, sense);
    return result;
}
Also used : Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 14 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class PropbankFields method createPredicate.

/**
 * Creates the single-token "Predicate" constituent for this record.
 *
 * <p>The predicate's start token is the first element of the span stored on the leaf at index
 * {@code predicateTerminal} of {@code yield}. Besides lemma, sense and tagger, the constituent
 * carries five attributes decoded from {@code inflection}: characters 0 to 4 are read, in that
 * order, as form, tense, aspect, person and voice.
 *
 * @param ta       text annotation the constituent belongs to
 * @param viewName name of the view the constituent is created for
 * @param yield    leaves of the tree, each labeled with a string and a span
 * @return the predicate constituent
 */
public Constituent createPredicate(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield) {
    IntPair leafSpan = yield.get(predicateTerminal).getLabel().getSecond();
    int predicateStart = leafSpan.getFirst();

    Constituent result = new Constituent("Predicate", viewName, ta, predicateStart, predicateStart + 1);
    result.addAttribute(PropbankReader.LemmaIdentifier, lemma);
    result.addAttribute(PropbankReader.SenseIdentifier, sense);

    // One inflection character per attribute; each is decoded just before it is stored.
    String form = PropbankReader.Forms.getForm(inflection.charAt(0)).name();
    result.addAttribute(PropbankReader.FormIdentifier, form);

    String tense = PropbankReader.Tenses.getTense(inflection.charAt(1)).name();
    result.addAttribute(PropbankReader.TenseIdentifier, tense);

    String aspect = PropbankReader.Aspects.getAspect(inflection.charAt(2)).name();
    result.addAttribute(PropbankReader.AspectIdentifier, aspect);

    String person = PropbankReader.Person.getPerson(inflection.charAt(3)).name();
    result.addAttribute(PropbankReader.PersonIdentifier, person);

    String voice = PropbankReader.Voices.getVoice(inflection.charAt(4)).name();
    result.addAttribute(PropbankReader.VoiceIdentifier, voice);

    result.addAttribute(PropbankReader.Tagger, tagger);
    return result;
}
Also used : Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 15 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class ACE_BN_Reader method parse.

/**
 * Extracts document metadata and the text paragraphs (turns) from a document.
 *
 * <p>The document id and creation time are taken from the last {@code DOCNO}/{@code DOCID} and
 * {@code DATE_TIME}/{@code DATETIME} element found in {@code content}; both default to the empty
 * string. Each turn becomes a {@link Paragraph} labeled {@code "text"} whose content is the
 * trimmed turn text, whose offset is the position of that text in {@code content}, and whose
 * {@code offsetFilterTags} is its position in {@code contentRemovingTags} (-1 if not found).
 *
 * <p>Offsets are taken from the regex match itself rather than from a fresh search of the
 * document, so a turn whose text also occurs earlier in the document still gets its own
 * position. For the same reason the search in {@code contentRemovingTags} resumes after the
 * previously located paragraph.
 *
 * @param content             raw document text including tags
 * @param contentRemovingTags text in which each paragraph is located to fill
 *                            {@code offsetFilterTags}
 * @param is2004              selects the 2004 tag names and turn layout ({@code <TURN>} used as
 *                            a separator) instead of {@code <TURN>...</TURN>} pairs
 * @return pair of (labeled paragraphs in document order, metadata map)
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags, boolean is2004) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();
    Pattern pattern = null;
    Matcher matcher = null;
    String docID = "";
    String dateTime = "";

    // Document id: the last matching element wins.
    pattern = is2004 ? Pattern.compile("<DOCNO>(.*?)</DOCNO>") : Pattern.compile("<DOCID>(.*?)</DOCID>");
    matcher = pattern.matcher(content);
    while (matcher.find()) {
        docID = (matcher.group(1)).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    // Creation time: the last matching element wins.
    pattern = is2004 ? Pattern.compile("<DATE_TIME>(.*?)</DATE_TIME>") : Pattern.compile("<DATETIME>(.*?)</DATETIME>");
    matcher = pattern.matcher(content);
    while (matcher.find()) {
        dateTime = (matcher.group(1)).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    if (is2004) {
        pattern = Pattern.compile("<TEXT>(.*?)<TURN>|<TURN>(.*?)<TURN>|<TURN>(.*?)</TEXT>|<TEXT>(.*?)</TEXT>");
    } else {
        pattern = Pattern.compile("<TURN>(.*?)</TURN>");
    }
    matcher = pattern.matcher(content);
    int regionStart = 0;
    while (matcher.find(regionStart)) {
        // Pick the group of the alternative that actually matched (the first non-null one).
        String rawText = "";
        int rawStart = matcher.start();
        for (int i = 1; i <= matcher.groupCount(); ++i) {
            if (matcher.group(i) != null) {
                rawText = matcher.group(i);
                rawStart = matcher.start(i);
                break;
            }
        }

        // trim() drops leading characters <= ' '; skip the same ones so that the offset points
        // at the first character of the trimmed text within content.
        int leadingWhitespace = 0;
        while (leadingWhitespace < rawText.length() && rawText.charAt(leadingWhitespace) <= ' ') {
            leadingWhitespace++;
        }
        String text = rawText.trim();

        Paragraph paragraph = new Paragraph(rawStart + leadingWhitespace, text);
        Pair<String, Paragraph> labeledParagraph = new Pair<>("text", paragraph);
        paragraphs.add(labeledParagraph);

        if (is2004) {
            // Hack to move back to the overlapping <TURN> tag
            regionStart = matcher.end() - 6;
        } else {
            regionStart = matcher.end();
        }
    }

    // Locate each paragraph in the tag-free text. The cursor moves past the occurrence just
    // found so that a repeated paragraph is not resolved to an earlier one; when a paragraph is
    // not found (-1) the cursor still advances by its length.
    int searchFrom = 0;
    for (int i = 0; i < paragraphs.size(); ++i) {
        Paragraph paragraph = paragraphs.get(i).getSecond();
        String paraContent = paragraph.content;
        int offsetWithFiltering = contentRemovingTags.indexOf(paraContent, searchFrom);
        paragraph.offsetFilterTags = offsetWithFiltering;
        searchFrom = Math.max(searchFrom, offsetWithFiltering) + paraContent.length();
    }

    if (isDebug) {
        for (int i = 0; i < paragraphs.size(); ++i) {
            logger.info(paragraphs.get(i).getFirst() + "--> " + paragraphs.get(i).getSecond().content);
            logger.info(content.substring(paragraphs.get(i).getSecond().offset, paragraphs.get(i).getSecond().offset + paragraphs.get(i).getSecond().content.length()));
            logger.info(contentRemovingTags.substring(paragraphs.get(i).getSecond().offsetFilterTags, paragraphs.get(i).getSecond().offsetFilterTags + paragraphs.get(i).getSecond().content.length()));
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) Paragraph(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)

Aggregations

Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)59 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)35 ArrayList (java.util.ArrayList)17 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)10 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)10 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)7 Matcher (java.util.regex.Matcher)7 Paragraph (edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)6 HashMap (java.util.HashMap)6 Pattern (java.util.regex.Pattern)6 TreeView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView)3 SenseInstance (edu.illinois.cs.cogcomp.verbsense.jlis.SenseInstance)3 SenseStructure (edu.illinois.cs.cogcomp.verbsense.jlis.SenseStructure)3 JsonObject (com.google.gson.JsonObject)2 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)2 ITransformer (edu.illinois.cs.cogcomp.core.transformers.ITransformer)2 IndexedWord (edu.stanford.nlp.ling.IndexedWord)2 Annotation (edu.stanford.nlp.pipeline.Annotation)2 CoreMap (edu.stanford.nlp.util.CoreMap)2 LinkedHashSet (java.util.LinkedHashSet)2