Search in sources :

Example 86 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class JsonSerializerTest method verifyDeserializedJsonString.

/**
 * Behavior specific to unit tests only. Use with caution.
 *
 * <p>Deserializes {@code json} via {@code SerializationHelper.deserializeFromJson} and asserts
 * that the result agrees with {@code ta} on corpus id, id, sentence count, text, available
 * views and a handful of spot checks. The spot checks use hard-coded indices (sentence 1,
 * tokens 2, 4 and 6, character offset 5, token span 1-3), so {@code ta} must be large enough
 * for those lookups to be valid.
 *
 * @param json serialized form that is expected to describe {@code ta}
 * @param ta   the reference annotation the deserialized copy is compared against
 * @throws Exception whatever deserialization or the accessors raise
 */
public static void verifyDeserializedJsonString(String json, TextAnnotation ta) throws Exception {
    TextAnnotation restored = SerializationHelper.deserializeFromJson(json);

    // Identity and overall shape.
    assertEquals(restored.getCorpusId(), ta.getCorpusId());
    assertEquals(restored.getId(), ta.getId());
    assertEquals(restored.getNumberOfSentences(), ta.getNumberOfSentences());

    // Spot checks on sentence/token lookups at fixed positions.
    assertEquals(restored.getSentence(1), ta.getSentence(1));
    assertEquals(restored.getSentenceFromToken(2), ta.getSentenceFromToken(2));
    assertEquals(restored.getTokenIdFromCharacterOffset(5), ta.getTokenIdFromCharacterOffset(5));
    assertEquals(restored.getToken(4), ta.getToken(4));
    assertEquals(restored.getAvailableViews(), ta.getAvailableViews());
    assertEquals(Arrays.toString(restored.getTokensInSpan(1, 3)), Arrays.toString(ta.getTokensInSpan(1, 3)));
    assertEquals(restored.getText(), ta.getText());

    // The seventh token must keep both its surface form and its character offsets.
    Constituent originalToken = ta.getView(ViewNames.TOKENS).getConstituents().get(6);
    Constituent restoredToken = restored.getView(ViewNames.TOKENS).getConstituents().get(6);
    assertEquals(originalToken.getSurfaceForm(), restoredToken.getSurfaceForm());
    assertEquals(
            new IntPair(originalToken.getStartCharOffset(), originalToken.getEndCharOffset()),
            new IntPair(restoredToken.getStartCharOffset(), restoredToken.getEndCharOffset()));
}
Also used : TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 87 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class GoldLabel method getArgument.

/**
 * Builds the constituents for this gold label from its span description.
 *
 * <p>{@code propSpanInfo} is split on {@code *} and then on {@code ,}; every non-empty piece is
 * resolved through {@code getSpan} against {@code yield}. Spans with a negative start or with
 * {@code start >= end} are dropped. A span whose non-terminal starts with {@code "WH"} is
 * flagged; flagged spans receive an {@code "R-"} label prefix when more than one span remains.
 * Among the other spans the first keeps the plain label and later ones get a {@code "C-"} prefix.
 *
 * @param ta                   text annotation the constituents are attached to
 * @param viewName             view name passed to each new constituent
 * @param yield                leaves of the span-labeled parse tree, handed to {@code getSpan}
 * @param mergeContiguousCArgs when true, and no span was flagged, the spans are passed through
 *                             {@code mergeCArgs} before labeling
 * @return the constituents, ordered by span start
 */
List<Constituent> getArgument(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield, boolean mergeContiguousCArgs) {
    List<Pair<IntPair, Boolean>> spans = new ArrayList<>();
    boolean sawFlagged = false;
    for (String group : propSpanInfo.split("\\*")) {
        if (group.isEmpty())
            continue;
        for (String pointer : group.split(",")) {
            if (pointer.isEmpty())
                continue;
            Pair<String, IntPair> resolved = getSpan(pointer, yield);
            IntPair span = resolved.getSecond();
            boolean usable = span.getFirst() >= 0 && span.getFirst() < span.getSecond();
            if (!usable)
                continue;
            boolean flagged = resolved.getFirst().startsWith("WH");
            sawFlagged |= flagged;
            spans.add(new Pair<>(span, flagged));
        }
    }

    // Order by span start only; the sort is stable, so equal starts keep their input order.
    Collections.sort(spans, new Comparator<Pair<IntPair, Boolean>>() {

        @Override
        public int compare(Pair<IntPair, Boolean> left, Pair<IntPair, Boolean> right) {
            return Integer.compare(left.getFirst().getFirst(), right.getFirst().getFirst());
        }
    });

    if (mergeContiguousCArgs && !sawFlagged) {
        spans = mergeCArgs(spans);
    }

    List<Constituent> result = new ArrayList<>();
    boolean plainLabelUsed = false;
    for (Pair<IntPair, Boolean> entry : spans) {
        String argLabel;
        if (entry.getSecond() && spans.size() > 1) {
            argLabel = "R-" + this.label;
        } else if (!plainLabelUsed) {
            // The first span that is not labeled "R-" carries the unprefixed label.
            argLabel = this.label;
            plainLabelUsed = true;
        } else {
            argLabel = "C-" + this.label;
        }
        IntPair span = entry.getFirst();
        Constituent constituent = new Constituent(argLabel, viewName, ta, span.getFirst(), span.getSecond());
        if (h != null) {
            constituent.addAttribute(AbstractSRLAnnotationReader.HyphenTagInfo, h);
        }
        result.add(constituent);
    }
    return result;
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 88 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class GoldLabel method addAnnotation.

/**
 * Builds a {@code PredicateArgumentView} named {@code srlViewName} for {@code ta} from the gold
 * fields registered under {@code ta.getId()} and attaches it when it has at least one predicate.
 *
 * <p>Only the first predicate seen at a given start token is kept, and within one predicate
 * each argument span is used at most once.
 *
 * @param ta the text annotation to annotate; its gold parse view is read and it may gain a view
 */
private void addAnnotation(TextAnnotation ta) {
    Tree<String> goldParse = ParseUtils.getParseTree(ViewNames.PARSE_GOLD, ta, 0);
    List<Tree<Pair<String, IntPair>>> yield = ParseUtils.getSpanLabeledTree(goldParse).getYield();
    PredicateArgumentView pav = new PredicateArgumentView(srlViewName, "AnnotatedTreebank", ta, 1.0);

    Set<Integer> predicateStarts = new HashSet<>();
    for (Fields fields : goldFields.get(ta.getId())) {
        Constituent predicate = fields.createPredicate(ta, srlViewName, yield);
        // Set.add returns false for a start token that already has a predicate.
        if (!predicateStarts.add(predicate.getStartSpan()))
            continue;

        List<Constituent> args = new ArrayList<>();
        List<String> labels = new ArrayList<>();
        List<Double> scores = new ArrayList<>();

        // Enforce the One-Argument-Per-Span constraint even when the gold data violates it:
        // a span already claimed for this predicate is not added again.
        Set<IntPair> claimedSpans = new HashSet<>();
        for (GoldLabel goldLabel : fields.getGoldLabels()) {
            List<Constituent> unclaimed = new ArrayList<>();
            for (Constituent candidate : goldLabel.getArgument(ta, srlViewName, yield, mergeContiguousCArgs)) {
                if (claimedSpans.add(candidate.getSpan())) {
                    unclaimed.add(candidate);
                }
            }
            addArguments(ta, predicate, args, labels, scores, goldLabel, unclaimed);
        }

        pav.addPredicateArguments(predicate, args, labels.toArray(new String[0]), ArrayUtilities.asDoubleArray(scores));
    }

    if (pav.getPredicates().size() > 0) {
        ta.addView(srlViewName, pav);
    }
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) PredicateArgumentView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView) Tree(edu.illinois.cs.cogcomp.core.datastructures.trees.Tree) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 89 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class GoldLabel method mergeCArgs.

/**
 * Collapses runs of adjacent spans into single spans.
 *
 * <p>Two consecutive entries are joined when the second one starts exactly where the running
 * span ends. The boolean of a joined span is the logical AND of the booleans of its parts.
 * Lists with at most one entry are returned unchanged (same instance).
 *
 * @param spans span/flag pairs; adjacency is tested between neighbors in list order
 * @return a new list of merged pairs, or {@code spans} itself when it has at most one entry
 */
protected List<Pair<IntPair, Boolean>> mergeCArgs(List<Pair<IntPair, Boolean>> spans) {
    if (spans.size() <= 1)
        return spans;

    List<Pair<IntPair, Boolean>> merged = new ArrayList<>();
    IntPair running = null;
    boolean runningFlag = true;
    for (Pair<IntPair, Boolean> entry : spans) {
        IntPair span = entry.getFirst();

        // Contiguous with the running span: extend it and keep accumulating the flag.
        if (running != null && span.getFirst() == running.getSecond()) {
            running = new IntPair(running.getFirst(), span.getSecond());
            runningFlag &= entry.getSecond();
            continue;
        }

        // Otherwise emit the finished run (if any) and start a new one at this entry.
        if (running != null) {
            merged.add(new Pair<>(running, runningFlag));
        }
        running = span;
        runningFlag = entry.getSecond();
    }
    merged.add(new Pair<>(running, runningFlag));

    assert merged.size() <= spans.size();
    if (spans.size() > 0)
        assert merged.size() > 0;
    return merged;
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 90 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class CoNLLNerReader method loadCoNLLfile.

/**
 * Reads a tab-separated CoNLL-style file into a {@link TextAnnotation}.
 *
 * <p>Column 0 of each line is treated as the tag ({@code B-X} opens an entity labeled with the
 * text between the first and second hyphen, {@code I-...} continues it, anything else closes
 * it) and column 5 as the word form. Lines whose word form is missing, blank or
 * {@code -DOCSTART-} contribute no token. Blank lines mark sentence ends. The result carries a
 * {@code ViewNames.SENTENCE} view and a {@code ViewNames.NER_CONLL} view.
 *
 * @param filename path of the file to read; the value of {@code IOUtils.getFileName(filename)}
 *                 is used as the annotation id
 * @return the text annotation built from the file's tokens, sentence breaks and entity spans
 * @throws FileNotFoundException declared by this method; presumably raised by
 *                               {@code LineIO.read} when the file is missing
 */
public static TextAnnotation loadCoNLLfile(String filename) throws FileNotFoundException {
    logger.info("Reading: " + filename);
    List<String> lines = LineIO.read(filename);

    List<IntPair> entitySpans = new ArrayList<>();
    List<String> entityLabels = new ArrayList<>();
    List<Integer> sentenceEnds = new ArrayList<>();
    StringBuilder text = new StringBuilder();

    int entityStart = -1;
    String entityLabel = "";
    int tokenCount = 0;
    for (String line : lines) {
        String[] columns = line.split("\t");
        boolean opensEntity = line.startsWith("B-");
        boolean continuesEntity = !opensEntity && columns[0].startsWith("I-");

        // An I- line leaves the open entity untouched; every other line ends it.
        if (!continuesEntity) {
            // A blank line is a sentence boundary. Skip position 0 and repeated blank lines.
            if (!opensEntity && line.trim().isEmpty()
                    && tokenCount > 0 && !sentenceEnds.contains(tokenCount)) {
                sentenceEnds.add(tokenCount);
            }
            // Peel off the entity that was open, if any.
            if (entityStart > -1) {
                entitySpans.add(new IntPair(entityStart, tokenCount));
                entityLabels.add(entityLabel);
            }
            if (opensEntity) {
                entityStart = tokenCount;
                entityLabel = columns[0].split("-")[1];
            } else {
                entityLabel = "";
                entityStart = -1;
            }
        }

        // Add the word form to the running text.
        if (columns.length > 5 && !columns[5].equals("-DOCSTART-") && columns[5].trim().length() > 0) {
            text.append(columns[5]).append(" ");
            tokenCount++;
        }
    }

    // The very last line may still be inside an entity.
    if (entityStart > -1) {
        entitySpans.add(new IntPair(entityStart, tokenCount));
        entityLabels.add(entityLabel);
    }
    // The file may not end with a blank line.
    if (!sentenceEnds.contains(tokenCount)) {
        sentenceEnds.add(tokenCount);
    }

    // Build from pre-split tokens so that the annotation can be given an id.
    String id = IOUtils.getFileName(filename);
    List<String[]> tokenizedSentences = Collections.singletonList(text.toString().split(" "));
    TextAnnotation ta = BasicTextAnnotationBuilder.createTextAnnotationFromTokens("", id, tokenizedSentences);

    SpanLabelView sentenceView = new SpanLabelView(ViewNames.SENTENCE, "UserSpecified", ta, 1d);
    ta.addView(ViewNames.SENTENCE, sentenceView);
    int sentenceStart = 0;
    for (int sentenceEnd : sentenceEnds) {
        sentenceView.addSpanLabel(sentenceStart, sentenceEnd, ViewNames.SENTENCE, 1d);
        sentenceStart = sentenceEnd;
    }

    SpanLabelView nerView = new SpanLabelView(ViewNames.NER_CONLL, "UserSpecified", ta, 1d);
    ta.addView(ViewNames.NER_CONLL, nerView);
    for (int k = 0; k < entityLabels.size(); k++) {
        IntPair span = entitySpans.get(k);
        nerView.addConstituent(
                new Constituent(entityLabels.get(k), ViewNames.NER_CONLL, ta, span.getFirst(), span.getSecond()));
    }
    return ta;
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)103 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)32 Test (org.junit.Test)20 ArrayList (java.util.ArrayList)19 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)18 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)14 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)13 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)6 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)5 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 Matcher (java.util.regex.Matcher)4 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)3 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)3 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)3 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)3 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 CoreLabel (edu.stanford.nlp.ling.CoreLabel)3 Annotation (edu.stanford.nlp.pipeline.Annotation)3