Search in sources :

Example 1 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class TextAnnotation method getSpansMatching.

/**
 * Returns the token spans whose normalized surface text has the same {@link String#hashCode()}
 * as the trimmed {@code text}.
 *
 * <p>On first use this builds an index ({@code allSpans}) from hash code to the list of
 * {@code [start, end)} token spans, covering every contiguous token sequence. Before hashing,
 * each token has {@code ``} and {@code ''} replaced by a double quote and is passed through
 * {@code SentenceUtils.convertFromPTBBrackets}; tokens are joined by single spaces.
 *
 * <p>Lookup is by hash code only, so distinct strings with colliding hashes share a bucket.
 * When a bucket exists, the list returned is the cached list itself, not a copy.
 *
 * @param text the surface string to look up; leading/trailing whitespace is ignored
 * @return the spans indexed under the hash of {@code text}, or a new empty list if none
 */
public List<IntPair> getSpansMatching(String text) {
    if (allSpans == null) {
        synchronized (this) {
            if (allSpans == null) {
                // Fill a local map and publish it only once complete: the outer null check is
                // unsynchronized, so assigning the field first would let another thread query
                // a half-built index.
                // NOTE(review): the field declaration is not visible here; confirm it is
                // volatile so that this publication is safe.
                TIntObjectHashMap<ArrayList<IntPair>> spans = new TIntObjectHashMap<>();
                int numTokens = this.size();
                // Spans use exclusive end offsets so they agree with constituent offsets
                // (changed from "at-the-end" indexing on 2016/05/11. MS).
                // start runs over every token, including the last one, so that the final
                // single-token span (and one-token texts) are indexed as well.
                for (int start = 0; start < numTokens; start++) {
                    StringBuilder sb = new StringBuilder();
                    for (int end = start; end < numTokens; end++) {
                        String token = tokens[end].replace("``", "\"").replace("''", "\"");
                        token = SentenceUtils.convertFromPTBBrackets(token);
                        sb.append(token).append(" ");
                        int hash = sb.toString().trim().hashCode();
                        ArrayList<IntPair> bucket = spans.get(hash);
                        if (bucket == null) {
                            bucket = new ArrayList<>();
                            spans.put(hash, bucket);
                        }
                        bucket.add(new IntPair(start, end + 1));
                    }
                }
                this.allSpans = TCollections.synchronizedMap(spans);
            }
        }
    }
    int hashCode = text.trim().hashCode();
    List<IntPair> list = allSpans.get(hashCode);
    if (list == null)
        list = new ArrayList<>();
    return list;
}
Also used : TIntObjectHashMap(gnu.trove.map.hash.TIntObjectHashMap) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 2 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class DummyTextAnnotationGenerator method addSrlFrame.

/**
 * Builds one predicate-argument frame and registers it with {@code srlView}.
 *
 * <p>The predicate constituent spans {@code verbSRLPredicate} and carries a lemma attribute
 * (looked up in {@code lemmasAll} at the predicate's first token index) and a sense attribute.
 * One "argument" constituent is created per entry of {@code srlArgs}; the map value becomes
 * the argument's relation label. All scores are left at 0.0.
 *
 * @param srlView the view that receives the frame
 * @param viewName the view name given to every constituent created here
 * @param ta the text annotation the constituents belong to
 * @param verbSRLPredicate token span of the predicate
 * @param sense value stored under {@code PredicateArgumentView.SenseIdentifer}
 * @param srlArgs argument spans mapped to their labels
 */
private static void addSrlFrame(PredicateArgumentView srlView, String viewName, TextAnnotation ta, IntPair verbSRLPredicate, String sense, Map<IntPair, String> srlArgs) {
    int predicateStart = verbSRLPredicate.getFirst();
    int predicateEnd = verbSRLPredicate.getSecond();
    Constituent predicate = new Constituent("predicate", viewName, ta, predicateStart, predicateEnd);
    predicate.addAttribute(PredicateArgumentView.LemmaIdentifier, lemmasAll[verbSRLPredicate.getFirst()]);
    predicate.addAttribute(PredicateArgumentView.SenseIdentifer, sense);

    // Arguments and labels are collected in parallel so index i of one matches index i of the other.
    List<Constituent> arguments = new ArrayList<>();
    List<String> labels = new ArrayList<>();
    for (Map.Entry<IntPair, String> labelledSpan : srlArgs.entrySet()) {
        IntPair argSpan = labelledSpan.getKey();
        arguments.add(new Constituent("argument", viewName, ta, argSpan.getFirst(), argSpan.getSecond()));
        labels.add(labelledSpan.getValue());
    }

    int argCount = arguments.size();
    String[] relations = labels.toArray(new String[argCount]);
    // A fresh double[] is zero-filled, i.e. every argument gets score 0.0.
    double[] zeroScores = new double[argCount];
    srlView.addPredicateArguments(predicate, arguments, relations, zeroScores);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 3 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class DummyTextAnnotationGenerator method generateAnnotatedTextAnnotation.

/**
 * Creates a {@link TextAnnotation} over the first {@code sentenceNum} canned sentences and
 * attaches the requested views, filled from this class's pre-defined annotation tables.
 *
 * <p>Supported view names are the {@code ViewNames} constants handled in the switch below:
 * SENTENCE, POS, LEMMA, SHALLOW_PARSE, NER_CONLL, the parse views, SRL_VERB and NER_ONTONOTES
 * (added as an empty view). Any other name is logged as an error and skipped.
 *
 * @param viewsToAdd names of the views to attach
 * @param withNoisyLabels if true, the {@code *_noisy*} tables are used instead of the clean ones
 * @param sentenceNum number of sentences to include; must be between 1 and 3 inclusive
 * @return the text annotation with the requested views added
 * @throws RuntimeException if {@code sentenceNum} is outside 1..3
 */
public static TextAnnotation generateAnnotatedTextAnnotation(String[] viewsToAdd, boolean withNoisyLabels, int sentenceNum) {
    // we can do at-most 3 sentences, for now
    if (sentenceNum > 3) {
        logger.error("Currently this function supports at most 3 sentences per TextAnnotation. If you need more, " + "you have to augment this function");
        throw new RuntimeException("sentenceNum must be at most 3, but was " + sentenceNum);
    }
    // at least one sentence
    if (sentenceNum < 1) {
        logger.error("The requested TextAnnotation has to have at least one sentence. ");
        throw new RuntimeException("sentenceNum must be at least 1, but was " + sentenceNum);
    }
    // Concatenate the token arrays of the requested sentences.
    List<String[]> annotatedTokenizedStringArrayAll = new ArrayList<>();
    annotatedTokenizedStringArrayAll.addAll(annotatedTokenizedStringArray1);
    if (sentenceNum > 1)
        annotatedTokenizedStringArrayAll.addAll(annotatedTokenizedStringArray2);
    if (sentenceNum > 2)
        annotatedTokenizedStringArrayAll.addAll(annotatedTokenizedStringArray3);
    TextAnnotation ta = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(annotatedTokenizedStringArrayAll);
    for (String viewName : viewsToAdd) {
        switch(viewName) {
            case ViewNames.SENTENCE:
                // Sentence boundaries are derived from the lengths of the per-sentence POS arrays.
                SpanLabelView sentView = new SpanLabelView(ViewNames.SENTENCE, ta);
                sentView.addConstituent(new Constituent("sent1", ViewNames.SENTENCE, ta, 0, pos1.length));
                if (sentenceNum > 1)
                    sentView.addConstituent(new Constituent("sent2", ViewNames.SENTENCE, ta, pos1.length, pos1.length + pos2.length));
                if (sentenceNum > 2)
                    sentView.addConstituent(new Constituent("sent3", ViewNames.SENTENCE, ta, pos1.length + pos2.length, pos1.length + pos2.length + pos3.length));
                ta.addView(ViewNames.SENTENCE, sentView);
                ta.setSentences();
                break;
            case ViewNames.POS:
                // Token indices of later sentences are offset by the lengths of the earlier ones.
                TokenLabelView posView = new TokenLabelView(viewName, ta);
                String[] pos1Overall = withNoisyLabels ? pos_noisy1 : pos1;
                String[] pos2Overall = withNoisyLabels ? pos_noisy2 : pos2;
                String[] pos3Overall = withNoisyLabels ? pos_noisy3 : pos3;
                for (int i = 0; i < pos1Overall.length; i++) posView.addTokenLabel(i, pos1Overall[i], 1.0);
                if (sentenceNum > 1)
                    for (int i = 0; i < pos2Overall.length; i++) posView.addTokenLabel(pos1Overall.length + i, pos2Overall[i], 1.0);
                if (sentenceNum > 2)
                    for (int i = 0; i < pos3Overall.length; i++) posView.addTokenLabel(pos1Overall.length + pos2Overall.length + i, pos3Overall[i], 1.0);
                ta.addView(viewName, posView);
                break;
            case ViewNames.LEMMA:
                TokenLabelView lemmaView = new TokenLabelView(ViewNames.LEMMA, ta);
                String[] lemmaOveral1 = withNoisyLabels ? lemmas_noisy1 : lemmas1;
                String[] lemmaOveral2 = withNoisyLabels ? lemmas_noisy2 : lemmas2;
                String[] lemmaOveral3 = withNoisyLabels ? lemmas_noisy3 : lemmas3;
                for (int i = 0; i < lemmaOveral1.length; i++) lemmaView.addTokenLabel(i, lemmaOveral1[i], 1.0);
                if (sentenceNum > 1)
                    for (int i = 0; i < lemmaOveral2.length; i++) lemmaView.addTokenLabel(lemmaOveral1.length + i, lemmaOveral2[i], 1.0);
                if (sentenceNum > 2)
                    for (int i = 0; i < lemmaOveral3.length; i++) lemmaView.addTokenLabel(lemmaOveral1.length + lemmaOveral2.length + i, lemmaOveral3[i], 1.0);
                ta.addView(viewName, lemmaView);
                break;
            case ViewNames.SHALLOW_PARSE:
                // Span keys are used as-is (no per-sentence offset is applied here).
                SpanLabelView chunkView = new SpanLabelView(ViewNames.SHALLOW_PARSE, ta);
                Map<IntPair, String> chunkOverall1 = withNoisyLabels ? chunks_noisy1 : chunks1;
                Map<IntPair, String> chunkOverall2 = withNoisyLabels ? chunks_noisy2 : chunks2;
                Map<IntPair, String> chunkOverall3 = withNoisyLabels ? chunks_noisy3 : chunks3;
                for (IntPair span : chunkOverall1.keySet()) chunkView.addSpanLabel(span.getFirst(), span.getSecond(), chunkOverall1.get(span), 1.0);
                if (sentenceNum > 1)
                    for (IntPair span : chunkOverall2.keySet()) chunkView.addSpanLabel(span.getFirst(), span.getSecond(), chunkOverall2.get(span), 1.0);
                if (sentenceNum > 2)
                    for (IntPair span : chunkOverall3.keySet()) chunkView.addSpanLabel(span.getFirst(), span.getSecond(), chunkOverall3.get(span), 1.0);
                ta.addView(ViewNames.SHALLOW_PARSE, chunkView);
                break;
            case ViewNames.NER_CONLL:
                SpanLabelView nerView = new SpanLabelView(ViewNames.NER_CONLL, ta);
                Map<IntPair, String> nerSource1 = withNoisyLabels ? ner_noisy1 : ner1;
                Map<IntPair, String> nerSource2 = withNoisyLabels ? ner_noisy2 : ner2;
                Map<IntPair, String> nerSource3 = withNoisyLabels ? ner_noisy3 : ner3;
                for (IntPair span : nerSource1.keySet()) nerView.addSpanLabel(span.getFirst(), span.getSecond(), nerSource1.get(span), 1.0);
                if (sentenceNum > 1)
                    for (IntPair span : nerSource2.keySet()) nerView.addSpanLabel(span.getFirst(), span.getSecond(), nerSource2.get(span), 1.0);
                if (sentenceNum > 2)
                    for (IntPair span : nerSource3.keySet()) nerView.addSpanLabel(span.getFirst(), span.getSecond(), nerSource3.get(span), 1.0);
                ta.addView(ViewNames.NER_CONLL, nerView);
                break;
            case ViewNames.PARSE_GOLD:
            case ViewNames.PARSE_STANFORD:
            case ViewNames.PARSE_BERKELEY:
            case ViewNames.PSEUDO_PARSE_STANFORD:
            case ViewNames.PARSE_CHARNIAK:
                // All parse view names share the same canned trees; one tree per sentence index.
                TreeView parseView = new TreeView(viewName, ta);
                String treeOveral1 = withNoisyLabels ? tree_noisy1 : tree1;
                String treeOveral2 = withNoisyLabels ? tree_noisy2 : tree2;
                String treeOveral3 = withNoisyLabels ? tree_noisy3 : tree3;
                parseView.setParseTree(0, TreeParserFactory.getStringTreeParser().parse(treeOveral1));
                if (sentenceNum > 1)
                    parseView.setParseTree(1, TreeParserFactory.getStringTreeParser().parse(treeOveral2));
                if (sentenceNum > 2)
                    parseView.setParseTree(2, TreeParserFactory.getStringTreeParser().parse(treeOveral3));
                ta.addView(viewName, parseView);
                break;
            case ViewNames.SRL_VERB:
                // One frame always, one more when sentenceNum > 1, and two more (predicates 3
                // and 4) when sentenceNum > 2. Every frame uses the same predicate sense.
                PredicateArgumentView verbSRLView = new PredicateArgumentView(viewName, ta);
                addSrlFrame(verbSRLView, viewName, ta, verbSRLPredicate1, (withNoisyLabels ? verbSRLPredicateSense_noisy : verbSRLPredicateSense), (withNoisyLabels ? verbSRLArgs_noisy1 : verbSRLArgs1));
                if (sentenceNum > 1) {
                    addSrlFrame(verbSRLView, viewName, ta, verbSRLPredicate2, (withNoisyLabels ? verbSRLPredicateSense_noisy : verbSRLPredicateSense), (withNoisyLabels ? verbSRLArgs_noisy2 : verbSRLArgs2));
                }
                if (sentenceNum > 2) {
                    addSrlFrame(verbSRLView, viewName, ta, verbSRLPredicate3, (withNoisyLabels ? verbSRLPredicateSense_noisy : verbSRLPredicateSense), (withNoisyLabels ? verbSRLArgs_noisy3 : verbSRLArgs3));
                    addSrlFrame(verbSRLView, viewName, ta, verbSRLPredicate4, (withNoisyLabels ? verbSRLPredicateSense_noisy : verbSRLPredicateSense), (withNoisyLabels ? verbSRLArgs_noisy4 : verbSRLArgs4));
                }
                ta.addView(viewName, verbSRLView);
                break;
            case ViewNames.NER_ONTONOTES:
                // For now the NER views are going to be empty
                ta.addView(viewName, new SpanLabelView(viewName, ta));
                // Without this break the case fell through to default and logged an error
                // for a view that had just been added successfully.
                break;
            default:
                logger.error("Cannot provide annotation for {}", viewName);
        }
    }
    return ta;
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 4 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class BrownClusterViewGenerator method addView.

/**
 * Adds a Brown-cluster span view to {@code ta}.
 *
 * <p>For each label returned by {@code getMatchingSpans}, spans are visited in list order and
 * a span is skipped when a previously accepted span with the same label fully covers it.
 * Accepted spans are added to the view with score 1.0.
 *
 * @param ta the text annotation that receives the view
 */
@Override
public void addView(TextAnnotation ta) {
    lazyLoadClusters();
    SpanLabelView view = new SpanLabelView(getViewName(), "BrownClusters", ta, 1.0, true);
    Map<String, List<IntPair>> spansByLabel = getMatchingSpans(ta);
    for (Entry<String, List<IntPair>> labelled : spansByLabel.entrySet()) {
        String label = labelled.getKey();
        // Spans already accepted for this label, in insertion order.
        Set<IntPair> accepted = new LinkedHashSet<>();
        candidates:
        for (IntPair candidate : labelled.getValue()) {
            // don't add nested constituents of the same type
            for (IntPair kept : accepted) {
                // Identity check: the very same object never counts as its own container.
                boolean differentObject = kept != candidate;
                if (differentObject && kept.getFirst() <= candidate.getFirst() && kept.getSecond() >= candidate.getSecond()) {
                    continue candidates;
                }
            }
            view.addSpanLabel(candidate.getFirst(), candidate.getSecond(), label, 1.0);
            accepted.add(candidate);
        }
    }
    ta.addView(getViewName(), view);
}
Also used : SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 5 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class GazetteerViewGenerator method addView.

/**
 * Adds the matches of one gazetteer to {@code view}.
 *
 * <p>Each gazetteer entry is a (hash code, token length) pair read from {@code patterns} and
 * {@code lengths} at {@code labelId}. A span from {@code allSpans} matches an entry when it is
 * stored under that hash code and its token length equals the entry's length. Matches are then
 * visited in order, and a match is skipped when an already accepted match fully covers it.
 * Accepted matches are added to the view with score 1.0.
 *
 * @param label label given to every span added to the view
 * @param labelId index of the gazetteer in {@code patterns} and {@code lengths}
 * @param view the view that receives the spans
 * @param allSpans index from hash code to the spans stored under it
 */
private void addView(String label, int labelId, SpanLabelView view, TIntObjectHashMap<ArrayList<IntPair>> allSpans) {
    logger.debug("Adding gazetteer {}", label);
    int[] entryHashes = this.patterns.get(labelId);
    int[] entryLengths = this.lengths.get(labelId);

    // Pass 1: collect every span whose hash and token length agree with some entry.
    List<IntPair> candidates = new ArrayList<>();
    for (int idx = 0; idx < entryHashes.length; idx++) {
        int wantedHash = entryHashes[idx];
        int wantedLength = entryLengths[idx];
        if (!allSpans.containsKey(wantedHash)) {
            continue;
        }
        List<IntPair> sameHash = allSpans.get(wantedHash);
        for (IntPair span : sameHash) {
            int spanLength = span.getSecond() - span.getFirst();
            if (spanLength == wantedLength) {
                candidates.add(span);
            }
        }
    }

    // Pass 2: don't add nested constituents of the same type.
    Set<IntPair> accepted = new LinkedHashSet<>();
    candidateLoop:
    for (IntPair candidate : candidates) {
        for (IntPair kept : accepted) {
            // Identity check: the very same object never counts as its own container.
            if (kept != candidate && kept.getFirst() <= candidate.getFirst() && kept.getSecond() >= candidate.getSecond()) {
                continue candidateLoop;
            }
        }
        view.addSpanLabel(candidate.getFirst(), candidate.getSecond(), label, 1.0);
        accepted.add(candidate);
    }
}
Also used : TIntArrayList(gnu.trove.list.array.TIntArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)129 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)39 ArrayList (java.util.ArrayList)27 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)26 Test (org.junit.Test)21 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)18 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)14 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)8 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)7 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)6 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)5 Matcher (java.util.regex.Matcher)5 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)4 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 IOException (java.io.IOException)4 JsonObject (com.google.gson.JsonObject)3 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)3 PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView)3