Search in sources:

Example 36 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class StanfordAnalyzer method tokenizeSentence.

/**
 * Given a sentence, return its tokens and their character offsets.
 *
 * <p>The text is annotated with {@code pipeline}; the tokens of every sentence the pipeline
 * reports are concatenated in order, so the result is a single flat token sequence.
 *
 * @param sentenceText The sentence string
 * @return A {@link Pair} containing the array of token surface forms (each token's
 *         {@code originalText()}) and the parallel array of (begin, end) character offsets
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentenceText) {
    Annotation document = new Annotation(sentenceText);
    pipeline.annotate(document);

    // Flatten the per-sentence token lists; sentence boundaries are not part of the result.
    List<CoreLabel> tokens = new ArrayList<>();
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
        tokens.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class));
    }

    String[] surfaces = new String[tokens.size()];
    IntPair[] tokenCharOffsets = new IntPair[tokens.size()];
    int next = 0;
    for (CoreLabel token : tokens) {
        surfaces[next] = token.originalText();
        tokenCharOffsets[next] = new IntPair(token.beginPosition(), token.endPosition());
        next++;
    }
    // Diamond instead of the raw type so the construction is checked against the return type.
    return new Pair<>(surfaces, tokenCharOffsets);
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 37 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class StanfordAnalyzer method getTextAnnotation.

/**
 * Annotates {@code text} with {@code pipeline} and wraps the resulting tokens in a
 * {@link TextAnnotation}.
 *
 * @param text the raw text to tokenize
 * @return a {@link TextAnnotation} built from the token surface forms, their (begin, end)
 *         character offsets and, for each sentence, the cumulative token count at its end;
 *         the first two constructor arguments are passed as empty strings
 */
public TextAnnotation getTextAnnotation(String text) {
    Annotation annotated = new Annotation(text);
    pipeline.annotate(annotated);

    List<CoreMap> sentenceMaps = annotated.get(CoreAnnotations.SentencesAnnotation.class);
    int[] sentenceEnds = new int[sentenceMaps.size()];
    List<CoreLabel> allTokens = new ArrayList<>();
    int sentenceNo = 0;
    for (CoreMap sentenceMap : sentenceMaps) {
        allTokens.addAll(sentenceMap.get(CoreAnnotations.TokensAnnotation.class));
        // Exclusive end index of this sentence within the flat token list.
        sentenceEnds[sentenceNo] = allTokens.size();
        sentenceNo++;
    }

    int tokenCount = allTokens.size();
    String[] forms = new String[tokenCount];
    IntPair[] charSpans = new IntPair[tokenCount];
    int slot = 0;
    for (CoreLabel label : allTokens) {
        forms[slot] = label.originalText();
        charSpans[slot] = new IntPair(label.beginPosition(), label.endPosition());
        slot++;
    }

    return new TextAnnotation("", "", text, charSpans, forms, sentenceEnds);
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 38 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class WhiteSpaceTokenizer method getTextAnnotation.

// public TextAnnotation getTextAnnotation(String text){
// text = text.replaceAll("\n", " ");
// String[] sentences = text.split("\\.");
// String new_text = "";
// List<IntPair> offsets = new ArrayList<>();
// List<String> surfaces = new ArrayList<>();
// List<Integer> sen_ends = new ArrayList<>();
// for(String sen: sentences){
// String[] tokens = sen.trim().split("\\s+");
// for(String token: tokens) {
// offsets.add(new IntPair(text.length(), text.length()+token.length()));
// surfaces.add(token);
// new_text += token+" ";
// }
// sen_ends.add(offsets.size());
// }
// 
// IntPair[] offs = new IntPair[offsets.size()];
// offs = offsets.toArray(offs);
// String[] surfs = new String[surfaces.size()];
// surfs = surfaces.toArray(surfs);
// int[] ends = new int[sen_ends.size()];
// for(int i = 0; i < sen_ends.size(); i++)
// ends[i] = sen_ends.get(i);
// 
// 
// TextAnnotation ta = new TextAnnotation("", "", text, offs,
// surfs, ends);
// return ta;
// 
// }
/**
 * Builds a {@link TextAnnotation} from {@code text} using a simple whitespace/period tokenizer.
 *
 * <ul>
 * <li>Any character that {@link String#trim()} would strip (code point {@code <= ' '},
 * which includes {@code '\n'}) separates tokens and is not itself a token.</li>
 * <li>A period {@code "."} is always emitted as its own token and closes the current
 * sentence.</li>
 * <li>Every other character extends the current token.</li>
 * </ul>
 *
 * The last sentence always ends at the final token, whether or not the text ends in a period.
 *
 * @param text the raw text to tokenize; must not be {@code null}
 * @return the annotation over the tokens found, or {@code null} if {@code text} contains no
 *         tokens (empty or whitespace only)
 */
public TextAnnotation getTextAnnotation(String text) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    List<Integer> sen_ends = new ArrayList<>();

    final int length = text.length();
    // Start offset of the token currently being scanned, or -1 when between tokens. A token
    // is always a contiguous slice of text, so it is cut out with substring when it ends.
    int tokenStart = -1;
    for (int i = 0; i < length; i++) {
        char ch = text.charAt(i);
        // Same test as String.trim(): everything up to and including U+0020 is whitespace.
        // Newlines therefore only separate tokens; they never end a sentence here.
        boolean whitespace = ch <= ' ';
        boolean period = ch == '.';
        if (!whitespace && !period) {
            if (tokenStart < 0) {
                tokenStart = i;
            }
            continue;
        }
        if (tokenStart >= 0) {
            surfaces.add(text.substring(tokenStart, i));
            offsets.add(new IntPair(tokenStart, i));
            tokenStart = -1;
        }
        if (period) {
            surfaces.add(".");
            offsets.add(new IntPair(i, i + 1));
            sen_ends.add(surfaces.size());
        }
    }
    if (tokenStart >= 0) {
        // Trailing token that runs to the end of the text.
        surfaces.add(text.substring(tokenStart, length));
        offsets.add(new IntPair(tokenStart, length));
    }
    // Guarantee that the final sentence boundary coincides with the last token.
    if (sen_ends.isEmpty() || sen_ends.get(sen_ends.size() - 1).intValue() != surfaces.size()) {
        sen_ends.add(surfaces.size());
    }

    if (surfaces.isEmpty()) {
        // offsets and surfaces always grow together, so one emptiness check covers both.
        return null;
    }

    IntPair[] offs = offsets.toArray(new IntPair[offsets.size()]);
    String[] surfs = surfaces.toArray(new String[surfaces.size()]);
    int[] ends = new int[sen_ends.size()];
    for (int k = 0; k < ends.length; k++) {
        ends[k] = sen_ends.get(k);
    }
    return new TextAnnotation("", "", text, offs, surfs, ends);
}
Also used : ArrayList(java.util.ArrayList) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 39 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class WhiteSpaceTokenizer method tokenizeTextSpan.

/**
 * Splits a span of text into sentences, tokenizes each sentence with
 * {@code tokenizeSentence}, and returns everything as one {@link Tokenization}: the token
 * surface forms, their character offsets with respect to <b>the original text</b>, and the
 * cumulative token count at the end of each sentence.
 *
 * <p>A sentence ends at a newline, or at a period whose preceding character is not equal to
 * its own upper-cased form. A sentence-ending period is added as a separate token; a newline
 * is not. Chunks that are blank after trimming produce no sentence.
 *
 * @param textSpan the text to split and tokenize
 * @return the tokens, offsets and sentence end indices for {@code textSpan}
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    List<String> tokenForms = new ArrayList<>();
    List<IntPair> tokenSpans = new ArrayList<>();
    List<Integer> sentenceBoundaries = new ArrayList<>();

    final int spanLength = textSpan.length();
    int sentenceStart = 0;
    String previous = null;
    for (int pos = 0; pos < spanLength; pos++) {
        String current = textSpan.substring(pos, pos + 1);
        boolean isPeriod = current.equals(".");
        boolean periodEndsSentence =
                isPeriod && previous != null && !previous.toUpperCase().equals(previous);
        if (periodEndsSentence || current.equals("\n")) {
            String chunk = textSpan.substring(sentenceStart, pos);
            if (!chunk.trim().isEmpty()) {
                Pair<String[], IntPair[]> tokenized = tokenizeSentence(chunk);
                for (String form : tokenized.getFirst()) {
                    tokenForms.add(form);
                }
                // Sentence-relative offsets are shifted back into textSpan coordinates.
                for (IntPair local : tokenized.getSecond()) {
                    tokenSpans.add(new IntPair(local.getFirst() + sentenceStart,
                            local.getSecond() + sentenceStart));
                }
                if (isPeriod) {
                    tokenForms.add(".");
                    tokenSpans.add(new IntPair(pos, pos + 1));
                }
                sentenceBoundaries.add(tokenForms.size());
            }
            sentenceStart = pos + 1;
        }
        previous = current;
    }

    // Whatever follows the last sentence break forms a final sentence, unless it is blank.
    String tail = sentenceStart < spanLength ? textSpan.substring(sentenceStart) : "";
    if (!tail.trim().isEmpty()) {
        Pair<String[], IntPair[]> tokenized = tokenizeSentence(tail);
        for (String form : tokenized.getFirst()) {
            tokenForms.add(form);
        }
        for (IntPair local : tokenized.getSecond()) {
            tokenSpans.add(new IntPair(local.getFirst() + sentenceStart,
                    local.getSecond() + sentenceStart));
        }
        sentenceBoundaries.add(tokenForms.size());
    }

    // Make sure the last recorded boundary lines up with the total token count.
    boolean boundaryMissing = sentenceBoundaries.isEmpty()
            || sentenceBoundaries.get(sentenceBoundaries.size() - 1) != tokenForms.size();
    if (boundaryMissing) {
        sentenceBoundaries.add(tokenForms.size());
    }

    String[] formArray = tokenForms.toArray(new String[tokenForms.size()]);
    IntPair[] spanArray = tokenSpans.toArray(new IntPair[tokenSpans.size()]);
    int[] boundaryArray = new int[sentenceBoundaries.size()];
    for (int k = 0; k < boundaryArray.length; k++) {
        boundaryArray[k] = sentenceBoundaries.get(k);
    }
    return new Tokenization(formArray, spanArray, boundaryArray);
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 40 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class JsonSerializerTest method verifySerializedJSONObject.

/**
 * Behavior specific to unit tests only. Use with caution.
 *
 * <p>Checks a serialized {@link JsonObject} against the {@link TextAnnotation} it was
 * produced from: the token-offset array must have one entry per token, the token at index 6
 * must be retrievable by its character offsets with the same surface form, and the hard-coded
 * expectations about the POS view and the "rhyme" view of {@code ta} must hold.
 *
 * @param jobj the serialized JSON form to inspect
 * @param ta the annotation that {@code jobj} is compared against
 */
public static void verifySerializedJSONObject(JsonObject jobj, TextAnnotation ta) {
    assertNotNull(jobj);
    JsonArray tokenOffsetArray = jobj.get(JsonSerializer.TOKENOFFSETS).getAsJsonArray();
    assertNotNull(tokenOffsetArray);
    assertEquals(ta.getTokens().length, tokenOffsetArray.size());

    // Index the serialized surface forms by their (start, end) character span.
    Map<IntPair, String> formsBySpan = new HashMap<>();
    for (int idx = 0; idx < tokenOffsetArray.size(); idx++) {
        JsonObject entry = (JsonObject) tokenOffsetArray.get(idx);
        int spanStart = entry.get(JsonSerializer.STARTCHAROFFSET).getAsInt();
        int spanEnd = entry.get(JsonSerializer.ENDCHAROFFSET).getAsInt();
        String surface = entry.get(JsonSerializer.FORM).getAsString();
        formsBySpan.put(new IntPair(spanStart, spanEnd), surface);
    }

    // The token at index 6 must be found under its own character span, with the same form.
    Constituent token = ta.getView(ViewNames.TOKENS).getConstituents().get(6);
    IntPair tokenSpan = new IntPair(token.getStartCharOffset(), token.getEndCharOffset());
    String expectedForm = token.getSurfaceForm();
    String serializedForm = formsBySpan.get(tokenSpan);
    assertNotNull(serializedForm);
    assertEquals(expectedForm, serializedForm);

    // The POS constituent at index 3 is expected to carry no label-to-score map.
    Constituent posConstituent = ta.getView(ViewNames.POS).getConstituents().get(3);
    assertEquals(null, posConstituent.getLabelsToScores());

    // The first "rhyme" relation and its source constituent both carry label scores.
    View rhymeView = ta.getView("rhyme");
    assertNotNull(rhymeView);
    Relation firstRelation = rhymeView.getRelations().get(0);
    Map<String, Double> relationScores = firstRelation.getLabelsToScores();
    assertNotNull(relationScores);
    assertEquals(2, relationScores.size());
    Constituent relationSource = firstRelation.getSource();
    Map<String, Double> sourceScores = relationSource.getLabelsToScores();
    assertNotNull(sourceScores);
    assertEquals(4, sourceScores.size());
}
Also used : JsonArray(com.google.gson.JsonArray) Relation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Relation) HashMap(java.util.HashMap) JsonObject(com.google.gson.JsonObject) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)129 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)39 ArrayList (java.util.ArrayList)27 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)26 Test (org.junit.Test)21 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)18 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)14 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)8 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)7 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)6 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)5 Matcher (java.util.regex.Matcher)5 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)4 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 IOException (java.io.IOException)4 JsonObject (com.google.gson.JsonObject)3 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)3 PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView)3