Search in sources :

Example 31 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ChineseTokenizer method getTextAnnotation1.

/**
 * Builds a {@link TextAnnotation} for a span of Chinese text.
 *
 * <p>The text is first passed through {@code trad2simp}; all character offsets produced here
 * index into that converted string, which is also the text stored in the returned annotation.
 * The converted text is split into lines on {@code '\n'} and each line into sentences on the
 * ideographic full stop "。". Every non-blank sentence is segmented with {@code segmenter};
 * segments that contain Han script are further broken into one token per character, all other
 * segments are kept whole. Each "。" that follows a non-blank sentence becomes a token of its own
 * and closes that sentence.
 *
 * @param text the raw text to annotate
 * @return the annotation, or {@code null} if the text is blank or yields no tokens
 */
public TextAnnotation getTextAnnotation1(String text) {
    if (text.trim().isEmpty())
        return null;
    text = trad2simp(text);
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    List<Integer> sen_ends = new ArrayList<>();
    // Search cursor into the (converted) text; only ever moves forward.
    int idx = 0;
    for (String line : text.split("\n")) {
        if (line.trim().isEmpty())
            continue;
        // Limit -1 keeps trailing empty strings. Without it a "。" at the very end of a line
        // produces no trailing element, and the final full stop was silently dropped.
        String[] sentences = line.split("。", -1);
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];
            if (sentence.trim().isEmpty())
                continue;
            for (String seg : segmenter.segmentString(sentence)) {
                // NOTE(review): assumes the segmenter returns substrings of its input; if it
                // does not, indexOf yields -1 and the offsets below are wrong.
                idx = text.indexOf(seg, idx);
                if (!containsHanScript(seg)) {
                    surfaces.add(seg);
                    offsets.add(new IntPair(idx, idx + seg.length()));
                } else {
                    // One token per character. Step by code point so that supplementary-plane
                    // characters (surrogate pairs) are not cut into two broken halves.
                    int j = 0;
                    while (j < seg.length()) {
                        int width = Character.charCount(seg.codePointAt(j));
                        surfaces.add(seg.substring(j, j + width));
                        offsets.add(new IntPair(idx + j, idx + j + width));
                        j += width;
                    }
                }
                idx += seg.length();
            }
            // Re-insert the delimiter that split() consumed after this sentence.
            if (i < sentences.length - 1) {
                surfaces.add("。");
                idx = text.indexOf("。", idx);
                offsets.add(new IntPair(idx, idx + 1));
                idx++;
            }
            // Record the sentence end (exclusive token index), never twice for the same position.
            int tokenCount = surfaces.size();
            if (sen_ends.isEmpty() || sen_ends.get(sen_ends.size() - 1) != tokenCount)
                sen_ends.add(tokenCount);
        }
    }
    if (surfaces.isEmpty())
        return null;
    IntPair[] offs = offsets.toArray(new IntPair[0]);
    String[] surfs = surfaces.toArray(new String[0]);
    int[] ends = new int[sen_ends.size()];
    for (int i = 0; i < ends.length; i++)
        ends[i] = sen_ends.get(i);
    return new TextAnnotation("", "", text, offs, surfs, ends);
}
Also used : TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 32 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ChineseTokenizer method tokenizeTextSpan.

/**
 * Tokenizes a span of Chinese text and returns the tokens, their character offsets and the
 * sentence boundaries as a single {@link Tokenization}.
 *
 * <p>The text is first passed through {@code trad2simp}; all character offsets produced here
 * index into that converted string (NOTE(review): this matches the caller's original text only
 * if the conversion preserves length — confirm). The converted text is split into lines on
 * {@code '\n'} and each line into sentences on the ideographic full stop "。". Every non-blank
 * sentence is segmented with {@code segmenter}; segments that contain Han script are further
 * broken into one token per character, all other segments are kept whole. Each "。" that follows
 * a non-blank sentence becomes a token of its own and closes that sentence.
 *
 * @param text the raw text to tokenize
 * @return the tokenization; empty (zero tokens, zero sentences) when the text is blank
 */
@Override
public Tokenization tokenizeTextSpan(String text) {
    if (text.trim().isEmpty())
        return new Tokenization(new String[] {}, new IntPair[] {}, new int[] {});
    text = trad2simp(text);
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    List<Integer> sen_ends = new ArrayList<>();
    // Search cursor into the (converted) text; only ever moves forward.
    int idx = 0;
    for (String line : text.split("\n")) {
        if (line.trim().isEmpty())
            continue;
        // Limit -1 keeps trailing empty strings. Without it a "。" at the very end of a line
        // produces no trailing element, and the final full stop was silently dropped.
        String[] sentences = line.split("。", -1);
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];
            if (sentence.trim().isEmpty())
                continue;
            for (String seg : segmenter.segmentString(sentence)) {
                // NOTE(review): assumes the segmenter returns substrings of its input; if it
                // does not, indexOf yields -1 and the offsets below are wrong.
                idx = text.indexOf(seg, idx);
                if (!containsHanScript(seg)) {
                    surfaces.add(seg);
                    offsets.add(new IntPair(idx, idx + seg.length()));
                } else {
                    // One token per character. Step by code point so that supplementary-plane
                    // characters (surrogate pairs) are not cut into two broken halves.
                    int j = 0;
                    while (j < seg.length()) {
                        int width = Character.charCount(seg.codePointAt(j));
                        surfaces.add(seg.substring(j, j + width));
                        offsets.add(new IntPair(idx + j, idx + j + width));
                        j += width;
                    }
                }
                idx += seg.length();
            }
            // Re-insert the delimiter that split() consumed after this sentence.
            if (i < sentences.length - 1) {
                surfaces.add("。");
                idx = text.indexOf("。", idx);
                offsets.add(new IntPair(idx, idx + 1));
                idx++;
            }
            // Record the sentence end (exclusive token index), never twice for the same position.
            int tokenCount = surfaces.size();
            if (sen_ends.isEmpty() || sen_ends.get(sen_ends.size() - 1) != tokenCount)
                sen_ends.add(tokenCount);
        }
    }
    IntPair[] offs = offsets.toArray(new IntPair[0]);
    String[] surfs = surfaces.toArray(new String[0]);
    int[] ends = new int[sen_ends.size()];
    for (int i = 0; i < ends.length; i++)
        ends[i] = sen_ends.get(i);
    return new Tokenization(surfs, offs, ends);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 33 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ChineseTokenizer method oldGetTextAnnotation.

/**
 * Legacy construction of a {@link TextAnnotation} from Chinese text.
 *
 * <p>The input is used as-is (no traditional-to-simplified conversion is applied). It is split
 * on the ideographic full stop "。", each piece is segmented with {@code segmenter}, and every
 * segment becomes one token — except that a multi-character segment ending in "人" is split into
 * the part before "人" and "人" itself. A "。" token is re-inserted after every piece but the
 * last, and one sentence end is recorded per piece.
 *
 * <p>NOTE(review): {@code String.split} discards trailing empty strings, so a "。" that ends the
 * text does not produce a token here; behaviour kept unchanged for this legacy method.
 *
 * @param text the raw text to annotate
 * @return the annotation, or {@code null} if the text is blank or yields no tokens
 */
public TextAnnotation oldGetTextAnnotation(String text) {
    if (text.trim().isEmpty())
        return null;
    List<String> tokenTexts = new ArrayList<>();
    List<IntPair> tokenSpans = new ArrayList<>();
    List<Integer> sentenceEnds = new ArrayList<>();
    String[] chunks = text.split("。");
    int lastChunk = chunks.length - 1;
    // Forward-only search position inside text.
    int cursor = 0;
    for (int c = 0; c <= lastChunk; c++) {
        for (String seg : segmenter.segmentString(chunks[c])) {
            int segLen = seg.length();
            cursor = text.indexOf(seg, cursor);
            int segEnd = cursor + segLen;
            boolean splitTrailingRen = segLen > 1 && seg.endsWith("人");
            if (splitTrailingRen) {
                // Emit everything before the final "人", then "人" as its own token.
                tokenTexts.add(seg.substring(0, segLen - 1));
                tokenSpans.add(new IntPair(cursor, segEnd - 1));
                tokenTexts.add(seg.substring(segLen - 1));
                tokenSpans.add(new IntPair(segEnd - 1, segEnd));
            } else {
                tokenTexts.add(seg);
                tokenSpans.add(new IntPair(cursor, segEnd));
            }
            cursor = segEnd;
        }
        // Put back the delimiter consumed by split() between this piece and the next.
        if (c < lastChunk) {
            tokenTexts.add("。");
            cursor = text.indexOf("。", cursor);
            tokenSpans.add(new IntPair(cursor, cursor + 1));
            cursor++;
        }
        sentenceEnds.add(tokenTexts.size());
    }
    if (tokenTexts.isEmpty())
        return null;
    int[] ends = new int[sentenceEnds.size()];
    for (int k = 0; k < ends.length; k++)
        ends[k] = sentenceEnds.get(k);
    IntPair[] offs = tokenSpans.toArray(new IntPair[0]);
    String[] surfs = tokenTexts.toArray(new String[0]);
    return new TextAnnotation("", "", text, offs, surfs, ends);
}
Also used : TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 34 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class JapaneseTokenizer method tokenizeTextSpan.

/**
 * Tokenizes a span of text into sentences and tokens, with every token offset expressed
 * relative to the start of {@code textSpan}.
 *
 * <p>Sentences are cut after each of the terminator characters listed in the {@code switch}
 * below; any text left after the last terminator is treated as a final sentence. Each sentence
 * is handed to {@code tokenizeSentence}, and the sentence-relative offsets it returns are
 * shifted to span-relative offsets.
 *
 * @param textSpan the text to tokenize
 * @return the tokenization; empty (zero tokens, zero sentences) when the text is blank
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    // Return an empty result rather than null so callers can use it without a null check.
    if (textSpan.trim().isEmpty())
        return new Tokenization(new String[] {}, new IntPair[] {}, new int[] {});
    ArrayList<Integer> lastTokenIndexes = new ArrayList<>();
    ArrayList<String> tokens = new ArrayList<>();
    ArrayList<IntPair> tokenBoundries = new ArrayList<>();
    // Character offset at which the sentence currently being scanned starts.
    int lastCharPosition = 0;
    for (int i = 0; i < textSpan.length(); i++) {
        switch(textSpan.charAt(i)) {
            case '。':
            case '!':
            case '?':
            case '.':
            case ';':
                // Found the end of a sentence (terminator included): tokenize it and record
                // the exclusive index of its last token.
                appendSentenceTokens(textSpan, lastCharPosition, i + 1, tokens, tokenBoundries);
                lastTokenIndexes.add(tokens.size());
                lastCharPosition = i + 1;
                break;
        }
    }
    // Dangling text after the last terminator: force a sentence end at the end of the span.
    if (lastCharPosition != textSpan.length()) {
        appendSentenceTokens(textSpan, lastCharPosition, textSpan.length(), tokens, tokenBoundries);
        lastTokenIndexes.add(tokens.size());
    }
    int[] sentenceEnds = new int[lastTokenIndexes.size()];
    for (int s = 0; s < sentenceEnds.length; s++) {
        sentenceEnds[s] = lastTokenIndexes.get(s);
    }
    String[] toks = tokens.toArray(new String[tokens.size()]);
    IntPair[] charOffsetArray = tokenBoundries.toArray(new IntPair[tokenBoundries.size()]);
    return new Tokenization(toks, charOffsetArray, sentenceEnds);
}

/**
 * Tokenizes {@code textSpan[start, end)} as one sentence and appends its tokens and their
 * span-relative character offsets to the given lists.
 */
private void appendSentenceTokens(String textSpan, int start, int end, ArrayList<String> tokens,
        ArrayList<IntPair> tokenBoundries) {
    Pair<String[], IntPair[]> tokenDefs = this.tokenizeSentence(textSpan.substring(start, end));
    String[] sentenceTokens = tokenDefs.getFirst();
    IntPair[] sentenceOffsets = tokenDefs.getSecond();
    for (int k = 0; k < sentenceOffsets.length; k++) {
        // Shift the sentence-relative offsets by the sentence's starting character offset.
        IntPair ip = sentenceOffsets[k];
        ip.setFirst(ip.getFirst() + start);
        ip.setSecond(ip.getSecond() + start);
        tokens.add(sentenceTokens[k]);
        tokenBoundries.add(ip);
    }
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 35 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class CharacterTokenizer method tokenizeSentence.

/**
 * Splits a sentence into single-character tokens and reports each token's character offsets.
 *
 * <p>The sentence is walked one Unicode code point at a time, so a character encoded as a
 * surrogate pair yields one two-{@code char} token instead of two broken halves. Characters
 * that {@link String#trim()} reduces to the empty string (space and control characters up to
 * U+0020) produce no token. Offsets are {@code char} indices into {@code sentence}, end
 * exclusive.
 *
 * @param sentence The sentence string
 * @return A {@link Pair} containing the array of tokens and their character offsets
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    int i = 0;
    while (i < sentence.length()) {
        // Width in chars of the code point starting at i: 2 for a surrogate pair, else 1.
        int width = Character.charCount(sentence.codePointAt(i));
        String c = sentence.substring(i, i + width).trim();
        if (!c.isEmpty()) {
            surfaces.add(c);
            offsets.add(new IntPair(i, i + width));
        }
        i += width;
    }
    IntPair[] offs = offsets.toArray(new IntPair[0]);
    String[] surfs = surfaces.toArray(new String[0]);
    return new Pair<>(surfs, offs);
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)129 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)39 ArrayList (java.util.ArrayList)27 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)26 Test (org.junit.Test)21 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)18 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)14 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)8 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)7 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)6 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)5 Matcher (java.util.regex.Matcher)5 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)4 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 IOException (java.io.IOException)4 JsonObject (com.google.gson.JsonObject)3 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)3 PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView)3