Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
The class ChineseTokenizer, method getTextAnnotation1:
public TextAnnotation getTextAnnotation1(String text) {
    if (text.trim().isEmpty())
        return null;
    // normalize traditional characters to simplified before segmenting
    text = trad2simp(text);
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    List<Integer> sen_ends = new ArrayList<>();
    String[] lines = text.split("\n");
    int idx = 0;
    for (String line : lines) {
        if (line.trim().isEmpty())
            continue;
        // split each line into sentences on the Chinese full stop
        String[] sentences = line.split("。");
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];
            if (sentence.trim().isEmpty())
                continue;
            List<String> segs = segmenter.segmentString(sentence);
            for (String seg : segs) {
                // locate each segment in the text to recover its character offsets
                idx = text.indexOf(seg, idx);
                if (!containsHanScript(seg)) {
                    surfaces.add(seg);
                    offsets.add(new IntPair(idx, idx + seg.length()));
                } else {
                    // Han-script segments are emitted one character per token
                    for (int j = 0; j < seg.length(); j++) {
                        String ch = seg.substring(j, j + 1);
                        surfaces.add(ch);
                        offsets.add(new IntPair(idx + j, idx + j + 1));
                    }
                }
                idx += seg.length();
            }
            // re-attach the sentence-final full stop as its own token
            if (i < sentences.length - 1) {
                surfaces.add("。");
                idx = text.indexOf("。", idx);
                offsets.add(new IntPair(idx, idx + 1));
                idx++;
            }
            // record a sentence boundary, skipping empty sentences
            if (sen_ends.isEmpty() || sen_ends.get(sen_ends.size() - 1) != surfaces.size())
                sen_ends.add(surfaces.size());
        }
    }
    IntPair[] offs = offsets.toArray(new IntPair[offsets.size()]);
    String[] surfs = surfaces.toArray(new String[surfaces.size()]);
    int[] ends = new int[sen_ends.size()];
    for (int i = 0; i < sen_ends.size(); i++)
        ends[i] = sen_ends.get(i);
    if (surfs.length == 0)
        return null;
    return new TextAnnotation("", "", text, offs, surfs, ends);
}
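A minimal usage sketch (hedged: it assumes ChineseTokenizer has a no-argument constructor and that TextAnnotation exposes the getTokens() and getTokenCharacterOffset(int) accessors as in cogcomp-core; the input string is invented):

ChineseTokenizer tokenizer = new ChineseTokenizer();
TextAnnotation ta = tokenizer.getTextAnnotation1("今天天气很好。Hello世界。");
if (ta != null) {
    String[] toks = ta.getTokens();
    for (int t = 0; t < toks.length; t++) {
        // spans are half-open [start, end) character offsets
        IntPair span = ta.getTokenCharacterOffset(t);
        System.out.println(toks[t] + " [" + span.getFirst() + ", " + span.getSecond() + ")");
    }
}

Note that because trad2simp is applied before any offsets are computed, the recorded spans index into the simplified text rather than the caller's original string.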
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
The class ChineseTokenizer, method tokenizeTextSpan:
/**
 * Given a span of text, return a list of Pair{@literal < String[], IntPair[] >} corresponding
 * to tokenized sentences, where the String[] is the ordered list of sentence tokens and the
 * IntPair[] is the corresponding list of character offsets with respect to <b>the original
 * text</b>.
 *
 * @param text the span of text to tokenize
 */
@Override
public Tokenization tokenizeTextSpan(String text) {
    if (text.trim().isEmpty())
        return new Tokenization(new String[] {}, new IntPair[] {}, new int[] {});
    // normalize traditional characters to simplified before segmenting
    text = trad2simp(text);
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    List<Integer> sen_ends = new ArrayList<>();
    String[] lines = text.split("\n");
    int idx = 0;
    for (String line : lines) {
        if (line.trim().isEmpty())
            continue;
        // split each line into sentences on the Chinese full stop
        String[] sentences = line.split("。");
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];
            if (sentence.trim().isEmpty())
                continue;
            List<String> segs = segmenter.segmentString(sentence);
            for (String seg : segs) {
                // locate each segment in the text to recover its character offsets
                idx = text.indexOf(seg, idx);
                if (!containsHanScript(seg)) {
                    surfaces.add(seg);
                    offsets.add(new IntPair(idx, idx + seg.length()));
                } else {
                    // Han-script segments are emitted one character per token
                    for (int j = 0; j < seg.length(); j++) {
                        String ch = seg.substring(j, j + 1);
                        surfaces.add(ch);
                        offsets.add(new IntPair(idx + j, idx + j + 1));
                    }
                }
                idx += seg.length();
            }
            // re-attach the sentence-final full stop as its own token
            if (i < sentences.length - 1) {
                surfaces.add("。");
                idx = text.indexOf("。", idx);
                offsets.add(new IntPair(idx, idx + 1));
                idx++;
            }
            // record a sentence boundary, skipping empty sentences
            if (sen_ends.isEmpty() || sen_ends.get(sen_ends.size() - 1) != surfaces.size())
                sen_ends.add(surfaces.size());
        }
    }
    IntPair[] offs = offsets.toArray(new IntPair[offsets.size()]);
    String[] surfs = surfaces.toArray(new String[surfaces.size()]);
    int[] ends = new int[sen_ends.size()];
    for (int i = 0; i < sen_ends.size(); i++)
        ends[i] = sen_ends.get(i);
    return new Tokenization(surfs, offs, ends);
}
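A short, hedged usage sketch of consuming the returned Tokenization (the getter names getTokens(), getCharacterOffsets(), and getSentenceEndTokenIndexes() are assumed from cogcomp-core's Tokenizer.Tokenization; the input string is invented):

ChineseTokenizer tokenizer = new ChineseTokenizer();
Tokenizer.Tokenization tok = tokenizer.tokenizeTextSpan("今天天气很好。明天呢。");
String[] tokens = tok.getTokens();
IntPair[] charOffsets = tok.getCharacterOffsets();
int[] sentenceEnds = tok.getSentenceEndTokenIndexes();
// sentenceEnds[k] is one past the index of the last token of sentence k, so
// sentence k covers the token range [k == 0 ? 0 : sentenceEnds[k - 1], sentenceEnds[k])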
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
The class ChineseTokenizer, method oldGetTextAnnotation:
public TextAnnotation oldGetTextAnnotation(String text) {
    if (text.trim().isEmpty())
        return null;
    // text = trad2simp(text);
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    List<Integer> sen_ends = new ArrayList<>();
    // unlike getTextAnnotation1, split the whole text into sentences directly
    String[] lines = text.split("。");
    int idx = 0;
    for (int i = 0; i < lines.length; i++) {
        String line = lines[i];
        List<String> segs = segmenter.segmentString(line);
        for (String seg : segs) {
            if (seg.length() > 1 && seg.endsWith("人")) {
                // split a trailing "人" character off into its own token
                surfaces.add(seg.substring(0, seg.length() - 1));
                idx = text.indexOf(seg, idx);
                offsets.add(new IntPair(idx, idx + seg.length() - 1));
                surfaces.add(seg.substring(seg.length() - 1, seg.length()));
                offsets.add(new IntPair(idx + seg.length() - 1, idx + seg.length()));
                idx += seg.length();
            } else {
                surfaces.add(seg);
                idx = text.indexOf(seg, idx);
                offsets.add(new IntPair(idx, idx + seg.length()));
                idx += seg.length();
            }
        }
        // re-attach the sentence-final full stop as its own token
        if (i < lines.length - 1) {
            surfaces.add("。");
            idx = text.indexOf("。", idx);
            offsets.add(new IntPair(idx, idx + 1));
            idx++;
        }
        sen_ends.add(surfaces.size());
    }
    IntPair[] offs = offsets.toArray(new IntPair[offsets.size()]);
    String[] surfs = surfaces.toArray(new String[surfaces.size()]);
    int[] ends = new int[sen_ends.size()];
    for (int i = 0; i < sen_ends.size(); i++)
        ends[i] = sen_ends.get(i);
    if (surfs.length == 0)
        return null;
    return new TextAnnotation("", "", text, offs, surfs, ends);
}
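The offset-recovery idiom shared by all three methods is worth isolating: the segmenter returns surface strings without positions, so each method finds every segment in the text with indexOf, always searching forward from the end of the previous match so that repeated segments map to distinct spans. A self-contained sketch (invented strings; only IntPair is assumed on the classpath):

String original = "abc def abc";
String[] segs = {"abc", "def", "abc"};
int cursor = 0;
for (String seg : segs) {
    cursor = original.indexOf(seg, cursor);      // first occurrence at or after cursor
    IntPair span = new IntPair(cursor, cursor + seg.length());
    cursor += seg.length();                      // advance past this match
    System.out.println(seg + " " + span);        // the repeated "abc" maps to (8, 11)
}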
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
The class JapaneseTokenizer, method tokenizeTextSpan:
/**
 * The text annotation requires all tokens and sentences to be referenced in terms of their
 * offsets from the start of the text they represent. This method first identifies and records
 * the sentence offsets; it then uses the Kuromoji tokenizer to tokenize the words within each
 * sentence.
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    if (textSpan.trim().isEmpty())
        return null;
    // first identify sentence ends, as required by TextAnnotation.
    ArrayList<Integer> lastTokenIndexes = new ArrayList<>();
    ArrayList<String> tokens = new ArrayList<>();
    ArrayList<IntPair> tokenBoundaries = new ArrayList<>();
    int lastCharPosition = 0;
    int tokenCounter = 0;
    for (int i = 0; i < textSpan.length(); i++) {
        switch (textSpan.charAt(i)) {
            case '。':
            case '!':
            case '?':
            case '.':
            case ';':
                // found the end of a sentence: tokenize it, then convert offsets to global.
                Pair<String[], IntPair[]> tokenDefs =
                        this.tokenizeSentence(textSpan.substring(lastCharPosition, i + 1));
                // adjust all the offsets, incrementing each IntPair by the sentence's starting char offset.
                int which = 0;
                for (IntPair ip : tokenDefs.getSecond()) {
                    ip.setFirst(ip.getFirst() + lastCharPosition);
                    ip.setSecond(ip.getSecond() + lastCharPosition);
                    // add the token and its offset to the arrays.
                    tokens.add(tokenDefs.getFirst()[which++]);
                    tokenBoundaries.add(ip);
                }
                tokenCounter += tokenDefs.getFirst().length;
                lastTokenIndexes.add(tokenCounter);
                lastCharPosition = i + 1;
                break;
        }
    }
    // if there is dangling text, treat it as a final sentence: sentence termination is
    // forced at the end of the text.
    if (lastCharPosition != textSpan.length()) {
        Pair<String[], IntPair[]> tokenDefs =
                this.tokenizeSentence(textSpan.substring(lastCharPosition));
        // adjust all the offsets, incrementing each IntPair by the sentence's starting char offset.
        int which = 0;
        for (IntPair ip : tokenDefs.getSecond()) {
            ip.setFirst(ip.getFirst() + lastCharPosition);
            ip.setSecond(ip.getSecond() + lastCharPosition);
            // add the token and its offset to the arrays.
            tokens.add(tokenDefs.getFirst()[which++]);
            tokenBoundaries.add(ip);
        }
        tokenCounter += tokenDefs.getFirst().length;
        lastTokenIndexes.add(tokenCounter);
    }
    int[] sentenceEnds = new int[lastTokenIndexes.size()];
    int i = 0;
    for (Integer lastToken : lastTokenIndexes)
        sentenceEnds[i++] = lastToken;
    String[] toks = tokens.toArray(new String[tokens.size()]);
    IntPair[] charOffsetArray = tokenBoundaries.toArray(new IntPair[tokenBoundaries.size()]);
    return new Tokenization(toks, charOffsetArray, sentenceEnds);
}
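A hedged usage sketch (it assumes JapaneseTokenizer has a no-argument constructor that sets up the Kuromoji backend, and that Tokenization is the nested Tokenizer.Tokenization type; the input is invented):

JapaneseTokenizer jt = new JapaneseTokenizer();
Tokenizer.Tokenization result = jt.tokenizeTextSpan("今日は晴れです。明日は雨です");
// the first sentence ends at '。'; the trailing text with no terminator is still
// emitted as a second sentence, since termination is forced at the end of the text.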
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
The class CharacterTokenizer, method tokenizeSentence:
/**
 * Given a sentence, return its tokens and their character offsets.
 *
 * @param sentence The sentence string
 * @return A {@link Pair} containing the array of tokens and their character offsets
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    // emit one token per non-whitespace character
    for (int i = 0; i < sentence.length(); i++) {
        String c = sentence.substring(i, i + 1).trim();
        // after trim(), a whitespace character becomes the empty string, so the
        // isEmpty() check alone suffices to skip whitespace
        if (!c.isEmpty()) {
            surfaces.add(c);
            offsets.add(new IntPair(i, i + 1));
        }
    }
    String[] surfs = surfaces.toArray(new String[surfaces.size()]);
    IntPair[] offs = offsets.toArray(new IntPair[offsets.size()]);
    return new Pair<>(surfs, offs);
}
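A hedged usage sketch (assuming CharacterTokenizer is instantiable with a no-argument constructor; the input is invented):

CharacterTokenizer ct = new CharacterTokenizer();
Pair<String[], IntPair[]> out = ct.tokenizeSentence("a b 中");
// tokens: ["a", "b", "中"]; offsets: (0,1), (2,3), (4,5). Whitespace positions are
// skipped, but the offsets still index into the original sentence string.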