use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class StanfordAnalyzer method tokenizeSentence.
/**
 * Runs the pipeline on a sentence and returns its tokens together with their character offsets.
 *
 * @param sentenceText The sentence string
 * @return A {@link Pair} containing the array of tokens and their character offsets, as reported
 *         by the pipeline's token annotations
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentenceText) {
    Annotation document = new Annotation(sentenceText);
    pipeline.annotate(document);

    // Flatten the tokens of every sentence annotation the pipeline produced, in order.
    List<CoreLabel> tokens = new ArrayList<>();
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
        tokens.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class));
    }

    String[] surfaces = new String[tokens.size()];
    IntPair[] tokenCharOffsets = new IntPair[tokens.size()];
    for (int i = 0; i < tokens.size(); i++) {
        CoreLabel token = tokens.get(i);
        surfaces[i] = token.originalText();
        tokenCharOffsets[i] = new IntPair(token.beginPosition(), token.endPosition());
    }
    // Diamond operator keeps the result fully typed (the raw Pair caused an unchecked conversion).
    return new Pair<>(surfaces, tokenCharOffsets);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class StanfordAnalyzer method getTextAnnotation.
/**
 * Runs the pipeline on {@code text} and packages the resulting token surface forms, token
 * character offsets and per-sentence end indices into a {@link TextAnnotation}.
 *
 * @param text the raw text to annotate
 * @return a {@link TextAnnotation} built from the pipeline's sentence and token annotations
 */
public TextAnnotation getTextAnnotation(String text) {
    Annotation annotated = new Annotation(text);
    pipeline.annotate(annotated);

    List<CoreMap> sentenceMaps = annotated.get(CoreAnnotations.SentencesAnnotation.class);
    int[] sentenceEnds = new int[sentenceMaps.size()];
    List<CoreLabel> labels = new ArrayList<>();
    int sentenceNo = 0;
    for (CoreMap sentenceMap : sentenceMaps) {
        labels.addAll(sentenceMap.get(CoreAnnotations.TokensAnnotation.class));
        // Each sentence ends at the running token count (exclusive token index).
        sentenceEnds[sentenceNo] = labels.size();
        sentenceNo++;
    }

    int tokenCount = labels.size();
    String[] forms = new String[tokenCount];
    IntPair[] spans = new IntPair[tokenCount];
    int slot = 0;
    for (CoreLabel label : labels) {
        forms[slot] = label.originalText();
        spans[slot] = new IntPair(label.beginPosition(), label.endPosition());
        slot++;
    }

    return new TextAnnotation("", "", text, spans, forms, sentenceEnds);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class WhiteSpaceTokenizer method getTextAnnotation.
// public TextAnnotation getTextAnnotation(String text){
// text = text.replaceAll("\n", " ");
// String[] sentences = text.split("\\.");
// String new_text = "";
// List<IntPair> offsets = new ArrayList<>();
// List<String> surfaces = new ArrayList<>();
// List<Integer> sen_ends = new ArrayList<>();
// for(String sen: sentences){
// String[] tokens = sen.trim().split("\\s+");
// for(String token: tokens) {
// offsets.add(new IntPair(text.length(), text.length()+token.length()));
// surfaces.add(token);
// new_text += token+" ";
// }
// sen_ends.add(offsets.size());
// }
//
// IntPair[] offs = new IntPair[offsets.size()];
// offs = offsets.toArray(offs);
// String[] surfs = new String[surfaces.size()];
// surfs = surfaces.toArray(surfs);
// int[] ends = new int[sen_ends.size()];
// for(int i = 0; i < sen_ends.size(); i++)
// ends[i] = sen_ends.get(i);
//
//
// TextAnnotation ta = new TextAnnotation("", "", text, offs,
// surfs, ends);
// return ta;
//
// }
/**
 * Builds a {@link TextAnnotation} by splitting {@code text} on whitespace, where every
 * {@code '.'} becomes a one-character token of its own and closes the current sentence.
 *
 * <p>Whitespace is any character that {@link String#trim()} would strip (code point
 * {@code <= ' '}). Note that a line break is therefore ordinary whitespace and does not end a
 * sentence. Tokens left over after the last period form a final sentence.
 *
 * @param text the raw text to tokenize
 * @return the annotation (created with empty strings for the first two constructor arguments),
 *         or {@code null} when {@code text} contains no tokens
 */
public TextAnnotation getTextAnnotation(String text) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    List<Integer> sen_ends = new ArrayList<>();

    // Start index of the token currently being scanned, or -1 when between tokens.
    int tokenStart = -1;
    int length = text.length();
    for (int i = 0; i < length; i++) {
        char c = text.charAt(i);
        boolean isWhitespace = c <= ' ';
        boolean isPeriod = c == '.';
        if (!isWhitespace && !isPeriod) {
            if (tokenStart < 0) {
                tokenStart = i;
            }
            continue;
        }
        // Whitespace and periods both terminate the pending token, if any. A token is always a
        // contiguous slice of the text, so no per-character string building is needed.
        if (tokenStart >= 0) {
            surfaces.add(text.substring(tokenStart, i));
            offsets.add(new IntPair(tokenStart, i));
            tokenStart = -1;
        }
        if (isPeriod) {
            surfaces.add(".");
            offsets.add(new IntPair(i, i + 1));
            sen_ends.add(surfaces.size());
        }
    }
    if (tokenStart >= 0) {
        surfaces.add(text.substring(tokenStart, length));
        offsets.add(new IntPair(tokenStart, length));
    }

    if (surfaces.isEmpty()) {
        return null;
    }

    // Close the last sentence when the text does not end with a period.
    int tokenCount = surfaces.size();
    if (sen_ends.isEmpty() || sen_ends.get(sen_ends.size() - 1).intValue() != tokenCount) {
        sen_ends.add(tokenCount);
    }

    IntPair[] offs = offsets.toArray(new IntPair[offsets.size()]);
    String[] surfs = surfaces.toArray(new String[surfaces.size()]);
    int[] ends = new int[sen_ends.size()];
    for (int i = 0; i < ends.length; i++) {
        ends[i] = sen_ends.get(i);
    }
    return new TextAnnotation("", "", text, offs, surfs, ends);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class WhiteSpaceTokenizer method tokenizeTextSpan.
/**
 * Splits a span of text into sentences, tokenizes each sentence with
 * {@code tokenizeSentence}, and returns all tokens, their character offsets with respect to
 * <b>the original text</b>, and the sentence end indices as a single {@link Tokenization}.
 *
 * <p>A sentence ends at a line break, or at a {@code '.'} whose preceding character changes
 * under {@link String#toUpperCase()} (e.g. a lowercase letter). Such a sentence-final period is
 * emitted as its own token; line breaks are not emitted.
 *
 * @param textSpan the text to split and tokenize
 * @return the tokens, character offsets and sentence end token indices for {@code textSpan}
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    List<String> tokenForms = new ArrayList<>();
    List<IntPair> tokenSpans = new ArrayList<>();
    List<Integer> sentenceEnds = new ArrayList<>();

    int sentenceStart = 0;
    String previous = null;
    for (int pos = 0; pos < textSpan.length(); pos++) {
        String current = textSpan.substring(pos, pos + 1);
        boolean isPeriod = current.equals(".");
        boolean periodEndsSentence =
                isPeriod && previous != null && !previous.toUpperCase().equals(previous);

        if (periodEndsSentence || current.equals("\n")) {
            String sentenceText = textSpan.substring(sentenceStart, pos);
            if (!sentenceText.trim().isEmpty()) {
                Pair<String[], IntPair[]> tokenized = tokenizeSentence(sentenceText);
                for (String form : tokenized.getFirst()) {
                    tokenForms.add(form);
                }
                // Sentence-relative offsets are shifted back into textSpan coordinates.
                for (IntPair span : tokenized.getSecond()) {
                    tokenSpans.add(new IntPair(span.getFirst() + sentenceStart,
                            span.getSecond() + sentenceStart));
                }
                if (isPeriod) {
                    tokenForms.add(".");
                    tokenSpans.add(new IntPair(pos, pos + 1));
                }
                sentenceEnds.add(tokenForms.size());
            }
            sentenceStart = pos + 1;
        }
        previous = current;
    }

    // Whatever follows the last boundary forms a final sentence, unless it is blank.
    if (sentenceStart < textSpan.length()) {
        String remainder = textSpan.substring(sentenceStart, textSpan.length());
        if (!remainder.trim().isEmpty()) {
            Pair<String[], IntPair[]> tokenized = tokenizeSentence(remainder);
            for (String form : tokenized.getFirst()) {
                tokenForms.add(form);
            }
            for (IntPair span : tokenized.getSecond()) {
                tokenSpans.add(new IntPair(span.getFirst() + sentenceStart,
                        span.getSecond() + sentenceStart));
            }
            sentenceEnds.add(tokenForms.size());
        }
    }

    // Guarantee that the final sentence end always equals the total token count.
    boolean lastEndMatches = !sentenceEnds.isEmpty()
            && sentenceEnds.get(sentenceEnds.size() - 1) == tokenForms.size();
    if (!lastEndMatches) {
        sentenceEnds.add(tokenForms.size());
    }

    IntPair[] spanArray = tokenSpans.toArray(new IntPair[tokenSpans.size()]);
    String[] formArray = tokenForms.toArray(new String[tokenForms.size()]);
    int[] endArray = new int[sentenceEnds.size()];
    for (int k = 0; k < endArray.length; k++) {
        endArray[k] = sentenceEnds.get(k);
    }
    return new Tokenization(formArray, spanArray, endArray);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class JsonSerializerTest method verifySerializedJSONObject.
/**
 * Behavior specific to unit tests only. Use with caution.
 *
 * <p>Asserts that the serialized token offsets in {@code jobj} match {@code ta}: the number of
 * tokens agrees and the seventh token's surface form can be found by its character offsets. It
 * also asserts fixture-specific expectations on {@code ta} itself: the POS constituent at index
 * 3 has no label scores, and the first relation of the "rhyme" view carries 2 label scores while
 * its source constituent carries 4.
 *
 * @param jobj the serialized JSON form to check
 * @param ta the annotation the JSON is compared against
 */
public static void verifySerializedJSONObject(JsonObject jobj, TextAnnotation ta) {
    assertNotNull(jobj);

    JsonArray offsetArray = jobj.get(JsonSerializer.TOKENOFFSETS).getAsJsonArray();
    assertNotNull(offsetArray);
    assertEquals(ta.getTokens().length, offsetArray.size());

    // Index every serialized token form by its (start, end) character span.
    Map<IntPair, String> formsBySpan = new HashMap<>();
    for (int idx = 0; idx < offsetArray.size(); ++idx) {
        JsonObject entry = (JsonObject) offsetArray.get(idx);
        int startChar = entry.get(JsonSerializer.STARTCHAROFFSET).getAsInt();
        int endChar = entry.get(JsonSerializer.ENDCHAROFFSET).getAsInt();
        String tokenForm = entry.get(JsonSerializer.FORM).getAsString();
        formsBySpan.put(new IntPair(startChar, endChar), tokenForm);
    }

    // Spot-check one token: its span must resolve to the same surface form.
    Constituent tokenSeven = ta.getView(ViewNames.TOKENS).getConstituents().get(6);
    IntPair tokenSevenSpan =
            new IntPair(tokenSeven.getStartCharOffset(), tokenSeven.getEndCharOffset());
    String expectedForm = tokenSeven.getSurfaceForm();
    String actualForm = formsBySpan.get(tokenSevenSpan);
    assertNotNull(actualForm);
    assertEquals(expectedForm, actualForm);

    Constituent posConstituent = ta.getView(ViewNames.POS).getConstituents().get(3);
    assertEquals(null, posConstituent.getLabelsToScores());

    View rhymeView = ta.getView("rhyme");
    assertNotNull(rhymeView);

    Relation firstRelation = rhymeView.getRelations().get(0);
    Map<String, Double> relationScores = firstRelation.getLabelsToScores();
    assertNotNull(relationScores);
    assertEquals(2, relationScores.size());

    Constituent relationSource = firstRelation.getSource();
    Map<String, Double> sourceScores = relationSource.getLabelsToScores();
    assertNotNull(sourceScores);
    assertEquals(4, sourceScores.size());
}
Aggregations