Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.
The class PlainTextReader, method parseTextRaw (overload that reads the static ParametersForLbjCode.currentParameters).
/**
* This method normalizes and parses the raw text, returning a representation of sentences
* where each sentence is an array of words as strings. This representation is more
* compatible with the new core data structures, which no longer take vectors.
*
* @param text the text to parse.
* @return a list of sentences, each represented as an array of words.
*/
public static List<String[]> parseTextRaw(String text) {
text = normalizeText(text);
// sentences split on newlines when line-break splitting is enabled; otherwise the whole
// text is kept as a single element
ArrayList<String> sentences1 = new ArrayList<>();
if (ParametersForLbjCode.currentParameters.forceNewSentenceOnLineBreaks || ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) {
StringTokenizer st = new StringTokenizer(text, "\n");
while (st.hasMoreTokens()) sentences1.add(st.nextToken());
} else
sentences1.add(text);
// we add LBJ sentence splitting on top
ArrayList<String> sentences2 = new ArrayList<>();
if (!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) {
for (String aSentences1 : sentences1) {
SentenceSplitter parser = new SentenceSplitter(new String[] { aSentences1 });
Sentence s = (Sentence) parser.next();
while (s != null) {
sentences2.add(s.text);
s = (Sentence) parser.next();
}
}
} else
sentences2 = sentences1;
ArrayList<String[]> res = new ArrayList<>();
// tokenizing
for (String sentenceText : sentences2) {
if (sentenceText.length() > 0) {
// this is just a formatting issue with LBJ sentence splitter that can happen
if (sentenceText.charAt(sentenceText.length() - 1) == '.' && !ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting)
sentenceText = sentenceText.substring(0, sentenceText.length() - 1) + " . ";
// now tokenizing for real...
String[] sentence = sentenceText.split("[ \\n\\t]");
if (sentence.length > 0) {
// fixing a bug in LBJ sentence splitter if needed
if ((!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) && sentence.length == 1 && res.size() > 0 && (sentence[0].equals("\"") || sentence[0].equals("''") || sentence[0].equals("'"))) {
int where = res.size() - 1;
String[] tmp = res.remove(where);
if (tmp == null) {
tmp = new String[0];
}
int len = tmp.length;
String[] newtmp = new String[len + 1];
System.arraycopy(tmp, 0, newtmp, 0, len);
newtmp[len] = sentence[0];
res.add(newtmp);
} else
res.add(sentence);
}
}
}
return res;
}
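For context, a minimal driver for the overload above might look like the following. The sample text and the printing loop are illustrative only; the imports for PlainTextReader and ParametersForLbjCode are omitted because their packages are not shown in this listing, and the static configuration is assumed to have been set up elsewhere.
import java.util.List;

public class ParseTextRawDemo {
    public static void main(String[] args) {
        // parseTextRaw returns one String[] of tokens per detected sentence.
        String text = "John Smith visited Chicago. He left on Tuesday.";
        List<String[]> sentences = PlainTextReader.parseTextRaw(text);
        for (String[] sentence : sentences) {
            System.out.println(String.join(" ", sentence));
        }
    }
}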
Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.
The class StatefullTokenizerTest, method testWhitespaceBehavior.
/**
* Test Splitter behavior on text with leading/trailing whitespace. The example covers the use
* case where XML markup has been replaced with whitespace of equal span.
*/
@Test
public void testWhitespaceBehavior() {
String origText = null;
try {
origText = LineIO.slurp(INFILE);
} catch (FileNotFoundException e) {
e.printStackTrace();
fail(e.getMessage());
}
Pattern xmlTagPattern = Pattern.compile("(<[^>\\r\\n]+>)");
Matcher xmlMatcher = xmlTagPattern.matcher(origText);
StringBuilder cleanTextBldr = new StringBuilder();
int lastAppendedCharOffset = 0;
while (xmlMatcher.find()) {
int start = xmlMatcher.start();
int end = xmlMatcher.end();
cleanTextBldr.append(origText.substring(lastAppendedCharOffset, start));
for (int i = start; i < end; ++i) cleanTextBldr.append(" ");
lastAppendedCharOffset = end;
}
cleanTextBldr.append(origText.substring(lastAppendedCharOffset));
String cleanText = cleanTextBldr.toString();
// count whitespace chars in string
// check token offsets in tokens returned by SentenceSplitter
Pattern sun = Pattern.compile("\\w*Sun\\w*");
Matcher sunMatcher = sun.matcher(cleanText);
Set<IntPair> sunSpans = new HashSet<>();
while (sunMatcher.find()) sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
SentenceSplitter splitter = new SentenceSplitter(new String[] { cleanText });
Sentence[] sents = splitter.splitAll();
Sentence s = sents[0];
LinkedVector words = s.wordSplit();
for (int i = 0; i < words.size(); ++i) {
Word word = (Word) words.get(i);
if ("Sun".equals(word.form)) {
IntPair tokenCharOffsets = new IntPair(word.start, word.end);
assertTrue(sunSpans.contains(tokenCharOffsets));
}
}
StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
Tokenizer.Tokenization tokenInfo = statefulTokenizer.tokenizeTextSpan(cleanText);
assertEquals(tokenInfo.getCharacterOffsets().length, tokenInfo.getTokens().length);
for (int i = 0; i < tokenInfo.getTokens().length; ++i) {
String tok = tokenInfo.getTokens()[i];
if (tok.equals("Sun")) {
IntPair tokCharOffsets = tokenInfo.getCharacterOffsets()[i];
if (!sunSpans.contains(tokCharOffsets)) {
String origTextSubstring = cleanText.substring(tokCharOffsets.getFirst(), tokCharOffsets.getSecond());
System.err.println("ERROR: tokenizer has form '" + tok + "', but offsets refer to substring '" + origTextSubstring + "'.");
}
assertTrue(sunSpans.contains(tokCharOffsets));
}
}
TextAnnotation statefulTa = new TextAnnotation("test", "test", cleanText, tokenInfo.getCharacterOffsets(), tokenInfo.getTokens(), tokenInfo.getSentenceEndTokenIndexes());
assertNotNull(statefulTa);
}
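The tag-masking step in this test is the reusable idea: every XML tag is overwritten with spaces of identical length, so spans computed on the cleaned text index the same characters in the original. A standalone sketch of that step, using only the JDK; class and method names here are illustrative, not from the source.
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class XmlMaskSketch {
    // Replace every XML tag with spaces of the same length so that character
    // offsets computed on the cleaned text remain valid in the original text.
    static String maskXmlTags(String origText) {
        Pattern xmlTagPattern = Pattern.compile("(<[^>\\r\\n]+>)");
        Matcher m = xmlTagPattern.matcher(origText);
        StringBuilder clean = new StringBuilder();
        int last = 0;
        while (m.find()) {
            clean.append(origText, last, m.start());
            for (int i = m.start(); i < m.end(); ++i) clean.append(' ');
            last = m.end();
        }
        clean.append(origText.substring(last));
        return clean.toString();
    }

    public static void main(String[] args) {
        String xml = "<doc>The <b>Sun</b> rises.</doc>";
        String clean = maskXmlTags(xml);
        // Lengths match, so any (start, end) span found in `clean`
        // indexes the same characters in `xml`.
        System.out.println(clean.length() == xml.length());
        System.out.println("[" + clean + "]");
    }
}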
Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.
The class PlainTextReader, method parseTextRaw (overload that takes a ParametersForLbjCode argument).
/**
* This method normalizes and parses the raw text, returning a representation of sentences
* where each sentence is an array of words as strings. This representation is more
* compatible with the new core data structures, which no longer take vectors.
*
* @param text the text to parse.
* @return a list of sentences, each represented as an array of words.
*/
public static List<String[]> parseTextRaw(String text, ParametersForLbjCode cp) {
text = normalizeText(text, cp);
// sentences split on newlines when line-break splitting is enabled; otherwise the whole
// text is kept as a single element
ArrayList<String> sentences1 = new ArrayList<>();
if (cp.forceNewSentenceOnLineBreaks || cp.keepOriginalFileTokenizationAndSentenceSplitting) {
StringTokenizer st = new StringTokenizer(text, "\n");
while (st.hasMoreTokens()) sentences1.add(st.nextToken());
} else
sentences1.add(text);
// we add LBJ sentence splitting on top
ArrayList<String> sentences2 = new ArrayList<>();
if (!cp.keepOriginalFileTokenizationAndSentenceSplitting) {
for (String aSentences1 : sentences1) {
SentenceSplitter parser = new SentenceSplitter(new String[] { aSentences1 });
Sentence s = (Sentence) parser.next();
while (s != null) {
sentences2.add(s.text);
s = (Sentence) parser.next();
}
}
} else
sentences2 = sentences1;
ArrayList<String[]> res = new ArrayList<>();
// tokenizing
for (String sentenceText : sentences2) {
if (sentenceText.length() > 0) {
// this is just a formatting issue with LBJ sentence splitter that can happen
if (sentenceText.charAt(sentenceText.length() - 1) == '.' && !cp.keepOriginalFileTokenizationAndSentenceSplitting)
sentenceText = sentenceText.substring(0, sentenceText.length() - 1) + " . ";
// now tokenizing for real...
String[] sentence = sentenceText.split("[ \\n\\t]");
if (sentence.length > 0) {
// fixing a bug in LBJ sentence splitter if needed
if ((!cp.keepOriginalFileTokenizationAndSentenceSplitting) && sentence.length == 1 && res.size() > 0 && (sentence[0].equals("\"") || sentence[0].equals("''") || sentence[0].equals("'"))) {
int where = res.size() - 1;
String[] tmp = res.remove(where);
if (tmp == null) {
tmp = new String[0];
}
int len = tmp.length;
String[] newtmp = new String[len + 1];
System.arraycopy(tmp, 0, newtmp, 0, len);
newtmp[len] = sentence[0];
res.add(newtmp);
} else
res.add(sentence);
}
}
}
return res;
}
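A sketch of how the parameterized overload might be driven. This assumes ParametersForLbjCode has an accessible no-argument constructor and that the two flags read above are writable public fields; package imports for PlainTextReader and ParametersForLbjCode are again omitted since they are not shown in this listing.
import java.util.List;

public class ParseTextRawWithParamsDemo {
    public static void main(String[] args) {
        ParametersForLbjCode cp = new ParametersForLbjCode();        // assumed no-arg constructor
        cp.forceNewSentenceOnLineBreaks = true;                      // treat each line break as a sentence boundary
        cp.keepOriginalFileTokenizationAndSentenceSplitting = false; // still run LBJ sentence splitting on top

        List<String[]> sentences = PlainTextReader.parseTextRaw("One sentence.\nAnother one.", cp);
        System.out.println(sentences.size());   // 2 sentences expected with these flags
    }
}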
Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.
The class PlainTextReader, method sentenceSplitAndTokenizeText (overload that takes a ParametersForLbjCode argument).
public static Vector<Vector<String>> sentenceSplitAndTokenizeText(String text, ParametersForLbjCode cp) {
text = normalizeText(text, cp);
// sentences split on newlines when line-break splitting is enabled; otherwise the whole
// text is kept as a single element
Vector<String> sentences1 = new Vector<>();
if (cp.forceNewSentenceOnLineBreaks || cp.keepOriginalFileTokenizationAndSentenceSplitting) {
StringTokenizer st = new StringTokenizer(text, "\n");
while (st.hasMoreTokens()) sentences1.addElement(st.nextToken());
} else
sentences1.addElement(text);
// we add Lbj sentence splitting on top.
Vector<String> sentences2 = new Vector<>();
if (!cp.keepOriginalFileTokenizationAndSentenceSplitting) {
for (int i = 0; i < sentences1.size(); i++) {
SentenceSplitter parser = new SentenceSplitter(new String[] { sentences1.elementAt(i) });
Sentence s = (Sentence) parser.next();
while (s != null) {
sentences2.addElement(s.text);
s = (Sentence) parser.next();
}
}
} else
sentences2 = sentences1;
Vector<Vector<String>> res = new Vector<>();
// tokenizing
for (int i = 0; i < sentences2.size(); i++) {
String sentenceText = sentences2.elementAt(i);
if (sentenceText.length() > 0) {
// this is just a formatting issue with LBJ sentence splitter that can happen
if (sentenceText.charAt(sentenceText.length() - 1) == '.' && !cp.keepOriginalFileTokenizationAndSentenceSplitting)
sentenceText = sentenceText.substring(0, sentenceText.length() - 1) + " . ";
// now tokenizing for real...
StringTokenizer st = new StringTokenizer(sentenceText, " \n\t");
Vector<String> sentence = new Vector<>();
while (st.hasMoreTokens()) sentence.addElement(st.nextToken());
if (sentence.size() > 0) {
// fixing a bug in LBJ sentence splitter if needed
if ((!cp.keepOriginalFileTokenizationAndSentenceSplitting) && sentence.size() == 1 && res.size() > 0 && (sentence.elementAt(0).equals("\"") || sentence.elementAt(0).equals("''") || sentence.elementAt(0).equals("'")))
res.elementAt(res.size() - 1).add(sentence.elementAt(0));
else
res.addElement(sentence);
}
}
}
return res;
}
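parseTextRaw exists because the newer core data structures no longer take vectors, so an adapter between the two return shapes can be handy; the helper below is illustrative and not part of the source.
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

public class TokenShapes {
    // Converts the Vector<Vector<String>> produced by sentenceSplitAndTokenizeText
    // into the List<String[]> shape that parseTextRaw returns.
    static List<String[]> toArrays(Vector<Vector<String>> sentences) {
        List<String[]> out = new ArrayList<>(sentences.size());
        for (Vector<String> sentence : sentences) {
            out.add(sentence.toArray(new String[0]));
        }
        return out;
    }
}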
Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.
The class PlainTextReader, method sentenceSplitAndTokenizeText (overload that reads the static ParametersForLbjCode.currentParameters).
public static Vector<Vector<String>> sentenceSplitAndTokenizeText(String text) {
text = normalizeText(text);
// sentences split on newlines when line-break splitting is enabled; otherwise the whole
// text is kept as a single element
Vector<String> sentences1 = new Vector<>();
if (ParametersForLbjCode.currentParameters.forceNewSentenceOnLineBreaks || ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) {
StringTokenizer st = new StringTokenizer(text, "\n");
while (st.hasMoreTokens()) sentences1.addElement(st.nextToken());
} else
sentences1.addElement(text);
// we add Lbj sentence splitting on top.
Vector<String> sentences2 = new Vector<>();
if (!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) {
for (int i = 0; i < sentences1.size(); i++) {
SentenceSplitter parser = new SentenceSplitter(new String[] { sentences1.elementAt(i) });
Sentence s = (Sentence) parser.next();
while (s != null) {
sentences2.addElement(s.text);
s = (Sentence) parser.next();
}
}
} else
sentences2 = sentences1;
Vector<Vector<String>> res = new Vector<>();
// tokenizing
for (int i = 0; i < sentences2.size(); i++) {
String sentenceText = sentences2.elementAt(i);
if (sentenceText.length() > 0) {
// this is just a formatting issue with LBJ sentence splitter that can happen
if (sentenceText.charAt(sentenceText.length() - 1) == '.' && !ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting)
sentenceText = sentenceText.substring(0, sentenceText.length() - 1) + " . ";
// now tokenizing for real...
StringTokenizer st = new StringTokenizer(sentenceText, " \n\t");
Vector<String> sentence = new Vector<>();
while (st.hasMoreTokens()) sentence.addElement(st.nextToken());
if (sentence.size() > 0) {
// fixing a bug in LBJ sentence splitter if needed
if ((!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) && sentence.size() == 1 && res.size() > 0 && (sentence.elementAt(0).equals("\"") || sentence.elementAt(0).equals("''") || sentence.elementAt(0).equals("'")))
res.elementAt(res.size() - 1).add(sentence.elementAt(0));
else
res.addElement(sentence);
}
}
}
return res;
}
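One difference worth noting between the two tokenization paths in this listing: parseTextRaw splits with String.split("[ \\n\\t]"), which keeps empty strings for consecutive delimiters, while sentenceSplitAndTokenizeText uses StringTokenizer, which skips them. A small self-contained comparison; the sample sentence is made up.
import java.util.Arrays;
import java.util.StringTokenizer;

public class WhitespaceSplitComparison {
    public static void main(String[] args) {
        String sentence = "John  Smith\tvisited  Chicago .";

        // parseTextRaw tokenizes with String.split, which keeps empty strings
        // for consecutive delimiters (only trailing empties are dropped).
        String[] viaSplit = sentence.split("[ \\n\\t]");
        System.out.println(Arrays.toString(viaSplit));
        // -> [John, , Smith, visited, , Chicago, .]

        // sentenceSplitAndTokenizeText uses StringTokenizer, which silently
        // skips runs of delimiters, so no empty tokens appear.
        StringTokenizer st = new StringTokenizer(sentence, " \n\t");
        while (st.hasMoreTokens()) System.out.print(st.nextToken() + "|");
        System.out.println();
        // -> John|Smith|visited|Chicago|.|
    }
}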