use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class HashCollisionReport method main.
/**
 * Read each test file in the directory, tokenize it and create the token view. Then check the
 * token constituents of each file for hash code collisions, printing every colliding pair to
 * standard error.
 *
 * @param args the first element names the directory holding the files to test against
 * @throws IOException if the directory contents cannot be listed; may also propagate from
 *         reading a file
 */
public static void main(String[] args) throws IOException {
    // NOTE(review): the checks below presumably rely on error() terminating the program;
    // confirm against its definition.
    if (args.length == 0) {
        error("Must pass in the name of a directory with files to test against.");
    }
    File dir = new File(args[0]);
    if (!dir.exists()) {
        error("The directory did not exist : " + dir);
    }
    if (!dir.isDirectory()) {
        error("The path was not a directory : " + dir);
    }

    // File.listFiles() returns null (rather than throwing) when an I/O error occurs, so guard
    // against it instead of failing with a NullPointerException in the loop below.
    File[] files = dir.listFiles();
    if (files == null) {
        throw new IOException("Could not list the contents of directory : " + dir);
    }

    // The builder does not depend on the file being processed, so create it once.
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    for (File file : files) {
        if (!file.isFile()) {
            continue;
        }
        String normal = FileUtils.readFileToString(file);
        TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
        List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();

        // Maps each hash code seen so far in this file to the first constituent that produced it.
        HashMap<Integer, Constituent> hashmap = new HashMap<>();
        for (Constituent c : normalToks) {
            int code = c.hashCode();
            // Check whether this hash code is already used; if it is, report the collision.
            Constituent dup = hashmap.get(code);
            if (dup != null) {
                System.err.println(c + " == " + dup);
            } else {
                hashmap.put(code, c);
            }
        }
    }
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testSentenceSplitOnMultipleNewlines.
/**
 * Verify that two sentences separated by a period or by a run of two or more newline
 * characters, optionally with leading and trailing newlines, are always reported as exactly
 * two sentences.
 */
@Test
public void testSentenceSplitOnMultipleNewlines() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));

    // Baseline: sentences separated by ordinary punctuation.
    String text = "Mary loves Dick. Dick loves Jane.";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Two newlines between the sentences, no terminating period on the first.
    text = "Mary loves Dick\n\nDick loves Jane.";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Three newlines between the sentences.
    text = "Mary loves Dick\n\n\nDick loves Jane.";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Four newlines between the sentences plus trailing newlines.
    text = "Mary loves Dick\n\n\n\nDick loves Jane.\n\n";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Leading and trailing newlines around the same text.
    text = "\n\nMary loves Dick\n\n\n\nDick loves Jane.\n\n";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testLowerCaseAcronymEndWithDot.
/**
 * Test sentence splitter behavior when there is a lower-cased acronym followed immediately by
 * a dot: the dot after "Il" must not end the sentence, so the text is a single sentence.
 */
@Test
public void testLowerCaseAcronymEndWithDot() {
    TokenizerTextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));
    String text = "I was born in Urbana, Il. in 1992.";
    TextAnnotation ta = tab.createTextAnnotation(text);
    // Expected value first, so a failure message reports expected/actual correctly.
    assertEquals(1, ta.getNumberOfSentences());
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testDateTokenization.
/**
 * Check that a slash-separated date is kept together as a single token rather than being
 * split at the slashes.
 */
@Test
public void testDateTokenization() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));
    String tmp = "One two, three-four-five 10/23/2018 at 5:20pm one? Of course not! Be well, stranger. Bye-bye!";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", tmp);
    String[] toks = taA.getTokens();
    // The date is expected at token index 8. Expected value goes first so that a failure
    // message reports expected/actual correctly.
    assertEquals("10/23/2018", toks[8]);
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testPeriodContract.
/**
 * Test handling of periods that do not end a sentence: file names with extensions and dotted
 * acronyms must each stay a single token, and the abbreviation "Co." in the middle of a
 * sentence must not split that sentence.
 */
@Test
public void testPeriodContract() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));

    // File names with extensions remain single tokens.
    String tmp = "Info is in tokenizer.pdf or the palatar.MOV file. The next sentence is a structure unto itself.";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", tmp);
    String[] toks = taA.getTokens();
    assertEquals("tokenizer.pdf", toks[3]);
    assertEquals("palatar.MOV", toks[6]);

    // Dotted acronyms remain single tokens, including their trailing period.
    tmp = "I am the man from U.N.C.L.E., but you are not at the U.N. now.";
    taA = bldr.createTextAnnotation("test", "test", tmp);
    toks = taA.getTokens();
    assertEquals("U.N.C.L.E.", toks[5]);
    assertEquals("U.N.", toks[13]);

    // The period after "Co" must not be treated as a sentence boundary.
    tmp = "The head of Inefficient Machine Co. Edward Doolally later relented.";
    taA = bldr.createTextAnnotation("test", "test", tmp);
    assertEquals(1, taA.getNumberOfSentences());
}
Aggregations