use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class TestBrownClusterFeatureExtractor method test.
/**
 * Collects Brown-cluster word features for every token of a fixed sentence from three
 * extractors (the shared {@code instance1000}, one named "bllip" and one named "wiki"), then
 * compares the sorted, comma-joined feature strings with {@code expectedOutput}.
 */
@Test
public final void test() {
    int[] prefixLengths = new int[] { 4, 6, 10, 20 };
    BrownClusterFeatureExtractor sharedExtractor = BrownClusterFeatureExtractor.instance1000;

    // Build the two custom extractors; a construction failure fails the test immediately.
    BrownClusterFeatureExtractor bllipExtractor = null;
    BrownClusterFeatureExtractor wikiExtractor = null;
    try {
        bllipExtractor = new BrownClusterFeatureExtractor("bllip", "brownBllipClusters", prefixLengths);
        wikiExtractor = new BrownClusterFeatureExtractor("wiki", "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt", prefixLengths);
    } catch (EdisonException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }

    TokenizerTextAnnotationBuilder annotationBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = annotationBuilder.createTextAnnotation("test", "test",
            "This test sentence has Joynt and Lieberknecht and Fibonnaci in it "
                    + "just to exercise possible brown cluster hits in resources used by NER.");

    // Gather the features of all three extractors, token by token, into one de-duplicated set.
    Set<Feature> collected = new HashSet<>();
    for (int tokenId = 0; tokenId < ta.size(); tokenId++) {
        try {
            collected.addAll(sharedExtractor.getWordFeatures(ta, tokenId));
            collected.addAll(bllipExtractor.getWordFeatures(ta, tokenId));
            collected.addAll(wikiExtractor.getWordFeatures(ta, tokenId));
        } catch (EdisonException e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
    }

    assertTrue(ta.hasView(ViewNames.BROWN_CLUSTERS + "_wiki"));

    // Sort the string forms so the comparison does not depend on set iteration order.
    String[] sortedFeatures = new String[collected.size()];
    int next = 0;
    for (Feature feature : collected) {
        sortedFeatures[next] = feature.toString();
        next++;
    }
    Arrays.sort(sortedFeatures);

    assertEquals(expectedOutput, StringUtils.join(",", sortedFeatures));
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class TemporalNormalizerBenchmark method testTemporalChunker.
/**
 * Normalizes the dataset using our chunker for temporal phrase extraction and writes one
 * {@code .tml} file per document.
 *
 * <p>Every text in {@code testText} is first tokenized and POS-annotated. Then, for each
 * document, its creation time from {@code DCTs} is registered with {@code tca}, the view is
 * added, the result is written through {@code write2Text}, and the collected timexes are
 * deleted before the next document.
 *
 * @param outputFolder folder, relative to the working directory, that receives the
 *        {@code <docID>.tml} files; it is created (including missing parents) when absent
 * @param verbose when {@code true}, prints progress, the extracted TIMEX strings and timings
 * @throws Exception if any step that is not handled locally fails
 */
public void testTemporalChunker(String outputFolder, boolean verbose) throws Exception {
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(false, false));
    ResourceManager nerRm = new TemporalChunkerConfigurator().getDefaultConfig();
    // NOTE(review): the result of this classpath check is discarded — confirm it is still needed.
    IOUtilities.existsInClasspath(TemporalChunkerAnnotator.class, nerRm.getString("modelDirPath"));
    java.util.logging.Logger.getLogger("HeidelTimeStandalone").setLevel(Level.OFF);

    // Phase 1: tokenize and POS-tag every document up front.
    List<TextAnnotation> taList = new ArrayList<>();
    long preprocessTime = System.currentTimeMillis(); // start of preprocessing
    POSAnnotator annotator = new POSAnnotator();
    for (int j = 0; j < testText.size(); j++) {
        TextAnnotation ta = tab.createTextAnnotation("corpus", "id", testText.get(j));
        try {
            annotator.getView(ta);
        } catch (AnnotatorException e) {
            fail("AnnotatorException thrown!\n" + e.getMessage());
        }
        taList.add(ta);
    }
    if (verbose) {
        System.out.println("Start");
    }

    // Phase 2: temporal chunking and output.
    long startTime = System.currentTimeMillis();
    File outDir = new File(outputFolder);
    // mkdirs() also creates missing parent folders; report (but tolerate) a failure.
    if (!outDir.exists() && !outDir.mkdirs()) {
        System.err.println("Could not create output folder: " + outDir);
    }
    for (int j = 0; j < testText.size(); j++) {
        tca.addDocumentCreationTime(DCTs.get(j));
        TextAnnotation ta = taList.get(j);
        try {
            tca.addView(ta);
        } catch (AnnotatorException e) {
            // Concatenating the StackTraceElement[] directly would only print the array's
            // identity, so render the exception and its frames explicitly.
            StringBuilder trace = new StringBuilder(e.toString());
            for (StackTraceElement frame : e.getStackTrace()) {
                trace.append("\n\tat ").append(frame);
            }
            fail("Exception while adding TIMEX3 VIEW " + trace);
        }
        String outputFileName = "./" + outputFolder + "/" + docIDs.get(j) + ".tml";
        if (verbose) {
            System.out.println(docIDs.get(j));
            for (TimexChunk tc : tca.getTimex()) {
                System.out.println(tc.toTIMEXString());
            }
            System.out.println("\n");
        }
        tca.write2Text(outputFileName, docIDs.get(j), testText.get(j));
        // Clear the collected timexes before handling the next document.
        tca.deleteTimex();
    }

    long endTime = System.currentTimeMillis();
    long totalTime = endTime - startTime;
    if (verbose) {
        System.out.println("Process time: " + totalTime);
        System.out.println("Preprocess + process time: " + (endTime - preprocessTime));
    }
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class HashCollisionReport method main.
/**
 * Reads each regular file in the given directory, tokenizes it, builds the token view, and
 * reports every pair of token constituents whose hash codes collide.
 *
 * @param args {@code args[0]} is the directory containing the files to test against
 * @throws IOException if a file cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length == 0) {
        error("Must pass in the name of a directory with files to test against.");
    }
    File dir = new File(args[0]);
    if (!dir.exists()) {
        error("The directory did not exist : " + dir);
    }
    if (!dir.isDirectory()) {
        error("The path was not a directory : " + dir);
    }
    File[] files = dir.listFiles();
    if (files == null) {
        // listFiles() returns null when the directory cannot be read (e.g. an I/O error).
        error("Could not list the contents of the directory : " + dir);
        // error() is defined outside this view; return explicitly in case it does not exit.
        return;
    }

    // The builder does not depend on the file, so create it once for all files.
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    for (File file : files) {
        if (!file.isFile()) {
            continue;
        }
        String normal = FileUtils.readFileToString(file);
        TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
        List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();

        // For each constituent, check whether its hash code is already used; if it is,
        // report it together with the first constituent seen with that hash code.
        HashMap<Integer, Constituent> hashmap = new HashMap<>();
        for (Constituent c : normalToks) {
            int code = c.hashCode();
            if (hashmap.containsKey(code)) {
                Constituent dup = hashmap.get(code);
                System.err.println(c + " == " + dup);
            } else {
                hashmap.put(code, c);
            }
        }
    }
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testSentenceSplitOnMultipleNewlines.
/**
 * Checks that two sentences are found whether they are separated by a period or by a run of
 * two or more newlines, including when the text also has leading or trailing newlines.
 */
@Test
public void testSentenceSplitOnMultipleNewlines() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));

    // Baseline: an ordinary period-separated pair of sentences.
    String text = "Mary loves Dick. Dick loves Jane.";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", text);
    // Expected value goes first so a failure message reads "expected 2 but was N".
    assertEquals(2, taA.getNumberOfSentences());

    // Two newlines between the sentences.
    text = "Mary loves Dick\n\nDick loves Jane.";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Three newlines between the sentences.
    text = "Mary loves Dick\n\n\nDick loves Jane.";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Four newlines between the sentences, plus trailing newlines.
    text = "Mary loves Dick\n\n\n\nDick loves Jane.\n\n";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Leading and trailing newlines around the same text.
    text = "\n\nMary loves Dick\n\n\n\nDick loves Jane.\n\n";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testLowerCaseAcronymEndWithDot.
/**
 * Tests sentence splitter behavior when there is a lower-cased acronym followed immediately
 * by a dot: the text must still be treated as a single sentence.
 */
@Test
public void testLowerCaseAcronymEndWithDot() {
    TokenizerTextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));
    String text = "I was born in Urbana, Il. in 1992.";
    TextAnnotation ta = tab.createTextAnnotation(text);
    // Expected value goes first so a failure message reads "expected 1 but was N".
    assertEquals(1, ta.getNumberOfSentences());
}
Aggregations