Search in sources :

Example 11 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class TestBrownClusterFeatureExtractor method test.

/**
 * Extracts Brown-cluster word features for every token of one sample sentence using three
 * extractors (the shared {@code instance1000}, a "bllip" one and a "wiki" one), checks that
 * the "_wiki" Brown-cluster view was added to the annotation, and compares the sorted,
 * comma-joined feature strings against {@code expectedOutput}.
 */
@Test
public final void test() {
    final int[] prefixLengths = { 4, 6, 10, 20 };
    final BrownClusterFeatureExtractor defaultExtractor = BrownClusterFeatureExtractor.instance1000;

    // Build the two custom extractors; any construction failure fails the test immediately.
    BrownClusterFeatureExtractor bllipExtractor = null;
    BrownClusterFeatureExtractor wikiExtractor = null;
    try {
        bllipExtractor = new BrownClusterFeatureExtractor("bllip", "brownBllipClusters", prefixLengths);
        wikiExtractor = new BrownClusterFeatureExtractor("wiki", "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt", prefixLengths);
    } catch (EdisonException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }

    final String sentence = "This test sentence has Joynt and Lieberknecht and Fibonnaci in it " + "just to exercise possible brown cluster hits in resources used by NER.";
    final TokenizerTextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    final TextAnnotation ta = builder.createTextAnnotation("test", "test", sentence);

    // Pool the features of all three extractors for each token into one set.
    final Set<Feature> collected = new HashSet<>();
    try {
        for (int tokenId = 0; tokenId < ta.size(); tokenId++) {
            collected.addAll(defaultExtractor.getWordFeatures(ta, tokenId));
            collected.addAll(bllipExtractor.getWordFeatures(ta, tokenId));
            collected.addAll(wikiExtractor.getWordFeatures(ta, tokenId));
        }
    } catch (EdisonException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }

    assertTrue(ta.hasView(ViewNames.BROWN_CLUSTERS + "_wiki"));

    // Sort the string forms so the comparison does not depend on set iteration order.
    final String[] sortedFeatures = new String[collected.size()];
    int next = 0;
    for (Feature feature : collected) {
        sortedFeatures[next] = feature.toString();
        next++;
    }
    Arrays.sort(sortedFeatures);
    assertEquals(expectedOutput, StringUtils.join(",", sortedFeatures));
}
Also used : TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) EdisonException(edu.illinois.cs.cogcomp.edison.utilities.EdisonException) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Feature(edu.illinois.cs.cogcomp.edison.features.Feature) BrownClusterFeatureExtractor(edu.illinois.cs.cogcomp.edison.features.factory.BrownClusterFeatureExtractor) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 12 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class TemporalNormalizerBenchmark method testTemporalChunker.

/**
 * Normalize the dataset using our Chunker for temporal phrases extraction.
 * <p>
 * Every entry of {@code testText} is tokenized and POS-annotated first; then, for each
 * document, its creation time is registered on {@code tca}, the chunker view is added, and the
 * result is written to {@code ./<outputFolder>/<docID>.tml}.
 *
 * @param outputFolder directory that receives one {@code .tml} file per document; it is
 *        created (including missing parent directories) when it does not exist
 * @param verbose when {@code true}, progress, the extracted TIMEX strings and timing
 *        information are printed to standard output
 * @throws Exception if any of the underlying annotation or output steps throws
 */
public void testTemporalChunker(String outputFolder, boolean verbose) throws Exception {
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(false, false));
    ResourceManager nerRm = new TemporalChunkerConfigurator().getDefaultConfig();
    // NOTE(review): the result of this classpath check is discarded in the original code;
    // kept as-is in case the call is relied on for a side effect -- confirm.
    IOUtilities.existsInClasspath(TemporalChunkerAnnotator.class, nerRm.getString("modelDirPath"));
    java.util.logging.Logger.getLogger("HeidelTimeStandalone").setLevel(Level.OFF);

    // Phase 1: tokenize and POS-annotate every document up front.
    List<TextAnnotation> taList = new ArrayList<>();
    long preprocessTime = System.currentTimeMillis();
    POSAnnotator annotator = new POSAnnotator();
    for (int j = 0; j < testText.size(); j++) {
        TextAnnotation ta = tab.createTextAnnotation("corpus", "id", testText.get(j));
        try {
            annotator.getView(ta);
        } catch (AnnotatorException e) {
            fail("AnnotatorException thrown!\n" + e.getMessage());
        }
        taList.add(ta);
    }
    if (verbose) {
        System.out.println("Start");
    }

    // Phase 2: run the temporal chunker and write one output file per document.
    long startTime = System.currentTimeMillis();
    File outDir = new File(outputFolder);
    if (!outDir.exists()) {
        // mkdirs() rather than mkdir() so that a nested output folder is created as well.
        outDir.mkdirs();
    }
    for (int j = 0; j < testText.size(); j++) {
        tca.addDocumentCreationTime(DCTs.get(j));
        TextAnnotation ta = taList.get(j);
        try {
            tca.addView(ta);
        } catch (AnnotatorException e) {
            // Report the exception message; concatenating e.getStackTrace() would only
            // print the array's identity string, not the trace.
            fail("Exception while adding TIMEX3 VIEW " + e.getMessage());
        }
        String outputFileName = "./" + outputFolder + "/" + docIDs.get(j) + ".tml";
        if (verbose) {
            System.out.println(docIDs.get(j));
            for (TimexChunk tc : tca.getTimex()) {
                System.out.println(tc.toTIMEXString());
            }
            System.out.println("\n");
        }
        tca.write2Text(outputFileName, docIDs.get(j), testText.get(j));
        // Clear the chunks collected for this document before processing the next one.
        tca.deleteTimex();
    }
    long endTime = System.currentTimeMillis();
    long totalTime = endTime - startTime;
    if (verbose) {
        System.out.println("Process time: " + totalTime);
        System.out.println("Preprocess + process time: " + (endTime - preprocessTime));
    }
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TimexChunk(edu.illinois.cs.cogcomp.temporal.normalizer.main.timex2interval.TimexChunk) POSAnnotator(edu.illinois.cs.cogcomp.pos.POSAnnotator) ArrayList(java.util.ArrayList) AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Example 13 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class HashCollisionReport method main.

/**
 * Read each test file in the directory, tokenize and create the token view. Then check for
 * hash code collisions among the token constituents, reporting each colliding pair on
 * standard error.
 *
 * @param args a single argument: the path of the directory whose files are checked
 * @throws IOException if a file cannot be read
 */
public static void main(String[] args) throws IOException {
    // error() is declared elsewhere in this class; return explicitly after each call in case
    // it reports the problem without terminating the program.
    if (args.length == 0) {
        error("Must pass in the name of a directory with files to test against.");
        return;
    }
    File dir = new File(args[0]);
    if (!dir.exists()) {
        error("The directory did not exist : " + dir);
        return;
    }
    if (!dir.isDirectory()) {
        error("The path was not a directory : " + dir);
        return;
    }
    File[] files = dir.listFiles();
    if (files == null) {
        // listFiles() returns null when the directory cannot be read (I/O error).
        error("Could not list the contents of the directory : " + dir);
        return;
    }
    for (File file : files) {
        if (file.isFile()) {
            String normal = FileUtils.readFileToString(file);
            TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
            TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
            List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
            HashMap<Integer, Constituent> hashmap = new HashMap<>();
            // For each token, check whether its hash code is already used; if it is,
            // report it. Otherwise remember the token under that hash code.
            for (Constituent c : normalToks) {
                int code = c.hashCode();
                if (hashmap.containsKey(code)) {
                    Constituent dup = hashmap.get(code);
                    System.err.println(c + " == " + dup);
                } else {
                    hashmap.put(code, c);
                }
            }
        }
    }
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) HashMap(java.util.HashMap) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) File(java.io.File) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 14 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testSentenceSplitOnMultipleNewlines.

/**
 * Check that two sentences are detected whether they are separated by a period or by a run
 * of two or more newlines, including when the text also starts or ends with newlines.
 */
@Test
public void testSentenceSplitOnMultipleNewlines() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));

    // Baseline: sentences separated by ordinary punctuation.
    String text = "Mary loves Dick. Dick loves Jane.";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Two newlines between the sentences, no period on the first one.
    text = "Mary loves Dick\n\nDick loves Jane.";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Three newlines between the sentences.
    text = "Mary loves Dick\n\n\nDick loves Jane.";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Four newlines between the sentences plus trailing newlines.
    text = "Mary loves Dick\n\n\n\nDick loves Jane.\n\n";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());

    // Leading and trailing newlines around the same content.
    text = "\n\nMary loves Dick\n\n\n\nDick loves Jane.\n\n";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(2, taA.getNumberOfSentences());
}
Also used : TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Test(org.junit.Test)

Example 15 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testLowerCaseAcronymEndWithDot.

/**
 * Test sentence splitter behavior when there is a lower-cased acronym followed immediately
 * by a dot: the text must still be treated as a single sentence.
 */
@Test
public void testLowerCaseAcronymEndWithDot() {
    TokenizerTextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));
    String text = "I was born in Urbana, Il. in 1992.";
    TextAnnotation ta = tab.createTextAnnotation(text);
    // Expected value first, so a failure message reports expected/actual correctly.
    assertEquals(1, ta.getNumberOfSentences());
}
Also used : TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Test(org.junit.Test)

Aggregations

TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder): 42
TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 31
StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer): 29
TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder): 19
Test (org.junit.Test): 16
ResourceManager (edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager): 12
Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent): 11
AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException): 9
POSAnnotator (edu.illinois.cs.cogcomp.pos.POSAnnotator): 9
Properties (java.util.Properties): 7
IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 6
ChunkerAnnotator (edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator): 5
View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View): 5
POSTaggerAnnotator (edu.stanford.nlp.pipeline.POSTaggerAnnotator): 5
ParserAnnotator (edu.stanford.nlp.pipeline.ParserAnnotator): 5
XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker): 4
XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor): 4
ChunkerConfigurator (edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator): 3
XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation): 3
SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView): 2