Examples with IllinoisTokenizer - edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer

Example 1 with IllinoisTokenizer

use of edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer in project cogcomp-nlp by CogComp.

From the class TestWriteSVMLightFormat, method runBeforeAllTests.

/**
 * One-time fixture setup for the test class.
 *
 * <p>Selects the {@code instance1000} Brown cluster feature extractor, creates a
 * text annotation builder backed by an {@link IllinoisTokenizer}, annotates two
 * fixed sentences, and collects the extractor's word features for every index
 * in {@code [0, ta.size())} and {@code [0, ta2.size())} into {@code feats} and
 * {@code feats2} respectively. An {@code EdisonException} raised during
 * extraction is printed and turned into a test failure.
 */
@BeforeClass
public static void runBeforeAllTests() {
    // "brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"
    bcfex = BrownClusterFeatureExtractor.instance1000;
    taBldr = new TokenizerTextAnnotationBuilder(new IllinoisTokenizer());

    ta = taBldr.createTextAnnotation("test", "test", "This test sentence has Joynt and Lieberknecht and Fibonnaci in it " + "just to exercise possible brown cluster hits in resources used by NER.");
    ta2 = taBldr.createTextAnnotation("test", "test2", "Why Joynt should have anything to do beyond JFK and Jimmy Carter " + "is beyond your oh-so-humble British writer.");

    // Features of the first annotation.
    feats = new ArrayList<>();
    int position = 0;
    while (position < ta.size()) {
        try {
            feats.addAll(bcfex.getWordFeatures(ta, position));
        } catch (EdisonException ex) {
            ex.printStackTrace();
            fail(ex.getMessage());
        }
        ++position;
    }

    // Features of the second annotation.
    feats2 = new ArrayList<>();
    position = 0;
    while (position < ta2.size()) {
        try {
            feats2.addAll(bcfex.getWordFeatures(ta2, position));
        } catch (EdisonException ex) {
            ex.printStackTrace();
            fail(ex.getMessage());
        }
        ++position;
    }
}

Also used : IllinoisTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) BeforeClass(org.junit.BeforeClass)

Example 2 with IllinoisTokenizer

use of edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer in project cogcomp-nlp by CogComp.

From the class BulkTokenizer, method main.

/**
 * Compares the output of {@link StatefulTokenizer} ("new") against
 * {@link IllinoisTokenizer} ("old") on one file or on every file of a directory.
 *
 * <p>In normal mode each input is annotated with both tokenizers, the elapsed
 * wall-clock time of each pass is printed, and every input whose sentence lists
 * differ is recorded in {@code tokenizerdiffs.out} (with the differing sentences
 * when {@code verbose} is set). When {@code profile} is set, the inputs are instead
 * annotated with the stateful tokenizer in an endless loop and no diff file is
 * opened.
 *
 * @param args command-line arguments, interpreted by {@code parseArgs}
 * @throws IOException if {@code tokenizerdiffs.out} cannot be written.
 *         NOTE(review): {@code readAllFiles} presumably can throw it too - confirm.
 */
public static void main(String[] args) throws IOException {
    parseArgs(args);
    if (file == null) {
        System.err.println("Must provide a file or directory name on the command line.");
        return;
    }

    File nf = new File(file);
    File[] files;
    if (nf.isDirectory()) {
        // List the directory that parseArgs selected. The previous code listed
        // args[0], which is the wrong path whenever the name is not the first argument.
        files = nf.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be read.
            System.err.println("Could not list the contents of directory " + nf + ".");
            return;
        }
    } else {
        files = new File[] { nf };
    }
    ArrayList<String> datas = readAllFiles(files);

    if (profile) {
        System.out.println("Starting profiling");
        final TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        // Intentionally endless: keeps the tokenizer busy until the process is killed.
        while (true) {
            for (String data : datas) {
                stab.createTextAnnotation(data);
            }
        }
    } else {
        int good = 0;
        int bad = 0;
        // try-with-resources closes the diff file even if annotation or writing fails.
        try (BufferedWriter fw = new BufferedWriter(new FileWriter(new File("tokenizerdiffs.out")))) {
            System.out.println("Starting new annotations");
            long nt = System.currentTimeMillis();
            List<TextAnnotation> newannotations = new ArrayList<TextAnnotation>();
            final TextAnnotationBuilder ntab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
            for (String data : datas) {
                newannotations.add(ntab.createTextAnnotation(data));
            }
            nt = System.currentTimeMillis() - nt;

            System.out.println("Starting old annotations");
            long ot = System.currentTimeMillis();
            List<TextAnnotation> oldannotations = new ArrayList<TextAnnotation>();
            final TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new IllinoisTokenizer());
            for (String data : datas) {
                oldannotations.add(tab.createTextAnnotation(data));
            }
            ot = System.currentTimeMillis() - ot;
            System.out.println("new way = " + nt + ", old way = " + ot);

            // NOTE(review): assumes readAllFiles returns exactly one entry per element
            // of `files`, in the same order - confirm against readAllFiles.
            for (int i = 0; i < oldannotations.size(); i++) {
                // Named sourceFile so it no longer shadows the `file` field.
                File sourceFile = files[i];
                TextAnnotation newone = newannotations.get(i);
                TextAnnotation oldone = oldannotations.get(i);
                if (newone.sentences().equals(oldone.sentences())) {
                    good++;
                    continue;
                }
                bad++;
                fw.write("-" + sourceFile + "\n");
                if (!verbose) {
                    continue;
                }
                List<Sentence> newsentences = newone.sentences();
                List<Sentence> oldsentences = oldone.sentences();
                int max = Math.max(newsentences.size(), oldsentences.size());
                boolean sentencewritten = false;
                for (int j = 0; j < max; j++) {
                    // "???" stands in for a sentence missing on the shorter side.
                    String news = newsentences.size() > j ? newsentences.get(j).toString() : "???";
                    String olds = oldsentences.size() > j ? oldsentences.get(j).toString() : "???";
                    if (!compareSentences(olds, news)) {
                        if (!sentencewritten) {
                            sentencewritten = true;
                            // NOTE(review): the file name was already written above, so in
                            // verbose mode it appears twice; kept as in the original output.
                            fw.write("-" + sourceFile + "\n");
                            fw.write(newone.toString() + "\n");
                        }
                        fw.write(" new : " + news + "\n old : " + olds + "\n");
                    }
                }
            }
        }
        System.out.println(good + " correct, " + bad + " wrong.");
    }
}

Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) FileWriter(java.io.FileWriter) IllinoisTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer) ArrayList(java.util.ArrayList) BufferedWriter(java.io.BufferedWriter) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) File(java.io.File) Sentence(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)

Example 3 with IllinoisTokenizer

use of edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer in project cogcomp-nlp by CogComp.

From the class LlmStringComparator, method initialize.

/**
 * Populates this comparator's collaborators from configuration.
 *
 * <p>The supplied resource manager is passed through
 * {@code SimConfigurator.getConfig} and the resulting configuration provides the
 * LLM entailment threshold and the word-list filter settings. The method then
 * creates the tokenizer, stores the given comparator, and builds the filter, the
 * named-entity aligner, the general aligner and the greedy alignment scorer.
 *
 * @param rm_ caller-supplied configuration handed to {@code SimConfigurator.getConfig}
 * @param comparator comparator stored on this instance and used by the general aligner
 * @throws IOException declared by this method; NOTE(review): presumably raised while
 *         building the word-list filter - confirm
 */
private void initialize(ResourceManager rm_, Comparator<String, EntailmentResult> comparator) throws IOException {
    final ResourceManager config = new SimConfigurator().getConfig(rm_);
    final double entailmentThreshold = config.getDouble(SimConfigurator.LLM_ENTAILMENT_THRESHOLD.key);

    this.tokenizer = new IllinoisTokenizer();
    this.comparator = comparator;
    this.filter = new WordListFilter(config);

    // Both aligners share the same filter; only the comparator differs.
    this.neAligner = new Aligner<String, EntailmentResult>(new NEComparator(), this.filter);
    this.aligner = new Aligner<String, EntailmentResult>(comparator, this.filter);

    this.scorer = new GreedyAlignmentScorer<String>(entailmentThreshold);
}

Also used : SimConfigurator(edu.illinois.cs.cogcomp.config.SimConfigurator) EntailmentResult(edu.illinois.cs.cogcomp.mrcs.dataStructures.EntailmentResult) IllinoisTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer) WordListFilter(edu.illinois.cs.cogcomp.llm.align.WordListFilter) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager)

Aggregations

IllinoisTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer): 3, TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder): 2, TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder): 1, SimConfigurator (edu.illinois.cs.cogcomp.config.SimConfigurator): 1, Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence): 1, TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 1, ResourceManager (edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager): 1, WordListFilter (edu.illinois.cs.cogcomp.llm.align.WordListFilter): 1, EntailmentResult (edu.illinois.cs.cogcomp.mrcs.dataStructures.EntailmentResult): 1, StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer): 1, BufferedWriter (java.io.BufferedWriter): 1, File (java.io.File): 1, FileWriter (java.io.FileWriter): 1, ArrayList (java.util.ArrayList): 1, BeforeClass (org.junit.BeforeClass): 1