Search in sources :

Example 41 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class ESADatalessTest method getTextAnnotation.

/**
 * Creates a {@link TextAnnotation} for the supplied text.
 *
 * <p>A fresh {@link TokenizerTextAnnotationBuilder} wrapping a new
 * {@link StatefulTokenizer} is constructed on every call.
 *
 * @param text the raw text handed to the builder
 * @return whatever {@code createTextAnnotation(text)} yields for that text
 */
private TextAnnotation getTextAnnotation(String text) {
    return new TokenizerTextAnnotationBuilder(new StatefulTokenizer()).createTextAnnotation(text);
}
Also used : TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Example 42 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class TokenizerValidation method main.

/**
 * Compares the output of {@link StatefulTokenizer} and {@link IllinoisTokenizer} on every
 * English sentence returned by the sentence query, reporting each sentence on which the two
 * tokenizations disagree (first differing token, or differing token counts).
 *
 * <p>Before the database run, both tokenizers are applied to one fixed sample sentence and the
 * resulting tokens are printed.
 *
 * @param args command-line arguments, forwarded to {@code parseArgs}
 * @throws SQLException if obtaining the connection, running the query, or reading a row fails
 */
public static void main(String[] args) throws SQLException {
    parseArgs(args);
    // create both tokenizers
    TextAnnotationBuilder statefulBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotationBuilder ilBuilder = new TokenizerTextAnnotationBuilder(new IllinoisTokenizer());

    // Quick smoke run on a fixed sentence: print the tokens each tokenizer produces.
    String snt = "At .5 or 3.5 decibles.";
    TextAnnotation tr = statefulBuilder.createTextAnnotation("test1", "state", snt);
    List<Constituent> t = tr.getView(ViewNames.TOKENS).getConstituents();
    TextAnnotation tr2 = ilBuilder.createTextAnnotation("test1", "state", snt);
    List<Constituent> t2 = tr2.getView(ViewNames.TOKENS).getConstituents();
    for (Constituent ntt : t) {
        System.out.println(ntt.getSurfaceForm());
    }
    // NOTE(review): the separator goes to stderr while the tokens go to stdout; kept as-is.
    System.err.println();
    for (Constituent ntt : t2) {
        System.out.println(ntt.getSurfaceForm());
    }

    String sentencequery = "SELECT s.id,s.no_trace_string FROM sentence s, document d, subcorpus c \n" + "    where s.document_id = d.id AND d.subcorpus_id = c.id AND c.language_id = 'en'";

    // Literal (target, replacement) pairs that re-attach split-off clitics, applied in order.
    // These are plain substrings, so String.replace is used instead of regex-based replaceAll.
    final String[][] cliticFixes = {
        {" 's ", "'s "},
        {" 'S ", "'S "},
        {" 'm ", "'m "},
        {" 're ", "'re "},
        {" 'nt ", "'nt "},
        {" 've ", "'ve "},
        {" 'd ", "'d "},
        {" 'll ", "'ll "},
        {" do n't ", " don't "}
    };

    int counter = 0;
    int bad = 0;
    // issue the query, process one string at a time. The connection, statement and result set
    // are all closed when this block exits, normally or through an exception.
    try (Connection con = getConnection();
            java.sql.Statement stmt = con.createStatement();
            ResultSet rs1 = stmt.executeQuery(sentencequery)) {
        while (rs1.next()) {
            counter++;
            String sentence = rs1.getString(2);
            // getString returns null for SQL NULL; treat that like an empty sentence.
            if (sentence == null || sentence.isEmpty()) {
                continue;
            }
            for (String[] fix : cliticFixes) {
                sentence = sentence.replace(fix[0], fix[1]);
            }

            TextAnnotation stateful;
            try {
                stateful = statefulBuilder.createTextAnnotation("test1", "state", sentence);
            } catch (ArrayIndexOutOfBoundsException aioobe) {
                System.err.println("Bad Sentence : " + sentence);
                System.exit(1);
                // Not reached at runtime; lets the compiler see 'stateful' as definitely assigned.
                return;
            }
            TextAnnotation il = ilBuilder.createTextAnnotation("test2", "il", sentence);
            List<Constituent> statefulToks = stateful.getView(ViewNames.TOKENS).getConstituents();
            List<Constituent> ilToks = il.getView(ViewNames.TOKENS).getConstituents();

            // Walk both token lists in lock step and stop at the first differing surface form.
            boolean mismatch = false;
            int shared = Math.min(statefulToks.size(), ilToks.size());
            for (int i = 0; i < shared; i++) {
                String stok = statefulToks.get(i).getSurfaceForm();
                String itok = ilToks.get(i).getSurfaceForm();
                if (!stok.equals(itok)) {
                    System.out.println(sentence);
                    System.out.println("stateful:" + stok + " il:" + itok);
                    bad++;
                    mismatch = true;
                    break;
                }
            }
            // All shared positions agree: the tokenizations differ only if one list is longer.
            if (!mismatch && statefulToks.size() != ilToks.size()) {
                System.out.println(sentence);
                System.out.println("stateful size:" + statefulToks.size() + " il size:" + ilToks.size());
                bad++;
            }
            // A disabled check used to sit here: it selected t.word from the token table for
            // the sentence id (skipping part_of_speech '-NONE-') and compared each word with
            // the stateful and Illinois tokens at the same index.
        }
    }
    System.out.println("Done of " + counter + ", " + bad + " were bad.");
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) Connection(java.sql.Connection) ResultSet(java.sql.ResultSet) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Aggregations

TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)42 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)31 StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)29 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)19 Test (org.junit.Test)16 ResourceManager (edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager)12 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)11 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)9 POSAnnotator (edu.illinois.cs.cogcomp.pos.POSAnnotator)9 Properties (java.util.Properties)7 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)6 ChunkerAnnotator (edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator)5 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)5 POSTaggerAnnotator (edu.stanford.nlp.pipeline.POSTaggerAnnotator)5 ParserAnnotator (edu.stanford.nlp.pipeline.ParserAnnotator)5 XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)4 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)4 ChunkerConfigurator (edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator)3 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)3 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)2