Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
The class BulkTokenizer, method main.
/**
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    parseArgs(args);
    if (file == null) {
        System.err.println("Must provide a file or directory name on the command line.");
        return;
    }
    File[] files;
    File nf = new File(file);
    if (nf.isDirectory())
        files = nf.listFiles();
    else {
        files = new File[1];
        files[0] = nf;
    }
    ArrayList<String> datas = readAllFiles(files);
    BufferedWriter fw = new BufferedWriter(new FileWriter(new File("tokenizerdiffs.out")));
    final TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    if (profile) {
        System.out.println("Starting profiling");
        while (true) {
            for (String data : datas) {
                stab.createTextAnnotation(data);
            }
        }
    } else {
        System.out.println("Starting new annotations");
        long nt = System.currentTimeMillis();
        ArrayList<TextAnnotation> newannotations = new ArrayList<TextAnnotation>();
        final TextAnnotationBuilder ntab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        for (String data : datas) {
            TextAnnotation ta = ntab.createTextAnnotation(data);
            newannotations.add(ta);
        }
        nt = System.currentTimeMillis() - nt;
        System.out.println("Starting old annotations");
        long ot = System.currentTimeMillis();
        ArrayList<TextAnnotation> oldannotations = new ArrayList<TextAnnotation>();
        final TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new IllinoisTokenizer());
        for (String data : datas) {
            TextAnnotation ta = tab.createTextAnnotation(data);
            oldannotations.add(ta);
        }
        ot = System.currentTimeMillis() - ot;
        System.out.println("new way = " + nt + ", old way = " + ot);
        int good = 0, bad = 0;
        for (int i = 0; i < oldannotations.size(); i++) {
            File file = files[i];
            TextAnnotation newone = newannotations.get(i);
            TextAnnotation oldone = oldannotations.get(i);
            if (newone.sentences().equals(oldone.sentences())) {
                good++;
            } else {
                bad++;
                fw.write("-" + file + "\n");
                if (verbose) {
                    List<Sentence> newsentences = newone.sentences();
                    List<Sentence> oldsentences = oldone.sentences();
                    int max = newsentences.size() > oldsentences.size() ? newsentences.size() : oldsentences.size();
                    boolean sentencewritten = false;
                    for (int j = 0; j < max; j++) {
                        String news = newsentences.size() > j ? newsentences.get(j).toString() : "???";
                        String olds = oldsentences.size() > j ? oldsentences.get(j).toString() : "???";
                        if (!compareSentences(olds, news)) {
                            if (!sentencewritten) {
                                sentencewritten = true;
                                fw.write("-" + file + "\n");
                                fw.write(newone.toString() + "\n");
                            }
                            fw.write(" new : " + news + "\n old : " + olds + "\n");
                        }
                    }
                }
            }
        }
        fw.close();
        System.out.println(good + " correct, " + bad + " wrong.");
    }
}
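All of these examples share the same entry point: wrap a StatefulTokenizer in a TokenizerTextAnnotationBuilder and call createTextAnnotation. Below is a minimal, self-contained sketch of just that step; the class name and sample sentence are invented, and the builder's package name is assumed from the cogcomp-nlp module layout.

import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer;
import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder;

public class StatefulTokenizerSketch {
    public static void main(String[] args) {
        // The builder handles sentence splitting and tokenization, and returns a
        // TextAnnotation with the sentence and token views populated.
        TextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        TextAnnotation ta = builder.createTextAnnotation("corpus", "doc0",
                "John Smith visited Chicago. He returned on Friday.");
        System.out.println(ta.getTokens().length + " tokens in " + ta.sentences().size() + " sentences.");
    }
}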
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
The class ACERelationTester, method testRandomText.
public static void testRandomText(String text) {
    String corpus = "";
    String textId = "";
    TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = stab.createTextAnnotation(corpus, textId, text);
    try {
        POSAnnotator pos_annotator = new POSAnnotator();
        ChunkerAnnotator chunker = new ChunkerAnnotator(true);
        chunker.initialize(new ChunkerConfigurator().getDefaultConfig());
        Properties stanfordProps = new Properties();
        stanfordProps.put("annotators", "pos, parse");
        stanfordProps.put("parse.originalDependencies", true);
        stanfordProps.put("parse.maxlen", Stanford331Configurator.STFRD_MAX_SENTENCE_LENGTH);
        stanfordProps.put("parse.maxtime", Stanford331Configurator.STFRD_TIME_PER_SENTENCE);
        POSTaggerAnnotator posAnnotator = new POSTaggerAnnotator("pos", stanfordProps);
        ParserAnnotator parseAnnotator = new ParserAnnotator("parse", stanfordProps);
        StanfordDepHandler stanfordDepHandler = new StanfordDepHandler(posAnnotator, parseAnnotator);
        MentionAnnotator mentionAnnotator = new MentionAnnotator("ACE_TYPE");
        RelationAnnotator relationAnnotator = new RelationAnnotator();
        ta.addView(pos_annotator);
        stanfordDepHandler.addView(ta);
        chunker.addView(ta);
        mentionAnnotator.addView(ta);
        relationAnnotator.addView(ta);
        for (Relation r : ta.getView(ViewNames.RELATION).getRelations()) {
            IOHelper.printRelation(r);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
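IOHelper.printRelation is internal to this test harness; a hypothetical stand-in that relies only on the public Relation and Constituent API might look like the following (the class and method names below are made up).

import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Relation;

final class RelationPrinter {
    // Print one extracted relation as label(sourceMention, targetMention).
    static void print(Relation r) {
        Constituent source = r.getSource();
        Constituent target = r.getTarget();
        System.out.println(r.getRelationName() + "(" + source.getSurfaceForm() + ", " + target.getSurfaceForm() + ")");
    }
}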
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
The class Demo, method main.
public static void main(String[] args) throws IOException, AnnotatorException {
    Options options = new Options();
    Option inputtext = new Option("t", "text", true, "input text to be processed");
    inputtext.setRequired(false);
    options.addOption(inputtext);
    CommandLineParser parser = new DefaultParser();
    HelpFormatter formatter = new HelpFormatter();
    try {
        CommandLine cmd = parser.parse(options, args);
        String defaultText = "The flu season is winding down, and it has killed 105 children so far - about the average toll.\n" + "\n"
                + "The season started about a month earlier than usual, sparking concerns it might turn into the worst in "
                + "a decade. It ended up being very hard on the elderly, but was moderately severe overall, according to "
                + "the Centers for Disease Control and Prevention.\n" + "\n"
                + "Six of the pediatric deaths were reported in the last week, and it's possible there will be more, said "
                + "the CDC's Dr. Michael Jhung said Friday.\n" + "\n"
                + "Roughly 100 children die in an average flu season. One exception was the swine flu pandemic of "
                + "2009-2010, when 348 children died.\n" + "\n"
                + "The CDC recommends that all children ages 6 months and older be vaccinated against flu each season, "
                + "though only about half get a flu shot or nasal spray.\n" + "\n"
                + "All but four of the children who died were old enough to be vaccinated, but 90 percent of them did "
                + "not get vaccinated, CDC officials said.\n" + "\n"
                + "This year's vaccine was considered effective in children, though it didn't work very well in older "
                + "people. And the dominant flu strain early in the season was one that tends to "
                + "cause more severe illness.\n" + "\n"
                + "The government only does a national flu death count for children. But it does track hospitalization "
                + "rates for people 65 and older, and those statistics have been grim.\n" + "\n"
                + "In that group, 177 out of every 100,000 were hospitalized with flu-related illness in the past "
                + "several months. That's more than 2 1/2 times higher than any other recent season.\n" + "\n"
                + "This flu season started in early December, a month earlier than usual, and peaked by the end "
                + "of year. Since then, flu reports have been dropping off throughout the country.\n" + "\n"
                + "\"We appear to be getting close to the end of flu season,\" Jhung said.";
        String text = cmd.getOptionValue("text", defaultText);
        TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        TextAnnotation ta = tab.createTextAnnotation("corpus", "id", text);
        POSAnnotator annotator = new POSAnnotator();
        try {
            annotator.getView(ta);
        } catch (AnnotatorException e) {
            fail("AnnotatorException thrown!\n" + e.getMessage());
        }
        Properties rmProps = new TemporalChunkerConfigurator().getDefaultConfig().getProperties();
        rmProps.setProperty("useHeidelTime", "False");
        TemporalChunkerAnnotator tca = new TemporalChunkerAnnotator(new ResourceManager(rmProps));
        tca.addView(ta);
        View temporalViews = ta.getView(ViewNames.TIMEX3);
        List<Constituent> constituents = temporalViews.getConstituents();
        System.out.printf("There're %d time expressions (TIMEX) in total.\n", constituents.size());
        for (Constituent c : constituents) {
            System.out.printf("TIMEX #%d: Text=%s, Type=%s, Value=%s\n", constituents.indexOf(c), c, c.getAttribute("type"), c.getAttribute("value"));
        }
    } catch (ParseException e) {
        System.out.println(e.getMessage());
        formatter.printHelp("Temporal Normalizer Demo", options);
        System.exit(1);
    }
}
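Beyond the type and value attributes printed above, each TIMEX3 constituent also exposes its token span and character offsets. The loop below is a short hedged continuation of the snippet, reusing the ta it builds:

// Continues from the snippet above: report where each time expression sits in the text.
for (Constituent c : ta.getView(ViewNames.TIMEX3).getConstituents()) {
    System.out.printf("%s tokens [%d, %d) chars [%d, %d) type=%s value=%s%n",
            c.getSurfaceForm(), c.getStartSpan(), c.getEndSpan(),
            c.getStartCharOffset(), c.getEndCharOffset(),
            c.getAttribute("type"), c.getAttribute("value"));
}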
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
The class SemEvalMentionReader, method readTrainFile.
public List<TextAnnotation> readTrainFile(String fileName, String mode) {
    List<String> sentences = new ArrayList<>();
    List<String> types = new ArrayList<>();
    List<TextAnnotation> ret = new ArrayList<>();
    int counter = 0;
    if (mode.equals("TRAIN")) {
        try (BufferedReader br = new BufferedReader(new FileReader(fileName))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (counter % 4 == 0) {
                    String curSentence = line.split("\t")[1];
                    if (curSentence.charAt(0) == '"') {
                        curSentence = curSentence.substring(1);
                    }
                    if (curSentence.charAt(curSentence.length() - 1) == '"') {
                        curSentence = curSentence.substring(0, curSentence.length() - 1);
                    }
                    sentences.add(curSentence);
                }
                if (counter % 4 == 1) {
                    types.add(line);
                }
                counter++;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    if (mode.equals("TEST")) {
        try (BufferedReader br = new BufferedReader(new FileReader(fileName))) {
            String line;
            while ((line = br.readLine()) != null) {
                String curSentence = line.split("\t")[1];
                if (curSentence.charAt(0) == '"') {
                    curSentence = curSentence.substring(1);
                }
                if (curSentence.charAt(curSentence.length() - 1) == '"') {
                    curSentence = curSentence.substring(0, curSentence.length() - 1);
                }
                sentences.add(curSentence);
                types.add("UNKNOWN");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    for (int i = 0; i < sentences.size(); i++) {
        List<String[]> tokens = new ArrayList<>();
        String sentence = sentences.get(i);
        String type = types.get(i);
        Pair<String[], IntPair[]> tokenizedSentence = statefulTokenizer.tokenizeSentence(sentence);
        List<String> curTokens = new LinkedList<>(Arrays.asList(tokenizedSentence.getFirst()));
        int firstArgStart = 0;
        int firstArgEnd = 0;
        int secondArgStart = 0;
        int secondArgEnd = 0;
        for (int j = 0; j < curTokens.size(); j++) {
            if (curTokens.get(j).equals("<") && curTokens.get(j + 1).equals("e1") && curTokens.get(j + 2).equals(">")) {
                firstArgStart = j;
                for (int k = j; k < j + 3; k++) {
                    curTokens.remove(j);
                }
            }
            if (curTokens.get(j).equals("<") && curTokens.get(j + 1).equals("/") && curTokens.get(j + 2).equals("e1") && curTokens.get(j + 3).equals(">")) {
                firstArgEnd = j;
                for (int k = j; k < j + 4; k++) {
                    curTokens.remove(j);
                }
            }
            if (curTokens.get(j).equals("<") && curTokens.get(j + 1).equals("e2") && curTokens.get(j + 2).equals(">")) {
                secondArgStart = j;
                for (int k = j; k < j + 3; k++) {
                    curTokens.remove(j);
                }
            }
            if (curTokens.get(j).equals("<") && curTokens.get(j + 1).equals("/") && curTokens.get(j + 2).equals("e2") && curTokens.get(j + 3).equals(">")) {
                secondArgEnd = j;
                for (int k = j; k < j + 4; k++) {
                    curTokens.remove(j);
                }
            }
        }
        tokens.add(curTokens.toArray(new String[curTokens.size()]));
        TextAnnotation ta = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(tokens);
        try {
            ta.addView(_posAnnotator);
            __chunker.addView(ta);
            __stanfordDep.addView(ta);
            __mentionAnnotator.addView(ta);
            View annotatedTokenView = new SpanLabelView("RE_ANNOTATED", ta);
            for (Constituent co : ta.getView(ViewNames.TOKENS).getConstituents()) {
                Constituent c = co.cloneForNewView("RE_ANNOTATED");
                for (String s : co.getAttributeKeys()) {
                    c.addAttribute(s, co.getAttribute(s));
                }
                c.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(_wordnet, c));
                c.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(_wordnet, c));
                annotatedTokenView.addConstituent(c);
            }
            ta.addView("RE_ANNOTATED", annotatedTokenView);
        } catch (Exception e) {
            e.printStackTrace();
        }
        SpanLabelView mentionView = new SpanLabelView("MENTIONS", "MENTIONS", ta, 1.0f);
        Constituent firstArg = new Constituent("MENTION", 1.0f, "MENTIONS", ta, firstArgStart, firstArgEnd);
        Constituent secondArg = new Constituent("MENTION", 1.0f, "MENTIONS", ta, secondArgStart, secondArgEnd);
        firstArg.addAttribute("GAZ", _gazetteers.annotatePhrase(firstArg));
        secondArg.addAttribute("GAZ", _gazetteers.annotatePhrase(secondArg));
        View annotatedMentionView = ta.getView(ViewNames.MENTION);
        List<Constituent> firstMentions = annotatedMentionView.getConstituentsCoveringToken(firstArg.getStartSpan());
        List<Constituent> secondMentions = annotatedMentionView.getConstituentsCoveringToken(secondArg.getStartSpan());
        if (firstMentions.size() == 0) {
            firstArg.addAttribute("EntityType", "UNKNOWN");
        } else {
            firstArg.addAttribute("EntityType", firstMentions.get(0).getAttribute("EntityType"));
        }
        if (secondMentions.size() == 0) {
            secondArg.addAttribute("EntityType", "UNKNOWN");
        } else {
            secondArg.addAttribute("EntityType", secondMentions.get(0).getAttribute("EntityType"));
        }
        mentionView.addConstituent(firstArg);
        mentionView.addConstituent(secondArg);
        if (type.contains("e1,e2")) {
            Relation relation = new Relation(type.split("[(]")[0], firstArg, secondArg, 1.0f);
            relation.addAttribute("RelationSubtype", relation.getRelationName());
            mentionView.addRelation(relation);
        } else if (type.contains("e2,e1")) {
            Relation relation = new Relation(type.split("[(]")[0], secondArg, firstArg, 1.0f);
            relation.addAttribute("RelationSubtype", relation.getRelationName());
            mentionView.addRelation(relation);
        } else {
            Relation relationLeft = new Relation(type, secondArg, firstArg, 1.0f);
            Relation relationRight = new Relation(type, firstArg, secondArg, 1.0f);
            relationLeft.addAttribute("RelationSubtype", relationLeft.getRelationName());
            relationRight.addAttribute("RelationSubtype", relationRight.getRelationName());
            mentionView.addRelation(relationLeft);
            mentionView.addRelation(relationRight);
        }
        ta.addView("MENTIONS", mentionView);
        ret.add(ta);
    }
    return ret;
}
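The reader above keeps only the token strings from tokenizeSentence and discards the second half of the returned Pair, which carries each token's character offsets. Below is a minimal sketch of reading both halves; the class name and sample sentence are invented:

import edu.illinois.cs.cogcomp.core.datastructures.IntPair;
import edu.illinois.cs.cogcomp.core.datastructures.Pair;
import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer;

public class TokenizeSentenceSketch {
    public static void main(String[] args) {
        StatefulTokenizer tokenizer = new StatefulTokenizer();
        // First element: surface tokens; second element: [start, end) character offsets.
        Pair<String[], IntPair[]> result = tokenizer.tokenizeSentence("The <e1> lawsuit </e1> was dismissed.");
        String[] tokenStrings = result.getFirst();
        IntPair[] offsets = result.getSecond();
        for (int i = 0; i < tokenStrings.length; i++) {
            System.out.println(tokenStrings[i] + " -> chars [" + offsets[i].getFirst() + ", " + offsets[i].getSecond() + ")");
        }
    }
}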
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
The class NerOntonotesTest, method testOntonotesNer.
@Test
public void testOntonotesNer() {
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    Properties props = new Properties();
    NERAnnotator nerOntonotes = NerAnnotatorManager.buildNerAnnotator(new ResourceManager(props), ViewNames.NER_ONTONOTES);
    TextAnnotation taOnto = tab.createTextAnnotation("", "", TEST_INPUT);
    try {
        nerOntonotes.getView(taOnto);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    View v = taOnto.getView(nerOntonotes.getViewName());
    assertEquals(3, v.getConstituents().size());
}