Examples with StatefulTokenizer - edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer

Example 21 with StatefulTokenizer

use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.

the class PipelineFactory method buildPipeline.

/**
 * Creates an AnnotatorService that is configured to provide the requested views.
 *
 * Every requested name must be one of the names returned by
 * {@code ViewNames.getAllViewNames()}. A valid name for which this factory has no
 * annotator switch is reported with a warning and otherwise ignored.
 *
 * @param disableCache whether the annotation cache is disabled; {@code null} is treated as
 *        {@code false} (cache stays enabled)
 * @param views the view names to enable, one name per argument
 * @return AnnotatorService with specified NLP components
 * @throws IllegalArgumentException if a name is not among {@code ViewNames.getAllViewNames()}
 * @throws IOException
 * @throws AnnotatorException
 */
public static BasicAnnotatorService buildPipeline(Boolean disableCache, String... views) throws IOException, AnnotatorException {
    List<String> allViewNames = ViewNames.getAllViewNames();
    Map<String, String> nonDefaultValues = new HashMap<>();
    for (String vu : views) {
        if (!allViewNames.contains(vu)) {
            throw new IllegalArgumentException("The view name " + vu + " is not a valid view name. " + "The possible view names are static members of the class `ViewNames`. ");
        }
        // configuration key that switches on the annotator for this view; stays null when
        // the view is valid but has no switch here
        String useKey = null;
        switch(vu) {
            case ViewNames.POS:
                useKey = PipelineConfigurator.USE_POS.key;
                break;
            case ViewNames.LEMMA:
                useKey = PipelineConfigurator.USE_LEMMA.key;
                break;
            case ViewNames.NER_CONLL:
                useKey = PipelineConfigurator.USE_NER_CONLL.key;
                break;
            case ViewNames.NER_ONTONOTES:
                useKey = PipelineConfigurator.USE_NER_ONTONOTES.key;
                break;
            case ViewNames.QUANTITIES:
                useKey = PipelineConfigurator.USE_QUANTIFIER.key;
                break;
            case ViewNames.SHALLOW_PARSE:
                useKey = PipelineConfigurator.USE_SHALLOW_PARSE.key;
                break;
            case ViewNames.SRL_VERB:
                useKey = PipelineConfigurator.USE_SRL_VERB.key;
                break;
            case ViewNames.SRL_NOM:
                useKey = PipelineConfigurator.USE_SRL_NOM.key;
                break;
            case ViewNames.DEPENDENCY_STANFORD:
                useKey = PipelineConfigurator.USE_STANFORD_DEP.key;
                break;
            case ViewNames.DEPENDENCY:
                useKey = PipelineConfigurator.USE_DEP.key;
                break;
            case ViewNames.PARSE_STANFORD:
                useKey = PipelineConfigurator.USE_STANFORD_PARSE.key;
                break;
            case ViewNames.SRL_PREP:
                useKey = PipelineConfigurator.USE_SRL_PREP.key;
                break;
            case ViewNames.SRL_COMMA:
                useKey = PipelineConfigurator.USE_SRL_COMMA.key;
                break;
            case ViewNames.VERB_SENSE:
                useKey = PipelineConfigurator.USE_VERB_SENSE.key;
                break;
            case ViewNames.TRANSLITERATION:
                useKey = PipelineConfigurator.USE_TRANSLITERATION.key;
                break;
            case ViewNames.TIMEX3:
                useKey = PipelineConfigurator.USE_TIMEX3.key;
                break;
            case ViewNames.MENTION:
                useKey = PipelineConfigurator.USE_MENTION.key;
                break;
            case ViewNames.RELATION:
                useKey = PipelineConfigurator.USE_RELATION.key;
                break;
            case ViewNames.DATALESS_ESA:
                useKey = PipelineConfigurator.USE_DATALESS_ESA.key;
                break;
            case ViewNames.DATALESS_W2V:
                useKey = PipelineConfigurator.USE_DATALESS_W2V.key;
                break;
            case ViewNames.QUESTION_TYPE:
                useKey = PipelineConfigurator.USE_QUESTION_TYPER.key;
                break;
            default:
                logger.warn("View name " + vu + " is not supported yet. Look into the readme of the pipeline to see the list of valid annotators. ");
        }
        if (useKey != null) {
            nonDefaultValues.put(useKey, Configurator.TRUE);
        }
    }
    // Boolean.TRUE.equals avoids the NullPointerException that auto-unboxing a null
    // disableCache would raise.
    boolean cacheDisabled = Boolean.TRUE.equals(disableCache);
    nonDefaultValues.put(AnnotatorServiceConfigurator.DISABLE_CACHE.key, cacheDisabled ? Configurator.TRUE : Configurator.FALSE);
    // using the default settings and changing the views
    ResourceManager fullRm = (new PipelineConfigurator()).getConfig(new Stanford331Configurator().getConfig(nonDefaultValues));
    boolean splitOnHyphen = fullRm.getBoolean(PipelineConfigurator.SPLIT_ON_DASH.key);
    TextAnnotationBuilder taBldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(splitOnHyphen, false));
    Map<String, Annotator> annotators = buildAnnotators(fullRm);
    return new SentencePipeline(taBldr, annotators, fullRm);
}

Also used : Stanford331Configurator(edu.illinois.cs.cogcomp.pipeline.common.Stanford331Configurator) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) HashMap(java.util.HashMap) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) PipelineConfigurator(edu.illinois.cs.cogcomp.pipeline.common.PipelineConfigurator) TemporalChunkerAnnotator(edu.illinois.cs.cogcomp.temporal.normalizer.main.TemporalChunkerAnnotator) W2VDatalessAnnotator(edu.illinois.cs.cogcomp.datalessclassification.ta.W2VDatalessAnnotator) RelationAnnotator(org.cogcomp.re.RelationAnnotator) ESADatalessAnnotator(edu.illinois.cs.cogcomp.datalessclassification.ta.ESADatalessAnnotator) POSTaggerAnnotator(edu.stanford.nlp.pipeline.POSTaggerAnnotator) PrepSRLAnnotator(edu.illinois.cs.cogcomp.prepsrl.PrepSRLAnnotator) ParserAnnotator(edu.stanford.nlp.pipeline.ParserAnnotator) MentionAnnotator(org.cogcomp.md.MentionAnnotator) QuestionTypeAnnotator(edu.illinois.cs.cogcomp.question_typer.QuestionTypeAnnotator) ChunkerAnnotator(edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator) DepAnnotator(edu.illinois.cs.cogcomp.depparse.DepAnnotator) POSAnnotator(edu.illinois.cs.cogcomp.pos.POSAnnotator) VerbSenseAnnotator(edu.illinois.cs.cogcomp.verbsense.VerbSenseAnnotator) NERAnnotator(edu.illinois.cs.cogcomp.ner.NERAnnotator) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)

Example 22 with StatefulTokenizer

use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.

the class OntonotesNamedEntityReader method nextAnnotation.

/**
 * Builds an XML-aware text annotation from the document text and attaches a span-label view
 * holding one constituent for every {@code enamex} markup span whose character offsets map
 * onto tokens of the cleaned text. Spans that cannot be mapped are reported on standard error
 * and skipped.
 *
 * @param data the text of the document, markup included.
 * @param docid the id representing the document name.
 * @return the XML text annotation; its text annotation carries the new view.
 * @throws AnnotatorException
 */
private XmlTextAnnotation nextAnnotation(String data, String docid) throws AnnotatorException {
    // we keep everything.
    XmlDocumentProcessor processor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    TokenizerTextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    XmlTextAnnotationMaker maker = new XmlTextAnnotationMaker(builder, processor);

    // read the file and create the annotation.
    XmlTextAnnotation xta = maker.createTextAnnotation(data, "OntoNotes 5.0", docid);
    TextAnnotation ta = xta.getTextAnnotation();
    List<SpanInfo> markupSpans = xta.getXmlMarkup();

    // create the named entity view
    View nerView = new SpanLabelView(VIEW_NAME, ta);
    for (SpanInfo span : markupSpans) {
        if (!"enamex".equalsIgnoreCase(span.label)) {
            continue;
        }
        IntPair charOffsets = span.spanOffsets;
        Pair<String, IntPair> typeAttribute = span.attributes.get("type");
        String neLabel = typeAttribute.getFirst();

        // translate offsets in the original (marked-up) text into offsets in the clean text
        int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
        int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
        int firstToken = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int lastToken = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);

        boolean unmapped = firstToken == -1 || lastToken == -1;
        if (unmapped) {
            // dump what has been collected so far, then describe the offending span
            for (Constituent existing : nerView.getConstituents()) {
                System.err.println(existing);
            }
            System.err.println("Something wonky in \"" + docid + "\", at " + charOffsets + ", " + cleanTextCharStart + " - " + cleanTextCharEnd + " = " + ta.text.substring(cleanTextCharStart, cleanTextCharEnd));
            continue;
        }

        int updatedCount = entityCounts.containsKey(neLabel) ? (entityCounts.get(neLabel) + 1) : 1;
        entityCounts.put(neLabel, updatedCount);

        // constituent token indexing uses one-past-the-end
        nerView.addConstituent(new Constituent(neLabel, nerView.getViewName(), ta, firstToken, lastToken + 1));
    }
    ta.addView(VIEW_NAME, nerView);
    return xta;
}

Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) SpanInfo(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 23 with StatefulTokenizer

use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.

the class XmlTextAnnotationMakerOntonotesTest method testNestedNames.

/**
 * Regression test for nested tags, where the edit offsets used to get messed up: every markup
 * span, once mapped into the cleaned text, must cover one of the reference entity strings.
 */
@Test
public void testNestedNames() {
    String text = "He spoke with Paul <ENAMEX TYPE=\"PERSON\"><ENAMEX TYPE=\"PERSON\" E_OFF=\"1\">Paula</ENAMEX> Zahn</ENAMEX> .";

    // we keep everything.
    XmlDocumentProcessor processor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    TokenizerTextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    XmlTextAnnotationMaker maker = new XmlTextAnnotationMaker(builder, processor);

    // read the file and create the annotation.
    XmlTextAnnotation xta = maker.createTextAnnotation(text, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> markupSpans = xta.getXmlMarkup();
    StringTransformation transformation = xta.getXmlSt();

    for (XmlDocumentProcessor.SpanInfo span : markupSpans) {
        // map the span from original-text offsets to clean-text offsets
        int cleanStart = transformation.computeModifiedOffsetFromOriginal(span.spanOffsets.getFirst());
        int cleanEnd = transformation.computeModifiedOffsetFromOriginal(span.spanOffsets.getSecond());
        String covered = ta.getText().substring(cleanStart, cleanEnd);
        assertTrue(REF_ENTITIES.contains(covered));
    }
}

Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Test(org.junit.Test)

Example 24 with StatefulTokenizer

use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.

the class ESADatalessAnnotator method main.

/**
 * Command-line entry point: reads a test document, annotates it with the ESA dataless view and
 * prints the predicted label ids followed by the corresponding label names.
 *
 * If the config file cannot be read, the failure is logged and the default configuration is
 * used instead. Any failure while reading or annotating the test document is logged and the
 * process exits with status -1.
 *
 * @param args command-line options: {@code config} is the config file path (defaults to
 *        {@code config/project.properties}), {@code testFile} is the document to annotate
 *        (defaults to {@code data/graphicsTestDocument.txt})
 */
public static void main(String[] args) {
    CommandLine cmd = getCMDOpts(args);
    String configFile = cmd.getOptionValue("config", "config/project.properties");
    ResourceManager rm;
    try {
        ResourceManager nonDefaultRm = new ResourceManager(configFile);
        rm = new ESADatalessConfigurator().getConfig(nonDefaultRm);
    } catch (IOException e) {
        // deliberate best-effort fallback, but make it visible instead of silent
        logger.warn("Could not read config file " + configFile + " ... using the default configuration", e);
        rm = new ESADatalessConfigurator().getDefaultConfig();
    }
    String testFile = cmd.getOptionValue("testFile", "data/graphicsTestDocument.txt");
    StringBuilder sb = new StringBuilder();
    String line;
    // NOTE(review): FileReader decodes with the platform default charset — confirm that is
    // the intended encoding for the test documents.
    try (BufferedReader br = new BufferedReader(new FileReader(new File(testFile)))) {
        // join all lines of the document into a single space-separated string
        while ((line = br.readLine()) != null) {
            sb.append(line);
            sb.append(" ");
        }
        String text = sb.toString().trim();
        TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        TextAnnotation ta = taBuilder.createTextAnnotation(text);
        ESADatalessAnnotator datalessAnnotator = new ESADatalessAnnotator(rm);
        datalessAnnotator.addView(ta);
        List<Constituent> annots = ta.getView(ViewNames.DATALESS_ESA).getConstituents();
        System.out.println("Predicted LabelIDs:");
        for (Constituent annot : annots) {
            System.out.println(annot.getLabel());
        }
        Map<String, String> labelNameMap = DatalessAnnotatorUtils.getLabelNameMap(rm.getString(DatalessConfigurator.LabelName_Path.key));
        System.out.println("Predicted Labels:");
        for (Constituent annot : annots) {
            System.out.println(labelNameMap.get(annot.getLabel()));
        }
    } catch (FileNotFoundException e) {
        logger.error("Test File not found at " + testFile + " ... exiting", e);
        System.exit(-1);
    } catch (IOException e) {
        logger.error("IO Error while reading the test file ... exiting", e);
        System.exit(-1);
    } catch (AnnotatorException e) {
        logger.error("Error Annotating the Test Document with the Dataless View ... exiting", e);
        System.exit(-1);
    }
}

Also used : AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) CommandLine(org.apache.commons.cli.CommandLine) ESADatalessConfigurator(edu.illinois.cs.cogcomp.datalessclassification.config.ESADatalessConfigurator) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 25 with StatefulTokenizer

use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.

the class W2VDatalessTest method getTextAnnotation.

/**
 * Tokenizes the given text with a fresh {@code StatefulTokenizer} and wraps the result in a
 * text annotation.
 *
 * @param text the raw text to annotate
 * @return the text annotation built from {@code text}
 */
private TextAnnotation getTextAnnotation(String text) {
    StatefulTokenizer tokenizer = new StatefulTokenizer();
    return new TokenizerTextAnnotationBuilder(tokenizer).createTextAnnotation(text);
}

Also used : TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Aggregations

StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)30 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)29 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)19 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)16 ResourceManager (edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager)12 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)9 POSAnnotator (edu.illinois.cs.cogcomp.pos.POSAnnotator)9 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)7 Properties (java.util.Properties)7 ChunkerAnnotator (edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator)5 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)5 POSTaggerAnnotator (edu.stanford.nlp.pipeline.POSTaggerAnnotator)5 ParserAnnotator (edu.stanford.nlp.pipeline.ParserAnnotator)5 Test (org.junit.Test)5 XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)4 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)4 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)4 ChunkerConfigurator (edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator)3 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)3 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)2