Examples with CoreAnnotation - edu.stanford.nlp.ling.CoreAnnotation

Example 1 with CoreAnnotation

Use of edu.stanford.nlp.ling.CoreAnnotation in the CoreNLP project by stanfordnlp.

From the class DocumentMaker, method getStanfordCoreNLP.

/**
 * Returns the pipeline stored in the {@code coreNLP} field, building it on the first call.
 *
 * <p>Once the field is non-null it is returned as-is and {@code props} is not consulted
 * again. Otherwise the annotator list is derived from the coref settings in {@code props}:
 * <ul>
 *   <li>CoNLL input: {@code lemma} (plus {@code ner} for Chinese);</li>
 *   <li>otherwise: {@code parse} or {@code pos}, then {@code lemma, ner}, plus
 *       {@code depparse} when no constituency parse is used.</li>
 * </ul>
 * In both cases {@code coref.mention} is appended unless gold mentions are used, and
 * {@code ner.applyFineGrained} is forced to {@code "false"}.
 *
 * @param props coref configuration; used as the defaults of the pipeline properties
 * @return the cached or newly constructed pipeline
 */
private synchronized StanfordCoreNLP getStanfordCoreNLP(Properties props) {
    if (coreNLP != null) {
        return coreNLP;
    }
    // props only backs pipelineProps as defaults, so the overrides below stay local to this copy.
    Properties pipelineProps = new Properties(props);
    pipelineProps.setProperty("ner.applyFineGrained", "false");

    if (CorefProperties.conll(props)) {
        // Compare locales by value rather than by reference identity.
        String baseAnnotators = Locale.CHINESE.equals(CorefProperties.getLanguage(props)) ? "lemma, ner" : "lemma";
        String mentionAnnotator = CorefProperties.useGoldMentions(props) ? "" : ", coref.mention";
        pipelineProps.setProperty("annotators", baseAnnotators + mentionAnnotator);
        coreNLP = new StanfordCoreNLP(pipelineProps, false);
        return coreNLP;
    }

    boolean constituencyParse = CorefProperties.useConstituencyParse(props);
    String mentionAnnotator = CorefProperties.useGoldMentions(props) ? "" : ", coref.mention";
    String annotators = (constituencyParse ? "parse" : "pos") + ", lemma, ner " + (constituencyParse ? "" : ", depparse") + mentionAnnotator;
    pipelineProps.setProperty("annotators", annotators);
    coreNLP = new StanfordCoreNLP(pipelineProps, false);

    if (constituencyParse) {
        // The first annotator is now known to be the parse annotator.
        // It is possible that this parse annotator needs POS tags to work.
        // If so, we need to add a tagger.
        // Hopefully the annotator cache will save us from doing a ton
        // of extra annotator loading.
        Set<Class<? extends CoreAnnotation>> requirements = coreNLP.requires();
        if (requirements.contains(CoreAnnotations.PartOfSpeechAnnotation.class)) {
            pipelineProps.setProperty("annotators", "pos, " + annotators);
            coreNLP = new StanfordCoreNLP(pipelineProps, false);
        }
    }
    return coreNLP;
}

Also used: TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations), CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations), CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations), CorefProperties (edu.stanford.nlp.coref.CorefProperties), CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation), StanfordCoreNLP (edu.stanford.nlp.pipeline.StanfordCoreNLP)

Example 2 with CoreAnnotation

Use of edu.stanford.nlp.ling.CoreAnnotation in the CoreNLP project by stanfordnlp.

From the class WebServiceAnnotator, method main.

/**
 * A quick script to debug server lifecycle.
 *
 * <p>Creates an anonymous {@code WebServiceAnnotator} that declares no annotation
 * requirements, starts via {@code bash script.sh}, has no stop command, and reports
 * readiness by pinging {@code http://localhost:8000}. It then runs that annotator once
 * over an empty document; the annotation step itself only logs the ping result.
 *
 * @param args ignored
 * @throws InterruptedException declared by the signature; the call that raises it is not
 *         visible in this snippet
 */
public static void main(String[] args) throws InterruptedException {
    // One definition of the endpoint shared by the readiness check and the fake annotation.
    final String debugEndpoint = "http://localhost:8000";

    WebServiceAnnotator annotator = new WebServiceAnnotator() {

        @Override
        public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
            return Collections.emptySet();
        }

        @Override
        public Set<Class<? extends CoreAnnotation>> requires() {
            return Collections.emptySet();
        }

        @Override
        protected Optional<String[]> startCommand() {
            return Optional.of(new String[] { "bash", "script.sh" });
        }

        @Override
        protected Optional<String[]> stopCommand() {
            return Optional.empty();
        }

        @Override
        protected boolean ready(boolean initialTest) {
            return this.ping(debugEndpoint);
        }

        @Override
        protected void annotateImpl(Annotation ann) throws ShouldRetryException, PermanentlyFailedException {
            // No real annotation is performed; this only reports whether the server answers.
            log.info("Fake annotated! ping=" + this.ping(debugEndpoint));
        }

        @Override
        public String toString() {
            return "<test WebServiceAnnotator>";
        }
    };

    Annotation ann = new Annotation("");
    annotator.annotate(ann);
}

Also used: CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation)

Example 3 with CoreAnnotation

Use of edu.stanford.nlp.ling.CoreAnnotation in the CoreNLP project by stanfordnlp.

From the class NERFeatureFactory, method featuresC.

protected void featuresC(PaddedList<IN> cInfo, int loc, FeatureCollector out) {
    out.setSuffix("C");
    CoreLabel p3 = cInfo.get(loc - 3);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel c = cInfo.get(loc);
    CoreLabel n = cInfo.get(loc + 1);
    CoreLabel n2 = cInfo.get(loc + 2);
    String cWord = getWord(c);
    String pWord = getWord(p);
    String nWord = getWord(n);
    String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class);
    String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class);
    String nShape = n.getString(CoreAnnotations.ShapeAnnotation.class);
    if (flags.useDistSim) {
        distSimAnnotate(cInfo);
    }
    if (flags.useBagOfWords) {
        for (IN word : cInfo) {
            out.build().append(getWord(word)).append("-BAGOFWORDS").add();
        }
    }
    if (flags.useDistSim && flags.useMoreTags) {
        out.build().append(p.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(cWord).append("-PDISTSIM-CWORD").add();
    }
    if (flags.useDistSim) {
        out.build().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-DISTSIM").add();
    }
    if (flags.useTitle) {
        if (titlePattern.matcher(cWord).matches()) {
            out.add("IS_TITLE");
        }
    } else if (flags.useTitle2) {
        if (titlePattern2.matcher(cWord).matches()) {
            out.add("IS_TITLE");
        }
    }
    if (flags.slashHyphenTreatment != SeqClassifierFlags.SlashHyphenEnum.NONE) {
        if (flags.useWord) {
            generateSlashHyphenFeatures(cWord, "-WFRAG", "-WORD", out);
        }
    }
    if (flags.useInternal && flags.useExternal) {
        if (flags.useWord) {
            out.build().append(cWord).append("-WORD").add();
        }
        if (flags.use2W) {
            out.build().append(getWord(p2)).append("-P2W").add();
            out.build().append(getWord(n2)).append("-N2W").add();
        }
        if (flags.useLC) {
            out.build().append(cWord.toLowerCase()).append("-CL").add();
            out.build().append(pWord.toLowerCase()).append("-PL").add();
            out.build().append(nWord.toLowerCase()).append("-NL").add();
        }
        if (flags.useUnknown) {
            // for true casing
            out.build().append(c.get(CoreAnnotations.UnknownAnnotation.class)).append("-UNKNOWN").add();
            out.build().append(p.get(CoreAnnotations.UnknownAnnotation.class)).append("-PUNKNOWN").add();
            out.build().append(n.get(CoreAnnotations.UnknownAnnotation.class)).append("-NUNKNOWN").add();
        }
        if (flags.useLemmas) {
            String lem = c.getString(CoreAnnotations.LemmaAnnotation.class);
            if (!lem.isEmpty()) {
                out.build().append(lem).append("-LEM").add();
            }
        }
        if (flags.usePrevNextLemmas) {
            String plem = p.getString(CoreAnnotations.LemmaAnnotation.class);
            String nlem = n.getString(CoreAnnotations.LemmaAnnotation.class);
            if (!plem.isEmpty()) {
                out.build().append(plem).append("-PLEM").add();
            }
            if (!nlem.isEmpty()) {
                out.build().append(nlem).append("-NLEM").add();
            }
        }
        if (flags.checkNameList) {
            try {
                if (lastNames == null) {
                    lastNames = Generics.newHashSet();
                    for (String line : ObjectBank.getLineIterator(flags.lastNameList)) {
                        lastNames.add(line.split("\\s+")[0]);
                    }
                }
                if (maleNames == null) {
                    maleNames = Generics.newHashSet();
                    for (String line : ObjectBank.getLineIterator(flags.maleNameList)) {
                        maleNames.add(line.split("\\s+")[0]);
                    }
                }
                if (femaleNames == null) {
                    femaleNames = Generics.newHashSet();
                    for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) {
                        femaleNames.add(line.split("\\s+")[0]);
                    }
                }
                String name = cWord.toUpperCase();
                if (lastNames.contains(name)) {
                    out.add("LAST_NAME");
                }
                if (maleNames.contains(name)) {
                    out.add("MALE_NAME");
                }
                if (femaleNames.contains(name)) {
                    out.add("FEMALE_NAME");
                }
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
        if (flags.binnedLengths != null) {
            int len = cWord.length(), beg = -1, end = -1;
            for (int i = 0; i < flags.binnedLengths.length; i++) {
                if (len <= flags.binnedLengths[i]) {
                    beg = i == 0 ? 1 : flags.binnedLengths[i - 1];
                    end = flags.binnedLengths[i];
                    break;
                }
            }
            if (beg < 0) {
                beg = flags.binnedLengths[flags.binnedLengths.length - 1];
            }
            out.build().append("Len-").append(Integer.toString(beg)).dash().append(end > 0 ? Integer.toString(end) : "Inf").add();
        }
        if (flags.useABGENE) {
            out.build().append(c.get(CoreAnnotations.AbgeneAnnotation.class)).append("-ABGENE").add();
            out.build().append(p.get(CoreAnnotations.AbgeneAnnotation.class)).append("-PABGENE").add();
            out.build().append(n.get(CoreAnnotations.AbgeneAnnotation.class)).append("-NABGENE").add();
        }
        if (flags.useABSTRFreqDict) {
            out.build().append(c.get(CoreAnnotations.AbstrAnnotation.class)).append("-ABSTRACT").append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
            out.build().append(c.get(CoreAnnotations.AbstrAnnotation.class)).append("-ABSTRACT").append(c.get(CoreAnnotations.DictAnnotation.class)).append("-DICT").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
            out.build().append(c.get(CoreAnnotations.AbstrAnnotation.class)).append("-ABSTRACT").append(c.get(CoreAnnotations.DictAnnotation.class)).append("-DICT").append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
        }
        if (flags.useABSTR) {
            out.build().append(c.get(CoreAnnotations.AbstrAnnotation.class)).append("-ABSTRACT").add();
            out.build().append(p.get(CoreAnnotations.AbstrAnnotation.class)).append("-PABSTRACT").add();
            out.build().append(n.get(CoreAnnotations.AbstrAnnotation.class)).append("-NABSTRACT").add();
        }
        if (flags.useGENIA) {
            out.build().append(c.get(CoreAnnotations.GeniaAnnotation.class)).append("-GENIA").add();
            out.build().append(p.get(CoreAnnotations.GeniaAnnotation.class)).append("-PGENIA").add();
            out.build().append(n.get(CoreAnnotations.GeniaAnnotation.class)).append("-NGENIA").add();
        }
        if (flags.useWEBFreqDict) {
            out.build().append(c.get(CoreAnnotations.WebAnnotation.class)).append("-WEB").append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
            out.build().append(c.get(CoreAnnotations.WebAnnotation.class)).append("-WEB").append(c.get(CoreAnnotations.DictAnnotation.class)).append("-DICT").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
            out.build().append(c.get(CoreAnnotations.WebAnnotation.class)).append("-WEB").append(c.get(CoreAnnotations.DictAnnotation.class)).append("-DICT").append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
        }
        if (flags.useWEB) {
            out.build().append(c.get(CoreAnnotations.WebAnnotation.class)).append("-WEB").add();
            out.build().append(p.get(CoreAnnotations.WebAnnotation.class)).append("-PWEB").add();
            out.build().append(n.get(CoreAnnotations.WebAnnotation.class)).append("-NWEB").add();
        }
        if (flags.useIsURL) {
            out.build().append(c.get(CoreAnnotations.IsURLAnnotation.class)).append("-ISURL").add();
        }
        if (flags.useEntityRule) {
            out.build().append(c.get(CoreAnnotations.EntityRuleAnnotation.class)).append("-ENTITYRULE").add();
        }
        if (flags.useEntityTypes) {
            out.build().append(c.get(CoreAnnotations.EntityTypeAnnotation.class)).append("-ENTITYTYPE").add();
        }
        if (flags.useIsDateRange) {
            out.build().append(c.get(CoreAnnotations.IsDateRangeAnnotation.class)).append("-ISDATERANGE").add();
        }
        if (flags.useABSTRFreq) {
            out.build().append(c.get(CoreAnnotations.AbstrAnnotation.class)).append("-ABSTRACT").append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").add();
        }
        if (flags.useFREQ) {
            out.build().append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").add();
        }
        if (flags.useMoreTags) {
            out.build().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(cWord).append("-PTAG-CWORD").add();
        }
        if (flags.usePosition) {
            out.build().append(c.get(CoreAnnotations.PositionAnnotation.class)).append("-POSITION").add();
        }
        if (flags.useBeginSent) {
            String pos = c.get(CoreAnnotations.PositionAnnotation.class);
            if ("0".equals(pos)) {
                out.add("BEGIN-SENT");
                out.build().append(cShape).append("-BEGIN-SENT").add();
            } else if (Integer.toString(cInfo.size() - 1).equals(pos)) {
                out.add("END-SENT");
                out.build().append(cShape).append("-END-SENT").add();
            } else {
                out.add("IN-SENT");
                out.build().append(cShape).append("-IN-SENT").add();
            }
        }
        if (flags.useTags) {
            out.build().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
        }
        if (flags.useOrdinal) {
            if (isOrdinal(cInfo, loc)) {
                out.add("C_ORDINAL");
                if (isOrdinal(cInfo, loc - 1)) {
                    // log.info(getWord(p) + " ");
                    out.add("PC_ORDINAL");
                }
            // log.info(cWord);
            }
            if (isOrdinal(cInfo, loc - 1)) {
                out.add("P_ORDINAL");
            }
        }
        if (flags.usePrev) {
            out.build().append(pWord).append("-PW").add();
            if (flags.useTags) {
                out.build().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-PTAG").add();
            }
            if (flags.useDistSim) {
                out.build().append(p.get(CoreAnnotations.DistSimAnnotation.class)).append("-PDISTSIM").add();
            }
            if (flags.useIsURL) {
                out.build().append(p.get(CoreAnnotations.IsURLAnnotation.class)).append("-PISURL").add();
            }
            if (flags.useEntityTypes) {
                out.build().append(p.get(CoreAnnotations.EntityTypeAnnotation.class)).append("-PENTITYTYPE").add();
            }
        }
        if (flags.useNext) {
            out.build().append(nWord).append("-NW").add();
            if (flags.useTags) {
                out.build().append(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-NTAG").add();
            }
            if (flags.useDistSim) {
                out.build().append(n.get(CoreAnnotations.DistSimAnnotation.class)).append("-NDISTSIM").add();
            }
            if (flags.useIsURL) {
                out.build().append(n.get(CoreAnnotations.IsURLAnnotation.class)).append("-NISURL").add();
            }
            if (flags.useEntityTypes) {
                out.build().append(n.get(CoreAnnotations.EntityTypeAnnotation.class)).append("-NENTITYTYPE").add();
            }
        }
        if (flags.useEitherSideWord) {
            out.build().append(pWord).append("-EW").add();
            out.build().append(nWord).append("-EW").add();
        }
        if (flags.useWordPairs) {
            out.build().append(cWord).dash().append(pWord).append("-W-PW").add();
            out.build().append(cWord).dash().append(nWord).append("-W-NW").add();
        }
        if (flags.useSymTags) {
            if (flags.useTags) {
                out.build().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-PCNTAGS").add();
                out.build().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-CNTAGS").add();
                out.build().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-PCTAGS").add();
            }
            if (flags.useDistSim) {
                out.build().append(p.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(n.get(CoreAnnotations.DistSimAnnotation.class)).append("-PCNDISTSIM").add();
                out.build().append(c.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(n.get(CoreAnnotations.DistSimAnnotation.class)).append("-CNDISTSIM").add();
                out.build().append(p.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-PCDISTSIM").add();
            }
        }
        if (flags.useSymWordPairs) {
            out.build().append(pWord).dash().append(nWord).append("-SWORDS").add();
        }
        if (flags.useGazFeatures || flags.useMoreGazFeatures) {
            String pGazAnnotation = p.get(CoreAnnotations.GazAnnotation.class);
            String nGazAnnotation = n.get(CoreAnnotations.GazAnnotation.class);
            String cGazAnnotation = c.get(CoreAnnotations.GazAnnotation.class);
            if (flags.useGazFeatures) {
                if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
                    out.build().append(cGazAnnotation).append("-GAZ").add();
                }
                // n
                if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
                    out.build().append(nGazAnnotation).append("-NGAZ").add();
                }
                // p
                if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
                    out.build().append(pGazAnnotation).append("-PGAZ").add();
                }
            }
            if (flags.useMoreGazFeatures) {
                if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
                    out.build().append(cGazAnnotation).dash().append(cWord).append("-CG-CW-GAZ").add();
                    // c-n
                    if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
                        out.build().append(cGazAnnotation).dash().append(nGazAnnotation).append("-CNGAZ").add();
                    }
                    // p-c
                    if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
                        out.build().append(pGazAnnotation).dash().append(cGazAnnotation).append("-PCGAZ").add();
                    }
                }
            }
        }
        if (flags.useAbbr || flags.useMinimalAbbr) {
            out.build().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-ABBR").add();
        }
        if (flags.useAbbr1 || flags.useMinimalAbbr1) {
            if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
                out.build().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-ABBR").add();
            }
        }
        if (flags.useAbbr) {
            out.build().append(p.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-PCABBR").add();
            out.build().append(c.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(n.get(CoreAnnotations.AbbrAnnotation.class)).append("-CNABBR").add();
            out.build().append(p.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(n.get(CoreAnnotations.AbbrAnnotation.class)).append("-PCNABBR").add();
        }
        if (flags.useAbbr1) {
            if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
                out.build().append(p.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-PCABBR").add();
                out.build().append(c.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(n.get(CoreAnnotations.AbbrAnnotation.class)).append("-CNABBR").add();
                out.build().append(p.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(n.get(CoreAnnotations.AbbrAnnotation.class)).append("-PCNABBR").add();
            }
        }
        if (flags.useChunks) {
            out.build().append(p.get(CoreAnnotations.ChunkAnnotation.class)).dash().append(c.get(CoreAnnotations.ChunkAnnotation.class)).append("-PCCHUNK").add();
            out.build().append(c.get(CoreAnnotations.ChunkAnnotation.class)).dash().append(n.get(CoreAnnotations.ChunkAnnotation.class)).append("-CNCHUNK").add();
            out.build().append(p.get(CoreAnnotations.ChunkAnnotation.class)).dash().append(c.get(CoreAnnotations.ChunkAnnotation.class)).dash().append(n.get(CoreAnnotations.ChunkAnnotation.class)).append("-PCNCHUNK").add();
        }
        if (flags.useMinimalAbbr) {
            out.build().append(cWord).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-CWABB").add();
        }
        if (flags.useMinimalAbbr1) {
            if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
                out.build().append(cWord).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-CWABB").add();
            }
        }
        String prevVB = "", nextVB = "";
        if (flags.usePrevVB) {
            for (int j = loc - 1; ; j--) {
                CoreLabel wi = cInfo.get(j);
                if (wi == cInfo.getPad()) {
                    prevVB = "X";
                    out.add("X-PVB");
                    break;
                } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
                    out.build().append(getWord(wi)).append("-PVB").add();
                    prevVB = getWord(wi);
                    break;
                }
            }
        }
        if (flags.useNextVB) {
            for (int j = loc + 1; ; j++) {
                CoreLabel wi = cInfo.get(j);
                if (wi == cInfo.getPad()) {
                    out.add("X-NVB");
                    nextVB = "X";
                    break;
                } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
                    out.build().append(getWord(wi)).append("-NVB").add();
                    nextVB = getWord(wi);
                    break;
                }
            }
        }
        if (flags.useVB) {
            out.build().append(prevVB).dash().append(nextVB).append("-PNVB").add();
        }
        if (flags.useShapeConjunctions) {
            out.build().append(c.get(CoreAnnotations.PositionAnnotation.class)).append(cShape).append("-POS-SH").add();
            if (flags.useTags) {
                out.build().append(c.tag()).append(cShape).append("-TAG-SH").add();
            }
            if (flags.useDistSim) {
                out.build().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append(cShape).append("-DISTSIM-SH").add();
            }
        }
        if (flags.useWordTag) {
            out.build().append(cWord).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-W-T").add();
            out.build().append(cWord).dash().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-W-PT").add();
            out.build().append(cWord).dash().append(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-W-NT").add();
        }
        if (flags.useNPHead) {
            // TODO: neat idea, but this would need to be set somewhere.
            // Probably should have its own annotation as this one would
            // be more narrow and would clobber other potential uses
            out.build().append(c.get(CoreAnnotations.HeadWordStringAnnotation.class)).append("-HW").add();
            if (flags.useTags) {
                out.build().append(c.get(CoreAnnotations.HeadWordStringAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-HW-T").add();
            }
            if (flags.useDistSim) {
                out.build().append(c.get(CoreAnnotations.HeadWordStringAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-HW-DISTSIM").add();
            }
        }
        if (flags.useNPGovernor) {
            out.build().append(c.get(CoreAnnotations.GovernorAnnotation.class)).append("-GW").add();
            if (flags.useTags) {
                out.build().append(c.get(CoreAnnotations.GovernorAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-GW-T").add();
            }
            if (flags.useDistSim) {
                out.build().append(c.get(CoreAnnotations.GovernorAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-DISTSIM-T1").add();
            }
        }
        if (flags.useHeadGov) {
            // TODO: neat idea, but this would need to be set somewhere.
            // Probably should have its own annotation as this one would
            // be more narrow and would clobber other potential uses
            out.build().append(c.get(CoreAnnotations.HeadWordStringAnnotation.class)).dash().append(c.get(CoreAnnotations.GovernorAnnotation.class)).append("-HW_GW").add();
        }
        if (flags.useClassFeature) {
            out.add("###");
        }
        if (flags.useFirstWord) {
            out.add(getWord(cInfo.get(0)));
        }
        if (flags.useNGrams) {
            Collection<String> subs = null;
            if (flags.cacheNGrams) {
                subs = wordToSubstrings.get(cWord);
            }
            if (subs == null) {
                subs = new ArrayList<>();
                String word = '<' + cWord + '>';
                if (flags.lowercaseNGrams) {
                    word = word.toLowerCase();
                }
                if (flags.dehyphenateNGrams) {
                    word = dehyphenate(word);
                }
                if (flags.greekifyNGrams) {
                    word = greekify(word);
                }
                // hoist flags.noMidNGrams so only linear in word length for that case
                if (flags.noMidNGrams) {
                    int max = flags.maxNGramLeng >= 0 ? Math.min(flags.maxNGramLeng, word.length()) : word.length();
                    for (int j = 2; j <= max; j++) {
                        subs.add(intern('#' + word.substring(0, j) + '#'));
                    }
                    int start = flags.maxNGramLeng >= 0 ? Math.max(0, word.length() - flags.maxNGramLeng) : 0;
                    int lenM1 = word.length() - 1;
                    for (int i = start; i < lenM1; i++) {
                        subs.add(intern('#' + word.substring(i) + '#'));
                    }
                } else {
                    for (int i = 0; i < word.length(); i++) {
                        for (int j = i + 2, max = Math.min(word.length(), i + flags.maxNGramLeng); j <= max; j++) {
                            if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
                                continue;
                            }
                            subs.add(intern('#' + word.substring(i, j) + '#'));
                        }
                    }
                }
                if (flags.cacheNGrams) {
                    wordToSubstrings.put(cWord, subs);
                }
            }
            for (String sub : subs) {
                out.add(sub);
            }
            if (flags.conjoinShapeNGrams) {
                for (String str : subs) {
                    out.build().append(str).dash().append(cShape).append("-CNGram-CS").add();
                }
            }
        }
        if (flags.useGazettes) {
            if (flags.sloppyGazette) {
                Collection<String> entries = wordToGazetteEntries.get(cWord);
                if (entries != null) {
                    for (String entry : entries) {
                        out.add(entry);
                    }
                }
            }
            if (flags.cleanGazette) {
                Collection<GazetteInfo> infos = wordToGazetteInfos.get(cWord);
                if (infos != null) {
                    gazette: for (GazetteInfo gInfo : infos) {
                        for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) {
                            if (!gInfo.words[gLoc].equals(getWord(cInfo.get(loc + gLoc - gInfo.loc)))) {
                                continue gazette;
                            }
                        }
                        out.add(gInfo.feature);
                    }
                }
            }
        }
        if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) {
            out.build().append(cShape).append("-TYPE").add();
            if (flags.useTypeSeqs) {
                out.build().append(pShape).append("-PTYPE").add();
                out.build().append(nShape).append("-NTYPE").add();
                out.build().append(pWord).append("...").append(cShape).append("-PW_CTYPE").add();
                out.build().append(cShape).append("...").append(nWord).append("-NW_CTYPE").add();
                out.build().append(pShape).append("...").append(cShape).append("-PCTYPE").add();
                out.build().append(cShape).append("...").append(nShape).append("-CNTYPE").add();
                out.build().append(pShape).append("...").append(cShape).append("...").append(nShape).append("-PCNTYPE").add();
            }
        }
        if (flags.useLastRealWord) {
            if (pWord.length() <= 3) {
                // extending this to check for 2 short words doesn't seem to help....
                out.build().append(getWord(p2)).append("...").append(cShape).append("-PPW_CTYPE").add();
            }
        }
        if (flags.useNextRealWord) {
            if (nWord.length() <= 3) {
                // extending this to check for 2 short words doesn't seem to help....
                out.build().append(getWord(n2)).append("...").append(cShape).append("-NNW_CTYPE").add();
            }
        }
        if (flags.useOccurrencePatterns) {
            occurrencePatterns(cInfo, loc, out);
        }
        if (flags.useDisjunctive) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                CoreLabel dn = cInfo.get(loc + i);
                CoreLabel dp = cInfo.get(loc - i);
                out.build().append(getWord(dn)).append("-DISJN").add();
                if (flags.useDisjunctiveShapeInteraction) {
                    out.build().append(getWord(dn)).dash().append(cShape).append("-DISJN-CS").add();
                }
                out.build().append(getWord(dp)).append("-DISJP").add();
                if (flags.useDisjunctiveShapeInteraction) {
                    out.build().append(getWord(dp)).dash().append(cShape).append("-DISJP-CS").add();
                }
            }
        }
        if (flags.useUndirectedDisjunctive) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                CoreLabel dn = cInfo.get(loc + i);
                CoreLabel dp = cInfo.get(loc - i);
                out.build().append(getWord(dn)).append("-DISJ").add();
                out.build().append(getWord(dp)).append("-DISJ").add();
            }
        }
        if (flags.useWideDisjunctive) {
            for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
                out.build().append(getWord(cInfo.get(loc + i))).append("-DISJWN").add();
                out.build().append(getWord(cInfo.get(loc - i))).append("-DISJWP").add();
            }
        }
        if (flags.useEitherSideDisjunctive) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                out.build().append(getWord(cInfo.get(loc + i))).append("-DISJWE").add();
                out.build().append(getWord(cInfo.get(loc - i))).append("-DISJWE").add();
            }
        }
        if (flags.useDisjShape) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                out.build().append(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class)).append("-NDISJSHAPE").add();
                // out.build().append((cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class)).append("-PDISJSHAPE").add();
                out.build().append(cShape).dash().append(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class)).append("-CNDISJSHAPE").add();
            // out.build().append(c.get(CoreAnnotations.ShapeAnnotation.class)).dash().append(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class)).append("-CPDISJSHAPE").add();
            }
        }
        if (flags.useExtraTaggySequences) {
            if (flags.useTags) {
                out.build().append(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TTS").add();
                out.build().append(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TTTS").add();
            }
            if (flags.useDistSim) {
                out.build().append(p2.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(p.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-DISTSIM_TTS1").add();
                out.build().append(p3.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(p2.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(p.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-DISTSIM_TTTS1").add();
            }
        }
        if (flags.useMUCFeatures) {
            out.build().append(c.get(CoreAnnotations.SectionAnnotation.class)).append("-SECTION").add();
            out.build().append(c.get(CoreAnnotations.WordPositionAnnotation.class)).append("-WORD_POSITION").add();
            out.build().append(c.get(CoreAnnotations.SentencePositionAnnotation.class)).append("-SENT_POSITION").add();
            out.build().append(c.get(CoreAnnotations.ParaPositionAnnotation.class)).append("-PARA_POSITION").add();
            out.build().append(c.get(CoreAnnotations.WordPositionAnnotation.class)).dash().append(c.get(CoreAnnotations.ShapeAnnotation.class)).append("-WORD_POSITION_SHAPE").add();
        }
    } else if (flags.useInternal) {
        if (flags.useWord) {
            out.build().append(cWord).append("-WORD").add();
        }
        if (flags.useNGrams) {
            Collection<String> subs = wordToSubstrings.get(cWord);
            if (subs == null) {
                subs = new ArrayList<>();
                String word = '<' + cWord + '>';
                if (flags.lowercaseNGrams) {
                    word = word.toLowerCase();
                }
                if (flags.dehyphenateNGrams) {
                    word = dehyphenate(word);
                }
                if (flags.greekifyNGrams) {
                    word = greekify(word);
                }
                for (int i = 0; i < word.length(); i++) {
                    for (int j = i + 2; j <= word.length(); j++) {
                        if (flags.noMidNGrams && i != 0 && j != word.length()) {
                            continue;
                        }
                        if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
                            continue;
                        }
                        // subs.add(intern("#" + word.substring(i, j) + "#"));
                        subs.add(intern('#' + word.substring(i, j) + '#'));
                    }
                }
                if (flags.cacheNGrams) {
                    wordToSubstrings.put(cWord, subs);
                }
            }
            for (String sub : subs) {
                out.add(sub);
            }
            if (flags.conjoinShapeNGrams) {
                String shape = c.get(CoreAnnotations.ShapeAnnotation.class);
                for (String str : subs) {
                    out.build().append(str).dash().append(shape).append("-CNGram-CS").add();
                }
            }
        }
        if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) {
            out.build().append(cShape).append("-TYPE").add();
        }
        if (flags.useOccurrencePatterns) {
            occurrencePatterns(cInfo, loc, out);
        }
    } else if (flags.useExternal) {
        if (flags.usePrev) {
            out.build().append(pWord).append("-PW").add();
        }
        if (flags.useNext) {
            out.build().append(nWord).append("-NW").add();
        }
        if (flags.useWordPairs) {
            out.build().append(cWord).dash().append(pWord).append("-W-PW").add();
            out.build().append(cWord).dash().append(nWord).append("-W-NW").add();
        }
        if (flags.useSymWordPairs) {
            out.build().append(pWord).dash().append(nWord).append("-SWORDS").add();
        }
        if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) {
            if (flags.useTypeSeqs) {
                out.build().append(pShape).append("-PTYPE").add();
                out.build().append(nShape).append("-NTYPE").add();
                out.build().append(pWord).append("...").append(cShape).append("-PW_CTYPE").add();
                out.build().append(cShape).append("...").append(nWord).append("-NW_CTYPE").add();
                if (flags.maxLeft > 0)
                    // this one just isn't useful, at least given c,pc,s,ps.  Might be useful 0th-order
                    out.build().append(pShape).append("...").append(cShape).append("-PCTYPE").add();
                out.build().append(cShape).append("...").append(nShape).append("-CNTYPE").add();
                out.build().append(pShape).append("...").append(cShape).append("...").append(nShape).append("-PCNTYPE").add();
            }
        }
        if (flags.useLastRealWord) {
            if (pWord.length() <= 3) {
                out.build().append(getWord(p2)).append("...").append(cShape).append("-PPW_CTYPE").add();
            }
        }
        if (flags.useNextRealWord) {
            if (nWord.length() <= 3) {
                out.build().append(getWord(n2)).append("...").append(cShape).append("-NNW_CTYPE").add();
            }
        }
        if (flags.useDisjunctive) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                CoreLabel dn = cInfo.get(loc + i);
                CoreLabel dp = cInfo.get(loc - i);
                out.build().append(getWord(dn)).append("-DISJN").add();
                if (flags.useDisjunctiveShapeInteraction) {
                    out.build().append(getWord(dn)).dash().append(cShape).append("-DISJN-CS").add();
                }
                out.build().append(getWord(dp)).append("-DISJP").add();
                if (flags.useDisjunctiveShapeInteraction) {
                    out.build().append(getWord(dp)).dash().append(cShape).append("-DISJP-CS").add();
                }
            }
        }
        if (flags.useWideDisjunctive) {
            for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
                out.build().append(getWord(cInfo.get(loc + i))).append("-DISJWN").add();
                out.build().append(getWord(cInfo.get(loc - i))).append("-DISJWP").add();
            }
        }
        if (flags.useDisjShape) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                out.build().append(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class)).append("-NDISJSHAPE").add();
                // out.build().append((cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class)).append("-PDISJSHAPE").add();
                out.build().append(c.get(CoreAnnotations.ShapeAnnotation.class)).dash().append(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class)).append("-CNDISJSHAPE").add();
            // out.build().append(c.get(CoreAnnotations.ShapeAnnotation.class)).dash().append(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class)).append("-CPDISJSHAPE").add();
            }
        }
    }
    // Stuff to add binary features from the additional columns
    if (flags.twoStage) {
        out.build().append(c.get(Bin1Annotation.class)).append("-BIN1").add();
        out.build().append(c.get(Bin2Annotation.class)).append("-BIN2").add();
        out.build().append(c.get(Bin3Annotation.class)).append("-BIN3").add();
        out.build().append(c.get(Bin4Annotation.class)).append("-BIN4").add();
        out.build().append(c.get(Bin5Annotation.class)).append("-BIN5").add();
        out.build().append(c.get(Bin6Annotation.class)).append("-BIN6").add();
    }
    if (flags.useIfInteger) {
        try {
            int val = Integer.parseInt(cWord);
            if (val > 0)
                out.add("POSITIVE_INTEGER");
            else if (val < 0)
                out.add("NEGATIVE_INTEGER");
        // log.info("FOUND INTEGER");
        } catch (NumberFormatException e) {
        // not an integer value, nothing to do
        }
    }
    // Stuff to add arbitrary features
    if (flags.useGenericFeatures) {
        // see if we need to cache the keys
        if (genericAnnotationKeys == null) {
            makeGenericKeyCache(c);
        }
        // now look through the cached keys
        for (Class<?> key : genericAnnotationKeys) {
            // log.info("Adding feature: " + CoreLabel.genericValues.get(key) + " with value " + c.get(key));
            Object col = c.get((Class<CoreAnnotation<Object>>) key);
            if (col instanceof Collection) {
                for (Object ob : (Collection<?>) col) {
                    out.build().append(ob.toString()).dash().append(CoreLabel.genericValues.get(key)).add();
                }
            } else if (col != null) {
                out.build().append(col.toString()).dash().append(CoreLabel.genericValues.get(key)).add();
            }
        }
    }
    if (flags.useTopics) {
        // out.build().append(p.get(CoreAnnotations.TopicAnnotation.class), "-", cWord, "--CWORD").add();
        out.build().append(c.get(CoreAnnotations.TopicAnnotation.class)).append("-TopicID").add();
        out.build().append(p.get(CoreAnnotations.TopicAnnotation.class)).append("-PTopicID").add();
        out.build().append(n.get(CoreAnnotations.TopicAnnotation.class)).append("-NTopicID").add();
    // out.build().append(p.get(CoreAnnotations.TopicAnnotation.class)).dash().append(c.get(CoreAnnotations.TopicAnnotation.class)).dash().append(n.get(CoreAnnotations.TopicAnnotation.class)).append("-PCNTopicID").add();
    // out.build().append(c.get(CoreAnnotations.TopicAnnotation.class)).dash().append(n.get(CoreAnnotations.TopicAnnotation.class)).append("-CNTopicID").add();
    // out.build().append(p.get(CoreAnnotations.TopicAnnotation.class)).dash().append(c.get(CoreAnnotations.TopicAnnotation.class)).append("-PCTopicID").add();
    // out.build().append(c.get(CoreAnnotations.TopicAnnotation.class)).append(cShape).append("-TopicID-SH").add();
    }
    // NER tag annotations from a previous NER system
    if (c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) != null) {
        out.build().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-CStackedNERTag").add();
        out.build().append(cWord).dash().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-WCStackedNERTag").add();
        if (flags.useNext) {
            out.build().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-CNStackedNERTag").add();
            out.build().append(cWord).dash().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-WCNStackedNERTag").add();
            if (flags.usePrev) {
                out.build().append(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-PCNStackedNERTag").add();
                out.build().append(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(cWord).append(" -").append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-PWCNStackedNERTag").add();
            }
        }
        if (flags.usePrev) {
            out.build().append(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-PCStackedNERTag").add();
        }
    }
    if (flags.useWordnetFeatures)
        out.build().append(c.get(CoreAnnotations.WordnetSynAnnotation.class)).append("-WordnetSyn").add();
    if (flags.useProtoFeatures)
        out.build().append(c.get(CoreAnnotations.ProtoAnnotation.class)).append("-Proto").add();
    if (flags.usePhraseWordTags)
        out.build().append(c.get(CoreAnnotations.PhraseWordsTagAnnotation.class)).append("-PhraseTag").add();
    if (flags.usePhraseWords) {
        for (String w : c.get(CoreAnnotations.PhraseWordsAnnotation.class)) {
            out.build().append(w).append("-PhraseWord").add();
        }
    }
    if (flags.useCommonWordsFeature)
        out.add(c.get(CoreAnnotations.CommonWordsAnnotation.class));
    if (flags.useRadical && cWord.length() > 0) {
        // todo [cdm 2016]: Really all stuff in this file should be fixed to work with codepoints outside BMP
        if (cWord.length() == 1) {
            out.build().append(RadicalMap.getRadical(cWord.charAt(0))).append("-SINGLE-CHAR-RADICAL").add();
        } else {
            out.build().append(RadicalMap.getRadical(cWord.charAt(0))).append("-START-RADICAL").add();
            out.build().append(RadicalMap.getRadical(cWord.charAt(cWord.length() - 1))).append("-END-RADICAL").add();
        }
        for (int i = 0; i < cWord.length(); ++i) {
            out.build().append(RadicalMap.getRadical(cWord.charAt(i))).append("-RADICAL").add();
        }
    }
    if (flags.splitWordRegex != null && !flags.splitWordRegex.isEmpty()) {
        for (String s : c.word().split(flags.splitWordRegex)) {
            out.build().append(s).append("-SPLITWORD").add();
        }
    }
    if (flags.useMoreNeighborNGrams) {
        int maxLen = pWord.length();
        if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) {
            maxLen = flags.maxNGramLeng;
        }
        for (int len = 1; len <= maxLen; ++len) {
            out.build().append(pWord.substring(0, len)).append("-PREV-PREFIX").add();
        }
        for (int pos = pWord.length() - maxLen; pos < pWord.length(); ++pos) {
            out.build().append(pWord.substring(pos, pWord.length())).append("-PREV-SUFFIX").add();
        }
        maxLen = nWord.length();
        if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) {
            maxLen = flags.maxNGramLeng;
        }
        for (int len = 1; len <= maxLen; ++len) {
            out.build().append(nWord.substring(0, len)).append("-NEXT-PREFIX").add();
        }
        for (int pos = nWord.length() - maxLen; pos < nWord.length(); ++pos) {
            out.build().append(nWord.substring(pos, nWord.length())).append("-NEXT-SUFFIX").add();
        }
    }
}

Also used : ArrayList(java.util.ArrayList) CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) Collection(java.util.Collection)

Example 4 with CoreAnnotation

use of edu.stanford.nlp.ling.CoreAnnotation in project CoreNLP by stanfordnlp.

From the class StanfordCoreNLP, method construct.

//
// AnnotatorPool construction support
//
/**
 * Performs the shared set-up work for a pipeline instance: resets counters and tree printers,
 * settles on the effective properties and annotator pool, configures the processor semaphore,
 * and then adds every annotator named in the {@code annotators} property, in order.
 *
 * @param props pipeline properties; when {@code null}, or when they carry no {@code annotators}
 *     entry, the properties found on the classpath are loaded (the given entries, if any,
 *     are layered on top of them)
 * @param enforceRequirements when {@code true}, each annotator's {@code requires()} set must
 *     already be covered by what the previously added annotators report as satisfied
 * @param annotatorImplementations passed through to the default pool factory when {@code pool}
 *     is {@code null}
 * @param pool the pool to fetch annotators from; a default pool is built when {@code null}
 * @throws IllegalArgumentException if {@code enforceRequirements} is set and an annotator
 *     requires an annotation that no earlier annotator satisfies
 */
private void construct(Properties props, boolean enforceRequirements, AnnotatorImplementations annotatorImplementations, AnnotatorPool pool) {
    final Timing setupTimer = new Timing();
    this.numWords = 0;
    this.constituentTreePrinter = new TreePrint("penn");
    this.dependencyTreePrinter = new TreePrint("typedDependenciesCollapsed");

    // Work out which properties actually drive this pipeline.
    final Properties effectiveProps;
    if (props == null) {
        // nothing supplied: use the properties file found on the classpath
        effectiveProps = loadPropertiesFromClasspath();
    } else if (props.getProperty("annotators") != null) {
        effectiveProps = props;
    } else {
        // Only some options were given (e.g. just "-filelist") and no annotator list:
        // start from the classpath defaults and let the supplied options override them.
        effectiveProps = loadPropertiesFromClasspath();
        effectiveProps.putAll(props);
    }
    this.properties = effectiveProps;

    // Fall back to the default annotator pool when the caller did not provide one.
    final AnnotatorPool annotatorSource = (pool != null)
            ? pool
            : getDefaultAnnotatorPool(effectiveProps, annotatorImplementations);

    // Threading: a single permit unless a "threads" entry is present.
    if (!this.properties.containsKey("threads")) {
        this.availableProcessors = new Semaphore(1);
    } else {
        ArgumentParser.threads = PropertiesUtils.getInt(this.properties, "threads");
        this.availableProcessors = new Semaphore(ArgumentParser.threads);
    }

    // Build the annotators in exactly the order they are listed.
    final String annotatorSpec = getRequiredProperty(effectiveProps, "annotators");
    final Set<String> addedNames = Generics.newHashSet();
    final Set<Class<? extends CoreAnnotation>> satisfied = Generics.newHashSet();
    for (String token : annotatorSpec.split("[, \t]+")) {
        final String annotatorName = token.trim();
        if (annotatorName.isEmpty()) {
            // split() can yield an empty leading token; ignore it
            continue;
        }
        logger.info("Adding annotator " + annotatorName);
        final Annotator annotator = annotatorSource.get(annotatorName);
        this.addAnnotator(annotator);
        if (enforceRequirements) {
            for (Class<? extends CoreAnnotation> needed : annotator.requires()) {
                if (satisfied.contains(needed)) {
                    continue;
                }
                final String usual = StringUtils.join(Annotator.DEFAULT_REQUIREMENTS.getOrDefault(annotatorName, Collections.singleton("unknown")), ",");
                throw new IllegalArgumentException(String.format(
                        "annotator \"%s\" requires annotation \"%s\". The usual requirements for this annotator are: %s",
                        annotatorName, needed.getSimpleName(), usual));
            }
            satisfied.addAll(annotator.requirementsSatisfied());
        }
        addedNames.add(annotatorName);
    }

    // Sanity check: without the sentence-splitter annotator, switch the newline-splitter property off.
    if (!addedNames.contains(STANFORD_SSPLIT)) {
        System.setProperty(NEWLINE_SPLITTER_PROPERTY, "false");
    }
    this.pipelineSetupTime = setupTimer.report();
}

Also used : TreePrint(edu.stanford.nlp.trees.TreePrint) Semaphore(java.util.concurrent.Semaphore) CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation)

Aggregations

CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation): 4, CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 2, CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations): 1, CorefProperties (edu.stanford.nlp.coref.CorefProperties): 1, RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException): 1, CoreLabel (edu.stanford.nlp.ling.CoreLabel): 1, StanfordCoreNLP (edu.stanford.nlp.pipeline.StanfordCoreNLP): 1, TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations): 1, TreePrint (edu.stanford.nlp.trees.TreePrint): 1, IOException (java.io.IOException): 1, ArrayList (java.util.ArrayList): 1, Collection (java.util.Collection): 1, Semaphore (java.util.concurrent.Semaphore): 1