Use of edu.stanford.nlp.ling.CoreAnnotation in project CoreNLP by stanfordnlp.
Class DocumentMaker, method getStanfordCoreNLP.
/**
 * Returns the shared {@code coreNLP} pipeline, creating it from {@code props} the first
 * time it is requested and reusing the cached instance on every later call.
 *
 * <p>The annotator list depends on the coref settings read from {@code props}:
 * <ul>
 *   <li>CoNLL input: {@code lemma} (plus {@code ner} when the coref language is
 *       {@code Locale.CHINESE});</li>
 *   <li>otherwise: {@code parse} or {@code pos}, then {@code lemma}, {@code ner}, and
 *       {@code depparse} when the constituency parser is not used;</li>
 *   <li>in both cases {@code coref.mention} is appended unless gold mentions are used.</li>
 * </ul>
 *
 * @param props coref configuration; also used as the defaults of the pipeline properties
 * @return the cached or newly built pipeline
 */
private synchronized StanfordCoreNLP getStanfordCoreNLP(Properties props) {
  if (coreNLP == null) {
    // props only serves as the defaults table; the overrides below never modify it.
    Properties pipelineProps = new Properties(props);
    StringBuilder annotatorList = new StringBuilder();
    boolean constituencyParse = false;

    if (CorefProperties.conll(props)) {
      if (CorefProperties.getLanguage(props) == Locale.CHINESE) {
        annotatorList.append("lemma, ner");
      } else {
        annotatorList.append("lemma");
      }
    } else {
      constituencyParse = CorefProperties.useConstituencyParse(props);
      annotatorList.append(constituencyParse ? "parse" : "pos");
      annotatorList.append(", lemma, ner ");
      if (!constituencyParse) {
        annotatorList.append(", depparse");
      }
    }
    if (!CorefProperties.useGoldMentions(props)) {
      annotatorList.append(", coref.mention");
    }

    pipelineProps.setProperty("annotators", annotatorList.toString());
    pipelineProps.setProperty("ner.applyFineGrained", "false");
    coreNLP = new StanfordCoreNLP(pipelineProps, false);

    // With a constituency parse the first annotator is the parser, which may itself
    // need POS tags. If the pipeline reports that requirement, rebuild it with a
    // tagger in front. Hopefully the annotator cache spares us from reloading
    // every annotator a second time.
    if (constituencyParse
        && coreNLP.requires().contains(CoreAnnotations.PartOfSpeechAnnotation.class)) {
      String withTagger = "pos, " + pipelineProps.getProperty("annotators");
      pipelineProps.setProperty("annotators", withTagger);
      coreNLP = new StanfordCoreNLP(pipelineProps, false);
    }
  }
  return coreNLP;
}
Use of edu.stanford.nlp.ling.CoreAnnotation in project CoreNLP by stanfordnlp.
Class WebServiceAnnotator, method main.
/**
 * A quick script to debug server lifecycle.
 *
 * <p>Creates an anonymous {@link WebServiceAnnotator} that declares no annotation
 * requirements, starts via {@code bash script.sh}, has no stop command, and treats a
 * successful ping of {@code http://localhost:8000} as "ready". It then runs the
 * annotator once over an empty document.
 *
 * @param args command-line arguments (unused)
 * @throws InterruptedException declared by the original signature and kept for callers
 */
public static void main(String[] args) throws InterruptedException {
  WebServiceAnnotator annotator = new WebServiceAnnotator() {
    @Override
    protected Optional<String[]> startCommand() {
      String[] launch = { "bash", "script.sh" };
      return Optional.of(launch);
    }

    @Override
    protected Optional<String[]> stopCommand() {
      return Optional.empty();
    }

    @Override
    protected boolean ready(boolean initialTest) {
      boolean reachable = this.ping("http://localhost:8000");
      return reachable;
    }

    @Override
    protected void annotateImpl(Annotation ann) throws ShouldRetryException, PermanentlyFailedException {
      // No real annotation happens here; only report whether the server answers.
      boolean pingResult = this.ping("http://localhost:8000");
      log.info("Fake annotated! ping=" + pingResult);
    }

    @Override
    public Set<Class<? extends CoreAnnotation>> requires() {
      return Collections.emptySet();
    }

    @Override
    public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
      return Collections.emptySet();
    }

    @Override
    public String toString() {
      return "<test WebServiceAnnotator>";
    }
  };

  Annotation emptyDocument = new Annotation("");
  annotator.annotate(emptyDocument);
}
Use of edu.stanford.nlp.ling.CoreAnnotation in project CoreNLP by stanfordnlp.
Class NERFeatureFactory, method featuresC.
protected void featuresC(PaddedList<IN> cInfo, int loc, FeatureCollector out) {
out.setSuffix("C");
CoreLabel p3 = cInfo.get(loc - 3);
CoreLabel p2 = cInfo.get(loc - 2);
CoreLabel p = cInfo.get(loc - 1);
CoreLabel c = cInfo.get(loc);
CoreLabel n = cInfo.get(loc + 1);
CoreLabel n2 = cInfo.get(loc + 2);
String cWord = getWord(c);
String pWord = getWord(p);
String nWord = getWord(n);
String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class);
String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class);
String nShape = n.getString(CoreAnnotations.ShapeAnnotation.class);
if (flags.useDistSim) {
distSimAnnotate(cInfo);
}
if (flags.useBagOfWords) {
for (IN word : cInfo) {
out.build().append(getWord(word)).append("-BAGOFWORDS").add();
}
}
if (flags.useDistSim && flags.useMoreTags) {
out.build().append(p.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(cWord).append("-PDISTSIM-CWORD").add();
}
if (flags.useDistSim) {
out.build().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-DISTSIM").add();
}
if (flags.useTitle) {
if (titlePattern.matcher(cWord).matches()) {
out.add("IS_TITLE");
}
} else if (flags.useTitle2) {
if (titlePattern2.matcher(cWord).matches()) {
out.add("IS_TITLE");
}
}
if (flags.slashHyphenTreatment != SeqClassifierFlags.SlashHyphenEnum.NONE) {
if (flags.useWord) {
generateSlashHyphenFeatures(cWord, "-WFRAG", "-WORD", out);
}
}
if (flags.useInternal && flags.useExternal) {
if (flags.useWord) {
out.build().append(cWord).append("-WORD").add();
}
if (flags.use2W) {
out.build().append(getWord(p2)).append("-P2W").add();
out.build().append(getWord(n2)).append("-N2W").add();
}
if (flags.useLC) {
out.build().append(cWord.toLowerCase()).append("-CL").add();
out.build().append(pWord.toLowerCase()).append("-PL").add();
out.build().append(nWord.toLowerCase()).append("-NL").add();
}
if (flags.useUnknown) {
// for true casing
out.build().append(c.get(CoreAnnotations.UnknownAnnotation.class)).append("-UNKNOWN").add();
out.build().append(p.get(CoreAnnotations.UnknownAnnotation.class)).append("-PUNKNOWN").add();
out.build().append(n.get(CoreAnnotations.UnknownAnnotation.class)).append("-NUNKNOWN").add();
}
if (flags.useLemmas) {
String lem = c.getString(CoreAnnotations.LemmaAnnotation.class);
if (!lem.isEmpty()) {
out.build().append(lem).append("-LEM").add();
}
}
if (flags.usePrevNextLemmas) {
String plem = p.getString(CoreAnnotations.LemmaAnnotation.class);
String nlem = n.getString(CoreAnnotations.LemmaAnnotation.class);
if (!plem.isEmpty()) {
out.build().append(plem).append("-PLEM").add();
}
if (!nlem.isEmpty()) {
out.build().append(nlem).append("-NLEM").add();
}
}
if (flags.checkNameList) {
try {
if (lastNames == null) {
lastNames = Generics.newHashSet();
for (String line : ObjectBank.getLineIterator(flags.lastNameList)) {
lastNames.add(line.split("\\s+")[0]);
}
}
if (maleNames == null) {
maleNames = Generics.newHashSet();
for (String line : ObjectBank.getLineIterator(flags.maleNameList)) {
maleNames.add(line.split("\\s+")[0]);
}
}
if (femaleNames == null) {
femaleNames = Generics.newHashSet();
for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) {
femaleNames.add(line.split("\\s+")[0]);
}
}
String name = cWord.toUpperCase();
if (lastNames.contains(name)) {
out.add("LAST_NAME");
}
if (maleNames.contains(name)) {
out.add("MALE_NAME");
}
if (femaleNames.contains(name)) {
out.add("FEMALE_NAME");
}
} catch (Exception e) {
throw new RuntimeException(e);
}
}
if (flags.binnedLengths != null) {
int len = cWord.length(), beg = -1, end = -1;
for (int i = 0; i < flags.binnedLengths.length; i++) {
if (len <= flags.binnedLengths[i]) {
beg = i == 0 ? 1 : flags.binnedLengths[i - 1];
end = flags.binnedLengths[i];
break;
}
}
if (beg < 0) {
beg = flags.binnedLengths[flags.binnedLengths.length - 1];
}
out.build().append("Len-").append(Integer.toString(beg)).dash().append(end > 0 ? Integer.toString(end) : "Inf").add();
}
if (flags.useABGENE) {
out.build().append(c.get(CoreAnnotations.AbgeneAnnotation.class)).append("-ABGENE").add();
out.build().append(p.get(CoreAnnotations.AbgeneAnnotation.class)).append("-PABGENE").add();
out.build().append(n.get(CoreAnnotations.AbgeneAnnotation.class)).append("-NABGENE").add();
}
if (flags.useABSTRFreqDict) {
out.build().append(c.get(CoreAnnotations.AbstrAnnotation.class)).append("-ABSTRACT").append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
out.build().append(c.get(CoreAnnotations.AbstrAnnotation.class)).append("-ABSTRACT").append(c.get(CoreAnnotations.DictAnnotation.class)).append("-DICT").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
out.build().append(c.get(CoreAnnotations.AbstrAnnotation.class)).append("-ABSTRACT").append(c.get(CoreAnnotations.DictAnnotation.class)).append("-DICT").append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
}
if (flags.useABSTR) {
out.build().append(c.get(CoreAnnotations.AbstrAnnotation.class)).append("-ABSTRACT").add();
out.build().append(p.get(CoreAnnotations.AbstrAnnotation.class)).append("-PABSTRACT").add();
out.build().append(n.get(CoreAnnotations.AbstrAnnotation.class)).append("-NABSTRACT").add();
}
if (flags.useGENIA) {
out.build().append(c.get(CoreAnnotations.GeniaAnnotation.class)).append("-GENIA").add();
out.build().append(p.get(CoreAnnotations.GeniaAnnotation.class)).append("-PGENIA").add();
out.build().append(n.get(CoreAnnotations.GeniaAnnotation.class)).append("-NGENIA").add();
}
if (flags.useWEBFreqDict) {
out.build().append(c.get(CoreAnnotations.WebAnnotation.class)).append("-WEB").append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
out.build().append(c.get(CoreAnnotations.WebAnnotation.class)).append("-WEB").append(c.get(CoreAnnotations.DictAnnotation.class)).append("-DICT").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
out.build().append(c.get(CoreAnnotations.WebAnnotation.class)).append("-WEB").append(c.get(CoreAnnotations.DictAnnotation.class)).append("-DICT").append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
}
if (flags.useWEB) {
out.build().append(c.get(CoreAnnotations.WebAnnotation.class)).append("-WEB").add();
out.build().append(p.get(CoreAnnotations.WebAnnotation.class)).append("-PWEB").add();
out.build().append(n.get(CoreAnnotations.WebAnnotation.class)).append("-NWEB").add();
}
if (flags.useIsURL) {
out.build().append(c.get(CoreAnnotations.IsURLAnnotation.class)).append("-ISURL").add();
}
if (flags.useEntityRule) {
out.build().append(c.get(CoreAnnotations.EntityRuleAnnotation.class)).append("-ENTITYRULE").add();
}
if (flags.useEntityTypes) {
out.build().append(c.get(CoreAnnotations.EntityTypeAnnotation.class)).append("-ENTITYTYPE").add();
}
if (flags.useIsDateRange) {
out.build().append(c.get(CoreAnnotations.IsDateRangeAnnotation.class)).append("-ISDATERANGE").add();
}
if (flags.useABSTRFreq) {
out.build().append(c.get(CoreAnnotations.AbstrAnnotation.class)).append("-ABSTRACT").append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").add();
}
if (flags.useFREQ) {
out.build().append(c.get(CoreAnnotations.FreqAnnotation.class)).append("-FREQ").add();
}
if (flags.useMoreTags) {
out.build().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(cWord).append("-PTAG-CWORD").add();
}
if (flags.usePosition) {
out.build().append(c.get(CoreAnnotations.PositionAnnotation.class)).append("-POSITION").add();
}
if (flags.useBeginSent) {
String pos = c.get(CoreAnnotations.PositionAnnotation.class);
if ("0".equals(pos)) {
out.add("BEGIN-SENT");
out.build().append(cShape).append("-BEGIN-SENT").add();
} else if (Integer.toString(cInfo.size() - 1).equals(pos)) {
out.add("END-SENT");
out.build().append(cShape).append("-END-SENT").add();
} else {
out.add("IN-SENT");
out.build().append(cShape).append("-IN-SENT").add();
}
}
if (flags.useTags) {
out.build().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TAG").add();
}
if (flags.useOrdinal) {
if (isOrdinal(cInfo, loc)) {
out.add("C_ORDINAL");
if (isOrdinal(cInfo, loc - 1)) {
// log.info(getWord(p) + " ");
out.add("PC_ORDINAL");
}
// log.info(cWord);
}
if (isOrdinal(cInfo, loc - 1)) {
out.add("P_ORDINAL");
}
}
if (flags.usePrev) {
out.build().append(pWord).append("-PW").add();
if (flags.useTags) {
out.build().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-PTAG").add();
}
if (flags.useDistSim) {
out.build().append(p.get(CoreAnnotations.DistSimAnnotation.class)).append("-PDISTSIM").add();
}
if (flags.useIsURL) {
out.build().append(p.get(CoreAnnotations.IsURLAnnotation.class)).append("-PISURL").add();
}
if (flags.useEntityTypes) {
out.build().append(p.get(CoreAnnotations.EntityTypeAnnotation.class)).append("-PENTITYTYPE").add();
}
}
if (flags.useNext) {
out.build().append(nWord).append("-NW").add();
if (flags.useTags) {
out.build().append(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-NTAG").add();
}
if (flags.useDistSim) {
out.build().append(n.get(CoreAnnotations.DistSimAnnotation.class)).append("-NDISTSIM").add();
}
if (flags.useIsURL) {
out.build().append(n.get(CoreAnnotations.IsURLAnnotation.class)).append("-NISURL").add();
}
if (flags.useEntityTypes) {
out.build().append(n.get(CoreAnnotations.EntityTypeAnnotation.class)).append("-NENTITYTYPE").add();
}
}
if (flags.useEitherSideWord) {
out.build().append(pWord).append("-EW").add();
out.build().append(nWord).append("-EW").add();
}
if (flags.useWordPairs) {
out.build().append(cWord).dash().append(pWord).append("-W-PW").add();
out.build().append(cWord).dash().append(nWord).append("-W-NW").add();
}
if (flags.useSymTags) {
if (flags.useTags) {
out.build().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-PCNTAGS").add();
out.build().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-CNTAGS").add();
out.build().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-PCTAGS").add();
}
if (flags.useDistSim) {
out.build().append(p.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(n.get(CoreAnnotations.DistSimAnnotation.class)).append("-PCNDISTSIM").add();
out.build().append(c.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(n.get(CoreAnnotations.DistSimAnnotation.class)).append("-CNDISTSIM").add();
out.build().append(p.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-PCDISTSIM").add();
}
}
if (flags.useSymWordPairs) {
out.build().append(pWord).dash().append(nWord).append("-SWORDS").add();
}
if (flags.useGazFeatures || flags.useMoreGazFeatures) {
String pGazAnnotation = p.get(CoreAnnotations.GazAnnotation.class);
String nGazAnnotation = n.get(CoreAnnotations.GazAnnotation.class);
String cGazAnnotation = c.get(CoreAnnotations.GazAnnotation.class);
if (flags.useGazFeatures) {
if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
out.build().append(cGazAnnotation).append("-GAZ").add();
}
// n
if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
out.build().append(nGazAnnotation).append("-NGAZ").add();
}
// p
if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
out.build().append(pGazAnnotation).append("-PGAZ").add();
}
}
if (flags.useMoreGazFeatures) {
if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
out.build().append(cGazAnnotation).dash().append(cWord).append("-CG-CW-GAZ").add();
// c-n
if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
out.build().append(cGazAnnotation).dash().append(nGazAnnotation).append("-CNGAZ").add();
}
// p-c
if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
out.build().append(pGazAnnotation).dash().append(cGazAnnotation).append("-PCGAZ").add();
}
}
}
}
if (flags.useAbbr || flags.useMinimalAbbr) {
out.build().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-ABBR").add();
}
if (flags.useAbbr1 || flags.useMinimalAbbr1) {
if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
out.build().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-ABBR").add();
}
}
if (flags.useAbbr) {
out.build().append(p.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-PCABBR").add();
out.build().append(c.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(n.get(CoreAnnotations.AbbrAnnotation.class)).append("-CNABBR").add();
out.build().append(p.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(n.get(CoreAnnotations.AbbrAnnotation.class)).append("-PCNABBR").add();
}
if (flags.useAbbr1) {
if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
out.build().append(p.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-PCABBR").add();
out.build().append(c.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(n.get(CoreAnnotations.AbbrAnnotation.class)).append("-CNABBR").add();
out.build().append(p.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).dash().append(n.get(CoreAnnotations.AbbrAnnotation.class)).append("-PCNABBR").add();
}
}
if (flags.useChunks) {
out.build().append(p.get(CoreAnnotations.ChunkAnnotation.class)).dash().append(c.get(CoreAnnotations.ChunkAnnotation.class)).append("-PCCHUNK").add();
out.build().append(c.get(CoreAnnotations.ChunkAnnotation.class)).dash().append(n.get(CoreAnnotations.ChunkAnnotation.class)).append("-CNCHUNK").add();
out.build().append(p.get(CoreAnnotations.ChunkAnnotation.class)).dash().append(c.get(CoreAnnotations.ChunkAnnotation.class)).dash().append(n.get(CoreAnnotations.ChunkAnnotation.class)).append("-PCNCHUNK").add();
}
if (flags.useMinimalAbbr) {
out.build().append(cWord).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-CWABB").add();
}
if (flags.useMinimalAbbr1) {
if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
out.build().append(cWord).dash().append(c.get(CoreAnnotations.AbbrAnnotation.class)).append("-CWABB").add();
}
}
String prevVB = "", nextVB = "";
if (flags.usePrevVB) {
for (int j = loc - 1; ; j--) {
CoreLabel wi = cInfo.get(j);
if (wi == cInfo.getPad()) {
prevVB = "X";
out.add("X-PVB");
break;
} else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
out.build().append(getWord(wi)).append("-PVB").add();
prevVB = getWord(wi);
break;
}
}
}
if (flags.useNextVB) {
for (int j = loc + 1; ; j++) {
CoreLabel wi = cInfo.get(j);
if (wi == cInfo.getPad()) {
out.add("X-NVB");
nextVB = "X";
break;
} else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
out.build().append(getWord(wi)).append("-NVB").add();
nextVB = getWord(wi);
break;
}
}
}
if (flags.useVB) {
out.build().append(prevVB).dash().append(nextVB).append("-PNVB").add();
}
if (flags.useShapeConjunctions) {
out.build().append(c.get(CoreAnnotations.PositionAnnotation.class)).append(cShape).append("-POS-SH").add();
if (flags.useTags) {
out.build().append(c.tag()).append(cShape).append("-TAG-SH").add();
}
if (flags.useDistSim) {
out.build().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append(cShape).append("-DISTSIM-SH").add();
}
}
if (flags.useWordTag) {
out.build().append(cWord).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-W-T").add();
out.build().append(cWord).dash().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-W-PT").add();
out.build().append(cWord).dash().append(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-W-NT").add();
}
if (flags.useNPHead) {
// TODO: neat idea, but this would need to be set somewhere.
// Probably should have its own annotation as this one would
// be more narrow and would clobber other potential uses
out.build().append(c.get(CoreAnnotations.HeadWordStringAnnotation.class)).append("-HW").add();
if (flags.useTags) {
out.build().append(c.get(CoreAnnotations.HeadWordStringAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-HW-T").add();
}
if (flags.useDistSim) {
out.build().append(c.get(CoreAnnotations.HeadWordStringAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-HW-DISTSIM").add();
}
}
if (flags.useNPGovernor) {
out.build().append(c.get(CoreAnnotations.GovernorAnnotation.class)).append("-GW").add();
if (flags.useTags) {
out.build().append(c.get(CoreAnnotations.GovernorAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-GW-T").add();
}
if (flags.useDistSim) {
out.build().append(c.get(CoreAnnotations.GovernorAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-DISTSIM-T1").add();
}
}
if (flags.useHeadGov) {
// TODO: neat idea, but this would need to be set somewhere.
// Probably should have its own annotation as this one would
// be more narrow and would clobber other potential uses
out.build().append(c.get(CoreAnnotations.HeadWordStringAnnotation.class)).dash().append(c.get(CoreAnnotations.GovernorAnnotation.class)).append("-HW_GW").add();
}
if (flags.useClassFeature) {
out.add("###");
}
if (flags.useFirstWord) {
out.add(getWord(cInfo.get(0)));
}
if (flags.useNGrams) {
Collection<String> subs = null;
if (flags.cacheNGrams) {
subs = wordToSubstrings.get(cWord);
}
if (subs == null) {
subs = new ArrayList<>();
String word = '<' + cWord + '>';
if (flags.lowercaseNGrams) {
word = word.toLowerCase();
}
if (flags.dehyphenateNGrams) {
word = dehyphenate(word);
}
if (flags.greekifyNGrams) {
word = greekify(word);
}
// hoist flags.noMidNGrams so only linear in word length for that case
if (flags.noMidNGrams) {
int max = flags.maxNGramLeng >= 0 ? Math.min(flags.maxNGramLeng, word.length()) : word.length();
for (int j = 2; j <= max; j++) {
subs.add(intern('#' + word.substring(0, j) + '#'));
}
int start = flags.maxNGramLeng >= 0 ? Math.max(0, word.length() - flags.maxNGramLeng) : 0;
int lenM1 = word.length() - 1;
for (int i = start; i < lenM1; i++) {
subs.add(intern('#' + word.substring(i) + '#'));
}
} else {
for (int i = 0; i < word.length(); i++) {
for (int j = i + 2, max = Math.min(word.length(), i + flags.maxNGramLeng); j <= max; j++) {
if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
continue;
}
subs.add(intern('#' + word.substring(i, j) + '#'));
}
}
}
if (flags.cacheNGrams) {
wordToSubstrings.put(cWord, subs);
}
}
for (String sub : subs) {
out.add(sub);
}
if (flags.conjoinShapeNGrams) {
for (String str : subs) {
out.build().append(str).dash().append(cShape).append("-CNGram-CS").add();
}
}
}
if (flags.useGazettes) {
if (flags.sloppyGazette) {
Collection<String> entries = wordToGazetteEntries.get(cWord);
if (entries != null) {
for (String entry : entries) {
out.add(entry);
}
}
}
if (flags.cleanGazette) {
Collection<GazetteInfo> infos = wordToGazetteInfos.get(cWord);
if (infos != null) {
gazette: for (GazetteInfo gInfo : infos) {
for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) {
if (!gInfo.words[gLoc].equals(getWord(cInfo.get(loc + gLoc - gInfo.loc)))) {
continue gazette;
}
}
out.add(gInfo.feature);
}
}
}
}
if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) {
out.build().append(cShape).append("-TYPE").add();
if (flags.useTypeSeqs) {
out.build().append(pShape).append("-PTYPE").add();
out.build().append(nShape).append("-NTYPE").add();
out.build().append(pWord).append("...").append(cShape).append("-PW_CTYPE").add();
out.build().append(cShape).append("...").append(nWord).append("-NW_CTYPE").add();
out.build().append(pShape).append("...").append(cShape).append("-PCTYPE").add();
out.build().append(cShape).append("...").append(nShape).append("-CNTYPE").add();
out.build().append(pShape).append("...").append(cShape).append("...").append(nShape).append("-PCNTYPE").add();
}
}
if (flags.useLastRealWord) {
if (pWord.length() <= 3) {
// extending this to check for 2 short words doesn't seem to help....
out.build().append(getWord(p2)).append("...").append(cShape).append("-PPW_CTYPE").add();
}
}
if (flags.useNextRealWord) {
if (nWord.length() <= 3) {
// extending this to check for 2 short words doesn't seem to help....
out.build().append(getWord(n2)).append("...").append(cShape).append("-NNW_CTYPE").add();
}
}
if (flags.useOccurrencePatterns) {
occurrencePatterns(cInfo, loc, out);
}
if (flags.useDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
CoreLabel dn = cInfo.get(loc + i);
CoreLabel dp = cInfo.get(loc - i);
out.build().append(getWord(dn)).append("-DISJN").add();
if (flags.useDisjunctiveShapeInteraction) {
out.build().append(getWord(dn)).dash().append(cShape).append("-DISJN-CS").add();
}
out.build().append(getWord(dp)).append("-DISJP").add();
if (flags.useDisjunctiveShapeInteraction) {
out.build().append(getWord(dp)).dash().append(cShape).append("-DISJP-CS").add();
}
}
}
if (flags.useUndirectedDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
CoreLabel dn = cInfo.get(loc + i);
CoreLabel dp = cInfo.get(loc - i);
out.build().append(getWord(dn)).append("-DISJ").add();
out.build().append(getWord(dp)).append("-DISJ").add();
}
}
if (flags.useWideDisjunctive) {
for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
out.build().append(getWord(cInfo.get(loc + i))).append("-DISJWN").add();
out.build().append(getWord(cInfo.get(loc - i))).append("-DISJWP").add();
}
}
if (flags.useEitherSideDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
out.build().append(getWord(cInfo.get(loc + i))).append("-DISJWE").add();
out.build().append(getWord(cInfo.get(loc - i))).append("-DISJWE").add();
}
}
if (flags.useDisjShape) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
out.build().append(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class)).append("-NDISJSHAPE").add();
// out.build().append((cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class)).append("-PDISJSHAPE").add();
out.build().append(cShape).dash().append(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class)).append("-CNDISJSHAPE").add();
// out.build().append(c.get(CoreAnnotations.ShapeAnnotation.class)).dash().append(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class)).append("-CPDISJSHAPE").add();
}
}
if (flags.useExtraTaggySequences) {
if (flags.useTags) {
out.build().append(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TTS").add();
out.build().append(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).dash().append(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class)).append("-TTTS").add();
}
if (flags.useDistSim) {
out.build().append(p2.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(p.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-DISTSIM_TTS1").add();
out.build().append(p3.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(p2.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(p.get(CoreAnnotations.DistSimAnnotation.class)).dash().append(c.get(CoreAnnotations.DistSimAnnotation.class)).append("-DISTSIM_TTTS1").add();
}
}
if (flags.useMUCFeatures) {
out.build().append(c.get(CoreAnnotations.SectionAnnotation.class)).append("-SECTION").add();
out.build().append(c.get(CoreAnnotations.WordPositionAnnotation.class)).append("-WORD_POSITION").add();
out.build().append(c.get(CoreAnnotations.SentencePositionAnnotation.class)).append("-SENT_POSITION").add();
out.build().append(c.get(CoreAnnotations.ParaPositionAnnotation.class)).append("-PARA_POSITION").add();
out.build().append(c.get(CoreAnnotations.WordPositionAnnotation.class)).dash().append(c.get(CoreAnnotations.ShapeAnnotation.class)).append("-WORD_POSITION_SHAPE").add();
}
} else if (flags.useInternal) {
if (flags.useWord) {
out.build().append(cWord).append("-WORD").add();
}
if (flags.useNGrams) {
Collection<String> subs = wordToSubstrings.get(cWord);
if (subs == null) {
subs = new ArrayList<>();
String word = '<' + cWord + '>';
if (flags.lowercaseNGrams) {
word = word.toLowerCase();
}
if (flags.dehyphenateNGrams) {
word = dehyphenate(word);
}
if (flags.greekifyNGrams) {
word = greekify(word);
}
for (int i = 0; i < word.length(); i++) {
for (int j = i + 2; j <= word.length(); j++) {
if (flags.noMidNGrams && i != 0 && j != word.length()) {
continue;
}
if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
continue;
}
// subs.add(intern("#" + word.substring(i, j) + "#"));
subs.add(intern('#' + word.substring(i, j) + '#'));
}
}
if (flags.cacheNGrams) {
wordToSubstrings.put(cWord, subs);
}
}
for (String sub : subs) {
out.add(sub);
}
if (flags.conjoinShapeNGrams) {
String shape = c.get(CoreAnnotations.ShapeAnnotation.class);
for (String str : subs) {
out.build().append(str).dash().append(shape).append("-CNGram-CS").add();
}
}
}
if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) {
out.build().append(cShape).append("-TYPE").add();
}
if (flags.useOccurrencePatterns) {
occurrencePatterns(cInfo, loc, out);
}
} else if (flags.useExternal) {
if (flags.usePrev) {
out.build().append(pWord).append("-PW").add();
}
if (flags.useNext) {
out.build().append(nWord).append("-NW").add();
}
if (flags.useWordPairs) {
out.build().append(cWord).dash().append(pWord).append("-W-PW").add();
out.build().append(cWord).dash().append(nWord).append("-W-NW").add();
}
if (flags.useSymWordPairs) {
out.build().append(pWord).dash().append(nWord).append("-SWORDS").add();
}
if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) {
if (flags.useTypeSeqs) {
out.build().append(pShape).append("-PTYPE").add();
out.build().append(nShape).append("-NTYPE").add();
out.build().append(pWord).append("...").append(cShape).append("-PW_CTYPE").add();
out.build().append(cShape).append("...").append(nWord).append("-NW_CTYPE").add();
if (flags.maxLeft > 0)
// this one just isn't useful, at least given c,pc,s,ps. Might be useful 0th-order
out.build().append(pShape).append("...").append(cShape).append("-PCTYPE").add();
out.build().append(cShape).append("...").append(nShape).append("-CNTYPE").add();
out.build().append(pShape).append("...").append(cShape).append("...").append(nShape).append("-PCNTYPE").add();
}
}
if (flags.useLastRealWord) {
if (pWord.length() <= 3) {
out.build().append(getWord(p2)).append("...").append(cShape).append("-PPW_CTYPE").add();
}
}
if (flags.useNextRealWord) {
if (nWord.length() <= 3) {
out.build().append(getWord(n2)).append("...").append(cShape).append("-NNW_CTYPE").add();
}
}
if (flags.useDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
CoreLabel dn = cInfo.get(loc + i);
CoreLabel dp = cInfo.get(loc - i);
out.build().append(getWord(dn)).append("-DISJN").add();
if (flags.useDisjunctiveShapeInteraction) {
out.build().append(getWord(dn)).dash().append(cShape).append("-DISJN-CS").add();
}
out.build().append(getWord(dp)).append("-DISJP").add();
if (flags.useDisjunctiveShapeInteraction) {
out.build().append(getWord(dp)).dash().append(cShape).append("-DISJP-CS").add();
}
}
}
if (flags.useWideDisjunctive) {
for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
out.build().append(getWord(cInfo.get(loc + i))).append("-DISJWN").add();
out.build().append(getWord(cInfo.get(loc - i))).append("-DISJWP").add();
}
}
if (flags.useDisjShape) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
out.build().append(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class)).append("-NDISJSHAPE").add();
// out.build().append((cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class)).append("-PDISJSHAPE").add();
out.build().append(c.get(CoreAnnotations.ShapeAnnotation.class)).dash().append(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class)).append("-CNDISJSHAPE").add();
// out.build().append(c.get(CoreAnnotations.ShapeAnnotation.class)).dash().append(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class)).append("-CPDISJSHAPE").add();
}
}
}
// Stuff to add binary features from the additional columns
if (flags.twoStage) {
out.build().append(c.get(Bin1Annotation.class)).append("-BIN1").add();
out.build().append(c.get(Bin2Annotation.class)).append("-BIN2").add();
out.build().append(c.get(Bin3Annotation.class)).append("-BIN3").add();
out.build().append(c.get(Bin4Annotation.class)).append("-BIN4").add();
out.build().append(c.get(Bin5Annotation.class)).append("-BIN5").add();
out.build().append(c.get(Bin6Annotation.class)).append("-BIN6").add();
}
if (flags.useIfInteger) {
try {
int val = Integer.parseInt(cWord);
if (val > 0)
out.add("POSITIVE_INTEGER");
else if (val < 0)
out.add("NEGATIVE_INTEGER");
// log.info("FOUND INTEGER");
} catch (NumberFormatException e) {
// not an integer value, nothing to do
}
}
// Stuff to add arbitrary features
if (flags.useGenericFeatures) {
// see if we need to cache the keys
if (genericAnnotationKeys == null) {
makeGenericKeyCache(c);
}
// now look through the cached keys
for (Class<?> key : genericAnnotationKeys) {
// log.info("Adding feature: " + CoreLabel.genericValues.get(key) + " with value " + c.get(key));
Object col = c.get((Class<CoreAnnotation<Object>>) key);
if (col instanceof Collection) {
for (Object ob : (Collection<?>) col) {
out.build().append(ob.toString()).dash().append(CoreLabel.genericValues.get(key)).add();
}
} else if (col != null) {
out.build().append(col.toString()).dash().append(CoreLabel.genericValues.get(key)).add();
}
}
}
if (flags.useTopics) {
// out.build().append(p.get(CoreAnnotations.TopicAnnotation.class), "-", cWord, "--CWORD").add();
out.build().append(c.get(CoreAnnotations.TopicAnnotation.class)).append("-TopicID").add();
out.build().append(p.get(CoreAnnotations.TopicAnnotation.class)).append("-PTopicID").add();
out.build().append(n.get(CoreAnnotations.TopicAnnotation.class)).append("-NTopicID").add();
// out.build().append(p.get(CoreAnnotations.TopicAnnotation.class)).dash().append(c.get(CoreAnnotations.TopicAnnotation.class)).dash().append(n.get(CoreAnnotations.TopicAnnotation.class)).append("-PCNTopicID").add();
// out.build().append(c.get(CoreAnnotations.TopicAnnotation.class)).dash().append(n.get(CoreAnnotations.TopicAnnotation.class)).append("-CNTopicID").add();
// out.build().append(p.get(CoreAnnotations.TopicAnnotation.class)).dash().append(c.get(CoreAnnotations.TopicAnnotation.class)).append("-PCTopicID").add();
// out.build().append(c.get(CoreAnnotations.TopicAnnotation.class)).append(cShape).append("-TopicID-SH").add();
}
// NER tag annotations from a previous NER system
if (c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) != null) {
out.build().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-CStackedNERTag").add();
out.build().append(cWord).dash().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-WCStackedNERTag").add();
if (flags.useNext) {
out.build().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-CNStackedNERTag").add();
out.build().append(cWord).dash().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-WCNStackedNERTag").add();
if (flags.usePrev) {
out.build().append(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-PCNStackedNERTag").add();
out.build().append(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(cWord).append(" -").append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-PWCNStackedNERTag").add();
}
}
if (flags.usePrev) {
out.build().append(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).dash().append(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)).append("-PCStackedNERTag").add();
}
}
if (flags.useWordnetFeatures)
out.build().append(c.get(CoreAnnotations.WordnetSynAnnotation.class)).append("-WordnetSyn").add();
if (flags.useProtoFeatures)
out.build().append(c.get(CoreAnnotations.ProtoAnnotation.class)).append("-Proto").add();
if (flags.usePhraseWordTags)
out.build().append(c.get(CoreAnnotations.PhraseWordsTagAnnotation.class)).append("-PhraseTag").add();
if (flags.usePhraseWords) {
for (String w : c.get(CoreAnnotations.PhraseWordsAnnotation.class)) {
out.build().append(w).append("-PhraseWord").add();
}
}
if (flags.useCommonWordsFeature)
out.add(c.get(CoreAnnotations.CommonWordsAnnotation.class));
if (flags.useRadical && cWord.length() > 0) {
// todo [cdm 2016]: Really all stuff in this file should be fixed to work with codepoints outside BMP
if (cWord.length() == 1) {
out.build().append(RadicalMap.getRadical(cWord.charAt(0))).append("-SINGLE-CHAR-RADICAL").add();
} else {
out.build().append(RadicalMap.getRadical(cWord.charAt(0))).append("-START-RADICAL").add();
out.build().append(RadicalMap.getRadical(cWord.charAt(cWord.length() - 1))).append("-END-RADICAL").add();
}
for (int i = 0; i < cWord.length(); ++i) {
out.build().append(RadicalMap.getRadical(cWord.charAt(i))).append("-RADICAL").add();
}
}
if (flags.splitWordRegex != null && !flags.splitWordRegex.isEmpty()) {
for (String s : c.word().split(flags.splitWordRegex)) {
out.build().append(s).append("-SPLITWORD").add();
}
}
if (flags.useMoreNeighborNGrams) {
int maxLen = pWord.length();
if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) {
maxLen = flags.maxNGramLeng;
}
for (int len = 1; len <= maxLen; ++len) {
out.build().append(pWord.substring(0, len)).append("-PREV-PREFIX").add();
}
for (int pos = pWord.length() - maxLen; pos < pWord.length(); ++pos) {
out.build().append(pWord.substring(pos, pWord.length())).append("-PREV-SUFFIX").add();
}
maxLen = nWord.length();
if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) {
maxLen = flags.maxNGramLeng;
}
for (int len = 1; len <= maxLen; ++len) {
out.build().append(nWord.substring(0, len)).append("-NEXT-PREFIX").add();
}
for (int pos = nWord.length() - maxLen; pos < nWord.length(); ++pos) {
out.build().append(nWord.substring(pos, nWord.length())).append("-NEXT-SUFFIX").add();
}
}
}
use of edu.stanford.nlp.ling.CoreAnnotation in project CoreNLP by stanfordnlp.
the class StanfordCoreNLP method construct.
//
// AnnotatorPool construction support
//
/**
 * Builds this pipeline from the given properties: resolves the effective
 * properties, sets up the processor semaphore, and adds every annotator named in
 * the {@code annotators} property, in the order listed.
 *
 * @param props pipeline configuration. If {@code null}, the properties loaded from
 *     the classpath are used. If it has no {@code annotators} value, its settings
 *     (including any it inherits from a default list) are layered on top of the
 *     classpath properties.
 * @param enforceRequirements if {@code true}, each annotation an annotator reports
 *     via {@code requires()} must already have been reported as satisfied by an
 *     annotator listed before it
 * @param annotatorImplementations handed to {@code getDefaultAnnotatorPool} when
 *     {@code pool} is {@code null}
 * @param pool the pool annotators are fetched from by name; if {@code null}, the
 *     default pool is built from the effective properties
 * @throws IllegalArgumentException if {@code enforceRequirements} is set and an
 *     annotator requires an annotation no earlier annotator satisfies
 */
private void construct(Properties props, boolean enforceRequirements, AnnotatorImplementations annotatorImplementations, AnnotatorPool pool) {
  Timing tim = new Timing();
  this.numWords = 0;
  this.constituentTreePrinter = new TreePrint("penn");
  this.dependencyTreePrinter = new TreePrint("typedDependenciesCollapsed");

  if (props == null) {
    // if undefined, find the properties file in the classpath
    props = loadPropertiesFromClasspath();
  } else if (props.getProperty("annotators") == null) {
    // this happens when some command line options are specified (e.g just "-filelist") but no properties file is.
    // we use the options that are given and let them override the default properties from the class path properties.
    Properties fromClassPath = loadPropertiesFromClasspath();
    // putAll() is inherited from Hashtable and only sees entries stored directly in
    // props; values that props merely inherits from its default list would be lost.
    // Copy the full string view first, then let the direct entries take precedence.
    for (String key : props.stringPropertyNames()) {
      fromClassPath.setProperty(key, props.getProperty(key));
    }
    fromClassPath.putAll(props);
    props = fromClassPath;
  }
  this.properties = props;

  if (pool == null) {
    // if undefined, load the default annotator pool
    pool = getDefaultAnnotatorPool(props, annotatorImplementations);
  }

  // Set threading.
  // Use getProperty() rather than containsKey(): containsKey() is a Hashtable method
  // and ignores the default list, so a "threads" value supplied through
  // new Properties(defaults) would otherwise be silently skipped.
  if (this.properties.getProperty("threads") != null) {
    ArgumentParser.threads = PropertiesUtils.getInt(this.properties, "threads");
    this.availableProcessors = new Semaphore(ArgumentParser.threads);
  } else {
    this.availableProcessors = new Semaphore(1);
  }

  // now construct the annotators from the given properties in the given order
  List<String> annoNames = Arrays.asList(getRequiredProperty(props, "annotators").split("[, \t]+"));
  Set<String> alreadyAddedAnnoNames = Generics.newHashSet();
  Set<Class<? extends CoreAnnotation>> requirementsSatisfied = Generics.newHashSet();
  for (String name : annoNames) {
    name = name.trim();
    if (name.isEmpty()) {
      // a leading separator in the property value yields an empty first token
      continue;
    }
    logger.info("Adding annotator " + name);
    Annotator an = pool.get(name);
    this.addAnnotator(an);

    if (enforceRequirements) {
      // Every requirement must have been satisfied by an annotator added earlier.
      Set<Class<? extends CoreAnnotation>> allRequirements = an.requires();
      for (Class<? extends CoreAnnotation> requirement : allRequirements) {
        if (!requirementsSatisfied.contains(requirement)) {
          String fmt = "annotator \"%s\" requires annotation \"%s\". The usual requirements for this annotator are: %s";
          throw new IllegalArgumentException(String.format(fmt, name, requirement.getSimpleName(), StringUtils.join(Annotator.DEFAULT_REQUIREMENTS.getOrDefault(name, Collections.singleton("unknown")), ",")));
        }
      }
      requirementsSatisfied.addAll(an.requirementsSatisfied());
    }
    alreadyAddedAnnoNames.add(name);
  }

  // Sanity check: without the ssplit annotator in the pipeline, turn the
  // newline-splitter system property off.
  if (!alreadyAddedAnnoNames.contains(STANFORD_SSPLIT)) {
    System.setProperty(NEWLINE_SPLITTER_PROPERTY, "false");
  }
  this.pipelineSetupTime = tim.report();
}
Aggregations