Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
Class NERClassifierCombiner, method main.
/**
 * Command-line entry point for NERClassifierCombiner.
 *
 * <p>The arguments are converted to {@link Properties} and the following keys are consulted,
 * in this order:
 * <ul>
 *   <li>{@code loadClassifier} - if present, the combiner is loaded from this path; otherwise
 *       a new combiner is created from the properties.</li>
 *   <li>{@code serializeTo} - if present, the combiner is serialized to this path.</li>
 *   <li>{@code textFile} / {@code textFiles} - a single file, or a comma-separated list of
 *       files, to classify.</li>
 *   <li>{@code testFile} / {@code testFiles} - a single file, or a comma-separated list of
 *       files, to classify with the default reader/writer; when {@code crfToExamine} is also
 *       set, the work is delegated to {@code ClassifierCombiner.examineCRF} instead.</li>
 *   <li>{@code showNCCInfo} - if present (any value), information about the combiner is shown.</li>
 * </ul>
 * Finally, if the parsed flags have {@code readStdin} set, the combiner classifies standard input.
 *
 * @param args command-line arguments, parsed into properties
 * @throws Exception propagated from any of the loading, serializing or classifying steps
 */
public static void main(String[] args) throws Exception {
  StringUtils.logInvocationString(log, args);
  Properties props = StringUtils.argsToProperties(args);
  // The second constructor argument is deliberately false; the original note ties it to
  // how probabilities are printed.
  SeqClassifierFlags flags = new SeqClassifierFlags(props, false);

  String loadPath = props.getProperty("loadClassifier");
  NERClassifierCombiner ncc;
  if (loadPath == null) {
    // A null passDownProperties lets every property go through.
    ncc = createNERClassifierCombiner("ner", null, props);
  } else {
    // When loading a serialized classifier, command-line settings override the stored ones:
    // e.g. a model dumped with useSUTime = false still honours -useSUTime given here.
    ncc = getClassifier(loadPath, props);
  }

  // Optionally write the combiner to disk.
  String serializeTo = props.getProperty("serializeTo");
  if (serializeTo != null) {
    ncc.serializeClassifier(serializeTo);
  }

  // Optionally classify a single text file.
  String textFile = props.getProperty("textFile");
  if (textFile != null) {
    ncc.classifyAndWriteAnswers(textFile);
  }

  // Optionally classify several text files given as a comma-separated list
  // (modelled on the CRFClassifier code).
  String textFiles = props.getProperty("textFiles");
  if (textFiles != null) {
    List<File> inputs = Arrays.stream(textFiles.split(","))
        .map(File::new)
        .collect(Collectors.toCollection(ArrayList::new));
    ncc.classifyFilesAndWriteAnswers(inputs);
  }

  // Options for running the combiner on a test file or a list of test files.
  String testFile = props.getProperty("testFile");
  String testFiles = props.getProperty("testFiles");
  String crfToExamine = props.getProperty("crfToExamine");
  DocumentReaderAndWriter<CoreLabel> readerAndWriter = ncc.defaultReaderAndWriter();
  if (testFile != null || testFiles != null) {
    if (crfToExamine != null) {
      // A specific CRF was requested: hand everything over to examineCRF.
      ClassifierCombiner.examineCRF(ncc, crfToExamine, flags, testFile, testFiles, readerAndWriter);
    } else if (testFile != null) {
      // A single test file takes precedence over the list.
      ncc.classifyAndWriteAnswers(testFile, readerAndWriter, true);
    } else {
      List<File> testInputs = new ArrayList<>();
      for (String path : testFiles.split(",")) {
        testInputs.add(new File(path));
      }
      ncc.classifyFilesAndWriteAnswers(testInputs, ncc.defaultReaderAndWriter(), true);
    }
  }

  // Optionally show information about the combiner; only the presence of the key matters.
  if (props.getProperty("showNCCInfo") != null) {
    showNCCInfo(ncc);
  }

  // Optionally read input from stdin.
  if (flags.readStdin) {
    ncc.classifyStdin();
  }
}
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
Class NERClassifierCombiner, method recognizeNumberSequences.
/**
 * Runs the number sequence classifier {@code nsc} on a copy of the given tokens and merges
 * its results back into the originals.
 *
 * <p>For each token, the original AnswerAnnotation is replaced by the classifier's guess only
 * when the original is null, the background symbol, or "MISC", and the guess itself is not
 * the background symbol. Independently of that, the remaining annotations are transferred
 * from the classified copy to the original via
 * {@code NumberSequenceClassifier.transferAnnotations}.
 *
 * @param words    tokens of the sentence; updated in place
 * @param document document passed through to the number sequence classifier
 * @param sentence sentence passed through to the token copy and to the classifier
 */
private void recognizeNumberSequences(List<CoreLabel> words, final CoreMap document, final CoreMap sentence) {
  // Work on copies because NumberSequenceClassifier overwrites the AnswerAnnotation.
  final List<CoreLabel> classifiedCopies = NumberSequenceClassifier.copyTokens(words, sentence);
  nsc.classifyWithGlobalInformation(classifiedCopies, document, sentence);

  final int tokenCount = words.size();
  for (int idx = 0; idx < tokenCount; idx++) {
    final CoreLabel target = words.get(idx);
    final CoreLabel classified = classifiedCopies.get(idx);
    final String existing = target.get(CoreAnnotations.AnswerAnnotation.class);
    final String proposed = classified.get(CoreAnnotations.AnswerAnnotation.class);

    // Only an empty, background or MISC label may be overridden by the new guess.
    final boolean replaceable = existing == null
        || existing.equals(nsc.flags.backgroundSymbol)
        || existing.equals("MISC");
    if (replaceable && !proposed.equals(nsc.flags.backgroundSymbol)) {
      target.set(CoreAnnotations.AnswerAnnotation.class, proposed);
    }

    // Carry over the other annotations generated by SUTime or NumberNormalizer.
    NumberSequenceClassifier.transferAnnotations(classified, target);
  }
}
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
Class NERFeatureFactory, method featuresC.
protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) {
CoreLabel p3 = cInfo.get(loc - 3);
CoreLabel p2 = cInfo.get(loc - 2);
CoreLabel p = cInfo.get(loc - 1);
CoreLabel c = cInfo.get(loc);
CoreLabel n = cInfo.get(loc + 1);
CoreLabel n2 = cInfo.get(loc + 2);
String cWord = getWord(c);
String pWord = getWord(p);
String nWord = getWord(n);
String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class);
String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class);
String nShape = n.getString(CoreAnnotations.ShapeAnnotation.class);
Collection<String> featuresC = new ArrayList<>();
if (flags.useDistSim) {
distSimAnnotate(cInfo);
}
if (flags.useBagOfWords) {
for (IN word : cInfo) {
featuresC.add(getWord(word) + "-BAGOFWORDS");
}
}
if (flags.useDistSim && flags.useMoreTags) {
featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + cWord + "-PDISTSIM-CWORD");
}
if (flags.useDistSim) {
featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM");
}
if (flags.useTitle) {
Matcher m = titlePattern.matcher(cWord);
if (m.matches()) {
featuresC.add("IS_TITLE");
}
} else if (flags.useTitle2) {
Matcher m = titlePattern2.matcher(cWord);
if (m.matches()) {
featuresC.add("IS_TITLE");
}
}
if (flags.slashHyphenTreatment != SeqClassifierFlags.SlashHyphenEnum.NONE) {
if (flags.useWord) {
generateSlashHyphenFeatures(cWord, featuresC, "-WFRAG", "-WORD");
}
}
if (flags.useInternal && flags.useExternal) {
if (flags.useWord) {
featuresC.add(cWord + "-WORD");
}
if (flags.use2W) {
featuresC.add(getWord(p2) + "-P2W");
featuresC.add(getWord(n2) + "-N2W");
}
if (flags.useLC) {
featuresC.add(cWord.toLowerCase() + "-CL");
featuresC.add(pWord.toLowerCase() + "-PL");
featuresC.add(nWord.toLowerCase() + "-NL");
}
if (flags.useUnknown) {
// for true casing
featuresC.add(c.get(CoreAnnotations.UnknownAnnotation.class) + "-UNKNOWN");
featuresC.add(p.get(CoreAnnotations.UnknownAnnotation.class) + "-PUNKNOWN");
featuresC.add(n.get(CoreAnnotations.UnknownAnnotation.class) + "-NUNKNOWN");
}
if (flags.useLemmas) {
String lem = c.getString(CoreAnnotations.LemmaAnnotation.class);
if (!"".equals(lem)) {
featuresC.add(lem + "-LEM");
}
}
if (flags.usePrevNextLemmas) {
String plem = p.getString(CoreAnnotations.LemmaAnnotation.class);
String nlem = n.getString(CoreAnnotations.LemmaAnnotation.class);
if (!"".equals(plem)) {
featuresC.add(plem + "-PLEM");
}
if (!"".equals(nlem)) {
featuresC.add(nlem + "-NLEM");
}
}
if (flags.checkNameList) {
try {
if (lastNames == null) {
lastNames = Generics.newHashSet();
for (String line : ObjectBank.getLineIterator(flags.lastNameList)) {
String[] cols = line.split("\\s+");
lastNames.add(cols[0]);
}
}
if (maleNames == null) {
maleNames = Generics.newHashSet();
for (String line : ObjectBank.getLineIterator(flags.maleNameList)) {
String[] cols = line.split("\\s+");
maleNames.add(cols[0]);
}
}
if (femaleNames == null) {
femaleNames = Generics.newHashSet();
for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) {
String[] cols = line.split("\\s+");
femaleNames.add(cols[0]);
}
}
String name = cWord.toUpperCase();
if (lastNames.contains(name)) {
featuresC.add("LAST_NAME");
}
if (maleNames.contains(name)) {
featuresC.add("MALE_NAME");
}
if (femaleNames.contains(name)) {
featuresC.add("FEMALE_NAME");
}
} catch (Exception e) {
throw new RuntimeException(e);
}
}
if (flags.binnedLengths != null) {
int len = cWord.length();
String featureName = null;
for (int i = 0; i <= flags.binnedLengths.length; i++) {
if (i == flags.binnedLengths.length) {
featureName = "Len-" + flags.binnedLengths[flags.binnedLengths.length - 1] + "-Inf";
} else if (len <= flags.binnedLengths[i]) {
featureName = "Len-" + ((i == 0) ? 1 : flags.binnedLengths[i - 1]) + '-' + flags.binnedLengths[i];
break;
}
}
featuresC.add(featureName);
}
if (flags.useABGENE) {
featuresC.add(c.get(CoreAnnotations.AbgeneAnnotation.class) + "-ABGENE");
featuresC.add(p.get(CoreAnnotations.AbgeneAnnotation.class) + "-PABGENE");
featuresC.add(n.get(CoreAnnotations.AbgeneAnnotation.class) + "-NABGENE");
}
if (flags.useABSTRFreqDict) {
featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
}
if (flags.useABSTR) {
featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT");
featuresC.add(p.get(CoreAnnotations.AbstrAnnotation.class) + "-PABSTRACT");
featuresC.add(n.get(CoreAnnotations.AbstrAnnotation.class) + "-NABSTRACT");
}
if (flags.useGENIA) {
featuresC.add(c.get(CoreAnnotations.GeniaAnnotation.class) + "-GENIA");
featuresC.add(p.get(CoreAnnotations.GeniaAnnotation.class) + "-PGENIA");
featuresC.add(n.get(CoreAnnotations.GeniaAnnotation.class) + "-NGENIA");
}
if (flags.useWEBFreqDict) {
featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
}
if (flags.useWEB) {
featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB");
featuresC.add(p.get(CoreAnnotations.WebAnnotation.class) + "-PWEB");
featuresC.add(n.get(CoreAnnotations.WebAnnotation.class) + "-NWEB");
}
if (flags.useIsURL) {
featuresC.add(c.get(CoreAnnotations.IsURLAnnotation.class) + "-ISURL");
}
if (flags.useEntityRule) {
featuresC.add(c.get(CoreAnnotations.EntityRuleAnnotation.class) + "-ENTITYRULE");
}
if (flags.useEntityTypes) {
featuresC.add(c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ENTITYTYPE");
}
if (flags.useIsDateRange) {
featuresC.add(c.get(CoreAnnotations.IsDateRangeAnnotation.class) + "-ISDATERANGE");
}
if (flags.useABSTRFreq) {
featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ");
}
if (flags.useFREQ) {
featuresC.add(c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ");
}
if (flags.useMoreTags) {
featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + cWord + "-PTAG-CWORD");
}
if (flags.usePosition) {
featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + "-POSITION");
}
if (flags.useBeginSent) {
String pos = c.get(CoreAnnotations.PositionAnnotation.class);
if ("0".equals(pos)) {
featuresC.add("BEGIN-SENT");
featuresC.add(cShape + "-BEGIN-SENT");
} else if (Integer.toString(cInfo.size() - 1).equals(pos)) {
featuresC.add("END-SENT");
featuresC.add(cShape + "-END-SENT");
} else {
featuresC.add("IN-SENT");
featuresC.add(cShape + "-IN-SENT");
}
}
if (flags.useTags) {
featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
}
if (flags.useOrdinal) {
if (isOrdinal(cInfo, loc)) {
featuresC.add("C_ORDINAL");
if (isOrdinal(cInfo, loc - 1)) {
//log.info(getWord(p) + " ");
featuresC.add("PC_ORDINAL");
}
//log.info(cWord);
}
if (isOrdinal(cInfo, loc - 1)) {
featuresC.add("P_ORDINAL");
}
}
if (flags.usePrev) {
featuresC.add(pWord + "-PW");
if (flags.useTags) {
featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PTAG");
}
if (flags.useDistSim) {
featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + "-PDISTSIM");
}
if (flags.useIsURL) {
featuresC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + "-PISURL");
}
if (flags.useEntityTypes) {
featuresC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + "-PENTITYTYPE");
}
}
if (flags.useNext) {
featuresC.add(nWord + "-NW");
if (flags.useTags) {
featuresC.add(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-NTAG");
}
if (flags.useDistSim) {
featuresC.add(n.get(CoreAnnotations.DistSimAnnotation.class) + "-NDISTSIM");
}
if (flags.useIsURL) {
featuresC.add(n.get(CoreAnnotations.IsURLAnnotation.class) + "-NISURL");
}
if (flags.useEntityTypes) {
featuresC.add(n.get(CoreAnnotations.EntityTypeAnnotation.class) + "-NENTITYTYPE");
}
}
if (flags.useEitherSideWord) {
featuresC.add(pWord + "-EW");
featuresC.add(nWord + "-EW");
}
if (flags.useWordPairs) {
featuresC.add(cWord + '-' + pWord + "-W-PW");
featuresC.add(cWord + '-' + nWord + "-W-NW");
}
if (flags.useSymTags) {
if (flags.useTags) {
featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCNTAGS");
featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-CNTAGS");
featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCTAGS");
}
if (flags.useDistSim) {
featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-PCNDISTSIM");
featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-CNDISTSIM");
featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-PCDISTSIM");
}
}
if (flags.useSymWordPairs) {
featuresC.add(pWord + '-' + nWord + "-SWORDS");
}
String pGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? p.get(CoreAnnotations.GazAnnotation.class) : null;
String nGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? n.get(CoreAnnotations.GazAnnotation.class) : null;
String cGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? c.get(CoreAnnotations.GazAnnotation.class) : null;
if (flags.useGazFeatures) {
if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
featuresC.add(cGazAnnotation + "-GAZ");
}
// n
if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
featuresC.add(nGazAnnotation + "-NGAZ");
}
// p
if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
featuresC.add(pGazAnnotation + "-PGAZ");
}
}
if (flags.useMoreGazFeatures) {
if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
featuresC.add(cGazAnnotation + '-' + cWord + "-CG-CW-GAZ");
// c-n
if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
featuresC.add(cGazAnnotation + '-' + nGazAnnotation + "-CNGAZ");
}
// p-c
if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
featuresC.add(pGazAnnotation + '-' + cGazAnnotation + "-PCGAZ");
}
}
}
if (flags.useAbbr || flags.useMinimalAbbr) {
featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR");
}
if (flags.useAbbr1 || flags.useMinimalAbbr1) {
if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR");
}
}
if (flags.useAbbr) {
featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR");
featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR");
featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR");
}
if (flags.useAbbr1) {
if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR");
featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR");
featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR");
}
}
if (flags.useChunks) {
featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + "-PCCHUNK");
featuresC.add(c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-CNCHUNK");
featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK");
}
if (flags.useMinimalAbbr) {
featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB");
}
if (flags.useMinimalAbbr1) {
if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB");
}
}
String prevVB = "", nextVB = "";
if (flags.usePrevVB) {
for (int j = loc - 1; ; j--) {
CoreLabel wi = cInfo.get(j);
if (wi == cInfo.getPad()) {
prevVB = "X";
featuresC.add("X-PVB");
break;
} else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
featuresC.add(getWord(wi) + "-PVB");
prevVB = getWord(wi);
break;
}
}
}
if (flags.useNextVB) {
for (int j = loc + 1; ; j++) {
CoreLabel wi = cInfo.get(j);
if (wi == cInfo.getPad()) {
featuresC.add("X-NVB");
nextVB = "X";
break;
} else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
featuresC.add(getWord(wi) + "-NVB");
nextVB = getWord(wi);
break;
}
}
}
if (flags.useVB) {
featuresC.add(prevVB + '-' + nextVB + "-PNVB");
}
if (flags.useShapeConjunctions) {
featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + cShape + "-POS-SH");
if (flags.useTags) {
featuresC.add(c.tag() + cShape + "-TAG-SH");
}
if (flags.useDistSim) {
featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + cShape + "-DISTSIM-SH");
}
}
if (flags.useWordTag) {
featuresC.add(cWord + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-T");
featuresC.add(cWord + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-PT");
featuresC.add(cWord + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-NT");
}
if (flags.useNPHead) {
// TODO: neat idea, but this would need to be set somewhere.
// Probably should have its own annotation as this one would
// be more narrow and would clobber other potential uses
featuresC.add(c.get(CoreAnnotations.HeadWordStringAnnotation.class) + "-HW");
if (flags.useTags) {
featuresC.add(c.get(CoreAnnotations.HeadWordStringAnnotation.class) + "-" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-HW-T");
}
if (flags.useDistSim) {
featuresC.add(c.get(CoreAnnotations.HeadWordStringAnnotation.class) + "-" + c.get(CoreAnnotations.DistSimAnnotation.class) + "-HW-DISTSIM");
}
}
if (flags.useNPGovernor) {
featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + "-GW");
if (flags.useTags) {
featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-GW-T");
}
if (flags.useDistSim) {
featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM-T1");
}
}
if (flags.useHeadGov) {
// TODO: neat idea, but this would need to be set somewhere.
// Probably should have its own annotation as this one would
// be more narrow and would clobber other potential uses
featuresC.add(c.get(CoreAnnotations.HeadWordStringAnnotation.class) + "-" + c.get(CoreAnnotations.GovernorAnnotation.class) + "-HW_GW");
}
if (flags.useClassFeature) {
featuresC.add("###");
}
if (flags.useFirstWord) {
String firstWord = getWord(cInfo.get(0));
featuresC.add(firstWord);
}
if (flags.useNGrams) {
Collection<String> subs = null;
if (flags.cacheNGrams) {
subs = wordToSubstrings.get(cWord);
}
if (subs == null) {
subs = new ArrayList<>();
String word = '<' + cWord + '>';
if (flags.lowercaseNGrams) {
word = word.toLowerCase();
}
if (flags.dehyphenateNGrams) {
word = dehyphenate(word);
}
if (flags.greekifyNGrams) {
word = greekify(word);
}
// hoist flags.noMidNGrams so only linear in word length for that case
if (flags.noMidNGrams) {
int max = flags.maxNGramLeng >= 0 ? Math.min(flags.maxNGramLeng, word.length()) : word.length();
for (int j = 2; j <= max; j++) {
subs.add(intern('#' + word.substring(0, j) + '#'));
}
int start = flags.maxNGramLeng >= 0 ? Math.max(0, word.length() - flags.maxNGramLeng) : 0;
int lenM1 = word.length() - 1;
for (int i = start; i < lenM1; i++) {
subs.add(intern('#' + word.substring(i) + '#'));
}
} else {
for (int i = 0; i < word.length(); i++) {
for (int j = i + 2, max = Math.min(word.length(), i + flags.maxNGramLeng); j <= max; j++) {
if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
continue;
}
subs.add(intern('#' + word.substring(i, j) + '#'));
}
}
}
if (flags.cacheNGrams) {
wordToSubstrings.put(cWord, subs);
}
}
featuresC.addAll(subs);
if (flags.conjoinShapeNGrams) {
for (String str : subs) {
String feat = str + '-' + cShape + "-CNGram-CS";
featuresC.add(feat);
}
}
}
if (flags.useGazettes) {
if (flags.sloppyGazette) {
Collection<String> entries = wordToGazetteEntries.get(cWord);
if (entries != null) {
featuresC.addAll(entries);
}
}
if (flags.cleanGazette) {
Collection<GazetteInfo> infos = wordToGazetteInfos.get(cWord);
if (infos != null) {
for (GazetteInfo gInfo : infos) {
boolean ok = true;
for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) {
ok &= gInfo.words[gLoc].equals(getWord(cInfo.get(loc + gLoc - gInfo.loc)));
}
if (ok) {
featuresC.add(gInfo.feature);
}
}
}
}
}
if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
featuresC.add(cShape + "-TYPE");
if (flags.useTypeSeqs) {
featuresC.add(pShape + "-PTYPE");
featuresC.add(nShape + "-NTYPE");
featuresC.add(pWord + "..." + cShape + "-PW_CTYPE");
featuresC.add(cShape + "..." + nWord + "-NW_CTYPE");
featuresC.add(pShape + "..." + cShape + "-PCTYPE");
featuresC.add(cShape + "..." + nShape + "-CNTYPE");
featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE");
}
}
if (flags.useLastRealWord) {
if (pWord.length() <= 3) {
// extending this to check for 2 short words doesn't seem to help....
featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE");
}
}
if (flags.useNextRealWord) {
if (nWord.length() <= 3) {
// extending this to check for 2 short words doesn't seem to help....
featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE");
}
}
if (flags.useOccurrencePatterns) {
featuresC.addAll(occurrencePatterns(cInfo, loc));
}
if (flags.useDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
CoreLabel dn = cInfo.get(loc + i);
CoreLabel dp = cInfo.get(loc - i);
featuresC.add(getWord(dn) + "-DISJN");
if (flags.useDisjunctiveShapeInteraction) {
featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS");
}
featuresC.add(getWord(dp) + "-DISJP");
if (flags.useDisjunctiveShapeInteraction) {
featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS");
}
}
}
if (flags.useUndirectedDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
CoreLabel dn = cInfo.get(loc + i);
CoreLabel dp = cInfo.get(loc - i);
featuresC.add(getWord(dn) + "-DISJ");
featuresC.add(getWord(dp) + "-DISJ");
}
}
if (flags.useWideDisjunctive) {
for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN");
featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP");
}
}
if (flags.useEitherSideDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWE");
featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWE");
}
}
if (flags.useDisjShape) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE");
// featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE");
featuresC.add(cShape + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE");
// featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE");
}
}
if (flags.useExtraTaggySequences) {
if (flags.useTags) {
featuresC.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS");
featuresC.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTTS");
}
if (flags.useDistSim) {
featuresC.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1");
featuresC.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTTS1");
}
}
if (flags.useMUCFeatures) {
featuresC.add(c.get(CoreAnnotations.SectionAnnotation.class) + "-SECTION");
featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + "-WORD_POSITION");
featuresC.add(c.get(CoreAnnotations.SentencePositionAnnotation.class) + "-SENT_POSITION");
featuresC.add(c.get(CoreAnnotations.ParaPositionAnnotation.class) + "-PARA_POSITION");
featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-WORD_POSITION_SHAPE");
}
} else if (flags.useInternal) {
if (flags.useWord) {
featuresC.add(cWord + "-WORD");
}
if (flags.useNGrams) {
Collection<String> subs = wordToSubstrings.get(cWord);
if (subs == null) {
subs = new ArrayList<>();
String word = '<' + cWord + '>';
if (flags.lowercaseNGrams) {
word = word.toLowerCase();
}
if (flags.dehyphenateNGrams) {
word = dehyphenate(word);
}
if (flags.greekifyNGrams) {
word = greekify(word);
}
for (int i = 0; i < word.length(); i++) {
for (int j = i + 2; j <= word.length(); j++) {
if (flags.noMidNGrams && i != 0 && j != word.length()) {
continue;
}
if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
continue;
}
//subs.add(intern("#" + word.substring(i, j) + "#"));
subs.add(intern('#' + word.substring(i, j) + '#'));
}
}
if (flags.cacheNGrams) {
wordToSubstrings.put(cWord, subs);
}
}
featuresC.addAll(subs);
if (flags.conjoinShapeNGrams) {
String shape = c.get(CoreAnnotations.ShapeAnnotation.class);
for (String str : subs) {
String feat = str + '-' + shape + "-CNGram-CS";
featuresC.add(feat);
}
}
}
if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
featuresC.add(cShape + "-TYPE");
}
if (flags.useOccurrencePatterns) {
featuresC.addAll(occurrencePatterns(cInfo, loc));
}
} else if (flags.useExternal) {
if (flags.usePrev) {
featuresC.add(pWord + "-PW");
}
if (flags.useNext) {
featuresC.add(nWord + "-NW");
}
if (flags.useWordPairs) {
featuresC.add(cWord + '-' + pWord + "-W-PW");
featuresC.add(cWord + '-' + nWord + "-W-NW");
}
if (flags.useSymWordPairs) {
featuresC.add(pWord + '-' + nWord + "-SWORDS");
}
if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
if (flags.useTypeSeqs) {
featuresC.add(pShape + "-PTYPE");
featuresC.add(nShape + "-NTYPE");
featuresC.add(pWord + "..." + cShape + "-PW_CTYPE");
featuresC.add(cShape + "..." + nWord + "-NW_CTYPE");
// this one just isn't useful, at least given c,pc,s,ps. Might be useful 0th-order
if (flags.maxLeft > 0)
featuresC.add(pShape + "..." + cShape + "-PCTYPE");
featuresC.add(cShape + "..." + nShape + "-CNTYPE");
featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE");
}
}
if (flags.useLastRealWord) {
if (pWord.length() <= 3) {
featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE");
}
}
if (flags.useNextRealWord) {
if (nWord.length() <= 3) {
featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE");
}
}
if (flags.useDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
CoreLabel dn = cInfo.get(loc + i);
CoreLabel dp = cInfo.get(loc - i);
featuresC.add(getWord(dn) + "-DISJN");
if (flags.useDisjunctiveShapeInteraction) {
featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS");
}
featuresC.add(getWord(dp) + "-DISJP");
if (flags.useDisjunctiveShapeInteraction) {
featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS");
}
}
}
if (flags.useWideDisjunctive) {
for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN");
featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP");
}
}
if (flags.useDisjShape) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE");
// featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE");
featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE");
// featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE");
}
}
}
// Stuff to add binary features from the additional columns
if (flags.twoStage) {
featuresC.add(c.get(Bin1Annotation.class) + "-BIN1");
featuresC.add(c.get(Bin2Annotation.class) + "-BIN2");
featuresC.add(c.get(Bin3Annotation.class) + "-BIN3");
featuresC.add(c.get(Bin4Annotation.class) + "-BIN4");
featuresC.add(c.get(Bin5Annotation.class) + "-BIN5");
featuresC.add(c.get(Bin6Annotation.class) + "-BIN6");
}
if (flags.useIfInteger) {
try {
int val = Integer.parseInt(cWord);
if (val > 0)
featuresC.add("POSITIVE_INTEGER");
else if (val < 0)
featuresC.add("NEGATIVE_INTEGER");
// log.info("FOUND INTEGER");
} catch (NumberFormatException e) {
// not an integer value, nothing to do
}
}
//Stuff to add arbitrary features
if (flags.useGenericFeatures) {
//see if we need to cache the keys
if (genericAnnotationKeys == null) {
makeGenericKeyCache(c);
}
//now look through the cached keys
for (Class key : genericAnnotationKeys) {
//log.info("Adding feature: " + CoreLabel.genericValues.get(key) + " with value " + c.get(key));
if (c.get(key) != null && c.get(key) instanceof Collection) {
for (Object ob : (Collection) c.get(key)) {
featuresC.add(ob + "-" + CoreLabel.genericValues.get(key));
}
} else {
featuresC.add(c.get(key) + "-" + CoreLabel.genericValues.get(key));
}
}
}
if (flags.useTopics) {
//featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + cWord + "--CWORD");
featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + "-TopicID");
featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + "-PTopicID");
featuresC.add(n.get(CoreAnnotations.TopicAnnotation.class) + "-NTopicID");
//featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-PCNTopicID");
//featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-CNTopicID");
//featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + "-PCTopicID");
//featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + cShape + "-TopicID-SH");
//asdasd
}
// NER tag annotations from a previous NER system
if (c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) != null) {
featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CStackedNERTag");
featuresC.add(cWord + "-" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCStackedNERTag");
if (flags.useNext) {
featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CNStackedNERTag");
featuresC.add(cWord + "-" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCNStackedNERTag");
if (flags.usePrev) {
featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCNStackedNERTag");
featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + cWord + " -" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PWCNStackedNERTag");
}
}
if (flags.usePrev) {
featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCStackedNERTag");
}
}
if (flags.useWordnetFeatures)
featuresC.add(c.get(CoreAnnotations.WordnetSynAnnotation.class) + "-WordnetSyn");
if (flags.useProtoFeatures)
featuresC.add(c.get(CoreAnnotations.ProtoAnnotation.class) + "-Proto");
if (flags.usePhraseWordTags)
featuresC.add(c.get(CoreAnnotations.PhraseWordsTagAnnotation.class) + "-PhraseTag");
if (flags.usePhraseWords) {
for (String w : c.get(CoreAnnotations.PhraseWordsAnnotation.class)) featuresC.add(w + "-PhraseWord");
}
if (flags.useCommonWordsFeature)
featuresC.add(c.get(CoreAnnotations.CommonWordsAnnotation.class));
if (flags.useRadical && cWord.length() > 0) {
// todo [cdm 2016]: Really all stuff in this file should be fixed to work with codepoints outside BMP
if (cWord.length() == 1) {
featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-SINGLE-CHAR-RADICAL");
} else {
featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-START-RADICAL");
featuresC.add(RadicalMap.getRadical(cWord.charAt(cWord.length() - 1)) + "-END-RADICAL");
}
for (int i = 0; i < cWord.length(); ++i) {
featuresC.add(RadicalMap.getRadical(cWord.charAt(i)) + "-RADICAL");
}
}
if (flags.splitWordRegex != null && !flags.splitWordRegex.isEmpty()) {
String[] ws = c.word().split(flags.splitWordRegex);
for (String s : ws) {
featuresC.add(s + "-SPLITWORD");
}
}
if (flags.useMoreNeighborNGrams) {
int maxLen = pWord.length();
if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) {
maxLen = flags.maxNGramLeng;
}
for (int len = 1; len <= maxLen; ++len) {
featuresC.add(pWord.substring(0, len) + "-PREV-PREFIX");
}
for (int pos = pWord.length() - maxLen; pos < pWord.length(); ++pos) {
featuresC.add(pWord.substring(pos, pWord.length()) + "-PREV-SUFFIX");
}
maxLen = nWord.length();
if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) {
maxLen = flags.maxNGramLeng;
}
for (int len = 1; len <= maxLen; ++len) {
featuresC.add(nWord.substring(0, len) + "-NEXT-PREFIX");
}
for (int pos = nWord.length() - maxLen; pos < nWord.length(); ++pos) {
featuresC.add(nWord.substring(pos, nWord.length()) + "-NEXT-SUFFIX");
}
}
return featuresC;
}
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
Class NERFeatureFactory, method featuresCpCnC.
/**
 * Collects the features that are emitted only when the previous-token and
 * next-token flags and all three sequence flags are switched on.
 *
 * @param cInfo the padded token list to read from
 * @param loc   index of the token whose features are requested
 * @return a new collection holding {@code "PNSEQ"} and the word-specific
 *         {@code "-PNSEQW"} feature, or an empty collection when any of the
 *         required flags is off
 */
protected Collection<String> featuresCpCnC(PaddedList<IN> cInfo, int loc) {
  CoreLabel current = cInfo.get(loc);
  Collection<String> features = new ArrayList<>();

  // Both neighbours must be in play before any feature is produced.
  if (!flags.useNext || !flags.usePrev) {
    return features;
  }
  // All three sequence switches are required as well.
  if (!flags.useSequences || !flags.usePrevSequences || !flags.useNextSequences) {
    return features;
  }

  features.add("PNSEQ");
  features.add(getWord(current) + "-PNSEQW");
  return features;
}
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
Class SingletonPredictor, method generateFeatureVectors.
/**
 * Builds the training dataset by reading documents from a
 * {@link CoNLLMentionExtractor} configured with {@code props}.
 *
 * <p>Mentions in gold coreference clusters are added with label {@code "1"};
 * predicted mentions whose head word is not the head of any gold mention are
 * added with label {@code "0"}. In both cases mentions whose head tag starts
 * with {@code "V"} or whose head word has no node in the mention's dependency
 * graph are left out.
 *
 * @param props configuration handed to the dictionaries and the mention extractor
 * @return Dataset of feature vectors
 * @throws Exception propagated from the dictionary / extractor setup or from document reading
 */
private static GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception {
  GeneralDataset<String, String> dataset = new Dataset<>();
  Dictionaries dict = new Dictionaries(props);
  MentionExtractor extractor = new CoNLLMentionExtractor(dict, props, new Semantics(dict));

  for (Document doc = extractor.nextDoc(); doc != null; doc = extractor.nextDoc()) {
    setTokenIndices(doc);
    doc.extractGoldCorefClusters();

    // Positive examples (label 1): every usable mention of every gold cluster.
    for (CorefCluster cluster : doc.goldCorefClusters.values()) {
      for (Mention mention : cluster.getCorefMentions()) {
        // Verbal mentions are ignored; the tag test runs before the graph lookup.
        boolean verbal = mention.headWord.tag().startsWith("V");
        if (!verbal && mention.dependency.getNodeByIndexSafe(mention.headWord.index()) != null) {
          dataset.add(new BasicDatum<>(mention.getSingletonFeatures(dict), "1"));
        }
      }
    }

    // Head words of all gold mentions, used to filter the predicted mentions below.
    ArrayList<CoreLabel> goldHeads = new ArrayList<>();
    for (Mention goldMention : doc.allGoldMentions.values()) {
      goldHeads.add(goldMention.headWord);
    }

    // Negative examples (label 0): predicted mentions that are not gold mentions.
    for (Mention predicted : doc.allPredictedMentions.values()) {
      CoreLabel headWord = predicted.headWord;
      boolean headInGraph = predicted.dependency.getNodeByIndexSafe(headWord.index()) != null;
      // Short-circuit order: graph lookup, then verbal check, then gold membership.
      if (headInGraph && !headWord.tag().startsWith("V") && !goldHeads.contains(headWord)) {
        dataset.add(new BasicDatum<>(predicted.getSingletonFeatures(dict), "0"));
      }
    }
  }

  dataset.summaryStatistics();
  return dataset;
}
Aggregations