Use of edu.stanford.nlp.sequences.SeqClassifierFlags in project CoreNLP by stanfordnlp.
From the class NERBenchmarkSlowITest, method evalConll.
/**
 * The main engine that does the heavy lifting for evaluating a dataset. We are performing
 * 4-way classification on: ORG, PER, LOC, MISC.
 *
 * @param dataset Dataset prefix to evaluate. Should be one of "train", "dev", "test"
 * @return scores (F1 etc.) computed for the given dataset by the model, keyed by metric name
 * @throws IOException if the input file cannot be read or the temp results file cannot be written
 */
// NOTE that CoNLL tests assume a 4-class classification scheme: ORG, PER, LOC, MISC
public HashMap<String, Double> evalConll(String dataset) throws IOException {
    SeqClassifierFlags flags = new SeqClassifierFlags();
    // Strip IOB-style prefixes so gold labels are plain ORG/PER/LOC/MISC.
    flags.entitySubclassification = "noprefix";
    CoNLLDocumentReaderAndWriter rw = new CoNLLDocumentReaderAndWriter();
    rw.init(flags);
    String inputFile;
    File resultsFile;
    switch (dataset) {
        case "train":
            // NOTE(review): "train" currently evaluates the dev file — confirm whether this is
            // intentional (e.g. the train split is too large for an ITest) or should use CONLL_TRAIN.
            resultsFile = File.createTempFile("conlldev", null);
            inputFile = CONLL_DEV;
            break;
        case "dev":
            resultsFile = File.createTempFile("conlldev", null);
            inputFile = CONLL_DEV;
            break;
        case "test":
            resultsFile = File.createTempFile("conlltest", null);
            inputFile = CONLL_TEST;
            break;
        default:
            throw new RuntimeException("Not a valid dataset name provided!");
    }
    resultsFile.deleteOnExit();
    // try-with-resources ensures the writer is closed (and its output flushed to disk)
    // before the eval script reads the file, even if annotation throws.
    try (PrintWriter writer = new PrintWriter(resultsFile)) {
        for (Iterator<List<CoreLabel>> itr = rw.getIterator(IOUtils.readerFromString(inputFile)); itr.hasNext(); ) {
            List<CoreLabel> goldLabels = itr.next();
            // Rebuild the raw document text from the gold tokens. StringBuilder avoids the
            // O(n^2) cost of repeated String concatenation over a whole document.
            StringBuilder docBuilder = new StringBuilder();
            for (CoreLabel f1 : goldLabels) {
                docBuilder.append(' ').append(f1.word());
            }
            Annotation docAnnotation = new Annotation(docBuilder.toString());
            conllNERAnnotationPipeline.annotate(docAnnotation);
            List<CoreLabel> predictLabels = new ArrayList<CoreLabel>();
            for (CoreLabel l : docAnnotation.get(TokensAnnotation.class)) {
                predictLabels.add(l);
            }
            assertEquals("# gold outputs not same as # predicted!\n", goldLabels.size(), predictLabels.size());
            int numLabels = goldLabels.size();
            // Write one "word _ gold predicted" line per token for the CoNLL eval script.
            for (int i = 0; i < numLabels; i++) {
                CoreLabel gold = goldLabels.get(i);
                String goldToken;
                // TODO(meric): What is difference between GoldAnswer and Answer annotation?
                goldToken = gold.get(AnswerAnnotation.class);
                CoreLabel predict = predictLabels.get(i);
                String predictStr = predict.get(NamedEntityTagAnnotation.class);
                String predictPrefix = convert(predictStr);
                assertEquals("Gold and Predict words don't match!\n", gold.get(TextAnnotation.class), predict.get(TextAnnotation.class));
                writer.println(gold.get(TextAnnotation.class) + "\t" + "_" + "\t" + goldToken + "\t" + predictPrefix);
            }
        }
    }
    // Run CoNLL eval script and extract F1 score
    String result = runEvalScript(resultsFile);
    HashMap<String, Double> parsedF1 = parseResults(result);
    return parsedF1;
}
Use of edu.stanford.nlp.sequences.SeqClassifierFlags in project CoreNLP by stanfordnlp.
From the class NERFeatureFactoryITest, method testSloppyGazette.
/**
 * Checks that sloppy-gazette features fire for the expected tokens of a sample
 * sentence, using the small gazette file under data/edu/stanford/nlp/ie/.
 */
@Test
public void testSloppyGazette() {
    String[] tokens = "For three years , John Bauer has worked at Stanford .".split(" +");
    List<CoreLabel> sentence = SentenceUtils.toCoreLabelList(tokens);
    PaddedList<CoreLabel> padded = new PaddedList<>(sentence, new CoreLabel());

    // Build a feature factory with sloppy gazette matching enabled.
    Properties props = new Properties();
    props.setProperty("useGazettes", "true");
    props.setProperty("sloppyGazette", "true");
    props.setProperty("gazette", "data/edu/stanford/nlp/ie/test_gazette.txt");
    NERFeatureFactory<CoreLabel> factory = new NERFeatureFactory<>();
    factory.init(new SeqClassifierFlags(props));

    Set<String> features = new HashSet<>();
    NERFeatureFactory.FeatureCollector collector = new NERFeatureFactory.FeatureCollector(features);

    // Position 4 ("John"): gazette hits plus the word feature.
    factory.featuresC(padded, 4, collector);
    checkFeatures(features, "FOO-GAZ|C", "BAR-GAZ|C", "John-WORD|C", "FOO-GAZ1|C", "BAR-GAZ2|C", "BAZ-GAZ2|C", "BAZ-GAZ|C");
    features.clear();

    // Position 5 ("Bauer"): second token of the multi-word gazette entries.
    factory.featuresC(padded, 5, collector);
    checkFeatures(features, "BAR-GAZ|C", "BAZ-GAZ|C", "BAR-GAZ2|C", "BAZ-GAZ2|C", "Bauer-WORD|C");
    features.clear();

    // Position 6 ("has"): no gazette match, only the word feature.
    factory.featuresC(padded, 6, collector);
    checkFeatures(features, "has-WORD|C");
    features.clear();
}
Use of edu.stanford.nlp.sequences.SeqClassifierFlags in project CoreNLP by stanfordnlp.
From the class NERClassifierCombiner, method main.
/**
 * The main method. Builds (or loads) an NERClassifierCombiner from command-line
 * properties, then optionally serializes it and/or runs it over text and test files.
 *
 * Recognized properties: loadClassifier, serializeTo, textFile, textFiles,
 * testFile, testFiles, crfToExamine, showNCCInfo, readStdin.
 *
 * @param args command-line arguments, parsed into Properties
 * @throws Exception if classifier loading, serialization, or classification fails
 */
public static void main(String[] args) throws Exception {
    StringUtils.logInvocationString(log, args);
    Properties props = StringUtils.argsToProperties(args);
    // false for print probs as printed in next code block
    SeqClassifierFlags flags = new SeqClassifierFlags(props, false);
    String loadPath = props.getProperty("loadClassifier");
    NERClassifierCombiner ncc;
    if (loadPath != null) {
        // note that when loading a serialized classifier, the philosophy is override
        // any settings in props with those given in the commandline
        // so if you dumped it with useSUTime = false, and you say -useSUTime at
        // the commandline, the commandline takes precedence
        ncc = getClassifier(loadPath, props);
    } else {
        // pass null for passDownProperties to let all props go through
        ncc = createNERClassifierCombiner("ner", null, props);
    }
    // write the NERClassifierCombiner to the given path on disk
    String serializeTo = props.getProperty("serializeTo");
    if (serializeTo != null) {
        ncc.serializeClassifier(serializeTo);
    }
    String textFile = props.getProperty("textFile");
    if (textFile != null) {
        ncc.classifyAndWriteAnswers(textFile);
    }
    // run on multiple textFiles, based off CRFClassifier code
    String textFiles = props.getProperty("textFiles");
    if (textFiles != null) {
        List<File> files = new ArrayList<>();
        for (String filename : textFiles.split(",")) {
            files.add(new File(filename));
        }
        ncc.classifyFilesAndWriteAnswers(files);
    }
    // options for running the NERClassifierCombiner on a testFile or testFiles
    String testFile = props.getProperty("testFile");
    String testFiles = props.getProperty("testFiles");
    String crfToExamine = props.getProperty("crfToExamine");
    DocumentReaderAndWriter<CoreLabel> readerAndWriter = ncc.defaultReaderAndWriter();
    if (testFile != null || testFiles != null) {
        // check if there is not a crf specific request
        if (crfToExamine == null) {
            // in this case there is no crfToExamine
            if (testFile != null) {
                ncc.classifyAndWriteAnswers(testFile, readerAndWriter, true);
            } else {
                List<File> files = Arrays.stream(testFiles.split(",")).map(File::new).collect(Collectors.toList());
                // reuse the readerAndWriter captured above rather than building a second one
                ncc.classifyFilesAndWriteAnswers(files, readerAndWriter, true);
            }
        } else {
            ClassifierCombiner.examineCRF(ncc, crfToExamine, flags, testFile, testFiles, readerAndWriter);
        }
    }
    // option for showing info about the NERClassifierCombiner
    String showNCCInfo = props.getProperty("showNCCInfo");
    if (showNCCInfo != null) {
        showNCCInfo(ncc);
    }
    // option for reading in from stdin
    if (flags.readStdin) {
        ncc.classifyStdin();
    }
}
Use of edu.stanford.nlp.sequences.SeqClassifierFlags in project CoreNLP by stanfordnlp.
From the class AnnotatedTextReader, method parseColumnFile.
/**
 * Parses a CoNLL-style column file into sentence-level DataInstances.
 * Sentences are delimited by CoNLL boundary tokens or "-DOCSTART-" markers; each
 * resulting sentence is keyed by {@code sentIDprefix + "-" + runningIndex}.
 *
 * @param reader source of the column-formatted text
 * @param categoriesAllowed label categories of interest (currently unused here — TODO confirm caller contract)
 * @param setClassForTheseLabels optional map from label to annotation class; matching tokens get that class set
 * @param setGoldClass if true, also record each token's label as its GoldAnswerAnnotation
 * @param sentIDprefix prefix used when generating sentence ids
 * @return map from generated sentence id to the parsed DataInstance
 */
public static Map<String, DataInstance> parseColumnFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) {
    CoNLLDocumentReaderAndWriter conllreader = new CoNLLDocumentReaderAndWriter();
    Properties props = new Properties();
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    // Read plain labels (no IOB prefixes).
    flags.entitySubclassification = "noprefix";
    flags.retainEntitySubclassification = false;
    conllreader.init(flags);
    Iterator<List<CoreLabel>> dociter = conllreader.getIterator(reader);
    int num = -1;
    Map<String, DataInstance> sents = new HashMap<>();
    while (dociter.hasNext()) {
        List<CoreLabel> doc = dociter.next();
        List<String> words = new ArrayList<>();
        List<CoreLabel> sentcore = new ArrayList<>();
        int tokenindex = 0;
        for (CoreLabel l : doc) {
            if (l.word().equals(CoNLLDocumentReaderAndWriter.BOUNDARY) || l.word().equals("-DOCSTART-")) {
                // Sentence boundary: flush the accumulated tokens (if any) as one instance.
                if (!words.isEmpty()) {
                    num++;
                    String docid = sentIDprefix + "-" + num;
                    DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentcore);
                    sents.put(docid, sentInst);
                    words = new ArrayList<>();
                    sentcore = new ArrayList<>();
                    tokenindex = 0;
                }
                continue;
            }
            tokenindex++;
            words.add(l.word());
            // Token indices are 1-based within each sentence.
            l.set(CoreAnnotations.IndexAnnotation.class, tokenindex);
            l.set(CoreAnnotations.ValueAnnotation.class, l.word());
            String label = l.get(CoreAnnotations.AnswerAnnotation.class);
            assert label != null : "label cannot be null";
            l.set(CoreAnnotations.TextAnnotation.class, l.word());
            l.set(CoreAnnotations.OriginalTextAnnotation.class, l.word());
            if (setGoldClass) {
                l.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
            }
            if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label))
                l.set(setClassForTheseLabels.get(label), label);
            sentcore.add(l);
        }
        // Flush the final sentence of the document (no trailing boundary token).
        if (!words.isEmpty()) {
            num++;
            String docid = sentIDprefix + "-" + num;
            DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentcore);
            sents.put(docid, sentInst);
        }
    }
    return sents;
}
Use of edu.stanford.nlp.sequences.SeqClassifierFlags in project CoreNLP by stanfordnlp.
From the class MaxMatchSegmenter, method main.
/**
 * Command-line entry point: reads lines from stdin, segments each with the
 * lexicon-based segmenter, and prints the result to stdout via the Sighan 2005
 * reader/writer. Requires a -lexicon property; -greedy and -maxwords select
 * alternative segmentation strategies.
 *
 * @param args command-line arguments, parsed into Properties
 */
public static void main(String[] args) {
    Properties props = StringUtils.argsToProperties(args);
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    MaxMatchSegmenter seg = new MaxMatchSegmenter();
    String lexiconFile = props.getProperty("lexicon");
    if (lexiconFile != null) {
        seg.addLexicon(lexiconFile);
    } else {
        logger.error("Error: no lexicon file!");
        System.exit(1);
    }
    Sighan2005DocumentReaderAndWriter sighanRW = new Sighan2005DocumentReaderAndWriter();
    sighanRW.init(flags);
    PrintWriter stdoutW = new PrintWriter(System.out);
    int lineNb = 0;
    // try-with-resources guarantees the stdin reader is closed on every exit path.
    // NOTE(review): InputStreamReader uses the platform default charset here — confirm
    // whether input is expected to be UTF-8 and pass the charset explicitly if so.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(System.in))) {
        for (; ; ) {
            ++lineNb;
            logger.info("line: " + lineNb);
            String line = br.readLine();
            if (line == null)
                break;
            // Pick the segmentation strategy requested on the command line.
            String outputLine;
            if (props.getProperty("greedy") != null) {
                ArrayList<Word> sentence = seg.greedilySegmentWords(line);
                outputLine = SentenceUtils.listToString(sentence);
            } else if (props.getProperty("maxwords") != null) {
                seg.buildSegmentationLattice(line);
                outputLine = SentenceUtils.listToString(seg.segmentWords(MatchHeuristic.MAXWORDS));
            } else {
                seg.buildSegmentationLattice(line);
                outputLine = SentenceUtils.listToString(seg.maxMatchSegmentation());
            }
            // Round-trip the segmented line through the Sighan reader to print answers.
            StringReader strR = new StringReader(outputLine);
            Iterator<List<CoreLabel>> itr = sighanRW.getIterator(strR);
            while (itr.hasNext()) {
                sighanRW.printAnswers(itr.next(), stdoutW);
            }
        }
    } catch (IOException e) {
        // Best-effort CLI tool: stop reading on I/O failure and flush what we have.
    }
    // Flush (but do not close) the writer, since it wraps System.out.
    stdoutW.flush();
}
Aggregations