Search in sources :

Example 1 with SeqClassifierFlags

Use of edu.stanford.nlp.sequences.SeqClassifierFlags in the CoreNLP project by stanfordnlp.

From the class NERBenchmarkSlowITest, method evalConll.

/**
 * The main engine that does the heavy lifting for evaluating a dataset. We perform
 * 4-way classification on ORG, PER, LOC and MISC entities.
 * @param dataset Dataset prefix to evaluate. Should be one of "train", "dev", "test"
 * @return F1 scores computed for the given dataset by the model
 * @throws IOException if the dataset or results file cannot be read or written
 */
// NOTE that the CoNLL tests assume a 4-class classification scheme: ORG, PER, LOC, MISC
public HashMap<String, Double> evalConll(String dataset) throws IOException {
    SeqClassifierFlags flags = new SeqClassifierFlags();
    flags.entitySubclassification = "noprefix";
    CoNLLDocumentReaderAndWriter rw = new CoNLLDocumentReaderAndWriter();
    rw.init(flags);
    String inputFile;
    File resultsFile;
    switch(dataset) {
        // NOTE: "train" currently evaluates on the dev file as well
        case "train":
        case "dev":
            resultsFile = File.createTempFile("conlldev", null);
            inputFile = CONLL_DEV;
            break;
        case "test":
            resultsFile = File.createTempFile("conlltest", null);
            inputFile = CONLL_TEST;
            break;
        default:
            throw new RuntimeException("Invalid dataset name provided: " + dataset);
    }
    resultsFile.deleteOnExit();
    PrintWriter writer = new PrintWriter(resultsFile);
    for (Iterator<List<CoreLabel>> itr = rw.getIterator(IOUtils.readerFromString(inputFile)); itr.hasNext(); ) {
        List<CoreLabel> goldLabels = itr.next();
        // Rebuild the sentence text from the gold tokens
        StringBuilder docBuilder = new StringBuilder();
        for (CoreLabel f1 : goldLabels) {
            docBuilder.append(' ').append(f1.word());
        }
        String docString = docBuilder.toString();
        Annotation docAnnotation = new Annotation(docString);
        conllNERAnnotationPipeline.annotate(docAnnotation);
        List<CoreLabel> predictLabels = new ArrayList<>(docAnnotation.get(TokensAnnotation.class));
        assertEquals("# gold outputs not same as # predicted!\n", goldLabels.size(), predictLabels.size());
        int numLabels = goldLabels.size();
        // Write to output file
        for (int i = 0; i < numLabels; i++) {
            CoreLabel gold = goldLabels.get(i);
            // TODO(meric): What is difference between GoldAnswer and Answer annotation?
            String goldToken = gold.get(AnswerAnnotation.class);
            CoreLabel predict = predictLabels.get(i);
            String predictStr = predict.get(NamedEntityTagAnnotation.class);
            String predictPrefix = convert(predictStr);
            assertEquals("Gold and Predict words don't match!\n", gold.get(TextAnnotation.class), predict.get(TextAnnotation.class));
            writer.println(gold.get(TextAnnotation.class) + "\t" + "_" + "\t" + goldToken + "\t" + predictPrefix);
        }
    }
    writer.close();
    // Run CoNLL eval script and extract F1 score
    String result = runEvalScript(resultsFile);
    HashMap<String, Double> parsedF1 = parseResults(result);
    return parsedF1;
}
Also used : AnswerAnnotation(edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation) ArrayList(java.util.ArrayList) SeqClassifierFlags(edu.stanford.nlp.sequences.SeqClassifierFlags) TextAnnotation(edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation) TokensAnnotation(edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) NamedEntityTagAnnotation(edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) List(java.util.List) File(java.io.File) CoNLLDocumentReaderAndWriter(edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter) PrintWriter(java.io.PrintWriter)
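
The reader setup in this test can be distilled into a short standalone sketch. This is not part of the test itself; the CoNLL file path below is a placeholder, and the snippet only shows how a SeqClassifierFlags with entitySubclassification = "noprefix" drives CoNLLDocumentReaderAndWriter to yield sentences of gold-labelled CoreLabels, as in the loop above.

import java.util.Iterator;
import java.util.List;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
import edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;

public class ConllGoldReaderSketch {

    public static void main(String[] args) throws Exception {
        SeqClassifierFlags flags = new SeqClassifierFlags();
        // collapse prefixed tags down to plain ORG, PER, LOC, MISC, as in the test above
        flags.entitySubclassification = "noprefix";
        CoNLLDocumentReaderAndWriter rw = new CoNLLDocumentReaderAndWriter();
        rw.init(flags);
        // placeholder path to a CoNLL column-format file
        String conllFile = "/path/to/conll/eng.testa";
        for (Iterator<List<CoreLabel>> itr = rw.getIterator(IOUtils.readerFromString(conllFile)); itr.hasNext(); ) {
            List<CoreLabel> sentence = itr.next();
            for (CoreLabel token : sentence) {
                // each token carries its gold NER label in AnswerAnnotation
                System.out.println(token.word() + "\t" + token.get(AnswerAnnotation.class));
            }
        }
    }
}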

Example 2 with SeqClassifierFlags

Use of edu.stanford.nlp.sequences.SeqClassifierFlags in the CoreNLP project by stanfordnlp.

From the class NERFeatureFactoryITest, method testSloppyGazette.

@Test
public void testSloppyGazette() {
    List<CoreLabel> sentence = SentenceUtils.toCoreLabelList("For three years , John Bauer has worked at Stanford .".split(" +"));
    PaddedList<CoreLabel> paddedSentence = new PaddedList<>(sentence, new CoreLabel());
    Properties props = new Properties();
    props.setProperty("useGazettes", "true");
    props.setProperty("sloppyGazette", "true");
    props.setProperty("gazette", "data/edu/stanford/nlp/ie/test_gazette.txt");
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    NERFeatureFactory<CoreLabel> factory = new NERFeatureFactory<>();
    factory.init(flags);
    Set<String> features = new HashSet<String>();
    NERFeatureFactory.FeatureCollector collector = new NERFeatureFactory.FeatureCollector(features);
    factory.featuresC(paddedSentence, 4, collector);
    checkFeatures(features, "FOO-GAZ|C", "BAR-GAZ|C", "John-WORD|C", "FOO-GAZ1|C", "BAR-GAZ2|C", "BAZ-GAZ2|C", "BAZ-GAZ|C");
    features.clear();
    factory.featuresC(paddedSentence, 5, collector);
    checkFeatures(features, "BAR-GAZ|C", "BAZ-GAZ|C", "BAR-GAZ2|C", "BAZ-GAZ2|C", "Bauer-WORD|C");
    features.clear();
    factory.featuresC(paddedSentence, 6, collector);
    checkFeatures(features, "has-WORD|C");
    features.clear();
}
Also used : PaddedList(edu.stanford.nlp.util.PaddedList) Properties(java.util.Properties) SeqClassifierFlags(edu.stanford.nlp.sequences.SeqClassifierFlags) CoreLabel(edu.stanford.nlp.ling.CoreLabel) HashSet(java.util.HashSet) Test(org.junit.Test)
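
The checkFeatures helper used above is not included in this excerpt. A minimal sketch of what such a helper could look like, assuming it only verifies that each expected feature string was collected (the real helper may check the set more strictly), is:

import java.util.Set;
import static org.junit.Assert.assertTrue;

// Hypothetical stand-in for the checkFeatures helper referenced in the test above;
// the actual implementation is not shown in this excerpt.
private static void checkFeatures(Set<String> features, String... expected) {
    for (String feature : expected) {
        assertTrue("Missing expected feature: " + feature, features.contains(feature));
    }
}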

Example 3 with SeqClassifierFlags

Use of edu.stanford.nlp.sequences.SeqClassifierFlags in the CoreNLP project by stanfordnlp.

From the class NERClassifierCombiner, method main.

/**
 * The main method.
 */
public static void main(String[] args) throws Exception {
    StringUtils.logInvocationString(log, args);
    Properties props = StringUtils.argsToProperties(args);
    // false for print probs as printed in next code block
    SeqClassifierFlags flags = new SeqClassifierFlags(props, false);
    String loadPath = props.getProperty("loadClassifier");
    NERClassifierCombiner ncc;
    if (loadPath != null) {
        // note that when loading a serialized classifier, the philosophy is to override
        // any settings stored with the serialized classifier with those given in props
        // (i.e. on the command line): if you dumped it with useSUTime = false and you
        // pass -useSUTime on the command line, the command line takes precedence
        ncc = getClassifier(loadPath, props);
    } else {
        // pass null for passDownProperties to let all props go through
        ncc = createNERClassifierCombiner("ner", null, props);
    }
    // write the NERClassifierCombiner to the given path on disk
    String serializeTo = props.getProperty("serializeTo");
    if (serializeTo != null) {
        ncc.serializeClassifier(serializeTo);
    }
    String textFile = props.getProperty("textFile");
    if (textFile != null) {
        ncc.classifyAndWriteAnswers(textFile);
    }
    // run on multiple textFiles, based on the CRFClassifier code
    String textFiles = props.getProperty("textFiles");
    if (textFiles != null) {
        List<File> files = new ArrayList<>();
        for (String filename : textFiles.split(",")) {
            files.add(new File(filename));
        }
        ncc.classifyFilesAndWriteAnswers(files);
    }
    // options for running the NERClassifierCombiner on a testFile or testFiles
    String testFile = props.getProperty("testFile");
    String testFiles = props.getProperty("testFiles");
    String crfToExamine = props.getProperty("crfToExamine");
    DocumentReaderAndWriter<CoreLabel> readerAndWriter = ncc.defaultReaderAndWriter();
    if (testFile != null || testFiles != null) {
        // check whether a CRF-specific request was made
        if (crfToExamine == null) {
            // in this case there is no crfToExamine
            if (testFile != null) {
                ncc.classifyAndWriteAnswers(testFile, readerAndWriter, true);
            } else {
                List<File> files = Arrays.stream(testFiles.split(",")).map(File::new).collect(Collectors.toList());
                ncc.classifyFilesAndWriteAnswers(files, ncc.defaultReaderAndWriter(), true);
            }
        } else {
            ClassifierCombiner.examineCRF(ncc, crfToExamine, flags, testFile, testFiles, readerAndWriter);
        }
    }
    // option for showing info about the NERClassifierCombiner
    String showNCCInfo = props.getProperty("showNCCInfo");
    if (showNCCInfo != null) {
        showNCCInfo(ncc);
    }
    // option for reading in from stdin
    if (flags.readStdin) {
        ncc.classifyStdin();
    }
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) SeqClassifierFlags(edu.stanford.nlp.sequences.SeqClassifierFlags)
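
A minimal sketch of driving this main method, assuming a serialized classifier and a plain-text input file already exist on disk. Both paths are placeholders; the flag names simply mirror the properties read above (loadClassifier, textFile), which StringUtils.argsToProperties turns into a Properties object.

// Sketch only: both paths are placeholders, not shipped CoreNLP resources.
String[] combinerArgs = {
    "-loadClassifier", "/path/to/ner-combiner.ser.gz",
    "-textFile", "/path/to/input.txt"
};
// Each "-key value" pair becomes a property; main then loads the serialized
// combiner and writes NER answers for the text file.
NERClassifierCombiner.main(combinerArgs);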

Example 4 with SeqClassifierFlags

Use of edu.stanford.nlp.sequences.SeqClassifierFlags in the CoreNLP project by stanfordnlp.

From the class AnnotatedTextReader, method parseColumnFile.

public static Map<String, DataInstance> parseColumnFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) {
    CoNLLDocumentReaderAndWriter conllreader = new CoNLLDocumentReaderAndWriter();
    Properties props = new Properties();
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    flags.entitySubclassification = "noprefix";
    flags.retainEntitySubclassification = false;
    conllreader.init(flags);
    Iterator<List<CoreLabel>> dociter = conllreader.getIterator(reader);
    int num = -1;
    Map<String, DataInstance> sents = new HashMap<>();
    while (dociter.hasNext()) {
        List<CoreLabel> doc = dociter.next();
        List<String> words = new ArrayList<>();
        List<CoreLabel> sentcore = new ArrayList<>();
        int tokenindex = 0;
        for (CoreLabel l : doc) {
            if (l.word().equals(CoNLLDocumentReaderAndWriter.BOUNDARY) || l.word().equals("-DOCSTART-")) {
                if (words.size() > 0) {
                    num++;
                    String docid = sentIDprefix + "-" + String.valueOf(num);
                    DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentcore);
                    sents.put(docid, sentInst);
                    words = new ArrayList<>();
                    sentcore = new ArrayList<>();
                    tokenindex = 0;
                }
                continue;
            }
            tokenindex++;
            words.add(l.word());
            l.set(CoreAnnotations.IndexAnnotation.class, tokenindex);
            l.set(CoreAnnotations.ValueAnnotation.class, l.word());
            String label = l.get(CoreAnnotations.AnswerAnnotation.class);
            assert label != null : "label cannot be null";
            l.set(CoreAnnotations.TextAnnotation.class, l.word());
            l.set(CoreAnnotations.OriginalTextAnnotation.class, l.word());
            if (setGoldClass) {
                l.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
            }
            if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label))
                l.set(setClassForTheseLabels.get(label), label);
            sentcore.add(l);
        }
        if (words.size() > 0) {
            num++;
            String docid = sentIDprefix + "-" + String.valueOf(num);
            DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentcore);
            sents.put(docid, sentInst);
        }
    }
    return sents;
}
Also used : DataInstance(edu.stanford.nlp.patterns.DataInstance) SeqClassifierFlags(edu.stanford.nlp.sequences.SeqClassifierFlags) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoNLLDocumentReaderAndWriter(edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter)
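
A minimal sketch of calling parseColumnFile, assuming a CoNLL-style column file on disk. The file path, the allowed-categories set, and the sentence-ID prefix are placeholders chosen for illustration (java.io and java.util imports omitted); null for setClassForTheseLabels is handled by the method, and true for setGoldClass copies each label into GoldAnswerAnnotation as shown above.

// Sketch only: the path and the category names are placeholders.
BufferedReader reader = new BufferedReader(new FileReader("/path/to/column-file.conll"));
Set<String> categoriesAllowed = new HashSet<>(Arrays.asList("PER", "LOC", "ORG", "MISC"));
Map<String, DataInstance> sents =
    AnnotatedTextReader.parseColumnFile(reader, categoriesAllowed, null, true, "docA");
System.out.println("Parsed " + sents.size() + " sentences");
reader.close();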

Example 5 with SeqClassifierFlags

Use of edu.stanford.nlp.sequences.SeqClassifierFlags in the CoreNLP project by stanfordnlp.

From the class MaxMatchSegmenter, method main.

public static void main(String[] args) {
    Properties props = StringUtils.argsToProperties(args);
    // logger.debug(props.toString());
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    MaxMatchSegmenter seg = new MaxMatchSegmenter();
    String lexiconFile = props.getProperty("lexicon");
    if (lexiconFile != null) {
        seg.addLexicon(lexiconFile);
    } else {
        logger.error("Error: no lexicon file!");
        System.exit(1);
    }
    Sighan2005DocumentReaderAndWriter sighanRW = new Sighan2005DocumentReaderAndWriter();
    sighanRW.init(flags);
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    PrintWriter stdoutW = new PrintWriter(System.out);
    int lineNb = 0;
    for (; ; ) {
        ++lineNb;
        logger.info("line: " + lineNb);
        try {
            String line = br.readLine();
            if (line == null)
                break;
            String outputLine = null;
            if (props.getProperty("greedy") != null) {
                ArrayList<Word> sentence = seg.greedilySegmentWords(line);
                outputLine = SentenceUtils.listToString(sentence);
            } else if (props.getProperty("maxwords") != null) {
                seg.buildSegmentationLattice(line);
                outputLine = SentenceUtils.listToString(seg.segmentWords(MatchHeuristic.MAXWORDS));
            } else {
                seg.buildSegmentationLattice(line);
                outputLine = SentenceUtils.listToString(seg.maxMatchSegmentation());
            }
            StringReader strR = new StringReader(outputLine);
            Iterator<List<CoreLabel>> itr = sighanRW.getIterator(strR);
            while (itr.hasNext()) {
                sighanRW.printAnswers(itr.next(), stdoutW);
            }
        // System.out.println(outputLine);
        } catch (IOException e) {
            break;
        }
    }
    stdoutW.flush();
}
Also used : SeqClassifierFlags(edu.stanford.nlp.sequences.SeqClassifierFlags) EncodingPrintWriter(edu.stanford.nlp.io.EncodingPrintWriter)
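
The greedy path through this segmenter can also be exercised without the Sighan2005 reader/writer round trip. A minimal sketch, with a placeholder lexicon path and a placeholder input sentence, using only calls that appear in the main method above (Word and SentenceUtils imports omitted):

// Sketch only: the lexicon path and the input string are placeholders.
MaxMatchSegmenter seg = new MaxMatchSegmenter();
seg.addLexicon("/path/to/chinese-lexicon.txt");
ArrayList<Word> segmented = seg.greedilySegmentWords("这是一个测试句子");
System.out.println(SentenceUtils.listToString(segmented));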

Aggregations

SeqClassifierFlags (edu.stanford.nlp.sequences.SeqClassifierFlags): 10 uses
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 8 uses
ArrayList (java.util.ArrayList): 3 uses
List (java.util.List): 3 uses
CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 2 uses
AnswerAnnotation (edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation): 2 uses
CoNLLDocumentReaderAndWriter (edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter): 2 uses
File (java.io.File): 2 uses
PrintWriter (java.io.PrintWriter): 2 uses
HashSet (java.util.HashSet): 2 uses
Properties (java.util.Properties): 2 uses
TestDiscrete (edu.illinois.cs.cogcomp.lbjava.classify.TestDiscrete): 1 use
EncodingPrintWriter (edu.stanford.nlp.io.EncodingPrintWriter): 1 use
GoldAnswerAnnotation (edu.stanford.nlp.ling.CoreAnnotations.GoldAnswerAnnotation): 1 use
NamedEntityTagAnnotation (edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation): 1 use
TextAnnotation (edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation): 1 use
TokensAnnotation (edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation): 1 use
DataInstance (edu.stanford.nlp.patterns.DataInstance): 1 use
Annotation (edu.stanford.nlp.pipeline.Annotation): 1 use
PaddedList (edu.stanford.nlp.util.PaddedList): 1 use