Search in sources:

Example 21 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

The class PatternsSimpleThreadedITest, method runTest.

/**
 * Runs the full pattern-extraction pipeline with the given thread count against a
 * small fixed document, writing all intermediate files to a fresh temp directory.
 * On success the temp directory is deleted; on failure it is left in place (and its
 * path printed) so the failing run can be inspected.
 *
 * @param numThreads value passed through as the "numThreads" property (kept as a
 *                   String because it is written straight into Properties)
 */
void runTest(String numThreads) {
    Properties spiedProperties = new Properties();
    final Path tempPath;
    try {
        tempPath = Files.createTempDirectory(null);
        // try-with-resources: the stream was previously never closed, leaking a file handle
        try (InputStreamReader propsReader = new InputStreamReader(
                new FileInputStream(new File("data/edu/stanford/nlp/patterns/patterns_itest.properties")),
                StandardCharsets.UTF_8)) {
            spiedProperties.load(propsReader);
        }
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
    // Path.resolve is the idiomatic equivalent of Paths.get(parent.toString(), child)
    Path outputPath = tempPath.resolve("output");
    Path modelPath = tempPath.resolve("model");
    Path docsentsPath = tempPath.resolve("docsents.ser");
    System.out.println("Test " + numThreads + " writing to " + tempPath);
    spiedProperties.setProperty("seedWordsFiles", "VACCINE_PREVENTABLE_DISEASE,data/edu/stanford/nlp/patterns/VACCINE_PREVENTABLE_DISEASE.txt");
    // We generate this file below
    spiedProperties.setProperty("file", docsentsPath.toString());
    spiedProperties.setProperty("fileFormat", "ser");
    spiedProperties.setProperty("outDir", outputPath.toString());
    spiedProperties.setProperty("patternsWordsDir", modelPath.toString());
    spiedProperties.setProperty("loadSavedPatternsWordsDir", Boolean.toString(false));
    spiedProperties.setProperty("numThreads", numThreads);
    // Run the pipeline on an input document
    // Algorithm based on
    // https://github.com/stanfordnlp/CoreNLP/blob/a9a4c2d75b177790a24c0f46188810668d044cd8/src/edu/stanford/nlp/patterns/GetPatternsFromDataMultiClass.java#L702
    // useTargetParserParentRestriction is false
    final Annotation document = new Annotation("** If you survive measles without complications ** I love these . " + "Why would n't you survive without complications , Immunologist ?");
    nlpPipeline.annotate(document);
    // Convert annotation to map to serialize, similarly to the original code algorithm
    int i = 0;
    final Map<String, DataInstance> sentenceMap = new HashMap<>();
    for (final CoreMap sentence : document.get(SentencesAnnotation.class)) {
        sentenceMap.put(Integer.toString(i++), DataInstance.getNewInstance(PatternFactory.PatternType.SURFACE, sentence));
    }
    // Serialize the sentence map to the location the "file" property points at
    try (final ObjectOutputStream sentenceMapStream = new ObjectOutputStream(new FileOutputStream(docsentsPath.toString()))) {
        sentenceMapStream.writeObject(sentenceMap);
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
    try {
        GetPatternsFromDataMultiClass.<SurfacePattern>run(spiedProperties);
    } catch (Exception e) {
        // Deliberately leave tempPath in place so the failing run can be inspected
        System.out.println("Test " + numThreads + " FAILED");
        System.out.println("  Intermediate files in " + tempPath);
        throw new RuntimeException(e);
    }
    System.out.println("Cleaning up temp files from " + tempPath);
    FileSystem.deleteDir(tempPath.toFile());
}
Also used : Path(java.nio.file.Path) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) DataInstance(edu.stanford.nlp.patterns.DataInstance) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) Properties(java.util.Properties) ObjectOutputStream(java.io.ObjectOutputStream) FileInputStream(java.io.FileInputStream) SentencesAnnotation(edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) FileOutputStream(java.io.FileOutputStream) SurfacePattern(edu.stanford.nlp.patterns.surface.SurfacePattern) File(java.io.File) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 22 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

The class CoNLL2011DocumentReader, method main.

/**
 * Reads and dumps output, mainly for debugging.
 *
 * Properties: {@code -i} input path and {@code -o} output file are required;
 * optional {@code -ext} filters input files by extension and {@code -debug}
 * enables verbose per-document dumps.
 */
public static void main(String[] args) throws IOException {
    Properties props = StringUtils.argsToProperties(args);
    boolean debug = Boolean.parseBoolean(props.getProperty("debug", "false"));
    String filepath = props.getProperty("i");
    String outfile = props.getProperty("o");
    if (filepath == null || outfile == null) {
        usage();
        System.exit(-1);
    }
    logger.info("Writing to " + outfile);
    String ext = props.getProperty("ext");
    Options options;
    if (ext != null) {
        // Restrict the reader to files matching the given extension
        options = new Options(".*" + ext + "$");
    } else {
        options = new Options();
    }
    options.annotateTreeCoref = true;
    options.annotateTreeNer = true;
    CorpusStats corpusStats = new CorpusStats();
    CoNLL2011DocumentReader reader = new CoNLL2011DocumentReader(filepath, options);
    int docCnt = 0;
    int sentCnt = 0;
    int tokenCnt = 0;
    // try-with-resources guarantees the writer is closed (and flushed) even if
    // reading or writing a document throws; previously it leaked on any exception.
    try (PrintWriter fout = new PrintWriter(outfile)) {
        for (Document doc; (doc = reader.getNextDocument()) != null; ) {
            corpusStats.process(doc);
            docCnt++;
            Annotation anno = doc.getAnnotation();
            if (debug)
                System.out.println("Document " + docCnt + ": " + anno.get(CoreAnnotations.DocIDAnnotation.class));
            for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
                if (debug)
                    System.out.println("Parse: " + sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
                if (debug)
                    System.out.println("Sentence Tokens: " + StringUtils.join(sentence.get(CoreAnnotations.TokensAnnotation.class), ","));
                writeTabSep(fout, sentence, doc.corefChainMap);
                sentCnt++;
                tokenCnt += sentence.get(CoreAnnotations.TokensAnnotation.class).size();
            }
            if (debug) {
                for (CoreMap ner : doc.nerChunks) {
                    System.out.println("NER Chunk: " + ner);
                }
                for (String id : doc.corefChainMap.keySet()) {
                    System.out.println("Coref: " + id + " = " + StringUtils.join(doc.corefChainMap.get(id), ";"));
                }
            }
        }
    }
    System.out.println("Total document count: " + docCnt);
    System.out.println("Total sentence count: " + sentCnt);
    System.out.println("Total token count: " + tokenCnt);
    System.out.println(corpusStats);
}
Also used : CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations)

Example 23 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

The class RuleBasedCorefMentionFinder, method parse.

/**
 * Parses a single pre-tokenized sentence, honoring the supplied parser
 * constraints, and returns the resulting parse tree.
 */
private Tree parse(List<CoreLabel> tokens, List<ParserConstraint> constraints) {
    // Wrap the tokens in a one-sentence document the parser annotator can consume.
    CoreMap sentence = new Annotation("");
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sentence.set(ParserAnnotations.ConstraintAnnotation.class, constraints);
    List<CoreMap> sentenceList = new ArrayList<>(1);
    sentenceList.add(sentence);
    Annotation wrapper = new Annotation("");
    wrapper.set(CoreAnnotations.SentencesAnnotation.class, sentenceList);
    getParser().annotate(wrapper);
    // Re-fetch the sentence list after annotation and pull the tree off the first entry.
    CoreMap parsed = wrapper.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    return parsed.get(TreeCoreAnnotations.TreeAnnotation.class);
}
Also used : ParserAnnotations(edu.stanford.nlp.parser.common.ParserAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) Annotation(edu.stanford.nlp.pipeline.Annotation)

Example 24 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

The class PhraseTable, method splitText.

/**
 * Splits phrase text into words: via the configured tokenizer when one is
 * available, otherwise via regex-based splitting.
 */
public String[] splitText(String phraseText) {
    if (tokenizer == null) {
        // Regex fallback: separate possessive markers, then split on the delimiter pattern.
        String normalized = possPattern.matcher(phraseText).replaceAll(" 's$1");
        return delimPattern.split(normalized);
    }
    Annotation annotation = new Annotation(phraseText);
    tokenizer.annotate(annotation);
    List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    String[] words = new String[tokens.size()];
    int idx = 0;
    for (CoreLabel token : tokens) {
        words[idx++] = token.word();
    }
    return words;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) Annotation(edu.stanford.nlp.pipeline.Annotation)

Example 25 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

The class XMLToAnnotation, method readXMLFormat.

/**
 * Reads a gold-annotated XML file and aligns its quote and mention annotations
 * with a CoreNLP-annotated version of the same raw text, producing the gold
 * speaker assignment for each CoreNLP-extracted quote.
 *
 * Alignment works by walking {@code tokenIndex} through the CoreNLP token list
 * in lockstep with the XML child nodes (chapters > quotes/mentions/#text).
 *
 * @param fileName path to the XML file
 * @return a Data bundle of (gold quote info list, character list, annotated document)
 * @throws Exception from XML reading or the CoreNLP annotation pipeline
 */
public static Data readXMLFormat(String fileName) throws Exception {
    // Extract character list, gold quote speaker and mention information from the XML document.
    Document doc = XMLUtils.readDocumentFromFile(fileName);
    Node text = doc.getDocumentElement().getElementsByTagName("text").item(0);
    String docText = getJustText(text);
    // Run the CoreNLP pipeline over the plain text so quotes/tokens can be aligned below.
    Annotation document = getAnnotatedFile(docText, fileName, getProcessedCoreNLPProperties());
    List<CoreMap> quotes = document.get(CoreAnnotations.QuotationsAnnotation.class);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    List<GoldQuoteInfo> goldList = new ArrayList<>();
    Map<Integer, Mention> idToMention = new HashMap<>();
    List<Person> personList = readXMLCharacterList(doc);
    // NOTE(review): personMap appears unused in this method — confirm before removing.
    Map<String, List<Person>> personMap = QuoteAttributionUtils.readPersonMap(personList);
    List<Pair<Integer, String>> mentionIdToSpeakerList = new ArrayList<>();
    // there is at least 1 case in which the XML quote does not match up with the automatically-extracted quote. (Ex: quote by Mr. Collins that begins, "Hunsford, near Westerham, Kent, ...")
    // as the dirty solution, we treat all quotes encapsulated within an XML quote as the same speaker (although this is not 100% accurate!)
    int quoteIndex = 0;
    NodeList textElems = text.getChildNodes();
    // Cursor into the CoreNLP token list, advanced as each XML span is consumed.
    int tokenIndex = 0;
    for (int i = 0; i < textElems.getLength(); i++) {
        Node chapterNode = textElems.item(i);
        if (chapterNode.getNodeName().equals("chapter")) {
            NodeList chapElems = chapterNode.getChildNodes();
            for (int j = 0; j < chapElems.getLength(); j++) {
                Node child = chapElems.item(j);
                if (child.getNodeName().equals("quote")) {
                    // search for nested mentions
                    NodeList quoteChildren = child.getChildNodes();
                    for (int k = 0; k < quoteChildren.getLength(); k++) {
                        Node quoteChild = quoteChildren.item(k);
                        if (quoteChild.getNodeName().equals("mention")) {
                            String mentionText = quoteChild.getTextContent();
                            // substring(1) drops the id's leading prefix character — presumably a letter tag; confirm against the XML schema.
                            int id = Integer.parseInt(quoteChild.getAttributes().getNamedItem("id").getTextContent().substring(1));
                            List<Integer> connections = readConnection(quoteChild.getAttributes().getNamedItem("connection").getNodeValue());
                            int endIndex = getEndIndex(tokenIndex, tokens, mentionText);
                            // mentions.put(id, new XMLMention(quoteChild.getTextContent(), tokenIndex, endIndex, id, connections));
                            idToMention.put(id, new Mention(mentionText, tokenIndex, endIndex));
                            tokenIndex = endIndex + 1;
                        } else {
                            // Non-mention child: just advance the token cursor past its text.
                            String quoteText = quoteChild.getTextContent();
                            // collapse single newlines into spaces (double newlines preserved)
                            quoteText = quoteText.replaceAll("\n(?!\n)", " ");
                            quoteText = quoteText.replaceAll("_", "");
                            tokenIndex = getEndIndex(tokenIndex, tokens, quoteText) + 1;
                        }
                    }
                    String quoteText = child.getTextContent();
                    // tokenIndex = getEndIndex(tokenIndex, tokens, quoteText) + 1;
                    // collapse single newlines into spaces (double newlines preserved)
                    quoteText = quoteText.replaceAll("\n(?!\n)", " ");
                    quoteText = quoteText.replaceAll("_", "");
                    // NOTE(review): quotationOffset is computed but only used by the commented-out code below — confirm before removing.
                    int quotationOffset = 1;
                    if (quoteText.startsWith("``"))
                        quotationOffset = 2;
                    List<Integer> connections = readConnection(child.getAttributes().getNamedItem("connection").getTextContent());
                    int id = Integer.parseInt(child.getAttributes().getNamedItem("id").getTextContent().substring(1));
                    Integer mention_id = null;
                    if (connections.size() > 0)
                        mention_id = connections.get(0);
                    else {
                        System.out.println("quote w/ no mention. ID: " + id);
                    }
                    // Pair<Integer, Integer> mentionPair = idToMentionPair.get(mention_id);
                    mentionIdToSpeakerList.add(new Pair<>(mention_id, child.getAttributes().getNamedItem("speaker").getTextContent()));
                    String annotatedQuoteText = quotes.get(quoteIndex).get(CoreAnnotations.TextAnnotation.class);
                    // One XML quote may span several CoreNLP-extracted quotes: keep consuming
                    // CoreNLP quotes (assigning each the same speaker) until the XML quote's
                    // text ends with the current CoreNLP quote.
                    while (!quoteText.endsWith(annotatedQuoteText)) {
                        quoteIndex++;
                        annotatedQuoteText = quotes.get(quoteIndex).get(CoreAnnotations.TextAnnotation.class);
                        mentionIdToSpeakerList.add(new Pair<>(mention_id, child.getAttributes().getNamedItem("speaker").getTextContent()));
                    }
                    // idToMentionPair.put(id, new Pair<>(-1, -1));
                    // imention_id = connections.get(0);
                    // quotes.add(new XMLQuote(quoteText.substring(quotationOffset, quoteText.length() - quotationOffset), child.getAttributes().getNamedItem("speaker").getTextContent(), id, chapterIndex, mention_id));
                    quoteIndex++;
                } else if (child.getNodeName().equals("mention")) {
                    // Top-level (non-quoted) mention: record its token span by id.
                    String mentionText = child.getTextContent();
                    int id = Integer.parseInt(child.getAttributes().getNamedItem("id").getTextContent().substring(1));
                    List<Integer> connections = readConnection(child.getAttributes().getNamedItem("connection").getNodeValue());
                    int endIndex = getEndIndex(tokenIndex, tokens, mentionText);
                    idToMention.put(id, new Mention(mentionText, tokenIndex, endIndex));
                    // mentions.put(id, new XMLMention(child.getTextContent(), tokenIndex, endIndex, id, connections));
                    tokenIndex = endIndex + 1;
                } else {
                    // #text
                    String nodeText = child.getTextContent();
                    nodeText = nodeText.replaceAll("\n(?!\n)", " ");
                    nodeText = nodeText.replaceAll("_", "");
                    // All tokens consumed: trailing text nodes can be skipped safely.
                    if (tokenIndex >= tokens.size()) {
                        continue;
                    }
                    tokenIndex = getEndIndex(tokenIndex, tokens, nodeText) + 1;
                }
            }
        }
    }
    // Resolve each (mention id, speaker) pair into gold quote info; (-1, -1)
    // marks quotes whose mention could not be found.
    for (Pair<Integer, String> item : mentionIdToSpeakerList) {
        Mention mention = idToMention.get(item.first);
        if (mention == null) {
            goldList.add(new GoldQuoteInfo(-1, -1, item.second, null));
        } else {
            goldList.add(new GoldQuoteInfo(mention.begin, mention.end, item.second, mention.text));
        }
    }
    // verify
    if (document.get(CoreAnnotations.QuotationsAnnotation.class).size() != goldList.size()) {
        throw new RuntimeException("Quotes size and gold size don't match!");
    }
    return new Data(goldList, personList, document);
}
Also used : Node(org.w3c.dom.Node) Document(org.w3c.dom.Document) NodeList(org.w3c.dom.NodeList) Pair(edu.stanford.nlp.util.Pair) NodeList(org.w3c.dom.NodeList) Annotation(edu.stanford.nlp.pipeline.Annotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap)

Aggregations

Annotation (edu.stanford.nlp.pipeline.Annotation)138 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)84 CoreMap (edu.stanford.nlp.util.CoreMap)77 CoreLabel (edu.stanford.nlp.ling.CoreLabel)48 StanfordCoreNLP (edu.stanford.nlp.pipeline.StanfordCoreNLP)43 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)31 ArrayList (java.util.ArrayList)31 Properties (java.util.Properties)28 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)21 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)18 Test (org.junit.Test)18 SentencesAnnotation (edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation)15 Tree (edu.stanford.nlp.trees.Tree)14 TokensAnnotation (edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation)12 TreeAnnotation (edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation)12 List (java.util.List)12 CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations)11 IOException (java.io.IOException)11 CorefChain (edu.stanford.nlp.coref.data.CorefChain)10 RNNCoreAnnotations (edu.stanford.nlp.neural.rnn.RNNCoreAnnotations)10