Search in sources:

Example 26 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class WikidictAnnotator method main.

/**
   * A debugging method to try entity linking sentences from the console.
   * Each line typed at the prompt is run through the pipeline, and the Wikipedia
   * entity annotation of every token in the <em>first</em> sentence is printed to
   * standard error, separated by two spaces. Input that yields no sentences
   * produces an empty output line instead of an exception.
   *
   * @param args Command-line arguments, converted to pipeline properties
   *             (the "annotators" property is always overridden).
   * @throws IOException declared for the console loop; presumably raised when
   *                     reading from the console fails
   */
public static void main(String[] args) throws IOException {
    Properties props = StringUtils.argsToProperties(args);
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions,entitylink");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    IOUtils.console("sentence> ", line -> {
        Annotation ann = new Annotation(line);
        pipeline.annotate(ann);
        // Guard against input with no sentences (e.g. whitespace only): calling get(0)
        // on a missing or empty sentence list would abort the whole console session.
        if (ann.get(CoreAnnotations.SentencesAnnotation.class) == null
                || ann.get(CoreAnnotations.SentencesAnnotation.class).isEmpty()) {
            System.err.println();
            return;
        }
        // Only the first sentence of the line is inspected.
        List<CoreLabel> tokens = ann.get(CoreAnnotations.SentencesAnnotation.class).get(0).get(CoreAnnotations.TokensAnnotation.class);
        System.err.println(StringUtils.join(tokens.stream().map(x -> x.get(CoreAnnotations.WikipediaEntityAnnotation.class)), "  "));
    });
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation)

Example 27 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class DocumentPreprocessor method main.

/**
   * Command-line entry point for a simple, deterministic sentence splitter. Only the
   * English tokenizer is supported here, so for other languages you should run the
   * tokenizer first and then run this splitter with the "-whitespaceTokenization" option.
   *
   * <p>Input is read from the files listed under the unnamed ("") option, or from standard
   * input when no file is given. Each sentence is written on its own line to standard
   * output using the requested encoding, and the number of sentences read is reported
   * on standard error at the end.
   *
   * @param args Command-line arguments
   * @throws IOException if the requested encoding is unsupported; presumably also when
   *                     an input cannot be read
   */
public static void main(String[] args) throws IOException {
    final Properties options = StringUtils.argsToProperties(args, argOptionDefs());
    if (options.containsKey("help")) {
        log.info(usage());
        return;
    }

    // General command-line settings
    final String charset = options.getProperty("encoding", "utf-8");
    final boolean reportLengths = PropertiesUtils.getBool(options, "printSentenceLengths", false);
    final String xmlElement = options.getProperty("xml", null);
    final DocType docType = (xmlElement != null) ? DocType.XML : DocType.Plain;
    final String lineDelimiter = options.containsKey("noTokenization") ? System.getProperty("line.separator") : null;
    final String tagDelimiter = options.getProperty("tag", null);

    // Tokenizer flags: at most one of these may be given
    final boolean suppressEscaping = options.containsKey("suppressEscaping");
    final boolean customTokenizer = options.containsKey("tokenizerOptions");
    final boolean printOriginalText = options.containsKey("printOriginalText");
    final boolean whitespaceTokenization = options.containsKey("whitespaceTokenization");
    int flagCount = 0;
    for (boolean flag : new boolean[] { suppressEscaping, customTokenizer, printOriginalText, whitespaceTokenization }) {
        if (flag) {
            flagCount++;
        }
    }
    if (flagCount > 1) {
        log.info("Only one tokenizer flag allowed at a time: ");
        log.info("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
        return;
    }

    // With whitespace tokenization no tokenizer factory is set up; instead the
    // sentence-final words are extended with the newline token.
    TokenizerFactory<? extends HasWord> tf = null;
    String[] sentenceFinalWords = null;
    if (whitespaceTokenization) {
        final List<String> delims = new ArrayList<>(Arrays.asList(DocumentPreprocessor.DEFAULT_SENTENCE_DELIMS));
        delims.add(WhitespaceLexer.NEWLINE);
        sentenceFinalWords = delims.toArray(new String[0]);
    } else {
        final String tokenizerOptions;
        if (suppressEscaping) {
            tokenizerOptions = "ptb3Escaping=false";
        } else if (customTokenizer) {
            tokenizerOptions = options.getProperty("tokenizerOptions");
        } else if (printOriginalText) {
            tokenizerOptions = "invertible=true";
        } else {
            tokenizerOptions = "";
        }
        tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), tokenizerOptions);
    }

    // A single null entry stands for "read from standard input"
    final String fileList = options.getProperty("", null);
    final String[] inputs = (fileList != null) ? fileList.split("\\s+") : new String[1];

    int sentenceCount = 0;
    final PrintWriter out = new PrintWriter(new OutputStreamWriter(System.out, charset), true);
    for (String path : inputs) {
        final boolean readStdin = (path == null || path.isEmpty());
        final DocumentPreprocessor splitter = readStdin
                ? new DocumentPreprocessor(new InputStreamReader(System.in, charset))
                : new DocumentPreprocessor(path, docType, charset);
        if (docType == DocType.XML) {
            splitter.setElementDelimiter(xmlElement);
        }
        splitter.setTokenizerFactory(tf);
        if (lineDelimiter != null) {
            splitter.setSentenceDelimiter(lineDelimiter);
        }
        if (tagDelimiter != null) {
            splitter.setTagDelimiter(tagDelimiter);
        }
        if (sentenceFinalWords != null) {
            splitter.setSentenceFinalPuncWords(sentenceFinalWords);
        }

        for (List<HasWord> sentence : splitter) {
            sentenceCount++;
            if (reportLengths) {
                System.err.printf("Length: %d%n", sentence.size());
            }
            boolean atSentenceStart = true;
            for (HasWord word : sentence) {
                if (printOriginalText) {
                    // Reproduce the original text: leading whitespace once per sentence,
                    // then each token followed by its trailing whitespace.
                    final CoreLabel label = (CoreLabel) word;
                    if (atSentenceStart) {
                        out.print(label.get(CoreAnnotations.BeforeAnnotation.class));
                    }
                    out.print(label.get(CoreAnnotations.OriginalTextAnnotation.class));
                    out.print(label.get(CoreAnnotations.AfterAnnotation.class));
                } else {
                    // Tokens are separated by exactly one space
                    if (!atSentenceStart) {
                        out.print(" ");
                    }
                    out.print(word.word());
                }
                atSentenceStart = false;
            }
            out.println();
        }
    }
    out.close();
    System.err.printf("Read in %d sentences.%n", sentenceCount);
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) InputStreamReader(java.io.InputStreamReader) Properties(java.util.Properties) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ArrayList(java.util.ArrayList) List(java.util.List) OutputStreamWriter(java.io.OutputStreamWriter) PrintWriter(java.io.PrintWriter)

Example 28 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class Sighan2005DocumentReaderAndWriter method printLattice.

/**
   * Converts the given tag lattice into an answer (word) lattice and prints that
   * lattice to {@code out} via {@code printAttFsmFormat}.
   *
   * @param tagLattice Lattice whose initial state is the starting point of the conversion.
   * @param doc Labels of the document; copied into an array for positional access.
   * @param out Destination the answer lattice is printed to.
   * @throws RuntimeException wrapping any {@link IOException} raised while printing
   */
@Override
public void printLattice(DFSA<String, Integer> tagLattice, List<CoreLabel> doc, PrintWriter out) {
    final CoreLabel[] labels = doc.toArray(new CoreLabel[0]);

    // Fresh answer lattice whose initial state carries node id 0
    final MutableInteger lastNodeId = new MutableInteger(0);
    final DFSA<String, Integer> wordLattice = new DFSA<>(null);
    final DFSAState<String, Integer> start = new DFSAState<>(lastNodeId.intValue(), wordLattice);
    wordLattice.setInitialState(start);

    // Walk the tag lattice from its initial state, filling in the answer lattice
    final Map<DFSAState<String, Integer>, DFSAState<String, Integer>> links = Generics.newHashMap();
    tagLatticeToAnswerLattice(tagLattice.initialState(), start, new StringBuilder(), lastNodeId, 0, 0.0, links, wordLattice, labels);

    try {
        wordLattice.printAttFsmFormat(out);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : MutableInteger(edu.stanford.nlp.util.MutableInteger) IOException(java.io.IOException) DFSA(edu.stanford.nlp.fsm.DFSA) CoreLabel(edu.stanford.nlp.ling.CoreLabel) DFSAState(edu.stanford.nlp.fsm.DFSAState)

Example 29 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class Sighan2005DocumentReaderAndWriter method tagLatticeToAnswerLattice.

/**
   * Recursively builds an answer lattice (Chinese words) from a Viterbi search graph
   * of binary predictions. This function does a limited amount of post-processing:
   * preserve white spaces of the input, and not segment between two latin characters or
   * between two digits. Consequently, the probabilities of all paths in answerLattice
   * may not sum to 1 (they do sum to 1 if no post processing applies).
   *
   * <p>Side effects visible in this method: transitions and states are added to the
   * answer lattice, {@code nodeId} is incremented for every new answer node,
   * {@code stateLinks} gains an entry per new answer node, and a final "1" transition
   * may be added to {@code tSource} itself.
   *
   * @param tSource Current node in Viterbi search graph.
   * @param aSource Current node in answer lattice.
   * @param answer Partial word starting at aSource. It is copied for each outgoing
   * transition, so the caller's builder is not modified.
   * @param nodeId Counter holding the most recently assigned answer-lattice node id;
   * it is incremented before each new answer node is created.
   * @param pos Current position in docArray.
   * @param cost Current cost of answer.
   * @param stateLinks Maps nodes of the search graph to nodes in answer lattice
   * (when paths of the search graph are recombined, paths of the answer lattice should be
   *  recombined as well, if at word boundary).
   * @param answerLattice Lattice that newly created answer nodes are constructed with.
   * @param docArray Labels of the document, indexed by {@code pos}; the original character
   * and the space-before annotation are read from them.
   */
private void tagLatticeToAnswerLattice(DFSAState<String, Integer> tSource, DFSAState<String, Integer> aSource, StringBuilder answer, MutableInteger nodeId, int pos, double cost, Map<DFSAState<String, Integer>, DFSAState<String, Integer>> stateLinks, DFSA<String, Integer> answerLattice, CoreLabel[] docArray) {
    // Add "1" prediction after the end of the sentence, if applicable:
    // (an accepting node without outgoing inputs gets one extra "1" transition to a
    // placeholder state with id -1, so the last word is emitted by the loop below)
    if (tSource.isAccepting() && tSource.continuingInputs().isEmpty()) {
        tSource.addTransition(new DFSATransition<>("", tSource, new DFSAState<>(-1, null), "1", "", 0));
    }
    // Get current label, character, and prediction:
    // (all stay null once pos has run past the end of docArray)
    CoreLabel curLabel = (pos < docArray.length) ? docArray[pos] : null;
    String curChr = null, origSpace = null;
    if (curLabel != null) {
        curChr = curLabel.get(CoreAnnotations.OriginalCharAnnotation.class);
        assert (curChr.length() == 1);
        origSpace = curLabel.get(CoreAnnotations.SpaceBeforeAnnotation.class);
    }
    // Get set of successors in search graph:
    Set<String> inputs = tSource.continuingInputs();
    // Only keep most probable transition out of initial state:
    // (answerConstraint ends up as the non-null input with the lowest transition score,
    // or stays null when pos != 0)
    String answerConstraint = null;
    if (pos == 0) {
        double minCost = Double.POSITIVE_INFINITY;
        // DFSATransition<String, Integer> bestTransition = null;
        for (String predictSpace : inputs) {
            DFSATransition<String, Integer> transition = tSource.transition(predictSpace);
            double transitionCost = transition.score();
            if (transitionCost < minCost) {
                if (predictSpace != null) {
                    logger.info(String.format("mincost (%s): %e -> %e%n", predictSpace, minCost, transitionCost));
                    minCost = transitionCost;
                    answerConstraint = predictSpace;
                }
            }
        }
    }
    // Follow along each transition:
    for (String predictSpace : inputs) {
        DFSATransition<String, Integer> transition = tSource.transition(predictSpace);
        DFSAState<String, Integer> tDest = transition.target();
        DFSAState<String, Integer> newASource = aSource;
        //logger.info(String.format("tsource=%s tdest=%s asource=%s pos=%d predictSpace=%s%n", tSource, tDest, newASource, pos, predictSpace));
        // Work on a private copy of the partial word so sibling transitions do not interfere:
        StringBuilder newAnswer = new StringBuilder(answer.toString());
        int answerLen = newAnswer.length();
        // Last character of the partial word so far (null if the word is still empty):
        String prevChr = (answerLen > 0) ? newAnswer.substring(answerLen - 1) : null;
        double newCost = cost;
        // At the initial position, ignore every transition other than the cheapest one chosen above:
        if (answerConstraint != null && !answerConstraint.equals(predictSpace)) {
            logger.info(String.format("Skipping transition %s at pos 0.%n", predictSpace));
            continue;
        }
        // Ignore paths not consistent with input segmentation:
        if (flags.keepAllWhitespaces && "0".equals(predictSpace) && "1".equals(origSpace)) {
            logger.info(String.format("Skipping non-boundary at pos %d, since space in the input.%n", pos));
            continue;
        }
        // Ignore paths that add a boundary between two ASCII letters or between two numerals
        // (unless already present in original input)
        if ("1".equals(predictSpace) && "0".equals(origSpace) && prevChr != null && curChr != null) {
            char p = prevChr.charAt(0), c = curChr.charAt(0);
            if (ChineseStringUtils.isLetterASCII(p) && ChineseStringUtils.isLetterASCII(c)) {
                logger.info(String.format("Not hypothesizing a boundary at pos %d, since between two ASCII letters (%s and %s).%n", pos, prevChr, curChr));
                continue;
            }
            if (ChineseUtils.isNumber(p) && ChineseUtils.isNumber(c)) {
                logger.info(String.format("Not hypothesizing a boundary at pos %d, since between two numeral characters (%s and %s).%n", pos, prevChr, curChr));
                continue;
            }
        }
        // If predictSpace==1, create a new transition in answer search graph:
        // (only when a non-empty partial word has been accumulated)
        if ("1".equals(predictSpace)) {
            if (newAnswer.toString().length() > 0) {
                // If answer destination node visited before, create a new edge and leave:
                if (stateLinks.containsKey(tSource)) {
                    DFSAState<String, Integer> aDest = stateLinks.get(tSource);
                    newASource.addTransition(new DFSATransition<>("", newASource, aDest, newAnswer.toString(), "", newCost));
                    //logger.info(String.format("new transition: asource=%s adest=%s edge=%s%n", newASource, aDest, newAnswer));
                    continue;
                }
                // If answer destination node not visited before, create it + new edge:
                nodeId.incValue(1);
                DFSAState<String, Integer> aDest = new DFSAState<>(nodeId.intValue(), answerLattice, 0.0);
                stateLinks.put(tSource, aDest);
                newASource.addTransition(new DFSATransition<>("", newASource, aDest, newAnswer.toString(), "", newCost));
                // Reached an accepting state:
                if (tSource.isAccepting()) {
                    aDest.setAccepting(true);
                    continue;
                }
                // Start new answer edge:
                newASource = aDest;
                newAnswer = new StringBuilder();
                newCost = 0.0;
            }
        }
        // NOTE(review): curChr is null once pos is past the end of docArray; this is only
        // checked by the assert, so with assertions disabled the charAt call below could
        // throw a NullPointerException - confirm that such a path cannot reach this point.
        assert (curChr != null);
        newAnswer.append(curChr);
        newCost += transition.score();
        // Prune: recurse only while the accumulated cost is below the configured threshold,
        // but always continue when the current character is an ASCII letter.
        if (newCost < flags.searchGraphPrune || ChineseStringUtils.isLetterASCII(curChr.charAt(0)))
            tagLatticeToAnswerLattice(tDest, newASource, newAnswer, nodeId, pos + 1, newCost, stateLinks, answerLattice, docArray);
    }
}
Also used : MutableInteger(edu.stanford.nlp.util.MutableInteger) CoreLabel(edu.stanford.nlp.ling.CoreLabel) DFSAState(edu.stanford.nlp.fsm.DFSAState)

Example 30 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class RulesTest method testMentionMatchesSpeakerAnnotation.

/**
   * Checks {@code Rules.antecedentMatchesMentionSpeakerAnnotation}: a mention's head
   * string is expected to match only when it equals a whole word of the antecedent's
   * speaker annotation; partial or extended words must not match, the relation is not
   * symmetrical, and a missing annotation must not cause a failure.
   */
public void testMentionMatchesSpeakerAnnotation() {
    // Antecedents: only the speaker annotation on the head word is populated.
    Mention speakerFullName = new Mention(0, 0, 0, null);
    speakerFullName.headWord = new CoreLabel();
    speakerFullName.headWord.set(CoreAnnotations.SpeakerAnnotation.class, "john abraham bauer");

    Mention speakerJohn = new Mention(0, 0, 0, null);
    speakerJohn.headWord = new CoreLabel();
    speakerJohn.headWord.set(CoreAnnotations.SpeakerAnnotation.class, "john");

    Mention speakerJoh = new Mention(0, 0, 0, null);
    speakerJoh.headWord = new CoreLabel();
    speakerJoh.headWord.set(CoreAnnotations.SpeakerAnnotation.class, "joh");

    Mention speakerJohnz = new Mention(0, 0, 0, null);
    speakerJohnz.headWord = new CoreLabel();
    speakerJohnz.headWord.set(CoreAnnotations.SpeakerAnnotation.class, "johnz");

    // Mentions: only the head string is populated.
    Mention john = new Mention(0, 0, 0, null);
    john.headString = "john";
    Mention bauer = new Mention(0, 0, 0, null);
    bauer.headString = "bauer";
    Mention foo = new Mention(0, 0, 0, null);
    foo.headString = "foo";
    Mention abraham = new Mention(0, 0, 0, null);
    abraham.headString = "abraham";
    Mention missingFirstChar = new Mention(0, 0, 0, null);
    missingFirstChar.headString = "braham";
    Mention extraFirstChar = new Mention(0, 0, 0, null);
    extraFirstChar.headString = "zabraham";
    Mention missingLastChar = new Mention(0, 0, 0, null);
    missingLastChar.headString = "abraha";
    Mention extraLastChar = new Mention(0, 0, 0, null);
    extraLastChar.headString = "abrahamz";

    // Whole words of a multi-word speaker name match; anything else does not.
    assertTrue(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerFullName, john));
    assertTrue(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerFullName, bauer));
    assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerFullName, foo));
    assertTrue(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerFullName, abraham));
    assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerFullName, missingFirstChar));
    assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerFullName, extraFirstChar));
    assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerFullName, missingLastChar));
    assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerFullName, extraLastChar));

    // Single-word speaker names: exact match only.
    assertTrue(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerJohn, john));
    assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerJoh, john));
    assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(speakerJohnz, john));

    // not symmetrical
    // also, shouldn't blow up if the annotation isn't set
    assertFalse(Rules.antecedentMatchesMentionSpeakerAnnotation(john, speakerFullName));
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations)

Aggregations

CoreLabel (edu.stanford.nlp.ling.CoreLabel)533 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)310 CoreMap (edu.stanford.nlp.util.CoreMap)102 ArrayList (java.util.ArrayList)101 Tree (edu.stanford.nlp.trees.Tree)98 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)96 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)63 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)53 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)41 IndexedWord (edu.stanford.nlp.ling.IndexedWord)38 List (java.util.List)33 Annotation (edu.stanford.nlp.pipeline.Annotation)31 Mention (edu.stanford.nlp.coref.data.Mention)29 Label (edu.stanford.nlp.ling.Label)28 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)26 Properties (java.util.Properties)24 CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations)21 CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation)19 SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge)18 StringReader (java.io.StringReader)18