Search in sources :

Example 16 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

From the class Tree, the method yieldHasWord:

/**
 * Collects the typed yield of this tree: the leaf labels, each guaranteed to
 * implement {@link HasWord}, appended to {@code y} in left-to-right order.
 * Leaf labels that already implement HasWord are added directly (CoreLabels
 * with an unset word field get it backfilled from value()); any other label
 * is wrapped in a {@link Word}.
 *
 * @param y the list to append the yield to; also returned for chaining
 * @return {@code y}, with this tree's leaves appended
 */
@SuppressWarnings("unchecked")
public <X extends HasWord> ArrayList<X> yieldHasWord(ArrayList<X> y) {
    if (isLeaf()) {
        Label lab = label();
        // A tree may be built with a LabeledScoredTreeFactory but pass a
        // StringLabel to e.g. newLeaf(), so the label need not implement HasWord.
        if (lab instanceof HasWord) {
            if (lab instanceof CoreLabel) {
                CoreLabel cl = (CoreLabel) lab;
                // A CoreLabel imported from plain treebank text may have only its
                // value field populated; backfill word() so HasWord callers see it.
                if (cl.word() == null)
                    cl.setWord(cl.value());
                y.add((X) cl);
            } else {
                y.add((X) lab);
            }
        } else {
            // Wrap non-HasWord labels so the returned list stays well-typed.
            y.add((X) new Word(lab));
        }
    } else {
        Tree[] kids = children();
        for (Tree kid : kids) {
            // BUG FIX: previously recursed via kid.yield(y), which skips the
            // HasWord conversion above and can insert labels that do not
            // implement HasWord into the typed list. Recurse with this method.
            kid.yieldHasWord(y);
        }
    }
    return y;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) HasWord(edu.stanford.nlp.ling.HasWord) TaggedWord(edu.stanford.nlp.ling.TaggedWord) Word(edu.stanford.nlp.ling.Word) LabeledWord(edu.stanford.nlp.ling.LabeledWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label)

Example 17 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

From the class Tree, the method makeDependencyLabel:

/**
 * Converts a constituency label into a label suitable for a dependency node,
 * optionally carrying over the word form, token index, and POS tag.
 *
 * @param oldLabel   the constituency label to convert
 * @param copyLabel  if false, {@code oldLabel} is returned unchanged
 * @param copyIndex  whether to copy the token index (only when both labels implement HasIndex)
 * @param copyPosTag whether to copy the POS tag (only when both labels implement HasTag)
 * @return the new dependency label, or {@code oldLabel} itself when not copying
 */
private static Label makeDependencyLabel(Label oldLabel, boolean copyLabel, boolean copyIndex, boolean copyPosTag) {
    if (!copyLabel) {
        return oldLabel;
    }
    // Prefer the surface word when the label carries one; fall back to its value.
    String wordForm;
    if (oldLabel instanceof HasWord) {
        wordForm = ((HasWord) oldLabel).word();
    } else {
        wordForm = oldLabel.value();
    }
    Label newLabel = oldLabel.labelFactory().newLabel(wordForm);
    if (newLabel instanceof HasWord) {
        ((HasWord) newLabel).setWord(wordForm);
    }
    if (copyPosTag && newLabel instanceof HasTag && oldLabel instanceof HasTag) {
        ((HasTag) newLabel).setTag(((HasTag) oldLabel).tag());
    }
    if (copyIndex && newLabel instanceof HasIndex && oldLabel instanceof HasIndex) {
        ((HasIndex) newLabel).setIndex(((HasIndex) oldLabel).index());
    }
    return newLabel;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) HasTag(edu.stanford.nlp.ling.HasTag) HasIndex(edu.stanford.nlp.ling.HasIndex)

Example 18 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

From the class BuildBinarizedDataset, the method main:

/**
   * Turns a text file into trees for use in an RNTN classifier such as
   * the treebank used in the Sentiment project.
   * <br>
   * The expected input file is one sentence per line, with sentences
   * separated by blank lines. The first line has the main label of the sentence together with the full sentence.
   * Lines after the first sentence line but before
   * the blank line will be treated as labeled sub-phrases.  The
   * labels should start with the label and then contain a list of
   * tokens the label applies to. All phrases that do not have their own label will take on the main sentence label!
   *  For example:
   * <br>
   * <code>
   * 1 Today is not a good day.<br>
   * 3 good<br>
   * 3 good day <br>
   * 3 a good day <br>
   * <br>
   * (next block starts here) <br>
   * </code>
   * By default the englishPCFG parser is used.  This can be changed
   * with the <code>-parserModel</code> flag.  Specify an input file
   * with <code>-input</code>.
   * <br>
   * If a sentiment model is provided with -sentimentModel, that model
   * will be used to prelabel the sentences.  Any spans with given
   * labels will then be used to adjust those labels.
   */
public static void main(String[] args) {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String inputPath = null;
    String sentimentModelPath = null;
    SentimentModel sentimentModel = null;
    // Each recognized flag consumes its following value; bail out on anything else.
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-input")) {
            inputPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
            parserModel = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
            sentimentModelPath = args[argIndex + 1];
            argIndex += 2;
        } else {
            log.info("Unknown argument " + args[argIndex]);
            System.exit(2);
        }
    }
    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }
    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }
    String text = IOUtils.slurpFileNoExceptions(inputPath);
    // need blank line to make a new chunk
    String[] chunks = text.split("\\n\\s*\\n+");
    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequence line, if any, will be a value
        // followed by the sequence of tokens that get that value.
        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        // FIX: new Integer(String) is deprecated (removed in recent JDKs);
        // Integer.valueOf parses the leading label token identically.
        Integer mainLabel = Integer.valueOf(tokens.get(0).word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.subList(1, tokens.size());
        //log.info(tokens);
        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }
        // TODO: add an option which treats the spans as constraints when parsing
        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);
        // If a sentiment model was supplied, prelabel every node with it first,
        // then let the user-provided span labels override below.
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
        }
        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();
        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }
        System.out.println(collapsedUnary);
    //System.out.println();
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) TreeBinarizer(edu.stanford.nlp.parser.lexparser.TreeBinarizer) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) StringReader(java.io.StringReader) Tree(edu.stanford.nlp.trees.Tree) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor) Map(java.util.Map) Pair(edu.stanford.nlp.util.Pair)

Example 19 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

From the class SequenceGibbsSampler, the method printSamples:

/**
 * Prints one row per document position: the token text (padded/trimmed to
 * width 10, or "null" for a missing token) followed by that position's
 * sampled state from each sample sequence, left-padded to width 2.
 *
 * @param samples a list of int[] state sequences, each parallel to the document
 * @param out     the stream to write the table to
 */
public void printSamples(List samples, PrintStream out) {
    for (int position = 0; position < document.size(); position++) {
        HasWord token = (HasWord) document.get(position);
        String text = (token == null) ? "null" : token.word();
        out.print(StringUtils.padOrTrim(text, 10));
        for (Object sample : samples) {
            int[] sequence = (int[]) sample;
            out.print(" " + StringUtils.padLeft(sequence[position], 2));
        }
        out.println();
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord)

Example 20 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

From the class AbstractSequenceClassifier, the method preprocessTokens:

/**
 * Converts a raw token sequence into the document representation (type IN)
 * the classifier operates on: each token is wrapped via the token factory,
 * annotated with its position and the background answer symbol, and the
 * whole document is then run through ObjectBankWrapper for preprocessing.
 *
 * @param tokenSequence the input tokens; tokens that are CoreMaps keep all
 *                      their existing annotations
 * @return the preprocessed document, one IN per input token
 */
private List<IN> preprocessTokens(List<? extends HasWord> tokenSequence) {
    // log.info("knownLCWords.size is " + knownLCWords.size() + "; knownLCWords.maxSize is " + knownLCWords.getMaxSize() +
    //                   ", prior to NER for " + getClass().toString());
    List<IN> document = new ArrayList<>();
    int i = 0;
    for (HasWord word : tokenSequence) {
        // initialized below
        IN wi;
        if (word instanceof CoreMap) {
            // copy all annotations! some are required later in
            // AbstractSequenceClassifier.classifyWithInlineXML
            // wi = (IN) new ArrayCoreMap((ArrayCoreMap) word);
            wi = tokenFactory.makeToken((IN) word);
        } else {
            // Plain HasWord token: build a fresh token carrying only the text.
            wi = tokenFactory.makeToken();
            wi.set(CoreAnnotations.TextAnnotation.class, word.word());
        // wi.setWord(word.word());
        }
        // Record the 0-based position and seed the answer with the background
        // symbol; classification overwrites the answer later.
        wi.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(i));
        wi.set(CoreAnnotations.AnswerAnnotation.class, backgroundSymbol());
        document.add(wi);
        i++;
    }
    // TODO get rid of ObjectBankWrapper
    ObjectBankWrapper<IN> wrapper = new ObjectBankWrapper<>(flags, null, knownLCWords);
    wrapper.processDocument(document);
    // log.info("Size of knownLCWords is " + knownLCWords.size() + ", after NER for " + getClass().toString());
    return document;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations)

Aggregations

HasWord (edu.stanford.nlp.ling.HasWord)57 CoreLabel (edu.stanford.nlp.ling.CoreLabel)17 TaggedWord (edu.stanford.nlp.ling.TaggedWord)15 ArrayList (java.util.ArrayList)14 HasTag (edu.stanford.nlp.ling.HasTag)13 Tree (edu.stanford.nlp.trees.Tree)13 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)11 StringReader (java.io.StringReader)11 Label (edu.stanford.nlp.ling.Label)10 Word (edu.stanford.nlp.ling.Word)10 List (java.util.List)8 BufferedReader (java.io.BufferedReader)6 MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)5 File (java.io.File)5 PrintWriter (java.io.PrintWriter)5 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)4 Pair (edu.stanford.nlp.util.Pair)4 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 HasIndex (edu.stanford.nlp.ling.HasIndex)3 Sentence (edu.stanford.nlp.ling.Sentence)3