Search in sources :

Example 46 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class AnnotatedTextReader method parseFile.

/**
 * Parses inline-annotated text, one document per input line, into sentences of labeled tokens.
 *
 * <p>Each line is either {@code id<TAB>text} or just {@code text}; when no id field is present
 * the zero-based line number is used instead. The text is tokenized and sentence-split with a
 * {@link DocumentPreprocessor}. Tokens of the form {@code <CAT>} and {@code </CAT>}, where
 * {@code CAT} is one of {@code categoriesAllowed}, are not emitted as tokens: an opening tag
 * switches the current label to {@code CAT}, and any closing tag switches it back to the
 * background symbol {@code "O"}. The current label is reset to {@code "O"} at the start of every
 * line, but carries over between sentences of the same line.
 *
 * @param reader source of the annotated lines; it is read to the end but not closed here
 * @param categoriesAllowed category names recognized inside {@code <...>} / {@code </...>} tags;
 *        they are joined with {@code |} into a regular expression without escaping
 * @param setClassForTheseLabels optional (may be {@code null}) map from label to the annotation
 *        key under which that label is additionally stored on each token carrying it
 * @param setGoldClass whether to store the current label on each token under
 *        {@code CoreAnnotations.GoldAnswerAnnotation}
 * @param sentIDprefix prefix prepended to every document id
 * @return one {@link CoreMap} per sentence, carrying its text, its tokens and a document id of
 *         the form {@code sentIDprefix + id + "-" + sentenceIndex} (sentence index is zero-based
 *         within the line)
 * @throws IOException if reading from {@code reader} fails
 */
public static List<CoreMap> parseFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) throws IOException {
    String categoryAlternation = StringUtils.join(categoriesAllowed, "|");
    Pattern startingLabelToken = Pattern.compile("<(" + categoryAlternation + ")>");
    Pattern endLabelToken = Pattern.compile("</(" + categoryAlternation + ")>");
    String backgroundSymbol = "O";
    // The factory does not depend on the line being read, so build it once instead of per line.
    PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory.newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
    List<CoreMap> sentences = new ArrayList<>();
    int lineNum = -1;
    String line;
    while ((line = reader.readLine()) != null) {
        lineNum++;
        // split with limit 2 always yields exactly one or two fields, so these two cases are exhaustive.
        String[] fields = line.split("\t", 2);
        String id;
        String text;
        if (fields.length == 2) {
            id = fields[0];
            text = fields[1];
        } else {
            text = fields[0];
            id = String.valueOf(lineNum);
        }
        id = sentIDprefix + id;
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
        dp.setTokenizerFactory(tokenizerFactory);
        // The label is tracked per line: an unclosed tag spans the remaining sentences of the line.
        String label = backgroundSymbol;
        int sentNum = -1;
        for (List<HasWord> sentence : dp) {
            sentNum++;
            StringBuilder sentText = new StringBuilder();
            List<CoreLabel> sent = new ArrayList<>();
            for (HasWord tokw : sentence) {
                String tok = tokw.word();
                Matcher startingMatcher = startingLabelToken.matcher(tok);
                if (startingMatcher.matches()) {
                    label = startingMatcher.group(1);
                    continue;
                }
                if (endLabelToken.matcher(tok).matches()) {
                    label = backgroundSymbol;
                    continue;
                }
                // Ordinary token: record it with the label currently in effect.
                sentText.append(' ').append(tok);
                CoreLabel c = new CoreLabel();
                c.setWord(tok);
                c.setLemma(tok);
                c.setValue(tok);
                c.set(CoreAnnotations.TextAnnotation.class, tok);
                c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);
                if (setGoldClass) {
                    c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
                }
                if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
                    c.set(setClassForTheseLabels.get(label), label);
                }
                sent.add(c);
            }
            CoreMap sentcm = new ArrayCoreMap();
            sentcm.set(CoreAnnotations.TextAnnotation.class, sentText.toString().trim());
            sentcm.set(CoreAnnotations.TokensAnnotation.class, sent);
            sentcm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
            sentences.add(sentcm);
        }
    }
    return sentences;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) ArrayCoreMap(edu.stanford.nlp.util.ArrayCoreMap) Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) CoreLabel(edu.stanford.nlp.ling.CoreLabel) StringReader(java.io.StringReader) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 47 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class WordSegmentingTokenizer method getNext.

/**
 * Returns the next segmented word, refilling the pending-word iterator from the
 * underlying tokenizer whenever it is unset or exhausted.
 *
 * @return the next word, or {@code null} when the underlying tokenizer has no more
 *         tokens or produces a token whose word is {@code null}
 */
@Override
protected HasWord getNext() {
    for (;;) {
        // Serve from the current batch of segmented words while any remain.
        if (wordIter != null && wordIter.hasNext()) {
            return wordIter.next();
        }
        if (!tok.hasNext()) {
            return null;
        }
        CoreLabel rawToken = tok.next();
        String rawWord = rawToken.word();
        if (rawWord == null) {
            return null;
        }
        List<HasWord> batch;
        if (s_isNewline(rawWord)) {
            // Newline tokens are passed through untouched rather than segmented,
            // so that they are returned as-is when newlines are significant.
            batch = Collections.<HasWord>singletonList(rawToken);
        } else {
            batch = wordSegmenter.segment(rawWord);
        }
        wordIter = batch.iterator();
    }
}

/** Tells whether the given token text is the lexer's newline marker. */
private static boolean s_isNewline(String word) {
    return word.equals(WhitespaceLexer.NEWLINE);
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel)

Example 48 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class PTBEscapingProcessor method fixQuotes.

/**
 * Rewrites plain double-quote tokens ({@code "}) in place as alternating opening
 * ({@code ``}) and closing ({@code ''}) quote tokens.
 *
 * <p>If the last token is a plain quote, alternation runs from the end of the list
 * and that last quote becomes a closing quote; otherwise alternation runs from the
 * beginning and the first plain quote becomes an opening quote.
 *
 * @param input the tokens to fix; the contained objects are modified via {@code setWord}
 * @return a new list holding the same token objects in their original order
 */
private static List<HasWord> fixQuotes(List<HasWord> input) {
    LinkedList<HasWord> fixed = new LinkedList<>();
    int count = input.size();
    if (count == 0) {
        return fixed;
    }
    boolean endsWithQuote = input.get(count - 1).word().equals("\"");
    // Whether the next plain quote encountered should become an opening quote.
    // Walking backwards from a trailing quote, the first one seen is a closer.
    boolean openNext = !endsWithQuote;
    if (endsWithQuote) {
        int idx = count;
        while (idx-- > 0) {
            HasWord item = input.get(idx);
            String text = item.word();
            if (text.equals("\"")) {
                item.setWord(openNext ? "``" : "\'\'");
                openNext = !openNext;
            }
            // Prepending while walking backwards keeps the original order.
            fixed.addFirst(item);
        }
    } else {
        for (HasWord item : input) {
            String text = item.word();
            if (text.equals("\"")) {
                item.setWord(openNext ? "``" : "\'\'");
                openNext = !openNext;
            }
            fixed.addLast(item);
        }
    }
    return fixed;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord)

Example 49 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class PlainTextDocumentReaderAndWriter method init.

/**
 * Builds the tokenizer factory described by {@code flags} and delegates to
 * {@code init(flags, factory)}.
 *
 * <p>The tokenizer options always start with {@code tokenizeNLs=false,invertible=true};
 * {@code flags.tokenizerOptions}, when set, is appended after a comma. When
 * {@code flags.tokenizerFactory} names a class, its static
 * {@code newCoreLabelTokenizerFactory(String)} method is invoked reflectively;
 * otherwise the PTB tokenizer factory is used.
 *
 * @param flags configuration supplying the optional tokenizer factory class name and options
 * @throws RuntimeException wrapping any exception raised while loading or invoking the
 *         configured factory class
 */
@Override
public void init(SeqClassifierFlags flags) {
    String baseOptions = "tokenizeNLs=false,invertible=true";
    String options = flags.tokenizerOptions == null
            ? baseOptions
            : baseOptions + ',' + flags.tokenizerOptions;
    TokenizerFactory<IN> factory;
    if (flags.tokenizerFactory == null) {
        // No custom factory configured: fall back to the PTB tokenizer.
        factory = ErasureUtils.uncheckedCast(PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory(options));
    } else {
        try {
            Class<TokenizerFactory<? extends HasWord>> factoryClass = ErasureUtils.uncheckedCast(Class.forName(flags.tokenizerFactory));
            Method creator = factoryClass.getMethod("newCoreLabelTokenizerFactory", String.class);
            // Static factory method, hence the null receiver.
            factory = ErasureUtils.uncheckedCast(creator.invoke(null, options));
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
    init(flags, factory);
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) TokenizerFactory(edu.stanford.nlp.process.TokenizerFactory) Method(java.lang.reflect.Method)

Example 50 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class Tree method yield.

/**
 * Collects the labels of this tree's leaves, in left-to-right order, into the
 * supplied list.
 *
 * <p>A single list is threaded through the recursion, so no intermediate lists
 * are created. Null labels, if any, are added like any other value. As a side
 * effect, a leaf label that is a {@code HasWord} has its word set to the
 * label's {@code value()} before it is added.
 *
 * @param y The list that receives the yield. It is normally empty on entry;
 *          if it is not, the leaf labels are appended after the existing
 *          elements.
 * @return the same list {@code y}, now containing the labels of the leaves.
 */
@SuppressWarnings("unchecked")
public <T> List<T> yield(List<T> y) {
    if (!isLeaf()) {
        // Interior node: let each child append its own leaves, left to right.
        for (Tree child : children()) {
            child.yield(y);
        }
        return y;
    }
    if (label() instanceof HasWord) {
        // Keep the label's word in sync with its value.
        ((HasWord) label()).setWord(label().value());
    }
    y.add((T) label());
    return y;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord)

Aggregations

HasWord (edu.stanford.nlp.ling.HasWord)58 CoreLabel (edu.stanford.nlp.ling.CoreLabel)17 TaggedWord (edu.stanford.nlp.ling.TaggedWord)15 ArrayList (java.util.ArrayList)15 HasTag (edu.stanford.nlp.ling.HasTag)13 Tree (edu.stanford.nlp.trees.Tree)13 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)12 StringReader (java.io.StringReader)12 Label (edu.stanford.nlp.ling.Label)10 Word (edu.stanford.nlp.ling.Word)10 List (java.util.List)8 BufferedReader (java.io.BufferedReader)6 MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)5 File (java.io.File)5 PrintWriter (java.io.PrintWriter)5 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)4 Pair (edu.stanford.nlp.util.Pair)4 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 HasIndex (edu.stanford.nlp.ling.HasIndex)3 Sentence (edu.stanford.nlp.ling.Sentence)3