Search in sources :

Example 1 with HasWord

use of edu.stanford.nlp.ling.HasWord in project lucida by claritylab.

the class StanfordPosTagger method tagPos.

/**
	 * Tags the tokens with part of speech
	 * 
	 * @param tokens Array of token strings
	 * @return Part of speech tags
	 */
public static String[] tagPos(String[] tokens) {
    Sentence untagged = createSentence(tokens);
    Sentence tagged = MaxentTagger.tagSentence(untagged);
    String[] pos = new String[tagged.size()];
    for (int i = 0; i < tagged.size(); i++) {
        HasWord w = (HasWord) tagged.get(i);
        String[] s = w.toString().split("/");
        if (s.length > 1)
            pos[i] = s[s.length - 1];
        else
            pos[i] = "";
    }
    return pos;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Sentence(edu.stanford.nlp.ling.Sentence)

Example 2 with HasWord

use of edu.stanford.nlp.ling.HasWord in project lucida by claritylab.

the class StanfordPosTagger method createSentence.

/**
	 * Combines the tokens into a <code>Sentence</code> 
	 * 
	 * @param tokens
	 * @return <code>Sentence</code> made of the tokens
	 */
@SuppressWarnings("unchecked")
private static Sentence createSentence(String[] tokens) {
    ArrayList<HasWord> wordList = new ArrayList<HasWord>();
    for (String s : tokens) {
        HasWord w = new Word(s);
        wordList.add(w);
    }
    Sentence sentence = new Sentence();
    sentence.setWords(wordList);
    return sentence;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Word(edu.stanford.nlp.ling.Word) HasWord(edu.stanford.nlp.ling.HasWord) ArrayList(java.util.ArrayList) Sentence(edu.stanford.nlp.ling.Sentence)

Example 3 with HasWord

use of edu.stanford.nlp.ling.HasWord in project common-crawl by matpalm.

the class SentenceTokeniser method extractSentences.

public List<String> extractSentences(String text) {
    this.inputReader = new StringReader(text);
    List<String> sentences = new ArrayList<String>();
    for (List<HasWord> sentence : this) {
        StringBuilder sentenceBuffer = new StringBuilder();
        for (HasWord word : sentence) {
            sentenceBuffer.append(word.word() + " ");
        }
        String sentenceWithoutTrailingSpace = sentenceBuffer.toString().substring(0, sentenceBuffer.length() - 1);
        sentences.add(sentenceWithoutTrailingSpace);
    }
    return sentences;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList)

Example 4 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class ParseFiles method parseFiles.

public void parseFiles(String[] args, int argIndex, boolean tokenized, TokenizerFactory<? extends HasWord> tokenizerFactory, String elementDelimiter, String sentenceDelimiter, Function<List<HasWord>, List<HasWord>> escaper, String tagDelimiter) {
    final DocType docType = (elementDelimiter == null) ? DocType.Plain : DocType.XML;
    if (op.testOptions.verbose) {
        if (tokenizerFactory != null)
            pwErr.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
    }
    final Timing timer = new Timing();
    //Loop over the files
    for (int i = argIndex; i < args.length; i++) {
        final String filename = args[i];
        final DocumentPreprocessor documentPreprocessor;
        if (filename.equals("-")) {
            try {
                documentPreprocessor = new DocumentPreprocessor(IOUtils.readerFromStdin(op.tlpParams.getInputEncoding()), docType);
            } catch (IOException e) {
                throw new RuntimeIOException(e);
            }
        } else {
            documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.getInputEncoding());
        }
        //Unused values are null per the main() method invocation below
        //null is the default for these properties
        documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
        documentPreprocessor.setEscaper(escaper);
        documentPreprocessor.setSentenceDelimiter(sentenceDelimiter);
        documentPreprocessor.setTagDelimiter(tagDelimiter);
        documentPreprocessor.setElementDelimiter(elementDelimiter);
        if (tokenizerFactory == null)
            documentPreprocessor.setTokenizerFactory((tokenized) ? null : tlp.getTokenizerFactory());
        else
            documentPreprocessor.setTokenizerFactory(tokenizerFactory);
        //Setup the output
        PrintWriter pwo = pwOut;
        if (op.testOptions.writeOutputFiles) {
            String normalizedName = filename;
            try {
                // this will exception if not a URL
                new URL(normalizedName);
                normalizedName = normalizedName.replaceAll("/", "_");
            } catch (MalformedURLException e) {
            //It isn't a URL, so silently ignore
            }
            String ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
            String fname = normalizedName + '.' + ext;
            if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.isEmpty()) {
                String fseparator = System.getProperty("file.separator");
                if (fseparator == null || fseparator.isEmpty()) {
                    fseparator = "/";
                }
                File fnameFile = new File(fname);
                fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.getName();
            }
            try {
                pwo = op.tlpParams.pw(new FileOutputStream(fname));
            } catch (IOException ioe) {
                throw new RuntimeIOException(ioe);
            }
        }
        treePrint.printHeader(pwo, op.tlpParams.getOutputEncoding());
        pwErr.println("Parsing file: " + filename);
        int num = 0;
        int numProcessed = 0;
        if (op.testOptions.testingThreads != 1) {
            MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
            for (List<HasWord> sentence : documentPreprocessor) {
                num++;
                numSents++;
                int len = sentence.size();
                numWords += len;
                pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
                wrapper.put(sentence);
                while (wrapper.peek()) {
                    ParserQuery pq = wrapper.poll();
                    processResults(pq, numProcessed++, pwo);
                }
            }
            wrapper.join();
            while (wrapper.peek()) {
                ParserQuery pq = wrapper.poll();
                processResults(pq, numProcessed++, pwo);
            }
        } else {
            ParserQuery pq = pqFactory.parserQuery();
            for (List<HasWord> sentence : documentPreprocessor) {
                num++;
                numSents++;
                int len = sentence.size();
                numWords += len;
                pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
                pq.parseAndReport(sentence, pwErr);
                processResults(pq, numProcessed++, pwo);
            }
        }
        treePrint.printFooter(pwo);
        if (op.testOptions.writeOutputFiles)
            pwo.close();
        pwErr.println("Parsed file: " + filename + " [" + num + " sentences].");
    }
    long millis = timer.stop();
    if (summary) {
        if (pcfgLL != null)
            pcfgLL.display(false, pwErr);
        if (depLL != null)
            depLL.display(false, pwErr);
        if (factLL != null)
            factLL.display(false, pwErr);
    }
    if (saidMemMessage) {
        ParserUtils.printOutOfMemory(pwErr);
    }
    double wordspersec = numWords / (((double) millis) / 1000);
    double sentspersec = numSents / (((double) millis) / 1000);
    // easier way!
    NumberFormat nf = new DecimalFormat("0.00");
    pwErr.println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.format(wordspersec) + " wds/sec; " + nf.format(sentspersec) + " sents/sec).");
    if (numFallback > 0) {
        pwErr.println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
        pwErr.println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0) {
            pwErr.println("    " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0) {
            pwErr.println("    " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0) {
            pwErr.println("    " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
        }
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) MalformedURLException(java.net.MalformedURLException) MulticoreWrapper(edu.stanford.nlp.util.concurrent.MulticoreWrapper) DecimalFormat(java.text.DecimalFormat) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) URL(java.net.URL) ParsingThreadsafeProcessor(edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor) FileOutputStream(java.io.FileOutputStream) List(java.util.List) Timing(edu.stanford.nlp.util.Timing) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor) File(java.io.File) DocType(edu.stanford.nlp.process.DocumentPreprocessor.DocType) PrintWriter(java.io.PrintWriter) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery) NumberFormat(java.text.NumberFormat)

Example 5 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class DependencyParser method predict.

/**
   * Convenience method for {@link #predict(edu.stanford.nlp.util.CoreMap)}. The tokens of the provided sentence must
   * also have tag annotations (the parser requires part-of-speech tags).
   *
   * @see #predict(edu.stanford.nlp.util.CoreMap)
   */
public GrammaticalStructure predict(List<? extends HasWord> sentence) {
    CoreLabel sentenceLabel = new CoreLabel();
    List<CoreLabel> tokens = new ArrayList<>();
    int i = 1;
    for (HasWord wd : sentence) {
        CoreLabel label;
        if (wd instanceof CoreLabel) {
            label = (CoreLabel) wd;
            if (label.tag() == null)
                throw new IllegalArgumentException("Parser requires words " + "with part-of-speech tag annotations");
        } else {
            label = new CoreLabel();
            label.setValue(wd.word());
            label.setWord(wd.word());
            if (!(wd instanceof HasTag))
                throw new IllegalArgumentException("Parser requires words " + "with part-of-speech tag annotations");
            label.setTag(((HasTag) wd).tag());
        }
        label.setIndex(i);
        i++;
        tokens.add(label);
    }
    sentenceLabel.set(CoreAnnotations.TokensAnnotation.class, tokens);
    return predict(sentenceLabel);
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) HasTag(edu.stanford.nlp.ling.HasTag)

Aggregations

HasWord (edu.stanford.nlp.ling.HasWord)57 CoreLabel (edu.stanford.nlp.ling.CoreLabel)17 TaggedWord (edu.stanford.nlp.ling.TaggedWord)15 ArrayList (java.util.ArrayList)14 HasTag (edu.stanford.nlp.ling.HasTag)13 Tree (edu.stanford.nlp.trees.Tree)13 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)11 StringReader (java.io.StringReader)11 Label (edu.stanford.nlp.ling.Label)10 Word (edu.stanford.nlp.ling.Word)10 List (java.util.List)8 BufferedReader (java.io.BufferedReader)6 MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)5 File (java.io.File)5 PrintWriter (java.io.PrintWriter)5 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)4 Pair (edu.stanford.nlp.util.Pair)4 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 HasIndex (edu.stanford.nlp.ling.HasIndex)3 Sentence (edu.stanford.nlp.ling.Sentence)3