Search in sources :

Example 1 with HasWord

use of edu.stanford.nlp.ling.HasWord in project lucida by claritylab.

the class StanfordPosTagger method tagPos.

	/**
	 * Tags the tokens with their parts of speech.
	 * @param tokens Array of token strings
	 * @return Part of speech tags
	 */
public static String[] tagPos(String[] tokens) {
    Sentence untagged = createSentence(tokens);
    Sentence tagged = MaxentTagger.tagSentence(untagged);
    String[] pos = new String[tagged.size()];
    for (int i = 0; i < tagged.size(); i++) {
        HasWord w = (HasWord) tagged.get(i);
        String[] s = w.toString().split("/");
        if (s.length > 1)
            pos[i] = s[s.length - 1];
            pos[i] = "";
    return pos;
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Sentence(edu.stanford.nlp.ling.Sentence)

Example 2 with HasWord

use of edu.stanford.nlp.ling.HasWord in project lucida by claritylab.

the class StanfordPosTagger method createSentence.

	/**
	 * Combines the tokens into a <code>Sentence</code>.
	 * @param tokens token strings to combine
	 * @return <code>Sentence</code> made of the tokens
	 */
private static Sentence createSentence(String[] tokens) {
    ArrayList<HasWord> wordList = new ArrayList<HasWord>();
    for (String s : tokens) {
        HasWord w = new Word(s);
    Sentence sentence = new Sentence();
    return sentence;
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Word(edu.stanford.nlp.ling.Word) HasWord(edu.stanford.nlp.ling.HasWord) ArrayList(java.util.ArrayList) Sentence(edu.stanford.nlp.ling.Sentence)

Example 3 with HasWord

use of edu.stanford.nlp.ling.HasWord in project common-crawl by matpalm.

the class SentenceTokeniser method extractSentences.

public List<String> extractSentences(String text) {
    this.inputReader = new StringReader(text);
    List<String> sentences = new ArrayList<String>();
    for (List<HasWord> sentence : this) {
        StringBuilder sentenceBuffer = new StringBuilder();
        for (HasWord word : sentence) {
            sentenceBuffer.append(word.word() + " ");
        String sentenceWithoutTrailingSpace = sentenceBuffer.toString().substring(0, sentenceBuffer.length() - 1);
    return sentences;
Also used : HasWord(edu.stanford.nlp.ling.HasWord) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList)

Example 4 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class ParseFiles method parseFiles.

/**
 * Parses each input named in {@code args[argIndex..]} and prints timing and
 * count statistics to {@code pwErr} afterwards.
 *
 * An input named "-" is read from standard input; anything else is handed to
 * DocumentPreprocessor as a file name. When op.testOptions.writeOutputFiles is
 * set, a per-input output file name is derived from the input name.
 *
 * NOTE(review): this listing appears to have lost lines. Closing braces are
 * missing throughout, two {@code if} statements have no visible body, and
 * counters such as num are never updated in the visible text. The parameters
 * sentenceDelimiter, escaper and tagDelimiter are not referenced in the
 * visible lines. Verify against the full source before relying on it.
 *
 * @param args command-line arguments; entries from argIndex on are inputs
 * @param argIndex index of the first input in args
 * @param tokenized when no tokenizer factory is given, true installs a null
 *        tokenizer factory instead of the one from tlp
 * @param tokenizerFactory tokenizer factory, or null to use the default
 * @param elementDelimiter null selects DocType.Plain, non-null DocType.XML
 * @param sentenceDelimiter not referenced in the visible lines
 * @param escaper not referenced in the visible lines
 * @param tagDelimiter not referenced in the visible lines
 */
public void parseFiles(String[] args, int argIndex, boolean tokenized, TokenizerFactory<? extends HasWord> tokenizerFactory, String elementDelimiter, String sentenceDelimiter, Function<List<HasWord>, List<HasWord>> escaper, String tagDelimiter) {
    final DocType docType = (elementDelimiter == null) ? DocType.Plain : DocType.XML;
    if (op.testOptions.verbose) {
        if (tokenizerFactory != null)
            pwErr.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
    final Timing timer = new Timing();
    //Loop over the files
    for (int i = argIndex; i < args.length; i++) {
        final String filename = args[i];
        final DocumentPreprocessor documentPreprocessor;
        // "-" selects standard input instead of a named file.
        if (filename.equals("-")) {
            try {
                documentPreprocessor = new DocumentPreprocessor(IOUtils.readerFromStdin(op.tlpParams.getInputEncoding()), docType);
            } catch (IOException e) {
                throw new RuntimeIOException(e);
        } else {
            documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.getInputEncoding());
        //Unused values are null per the main() method invocation below
        //null is the default for these properties
        if (tokenizerFactory == null)
            documentPreprocessor.setTokenizerFactory((tokenized) ? null : tlp.getTokenizerFactory());
        //Setup the output
        PrintWriter pwo = pwOut;
        if (op.testOptions.writeOutputFiles) {
            String normalizedName = filename;
            try {
                // this will exception if not a URL
                new URL(normalizedName);
                // A URL-style name has its '/' characters replaced by '_'.
                normalizedName = normalizedName.replaceAll("/", "_");
            } catch (MalformedURLException e) {
            //It isn't a URL, so silently ignore
            // The output extension defaults to "stp" when none is configured.
            String ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
            String fname = normalizedName + '.' + ext;
            if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.isEmpty()) {
                String fseparator = System.getProperty("file.separator");
                if (fseparator == null || fseparator.isEmpty()) {
                    fseparator = "/";
                // Keep only the last path component and put it in the output directory.
                File fnameFile = new File(fname);
                fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.getName();
            try {
                // NOTE(review): the next line has unbalanced parentheses and no
                // `new`; part of the expression looks lost - confirm.
                pwo = FileOutputStream(fname));
            } catch (IOException ioe) {
                throw new RuntimeIOException(ioe);
        treePrint.printHeader(pwo, op.tlpParams.getOutputEncoding());
        pwErr.println("Parsing file: " + filename);
        int num = 0;
        int numProcessed = 0;
        // Any thread count other than 1 takes the MulticoreWrapper path.
        if (op.testOptions.testingThreads != 1) {
            MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
            for (List<HasWord> sentence : documentPreprocessor) {
                int len = sentence.size();
                numWords += len;
                pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
                // NOTE(review): no visible line hands `sentence` to the wrapper - confirm.
                while (wrapper.peek()) {
                    ParserQuery pq = wrapper.poll();
                    processResults(pq, numProcessed++, pwo);
            // Second polling loop over the wrapper, positioned after the sentence loop.
            while (wrapper.peek()) {
                ParserQuery pq = wrapper.poll();
                processResults(pq, numProcessed++, pwo);
        } else {
            // testingThreads == 1: one ParserQuery is reused for every sentence.
            ParserQuery pq = pqFactory.parserQuery();
            for (List<HasWord> sentence : documentPreprocessor) {
                int len = sentence.size();
                numWords += len;
                pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
                pq.parseAndReport(sentence, pwErr);
                processResults(pq, numProcessed++, pwo);
        // NOTE(review): the body of this `if` is not visible in this listing.
        if (op.testOptions.writeOutputFiles)
        pwErr.println("Parsed file: " + filename + " [" + num + " sentences].");
    long millis = timer.stop();
    if (summary) {
        if (pcfgLL != null)
            pcfgLL.display(false, pwErr);
        if (depLL != null)
            depLL.display(false, pwErr);
        if (factLL != null)
            factLL.display(false, pwErr);
    // NOTE(review): the body of this `if` is not visible in this listing.
    if (saidMemMessage) {
    // Throughput in words and sentences per second of elapsed time.
    double wordspersec = numWords / (((double) millis) / 1000);
    double sentspersec = numSents / (((double) millis) / 1000);
    // easier way!
    NumberFormat nf = new DecimalFormat("0.00");
    pwErr.println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.format(wordspersec) + " wds/sec; " + nf.format(sentspersec) + " sents/sec).");
    if (numFallback > 0) {
        pwErr.println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
        pwErr.println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0) {
            pwErr.println("    " + numUnparsable + " were not parsable with non-zero probability.");
        if (numNoMemory > 0) {
            pwErr.println("    " + numNoMemory + " were skipped because of insufficient memory.");
        if (numSkipped > 0) {
            pwErr.println("    " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
Also used : HasWord(edu.stanford.nlp.ling.HasWord) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) MalformedURLException(java.net.MalformedURLException) MulticoreWrapper(edu.stanford.nlp.util.concurrent.MulticoreWrapper) DecimalFormat(java.text.DecimalFormat) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) URL(java.net.URL) ParsingThreadsafeProcessor(edu.stanford.nlp.parser.common.ParsingThreadsafeProcessor) FileOutputStream(java.io.FileOutputStream) List(java.util.List) Timing(edu.stanford.nlp.util.Timing) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor) File(java.io.File) DocType(edu.stanford.nlp.process.DocumentPreprocessor.DocType) PrintWriter(java.io.PrintWriter) ParserQuery(edu.stanford.nlp.parser.common.ParserQuery) NumberFormat(java.text.NumberFormat)

Example 5 with HasWord

use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.

the class DependencyParser method predict.

  /**
   * Convenience method for {@link #predict(edu.stanford.nlp.util.CoreMap)}. The tokens of the provided sentence must
   * also have tag annotations (the parser requires part-of-speech tags).
   *
   * @see #predict(edu.stanford.nlp.util.CoreMap)
   */
public GrammaticalStructure predict(List<? extends HasWord> sentence) {
    CoreLabel sentenceLabel = new CoreLabel();
    List<CoreLabel> tokens = new ArrayList<>();
    int i = 1;
    for (HasWord wd : sentence) {
        CoreLabel label;
        if (wd instanceof CoreLabel) {
            label = (CoreLabel) wd;
            if (label.tag() == null)
                throw new IllegalArgumentException("Parser requires words " + "with part-of-speech tag annotations");
        } else {
            label = new CoreLabel();
            if (!(wd instanceof HasTag))
                throw new IllegalArgumentException("Parser requires words " + "with part-of-speech tag annotations");
            label.setTag(((HasTag) wd).tag());
    sentenceLabel.set(CoreAnnotations.TokensAnnotation.class, tokens);
    return predict(sentenceLabel);
Also used : HasWord(edu.stanford.nlp.ling.HasWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) HasTag(edu.stanford.nlp.ling.HasTag)


HasWord (edu.stanford.nlp.ling.HasWord)57 CoreLabel (edu.stanford.nlp.ling.CoreLabel)17 TaggedWord (edu.stanford.nlp.ling.TaggedWord)15 ArrayList (java.util.ArrayList)14 HasTag (edu.stanford.nlp.ling.HasTag)13 Tree (edu.stanford.nlp.trees.Tree)13 DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor)11 StringReader ( Label (edu.stanford.nlp.ling.Label)10 Word (edu.stanford.nlp.ling.Word)10 List (java.util.List)8 BufferedReader ( MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)5 File ( PrintWriter ( ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)4 Pair (edu.stanford.nlp.util.Pair)4 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 HasIndex (edu.stanford.nlp.ling.HasIndex)3 Sentence (edu.stanford.nlp.ling.Sentence)3