Search in sources:

Example 1 with SentenceSplitter

Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.

From the class PlainTextReader, method parseTextRaw (the variant that reads the static ParametersForLbjCode.currentParameters):

/**
 * This method will normalize and parse the raw text, returning a representation of sentences,
 * where each sentence is a primitive array of words as strings. This representation is more
 * compatible with the new core data structures, which no longer take vectors.
 *
 * @param text the text to parse.
 * @return a list of sentences, each represented as an array of words.
 */
public static List<String[]> parseTextRaw(String text) {
    text = normalizeText(text);
    // sentences split by newlines. will keep just this split if splitting on newlines is used...
    ArrayList<String> sentences1 = new ArrayList<>();
    if (ParametersForLbjCode.currentParameters.forceNewSentenceOnLineBreaks || ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) {
        StringTokenizer st = new StringTokenizer(text, "\n");
        while (st.hasMoreTokens()) sentences1.add(st.nextToken());
    } else
        sentences1.add(text);
    // we add Lbj sentence splitting on top.
    ArrayList<String> sentences2 = new ArrayList<>();
    if (!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) {
        for (String aSentences1 : sentences1) {
            SentenceSplitter parser = new SentenceSplitter(new String[] { aSentences1 });
            Sentence s = (Sentence) parser.next();
            while (s != null) {
                sentences2.add(s.text);
                s = (Sentence) parser.next();
            }
        }
    } else
        sentences2 = sentences1;
    ArrayList<String[]> res = new ArrayList<>();
    // tokenizing
    for (String sentenceText : sentences2) {
        if (sentenceText.length() > 0) {
            // this is just a formatting issue that can happen with the LBJ sentence splitter
            if (sentenceText.charAt(sentenceText.length() - 1) == '.' && !ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting)
                sentenceText = sentenceText.substring(0, sentenceText.length() - 1) + " . ";
            // now tokenizing for real...
            String[] sentence = sentenceText.split("[ \\n\\t]");
            if (sentence.length > 0) {
                // fixing a bug in LBJ sentence splitter if needed
                if ((!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) && sentence.length == 1 && res.size() > 0 && (sentence[0].equals("\"") || sentence[0].equals("''") || sentence[0].equals("'"))) {
                    int where = res.size() - 1;
                    String[] tmp = res.remove(where);
                    if (tmp == null) {
                        tmp = new String[0];
                    }
                    int len = tmp.length;
                    String[] newtmp = new String[len + 1];
                    System.arraycopy(tmp, 0, newtmp, 0, len);
                    newtmp[len] = sentence[0];
                    res.add(newtmp);
                } else
                    res.add(sentence);
            }
        }
    }
    return res;
}
Also used : StringTokenizer(java.util.StringTokenizer) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) ArrayList(java.util.ArrayList) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)
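
A minimal usage sketch for this variant (illustrative only, not part of the project): it assumes ParametersForLbjCode.currentParameters has been configured by the surrounding pipeline, since the method reads its splitting flags from that static field, and imports for PlainTextReader and ParametersForLbjCode are omitted because their packages are not shown in this listing.

import java.util.List;

public class ParseTextRawDemo {
    public static void main(String[] args) {
        // Assumes ParametersForLbjCode.currentParameters was set up elsewhere before this call.
        String text = "John lives in Chicago.\nHe works at the university.";
        List<String[]> sentences = PlainTextReader.parseTextRaw(text);
        // Each sentence comes back as a plain array of word strings.
        for (String[] sentence : sentences)
            System.out.println(String.join(" ", sentence));
    }
}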

Example 2 with SentenceSplitter

Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.

From the class StatefullTokenizerTest, method testWhitespaceBehavior:

/**
 * Test Splitter behavior on text with leading/trailing whitespace. The example covers a use
 * case where XML markup has been replaced with whitespace of equal span.
 */
@Test
public void testWhitespaceBehavior() {
    String origText = null;
    try {
        origText = LineIO.slurp(INFILE);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    Pattern xmlTagPattern = Pattern.compile("(<[^>\\r\\n]+>)");
    Matcher xmlMatcher = xmlTagPattern.matcher(origText);
    StringBuilder cleanTextBldr = new StringBuilder();
    int lastAppendedCharOffset = 0;
    while (xmlMatcher.find()) {
        int start = xmlMatcher.start();
        int end = xmlMatcher.end();
        cleanTextBldr.append(origText.substring(lastAppendedCharOffset, start));
        for (int i = start; i < end; ++i) cleanTextBldr.append(" ");
        lastAppendedCharOffset = end;
    }
    cleanTextBldr.append(origText.substring(lastAppendedCharOffset));
    String cleanText = cleanTextBldr.toString();
    // count whitespace chars in string
    // check token offsets in tokens returned by SentenceSplitter
    Pattern sun = Pattern.compile("\\w*Sun\\w*");
    Matcher sunMatcher = sun.matcher(cleanText);
    Set<IntPair> sunSpans = new HashSet<>();
    while (sunMatcher.find()) sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
    SentenceSplitter splitter = new SentenceSplitter(new String[] { cleanText });
    Sentence[] sents = splitter.splitAll();
    Sentence s = sents[0];
    LinkedVector words = s.wordSplit();
    for (int i = 0; i < words.size(); ++i) {
        Word word = (Word) words.get(i);
        if ("Sun".equals(word.form)) {
            IntPair tokenCharOffsets = new IntPair(word.start, word.end);
            assertTrue(sunSpans.contains(tokenCharOffsets));
        }
    }
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    Tokenizer.Tokenization tokenInfo = statefulTokenizer.tokenizeTextSpan(cleanText);
    assertEquals(tokenInfo.getCharacterOffsets().length, tokenInfo.getTokens().length);
    for (int i = 0; i < tokenInfo.getTokens().length; ++i) {
        String tok = tokenInfo.getTokens()[i];
        if (tok.equals("Sun")) {
            IntPair tokCharOffsets = tokenInfo.getCharacterOffsets()[i];
            if (!sunSpans.contains(tokCharOffsets)) {
                String origTextSubstring = cleanText.substring(tokCharOffsets.getFirst(), tokCharOffsets.getSecond());
                System.err.println("ERROR: tokenizer has form '" + tok + "', but offsets refer to substring '" + origTextSubstring + "'.");
            }
            assertTrue(sunSpans.contains(tokCharOffsets));
        }
    }
    TextAnnotation statefulTa = new TextAnnotation("test", "test", cleanText, tokenInfo.getCharacterOffsets(), tokenInfo.getTokens(), tokenInfo.getSentenceEndTokenIndexes());
    assertNotNull(statefulTa);
}
Also used : Pattern(java.util.regex.Pattern) Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Matcher(java.util.regex.Matcher) FileNotFoundException(java.io.FileNotFoundException) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence) HashSet(java.util.HashSet) Test(org.junit.Test)
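
The test above relies on SentenceSplitter reporting character offsets through Word.start and Word.end after wordSplit(). A stripped-down sketch of that pattern, checking only the first sentence just as the test does (offsets of later sentences are not verified here):

import edu.illinois.cs.cogcomp.lbjava.nlp.Sentence;
import edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter;
import edu.illinois.cs.cogcomp.lbjava.nlp.Word;
import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector;

public class SplitterOffsetDemo {
    public static void main(String[] args) {
        String text = "The Sun is a star. It rises in the east.";
        // splitAll() returns every sentence detected in the supplied lines.
        SentenceSplitter splitter = new SentenceSplitter(new String[] { text });
        Sentence first = splitter.splitAll()[0];
        // wordSplit() yields Word nodes carrying the surface form plus start/end character offsets.
        LinkedVector words = first.wordSplit();
        for (int i = 0; i < words.size(); ++i) {
            Word w = (Word) words.get(i);
            System.out.println(w.form + " [" + w.start + ", " + w.end + "]");
        }
    }
}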

Example 3 with SentenceSplitter

Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.

From the class PlainTextReader, method parseTextRaw (the overload taking a ParametersForLbjCode argument):

/**
 * This method will normalize and parse the raw text, returning a representation of sentences,
 * where each sentence is a primitive array of words as strings. This representation is more
 * compatible with the new core data structures, which no longer take vectors.
 *
 * @param text the text to parse.
 * @param cp the ParametersForLbjCode configuration whose splitting flags control the behavior.
 * @return a list of sentences, each represented as an array of words.
 */
public static List<String[]> parseTextRaw(String text, ParametersForLbjCode cp) {
    text = normalizeText(text, cp);
    // sentences split by newlines. will keep just this split if splitting on newlines is used...
    ArrayList<String> sentences1 = new ArrayList<>();
    if (cp.forceNewSentenceOnLineBreaks || cp.keepOriginalFileTokenizationAndSentenceSplitting) {
        StringTokenizer st = new StringTokenizer(text, "\n");
        while (st.hasMoreTokens()) sentences1.add(st.nextToken());
    } else
        sentences1.add(text);
    // we add Lbj sentence splitting on top.
    ArrayList<String> sentences2 = new ArrayList<>();
    if (!cp.keepOriginalFileTokenizationAndSentenceSplitting) {
        for (String aSentences1 : sentences1) {
            SentenceSplitter parser = new SentenceSplitter(new String[] { aSentences1 });
            Sentence s = (Sentence) parser.next();
            while (s != null) {
                sentences2.add(s.text);
                s = (Sentence) parser.next();
            }
        }
    } else
        sentences2 = sentences1;
    ArrayList<String[]> res = new ArrayList<>();
    // tokenizing
    for (String sentenceText : sentences2) {
        if (sentenceText.length() > 0) {
            // this is just a formatting issue that can happen with the LBJ sentence splitter
            if (sentenceText.charAt(sentenceText.length() - 1) == '.' && !cp.keepOriginalFileTokenizationAndSentenceSplitting)
                sentenceText = sentenceText.substring(0, sentenceText.length() - 1) + " . ";
            // now tokenizing for real...
            String[] sentence = sentenceText.split("[ \\n\\t]");
            if (sentence.length > 0) {
                // fixing a bug in LBJ sentence splitter if needed
                if ((!cp.keepOriginalFileTokenizationAndSentenceSplitting) && sentence.length == 1 && res.size() > 0 && (sentence[0].equals("\"") || sentence[0].equals("''") || sentence[0].equals("'"))) {
                    int where = res.size() - 1;
                    String[] tmp = res.remove(where);
                    if (tmp == null) {
                        tmp = new String[0];
                    }
                    int len = tmp.length;
                    String[] newtmp = new String[len + 1];
                    System.arraycopy(tmp, 0, newtmp, 0, len);
                    newtmp[len] = sentence[0];
                    res.add(newtmp);
                } else
                    res.add(sentence);
            }
        }
    }
    return res;
}
Also used : StringTokenizer(java.util.StringTokenizer) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) ArrayList(java.util.ArrayList) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)
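
A usage sketch for this overload. The two flags are public fields, as the method body shows; the no-argument constructor of ParametersForLbjCode is an assumption not confirmed by this listing, and the project-specific imports are again omitted.

import java.util.List;

public class ParameterizedParseDemo {
    public static void main(String[] args) {
        // Assumed no-arg constructor; the flag names come from the method body above.
        ParametersForLbjCode params = new ParametersForLbjCode();
        params.forceNewSentenceOnLineBreaks = true;                      // also split on line breaks
        params.keepOriginalFileTokenizationAndSentenceSplitting = false; // let the LBJ splitter run

        List<String[]> sentences = PlainTextReader.parseTextRaw("First line.\nSecond line.", params);
        System.out.println(sentences.size() + " sentences"); // expected: 2 with these flags
    }
}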

Example 4 with SentenceSplitter

Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.

From the class PlainTextReader, method sentenceSplitAndTokenizeText (the overload taking a ParametersForLbjCode argument):

public static Vector<Vector<String>> sentenceSplitAndTokenizeText(String text, ParametersForLbjCode cp) {
    text = normalizeText(text, cp);
    // sentences split by newlines. will keep just this split if splitting on newlines is used...
    Vector<String> sentences1 = new Vector<>();
    if (cp.forceNewSentenceOnLineBreaks || cp.keepOriginalFileTokenizationAndSentenceSplitting) {
        StringTokenizer st = new StringTokenizer(text, "\n");
        while (st.hasMoreTokens()) sentences1.addElement(st.nextToken());
    } else
        sentences1.addElement(text);
    // we add Lbj sentence splitting on top.
    Vector<String> sentences2 = new Vector<>();
    if (!cp.keepOriginalFileTokenizationAndSentenceSplitting) {
        for (int i = 0; i < sentences1.size(); i++) {
            SentenceSplitter parser = new SentenceSplitter(new String[] { sentences1.elementAt(i) });
            Sentence s = (Sentence) parser.next();
            while (s != null) {
                sentences2.addElement(s.text);
                s = (Sentence) parser.next();
            }
        }
    } else
        sentences2 = sentences1;
    Vector<Vector<String>> res = new Vector<>();
    // tokenizing
    for (int i = 0; i < sentences2.size(); i++) {
        String sentenceText = sentences2.elementAt(i);
        if (sentenceText.length() > 0) {
            // this is just a formatting issue that can happen with the LBJ sentence splitter
            if (sentenceText.charAt(sentenceText.length() - 1) == '.' && !cp.keepOriginalFileTokenizationAndSentenceSplitting)
                sentenceText = sentenceText.substring(0, sentenceText.length() - 1) + " . ";
            // now tokenizing for real...
            StringTokenizer st = new StringTokenizer(sentenceText, " \n\t");
            Vector<String> sentence = new Vector<>();
            while (st.hasMoreTokens()) sentence.addElement(st.nextToken());
            if (sentence.size() > 0) {
                // fixing a bug in LBJ sentence splitter if needed
                if ((!cp.keepOriginalFileTokenizationAndSentenceSplitting) && sentence.size() == 1 && res.size() > 0 && (sentence.elementAt(0).equals("\"") || sentence.elementAt(0).equals("''") || sentence.elementAt(0).equals("'")))
                    res.elementAt(res.size() - 1).add(sentence.elementAt(0));
                else
                    res.addElement(sentence);
            }
        }
    }
    return res;
}
Also used : StringTokenizer(java.util.StringTokenizer) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) Vector(java.util.Vector) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)
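
Since the Javadoc on parseTextRaw notes that the newer core data structures no longer take vectors, here is a small bridging sketch that converts this method's legacy Vector-of-Vectors result into the String[] representation returned by parseTextRaw (the helper name is invented for illustration):

import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

public class VectorBridge {
    // Convert the legacy Vector<Vector<String>> result of sentenceSplitAndTokenizeText
    // into the List<String[]> shape produced by parseTextRaw.
    public static List<String[]> toArrays(Vector<Vector<String>> sentences) {
        List<String[]> converted = new ArrayList<>();
        for (Vector<String> sentence : sentences)
            converted.add(sentence.toArray(new String[0]));
        return converted;
    }
}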

Example 5 with SentenceSplitter

Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.

From the class PlainTextReader, method sentenceSplitAndTokenizeText (the variant that reads the static ParametersForLbjCode.currentParameters):

public static Vector<Vector<String>> sentenceSplitAndTokenizeText(String text) {
    text = normalizeText(text);
    // sentences split by newlines. will keep just this split if splitting on newlines is used...
    Vector<String> sentences1 = new Vector<>();
    if (ParametersForLbjCode.currentParameters.forceNewSentenceOnLineBreaks || ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) {
        StringTokenizer st = new StringTokenizer(text, "\n");
        while (st.hasMoreTokens()) sentences1.addElement(st.nextToken());
    } else
        sentences1.addElement(text);
    // we add Lbj sentence splitting on top.
    Vector<String> sentences2 = new Vector<>();
    if (!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) {
        for (int i = 0; i < sentences1.size(); i++) {
            SentenceSplitter parser = new SentenceSplitter(new String[] { sentences1.elementAt(i) });
            Sentence s = (Sentence) parser.next();
            while (s != null) {
                sentences2.addElement(s.text);
                s = (Sentence) parser.next();
            }
        }
    } else
        sentences2 = sentences1;
    Vector<Vector<String>> res = new Vector<>();
    // tokenizing
    for (int i = 0; i < sentences2.size(); i++) {
        String sentenceText = sentences2.elementAt(i);
        if (sentenceText.length() > 0) {
            // this is just a formatting issue that can happen with the LBJ sentence splitter
            if (sentenceText.charAt(sentenceText.length() - 1) == '.' && !ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting)
                sentenceText = sentenceText.substring(0, sentenceText.length() - 1) + " . ";
            // now tokenizing for real...
            StringTokenizer st = new StringTokenizer(sentenceText, " \n\t");
            Vector<String> sentence = new Vector<>();
            while (st.hasMoreTokens()) sentence.addElement(st.nextToken());
            if (sentence.size() > 0) {
                // fixing a bug in LBJ sentence splitter if needed
                if ((!ParametersForLbjCode.currentParameters.keepOriginalFileTokenizationAndSentenceSplitting) && sentence.size() == 1 && res.size() > 0 && (sentence.elementAt(0).equals("\"") || sentence.elementAt(0).equals("''") || sentence.elementAt(0).equals("'")))
                    res.elementAt(res.size() - 1).add(sentence.elementAt(0));
                else
                    res.addElement(sentence);
            }
        }
    }
    return res;
}
Also used : StringTokenizer(java.util.StringTokenizer) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) Vector(java.util.Vector) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)
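
An illustration of the "fixing a bug in LBJ sentence splitter" branch above: when the splitter emits a lone closing quote as a one-token sentence, that token is appended to the previous sentence instead of starting a new one. Whether the splitter actually produces such a fragment depends on the LBJ version in use, so the output is not guaranteed; the call also assumes ParametersForLbjCode.currentParameters has been configured.

import java.util.Vector;

public class QuoteMergeDemo {
    public static void main(String[] args) {
        // A closing quote after the sentence-final period is the case the merge branch guards against.
        String text = "He said \"wait here.\"";
        Vector<Vector<String>> sentences = PlainTextReader.sentenceSplitAndTokenizeText(text);
        // With the merge in place, no sentence should consist of just a single quote token.
        for (Vector<String> sentence : sentences)
            System.out.println(sentence);
    }
}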

Aggregations

SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter): 12
Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence): 7
Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word): 5
WordSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.WordSplitter): 5
LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector): 5
Parser (edu.illinois.cs.cogcomp.lbjava.parse.Parser): 5
StringTokenizer (java.util.StringTokenizer): 4
IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 3
PlainToTokenParser (edu.illinois.cs.cogcomp.lbjava.nlp.seg.PlainToTokenParser): 3
ArrayList (java.util.ArrayList): 3
Vector (java.util.Vector): 3
Test (org.junit.Test): 3
Chunker (edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker): 2
Token (edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token): 2
SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView): 1
TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 1
Classifier (edu.illinois.cs.cogcomp.lbjava.classify.Classifier): 1
POSTagger (edu.illinois.cs.cogcomp.pos.lbjava.POSTagger): 1
FileNotFoundException (java.io.FileNotFoundException): 1
HashSet (java.util.HashSet): 1