Use of edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer in project cogcomp-nlp by CogComp.
From the class TokenizerTextAnnotationBuilder, the method createTextAnnotation:
/**
 * Tokenize the input text (split it into sentences, and into "words" within sentences) and
 * populate a TextAnnotation object. Token character offsets are specified with respect to the
 * original text. The input text should be English and free of HTML and XML tags; non-English
 * characters may cause problems if you use the TextAnnotation as input to other NLP components.
 *
 * @param corpusId a field in TextAnnotation that can be used by the client for book-keeping
 *                 (e.g. to track texts from the same corpus)
 * @param textId   a field in TextAnnotation that can be used by the client for book-keeping
 *                 (e.g. to identify a specific document by some reference string)
 * @param text     the plain English text to process
 * @return a TextAnnotation object with {@link ViewNames#TOKENS} and {@link ViewNames#SENTENCE}
 *         views
 * @throws IllegalArgumentException if the tokenizer has problems with the input text
 */
@Override
public TextAnnotation createTextAnnotation(String corpusId, String textId, String text)
        throws IllegalArgumentException {
    Tokenizer.Tokenization tokenization = tokenizer.tokenizeTextSpan(text);
    TextAnnotation ta = new TextAnnotation(corpusId, textId, text, tokenization.getCharacterOffsets(),
            tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes());
    SpanLabelView view = new SpanLabelView(ViewNames.SENTENCE, NAME, ta, 1.0);
    // Each sentence span runs from the previous sentence's (exclusive) end token index
    // to this sentence's end token index.
    int start = 0;
    for (int s : tokenization.getSentenceEndTokenIndexes()) {
        view.addSpanLabel(start, s, ViewNames.SENTENCE, 1d);
        start = s;
    }
    ta.addView(ViewNames.SENTENCE, view);
    return ta;
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer in project cogcomp-nlp by CogComp.
From the class MascXCESReader, the method loadAnnotationFile:
private static TextAnnotation loadAnnotationFile(String corpusName, String filename, String textId)
        throws Exception {
    final String TOKEN_IDENTIFIER_KEY = "id";
    // Parse the XML file into a DOM tree
    DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
    DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
    Document doc = dBuilder.parse(new File(filename));
    // Merge adjacent text nodes with no tags between them
    doc.getDocumentElement().normalize();
    List<String> tokens = new ArrayList<>();
    Map<String, List<Pair<Integer, String>>> tokenLabels = new HashMap<>();
    for (TokenLabelProcessor processor : TOKEN_LABEL_PROCESSORS) {
        tokenLabels.putIfAbsent(processor.getViewName(), new ArrayList<>());
    }
    int currentTokenId = 0;
    NodeList tokenNodes = doc.getElementsByTagName(TOKEN_ELEMENT);
    for (int i = 0; i < tokenNodes.getLength(); ++i) {
        Element tokenNode = (Element) tokenNodes.item(i);
        String tokenLabel = TOKEN_VALUE_PROCESSOR.apply(tokenNode);
        if (tokenLabel == null) {
            continue;
        }
        tokens.add(tokenLabel);
        for (TokenLabelProcessor processor : TOKEN_LABEL_PROCESSORS) {
            String label = processor.getProcessor().apply(tokenNode);
            if (label != null) {
                tokenLabels.get(processor.getViewName()).add(new Pair<>(currentTokenId, label));
            }
        }
        tokenNode.setUserData(TOKEN_IDENTIFIER_KEY, currentTokenId, null);
        currentTokenId += 1;
    }
    Map<String, List<Pair<IntPair, String>>> spanLabels = new HashMap<>();
    for (SpanLabelProcessor processor : SPAN_LABEL_PROCESSORS) {
        spanLabels.putIfAbsent(processor.getViewName(), new ArrayList<>());
        NodeList spanNodes = doc.getElementsByTagName(processor.getElementName());
        for (int i = 0; i < spanNodes.getLength(); ++i) {
            Element spanNode = (Element) spanNodes.item(i);
            String label = processor.getProcessor().apply(spanNode);
            if (label != null) {
                // A span label covers all the (direct or indirect) child "tok" elements ...
                NodeList coveredTokenNodes = spanNode.getElementsByTagName(TOKEN_ELEMENT);
                List<Integer> coveredTokenIds = new ArrayList<>();
                for (int j = 0; j < coveredTokenNodes.getLength(); ++j) {
                    Object tokenId = coveredTokenNodes.item(j).getUserData(TOKEN_IDENTIFIER_KEY);
                    if (tokenId != null) {
                        coveredTokenIds.add((int) tokenId);
                    }
                }
                // ... and the span runs from the minimum child id to the maximum child id + 1
                if (!coveredTokenIds.isEmpty()) {
                    int beginToken = coveredTokenIds.stream().reduce(Integer::min)
                            .orElseThrow(NoSuchElementException::new);
                    int endToken = coveredTokenIds.stream().reduce(Integer::max)
                            .orElseThrow(NoSuchElementException::new);
                    spanLabels.get(processor.getViewName())
                            .add(new Pair<>(new IntPair(beginToken, endToken + 1), label));
                }
            }
        }
    }
    String rawText = doc.getDocumentElement().getTextContent();
    // Sentence spans drive the oracle tokenization, so pull them out of the generic span labels.
    List<Pair<IntPair, String>> sentencesWithLabel = spanLabels.get(ViewNames.SENTENCE);
    List<IntPair> sentences = sentencesWithLabel.stream().map(Pair::getFirst).collect(Collectors.toList());
    spanLabels.remove(ViewNames.SENTENCE);
    OracleTokenizer tokenizer = new OracleTokenizer();
    Tokenizer.Tokenization tokenization = tokenizer.tokenize(rawText, tokens, sentences);
    TextAnnotation ta = new TextAnnotation(corpusName, textId, rawText, tokenization.getCharacterOffsets(),
            tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes());
    for (Map.Entry<String, List<Pair<Integer, String>>> entry : tokenLabels.entrySet()) {
        createTokenLabelView(entry.getValue().stream(), ta, entry.getKey());
    }
    for (Map.Entry<String, List<Pair<IntPair, String>>> entry : spanLabels.entrySet()) {
        // span labels in the MASC dataset might overlap, so allow overlapping constituents
        createSpanLabelView(entry.getValue().stream(), ta, entry.getKey(), true);
    }
    return ta;
}
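The OracleTokenizer call above recovers character offsets by aligning the gold tokens against the raw text, rather than tokenizing from scratch. Here is a hedged, self-contained sketch of that call shape; only tokenize(rawText, tokens, sentences) and the Tokenization getters are taken from the source, while the OracleTokenizer import path and no-argument constructor are assumptions.

import java.util.Arrays;
import java.util.List;
import edu.illinois.cs.cogcomp.core.datastructures.IntPair;
import edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer;
// Package path assumed; OracleTokenizer lives in the cogcomp-nlp corpus-reader code.
import edu.illinois.cs.cogcomp.nlp.corpusreaders.mascReader.OracleTokenizer;

public class OracleTokenizerDemo {
    public static void main(String[] args) {
        String rawText = "Hello world. Bye.";
        // Gold tokens in document order, as the MASC reader would collect them.
        List<String> tokens = Arrays.asList("Hello", "world", ".", "Bye", ".");
        // Sentence spans over token indices: tokens [0, 3) and [3, 5).
        List<IntPair> sentences = Arrays.asList(new IntPair(0, 3), new IntPair(3, 5));
        OracleTokenizer tokenizer = new OracleTokenizer();
        Tokenizer.Tokenization tokenization = tokenizer.tokenize(rawText, tokens, sentences);
        // Character offsets recovered for each token, e.g. (0, 5) for "Hello".
        System.out.println(Arrays.toString(tokenization.getCharacterOffsets()));
    }
}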