Use of edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer in project cogcomp-nlp by CogComp.
From the class TokenizerTextAnnotationBuilder, the method createTextAnnotation:
/**
 * Tokenize the input text (split it into sentences, and into "words" within sentences) and
 * populate a TextAnnotation object. Token character offsets are specified with respect to the
 * original text. The input text should be English and free of HTML and XML tags; non-English
 * characters may cause problems if you use the TextAnnotation as input to other NLP components.
 *
 * @param corpusId a field in TextAnnotation that can be used by the client for book-keeping
 *                 (e.g. to track texts from the same corpus)
 * @param textId   a field in TextAnnotation that can be used by the client for book-keeping
 *                 (e.g. to identify a specific document by some reference string)
 * @param text     the plain English text to process
 * @return a TextAnnotation object with {@link ViewNames#TOKENS} and {@link ViewNames#SENTENCE}
 *         views
 * @throws IllegalArgumentException if the tokenizer has problems with the input text
 */
@Override
public TextAnnotation createTextAnnotation(String corpusId, String textId, String text)
        throws IllegalArgumentException {
    Tokenizer.Tokenization tokenization = tokenizer.tokenizeTextSpan(text);
    TextAnnotation ta = new TextAnnotation(corpusId, textId, text, tokenization.getCharacterOffsets(),
            tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes());
    SpanLabelView view = new SpanLabelView(ViewNames.SENTENCE, NAME, ta, 1.0);
    // Each sentence span runs from the previous sentence's (exclusive) end token index
    // to this sentence's end token index.
    int start = 0;
    for (int s : tokenization.getSentenceEndTokenIndexes()) {
        view.addSpanLabel(start, s, ViewNames.SENTENCE, 1d);
        start = s;
    }
    ta.addView(ViewNames.SENTENCE, view);
    return ta;
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer in project cogcomp-nlp by CogComp.
From the class MascXCESReader, the method loadAnnotationFile:
private static TextAnnotation loadAnnotationFile(String corpusName, String filename, String textId)
        throws Exception {
    final String TOKEN_IDENTIFIER_KEY = "id";
    // Parse the XML file into a DOM tree
    DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
    DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
    Document doc = dBuilder.parse(new File(filename));
    // Merge adjacent text nodes with no tags between them
    doc.getDocumentElement().normalize();
    List<String> tokens = new ArrayList<>();
    Map<String, List<Pair<Integer, String>>> tokenLabels = new HashMap<>();
    for (TokenLabelProcessor processor : TOKEN_LABEL_PROCESSORS) {
        tokenLabels.putIfAbsent(processor.getViewName(), new ArrayList<>());
    }
    int currentTokenId = 0;
    NodeList tokenNodes = doc.getElementsByTagName(TOKEN_ELEMENT);
    for (int i = 0; i < tokenNodes.getLength(); ++i) {
        Element tokenNode = (Element) tokenNodes.item(i);
        String tokenLabel = TOKEN_VALUE_PROCESSOR.apply(tokenNode);
        if (tokenLabel == null) {
            continue;
        }
        tokens.add(tokenLabel);
        for (TokenLabelProcessor processor : TOKEN_LABEL_PROCESSORS) {
            String label = processor.getProcessor().apply(tokenNode);
            if (label != null) {
                tokenLabels.get(processor.getViewName()).add(new Pair<>(currentTokenId, label));
            }
        }
        tokenNode.setUserData(TOKEN_IDENTIFIER_KEY, currentTokenId, null);
        currentTokenId += 1;
    }
    Map<String, List<Pair<IntPair, String>>> spanLabels = new HashMap<>();
    for (SpanLabelProcessor processor : SPAN_LABEL_PROCESSORS) {
        spanLabels.putIfAbsent(processor.getViewName(), new ArrayList<>());
        NodeList spanNodes = doc.getElementsByTagName(processor.getElementName());
        for (int i = 0; i < spanNodes.getLength(); ++i) {
            Element spanNode = (Element) spanNodes.item(i);
            String label = processor.getProcessor().apply(spanNode);
            if (label != null) {
                // A span label covers all the (direct or indirect) child "tok" elements ...
                NodeList coveredTokenNodes = spanNode.getElementsByTagName(TOKEN_ELEMENT);
                List<Integer> coveredTokenIds = new ArrayList<>();
                for (int j = 0; j < coveredTokenNodes.getLength(); ++j) {
                    Object tokenId = coveredTokenNodes.item(j).getUserData(TOKEN_IDENTIFIER_KEY);
                    if (tokenId != null) {
                        coveredTokenIds.add((int) tokenId);
                    }
                }
                // ... and the span runs from the minimum child id to the maximum child id + 1
                if (!coveredTokenIds.isEmpty()) {
                    int beginToken = coveredTokenIds.stream().reduce(Integer::min)
                            .orElseThrow(NoSuchElementException::new);
                    int endToken = coveredTokenIds.stream().reduce(Integer::max)
                            .orElseThrow(NoSuchElementException::new);
                    spanLabels.get(processor.getViewName())
                            .add(new Pair<>(new IntPair(beginToken, endToken + 1), label));
                }
            }
        }
    }
    String rawText = doc.getDocumentElement().getTextContent();
    // Sentence spans drive the oracle tokenization, so pull them out of the generic span labels.
    List<Pair<IntPair, String>> sentencesWithLabel = spanLabels.get(ViewNames.SENTENCE);
    List<IntPair> sentences = sentencesWithLabel.stream().map(Pair::getFirst).collect(Collectors.toList());
    spanLabels.remove(ViewNames.SENTENCE);
    OracleTokenizer tokenizer = new OracleTokenizer();
    Tokenizer.Tokenization tokenization = tokenizer.tokenize(rawText, tokens, sentences);
    TextAnnotation ta = new TextAnnotation(corpusName, textId, rawText, tokenization.getCharacterOffsets(),
            tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes());
    for (Map.Entry<String, List<Pair<Integer, String>>> entry : tokenLabels.entrySet()) {
        createTokenLabelView(entry.getValue().stream(), ta, entry.getKey());
    }
    for (Map.Entry<String, List<Pair<IntPair, String>>> entry : spanLabels.entrySet()) {
        // span labels in the MASC dataset might overlap, so allow overlapping constituents
        createSpanLabelView(entry.getValue().stream(), ta, entry.getKey(), true);
    }
    return ta;
}
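The OracleTokenizer call above recovers character offsets by aligning the gold tokens against the raw text, rather than tokenizing from scratch. Here is a hedged, self-contained sketch of that call shape; only tokenize(rawText, tokens, sentences) and the Tokenization getters are taken from the source, while the OracleTokenizer import path and no-argument constructor are assumptions.

import java.util.Arrays;
import java.util.List;
import edu.illinois.cs.cogcomp.core.datastructures.IntPair;
import edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer;
// Package path assumed; OracleTokenizer lives in the cogcomp-nlp corpus-reader code.
import edu.illinois.cs.cogcomp.nlp.corpusreaders.mascReader.OracleTokenizer;

public class OracleTokenizerDemo {
    public static void main(String[] args) {
        String rawText = "Hello world. Bye.";
        // Gold tokens in document order, as the MASC reader would collect them.
        List<String> tokens = Arrays.asList("Hello", "world", ".", "Bye", ".");
        // Sentence spans over token indices: tokens [0, 3) and [3, 5).
        List<IntPair> sentences = Arrays.asList(new IntPair(0, 3), new IntPair(3, 5));
        OracleTokenizer tokenizer = new OracleTokenizer();
        Tokenizer.Tokenization tokenization = tokenizer.tokenize(rawText, tokens, sentences);
        // Character offsets recovered for each token, e.g. (0, 5) for "Hello".
        System.out.println(Arrays.toString(tokenization.getCharacterOffsets()));
    }
}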