use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.
the class ACE_NW_Reader method parse.
/**
 * Parses a document with {@code <DOCID>}, {@code <DATETIME>}, {@code <HEADLINE>} and
 * {@code <TEXT>} elements into metadata and text paragraphs.
 *
 * <p>For each metadata element the last match wins; a missing element yields an empty string.
 * Every {@code <TEXT>} element becomes one paragraph labelled {@code "text"} whose
 * {@code offset} is the position of its trimmed content inside {@code content}, and whose
 * {@code offsetFilterTags} is the position of the same content inside
 * {@code contentRemovingTags} ({@code -1} when it cannot be found there).
 *
 * <p>NOTE(review): the patterns are compiled without {@code DOTALL}, so an element is only
 * matched when it sits on a single line; presumably callers flatten line breaks first. Confirm
 * against the caller.
 *
 * @param content the raw document, including tags
 * @param contentRemovingTags the same document with the tags removed
 * @return a pair of (labelled paragraphs in document order, metadata map)
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    String docID = "";
    Matcher matcher = Pattern.compile("<DOCID>(.*?)</DOCID>").matcher(content);
    while (matcher.find()) {
        docID = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    String dateTime = "";
    matcher = Pattern.compile("<DATETIME>(.*?)</DATETIME>").matcher(content);
    while (matcher.find()) {
        dateTime = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    String headLine = "";
    matcher = Pattern.compile("<HEADLINE>(.*?)</HEADLINE>").matcher(content);
    while (matcher.find()) {
        headLine = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.HeadLine, headLine);

    matcher = Pattern.compile("<TEXT>(.*?)</TEXT>").matcher(content);
    while (matcher.find()) {
        String raw = matcher.group(1);
        String text = raw.trim();
        // Use the match position (plus the leading whitespace removed by trim()) instead of
        // content.indexOf(text), which would return the first occurrence of the text anywhere
        // in the document rather than the occurrence that was actually matched.
        int offset = matcher.start(1) + raw.indexOf(text);
        paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
    }

    // Locate each paragraph in the tag-free content. The search resumes right after the
    // previous hit so that a repeated paragraph is not mapped onto an earlier occurrence.
    int searchFrom = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        int offsetWithFiltering = contentRemovingTags.indexOf(paragraph.content, searchFrom);
        paragraph.offsetFilterTags = offsetWithFiltering;
        if (offsetWithFiltering >= 0) {
            searchFrom = offsetWithFiltering + paragraph.content.length();
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + paragraph.content.length()));
            // A negative offset means the paragraph was not found in the filtered content;
            // calling substring with it would throw.
            if (paragraph.offsetFilterTags < 0) {
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + paragraph.content.length()));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}
use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.
the class ACE_UN_Reader method parse.
/**
 * Parses a document made of {@code <POST>} elements into metadata and labelled paragraphs.
 *
 * <p>Metadata is read from {@code <DOCID>}, {@code <DATETIME>} and {@code <HEADLINE>} (last
 * match wins, empty string when absent). For every {@code <POST>} the following paragraphs are
 * appended, in this order: {@code "postSubject"}, {@code "poster"}, {@code "postDate"},
 * {@code "postQuote"} (the value of each {@code <QUOTE PREVIOUSPOST="..."/>}), and then the
 * {@code "text"} segments found between the subject, the quotes and the end of the post.
 *
 * <p>Each paragraph's {@code offset} is the position of its trimmed content inside
 * {@code content}; {@code offsetFilterTags} is set to its position inside
 * {@code contentRemovingTags} when it can be found there and is left untouched otherwise.
 *
 * <p>NOTE(review): the patterns are compiled without {@code DOTALL}, so an element is only
 * matched when it sits on a single line; presumably callers flatten line breaks first. Confirm
 * against the caller.
 *
 * @param content the raw document, including tags
 * @param contentRemovingTags the same document with the tags removed
 * @return a pair of (labelled paragraphs, metadata map)
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    metadata.put(DocumentMetadata.DocumentID, lastTrimmedGroup("<DOCID>(.*?)</DOCID>", content));
    metadata.put(DocumentMetadata.DocumentCreationTime, lastTrimmedGroup("<DATETIME>(.*?)</DATETIME>", content));
    metadata.put(DocumentMetadata.HeadLine, lastTrimmedGroup("<HEADLINE>(.*?)</HEADLINE>", content));

    // Compiled once here instead of once per post.
    Pattern subjectPattern = Pattern.compile("<SUBJECT>(.*?)</SUBJECT>");
    Pattern posterPattern = Pattern.compile("<POSTER>(.*?)</POSTER>");
    Pattern postDatePattern = Pattern.compile("<POSTDATE>(.*?)</POSTDATE>");
    Pattern quotePattern = Pattern.compile("<QUOTE PREVIOUSPOST=\"(.*?)\"/>");
    Pattern subjectToQuotePattern = Pattern.compile("</SUBJECT>(.*?)<QUOTE PREVIOUSPOST=");
    Pattern quoteToEndPattern = Pattern.compile("\"/>(.*?)</POST>");
    Pattern quoteToQuotePattern = Pattern.compile("\"/>(.*?)<QUOTE PREVIOUSPOST=");
    Pattern subjectToEndPattern = Pattern.compile("</SUBJECT>(.*?)</POST>");

    Matcher postMatcher = Pattern.compile("<POST>(.*?)</POST>").matcher(content);
    while (postMatcher.find()) {
        String text = postMatcher.group(1).trim();
        // Position of the trimmed post body inside content. Taken from the match itself:
        // content.indexOf(text) would pick the first textual occurrence, which is wrong when
        // the same text appears earlier in the document.
        int postOffset = trimmedGroupStart(postMatcher);

        addTrimmedMatches(paragraphs, "postSubject", subjectPattern.matcher(text), postOffset);
        addTrimmedMatches(paragraphs, "poster", posterPattern.matcher(text), postOffset);
        addTrimmedMatches(paragraphs, "postDate", postDatePattern.matcher(text), postOffset);
        addTrimmedMatches(paragraphs, "postQuote", quotePattern.matcher(text), postOffset);

        if (text.contains("<QUOTE PREVIOUSPOST=")) {
            // Text between the subject and the first quote.
            Matcher segment = subjectToQuotePattern.matcher(text);
            while (segment.find()) {
                String newText = segment.group(1).trim();
                if (newText.equals("")) {
                    continue;
                }
                int offset = postOffset + trimmedGroupStart(segment);
                // The lazy match starts at the first </SUBJECT>; if another one lies inside
                // the captured span, keep only what follows it and shift the offset to match.
                int innerClose = newText.indexOf("</SUBJECT>");
                if (innerClose >= 0) {
                    String tail = newText.substring(innerClose + "</SUBJECT>".length());
                    String trimmedTail = tail.trim();
                    offset += innerClose + "</SUBJECT>".length() + tail.indexOf(trimmedTail);
                    newText = trimmedTail;
                }
                paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, newText)));
            }

            // Text between a quote and the end of the post. The closing tag was consumed by
            // the outer match, so it is re-appended to give the pattern something to anchor
            // on; positions inside text are unaffected by the suffix.
            segment = quoteToEndPattern.matcher(text + "</POST>");
            while (segment.find()) {
                String newText = segment.group(1).trim();
                if (newText.equals("") || newText.contains("<QUOTE PREVIOUSPOST=")) {
                    continue;
                }
                paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(postOffset + trimmedGroupStart(segment), newText)));
            }

            // Text between two consecutive quotes.
            segment = quoteToQuotePattern.matcher(text);
            while (segment.find()) {
                String newText = segment.group(1).trim();
                if (newText.equals("")) {
                    continue;
                }
                paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(postOffset + trimmedGroupStart(segment), newText)));
            }
        } else {
            // No quotes: everything after the subject is the post text (added even if empty).
            addTrimmedMatches(paragraphs, "text", subjectToEndPattern.matcher(text + "</POST>"), postOffset);
        }
    }

    // Locate each paragraph in the tag-free content.
    // NOTE(review): the search cursor only advances after "poster" paragraphs, and by the
    // paragraph length rather than to the end of the hit. Kept as-is because paragraphs are
    // not stored in document order; confirm the intended heuristic.
    int index = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        int offsetWithFiltering = contentRemovingTags.indexOf(paragraph.content, index);
        if (offsetWithFiltering == -1) {
            continue;
        }
        paragraph.offsetFilterTags = offsetWithFiltering;
        if (entry.getFirst().equals("poster")) {
            index += paragraph.content.length();
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + paragraph.content.length()));
            if (paragraph.offsetFilterTags == -1) {
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + paragraph.content.length()));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}

/** Returns the trimmed first group of the last match of {@code regex} in {@code input}, or "" if none. */
private static String lastTrimmedGroup(String regex, String input) {
    String value = "";
    Matcher matcher = Pattern.compile(regex).matcher(input);
    while (matcher.find()) {
        value = matcher.group(1).trim();
    }
    return value;
}

/** Returns where the trimmed content of group 1 of the current match starts in the matched input. */
private static int trimmedGroupStart(Matcher matcher) {
    String raw = matcher.group(1);
    // trim() only strips from the ends, so the first occurrence of the trimmed text inside
    // the raw group is exactly the number of leading characters that were stripped.
    return matcher.start(1) + raw.indexOf(raw.trim());
}

/** Appends one paragraph labelled {@code label} per match of {@code matcher}, offset by {@code baseOffset}. */
private static void addTrimmedMatches(List<Pair<String, Paragraph>> paragraphs, String label, Matcher matcher, int baseOffset) {
    while (matcher.find()) {
        String value = matcher.group(1).trim();
        paragraphs.add(new Pair<String, Paragraph>(label, new Paragraph(baseOffset + trimmedGroupStart(matcher), value)));
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.
the class DepAnnotator method addView.
/**
 * Adds a {@code ViewNames.DEPENDENCY} tree view to the given text annotation.
 *
 * <p>The structure returned by the model's inference solver is converted into a tree rooted
 * at the node reported by {@code findRoot}, and stored as the dependency tree at index 0 of
 * the new view.
 *
 * @param ta the text annotation to annotate; it must already contain every required view
 * @throws AnnotatorException if a required view is missing, or if inference fails (the
 *         underlying exception is attached as the cause)
 */
@Override
public void addView(TextAnnotation ta) throws AnnotatorException {
    for (String reqView : requiredViews) {
        if (!ta.hasView(reqView)) {
            throw new AnnotatorException("TextAnnotation must have view: " + reqView);
        }
    }
    DepInst sent = new DepInst(ta);
    DepStruct deptree;
    try {
        deptree = (DepStruct) model.infSolver.getBestStructure(model.wv, sent);
    } catch (Exception e) {
        // Keep the original failure reachable for diagnosis instead of discarding it.
        AnnotatorException failure = new AnnotatorException("Sentence cannot be parsed");
        failure.initCause(e);
        throw failure;
    }
    TreeView treeView = new TreeView(ViewNames.DEPENDENCY, ta);
    int rootPos = findRoot(deptree);
    // All the node positions are -1 to account for the extra <root> node added
    Pair<String, Integer> nodePair = new Pair<>(sent.forms[rootPos], rootPos - 1);
    Tree<Pair<String, Integer>> tree = new Tree<>(nodePair);
    populateChildren(tree, deptree, sent, rootPos);
    treeView.setDependencyTree(0, tree);
    ta.addView(ViewNames.DEPENDENCY, treeView);
}
use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.
the class FeatureUtilities method convert.
/**
 * Turns a feature set into two parallel arrays: the feature ids, sorted in ascending order,
 * and the value associated with each id.
 *
 * <p>Ids are obtained from {@code FeatureUtilities.getFeatureId}. A feature whose id comes
 * back negative is skipped, and the values of features sharing an id are added together.
 *
 * @param features The feature set
 * @param lexicon The lexicon used to look up feature ids
 * @param trainingMode Passed through to the id lookup. It decides whether an unseen feature
 *        string is added to the lexicon; per the original documentation, when this is false
 *        an unseen feature is given an id one more than the number of features.
 * @return a pair of int[] and double[], representing the feature ids and values.
 */
public static Pair<int[], double[]> convert(Set<Feature> features, Lexicon lexicon, boolean trainingMode) {
    TIntDoubleHashMap valueById = new TIntDoubleHashMap(features.size());
    for (Feature current : features) {
        final int id = FeatureUtilities.getFeatureId(lexicon, trainingMode, current);
        if (id >= 0) {
            // Accumulate on top of whatever the map currently reports for this id.
            double summed = current.getValue() + valueById.get(id);
            valueById.put(id, summed);
        }
    }

    // Work on a private copy of the key array so that sorting cannot disturb the map.
    int[] sortedIds = valueById.keys().clone();
    Arrays.sort(sortedIds);

    double[] values = new double[valueById.size()];
    for (int position = 0; position < sortedIds.length; position++) {
        values[position] = valueById.get(sortedIds[position]);
    }
    return new Pair<>(sortedIds, values);
}
use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.
the class ParseHelper method getPhraseFromHead.
/**
 * Primarily a fix for prepSRL objects; converts them from single head words to constituents.
 * E.g. for the sentence "the man with the telescope", the object of the preposition will be
 * "the telescope" instead of just "telescope".
 *
 * @param predicate The predicate of the construction (e.g. "with")
 * @param argHead The head-word of the argument of the construction (e.g. "telescope")
 * @param parseViewName The name of the parse view used to extract the phrase-structure tree
 * @return The full constituent phrase containing the argument head, or {@code null} when the
 *         tree covering the predicate has no parent, when the yield of that parent does not
 *         contain a node starting at the argument head, or when the walk up from the argument
 *         reaches the root of the tree
 */
public static Constituent getPhraseFromHead(Constituent predicate, Constituent argHead, String parseViewName) {
    // Get the path from the argument to the preposition
    // but only if the predicate node "m-commands" the arg
    TextAnnotation ta = argHead.getTextAnnotation();
    int sentenceOffset = ta.getSentence(ta.getSentenceId(argHead)).getStartSpan();
    int argStart = argHead.getStartSpan() - sentenceOffset;
    Tree<Pair<String, IntPair>> predParentTree = getTokenIndexedTreeCovering(predicate, parseViewName).getParent();
    // The covering tree may itself be the root, in which case there is no parent to scan.
    if (predParentTree == null) {
        return null;
    }
    boolean found = false;
    for (Tree<Pair<String, IntPair>> s : predParentTree.getYield()) {
        if (s.getLabel().getSecond().getFirst() == argStart) {
            found = true;
            break;
        }
    }
    if (!found) {
        return null;
    }
    // Now follow the path from the argument node to get to the preposition.
    // The parent is null-checked before it is handed to checkForPredicate.
    Tree<Pair<String, IntPair>> argPhrase = getTokenIndexedTreeCovering(argHead, parseViewName);
    int predicateStart = predicate.getStartSpan() - sentenceOffset;
    while (argPhrase.getParent() != null && !checkForPredicate(argPhrase.getParent(), predicateStart)) {
        argPhrase = argPhrase.getParent();
    }
    // If the phrase covering the constituent is the whole sentence then the annotation is wrong
    if (argPhrase.getParent() == null) {
        return null;
    }
    // NOTE(review): the span is taken to begin at the token right after the predicate's start
    // token; this presumably relies on single-token predicates. Confirm.
    int start = predicate.getStartSpan() + 1;
    int end = start + argPhrase.getYield().size();
    return new Constituent(argHead.getLabel(), argHead.getViewName(), argHead.getTextAnnotation(), start, end);
}
Aggregations