use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.
the class ACE_BN_Reader method parse.
/**
 * Parses a broadcast-news (BN) document into its turns and document metadata.
 *
 * <p>The document id is read from {@code <DOCNO>} (2004 layout) or {@code <DOCID>}, and the
 * creation time from {@code <DATE_TIME>} (2004 layout) or {@code <DATETIME>}; when a tag occurs
 * more than once the last occurrence wins, and a missing tag yields an empty string. Each turn
 * becomes a paragraph labelled {@code "text"} holding the trimmed turn text.
 *
 * <p>NOTE(review): the patterns are compiled without {@code Pattern.DOTALL}, so a tag pair that
 * spans a line terminator is not matched -- confirm callers pass content without line breaks.
 *
 * @param content the raw document text including markup
 * @param contentRemovingTags presumably {@code content} with the markup stripped (inferred from
 *     the name); searched to fill each paragraph's {@code offsetFilterTags}
 * @param is2004 selects the 2004 tag layout, where turns are delimited by bare {@code <TURN>}
 *     markers inside {@code <TEXT>} instead of {@code <TURN>...</TURN>} pairs
 * @return the paragraphs in document order paired with the metadata map
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags, boolean is2004) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    String docID = "";
    Matcher matcher = Pattern.compile(is2004 ? "<DOCNO>(.*?)</DOCNO>" : "<DOCID>(.*?)</DOCID>").matcher(content);
    while (matcher.find()) {
        docID = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    String dateTime = "";
    matcher = Pattern.compile(is2004 ? "<DATE_TIME>(.*?)</DATE_TIME>" : "<DATETIME>(.*?)</DATETIME>").matcher(content);
    while (matcher.find()) {
        dateTime = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    Pattern turnPattern = Pattern.compile(is2004
            ? "<TEXT>(.*?)<TURN>|<TURN>(.*?)<TURN>|<TURN>(.*?)</TEXT>|<TEXT>(.*?)</TEXT>"
            : "<TURN>(.*?)</TURN>");
    matcher = turnPattern.matcher(content);
    final int turnTagLength = "<TURN>".length();
    int regionStart = 0;
    while (matcher.find(regionStart)) {
        // Exactly one alternative matched; its group is the first non-null one (it may be empty).
        for (int g = 1; g <= matcher.groupCount(); ++g) {
            String raw = matcher.group(g);
            if (raw != null) {
                String text = raw.trim();
                // Anchor the paragraph at the match itself. Searching content for the text would
                // return the first occurrence, which is wrong whenever a turn's text is repeated.
                int offset = matcher.start(g) + raw.indexOf(text);
                paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
                break;
            }
        }
        // In the 2004 layout a <TURN> marker both ends one turn and starts the next, so step back
        // over it to let the next match begin there.
        regionStart = is2004 ? matcher.end() - turnTagLength : matcher.end();
    }

    // Locate each paragraph in the filtered text, searching forward from the end of the previous
    // hit so that repeated paragraphs do not resolve to an earlier occurrence.
    int searchFrom = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        int filteredOffset = contentRemovingTags.indexOf(paragraph.content, searchFrom);
        paragraph.offsetFilterTags = filteredOffset;
        if (filteredOffset >= 0) {
            searchFrom = filteredOffset + paragraph.content.length();
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags == -1) {
                // substring(-1, ...) would throw; report the miss instead.
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}
use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.
the class ACE_NW_Reader method parse.
/**
 * Parses a newswire (NW) document into its text sections and document metadata.
 *
 * <p>{@code <DOCID>}, {@code <DATETIME>} and {@code <HEADLINE>} are stored in the metadata map;
 * when a tag occurs more than once the last occurrence wins, and a missing tag yields an empty
 * string. Each {@code <TEXT>...</TEXT>} section becomes a paragraph labelled {@code "text"}
 * holding the trimmed section text.
 *
 * <p>NOTE(review): the patterns are compiled without {@code Pattern.DOTALL}, so a tag pair that
 * spans a line terminator is not matched -- confirm callers pass content without line breaks.
 *
 * @param content the raw document text including markup
 * @param contentRemovingTags presumably {@code content} with the markup stripped (inferred from
 *     the name); searched to fill each paragraph's {@code offsetFilterTags}
 * @return the paragraphs in document order paired with the metadata map
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    String docID = "";
    Matcher matcher = Pattern.compile("<DOCID>(.*?)</DOCID>").matcher(content);
    while (matcher.find()) {
        docID = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    String dateTime = "";
    matcher = Pattern.compile("<DATETIME>(.*?)</DATETIME>").matcher(content);
    while (matcher.find()) {
        dateTime = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    String headLine = "";
    matcher = Pattern.compile("<HEADLINE>(.*?)</HEADLINE>").matcher(content);
    while (matcher.find()) {
        headLine = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.HeadLine, headLine);

    matcher = Pattern.compile("<TEXT>(.*?)</TEXT>").matcher(content);
    while (matcher.find()) {
        String raw = matcher.group(1);
        String text = raw.trim();
        // Anchor the paragraph at the match itself. Searching content for the text would return
        // the first occurrence, which is wrong whenever the same text appears earlier.
        int offset = matcher.start(1) + raw.indexOf(text);
        paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
    }

    // Locate each paragraph in the filtered text, searching forward from the end of the previous
    // hit so that repeated paragraphs do not resolve to an earlier occurrence.
    int searchFrom = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        int filteredOffset = contentRemovingTags.indexOf(paragraph.content, searchFrom);
        paragraph.offsetFilterTags = filteredOffset;
        if (filteredOffset >= 0) {
            searchFrom = filteredOffset + paragraph.content.length();
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags == -1) {
                // substring(-1, ...) would throw; report the miss instead.
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}
use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.
the class ACE_UN_Reader method parse.
/**
 * Parses a usenet (UN) document into labelled post fragments and document metadata.
 *
 * <p>{@code <DOCID>}, {@code <DATETIME>} and {@code <HEADLINE>} are stored in the metadata map;
 * when a tag occurs more than once the last occurrence wins, and a missing tag yields an empty
 * string. For every {@code <POST>...</POST>} the following paragraphs are appended, in this
 * order: {@code "postSubject"}, {@code "poster"}, {@code "postDate"}, {@code "postQuote"} (the
 * value of each {@code <QUOTE PREVIOUSPOST="..."/>} attribute) and finally the {@code "text"}
 * fragments found between the subject, the quote tags and the end of the post.
 *
 * <p>NOTE(review): the patterns are compiled without {@code Pattern.DOTALL}, so a tag pair that
 * spans a line terminator is not matched -- confirm callers pass content without line breaks.
 *
 * @param content the raw document text including markup
 * @param contentRemovingTags presumably {@code content} with the markup stripped (inferred from
 *     the name); searched to fill each paragraph's {@code offsetFilterTags}
 * @return the paragraphs (grouped per post as described above) paired with the metadata map
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    String docID = "";
    Matcher matcher = Pattern.compile("<DOCID>(.*?)</DOCID>").matcher(content);
    while (matcher.find()) {
        docID = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    String dateTime = "";
    matcher = Pattern.compile("<DATETIME>(.*?)</DATETIME>").matcher(content);
    while (matcher.find()) {
        dateTime = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    String headLine = "";
    matcher = Pattern.compile("<HEADLINE>(.*?)</HEADLINE>").matcher(content);
    while (matcher.find()) {
        headLine = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.HeadLine, headLine);

    // Compiled once up front instead of once per post.
    final String subjectClose = "</SUBJECT>";
    final String quoteOpen = "<QUOTE PREVIOUSPOST=";
    Pattern subjectPattern = Pattern.compile("<SUBJECT>(.*?)</SUBJECT>");
    Pattern posterPattern = Pattern.compile("<POSTER>(.*?)</POSTER>");
    Pattern postDatePattern = Pattern.compile("<POSTDATE>(.*?)</POSTDATE>");
    Pattern quotePattern = Pattern.compile("<QUOTE PREVIOUSPOST=\"(.*?)\"/>");
    Pattern subjectToQuotePattern = Pattern.compile("</SUBJECT>(.*?)<QUOTE PREVIOUSPOST=");
    Pattern quoteToPostEndPattern = Pattern.compile("\"/>(.*?)</POST>");
    Pattern quoteToQuotePattern = Pattern.compile("\"/>(.*?)<QUOTE PREVIOUSPOST=");
    Pattern subjectToPostEndPattern = Pattern.compile("</SUBJECT>(.*?)</POST>");

    Matcher postMatcher = Pattern.compile("<POST>(.*?)</POST>").matcher(content);
    while (postMatcher.find()) {
        String rawPost = postMatcher.group(1);
        String post = rawPost.trim();
        // Absolute position of the trimmed post, taken from the match rather than from a
        // first-occurrence search of the whole document.
        int postOffset = postMatcher.start(1) + rawPost.indexOf(post);

        Matcher inner = subjectPattern.matcher(post);
        while (inner.find()) {
            addParagraphAt(paragraphs, "postSubject", inner.group(1), postOffset + inner.start(1));
        }
        inner = posterPattern.matcher(post);
        while (inner.find()) {
            addParagraphAt(paragraphs, "poster", inner.group(1), postOffset + inner.start(1));
        }
        inner = postDatePattern.matcher(post);
        while (inner.find()) {
            addParagraphAt(paragraphs, "postDate", inner.group(1), postOffset + inner.start(1));
        }
        inner = quotePattern.matcher(post);
        while (inner.find()) {
            addParagraphAt(paragraphs, "postQuote", inner.group(1), postOffset + inner.start(1));
        }

        if (post.contains(quoteOpen)) {
            // Text between the subject and the first quote tag.
            inner = subjectToQuotePattern.matcher(post);
            while (inner.find()) {
                String raw = inner.group(1);
                if (raw.trim().equals("")) {
                    continue;
                }
                // The lazy group starts after the first </SUBJECT>; if another one is inside it,
                // keep only what follows that one.
                int cut = raw.indexOf(subjectClose);
                int from = cut >= 0 ? cut + subjectClose.length() : 0;
                addParagraphAt(paragraphs, "text", raw.substring(from), postOffset + inner.start(1) + from);
            }
            // Text between the last quote tag and the end of the post. The closing tag is
            // re-appended only so the pattern has something to stop at; group positions are
            // still valid indices into post.
            inner = quoteToPostEndPattern.matcher(post + "</POST>");
            while (inner.find()) {
                String raw = inner.group(1);
                String trimmed = raw.trim();
                if (trimmed.equals("") || trimmed.contains(quoteOpen)) {
                    continue;
                }
                addParagraphAt(paragraphs, "text", raw, postOffset + inner.start(1));
            }
            // Text between two consecutive quote tags.
            inner = quoteToQuotePattern.matcher(post);
            while (inner.find()) {
                String raw = inner.group(1);
                if (raw.trim().equals("")) {
                    continue;
                }
                addParagraphAt(paragraphs, "text", raw, postOffset + inner.start(1));
            }
        } else {
            // No quotes: everything after the subject is the post text (added even when empty).
            inner = subjectToPostEndPattern.matcher(post + "</POST>");
            while (inner.find()) {
                addParagraphAt(paragraphs, "text", inner.group(1), postOffset + inner.start(1));
            }
        }
    }

    // Paragraphs are not in document order here (fields first, then text, per post), so the
    // search start only advances past "poster" entries. A paragraph that is not found keeps
    // whatever offsetFilterTags value it was constructed with.
    int index = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        int offsetWithFiltering = contentRemovingTags.indexOf(paragraph.content, index);
        if (offsetWithFiltering == -1) {
            continue;
        }
        paragraph.offsetFilterTags = offsetWithFiltering;
        if (entry.getFirst().equals("poster")) {
            index += paragraph.content.length();
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags == -1) {
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}

/**
 * Appends the trimmed form of {@code raw} as a paragraph with the given label.
 *
 * @param out list the new paragraph is appended to
 * @param label paragraph label, e.g. {@code "text"} or {@code "poster"}
 * @param raw the untrimmed matched text
 * @param rawOffset absolute position in the document of the first character of {@code raw}
 */
private static void addParagraphAt(List<Pair<String, Paragraph>> out, String label, String raw, int rawOffset) {
    String trimmed = raw.trim();
    // indexOf yields the number of leading whitespace characters removed by trim (0 when empty).
    out.add(new Pair<String, Paragraph>(label, new Paragraph(rawOffset + raw.indexOf(trimmed), trimmed)));
}
use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.
the class ACE_BC_Reader method parse.
/**
 * Parses a broadcast-conversation (BC) document into its turns and document metadata.
 *
 * <p>{@code <DOCID>}, {@code <DATETIME>} and {@code <HEADLINE>} are stored in the metadata map;
 * when a tag occurs more than once the last occurrence wins, and a missing tag yields an empty
 * string. Each {@code <TURN>...</TURN>} becomes a paragraph labelled {@code "text"} holding the
 * trimmed turn text that follows the first {@code </SPEAKER>} tag; a turn without a speaker tag
 * is kept whole.
 *
 * <p>NOTE(review): the patterns are compiled without {@code Pattern.DOTALL}, so a tag pair that
 * spans a line terminator is not matched -- confirm callers pass content without line breaks.
 *
 * @param content the raw document text including markup
 * @param contentRemovingTags presumably {@code content} with the markup stripped (inferred from
 *     the name); searched to fill each paragraph's {@code offsetFilterTags}
 * @return the paragraphs in document order paired with the metadata map
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    String docID = "";
    Matcher matcher = Pattern.compile("<DOCID>(.*?)</DOCID>").matcher(content);
    while (matcher.find()) {
        docID = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    String dateTime = "";
    matcher = Pattern.compile("<DATETIME>(.*?)</DATETIME>").matcher(content);
    while (matcher.find()) {
        dateTime = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    String headLine = "";
    matcher = Pattern.compile("<HEADLINE>(.*?)</HEADLINE>").matcher(content);
    while (matcher.find()) {
        headLine = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.HeadLine, headLine);

    final String speakerClose = "</SPEAKER>";
    matcher = Pattern.compile("<TURN>(.*?)</TURN>").matcher(content);
    while (matcher.find()) {
        String turn = matcher.group(1);
        // Skip the speaker header only when it is present. An unconditional
        // indexOf(...) + length would turn a miss (-1) into "drop the first 9 characters",
        // or throw on a shorter turn.
        int speakerEnd = turn.indexOf(speakerClose);
        int bodyStart = speakerEnd >= 0 ? speakerEnd + speakerClose.length() : 0;
        String body = turn.substring(bodyStart);
        String text = body.trim();
        // Anchor the paragraph at the match itself. Searching content for the text would return
        // the first occurrence, which is wrong for short utterances that repeat.
        int offset = matcher.start(1) + bodyStart + body.indexOf(text);
        paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
    }

    // Locate each paragraph in the filtered text, searching forward from the end of the previous
    // hit so that repeated utterances do not resolve to an earlier occurrence.
    int searchFrom = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        int filteredOffset = contentRemovingTags.indexOf(paragraph.content, searchFrom);
        paragraph.offsetFilterTags = filteredOffset;
        if (filteredOffset >= 0) {
            searchFrom = filteredOffset + paragraph.content.length();
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags == -1) {
                // substring(-1, ...) would throw; report the miss instead.
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}
use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in project cogcomp-nlp by CogComp.
the class ACE_CTS_Reader method parse.
/**
 * Parses a conversational-telephone-speech (CTS) document into its turns and document metadata.
 *
 * <p>{@code <DOCID>} and {@code <DATETIME>} are stored in the metadata map; when a tag occurs
 * more than once the last occurrence wins, and a missing tag yields an empty string. Each
 * {@code <TURN>...</TURN>} becomes a paragraph labelled {@code "text"} holding the trimmed turn
 * text that follows the first {@code </SPEAKER>} tag; a turn without a speaker tag is kept whole.
 *
 * <p>NOTE(review): the patterns are compiled without {@code Pattern.DOTALL}, so a tag pair that
 * spans a line terminator is not matched -- confirm callers pass content without line breaks.
 *
 * @param content the raw document text including markup
 * @param contentRemovingTags presumably {@code content} with the markup stripped (inferred from
 *     the name); searched to fill each paragraph's {@code offsetFilterTags}
 * @return the paragraphs in document order paired with the metadata map
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    String docID = "";
    Matcher matcher = Pattern.compile("<DOCID>(.*?)</DOCID>").matcher(content);
    while (matcher.find()) {
        docID = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentID, docID);

    String dateTime = "";
    matcher = Pattern.compile("<DATETIME>(.*?)</DATETIME>").matcher(content);
    while (matcher.find()) {
        dateTime = matcher.group(1).trim();
    }
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    final String speakerClose = "</SPEAKER>";
    matcher = Pattern.compile("<TURN>(.*?)</TURN>").matcher(content);
    while (matcher.find()) {
        String turn = matcher.group(1);
        // Skip the speaker header only when it is present. An unconditional
        // indexOf(...) + length would turn a miss (-1) into "drop the first 9 characters",
        // or throw on a shorter turn.
        int speakerEnd = turn.indexOf(speakerClose);
        int bodyStart = speakerEnd >= 0 ? speakerEnd + speakerClose.length() : 0;
        String body = turn.substring(bodyStart);
        String text = body.trim();
        // Anchor the paragraph at the match itself. Searching content for the text would return
        // the first occurrence, which is wrong for short utterances that repeat.
        int offset = matcher.start(1) + bodyStart + body.indexOf(text);
        paragraphs.add(new Pair<String, Paragraph>("text", new Paragraph(offset, text)));
    }

    // Locate each paragraph in the filtered text, searching forward from the end of the previous
    // hit so that repeated utterances do not resolve to an earlier occurrence.
    int searchFrom = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        int filteredOffset = contentRemovingTags.indexOf(paragraph.content, searchFrom);
        paragraph.offsetFilterTags = filteredOffset;
        if (filteredOffset >= 0) {
            searchFrom = filteredOffset + paragraph.content.length();
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            if (paragraph.offsetFilterTags == -1) {
                // substring(-1, ...) would throw; report the miss instead.
                logger.info("[No match phrase in filtered content.]");
            } else {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}