Use of edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph in the cogcomp-nlp project by CogComp.
From the class ACE_WL_Reader, method parse.
/**
 * Parses an ACE weblog (WL) document into labelled paragraphs and document metadata.
 *
 * <p>The last {@code <DOCID>}, {@code <DATETIME>} and {@code <HEADLINE>} matches in
 * {@code content} are read for the metadata map. Every {@code <POST>...</POST>} match yields one
 * paragraph labelled {@code "text"}: the trimmed post body that follows the first closing
 * {@code </POSTDATE>} tag, or the whole trimmed post body when that tag is absent.
 *
 * <p>The patterns are compiled without {@code Pattern.DOTALL}, so a tag whose body contains a
 * line terminator does not match. NOTE(review): presumably callers pass content with line breaks
 * already normalised - confirm.
 *
 * @param content the raw document text, including markup; paragraph {@code offset} values index
 *        into this string
 * @param contentRemovingTags text in which each paragraph is searched to fill
 *        {@code offsetFilterTags} (presumably {@code content} with markup removed - confirm
 *        against the caller); a paragraph that cannot be found gets {@code -1}
 * @return a pair of (paragraphs in document order, metadata map)
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    String docID = lastTagValue(content, "DOCID");
    // NOTE(review): the document id is stored under the creation-time key and is overwritten by
    // the DATETIME value on the next line, so it never reaches the caller. Kept as-is because
    // the appropriate DocumentMetadata key is not visible here - confirm and fix.
    metadata.put(DocumentMetadata.DocumentCreationTime, docID);
    metadata.put(DocumentMetadata.DocumentCreationTime, lastTagValue(content, "DATETIME"));
    metadata.put(DocumentMetadata.HeadLine, lastTagValue(content, "HEADLINE"));

    final String postDateClose = "</POSTDATE>";
    Matcher postMatcher = Pattern.compile("<POST>(.*?)</POST>").matcher(content);
    while (postMatcher.find()) {
        // Work with positions inside `content` so the recorded offset is exactly where this
        // post's text lives, even if the same text also occurs elsewhere in the document.
        int begin = postMatcher.start(1);
        int end = postMatcher.end(1);

        // Same characters String.trim() strips: anything <= U+0020 at either end.
        while (begin < end && content.charAt(begin) <= ' ') {
            begin++;
        }
        while (end > begin && content.charAt(end - 1) <= ' ') {
            end--;
        }

        // Skip the post header only when the closing POSTDATE tag really is inside this post;
        // otherwise keep the whole body instead of chopping an arbitrary prefix.
        int tagAt = content.indexOf(postDateClose, begin);
        if (tagAt >= 0 && tagAt + postDateClose.length() <= end) {
            begin = tagAt + postDateClose.length();
            while (begin < end && content.charAt(begin) <= ' ') {
                begin++;
            }
        }

        Paragraph paragraph = new Paragraph(begin, content.substring(begin, end));
        paragraphs.add(new Pair<String, Paragraph>("text", paragraph));
    }

    // Locate each paragraph in the filtered text, searching forward from the end of the
    // previous hit so that repeated paragraphs map to distinct positions.
    int searchFrom = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        int found = contentRemovingTags.indexOf(paragraph.content, searchFrom);
        paragraph.offsetFilterTags = found;
        if (found >= 0) {
            searchFrom = found + paragraph.content.length();
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            // offsetFilterTags is -1 when the paragraph was not found in the filtered text.
            if (paragraph.offsetFilterTags >= 0) {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }

    return new Pair<>(paragraphs, metadata);
}

/**
 * Returns the trimmed body of the last {@code <tag>...</tag>} match in {@code content}, or the
 * empty string when the tag does not occur.
 */
private static String lastTagValue(String content, String tag) {
    Matcher matcher = Pattern.compile("<" + tag + ">(.*?)</" + tag + ">").matcher(content);
    String value = "";
    while (matcher.find()) {
        value = matcher.group(1).trim();
    }
    return value;
}