Use of org.cyberneko.html.HTMLConfiguration in project gate-core by GateNLP.
Class NekoHtmlDocumentFormat, method unpackMarkup.
/**
 * Unpacks the HTML markup of a GATE document into GATE annotations.
 * <p>
 * The document is parsed with a NekoHTML {@code HTMLConfiguration} whose
 * events are delivered to a {@code NekoHtmlDocumentHandler}. The input is
 * selected as follows:
 * <ul>
 * <li>if {@code hasContentButNoValidUrl(doc)} is true, the document's
 * content string is parsed;</li>
 * <li>otherwise, if the document is a {@code TextualDocument}, its source
 * URL is opened (a {@code gzip} content encoding is unwrapped), decoded
 * with the document's own encoding, and the parser is told to ignore any
 * charset specified inside the page;</li>
 * <li>otherwise the source URL is handed to the parser, which decides the
 * encoding itself.</li>
 * </ul>
 * If the document was created from a String it is recommended to set its
 * source URL to <b>null</b>.
 *
 * @param doc The GATE document to parse. It must be non-null and have
 * either a source URL or content.
 * @param repInfo repositioning information handed to the document handler
 * @param ampCodingInfo ampersand coding positions handed to the document
 * handler
 * @throws DocumentFormatException if {@code doc} is null or has neither a
 * source URL nor content, if an I/O error occurs, or - only when
 * the document feature
 * {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is
 * {@code Boolean.TRUE} - if any other exception is raised while
 * parsing. Without that feature such failures are only reported
 * on {@code Out} and flagged with the {@code "parsingError"}
 * document feature.
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
  if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }
  // Forward parser status messages to the listeners of this document format.
  StatusListener statusListener = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      // This is implemented in DocumentFormat.java and inherited here
      fireStatusChanged(text);
    }
  };
  boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
  NekoHtmlDocumentHandler handler = null;
  // Streams opened by this method; tracked here so that the finally block
  // can release them whatever happens during parsing.
  InputStream urlStream = null;
  Reader docReader = null;
  try {
    org.cyberneko.html.HTMLConfiguration parser = new HTMLConfiguration();
    // convert element and attribute names to lower case
    parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
    // make parser augment infoset with location information
    parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);
    // Create a new document handler
    handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
    // Register a status listener with it
    handler.addStatusListener(statusListener);
    // set repositioning object
    handler.setRepositioningInfo(repInfo);
    // set the object with ampersand coding positions
    handler.setAmpCodingInfo(ampCodingInfo);
    // construct the list of offsets for each line of the document
    int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
    handler.setLineOffsets(lineOffsets);
    // set the handlers
    parser.setDocumentHandler(handler);
    parser.setErrorHandler(handler);
    // Parse the document with the appropriate encoding
    XMLInputSource is;
    if (docHasContentButNoValidURL) {
      // no URL, so parse from string
      is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
    } else if (doc instanceof TextualDocument) {
      // textual document - load with user specified encoding
      String docEncoding = ((TextualDocument) doc).getEncoding();
      // no BOM stripping is performed here
      URLConnection conn = doc.getSourceUrl().openConnection();
      urlStream = conn.getInputStream();
      if ("gzip".equals(conn.getContentEncoding())) {
        urlStream = new GZIPInputStream(urlStream);
      }
      docReader = new InputStreamReader(urlStream, docEncoding);
      is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString(), docReader, docEncoding);
      // since we control the encoding, tell the parser to ignore any
      // meta http-equiv hints
      parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
    } else {
      // let the parser decide the encoding
      is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString());
    }
    /* The following line can forward an
     * ArrayIndexOutOfBoundsException from
     * org.cyberneko.html.HTMLConfiguration.parse and crash GATE. */
    parser.parse(is);
    ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
  } catch (IOException e) {
    /* Handle IOException specially. String concatenation is used instead
     * of getSourceUrl().toString() because the URL may be null when the
     * document is parsed from its content; a NullPointerException raised
     * here would hide the original I/O failure. */
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
  } catch (Exception e) {
    /* Handle XNIException and ArrayIndexOutOfBoundsException:
     * flag the parsing error and keep going. */
    doc.getFeatures().put("parsingError", Boolean.TRUE);
    Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
    if (bThrow != null && bThrow.booleanValue()) {
      // the caller asked for parse failures to be reported as errors
      throw new DocumentFormatException(e);
    } else {
      Out.println("Warning: Document remains unparsed. \n" + "\n Stack Dump: ");
      e.printStackTrace(Out.getPrintWriter());
    }
  } finally {
    // Release the URL stream opened above. Closing the reader also closes
    // the stream it wraps; the raw stream is closed directly only when the
    // reader was never created. Closing an already closed stream is harmless.
    try {
      if (docReader != null) {
        docReader.close();
      } else if (urlStream != null) {
        urlStream.close();
      }
    } catch (IOException ignored) {
      // nothing useful can be done about a failure to close
    }
    if (handler != null) {
      handler.removeStatusListener(statusListener);
    }
  }
}
Use of org.cyberneko.html.HTMLConfiguration in project muikku by otavanopisto.
Class WorkspaceMaterialController, method createContentNode.
/**
 * Builds the {@code ContentNode} for a workspace node.
 * <p>
 * For a {@code FOLDER} node a folder content node is created and one child
 * content node is added per child workspace node; once {@code level} reaches
 * {@code FLATTENING_LEVEL} the children are first passed through
 * {@code flattenWorkspaceNodes}. For a {@code MATERIAL} node a content node
 * describing the material is created. Any other node type yields
 * {@code null}.
 *
 * @param rootMaterialNode the workspace node to convert
 * @param level nesting level recorded on the created node
 * @param processHtml whether the material HTML is produced via
 *        {@code getMaterialHtml}; when false the HTML of an unrestricted
 *        material is {@code null}
 * @param includeHidden whether hidden children of a folder are listed
 * @return the content node, or {@code null} for unsupported node types
 * @throws WorkspaceMaterialException if the HTML parser or the transformer
 *         cannot be configured
 */
private ContentNode createContentNode(WorkspaceNode rootMaterialNode, int level, boolean processHtml, boolean includeHidden) throws WorkspaceMaterialException {
  boolean viewRestricted = false;
  try {
    switch (rootMaterialNode.getType()) {
      case FOLDER: {
        WorkspaceFolder workspaceFolder = (WorkspaceFolder) rootMaterialNode;
        // Anonymous users may not view content restricted to logged-in users
        viewRestricted = !sessionController.isLoggedIn() && workspaceFolder.getViewRestrict() == MaterialViewRestrict.LOGGED_IN;
        ContentNode folderContentNode = new ContentNode(workspaceFolder.getTitle(), "folder", rootMaterialNode.getId(), null, level, null, null, rootMaterialNode.getParent().getId(), rootMaterialNode.getHidden(), null, 0L, 0L, workspaceFolder.getPath(), null, null, workspaceFolder.getViewRestrict(), viewRestricted);
        List<WorkspaceNode> children = includeHidden ? workspaceNodeDAO.listByParentSortByOrderNumber(workspaceFolder) : workspaceNodeDAO.listByParentAndHiddenSortByOrderNumber(workspaceFolder, Boolean.FALSE);
        List<FlattenedWorkspaceNode> flattenedChildren;
        if (level >= FLATTENING_LEVEL) {
          flattenedChildren = flattenWorkspaceNodes(children, level, includeHidden);
        } else {
          // Below the flattening level every child is kept as it is
          flattenedChildren = new ArrayList<>();
          for (WorkspaceNode node : children) {
            flattenedChildren.add(new FlattenedWorkspaceNode(false, null, node, level, node.getParent().getId(), node.getHidden()));
          }
        }
        for (FlattenedWorkspaceNode child : flattenedChildren) {
          ContentNode contentNode;
          if (child.isEmptyFolder) {
            contentNode = new ContentNode(child.emptyFolderTitle, "folder", rootMaterialNode.getId(), null, child.level, null, null, child.parentId, child.hidden, null, 0L, 0L, child.node.getPath(), null, null, MaterialViewRestrict.NONE, false);
          } else {
            contentNode = createContentNode(child.node, child.level, processHtml, includeHidden);
          }
          folderContentNode.addChild(contentNode);
        }
        return folderContentNode;
      }
      case MATERIAL: {
        WorkspaceMaterial workspaceMaterial = (WorkspaceMaterial) rootMaterialNode;
        Material material = materialController.findMaterialById(workspaceMaterial.getMaterialId());
        Long currentRevision = material instanceof HtmlMaterial ? htmlMaterialController.lastHtmlMaterialRevision((HtmlMaterial) material) : 0L;
        Long publishedRevision = material instanceof HtmlMaterial ? ((HtmlMaterial) material).getRevisionNumber() : 0L;
        List<String> producerNames = null;
        List<MaterialProducer> producers = materialController.listMaterialProducers(material);
        if ((producers != null) && !producers.isEmpty()) {
          producerNames = new ArrayList<>();
          for (MaterialProducer producer : producers) {
            // The names are joined with ',' below, so a comma inside a name is
            // replaced with its HTML entity to keep the joined list unambiguous.
            // NOTE(review): the previous replacement text was "," (a no-op);
            // "&#44;" is the presumed intent - confirm against consumers.
            producerNames.add(StringUtils.replace(StringEscapeUtils.escapeHtml4(producer.getName()), ",", "&#44;"));
          }
        }
        viewRestricted = !sessionController.isLoggedIn() && material.getViewRestrict() == MaterialViewRestrict.LOGGED_IN;
        String html;
        if (viewRestricted) {
          html = String.format("<p class=\"content-view-restricted-message\">%s</p>", localeController.getText(sessionController.getLocale(), "plugin.workspace.materialViewRestricted"));
        } else if (processHtml) {
          // The parser and the transformer are only needed on this path, so
          // they are created here rather than for every material node
          DOMParser parser = new DOMParser(new HTMLConfiguration());
          parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
          Transformer transformer = TransformerFactory.newInstance().newTransformer();
          transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
          transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
          transformer.setOutputProperty(OutputKeys.METHOD, "xml");
          transformer.setOutputProperty(OutputKeys.INDENT, "no");
          html = getMaterialHtml(material, parser, transformer);
        } else {
          html = null;
        }
        return new ContentNode(workspaceMaterial.getTitle(), material.getType(), rootMaterialNode.getId(), material.getId(), level, workspaceMaterial.getAssignmentType(), workspaceMaterial.getCorrectAnswers(), workspaceMaterial.getParent().getId(), workspaceMaterial.getHidden(), html, currentRevision, publishedRevision, workspaceMaterial.getPath(), material.getLicense(), StringUtils.join(producerNames, ','), material.getViewRestrict(), viewRestricted);
      }
      default:
        return null;
    }
  } catch (SAXNotRecognizedException | SAXNotSupportedException | TransformerConfigurationException e) {
    throw new WorkspaceMaterialException(e);
  }
}
Use of org.cyberneko.html.HTMLConfiguration in project muikku by otavanopisto.
Class DeusNexMachinaController, method postProcessHtml.
/**
 * Post-processes the HTML of an imported material and stores the result if
 * anything changed.
 * <p>
 * Two rewrites are applied to the parsed document: the {@code http:} scheme
 * is stripped from iframe sources pointing at {@code www.youtube.com}, and
 * every iframe marked as an embedded document receives data attributes that
 * identify the referenced material. A header element directly preceding an
 * embedded document donates its text as the embedded material's title and
 * is removed from the document.
 *
 * @param material the HTML material to post-process
 */
private void postProcessHtml(HtmlMaterial material) throws ParserConfigurationException, SAXException, IOException, XPathExpressionException, TransformerException {
  InputSource htmlSource = new InputSource(new StringReader(material.getHtml()));
  DOMParser htmlParser = new DOMParser(new HTMLConfiguration());
  htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
  htmlParser.parse(htmlSource);
  org.w3c.dom.Document dom = htmlParser.getDocument();
  boolean changed = false;

  // Embedded YouTube clips: drop the leading "http:" (5 characters) so the
  // source becomes protocol-relative
  for (Element iframe : DeusNexXmlUtils.getElementsByXPath(dom.getDocumentElement(), "//iframe")) {
    String source = iframe.getAttribute("src");
    if (source == null || !source.startsWith("http://www.youtube.com/")) {
      continue;
    }
    iframe.setAttribute("src", source.substring(5));
    changed = true;
  }

  // Embedded documents: resolve the referenced material, settle its title
  // and tag the iframe with the identifiers
  List<Element> embeddedDocuments = DeusNexXmlUtils.getElementsByXPath(dom.getDocumentElement(), "//iframe[@data-type=\"embedded-document\"]");
  for (Element embedded : embeddedDocuments) {
    changed = true;
    Integer resourceNo = Integer.valueOf(embedded.getAttribute("data-resource-no"));
    WorkspaceMaterial embeddedWorkspaceMaterial = workspaceMaterialController.findWorkspaceMaterialById(getResourceWorkspaceNodeId(resourceNo));
    HtmlMaterial embeddedMaterial = htmlMaterialController.findHtmlMaterialById(embeddedWorkspaceMaterial.getMaterialId());

    // A header right before the iframe names the embedded document; a blank
    // header is left where it is
    Node precedingElement = getPreviousSiblingElement(embedded);
    if (isHeader(precedingElement)) {
      String title = StringUtils.trim(precedingElement.getTextContent());
      if (!StringUtils.isBlank(title)) {
        htmlMaterialController.updateHtmlMaterialTitle(embeddedMaterial, title);
        precedingElement.getParentNode().removeChild(precedingElement);
      }
    }

    embedded.setAttribute("data-material-id", String.valueOf(embeddedMaterial.getId()));
    embedded.setAttribute("data-material-type", embeddedMaterial.getType());
    embedded.setAttribute("data-workspace-material-id", String.valueOf(embeddedWorkspaceMaterial.getId()));
  }

  if (!changed) {
    // Nothing was rewritten, so the stored HTML stays untouched
    return;
  }

  // Serialize the modified document and store it as the material's HTML
  StringWriter serialized = new StringWriter();
  Transformer serializer = TransformerFactory.newInstance().newTransformer();
  serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
  serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
  serializer.setOutputProperty(OutputKeys.METHOD, "xml");
  serializer.setOutputProperty(OutputKeys.INDENT, "no");
  serializer.transform(new DOMSource(dom), new StreamResult(serialized));
  htmlMaterialController.updateHtmlMaterialHtml(material, serialized.toString());
}
Use of org.cyberneko.html.HTMLConfiguration in project muikku by otavanopisto.
Class HtmlMaterialCleaner, method cleanMaterial.
/**
 * Runs every cleaner task, in ascending priority order, against the HTML of
 * the revision returned by {@code getMaterialRevision}.
 * <p>
 * The HTML is parsed once into a DOM that all tasks share. Each time a task
 * reports that it processed the document, the document is serialized and
 * handed to {@code patch}. Any exception is logged at SEVERE level and not
 * propagated.
 *
 * @param htmlMaterial the material whose HTML is cleaned
 * @param ownerMaterial the workspace material passed on to each task
 */
public void cleanMaterial(HtmlMaterial htmlMaterial, WorkspaceMaterial ownerMaterial) {
  Long maxRevision = getMaterialRevision(htmlMaterial);
  try {
    // Parse the revision's HTML into a DOM with lower-cased element names
    String revisionHtml = htmlMaterialController.getRevisionHtml(htmlMaterial, maxRevision);
    DOMParser htmlParser = new DOMParser(new HTMLConfiguration());
    htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    htmlParser.parse(new InputSource(new StringReader(revisionHtml)));
    Document dom = htmlParser.getDocument();

    // Copy the available tasks into a list so they can be ordered
    List<HtmlMaterialCleanerTask> orderedTasks = new ArrayList<HtmlMaterialCleanerTask>();
    for (Iterator<HtmlMaterialCleanerTask> available = analyzerTasks.iterator(); available.hasNext();) {
      orderedTasks.add(available.next());
    }
    Collections.sort(orderedTasks, new Comparator<HtmlMaterialCleanerTask>() {
      @Override
      public int compare(HtmlMaterialCleanerTask left, HtmlMaterialCleanerTask right) {
        return left.getPriority().compareTo(right.getPriority());
      }
    });

    // Patch the material after each task that reports a change
    for (HtmlMaterialCleanerTask task : orderedTasks) {
      if (!task.process(dom, ownerMaterial)) {
        continue;
      }
      String cleanedHtml = DeusNexXmlUtils.serializeElement(dom.getDocumentElement(), true, false, "html");
      patch(htmlMaterial, cleanedHtml);
    }
  } catch (Exception e) {
    logger.log(Level.SEVERE, "Failed to clean material " + htmlMaterial.getId(), e);
  }
}
Use of org.cyberneko.html.HTMLConfiguration in project zm-mailbox by Zimbra.
Class HtmlDetag, method detag.
/**
 * Runs the given HTML through a NekoHTML parser configuration whose filter
 * chain consists of this object followed by an {@code UnescapeWriter}, and
 * returns what the writer produced.
 * <p>
 * Parsing is best effort: if the parser fails, the failure is logged as a
 * warning together with its cause, and whatever output was written up to
 * that point is returned.
 *
 * @param html the HTML text to process
 * @return the text collected from the filter chain, possibly partial when
 *         parsing failed
 */
public String detag(String html) {
  StringWriter out = new StringWriter();
  UnescapeWriter writer = new UnescapeWriter(out, "utf-8");
  XMLDocumentFilter[] filters = { this, writer };
  XMLParserConfiguration parser = new HTMLConfiguration();
  parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
  // keep element names as written and do not let the parser balance tags
  parser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
  parser.setFeature("http://cyberneko.org/html/features/balance-tags", false);
  parser.setFeature("http://xml.org/sax/features/namespaces", false);
  XMLInputSource source = new XMLInputSource(null, null, null, new StringReader(html), null);
  try {
    parser.parse(source);
  } catch (Exception x) {
    // Pass the exception to the logger so the cause of the failure is not lost
    ZimbraLog.misc.warn("Can't detag HTML [" + html + "]", x);
  }
  // return whatever has been done
  return out.toString();
}
Aggregations