use of com.smartandroid.sa.tag.nodes.Element in project SmartAndroidSource by jaychou2012.
the class DataUtil method parseByteData.
// reads bytes first into a buffer, then decodes with the appropriate
// charset. done this way to support
// switching the charset midstream when a meta http-equiv tag defines the
// charset.
// todo - this is getting gnarly. needs a rewrite.
/**
 * Decodes a buffer of bytes to text and parses it into a Document.
 * <p>
 * When {@code charsetName} is null the bytes are first decoded and parsed
 * with {@code defaultCharset}, and the result is searched for a
 * {@code <meta http-equiv="content-type">} or {@code <meta charset>}
 * element. If that element names a different charset that this JVM
 * supports, the bytes are decoded again with it and re-parsed. If the
 * declared charset is illegal or unsupported, the default-charset parse is
 * kept instead of throwing.
 * <p>
 * Independently of the above, text that starts with U+FEFF (a byte order
 * mark) is re-decoded with {@code defaultCharset} and the mark is dropped.
 *
 * @param byteData
 *            bytes of the document; rewound and read again whenever a
 *            re-decode is needed
 * @param charsetName
 *            charset to decode with, or null to detect it from the meta
 *            element; must not be empty when non-null
 * @param baseUri
 *            base URI handed to the parser
 * @param parser
 *            parser used to build the document
 * @return the parsed document
 */
static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
    String docData;
    Document doc = null;
    if (charsetName == null) {
        // determine from meta. safe parse with the default charset, then
        // look for <meta http-equiv="Content-Type"
        // content="text/html;charset=gb2312"> or HTML5 <meta
        // charset="gb2312">
        docData = Charset.forName(defaultCharset).decode(byteData).toString();
        doc = parser.parseInput(docData, baseUri);
        Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
        if (meta != null) {
            // if nothing usable is found, keep the default parse as best attempt
            String foundCharset;
            if (meta.hasAttr("http-equiv")) {
                foundCharset = getCharsetFromContentType(meta.attr("content"));
                if (foundCharset == null && meta.hasAttr("charset")) {
                    try {
                        if (Charset.isSupported(meta.attr("charset"))) {
                            foundCharset = meta.attr("charset");
                        }
                    } catch (IllegalCharsetNameException e) {
                        foundCharset = null;
                    }
                }
            } else {
                foundCharset = meta.attr("charset");
            }
            if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)) {
                // normalise first, then validate: the name comes from
                // untrusted markup and Charset.forName throws on names it
                // does not accept
                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                boolean usable;
                try {
                    usable = Charset.isSupported(foundCharset);
                } catch (IllegalCharsetNameException e) {
                    // illegal (or empty) name: fall back to the default parse
                    usable = false;
                }
                if (usable) {
                    // need to re-decode
                    charsetName = foundCharset;
                    byteData.rewind();
                    docData = Charset.forName(foundCharset).decode(byteData).toString();
                    doc = null;
                }
            }
        }
    } else {
        // specified by content type header (or by user on file load)
        Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        docData = Charset.forName(charsetName).decode(byteData).toString();
    }
    // A leading U+FEFF is a byte order mark. It overrides whatever charset
    // was chosen above: decode again with the default charset in case the
    // earlier decode was wrong, drop the mark, and force a fresh parse.
    if (docData.length() > 0 && docData.charAt(0) == '\uFEFF') {
        byteData.rewind();
        docData = Charset.forName(defaultCharset).decode(byteData).toString();
        docData = docData.substring(1);
        charsetName = defaultCharset;
        doc = null;
    }
    if (doc == null) {
        doc = parser.parseInput(docData, baseUri);
        doc.outputSettings().charset(charsetName);
    }
    return doc;
}
use of com.smartandroid.sa.tag.nodes.Element in project SmartAndroidSource by jaychou2012.
the class Parser method parseBodyFragment.
/**
 * Creates a shell Document for {@code baseUri} and parses an HTML fragment
 * into its {@code body} element.
 *
 * @param bodyHtml
 *            fragment of HTML
 * @param baseUri
 *            base URI of document (i.e. original fetch location), for
 *            resolving relative URLs.
 *
 * @return Document, with empty head, and HTML parsed into body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    final Document shell = Document.createShell(baseUri);
    final Element bodyEl = shell.body();
    final List<Node> parsed = parseFragment(bodyHtml, bodyEl, baseUri);
    // Work from an array snapshot: the node list gets modified when its
    // nodes are re-parented, so it is not iterated directly.
    final Node[] snapshot = parsed.toArray(new Node[parsed.size()]);
    for (int i = 0; i < snapshot.length; i++) {
        bodyEl.appendChild(snapshot[i]);
    }
    return shell;
}
use of com.smartandroid.sa.tag.nodes.Element in project SmartAndroidSource by jaychou2012.
the class XmlTreeBuilder method popStackToClose.
/**
 * Closes the element named by an end tag. The stack is searched from its
 * last element backwards for the first element whose node name equals the
 * tag's name; when one is found, it and every element after it are removed
 * from the stack. When none is found the stack is left untouched.
 *
 * @param endTag
 *            the end tag whose name selects the element to close
 */
private void popStackToClose(Token.EndTag endTag) {
    final String wanted = endTag.name();

    // Pass 1: locate the nearest matching element, scanning from the end.
    Element target = null;
    for (Iterator<Element> scan = stack.descendingIterator(); scan.hasNext();) {
        final Element candidate = scan.next();
        if (candidate.nodeName().equals(wanted)) {
            target = candidate;
            break;
        }
    }
    if (target == null) {
        // not found, skip
        return;
    }

    // Pass 2: remove from the end up to and including the match.
    final Iterator<Element> popper = stack.descendingIterator();
    while (popper.hasNext()) {
        final Element top = popper.next();
        popper.remove();
        if (top == target) {
            break;
        }
    }
}
use of com.smartandroid.sa.tag.nodes.Element in project SmartAndroidSource by jaychou2012.
the class XmlTreeBuilder method insert.
/**
 * Creates an Element for a start tag, inserts it via {@code insertNode},
 * and adds it to the stack unless the tag is self closing.
 *
 * @param startTag
 *            the start tag token supplying the name and attributes
 * @return the newly created element
 */
Element insert(Token.StartTag startTag) {
    // todo: wonder if for xml parsing, should treat all tags as unknown?
    // because it's not html.
    final Tag tag = Tag.valueOf(startTag.name());
    final Element created = new Element(tag, baseUri, startTag.attributes);
    insertNode(created);

    if (!startTag.isSelfClosing()) {
        stack.add(created);
        return created;
    }

    tokeniser.acknowledgeSelfClosingFlag();
    if (!tag.isKnownTag()) {
        // unknown tag, remember this is self closing for output. see above.
        tag.setSelfClosing();
    }
    return created;
}
use of com.smartandroid.sa.tag.nodes.Element in project SmartAndroidSource by jaychou2012.
the class Cleaner method createSafeElement.
/**
 * Builds a new Element with the same tag name and base URI as
 * {@code sourceEl}. Only the attributes the whitelist reports as safe are
 * copied over, after which the whitelist's enforced attributes for the tag
 * are added.
 *
 * @param sourceEl
 *            the element to copy from
 * @return the new element together with the number of attributes that
 *         were not copied
 */
private ElementMeta createSafeElement(Element sourceEl) {
    final String tagName = sourceEl.tagName();
    final Attributes kept = new Attributes();
    final Element safeEl = new Element(Tag.valueOf(tagName), sourceEl.baseUri(), kept);

    int dropped = 0;
    for (Attribute candidate : sourceEl.attributes()) {
        if (!whitelist.isSafeAttribute(tagName, sourceEl, candidate)) {
            dropped++;
            continue;
        }
        kept.put(candidate);
    }

    kept.addAll(whitelist.getEnforcedAttributes(tagName));
    return new ElementMeta(safeEl, dropped);
}
Aggregations