Example use of org.htmlparser.beans.StringBean in the project lucida by claritylab.
From the class HTMLConverter, method url2text.
/**
 * Downloads the HTML document behind a URL and renders it as plain text.
 * Only HTTP connections are accepted.
 *
 * @param url address of the HTML document
 * @return the extracted text, or <code>null</code> if the connection could
 *         not be opened or is not an HTTP connection
 * @throws SocketTimeoutException declared by the signature; no statement in
 *         this method raises it explicitly (presumably surfaced by the
 *         underlying fetch - confirm against the StringBean in use)
 */
public static synchronized String url2text(String url) throws SocketTimeoutException {
    // open the connection; a malformed URL or an I/O problem counts as failure
    final URLConnection connection;
    try {
        connection = new URL(url).openConnection();
    } catch (IOException e) {
        return null;
    }

    // anything that is not HTTP is rejected
    if (!(connection instanceof HttpURLConnection)) {
        return null;
    }

    // identify as a browser and bound both the connect and the read phase
    connection.setRequestProperty("User-agent", "Mozilla/4.0");
    connection.setConnectTimeout(TIMEOUT);
    connection.setReadTimeout(TIMEOUT);

    // text extractor: no links, non-breaking spaces replaced,
    // runs of whitespace collapsed
    StringBean extractor = new StringBean();
    extractor.setLinks(false);
    extractor.setReplaceNonBreakingSpaces(true);
    extractor.setCollapse(true);

    // hand the prepared connection to the extractor and collect the text
    extractor.setConnection(connection);
    return extractor.getStrings();
}
Example use of org.htmlparser.beans.StringBean in the project lucida by claritylab.
From the class HTMLConverter, method file2text.
/**
 * Reads an HTML document from a file and converts it into plain text.
 *
 * @param filename name of the file containing the HTML document
 * @return plain text or <code>null</code> if <code>filename</code> is
 *         <code>null</code> or the reading or conversion failed
 */
public static synchronized String file2text(String filename) {
    // Without a file name there is nothing to read. Report this as a failure
    // through the documented null return instead of letting the parser raise
    // an unchecked exception that the catch block below would not intercept.
    if (filename == null) {
        return null;
    }

    // read from file and convert HTML document
    StringBean sb = new StringBean();
    // no links
    sb.setLinks(false);
    // replace non-breaking spaces
    sb.setReplaceNonBreakingSpaces(true);
    // replace sequences of whitespaces
    sb.setCollapse(true);

    Parser parser = new Parser();
    try {
        parser.setResource(filename);
        // the bean collects the text while the parser walks the document
        parser.visitAllNodesWith(sb);
    } catch (ParserException e) {
        return null;
    }

    // the collected text is returned exactly as the bean reports it
    return sb.getStrings();
}
Example use of org.htmlparser.beans.StringBean in the project lucida by claritylab.
From the class HTMLConverter, method html2text.
/**
 * Converts an HTML document into plain text.
 *
 * @param html HTML document
 * @return plain text (the empty string if the document has no text content),
 *         or <code>null</code> if <code>html</code> is <code>null</code> or
 *         the conversion failed
 */
public static synchronized String html2text(String html) {
    // A missing document cannot be converted. Report this as a failure
    // through the documented null return instead of letting the parser raise
    // an unchecked exception that the catch block below would not intercept.
    if (html == null) {
        return null;
    }

    // convert HTML document
    StringBean sb = new StringBean();
    // no links
    sb.setLinks(false);
    // replace non-breaking spaces
    sb.setReplaceNonBreakingSpaces(true);
    // replace sequences of whitespaces
    sb.setCollapse(true);

    Parser parser = new Parser();
    try {
        parser.setInputHTML(html);
        // the bean collects the text while the parser walks the document
        parser.visitAllNodesWith(sb);
    } catch (ParserException e) {
        return null;
    }

    // no content: hand back an empty string so that null is reserved for failure
    String docText = sb.getStrings();
    return (docText == null) ? "" : docText;
}
Aggregations