Use of org.htmlparser.Parser in project lucida by claritylab.
Class HTMLConverter, method file2text.
/**
 * Reads an HTML document from a file and converts it into plain text.
 * <p>
 * Links are left out, non-breaking spaces are replaced and sequences of
 * whitespace are collapsed. A document that yields no text results in an
 * empty string, so that <code>null</code> is only returned on failure (the
 * same convention that <code>html2text</code> follows).
 *
 * @param filename name of file containing HTML documents
 * @return plain text (possibly empty) or <code>null</code> if the reading or
 *         conversion failed
 */
public static synchronized String file2text(String filename) {
    // configure the text extractor
    StringBean sb = new StringBean();
    // no links
    sb.setLinks(false);
    // replace non-breaking spaces
    sb.setReplaceNonBreakingSpaces(true);
    // replace sequences of whitespaces
    sb.setCollapse(true);

    // read from file and convert HTML document
    Parser parser = new Parser();
    try {
        parser.setResource(filename);
        parser.visitAllNodesWith(sb);
    } catch (ParserException e) {
        // reading or parsing failed: reported to the caller as null
        return null;
    }

    String docText = sb.getStrings();
    // no content: return an empty string instead of null so that callers can
    // tell an empty document apart from a failed conversion
    return (docText == null) ? "" : docText;
}
Use of org.htmlparser.Parser in project lucida by claritylab.
Class HTMLConverter, method html2text.
/**
 * Extracts the plain text from an HTML document held in a string.
 * <p>
 * Links are left out, non-breaking spaces are replaced and sequences of
 * whitespace are collapsed. If the extractor reports no content, an empty
 * string is returned.
 *
 * @param html HTML document
 * @return plain text (possibly empty) or <code>null</code> if the conversion
 *         failed
 */
public static synchronized String html2text(String html) {
    StringBean extractor = new StringBean();
    extractor.setLinks(false);                   // no links
    extractor.setReplaceNonBreakingSpaces(true); // replace non-breaking spaces
    extractor.setCollapse(true);                 // replace sequences of whitespaces

    Parser htmlParser = new Parser();
    try {
        // feed the document to the parser and let the extractor visit every node
        htmlParser.setInputHTML(html);
        htmlParser.visitAllNodesWith(extractor);
    } catch (ParserException e) {
        // conversion failed
        return null;
    }

    String text = extractor.getStrings();
    // an extractor result of null means "no content"
    return (text != null) ? text : "";
}
Use of org.htmlparser.Parser in project dhis2-core by dhis2.
Class GridUtils, method fromHtml.
/**
 * Creates a list of Grids based on the given HTML string. This works only
 * for table-based HTML documents.
 * <p>
 * One grid is created per <code>table</code> element found in the document.
 * Within a table, the first row that has at least one column becomes the
 * grid header; every following row becomes a grid row. Rows without columns,
 * and rows whose column count differs from that of the header row, are
 * skipped with a warning. Cells with a <code>colspan</code> greater than 1
 * are padded with empty headers / values. Row spans are not handled.
 *
 * @param html the HTML string.
 * @param title the title to use for the grids.
 * @return a list of Grids (empty if the document contains no tables), or
 *         <code>null</code> if the HTML string is <code>null</code> or blank.
 * @throws Exception if an error is raised while parsing the HTML string.
 */
public static List<Grid> fromHtml(String html, String title) throws Exception {
    if (html == null || html.trim().isEmpty()) {
        return null;
    }

    List<Grid> grids = new ArrayList<>();
    Parser parser = Parser.createParser(html, "UTF-8");
    Node[] tables = parser.extractAllNodesThatMatch(new TagNameFilter("table")).toNodeArray();

    for (Node t : tables) {
        Grid grid = new ListGrid();
        grid.setTitle(title);

        TableTag table = (TableTag) t;
        TableRow[] rows = table.getRows();

        // Column count of the header row; null until a header row has been seen
        Integer firstColumnCount = null;

        for (TableRow row : rows) {
            // Computed once per row and used for every check and log message
            // below, so that the reported count is the one actually compared
            int columnCount = getColumnCount(row);

            // Ignore if no cells
            if (columnCount == 0) {
                log.warn("Ignoring row with no columns");
                continue;
            }

            Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray();

            // First row becomes header
            if (firstColumnCount == null) {
                firstColumnCount = columnCount;

                for (Node c : cells) {
                    TagNode cell = (TagNode) c;
                    grid.addHeader(new GridHeader(getValue(cell), false, false));

                    Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan"));
                    if (colSpan != null && colSpan > 1) {
                        grid.addEmptyHeaders((colSpan - 1));
                    }
                }
                continue;
            }

            // Rest becomes rows; ignore those not matching the header width
            if (firstColumnCount.intValue() != columnCount) {
                log.warn("Ignoring row which has " + columnCount + " columns since table has " + firstColumnCount + " columns");
                continue;
            }

            grid.addRow();

            for (Node c : cells) {
                // TODO row span
                TagNode cell = (TagNode) c;
                grid.addValue(getValue(cell));

                Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan"));
                if (colSpan != null && colSpan > 1) {
                    grid.addEmptyValues((colSpan - 1));
                }
            }
        }

        grids.add(grid);
    }

    return grids;
}
Aggregations