Use of org.htmlparser.util.ParserException in project lucida by claritylab.
Class HTMLConverter, method file2text.
/**
 * Loads the HTML document stored in the given file and extracts its plain
 * text.
 *
 * @param filename name of file containing HTML documents
 * @return the extracted plain text, or <code>null</code> if the parser
 *         reported a failure while reading or converting the document
 */
public static synchronized String file2text(String filename) {
    // text extractor: no links, non-breaking spaces replaced,
    // sequences of whitespace replaced
    StringBean textExtractor = new StringBean();
    textExtractor.setLinks(false);
    textExtractor.setReplaceNonBreakingSpaces(true);
    textExtractor.setCollapse(true);

    Parser htmlParser = new Parser();
    try {
        htmlParser.setResource(filename);
        htmlParser.visitAllNodesWith(textExtractor);
    } catch (ParserException e) {
        // failure is signalled to the caller as null
        return null;
    }
    return textExtractor.getStrings();
}
Use of org.htmlparser.util.ParserException in project omegat by omegat-org.
Class HTMLFilter2, method processFile.
/**
 * Reads the whole document from {@code infile} into memory, refreshes the
 * skip/ignore settings of this filter from the current options, and runs the
 * HTML parser over the document with a {@code FilterVisitor} that is given
 * {@code outfile}.
 *
 * @param infile  reader supplying the source document
 * @param outfile writer handed to the {@code FilterVisitor}
 * @param fc      filter context; not used by the code in this method
 * @throws IOException if reading fails, or if the document could not be
 *         buffered because memory ran out
 * @throws TranslationException declared in the signature; not thrown directly
 *         by the code in this method
 * @throws StringIndexOutOfBoundsException if parsing fails with that
 *         exception; it is rethrown with the {@code HTML__INVALID_HTML}
 *         message and the original exception as its cause
 */
@Override
public void processFile(BufferedReader infile, BufferedWriter outfile, org.omegat.filters2.FilterContext fc) throws IOException, TranslationException {
    StringBuilder all = null;
    try {
        all = new StringBuilder();
        char[] cbuf = new char[1000];
        int len;
        while ((len = infile.read(cbuf)) > 0) {
            all.append(cbuf, 0, len);
        }
    } catch (OutOfMemoryError e) {
        // release the partial buffer before reporting the failure
        all = null;
        System.gc();
        // keep the original error as the cause so it is not lost
        throw new IOException(OStrings.getString("HTML__FILE_TOO_BIG"), e);
    }

    HTMLOptions options = new HTMLOptions(processOptions);

    // Prepare matcher
    String skipRegExp = options.getSkipRegExp();
    if (!StringUtil.isEmpty(skipRegExp)) {
        try {
            this.skipRegExpPattern = Pattern.compile(skipRegExp, Pattern.CASE_INSENSITIVE);
        } catch (PatternSyntaxException e) {
            // invalid expression: logged, skipRegExpPattern is left unchanged
            Log.log(e);
        }
    }

    // Prepare set of attributes that indicate not to translate a meta-tag.
    // The map is used as a set: only the upper-cased keys matter.
    String skipMetaString = options.getSkipMeta();
    skipMetaAttributes = new HashMap<String, String>();
    for (String entry : skipMetaString.split(",")) {
        skipMetaAttributes.put(entry.trim().toUpperCase(Locale.ENGLISH), "");
    }

    // Prepare set of attributes that indicate not to translate a tag
    String ignoreTagString = options.getIgnoreTags();
    ignoreTagsAttributes = new HashMap<String, String>();
    for (String entry : ignoreTagString.split(",")) {
        ignoreTagsAttributes.put(entry.trim().toUpperCase(Locale.ENGLISH), "");
    }

    Parser parser = new Parser();
    try {
        parser.setInputHTML(all.toString());
        parser.visitAllNodesWith(new FilterVisitor(this, outfile, options));
    } catch (ParserException pe) {
        // report through the same logger used above instead of stdout
        Log.log(pe);
    } catch (StringIndexOutOfBoundsException se) {
        // same exception type and message as before, but chained to the
        // original failure so the stack trace shows where parsing broke
        StringIndexOutOfBoundsException invalidHtml = new StringIndexOutOfBoundsException(OStrings.getString("HTML__INVALID_HTML"));
        invalidHtml.initCause(se);
        throw invalidHtml;
    }
}
Use of org.htmlparser.util.ParserException in project omegat by omegat-org.
Class HHCFilter2, method processFile.
/**
 * Buffers the complete input in memory and runs the HTML parser over it
 * with an {@code HHCFilterVisitor} that receives {@code outfile}.
 *
 * @param infile  reader supplying the source document
 * @param outfile writer handed to the {@code HHCFilterVisitor}
 * @param fc      filter context; not used by the code in this method
 * @throws IOException if reading fails, or if the document could not be
 *         buffered because memory ran out
 * @throws TranslationException declared in the signature; not thrown directly
 *         by the code in this method
 */
@Override
public void processFile(BufferedReader infile, BufferedWriter outfile, FilterContext fc) throws IOException, TranslationException {
    StringBuilder content = null;
    try {
        content = new StringBuilder();
        final char[] chunk = new char[1000];
        for (int count = infile.read(chunk); count > 0; count = infile.read(chunk)) {
            content.append(chunk, 0, count);
        }
    } catch (OutOfMemoryError oom) {
        // release the partial buffer before reporting the failure
        content = null;
        System.gc();
        throw new IOException(OStrings.getString("HHC__FILE_TOO_BIG"));
    }

    final Parser htmlParser = new Parser();
    try {
        htmlParser.setInputHTML(content.toString());
        htmlParser.visitAllNodesWith(new HHCFilterVisitor(this, outfile));
    } catch (ParserException parseFailure) {
        // parser failures are only printed; processing ends normally
        System.out.println(parseFailure);
    }
}
Use of org.htmlparser.util.ParserException in project portfolio by buchen.
Class DestatisCPIFeed, method getConsumerPriceIndices.
/**
 * Opens a connection to the consumer price table page on destatis.de and
 * hands a lexer over that connection to a {@code Visitor}, which produces
 * the list of indices.
 *
 * @return the indices produced by the visitor; never empty
 * @throws IOException if the connection cannot be opened, if the visitor
 *         yields no indices, or if parsing fails (the
 *         {@code ParserException} is wrapped as the cause)
 */
@Override
public List<ConsumerPriceIndex> getConsumerPriceIndices() throws IOException {
    final String address = "https://www.destatis.de/DE/ZahlenFakten/GesamtwirtschaftUmwelt/Preise/Verbraucherpreisindizes/Tabellen_/VerbraucherpreiseKategorien.html"; //$NON-NLS-1$
    try {
        // NOTE(review): certificate validation is switched off before the
        // request; the scope of that helper is not visible from here
        disableCertificateValidation();
        Lexer pageLexer = new Lexer(new URL(address).openConnection());
        List<ConsumerPriceIndex> indices = new Visitor().visit(pageLexer);
        if (!indices.isEmpty()) {
            return indices;
        }
        throw new IOException(Messages.MsgResponseContainsNoIndices);
    } catch (ParserException e) {
        throw new IOException(e);
    }
}
Use of org.htmlparser.util.ParserException in project liferay-docs by liferay.
Class CheckLinks, method isLdnUrlValid.
/**
 * Checks a legacy URL hosted on LDN by parsing the page it points to and
 * inspecting every link found on that page.
 *
 * <p>
 * A link containing the search-redirect marker is logged as invalid. The
 * URL is reported as valid as soon as at least one link on the page does
 * not contain that marker. If the page cannot be parsed, the URL is logged
 * as invalid and <code>false</code> is returned.
 * </p>
 *
 * @param url the URL to check
 * @param article the article file, passed on to the invalid-URL logger
 * @param lineNumber the line number, passed on to the invalid-URL logger
 * @return <code>true</code> if at least one link on the page lacks the
 *         search-redirect marker; <code>false</code> otherwise
 * @throws IOException if an IO exception occurred
 */
private static boolean isLdnUrlValid(String url, File article, int lineNumber) throws IOException {
    NodeList linkNodes = new NodeList();
    try {
        linkNodes = new Parser(url).extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    } catch (ParserException e) {
        // page could not be parsed: linkNodes stays empty, result is false
        logInvalidUrl(article, lineNumber, ldnArticle, false);
    }

    // collect the target of every link on the page
    List<String> targets = new LinkedList<String>();
    for (int idx = 0; idx < linkNodes.size(); idx++) {
        targets.add(((LinkTag) linkNodes.elementAt(idx)).getLink());
    }

    // NOTE(review): a page with both redirect and ordinary links is logged
    // as invalid and still returned as valid - confirm this is intended
    boolean sawValidLink = false;
    for (String target : targets) {
        if (!target.contains("2Fsearch%2Fsearch&_3_redirect=")) {
            sawValidLink = true;
        } else {
            logInvalidUrl(article, lineNumber, ldnArticle, false);
        }
    }
    return sawValidLink;
}
Aggregations