Search in sources :

Example 1 with ParserException

use of org.htmlparser.util.ParserException in project lucida by claritylab.

the class HTMLConverter method file2text.

/**
 * Extracts the plain text of an HTML document stored in a file.
 * <p>
 * The extraction is configured to leave out links, to replace non-breaking
 * spaces and to collapse sequences of whitespace.
 *
 * @param filename name of the file containing the HTML document; it is
 *                 handed to the parser as its resource
 * @return the extracted plain text, or <code>null</code> if the parser
 *         raised a {@link ParserException} while reading or converting
 */
public static synchronized String file2text(String filename) {
    StringBean textExtractor = new StringBean();
    // leave links out of the extracted text
    textExtractor.setLinks(false);
    // turn non-breaking spaces into regular ones
    textExtractor.setReplaceNonBreakingSpaces(true);
    // squeeze runs of whitespace
    textExtractor.setCollapse(true);

    Parser htmlParser = new Parser();
    try {
        htmlParser.setResource(filename);
        htmlParser.visitAllNodesWith(textExtractor);
    } catch (ParserException e) {
        // reading or conversion failed: signalled to the caller as null
        return null;
    }
    return textExtractor.getStrings();
}
Also used : ParserException(org.htmlparser.util.ParserException) StringBean(org.htmlparser.beans.StringBean) Parser(org.htmlparser.Parser)

Example 2 with ParserException

use of org.htmlparser.util.ParserException in project omegat by omegat-org.

the class HTMLFilter2 method processFile.

/**
 * Reads the complete input into memory, derives the skip/ignore settings
 * from the filter options and parses the HTML, handing every node to a
 * {@link FilterVisitor} that receives the output writer.
 *
 * @param infile  reader supplying the HTML source
 * @param outfile writer passed on to the {@link FilterVisitor}
 * @param fc      filter context; not referenced by this method
 * @throws IOException          if reading fails or the input does not fit
 *                              into memory (the {@link OutOfMemoryError} is
 *                              attached as the cause)
 * @throws TranslationException declared for the overridden contract; not
 *                              thrown directly by this method
 * @throws StringIndexOutOfBoundsException if parsing fails with that
 *                              exception; rethrown with a localized message
 *                              and the original exception as its cause
 */
@Override
public void processFile(BufferedReader infile, BufferedWriter outfile, org.omegat.filters2.FilterContext fc) throws IOException, TranslationException {
    StringBuilder all = null;
    try {
        all = new StringBuilder();
        char[] cbuf = new char[1000];
        int len;
        while ((len = infile.read(cbuf)) > 0) {
            all.append(cbuf, 0, len);
        }
    } catch (OutOfMemoryError e) {
        // Drop the partially filled buffer before asking for a collection,
        // so the memory can actually be reclaimed.
        all = null;
        System.gc();
        throw new IOException(OStrings.getString("HTML__FILE_TOO_BIG"), e);
    }
    HTMLOptions options = new HTMLOptions(processOptions);
    // Prepare matcher
    // NOTE(review): when the option is empty or does not compile, a pattern
    // assigned by an earlier call stays in place - confirm this is intended.
    String skipRegExp = options.getSkipRegExp();
    if (!StringUtil.isEmpty(skipRegExp)) {
        try {
            this.skipRegExpPattern = Pattern.compile(skipRegExp, Pattern.CASE_INSENSITIVE);
        } catch (PatternSyntaxException e) {
            Log.log(e);
        }
    }
    // Prepare set of attributes that indicate not to translate a meta-tag.
    // The map is used as a set: only the upper-cased keys matter.
    String skipMetaString = options.getSkipMeta();
    skipMetaAttributes = new HashMap<String, String>();
    for (String entry : skipMetaString.split(",")) {
        skipMetaAttributes.put(entry.trim().toUpperCase(Locale.ENGLISH), "");
    }
    // Prepare set of attributes that indicate not to translate a tag
    String ignoreTagString = options.getIgnoreTags();
    ignoreTagsAttributes = new HashMap<String, String>();
    for (String entry : ignoreTagString.split(",")) {
        ignoreTagsAttributes.put(entry.trim().toUpperCase(Locale.ENGLISH), "");
    }
    Parser parser = new Parser();
    try {
        parser.setInputHTML(all.toString());
        parser.visitAllNodesWith(new FilterVisitor(this, outfile, options));
    } catch (ParserException pe) {
        // Still best-effort (no rethrow), but reported through the logger
        // like the pattern error above instead of being printed to stdout.
        Log.log(pe);
    } catch (StringIndexOutOfBoundsException se) {
        // Same exception type and message as before; the original exception
        // is attached so its stack trace is not lost.
        StringIndexOutOfBoundsException invalidHtml = new StringIndexOutOfBoundsException(OStrings.getString("HTML__INVALID_HTML"));
        invalidHtml.initCause(se);
        throw invalidHtml;
    }
}
Also used : ParserException(org.htmlparser.util.ParserException) IOException(java.io.IOException) Parser(org.htmlparser.Parser) PatternSyntaxException(java.util.regex.PatternSyntaxException)

Example 3 with ParserException

use of org.htmlparser.util.ParserException in project omegat by omegat-org.

the class HHCFilter2 method processFile.

/**
 * Reads the complete input into memory and parses it as HTML, handing every
 * node to an {@link HHCFilterVisitor} that receives the output writer.
 *
 * @param infile  reader supplying the source text
 * @param outfile writer passed on to the {@link HHCFilterVisitor}
 * @param fc      filter context; not referenced by this method
 * @throws IOException          if reading fails or the input does not fit
 *                              into memory (the {@link OutOfMemoryError} is
 *                              attached as the cause)
 * @throws TranslationException declared for the overridden contract; not
 *                              thrown directly by this method
 */
@Override
public void processFile(BufferedReader infile, BufferedWriter outfile, FilterContext fc) throws IOException, TranslationException {
    StringBuilder all = null;
    try {
        all = new StringBuilder();
        char[] cbuf = new char[1000];
        int len;
        while ((len = infile.read(cbuf)) > 0) {
            all.append(cbuf, 0, len);
        }
    } catch (OutOfMemoryError e) {
        // Drop the partially filled buffer before asking for a collection,
        // so the memory can actually be reclaimed.
        all = null;
        System.gc();
        throw new IOException(OStrings.getString("HHC__FILE_TOO_BIG"), e);
    }
    Parser parser = new Parser();
    try {
        parser.setInputHTML(all.toString());
        parser.visitAllNodesWith(new HHCFilterVisitor(this, outfile));
    } catch (ParserException pe) {
        // NOTE(review): parse failures are only printed to stdout and the
        // method returns normally - consider routing this to the project
        // logger once one is confirmed to be in scope for this class.
        System.out.println(pe);
    }
}
Also used : ParserException(org.htmlparser.util.ParserException) IOException(java.io.IOException) Parser(org.htmlparser.Parser)

Example 4 with ParserException

use of org.htmlparser.util.ParserException in project portfolio by buchen.

the class DestatisCPIFeed method getConsumerPriceIndices.

/**
 * Fetches the Destatis consumer price table page and lets a {@code Visitor}
 * extract the consumer price indices from it.
 *
 * @return the extracted indices; never empty
 * @throws IOException if the connection cannot be opened, the page cannot be
 *                     parsed (the {@link ParserException} is wrapped), or
 *                     the page yields no indices
 */
@Override
public List<ConsumerPriceIndex> getConsumerPriceIndices() throws IOException {
    try {
        // NOTE(review): judging by its name this turns off TLS certificate
        // checks before the download - confirm scope and necessity.
        disableCertificateValidation();

        String address = "https://www.destatis.de/DE/ZahlenFakten/GesamtwirtschaftUmwelt/Preise/Verbraucherpreisindizes/Tabellen_/VerbraucherpreiseKategorien.html"; //$NON-NLS-1$
        URL source = new URL(address);
        Lexer pageLexer = new Lexer(source.openConnection());

        List<ConsumerPriceIndex> indices = new Visitor().visit(pageLexer);
        if (!indices.isEmpty()) {
            return indices;
        }
        // a page without any index is treated as a failed download
        throw new IOException(Messages.MsgResponseContainsNoIndices);
    } catch (ParserException parseFailure) {
        throw new IOException(parseFailure);
    }
}
Also used : ParserException(org.htmlparser.util.ParserException) Lexer(org.htmlparser.lexer.Lexer) ConsumerPriceIndex(name.abuchen.portfolio.model.ConsumerPriceIndex) IOException(java.io.IOException) URL(java.net.URL)

Example 5 with ParserException

use of org.htmlparser.util.ParserException in project liferay-docs by liferay.

the class CheckLinks method isLdnUrlValid.

/**
 * Returns <code>true</code> if the LDN URL is valid. This method is used to
 * check legacy URLs hosted on LDN.
 * <p>
 * The page behind the URL is parsed and the targets of all its link tags are
 * collected. Every target that contains the search-redirect marker is logged
 * as invalid; the result is <code>true</code> as soon as at least one target
 * does not contain the marker. If the page cannot be parsed, the URL is
 * logged as invalid and, with no links to inspect, the result is
 * <code>false</code>.
 *
 * @param  url the URL to check
 * @param  article the article file, passed on when logging an invalid URL
 * @param  lineNumber the line number, passed on when logging an invalid URL
 * @return <code>true</code> if the LDN URL is valid; <code>false</code>
 *         otherwise
 * @throws IOException if an IO exception occurred (presumably while logging
 *         an invalid URL)
 */
private static boolean isLdnUrlValid(String url, File article, int lineNumber) throws IOException {
    NodeList linkNodes = new NodeList();
    try {
        linkNodes = new Parser(url).extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    } catch (ParserException e) {
        // unparsable page: report it and continue with the empty node list
        logInvalidUrl(article, lineNumber, ldnArticle, false);
    }

    // first pass: collect the target of every link tag
    List<String> targets = new LinkedList<String>();
    for (int index = 0; index < linkNodes.size(); index++) {
        LinkTag tag = (LinkTag) linkNodes.elementAt(index);
        targets.add(tag.getLink());
    }

    // second pass: report search redirects, remember any other link
    boolean valid = false;
    for (String target : targets) {
        if (!target.contains("2Fsearch&#x25;2Fsearch&#x26;_3_redirect&#x3d;")) {
            valid = true;
            continue;
        }
        logInvalidUrl(article, lineNumber, ldnArticle, false);
    }
    return valid;
}
Also used : NodeClassFilter(org.htmlparser.filters.NodeClassFilter) ParserException(org.htmlparser.util.ParserException) LinkTag(org.htmlparser.tags.LinkTag) NodeList(org.htmlparser.util.NodeList) LinkedList(java.util.LinkedList) Parser(org.htmlparser.Parser)

Aggregations

ParserException (org.htmlparser.util.ParserException): 6, Parser (org.htmlparser.Parser): 5, IOException (java.io.IOException): 3, StringBean (org.htmlparser.beans.StringBean): 2, URL (java.net.URL): 1, LinkedList (java.util.LinkedList): 1, PatternSyntaxException (java.util.regex.PatternSyntaxException): 1, ConsumerPriceIndex (name.abuchen.portfolio.model.ConsumerPriceIndex): 1, NodeClassFilter (org.htmlparser.filters.NodeClassFilter): 1, Lexer (org.htmlparser.lexer.Lexer): 1, LinkTag (org.htmlparser.tags.LinkTag): 1, NodeList (org.htmlparser.util.NodeList): 1