Examples with HTMLStripCharFilter - org.apache.lucene.analysis.charfilter.HTMLStripCharFilter

Example 1 with HTMLStripCharFilter

Use of org.apache.lucene.analysis.charfilter.HTMLStripCharFilter in the lucene-solr project by Apache.

From the class HTMLStripTransformer, method stripHTML:

/**
 * Strips HTML markup from {@code value} by reading it through an
 * {@link HTMLStripCharFilter} and collecting the filtered characters.
 *
 * @param value  the text to strip markup from
 * @param column name of the column being processed; used only in the error message
 * @return the filtered text as a {@code String}
 * @throws DataImportHandlerException with severity {@code SEVERE} if reading from
 *         or closing the filter fails; the {@link IOException} is kept as the cause
 */
private Object stripHTML(String value, String column) {
    StringBuilder out = new StringBuilder();
    StringReader strReader = new StringReader(value);
    // try-with-resources closes the filter even when read() throws; the
    // previous code only closed it on the success path.
    try (HTMLStripCharFilter html = new HTMLStripCharFilter(strReader.markSupported() ? strReader : new BufferedReader(strReader))) {
        char[] cbuf = new char[1024 * 10];
        int count;
        // read() signals end of stream with -1
        while ((count = html.read(cbuf)) != -1) {
            if (count > 0) {
                out.append(cbuf, 0, count);
            }
        }
    } catch (IOException e) {
        throw new DataImportHandlerException(DataImportHandlerException.SEVERE, "Failed stripping HTML for column: " + column, e);
    }
    return out.toString();
}

Also used : StringReader(java.io.StringReader) BufferedReader(java.io.BufferedReader) HTMLStripCharFilter(org.apache.lucene.analysis.charfilter.HTMLStripCharFilter) IOException(java.io.IOException)

Example 2 with HTMLStripCharFilter

Use of org.apache.lucene.analysis.charfilter.HTMLStripCharFilter in the lucene-solr project by Apache.

From the class HTMLStripFieldUpdateProcessorFactory, method getInstance:

/**
 * Builds a processor that mutates the selected field values by passing
 * every {@link CharSequence} value through an {@link HTMLStripCharFilter}.
 * Values of any other type are returned untouched. If stripping fails with
 * an {@link IOException}, the original value is returned instead.
 */
@Override
public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
    return valueMutator(getSelector(), next, src -> {
        // Only character data is filtered; everything else passes through.
        if (!(src instanceof CharSequence)) {
            return src;
        }
        final CharSequence original = (CharSequence) src;
        // Pre-size the buffer to the input length.
        final StringWriter stripped = new StringWriter(original.length());
        Reader filter = null;
        try {
            filter = new HTMLStripCharFilter(new StringReader(original.toString()));
            IOUtils.copy(filter, stripped);
            return stripped.toString();
        } catch (IOException e) {
            // Best effort: fall back to the unmodified value.
            return original;
        } finally {
            IOUtils.closeQuietly(filter);
        }
    });
}

Also used : StringWriter(java.io.StringWriter) StringReader(java.io.StringReader) StringReader(java.io.StringReader) Reader(java.io.Reader) HTMLStripCharFilter(org.apache.lucene.analysis.charfilter.HTMLStripCharFilter) IOException(java.io.IOException)

Example 3 with HTMLStripCharFilter

Use of org.apache.lucene.analysis.charfilter.HTMLStripCharFilter in the OpenGrok project by OpenGrok.

From the class SearchEngine, method results:

/**
 * Fills {@code ret} with the hits for result positions {@code start}
 * (inclusive) to {@code end} (exclusive).
 * <p>
 * {@code ret} is always cleared first. If no search has been run yet
 * ({@code hits} is {@code null}) or {@code end < start}, it is left empty.
 * If {@code end} lies beyond the hits collected so far and not everything
 * has been collected yet, the query is executed again for all
 * {@code totalHits}, which carries a performance cost.
 *
 * @param start start of the hit list
 * @param end end of the hit list; must not be smaller than {@code start}
 * @param ret list that receives the results from start to end; left empty
 * if no search was started
 */
public void results(int start, int end, List<Hit> ret) {
    // return if no start search() was done
    if (hits == null || (end < start)) {
        ret.clear();
        return;
    }
    ret.clear();
    // TODO check if below fits for if end=old hits.length, or it should include it
    // Short-circuit && instead of the bitwise & on booleans.
    if (end > hits.length && !allCollected) {
        // do the requery, we want more than 5 pages
        collector = TopScoreDocCollector.create(totalHits);
        try {
            searcher.search(query, collector);
        } catch (Exception e) {
            // this exception should never be hit, since search() will hit this before
            LOGGER.log(Level.WARNING, SEARCH_EXCEPTION_MSG, e);
        }
        hits = collector.topDocs().scoreDocs;
        // NOTE(review): d is declared outside the loop, so when searcher.doc()
        // throws, the previous iteration's document (or null) is added again.
        // Left as-is to keep docs index-aligned with hits - confirm intent.
        Document d = null;
        for (int i = start; i < hits.length; i++) {
            int docId = hits[i].doc;
            try {
                d = searcher.doc(docId);
            } catch (Exception e) {
                LOGGER.log(Level.SEVERE, SEARCH_EXCEPTION_MSG, e);
            }
            docs.add(d);
        }
        allCollected = true;
    }
    // the only problem is that count of docs is usually smaller than number of results
    for (int ii = start; ii < end; ++ii) {
        boolean alt = (ii % 2 == 0);
        boolean hasContext = false;
        try {
            Document doc = docs.get(ii);
            String filename = doc.get(QueryBuilder.PATH);
            Genre genre = Genre.get(doc.get(QueryBuilder.T));
            Definitions tags = null;
            IndexableField tagsField = doc.getField(QueryBuilder.TAGS);
            if (tagsField != null) {
                tags = Definitions.deserialize(tagsField.binaryValue().bytes);
            }
            Scopes scopes = null;
            IndexableField scopesField = doc.getField(QueryBuilder.SCOPES);
            if (scopesField != null) {
                scopes = Scopes.deserialize(scopesField.binaryValue().bytes);
            }
            int nhits = docs.size();
            if (sourceContext != null) {
                try {
                    if (Genre.PLAIN == genre && (source != null)) {
                        // SRCROOT is read with UTF-8 as a default.
                        hasContext = sourceContext.getContext(new InputStreamReader(new FileInputStream(source + filename), StandardCharsets.UTF_8), null, null, null, filename, tags, nhits > 100, false, ret, scopes);
                    } else if (Genre.XREFABLE == genre && data != null && summarizer != null) {
                        int l;
                        /*
                         * For backward compatibility, read the
                         * OpenGrok-produced document using the system
                         * default charset.
                         */
                        try (Reader r = RuntimeEnvironment.getInstance().isCompressXref() ? new HTMLStripCharFilter(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(data + Prefix.XREF_P + filename + ".gz"))))) : new HTMLStripCharFilter(new BufferedReader(new FileReader(data + Prefix.XREF_P + filename)))) {
                            l = r.read(content);
                        }
                        // read() returns -1 when the stream is already at its end
                        // (e.g. an empty xref file); clamp so that new String(...)
                        // does not throw an unchecked StringIndexOutOfBoundsException.
                        l = Math.max(l, 0);
                        // TODO FIX below fragmenter according to either summarizer or context (to get line numbers, might be hard, since xref writers will need to be fixed too, they generate just one line of html code now :( )
                        Summary sum = summarizer.getSummary(new String(content, 0, l));
                        Fragment[] fragments = sum.getFragments();
                        for (int jj = 0; jj < fragments.length; ++jj) {
                            String match = fragments[jj].toString();
                            if (match.length() > 0) {
                                // Ellipsis fragments count as context but are not emitted as hits.
                                if (!fragments[jj].isEllipsis()) {
                                    Hit hit = new Hit(filename, fragments[jj].toString(), "", true, alt);
                                    ret.add(hit);
                                }
                                hasContext = true;
                            }
                        }
                    } else {
                        LOGGER.log(Level.WARNING, "Unknown genre: {0} for {1}", new Object[] { genre, filename });
                        hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes);
                    }
                } catch (FileNotFoundException exp) {
                    LOGGER.log(Level.WARNING, "Couldn''t read summary from {0} ({1})", new Object[] { filename, exp.getMessage() });
                    hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes);
                }
            }
            if (historyContext != null) {
                hasContext |= historyContext.getContext(source + filename, filename, ret);
            }
            // No context found anywhere: emit a placeholder hit for the file.
            if (!hasContext) {
                ret.add(new Hit(filename, "...", "", false, alt));
            }
        } catch (IOException | ClassNotFoundException | HistoryException e) {
            LOGGER.log(Level.WARNING, SEARCH_EXCEPTION_MSG, e);
        }
    }
}

Also used : InputStreamReader(java.io.InputStreamReader) Definitions(org.opensolaris.opengrok.analysis.Definitions) HistoryException(org.opensolaris.opengrok.history.HistoryException) FileNotFoundException(java.io.FileNotFoundException) MultiReader(org.apache.lucene.index.MultiReader) DirectoryReader(org.apache.lucene.index.DirectoryReader) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) IndexReader(org.apache.lucene.index.IndexReader) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) ParseException(org.apache.lucene.queryparser.classic.ParseException) HistoryException(org.opensolaris.opengrok.history.HistoryException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) FileInputStream(java.io.FileInputStream) IndexableField(org.apache.lucene.index.IndexableField) GZIPInputStream(java.util.zip.GZIPInputStream) Scopes(org.opensolaris.opengrok.analysis.Scopes) BufferedReader(java.io.BufferedReader) HTMLStripCharFilter(org.apache.lucene.analysis.charfilter.HTMLStripCharFilter) FileReader(java.io.FileReader) Genre(org.opensolaris.opengrok.analysis.FileAnalyzer.Genre)

Aggregations

IOException (java.io.IOException): 3, HTMLStripCharFilter (org.apache.lucene.analysis.charfilter.HTMLStripCharFilter): 3, BufferedReader (java.io.BufferedReader): 2, Reader (java.io.Reader): 2, StringReader (java.io.StringReader): 2, FileInputStream (java.io.FileInputStream): 1, FileNotFoundException (java.io.FileNotFoundException): 1, FileReader (java.io.FileReader): 1, InputStreamReader (java.io.InputStreamReader): 1, StringWriter (java.io.StringWriter): 1, GZIPInputStream (java.util.zip.GZIPInputStream): 1, Document (org.apache.lucene.document.Document): 1, DirectoryReader (org.apache.lucene.index.DirectoryReader): 1, IndexReader (org.apache.lucene.index.IndexReader): 1, IndexableField (org.apache.lucene.index.IndexableField): 1, MultiReader (org.apache.lucene.index.MultiReader): 1, ParseException (org.apache.lucene.queryparser.classic.ParseException): 1, Definitions (org.opensolaris.opengrok.analysis.Definitions): 1, Genre (org.opensolaris.opengrok.analysis.FileAnalyzer.Genre): 1, Scopes (org.opensolaris.opengrok.analysis.Scopes): 1